FEAT add mask_missing_values in utils #81

Merged

27 changes: 9 additions & 18 deletions fastcan/narx.py
@@ -27,6 +27,7 @@
from ._fastcan import FastCan
from ._narx_fast import _predict_step, _update_cfd, _update_terms # type: ignore
from ._refine import refine
from .utils import mask_missing_values


@validate_params(
@@ -273,14 +274,6 @@ def make_poly_ids(
return np.delete(ids, const_id, 0) # remove the constant feature


def _mask_missing_value(*arr, return_mask=False):
"""Remove missing value for all arrays."""
mask_nomissing = np.all(np.isfinite(np.c_[arr]), axis=1)
if return_mask:
return mask_nomissing
return tuple([x[mask_nomissing] for x in arr])


def _valiate_time_shift_poly_ids(
time_shift_ids, poly_ids, n_samples=None, n_features=None, n_outputs=None
):
@@ -374,7 +367,7 @@ def _validate_feat_delay_ids(
)
if (delay_ids_.min() < -1) or (delay_ids_.max() >= n_samples):
raise ValueError(
"The element x of delay_ids should " f"satisfy -1 <= x < {n_samples}."
f"The element x of delay_ids should satisfy -1 <= x < {n_samples}."
)
return feat_ids_, delay_ids_

@@ -783,7 +776,7 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
time_shift_vars = make_time_shift_features(xy_hstack, time_shift_ids)
poly_terms = make_poly_features(time_shift_vars, poly_ids)
# Remove missing values
poly_terms_masked, y_masked, sample_weight_masked = _mask_missing_value(
poly_terms_masked, y_masked, sample_weight_masked = mask_missing_values(
poly_terms, y, sample_weight
)
coef = np.zeros(n_terms, dtype=float)
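
Note, for illustration only and not part of the diff: a minimal sketch of the masking pattern used in fit() above, assuming the public fastcan.utils.mask_missing_values introduced by this PR. Any row that is non-finite in any of the aligned arrays is dropped from all of them before the coefficients are estimated.

import numpy as np
from fastcan.utils import mask_missing_values

poly_terms = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, 6.0]])
y = np.array([0.5, 1.5, 2.5])
sample_weight = np.ones(3)

# The second row is removed from all three arrays because poly_terms has a NaN there.
poly_terms_m, y_m, sw_m = mask_missing_values(poly_terms, y, sample_weight)
# poly_terms_m.shape == (2, 2); y_m.shape == (2,); sw_m.shape == (2,)
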
@@ -1060,7 +1053,7 @@ def _loss(
output_ids,
)

y_masked, y_hat_masked, sample_weight_sqrt_masked = _mask_missing_value(
y_masked, y_hat_masked, sample_weight_sqrt_masked = mask_missing_values(
y, y_hat, sample_weight_sqrt
)

@@ -1115,12 +1108,10 @@ def _grad(
grad_delay_ids,
)

mask_nomissing = _mask_missing_value(
y, y_hat, sample_weight_sqrt, return_mask=True
)
mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True)

sample_weight_sqrt_masked = sample_weight_sqrt[mask_nomissing]
dydx_masked = dydx[mask_nomissing]
sample_weight_sqrt_masked = sample_weight_sqrt[mask_valid]
dydx_masked = dydx[mask_valid]

return dydx_masked.sum(axis=1) * sample_weight_sqrt_masked
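
Note, for illustration only and not part of the diff: with return_mask=True the utility returns only the boolean mask, which _grad uses above to keep extra arrays such as dydx row-aligned with the masked targets. A minimal sketch of that pattern; the shapes are illustrative, not the actual NARX internals.

import numpy as np
from fastcan.utils import mask_missing_values

y = np.array([1.0, np.nan, 3.0])
y_hat = np.array([1.1, 2.0, 2.9])
sample_weight_sqrt = np.ones(3)
dydx = np.arange(12, dtype=float).reshape(3, 4)  # extra array aligned on the sample axis

# Boolean mask of rows that are finite in every input array.
mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True)
# mask_valid -> array([ True, False,  True])

# Apply the same mask to any other sample-aligned array.
dydx_masked = dydx[mask_valid]            # shape (2, 4)
sw_masked = sample_weight_sqrt[mask_valid]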

@@ -1264,7 +1255,7 @@ def _get_term_str(term_feat_ids, term_delay_ids):
else:
term_str += f"*X[k-{delay_id},{feat_id}]"
elif feat_id >= narx.n_features_in_:
term_str += f"*y_hat[k-{delay_id},{feat_id-narx.n_features_in_}]"
term_str += f"*y_hat[k-{delay_id},{feat_id - narx.n_features_in_}]"
return term_str[1:]

yid_space = 5
@@ -1472,7 +1463,7 @@ def make_narx(
poly_terms = make_poly_features(time_shift_vars, poly_ids_all)

# Remove missing values
poly_terms_masked, y_masked = _mask_missing_value(poly_terms, y)
poly_terms_masked, y_masked = mask_missing_values(poly_terms, y)

selected_poly_ids = []
for i in range(n_outputs):
51 changes: 50 additions & 1 deletion fastcan/utils.py
@@ -7,7 +7,7 @@

import numpy as np
from sklearn.cross_decomposition import CCA
from sklearn.utils import check_X_y
from sklearn.utils import _safe_indexing, check_consistent_length, check_X_y
from sklearn.utils._param_validation import Interval, validate_params


@@ -120,3 +120,52 @@ def ols(X, y, t=1):
if not mask[j]:
w[:, j] = w[:, j] - w[:, d] * (w[:, d] @ w[:, j])
w[:, j] /= np.linalg.norm(w[:, j], axis=0)


@validate_params(
{
"return_mask": ["boolean"],
},
prefer_skip_nested_validation=True,
)
def mask_missing_values(*arrays, return_mask=False):
"""Remove missing values for all arrays.

Parameters
----------
*arrays : sequence of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Arrays with consistent first dimension.

return_mask : bool, default=False
If True, return a mask of valid values.
If False, return the arrays with missing values removed.

Returns
-------
mask_valid : ndarray of shape (n_samples,)
Mask of valid (finite) values. Returned when return_mask is True.

masked_arrays : list of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Arrays with missing values removed, in the same order as the input arrays.
Returned when return_mask is False.

Examples
--------
>>> import numpy as np
>>> from fastcan.utils import mask_missing_values
>>> a = [[1, 2], [3, np.nan], [5, 6]]
>>> b = [1, 2, 3]
>>> mask_missing_values(a, b)
[[[1, 2], [5, 6]], [1, 3]]
>>> mask_missing_values(a, b, return_mask=True)
array([ True, False,  True])
"""
if len(arrays) == 0:
return None
check_consistent_length(*arrays)
mask_valid = np.all(np.isfinite(np.c_[arrays]), axis=1)
if return_mask:
return mask_valid
return [_safe_indexing(x, mask_valid) for x in arrays]
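
A possible usage sketch beyond the docstring example (hypothetical, not part of this PR): masking aligned NumPy arrays before fitting a scikit-learn estimator.

import numpy as np
from sklearn.linear_model import LinearRegression
from fastcan.utils import mask_missing_values

X = np.array([[1.0, 0.0], [2.0, np.nan], [3.0, 1.0], [4.0, 2.0]])
y = np.array([1.0, 2.0, np.inf, 4.0])

# Row 1 (NaN in X) and row 2 (inf in y) are dropped from both arrays.
X_m, y_m = mask_missing_values(X, y)
reg = LinearRegression().fit(X_m, y_m)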