FEAT add mask_missing_values in utils #81

Merged

27 changes: 9 additions & 18 deletions fastcan/narx.py
@@ -27,6 +27,7 @@
from ._fastcan import FastCan
from ._narx_fast import _predict_step, _update_cfd, _update_terms # type: ignore
from ._refine import refine
from .utils import mask_missing_values


@validate_params(
@@ -273,14 +274,6 @@ def make_poly_ids(
return np.delete(ids, const_id, 0) # remove the constant feature


def _mask_missing_value(*arr, return_mask=False):
"""Remove missing value for all arrays."""
mask_nomissing = np.all(np.isfinite(np.c_[arr]), axis=1)
if return_mask:
return mask_nomissing
return tuple([x[mask_nomissing] for x in arr])


def _valiate_time_shift_poly_ids(
time_shift_ids, poly_ids, n_samples=None, n_features=None, n_outputs=None
):
@@ -374,7 +367,7 @@ def _validate_feat_delay_ids(
)
if (delay_ids_.min() < -1) or (delay_ids_.max() >= n_samples):
raise ValueError(
"The element x of delay_ids should " f"satisfy -1 <= x < {n_samples}."
f"The element x of delay_ids should satisfy -1 <= x < {n_samples}."
)
return feat_ids_, delay_ids_

@@ -783,7 +776,7 @@ def fit(self, X, y, sample_weight=None, coef_init=None, **params):
time_shift_vars = make_time_shift_features(xy_hstack, time_shift_ids)
poly_terms = make_poly_features(time_shift_vars, poly_ids)
# Remove missing values
poly_terms_masked, y_masked, sample_weight_masked = _mask_missing_value(
poly_terms_masked, y_masked, sample_weight_masked = mask_missing_values(
poly_terms, y, sample_weight
)
coef = np.zeros(n_terms, dtype=float)
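
Note, for illustration only and not part of the diff: a minimal sketch of the masking pattern used in fit() above, assuming the public fastcan.utils.mask_missing_values introduced by this PR. Any row that is non-finite in any of the aligned arrays is dropped from all of them before the coefficients are estimated.

import numpy as np
from fastcan.utils import mask_missing_values

poly_terms = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, 6.0]])
y = np.array([0.5, 1.5, 2.5])
sample_weight = np.ones(3)

# The second row is removed from all three arrays because poly_terms has a NaN there.
poly_terms_m, y_m, sw_m = mask_missing_values(poly_terms, y, sample_weight)
# poly_terms_m.shape == (2, 2); y_m.shape == (2,); sw_m.shape == (2,)
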
@@ -1060,7 +1053,7 @@ def _loss(
output_ids,
)

y_masked, y_hat_masked, sample_weight_sqrt_masked = _mask_missing_value(
y_masked, y_hat_masked, sample_weight_sqrt_masked = mask_missing_values(
y, y_hat, sample_weight_sqrt
)

@@ -1115,12 +1108,10 @@ def _grad(
grad_delay_ids,
)

mask_nomissing = _mask_missing_value(
y, y_hat, sample_weight_sqrt, return_mask=True
)
mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True)

sample_weight_sqrt_masked = sample_weight_sqrt[mask_nomissing]
dydx_masked = dydx[mask_nomissing]
sample_weight_sqrt_masked = sample_weight_sqrt[mask_valid]
dydx_masked = dydx[mask_valid]

return dydx_masked.sum(axis=1) * sample_weight_sqrt_masked
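
Note, for illustration only and not part of the diff: with return_mask=True the utility returns only the boolean mask, which _grad uses above to keep extra arrays such as dydx row-aligned with the masked targets. A minimal sketch of that pattern; the shapes are illustrative, not the actual NARX internals.

import numpy as np
from fastcan.utils import mask_missing_values

y = np.array([1.0, np.nan, 3.0])
y_hat = np.array([1.1, 2.0, 2.9])
sample_weight_sqrt = np.ones(3)
dydx = np.arange(12, dtype=float).reshape(3, 4)  # extra array aligned on the sample axis

# Boolean mask of rows that are finite in every input array.
mask_valid = mask_missing_values(y, y_hat, sample_weight_sqrt, return_mask=True)
# mask_valid -> array([ True, False,  True])

# Apply the same mask to any other sample-aligned array.
dydx_masked = dydx[mask_valid]            # shape (2, 4)
sw_masked = sample_weight_sqrt[mask_valid]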

@@ -1264,7 +1255,7 @@ def _get_term_str(term_feat_ids, term_delay_ids):
else:
term_str += f"*X[k-{delay_id},{feat_id}]"
elif feat_id >= narx.n_features_in_:
term_str += f"*y_hat[k-{delay_id},{feat_id-narx.n_features_in_}]"
term_str += f"*y_hat[k-{delay_id},{feat_id - narx.n_features_in_}]"
return term_str[1:]

yid_space = 5
@@ -1472,7 +1463,7 @@ def make_narx(
poly_terms = make_poly_features(time_shift_vars, poly_ids_all)

# Remove missing values
poly_terms_masked, y_masked = _mask_missing_value(poly_terms, y)
poly_terms_masked, y_masked = mask_missing_values(poly_terms, y)

selected_poly_ids = []
for i in range(n_outputs):
51 changes: 50 additions & 1 deletion fastcan/utils.py
@@ -7,7 +7,7 @@

import numpy as np
from sklearn.cross_decomposition import CCA
from sklearn.utils import check_X_y
from sklearn.utils import _safe_indexing, check_consistent_length, check_X_y
from sklearn.utils._param_validation import Interval, validate_params


@@ -120,3 +120,52 @@ def ols(X, y, t=1):
if not mask[j]:
w[:, j] = w[:, j] - w[:, d] * (w[:, d] @ w[:, j])
w[:, j] /= np.linalg.norm(w[:, j], axis=0)


@validate_params(
{
"return_mask": ["boolean"],
},
prefer_skip_nested_validation=True,
)
def mask_missing_values(*arrays, return_mask=False):
"""Remove missing values for all arrays.

Parameters
----------
*arrays : sequence of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Arrays with consistent first dimension.

return_mask : bool, default=False
If True, return a mask of valid values.
If False, return the arrays with missing values removed.

Returns
-------
mask_valid : ndarray of shape (n_samples,)
Mask of valid (finite) values. Returned when return_mask is True.

masked_arrays : list of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Arrays with missing values removed, in the same order as the input arrays.
Returned when return_mask is False.

Examples
--------
>>> import numpy as np
>>> from fastcan.utils import mask_missing_values
>>> a = [[1, 2], [3, np.nan], [5, 6]]
>>> b = [1, 2, 3]
>>> mask_missing_values(a, b)
[[[1, 2], [5, 6]], [1, 3]]
>>> mask_missing_values(a, b, return_mask=True)
array([ True, False,  True])
"""
if len(arrays) == 0:
return None
check_consistent_length(*arrays)
mask_valid = np.all(np.isfinite(np.c_[arrays]), axis=1)
if return_mask:
return mask_valid
return [_safe_indexing(x, mask_valid) for x in arrays]
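
A possible usage sketch beyond the docstring example (hypothetical, not part of this PR): masking aligned NumPy arrays before fitting a scikit-learn estimator.

import numpy as np
from sklearn.linear_model import LinearRegression
from fastcan.utils import mask_missing_values

X = np.array([[1.0, 0.0], [2.0, np.nan], [3.0, 1.0], [4.0, 2.0]])
y = np.array([1.0, 2.0, np.inf, 4.0])

# Row 1 (NaN in X) and row 2 (inf in y) are dropped from both arrays.
X_m, y_m = mask_missing_values(X, y)
reg = LinearRegression().fit(X_m, y_m)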