Merge pull request #409 from bsc-wdc/gs_sklearn_estimators_merge

FernandoVN98 · web-flow · commit bd55f6e3daac · 2022-09-09T17:15:12.000+02:00
Ready to merge sklearn estimators in GridSearch
diff --git a/dislib/model_selection/_search.py b/dislib/model_selection/_search.py
@@ -13,7 +13,8 @@
 
 from dislib.model_selection._split import infer_cv
 from dislib.model_selection._validation import check_scorer, \
-    validate_score, aggregate_score_dicts, fit, score_func
+    validate_score, aggregate_score_dicts, fit, score_func, \
+    sklearn_fit, sklearn_score
 
 
 class BaseSearchCV(ABC):
@@ -55,6 +56,29 @@ def fit(self, x, y=None, **fit_params):
         all_candidate_params = []
         all_out = []
 
+        def evaluate_candidates_sklearn(candidate_params):
+            """Evaluate some parameters"""
+            candidate_params = list(candidate_params)  # reused below
+
+            validation_data = []  # validation split of every submitted fit
+            fits = []  # sklearn_fit result per (parameters, split) pair
+            for parameters, (train, validation) in product(candidate_params,
+                                                           cv.split(x, y)):
+                validation_data.append(validation)  # scored after the loop
+                fits.append(sklearn_fit(clone(base_estimator), train,
+                                        parameters=parameters,
+                                        fit_params=fit_params))
+            out = [sklearn_score(estimator, validation, scorer=scorers) for
+                   estimator, validation in zip(fits, validation_data)]
+
+            out = compss_wait_on(out)  # wait for the submitted scores
+
+            nonlocal n_splits  # rebinds the enclosing function's variable
+            n_splits = cv.get_n_splits()
+
+            all_candidate_params.extend(candidate_params)  # kept across calls
+            all_out.extend(out)  # one entry per (parameters, split) pair
+
         def evaluate_candidates(candidate_params):
             """Evaluate some parameters"""
             candidate_params = list(candidate_params)
@@ -75,8 +99,10 @@ def evaluate_candidates(candidate_params):
 
             all_candidate_params.extend(candidate_params)
             all_out.extend(out)
-
-        self._run_search(evaluate_candidates)
+        if 'sklearn' in str(type(estimator)):
+            self._run_search(evaluate_candidates_sklearn)
+        else:
+            self._run_search(evaluate_candidates)
 
         for params_result in all_out:
             scores = params_result[0]
@@ -110,6 +136,9 @@ def evaluate_candidates(candidate_params):
         if self.refit:
             self.best_estimator_ = clone(base_estimator).set_params(
                 **self.best_params_)
+            if 'sklearn' in str(type(estimator)):
+                x = x.collect()
+                y = y.collect()
             self.best_estimator_.fit(x, y, **fit_params)
 
         # Store the only scorer not as a dict for single metric evaluation
diff --git a/dislib/model_selection/_validation.py b/dislib/model_selection/_validation.py
@@ -1,5 +1,9 @@
 import numbers
 
+from dislib.data.array import Array
+from pycompss.api.task import task
+from pycompss.api.parameter import INOUT, Depth, Type, COLLECTION_IN
+
 import numpy as np
 
 
@@ -18,6 +22,41 @@ def score_func(estimator, validation_ds, scorer):
     return [test_scores]
 
 
+@task(est=INOUT, blocks_x={Type: COLLECTION_IN, Depth: 2},
+      blocks_y={Type: COLLECTION_IN, Depth: 2})
+def fit_sklearn_estimator(est, blocks_x, blocks_y, **fit_params):
+    x = Array._merge_blocks(blocks_x)  # presumably one local array
+    y = Array._merge_blocks(blocks_y)  # same treatment for the labels
+    return est.fit(x, y, **fit_params)  # est is declared INOUT above
+
+
+@task(blocks_x={Type: COLLECTION_IN, Depth: 2},
+      blocks_y={Type: COLLECTION_IN, Depth: 2},
+      returns=1)  # single return value: the result of _score
+def score_sklearn_estimator(est, scorer,  blocks_x, blocks_y):
+    x = Array._merge_blocks(blocks_x)  # presumably one local array
+    y = Array._merge_blocks(blocks_y)  # same treatment for the labels
+    return _score(est, x, y, scorer)  # dict of scores, per _score docstring
+
+
+def sklearn_fit(estimator, train_ds,
+                parameters, fit_params):
+    if parameters is not None:  # None: keep the estimator's current params
+        estimator.set_params(**parameters)
+    x_train, y_train = train_ds  # train_ds unpacks into an (x, y) pair
+
+    return fit_sklearn_estimator(estimator, x_train._blocks,
+                                 y_train._blocks, **fit_params)
+
+
+def sklearn_score(estimator, validation_ds, scorer):
+    x_test, y_test = validation_ds  # validation_ds unpacks into (x, y)
+    test_scores = score_sklearn_estimator(estimator, scorer,
+                                          x_test._blocks, y_test._blocks)
+
+    return [test_scores]  # one-element list, same shape as score_func
+
+
 def _score(estimator, x, y, scorers):
     """Return a dict of scores"""
     scores = {}
diff --git a/tests/test_gridsearch.py b/tests/test_gridsearch.py
@@ -2,6 +2,7 @@
 
 from sklearn import clone, datasets
 
+from sklearn.ensemble import RandomForestClassifier as SklearnRF
 import dislib as ds
 from dislib.classification import CascadeSVM, RandomForestClassifier
 from dislib.cluster import DBSCAN, KMeans, GaussianMixture
@@ -79,6 +80,43 @@ def test_fit(self):
         self.assertTrue(hasattr(searcher, 'scorer_'))
         self.assertEqual(searcher.n_splits_, 5)
 
+    def test_fit_sk(self):
+        """Tests GridSearchCV fit()."""
+        x_np, y_np = datasets.load_iris(return_X_y=True)
+        x = ds.array(x_np, (30, 4))
+        y = ds.array(y_np[:, np.newaxis], (30, 1))  # labels as a column
+
+        param_grid = {'n_estimators': (2, 4),
+                      'max_depth': range(3, 5)}
+        rf = SklearnRF()  # scikit-learn's RandomForestClassifier (see import)
+        print("ESTIMATOR TYPE")  # NOTE(review): debug output, not asserted
+        print(str(type(rf)))  # NOTE(review): debug output, not asserted
+
+        searcher = GridSearchCV(rf, param_grid)
+        searcher.fit(x, y)
+
+        expected_keys = {'param_max_depth', 'param_n_estimators', 'params',
+                         'mean_test_score', 'std_test_score',
+                         'rank_test_score'}
+        split_keys = {'split%d_test_score' % i for i in range(5)}  # 5 splits
+        expected_keys.update(split_keys)
+        self.assertSetEqual(set(searcher.cv_results_.keys()), expected_keys)
+
+        expected_params = [(3, 2), (3, 4), (4, 2), (4, 4)]  # (m, n) pairs
+        for params in searcher.cv_results_['params']:
+            m = params['max_depth']
+            n = params['n_estimators']
+            self.assertIn((m, n), expected_params)
+            expected_params.remove((m, n))  # each pair may match only once
+        self.assertEqual(len(expected_params), 0)  # every pair was seen
+
+        self.assertTrue(hasattr(searcher, 'best_estimator_'))
+        self.assertTrue(hasattr(searcher, 'best_score_'))
+        self.assertTrue(hasattr(searcher, 'best_params_'))
+        self.assertTrue(hasattr(searcher, 'best_index_'))
+        self.assertTrue(hasattr(searcher, 'scorer_'))
+        self.assertEqual(searcher.n_splits_, 5)
+
     def test_fit_2(self):
         """Tests GridSearchCV fit() with different data."""
         x_np, y_np = datasets.load_breast_cancer(return_X_y=True)