Description
<!-- Please include a self-contained copy-pastable example that generates the issue if possible.
Please be concise with code posted. See guidelines below on how to provide a good bug report:
- Craft Minimal Bug Reports http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports
- Minimal Complete Verifiable Examples https://stackoverflow.com/help/mcve
Bug reports that follow these guidelines are easier to diagnose, and so are often handled much more quickly.
-->
What happened:
Got an error from XGBoost: "XGBoostError: need to call fit or load_model beforehand"
What you expected to happen:
I want to use dask_ml.xgboost.XGBRegressor with dask_ml.model_selection.HyperbandSearchCV.
Minimal Complete Verifiable Example:
import dask
import dask.dataframe as dd
from distributed import Client
from dask_ml.model_selection import train_test_split
from dask_ml.xgboost import XGBRegressor
from dask_ml.model_selection import HyperbandSearchCV
from dask_ml import datasets
client = Client('10.118.232.173:8786')
X, y = datasets.make_classification(chunks=50)
param_space = {
'n_estimators': range(100, 200, 50),
'max_depth': range(3, 6, 2),
'booster': ('gbtree', 'dart'),
}
model = XGBRegressor()
search = HyperbandSearchCV(model, param_space, random_state=0, patience=True, verbose=True, test_size=0.05)
search.fit(X, y)
Anything else we need to know?:
Here is the full error message:
/opt/conda/lib/python3.8/site-packages/sklearn/model_selection/_search.py:285: UserWarning: The total space of parameters 8 is smaller than n_iter=81. Running 8 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
/opt/conda/lib/python3.8/site-packages/sklearn/model_selection/_search.py:285: UserWarning: The total space of parameters 8 is smaller than n_iter=34. Running 8 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
/opt/conda/lib/python3.8/site-packages/sklearn/model_selection/_search.py:285: UserWarning: The total space of parameters 8 is smaller than n_iter=15. Running 8 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
XGBoostError Traceback (most recent call last)
in
11 search = HyperbandSearchCV(model, param_space, random_state=0, patience=True, verbose=True, test_size=0.05)
12
---> 13 search.fit(X, y)
/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py in fit(self, X, y, **fit_params)
715 client = default_client()
716 if not client.asynchronous:
--> 717 return client.sync(self._fit, X, y, **fit_params)
718 return self._fit(X, y, **fit_params)
719
/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
849 return future
850 else:
--> 851 return sync(
852 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
853 )
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
352 if error[0]:
353 typ, exc, tb = error[0]
--> 354 raise exc.with_traceback(tb)
355 else:
356 return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
335 if callback_timeout is not None:
336 future = asyncio.wait_for(future, callback_timeout)
--> 337 result[0] = yield future
338 except Exception as exc:
339 error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_hyperband.py in _fit(self, X, y, **fit_params)
399 _brackets_ids = list(reversed(sorted(SHAs)))
400
--> 401 _SHAs = await asyncio.gather(
402 *[SHAs[b]._fit(X, y, **fit_params) for b in _brackets_ids]
403 )
/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py in _fit(self, X, y, **fit_params)
661
662 with context:
--> 663 results = await fit(
664 self.estimator,
665 self._get_params(),
/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py in fit(model, params, X_train, y_train, X_test, y_test, additional_calls, fit_params, scorer, random_state, verbose, prefix)
475 A history of all models scores over time
476 """
--> 477 return await _fit(
478 model,
479 params,
/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py in _fit(model, params, X_train, y_train, X_test, y_test, additional_calls, fit_params, scorer, random_state, verbose, prefix)
169 logger.info("[CV%s] creating %d models", prefix, len(params))
170 for ident, param in enumerate(params):
--> 171 model = client.submit(_create_model, original_model, ident, **param)
172 info[ident] = []
173 models[ident] = model
/opt/conda/lib/python3.8/site-packages/distributed/client.py in submit(self, func, key, workers, resources, retries, priority, fifo_timeout, allow_other_workers, actor, actors, pure, *args, **kwargs)
1571 if key is None:
1572 if pure:
-> 1573 key = funcname(func) + "-" + tokenize(func, kwargs, *args)
1574 else:
1575 key = funcname(func) + "-" + str(uuid.uuid4())
/opt/conda/lib/python3.8/site-packages/dask/base.py in tokenize(*args, **kwargs)
793 if kwargs:
794 args = args + (kwargs,)
--> 795 return md5(str(tuple(map(normalize_token, args))).encode()).hexdigest()
796
797
/opt/conda/lib/python3.8/site-packages/dask/utils.py in __call__(self, arg, *args, **kwargs)
510 """
511 meth = self.dispatch(type(arg))
--> 512 return meth(arg, *args, **kwargs)
513
514 @property
/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_normalize.py in normalize_estimator(est)
36 continue
37 try:
---> 38 val = getattr(est, attr)
39 except (sklearn.exceptions.NotFittedError, AttributeError):
40 continue
/opt/conda/lib/python3.8/site-packages/xgboost/sklearn.py in feature_importances_(self)
540 raise AttributeError('Feature importance is not defined for Booster type {}'
541 .format(self.booster))
--> 542 b = self.get_booster()
543 score = b.get_score(importance_type=self.importance_type)
544 all_features = [score.get(f, 0.) for f in b.feature_names]
/opt/conda/lib/python3.8/site-packages/xgboost/sklearn.py in get_booster(self)
191 """
192 if self._Booster is None:
--> 193 raise XGBoostError('need to call fit or load_model beforehand')
194 return self._Booster
195
XGBoostError: need to call fit or load_model beforehand
Environment:
- Dask version: daskdev/dask:2021.5.0 Docker image
- Python version: 3.8
- Operating System: Linux (Ubuntu)
- Install method (conda, pip, source): Kubernetes Helm Chart. I'm running the script from the Jupyter pod in the Kubernetes cluster.