Skip to content

XGBRegressor does not work with HyperbandSearchCV #839

Open
@vecorro

Description

@vecorro

<!-- Please include a self-contained copy-pastable example that generates the issue if possible.

Please be concise with code posted. See guidelines below on how to provide a good bug report:

Bug reports that follow these guidelines are easier to diagnose, and so are often handled much more quickly.
-->

What happened:
Got an error from XGBoost: "XGBoostError: need to call fit or load_model beforehand"

What you expected to happen:
I want to use dask_ml.xgboost.XGBRegressor with dask_ml.model_selection.HyperbandSearchCV.

Minimal Complete Verifiable Example:

import dask
import dask.dataframe as dd
from distributed import Client
from dask_ml.model_selection import train_test_split
from dask_ml.xgboost import XGBRegressor
from dask_ml.model_selection import HyperbandSearchCV
from dask_ml import datasets

client = Client('10.118.232.173:8786')

X, y = datasets.make_classification(chunks=50)
param_space = {
    'n_estimators': range(100, 200, 50),
    'max_depth': range(3, 6, 2),
    'booster': ('gbtree', 'dart'),
}

model = XGBRegressor()
search = HyperbandSearchCV(model, param_space, random_state=0, patience=True, verbose=True, test_size=0.05)
search.fit(X, y)

Anything else we need to know?:
Here is the full error message:

/opt/conda/lib/python3.8/site-packages/sklearn/model_selection/_search.py:285: UserWarning: The total space of parameters 8 is smaller than n_iter=81. Running 8 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
/opt/conda/lib/python3.8/site-packages/sklearn/model_selection/_search.py:285: UserWarning: The total space of parameters 8 is smaller than n_iter=34. Running 8 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(
/opt/conda/lib/python3.8/site-packages/sklearn/model_selection/_search.py:285: UserWarning: The total space of parameters 8 is smaller than n_iter=15. Running 8 iterations. For exhaustive searches, use GridSearchCV.
warnings.warn(


XGBoostError Traceback (most recent call last)
in
11 search = HyperbandSearchCV(model, param_space, random_state=0, patience=True, verbose=True, test_size=0.05)
12
---> 13 search.fit(X, y)

/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py in fit(self, X, y, **fit_params)
715 client = default_client()
716 if not client.asynchronous:
--> 717 return client.sync(self._fit, X, y, **fit_params)
718 return self._fit(X, y, **fit_params)
719

/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
849 return future
850 else:
--> 851 return sync(
852 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
853 )

/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
352 if error[0]:
353 typ, exc, tb = error[0]
--> 354 raise exc.with_traceback(tb)
355 else:
356 return result[0]

/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
335 if callback_timeout is not None:
336 future = asyncio.wait_for(future, callback_timeout)
--> 337 result[0] = yield future
338 except Exception as exc:
339 error[0] = sys.exc_info()

/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()

/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_hyperband.py in _fit(self, X, y, **fit_params)
399 _brackets_ids = list(reversed(sorted(SHAs)))
400
--> 401 _SHAs = await asyncio.gather(
402 *[SHAs[b]._fit(X, y, **fit_params) for b in _brackets_ids]
403 )

/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py in _fit(self, X, y, **fit_params)
661
662 with context:
--> 663 results = await fit(
664 self.estimator,
665 self._get_params(),

/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py in fit(model, params, X_train, y_train, X_test, y_test, additional_calls, fit_params, scorer, random_state, verbose, prefix)
475 A history of all models scores over time
476 """
--> 477 return await _fit(
478 model,
479 params,

/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py in _fit(model, params, X_train, y_train, X_test, y_test, additional_calls, fit_params, scorer, random_state, verbose, prefix)
169 logger.info("[CV%s] creating %d models", prefix, len(params))
170 for ident, param in enumerate(params):
--> 171 model = client.submit(_create_model, original_model, ident, **param)
172 info[ident] = []
173 models[ident] = model

/opt/conda/lib/python3.8/site-packages/distributed/client.py in submit(self, func, key, workers, resources, retries, priority, fifo_timeout, allow_other_workers, actor, actors, pure, *args, **kwargs)
1571 if key is None:
1572 if pure:
-> 1573 key = funcname(func) + "-" + tokenize(func, kwargs, *args)
1574 else:
1575 key = funcname(func) + "-" + str(uuid.uuid4())

/opt/conda/lib/python3.8/site-packages/dask/base.py in tokenize(*args, **kwargs)
793 if kwargs:
794 args = args + (kwargs,)
--> 795 return md5(str(tuple(map(normalize_token, args))).encode()).hexdigest()
796
797

/opt/conda/lib/python3.8/site-packages/dask/utils.py in __call__(self, arg, *args, **kwargs)
510 """
511 meth = self.dispatch(type(arg))
--> 512 return meth(arg, *args, **kwargs)
513
514 @property

/opt/conda/lib/python3.8/site-packages/dask_ml/model_selection/_normalize.py in normalize_estimator(est)
36 continue
37 try:
---> 38 val = getattr(est, attr)
39 except (sklearn.exceptions.NotFittedError, AttributeError):
40 continue

/opt/conda/lib/python3.8/site-packages/xgboost/sklearn.py in feature_importances_(self)
540 raise AttributeError('Feature importance is not defined for Booster type {}'
541 .format(self.booster))
--> 542 b = self.get_booster()
543 score = b.get_score(importance_type=self.importance_type)
544 all_features = [score.get(f, 0.) for f in b.feature_names]

/opt/conda/lib/python3.8/site-packages/xgboost/sklearn.py in get_booster(self)
191 """
192 if self._Booster is None:
--> 193 raise XGBoostError('need to call fit or load_model beforehand')
194 return self._Booster
195

XGBoostError: need to call fit or load_model beforehand

Environment:

  • Dask version: daskdev/dask:2021.5.0 Docker image
  • Python version: 3.8
  • Operating System: Linux (Ubuntu)
  • Install method (conda, pip, source): Kubernetes Helm Chart. I'm running the script from the Jupyter pod in the K8s cluster.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions