
Merge branch 'r/remove-deprecated' of github.com:microsoft/LightGBM into r/remove-deprecated
jameslamb committed Feb 12, 2025
2 parents 4730d38 + 73383bf commit c749f6d
Showing 5 changed files with 467 additions and 11 deletions.
39 changes: 39 additions & 0 deletions docs/FAQ.rst
@@ -377,3 +377,42 @@ We strongly recommend installation from the ``conda-forge`` channel and not from
For some specific examples, see `this comment <https://github.com/microsoft/LightGBM/issues/4948#issuecomment-1013766397>`__.

In addition, as of ``lightgbm==4.4.0``, the ``conda-forge`` package automatically supports CUDA-based GPU acceleration.

5. How do I subclass ``scikit-learn`` estimators?
-------------------------------------------------

For ``lightgbm <= 4.5.0``, copy all of the constructor arguments from the corresponding
``lightgbm`` class into the constructor of your custom estimator.

For later versions, just ensure that the constructor of your custom estimator calls ``super().__init__()``.

Consider the example below, which implements a regressor that allows creation of truncated predictions.
This pattern will work with ``lightgbm > 4.5.0``.

.. code-block:: python

    import numpy as np
    from lightgbm import LGBMRegressor
    from sklearn.datasets import make_regression

    class TruncatedRegressor(LGBMRegressor):

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

        def predict(self, X, max_score: float = np.inf):
            preds = super().predict(X)
            np.clip(preds, a_min=None, a_max=max_score, out=preds)
            return preds

    X, y = make_regression(n_samples=1_000, n_features=4)

    reg_trunc = TruncatedRegressor().fit(X, y)

    preds = reg_trunc.predict(X)
    print(f"mean: {preds.mean():.2f}, max: {preds.max():.2f}")
    # mean: -6.81, max: 345.10

    preds_trunc = reg_trunc.predict(X, max_score=preds.mean())
    print(f"mean: {preds_trunc.mean():.2f}, max: {preds_trunc.max():.2f}")
    # mean: -56.50, max: -6.81
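
For reference, below is a condensed sketch of the older pattern required for ``lightgbm <= 4.5.0``.
Only a few constructor arguments are shown for brevity, and the subclass name is purely illustrative; in practice, every argument of the parent constructor (with its default) would be repeated.

.. code-block:: python

    from lightgbm import LGBMRegressor

    class TruncatedRegressorLegacy(LGBMRegressor):

        # condensed sketch: with lightgbm <= 4.5.0, *all* of the parent
        # constructor's arguments and their defaults would be repeated here,
        # so that scikit-learn's get_params() can discover them
        def __init__(self, boosting_type="gbdt", num_leaves=31, learning_rate=0.1, **kwargs):
            super().__init__(
                boosting_type=boosting_type,
                num_leaves=num_leaves,
                learning_rate=learning_rate,
                **kwargs,
            )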
3 changes: 3 additions & 0 deletions python-package/lightgbm/dask.py
@@ -1115,6 +1115,7 @@ class DaskLGBMClassifier(LGBMClassifier, _DaskLGBMModel):

def __init__(
self,
*,
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
@@ -1318,6 +1319,7 @@ class DaskLGBMRegressor(LGBMRegressor, _DaskLGBMModel):

def __init__(
self,
*,
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
@@ -1485,6 +1487,7 @@ class DaskLGBMRanker(LGBMRanker, _DaskLGBMModel):

def __init__(
self,
*,
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
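
The bare ``*`` added to each constructor above makes every parameter after it keyword-only. A minimal sketch of that calling convention, using a toy function rather than LightGBM code:

    # toy example (not LightGBM code): a bare `*` in a signature makes every
    # parameter after it keyword-only
    def configure(*, boosting_type: str = "gbdt", num_leaves: int = 31) -> dict:
        return {"boosting_type": boosting_type, "num_leaves": num_leaves}

    configure(boosting_type="dart")  # OK: keyword arguments
    # configure("dart")              # TypeError: takes 0 positional arguments but 1 was given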
182 changes: 182 additions & 0 deletions python-package/lightgbm/sklearn.py
@@ -488,6 +488,7 @@ class LGBMModel(_LGBMModelBase):

def __init__(
self,
*,
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
@@ -745,7 +746,35 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]:
params : dict
Parameter names mapped to their values.
"""
# Based on: https://github.com/dmlc/xgboost/blob/bd92b1c9c0db3e75ec3dfa513e1435d518bb535d/python-package/xgboost/sklearn.py#L941
# which was based on: https://stackoverflow.com/questions/59248211
#
# `get_params()` flows like this:
#
# 0. Get parameters in subclass (self.__class__) first, by using inspect.
# 1. Get parameters in all parent classes (especially `LGBMModel`).
# 2. Get whatever was passed via `**kwargs`.
# 3. Merge them.
#
# This needs to accommodate being called recursively in the following
# inheritance graphs (and similar for classification and ranking):
#
# DaskLGBMRegressor -> LGBMRegressor -> LGBMModel -> BaseEstimator
# (custom subclass) -> LGBMRegressor -> LGBMModel -> BaseEstimator
# LGBMRegressor -> LGBMModel -> BaseEstimator
# (custom subclass) -> LGBMModel -> BaseEstimator
# LGBMModel -> BaseEstimator
#
params = super().get_params(deep=deep)
cp = copy.copy(self)
# If the immediate parent defines get_params(), use that.
if callable(getattr(cp.__class__.__bases__[0], "get_params", None)):
    cp.__class__ = cp.__class__.__bases__[0]
# Otherwise, skip it and assume the next class will have it.
# This is here primarily for cases where the first class in MRO is a scikit-learn mixin.
else:
    cp.__class__ = cp.__class__.__bases__[1]
params.update(cp.__class__.get_params(cp, deep))
params.update(self._other_params)
return params
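
To make the recursion described in the comments above concrete, here is a small sketch (not part of this diff; the subclass and its extra parameter are hypothetical) of how ``get_params()`` behaves for a custom subclass that only calls ``super().__init__()``:

    from lightgbm import LGBMRegressor

    class MyRegressor(LGBMRegressor):
        def __init__(self, my_extra_param: int = 5, **kwargs):
            self.my_extra_param = my_extra_param
            super().__init__(**kwargs)

    params = MyRegressor(num_leaves=15).get_params()
    assert params["my_extra_param"] == 5  # discovered by inspecting the subclass (step 0)
    assert params["num_leaves"] == 15     # merged in from the parent classes (steps 1 and 3)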

@@ -1285,6 +1314,57 @@ def feature_names_in_(self) -> None:
class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
"""LightGBM regressor."""

# NOTE: all args from LGBMModel.__init__() are intentionally repeated here for
# docs, help(), and tab completion.
def __init__(
self,
*,
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
learning_rate: float = 0.1,
n_estimators: int = 100,
subsample_for_bin: int = 200000,
objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
class_weight: Optional[Union[Dict, str]] = None,
min_split_gain: float = 0.0,
min_child_weight: float = 1e-3,
min_child_samples: int = 20,
subsample: float = 1.0,
subsample_freq: int = 0,
colsample_bytree: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
n_jobs: Optional[int] = None,
importance_type: str = "split",
**kwargs: Any,
) -> None:
super().__init__(
boosting_type=boosting_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
subsample_for_bin=subsample_for_bin,
objective=objective,
class_weight=class_weight,
min_split_gain=min_split_gain,
min_child_weight=min_child_weight,
min_child_samples=min_child_samples,
subsample=subsample,
subsample_freq=subsample_freq,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
random_state=random_state,
n_jobs=n_jobs,
importance_type=importance_type,
**kwargs,
)

__init__.__doc__ = LGBMModel.__init__.__doc__

def _more_tags(self) -> Dict[str, Any]:
# handle the case where RegressorMixin possibly provides _more_tags()
if callable(getattr(_LGBMRegressorBase, "_more_tags", None)):
@@ -1344,6 +1424,57 @@ def fit( # type: ignore[override]
class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
"""LightGBM classifier."""

# NOTE: all args from LGBMModel.__init__() are intentionally repeated here for
# docs, help(), and tab completion.
def __init__(
self,
*,
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
learning_rate: float = 0.1,
n_estimators: int = 100,
subsample_for_bin: int = 200000,
objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
class_weight: Optional[Union[Dict, str]] = None,
min_split_gain: float = 0.0,
min_child_weight: float = 1e-3,
min_child_samples: int = 20,
subsample: float = 1.0,
subsample_freq: int = 0,
colsample_bytree: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
n_jobs: Optional[int] = None,
importance_type: str = "split",
**kwargs: Any,
) -> None:
super().__init__(
boosting_type=boosting_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
subsample_for_bin=subsample_for_bin,
objective=objective,
class_weight=class_weight,
min_split_gain=min_split_gain,
min_child_weight=min_child_weight,
min_child_samples=min_child_samples,
subsample=subsample,
subsample_freq=subsample_freq,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
random_state=random_state,
n_jobs=n_jobs,
importance_type=importance_type,
**kwargs,
)

__init__.__doc__ = LGBMModel.__init__.__doc__

def _more_tags(self) -> Dict[str, Any]:
# handle the case where ClassifierMixin possibly provides _more_tags()
if callable(getattr(_LGBMClassifierBase, "_more_tags", None)):
@@ -1554,6 +1685,57 @@ class LGBMRanker(LGBMModel):
Please use this class mainly for training and applying ranking models in common sklearnish way.
"""

# NOTE: all args from LGBMModel.__init__() are intentionally repeated here for
# docs, help(), and tab completion.
def __init__(
self,
*,
boosting_type: str = "gbdt",
num_leaves: int = 31,
max_depth: int = -1,
learning_rate: float = 0.1,
n_estimators: int = 100,
subsample_for_bin: int = 200000,
objective: Optional[Union[str, _LGBM_ScikitCustomObjectiveFunction]] = None,
class_weight: Optional[Union[Dict, str]] = None,
min_split_gain: float = 0.0,
min_child_weight: float = 1e-3,
min_child_samples: int = 20,
subsample: float = 1.0,
subsample_freq: int = 0,
colsample_bytree: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
n_jobs: Optional[int] = None,
importance_type: str = "split",
**kwargs: Any,
) -> None:
super().__init__(
boosting_type=boosting_type,
num_leaves=num_leaves,
max_depth=max_depth,
learning_rate=learning_rate,
n_estimators=n_estimators,
subsample_for_bin=subsample_for_bin,
objective=objective,
class_weight=class_weight,
min_split_gain=min_split_gain,
min_child_weight=min_child_weight,
min_child_samples=min_child_samples,
subsample=subsample,
subsample_freq=subsample_freq,
colsample_bytree=colsample_bytree,
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
random_state=random_state,
n_jobs=n_jobs,
importance_type=importance_type,
**kwargs,
)

__init__.__doc__ = LGBMModel.__init__.__doc__

def fit( # type: ignore[override]
self,
X: _LGBM_ScikitMatrixLike,
36 changes: 26 additions & 10 deletions tests/python_package_test/test_dask.py
@@ -1373,26 +1373,42 @@ def test_machines_should_be_used_if_provided(task, cluster):


@pytest.mark.parametrize(
"classes",
"dask_est,sklearn_est",
[
(lgb.DaskLGBMClassifier, lgb.LGBMClassifier),
(lgb.DaskLGBMRegressor, lgb.LGBMRegressor),
(lgb.DaskLGBMRanker, lgb.LGBMRanker),
],
)
def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except_client_arg(classes):
dask_spec = inspect.getfullargspec(classes[0])
sklearn_spec = inspect.getfullargspec(classes[1])
def test_dask_classes_and_sklearn_equivalents_have_identical_constructors_except_client_arg(dask_est, sklearn_est):
dask_spec = inspect.getfullargspec(dask_est)
sklearn_spec = inspect.getfullargspec(sklearn_est)

# should not allow for any varargs
assert dask_spec.varargs == sklearn_spec.varargs
assert dask_spec.varargs is None

# the only varkw should be **kwargs,
# for pass-through to parent classes' __init__()
assert dask_spec.varkw == sklearn_spec.varkw
assert dask_spec.kwonlyargs == sklearn_spec.kwonlyargs
assert dask_spec.kwonlydefaults == sklearn_spec.kwonlydefaults
assert dask_spec.varkw == "kwargs"

# "client" should be the only different, and the final argument
assert dask_spec.args[:-1] == sklearn_spec.args
assert dask_spec.defaults[:-1] == sklearn_spec.defaults
assert dask_spec.args[-1] == "client"
assert dask_spec.defaults[-1] is None
assert dask_spec.kwonlyargs == [*sklearn_spec.kwonlyargs, "client"]

# default values for all constructor arguments should be identical
#
# NOTE: if LGBMClassifier / LGBMRanker / LGBMRegressor ever override
# any of LGBMModel's constructor arguments, this will need to be updated
assert dask_spec.kwonlydefaults == {**sklearn_spec.kwonlydefaults, "client": None}

# only positional argument should be 'self'
assert dask_spec.args == sklearn_spec.args
assert dask_spec.args == ["self"]
assert dask_spec.defaults is None

# get_params() should be identical, except for "client"
assert dask_est().get_params() == {**sklearn_est().get_params(), "client": None}
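
For readers unfamiliar with ``inspect.getfullargspec()``, here is a small standalone sketch (a toy class, not LightGBM code) of the fields the assertions above rely on:

    import inspect

    class ToyEstimator:
        # keyword-only constructor, shaped like the estimators under test
        def __init__(self, *, num_leaves=31, client=None, **kwargs):
            pass

    spec = inspect.getfullargspec(ToyEstimator.__init__)
    print(spec.args)            # ['self'] -- the only positional argument
    print(spec.defaults)        # None
    print(spec.varargs)         # None     -- no *args
    print(spec.varkw)           # 'kwargs'
    print(spec.kwonlyargs)      # ['num_leaves', 'client']
    print(spec.kwonlydefaults)  # {'num_leaves': 31, 'client': None}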


@pytest.mark.parametrize(