Update schemas for xgboost 2.0.

Preprocess test data as needed. Loosen version constraint on xgboost accordingly. Signed-off-by: Avi Shinnar <[email protected]>
IBM · Jan 29, 2024 · 72d1063 · 72d1063
1 parent c7a2e3c
commit 72d1063
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 3 deletions.
diff --git a/lale/lib/xgboost/xgb_classifier.py b/lale/lib/xgboost/xgb_classifier.py
@@ -916,5 +916,48 @@ def score(self, X, y):
         set_as_available=True,
     )
 
+if xgboost_version is not None and xgboost_version >= version.Version("2.0"):
+    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
+    XGBClassifier = XGBClassifier.customize_schema(
+        n_estimators={
+            "description": "Number of trees to fit.",
+            "anyOf": [
+                {
+                    "type": "integer",
+                    "default": 200,
+                    "minimumForOptimizer": 50,
+                    "maximumForOptimizer": 1000,
+                },
+                {"enum": [None]},
+            ],
+        },
+        device={
+            "description": """Device ordinal""",
+            "anyOf": [
+                {"enum": ["cpu", "cuda", "gpu"]},
+                {"enum": [None]},
+            ],
+            "default": None,
+        },
+        multi_strategy={
+            "description": """The strategy used for training multi-target models,
+             including multi-target regression and multi-class classification.
+             See Multiple Outputs for more information.""",
+            "anyOf": [
+                {
+                    "description": "One model for each target.",
+                    "enum": ["one_output_per_tree"],
+                },
+                {
+                    "description": "Use multi-target trees.",
+                    "enum": ["multi_output_tree"],
+                },
+                {"enum": [None]},
+            ],
+            "default": None,
+        },
+        set_as_available=True,
+    )
+
 
 lale.docstrings.set_docstrings(XGBClassifier)
diff --git a/lale/lib/xgboost/xgb_regressor.py b/lale/lib/xgboost/xgb_regressor.py
@@ -860,5 +860,47 @@ def score(self, X, y):
         set_as_available=True,
     )
 
+if xgboost_version is not None and xgboost_version >= version.Version("2.0"):
+    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
+    XGBRegressor = XGBRegressor.customize_schema(
+        n_estimators={
+            "description": "Number of trees to fit.",
+            "anyOf": [
+                {
+                    "type": "integer",
+                    "default": 200,
+                    "minimumForOptimizer": 50,
+                    "maximumForOptimizer": 1000,
+                },
+                {"enum": [None]},
+            ],
+        },
+        device={
+            "description": """Device ordinal""",
+            "anyOf": [
+                {"enum": ["cpu", "cuda", "gpu"]},
+                {"enum": [None]},
+            ],
+            "default": None,
+        },
+        multi_strategy={
+            "description": """The strategy used for training multi-target models,
+             including multi-target regression and multi-class classification.
+             See Multiple Outputs for more information.""",
+            "anyOf": [
+                {
+                    "description": "One model for each target.",
+                    "enum": ["one_output_per_tree"],
+                },
+                {
+                    "description": "Use multi-target trees.",
+                    "enum": ["multi_output_tree"],
+                },
+                {"enum": [None]},
+            ],
+            "default": None,
+        },
+        set_as_available=True,
+    )
 
 lale.docstrings.set_docstrings(XGBRegressor)
diff --git a/setup.py b/setup.py
@@ -67,7 +67,7 @@
 extras_require = {
     "full": [
         "mystic",
-        "xgboost<=1.5.1",
+        "xgboost<2.1.0",
         "lightgbm<4.0.0",
         "snapml>=1.7.0rc3,<1.12.0",
         "liac-arff>=2.4.0",

diff --git a/test/test_core_pipeline.py b/test/test_core_pipeline.py
@@ -829,7 +829,7 @@ def test_sklearn_diabetes(self):
     def test_openml_creditg(self):
         # classification, categoricals+numbers incl. string, no missing values
         (orig_train_X, orig_train_y), _ = lale.datasets.openml.fetch(
-            "credit-g", "classification", preprocess=False
+            "credit-g", "classification", preprocess=True
         )
         subsample_X, _, subsample_y, _ = train_test_split(
             orig_train_X, orig_train_y, train_size=0.05
@@ -859,7 +859,7 @@ def test_missing_boston(self):
     def test_missing_creditg(self):
         # classification, categoricals+numbers incl. string, synth. missing
         (orig_train_X, orig_train_y), _ = lale.datasets.openml.fetch(
-            "credit-g", "classification", preprocess=False
+            "credit-g", "classification", preprocess=True
         )
         subsample_X, _, subsample_y, _ = train_test_split(
             orig_train_X, orig_train_y, train_size=0.05