From 5d7e1738007742f44176d747759c9dbd1092fc16 Mon Sep 17 00:00:00 2001
From: Ryan Soley <ryan.e.soley@gmail.com>
Date: Tue, 27 Feb 2024 11:29:23 -0500
Subject: [PATCH] add `H2OGeneralizedLinearEstimator` schema

---
 rubicon_ml/schema/registry.py                 |   3 +
 .../h2o__H2OGeneralizedLinearEstimator.yaml   | 156 ++++++++++++++++++
 tests/integration/test_schema.py              |  14 +-
 3 files changed, 169 insertions(+), 4 deletions(-)
 create mode 100644 rubicon_ml/schema/schema/h2o__H2OGeneralizedLinearEstimator.yaml

diff --git a/rubicon_ml/schema/registry.py b/rubicon_ml/schema/registry.py
index 05c0fc29..5b436709 100644
--- a/rubicon_ml/schema/registry.py
+++ b/rubicon_ml/schema/registry.py
@@ -6,6 +6,9 @@
 import yaml
 
 RUBICON_SCHEMA_REGISTRY = {
+    "h2o__H2OGeneralizedLinearEstimator": lambda: _load_schema(
+        os.path.join("schema", "h2o__H2OGeneralizedLinearEstimator.yaml")
+    ),
     "h2o__H2OGradientBoostingEstimator": lambda: _load_schema(
         os.path.join("schema", "h2o__H2OGradientBoostingEstimator.yaml")
     ),
diff --git a/rubicon_ml/schema/schema/h2o__H2OGeneralizedLinearEstimator.yaml b/rubicon_ml/schema/schema/h2o__H2OGeneralizedLinearEstimator.yaml
new file mode 100644
index 00000000..5c234f2b
--- /dev/null
+++ b/rubicon_ml/schema/schema/h2o__H2OGeneralizedLinearEstimator.yaml
@@ -0,0 +1,156 @@
+name: h2o__H2OGeneralizedLinearEstimator
+version: 1.0.0
+
+compatibility:
+  h2o:
+    max_version:
+    min_version: 3.44.0.1
+docs_url: https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/modeling.html#h2ogeneralizedlinearestimator
+
+parameters:
+  - name: alpha
+    value_attr: alpha
+  - name: auc_type
+    value_attr: auc_type
+  - name: balance_classes
+    value_attr: balance_classes
+  - name: beta_constraints
+    value_attr: beta_constraints
+  - name: beta_epsilon
+    value_attr: beta_epsilon
+  - name: build_null_model
+    value_attr: build_null_model
+  - name: calc_like
+    value_attr: calc_like
+  - name: class_sampling_factors
+    value_attr: class_sampling_factors
+  - name: cold_start
+    value_attr: cold_start
+  - name: compute_p_values
+    value_attr: compute_p_values
+  - name: custom_metric_func
+    value_attr: custom_metric_func
+  - name: dispersion_epsilon
+    value_attr: dispersion_epsilon
+  - name: dispersion_learning_rate
+    value_attr: dispersion_learning_rate
+  - name: dispersion_parameter_method
+    value_attr: dispersion_parameter_method
+  - name: early_stopping
+    value_attr: early_stopping
+  - name: export_checkpoints_dir
+    value_attr: export_checkpoints_dir
+  - name: family
+    value_attr: family
+  - name: fix_dispersion_parameter
+    value_attr: fix_dispersion_parameter
+  - name: fix_tweedie_variance_power
+    value_attr: fix_tweedie_variance_power
+  - name: fold_assignment
+    value_attr: fold_assignment
+  - name: fold_column
+    value_attr: fold_column
+  - name: gainslift_bins
+    value_attr: gainslift_bins
+  - name: generate_scoring_history
+    value_attr: generate_scoring_history
+  - name: generate_variable_inflation_factors
+    value_attr: generate_variable_inflation_factors
+  - name: gradient_epsilon
+    value_attr: gradient_epsilon
+  - name: HGLM
+    value_attr: HGLM
+  - name: ignore_const_cols
+    value_attr: ignore_const_cols
+  - name: ignored_columns
+    value_attr: ignored_columns
+  - name: influence
+    value_attr: influence
+  - name: init_dispersion_parameter
+    value_attr: init_dispersion_parameter
+  - name: interaction_pairs
+    value_attr: interaction_pairs
+  - name: interactions
+    value_attr: interactions
+  - name: intercept
+    value_attr: intercept
+  - name: keep_cross_validation_fold_assignment
+    value_attr: keep_cross_validation_fold_assignment
+  - name: keep_cross_validation_models
+    value_attr: keep_cross_validation_models
+  - name: keep_cross_validation_predictions
+    value_attr: keep_cross_validation_predictions
+  - name: lambda_
+    value_attr: lambda_
+  - name: lambda_min_ratio
+    value_attr: lambda_min_ratio
+  - name: lambda_search
+    value_attr: lambda_search
+  - name: link
+    value_attr: link
+  - name: max_active_predictors
+    value_attr: max_active_predictors
+  - name: max_after_balance_size
+    value_attr: max_after_balance_size
+  - name: max_confusion_matrix_size
+    value_attr: max_confusion_matrix_size
+  - name: max_iterations
+    value_attr: max_iterations
+  - name: max_iterations_dispersion
+    value_attr: max_iterations_dispersion
+  - name: max_runtime_secs
+    value_attr: max_runtime_secs
+  - name: missing_values_handling
+    value_attr: missing_values_handling
+  - name: nfolds
+    value_attr: nfolds
+  - name: nlambdas
+    value_attr: nlambdas
+  - name: non_negative
+    value_attr: non_negative
+  - name: obj_reg
+    value_attr: obj_reg
+  - name: objective_epsilon
+    value_attr: objective_epsilon
+  - name: offset_column
+    value_attr: offset_column
+  - name: prior
+    value_attr: prior
+  - name: rand_family
+    value_attr: rand_family
+  - name: rand_link
+    value_attr: rand_link
+  - name: random_columns
+    value_attr: random_columns
+  - name: remove_collinear_columns
+    value_attr: remove_collinear_columns
+  - name: response_column
+    value_attr: response_column
+  - name: score_each_iteration
+    value_attr: score_each_iteration
+  - name: score_iteration_interval
+    value_attr: score_iteration_interval
+  - name: seed
+    value_attr: seed
+  - name: solver
+    value_attr: solver
+  - name: standardize
+    value_attr: standardize
+  - name: startval
+    value_attr: startval
+  - name: stopping_metric
+    value_attr: stopping_metric
+  - name: stopping_rounds
+    value_attr: stopping_rounds
+  - name: stopping_tolerance
+    value_attr: stopping_tolerance
+  - name: theta
+    value_attr: theta
+  - name: tweedie_epsilon
+    value_attr: tweedie_epsilon
+  - name: tweedie_link_power
+    value_attr: tweedie_link_power
+  - name: tweedie_variance_power
+    value_attr: tweedie_variance_power
+  - name: weights_column
+    value_attr: weights_column
diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py
index 699ae05e..9a0ec663 100644
--- a/tests/integration/test_schema.py
+++ b/tests/integration/test_schema.py
@@ -3,12 +3,16 @@
 import pytest
 from h2o import H2OFrame
 from h2o.estimators.gbm import H2OGradientBoostingEstimator
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
 from lightgbm import LGBMClassifier, LGBMRegressor
 from sklearn.ensemble import RandomForestClassifier
 from xgboost import XGBClassifier, XGBRegressor
 from xgboost.dask import DaskXGBClassifier, DaskXGBRegressor
 
-H2O_SCHEMA_CLS = [H2OGradientBoostingEstimator]
+H2O_SCHEMA_CLS = [
+    H2OGeneralizedLinearEstimator,
+    H2OGradientBoostingEstimator,
+]
 PANDAS_SCHEMA_CLS = [
     LGBMClassifier,
     LGBMRegressor,
@@ -39,7 +43,7 @@ def _train_and_log(X, y, schema_cls, rubicon_project):
         y=target_name,
     )
 
-    rubicon_project.log_with_schema(model)
+    return rubicon_project.log_with_schema(model)
 
 
 @pytest.mark.integration
@@ -83,10 +87,12 @@ def test_estimator_schema_fit_dask_df(
 
 @pytest.mark.integration
 @pytest.mark.parametrize("schema_cls", H2O_SCHEMA_CLS)
-def test_estimator_h2o_schema_fit_df(schema_cls, make_classification_df, rubicon_project):
+def test_estimator_h2o_schema_train(schema_cls, make_classification_df, rubicon_project):
     X, y = make_classification_df
     y = y > y.mean()
 
     h2o.init(nthreads=-1)
 
-    _train_and_log(X, y, schema_cls, rubicon_project)
+    experiment = _train_and_log(X, y, schema_cls, rubicon_project)
+
+    assert len(rubicon_project.schema_["parameters"]) == len(experiment.parameters())