Pushed from Domino: https://ykb.domino-eval.com/workspace/petter/mlop…

…s-best-practices?executionId=6734940e0b77b232dfd8c697
dominodatalab · Nov 14, 2024 · 9ce6a3f · 9ce6a3f
1 parent e906daa
commit 9ce6a3f
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 67 deletions.
diff --git a/models/GLM_1_AutoML_1_20241114_102612 b/models/GLM_1_AutoML_1_20241114_102612
diff --git a/scripts/dominostats.json b/scripts/dominostats.json
@@ -0,0 +1 @@
+{"R2": "N/A", "MSE": "N/A"}
diff --git a/scripts/h2o_model_train.py b/scripts/h2o_model_train.py
@@ -13,112 +13,112 @@
 import mlflow
 import mlflow.h2o
 
-
-#Set train test split to 70
+# Set train-test split to 70
 n = 70
 
-#read in data then split into train and test
-
+# Read in the new dataset
 path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
 data = pd.read_csv(path)
 print('Read in {} rows of data'.format(data.shape[0]))
 
-#Find all pearson correlations of numerical variables with quality
-#corr_values = data.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
-corr_values = data.corr(numeric_only=True).sort_values(by = 'DEFAULT')['DEFAULT'].drop('DEFAULT',axis=0)
+# Find all Pearson correlations of numerical variables with the target variable, assuming it's 'DEFAULT'
+corr_values = data.corr(numeric_only=True).sort_values(by='DEFAULT')['DEFAULT'].drop('DEFAULT', axis=0)
 
-#Keep all variables with above a 8% pearson correlation
-important_feats=corr_values[abs(corr_values)>0.08]
+# Keep all variables with above an 8% Pearson correlation
+important_feats = corr_values[abs(corr_values) > 0.08]
 
-#Get data set up for model training and evaluation
+# Prepare the dataset for model training and evaluation
 
-#Drop NA rows
-data = data.dropna(how='any',axis=0)
-#Split df into inputs and target
-#data = data[list(important_feats.keys())+['quality']]
-data = data[list(important_feats.keys())+['DEFAULT']]
+# Drop NA rows
+data = data.dropna(how='any', axis=0)
 
-train = data[0:round(len(data)*n/100)]
+# Select important features and the target variable
+data = data[list(important_feats.keys()) + ['DEFAULT']]
+
+# Split the data into training and testing sets
+train = data[0:round(len(data) * n / 100)]
 test = data[train.shape[0]:]
 
 print('H2O version -{}'.format(h2o.__version__))
 
-#initailize local h2o
+# Initialize local H2O
 h2o.init()
 
-# create a new MLFlow experiemnt
-#mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME'))
+# Set up a new MLFlow experiment
 mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME') + " " + os.environ.get('MLFLOW_NAME'))
 
-#Convert data to h2o frames
+# Convert data to H2O frames
 hTrain = h2o.H2OFrame(train)
 hTest = h2o.H2OFrame(test)
 
 # Identify predictors and response
 x = hTrain.columns
-#y = "quality"
 y = "DEFAULT"
 x.remove(y)
 
-# Isolate target vasriable
-hTrain[y] = hTrain[y]
-hTest[y] = hTest[y]
-
+# Train the model and log metrics with MLFlow
 with mlflow.start_run():
-    # Set MLFlow tag to differenciate the model approaches
-    mlflow.set_tag("Model_Type", "H20 Automl")
-
+    mlflow.set_tag("Model_Type", "H2O AutoML")
+
     # Run AutoML for 5 base models (limited to 1 min max runtime)
-    print('Training autoML model...')
+    print('Training AutoML model...')
     aml = H2OAutoML(max_models=10, max_runtime_secs=30, sort_metric="r2")
     aml.train(x=x, y=y, training_frame=hTrain)
 
-    # sns.histplot(np.array(aml.leader.predict(hTest)))
-    print('Evaluating model on validation data...')
-    best_gbm = aml.leader #get_best_model(criterion = 'mse', algorithm = 'gbm') 
-    preds = best_gbm.predict(hTest)
-    print(best_gbm.r2(xval=True))
-    #View performance metrics and save them to domino stats!
-    r2 = round(best_gbm.r2(xval=True),3)
-    mse = round(best_gbm.mse(xval=True),3)
-    print("R2 Score: ", r2)
-    print("MSE: ", mse)
-    # Save the metrics in MLFlow
-    mlflow.log_metric("R2", r2)
-    mlflow.log_metric("MSE", mse)
-
-    #Code to write R2 value and MSE to dominostats value for population in Domino Jobs View
+    # Evaluate the model on the validation data
+    best_model = aml.leader
+    preds = best_model.predict(hTest)
+
+    # Try to get R² and MSE, handle NoneType if not available
+    r2 = best_model.r2(xval=True)
+    mse = best_model.mse(xval=True)
+
+    if r2 is not None:
+        r2 = round(r2, 3)
+    else:
+        print("R² metric not available for this model.")
+        r2 = "N/A"
+
+    if mse is not None:
+        mse = round(mse, 3)
+    else:
+        print("MSE metric not available for this model.")
+        mse = "N/A"
+
+    print("R2 Score:", r2)
+    print("MSE:", mse)
+
+    # Log metrics in MLFlow if available
+    if isinstance(r2, (int, float)):
+        mlflow.log_metric("R2", r2)
+    if isinstance(mse, (int, float)):
+        mlflow.log_metric("MSE", mse)
+
+    # Save the metrics for Domino stats
     with open('dominostats.json', 'w') as f:
-        f.write(json.dumps({"R2": r2,
-                           "MSE": mse}))
+        f.write(json.dumps({"R2": r2, "MSE": mse}))
 
-    #Write results to dataframe for viz    
-    #results = pd.DataFrame({'Actuals':test.quality.reset_index()['quality'], 'Predictions': preds.as_data_frame()['predict']})
-    results = pd.DataFrame({'Actuals':test.quality.reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})
+    # Write results to a dataframe for visualization
+    results = pd.DataFrame({'Actuals': test.DEFAULT.reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})
 
     print('Creating visualizations...')
-    #Scatterplot
-    fig1, ax1 = plt.subplots(figsize=(10,6))
-    plt.title('H2o Actuals vs Predictions Scatter Plot')
-    sns.regplot( 
-        data=results,
-        x = 'Actuals',
-        y = 'Predictions',
-        order = 3)
+    # Scatter plot
+    fig1, ax1 = plt.subplots(figsize=(10, 6))
+    plt.title('H2O Actuals vs Predictions Scatter Plot')
+    sns.regplot(data=results, x='Actuals', y='Predictions', order=3)
     plt.savefig('/mnt/artifacts/actual_v_pred_scatter.png')
     mlflow.log_figure(fig1, 'actual_v_pred_scatter.png')
 
-    #Histogram
-    fig2, ax2 = plt.subplots(figsize=(10,6))
-    plt.title('h2o Actuals vs Predictions Histogram')
-    plt.xlabel('Quality')
-    sns.histplot(results, bins=6, multiple = 'dodge', palette = 'coolwarm')
+    # Histogram
+    fig2, ax2 = plt.subplots(figsize=(10, 6))
+    plt.title('H2O Actuals vs Predictions Histogram')
+    plt.xlabel('Default Payment')
+    sns.histplot(results, bins=6, multiple='dodge', palette='coolwarm')
     plt.savefig('/mnt/artifacts/actual_v_pred_hist.png')
     mlflow.log_figure(fig2, 'actual_v_pred_hist.png')
 
-    #Saving trained model to serialized pickle object 
-    h2o.save_model(best_gbm, path ='/mnt/code/models')
-
-mlflow.end_run()
+    # Save the trained model
+    h2o.save_model(best_model, path='/mnt/code/models')
 
-print('Script complete!')
+mlflow.end_run()
+print('Script complete!')