Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Petter Olsson committed Nov 14, 2024
1 parent e906daa commit 9ce6a3f
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 67 deletions.
Binary file added models/GLM_1_AutoML_1_20241114_102612
Binary file not shown.
1 change: 1 addition & 0 deletions scripts/dominostats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"R2": "N/A", "MSE": "N/A"}
134 changes: 67 additions & 67 deletions scripts/h2o_model_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,112 +13,112 @@
import mlflow
import mlflow.h2o


#Set train test split to 70
# Set train-test split to 70
n = 70

#read in data then split into train and test

# Read in the new dataset
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
data = pd.read_csv(path)
print('Read in {} rows of data'.format(data.shape[0]))

#Find all pearson correlations of numerical variables with quality
#corr_values = data.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
corr_values = data.corr(numeric_only=True).sort_values(by = 'DEFAULT')['DEFAULT'].drop('DEFAULT',axis=0)
# Find all Pearson correlations of numerical variables with the target variable, assuming it's 'DEFAULT'
corr_values = data.corr(numeric_only=True).sort_values(by='DEFAULT')['DEFAULT'].drop('DEFAULT', axis=0)

#Keep all variables with above a 8% pearson correlation
important_feats=corr_values[abs(corr_values)>0.08]
# Keep all variables with above an 8% Pearson correlation
important_feats = corr_values[abs(corr_values) > 0.08]

#Get data set up for model training and evaluation
# Prepare the dataset for model training and evaluation

#Drop NA rows
data = data.dropna(how='any',axis=0)
#Split df into inputs and target
#data = data[list(important_feats.keys())+['quality']]
data = data[list(important_feats.keys())+['DEFAULT']]
# Drop NA rows
data = data.dropna(how='any', axis=0)

train = data[0:round(len(data)*n/100)]
# Select important features and the target variable
data = data[list(important_feats.keys()) + ['DEFAULT']]

# Split the data into training and testing sets
train = data[0:round(len(data) * n / 100)]
test = data[train.shape[0]:]

print('H2O version -{}'.format(h2o.__version__))

#initailize local h2o
# Initialize local H2O
h2o.init()

# create a new MLFlow experiemnt
#mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME'))
# Set up a new MLFlow experiment
mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME') + " " + os.environ.get('MLFLOW_NAME'))

#Convert data to h2o frames
# Convert data to H2O frames
hTrain = h2o.H2OFrame(train)
hTest = h2o.H2OFrame(test)

# Identify predictors and response
x = hTrain.columns
#y = "quality"
y = "DEFAULT"
x.remove(y)

# Isolate target vasriable
hTrain[y] = hTrain[y]
hTest[y] = hTest[y]

# Train the model and log metrics with MLFlow
with mlflow.start_run():
# Set MLFlow tag to differenciate the model approaches
mlflow.set_tag("Model_Type", "H20 Automl")

mlflow.set_tag("Model_Type", "H2O AutoML")

# Run AutoML for 5 base models (limited to 1 min max runtime)
print('Training autoML model...')
print('Training AutoML model...')
aml = H2OAutoML(max_models=10, max_runtime_secs=30, sort_metric="r2")
aml.train(x=x, y=y, training_frame=hTrain)

# sns.histplot(np.array(aml.leader.predict(hTest)))
print('Evaluating model on validation data...')
best_gbm = aml.leader #get_best_model(criterion = 'mse', algorithm = 'gbm')
preds = best_gbm.predict(hTest)
print(best_gbm.r2(xval=True))
#View performance metrics and save them to domino stats!
r2 = round(best_gbm.r2(xval=True),3)
mse = round(best_gbm.mse(xval=True),3)
print("R2 Score: ", r2)
print("MSE: ", mse)
# Save the metrics in MLFlow
mlflow.log_metric("R2", r2)
mlflow.log_metric("MSE", mse)

#Code to write R2 value and MSE to dominostats value for population in Domino Jobs View
# Evaluate the model on the validation data
best_model = aml.leader
preds = best_model.predict(hTest)

# Try to get R² and MSE, handle NoneType if not available
r2 = best_model.r2(xval=True)
mse = best_model.mse(xval=True)

if r2 is not None:
r2 = round(r2, 3)
else:
print("R² metric not available for this model.")
r2 = "N/A"

if mse is not None:
mse = round(mse, 3)
else:
print("MSE metric not available for this model.")
mse = "N/A"

print("R2 Score:", r2)
print("MSE:", mse)

# Log metrics in MLFlow if available
if isinstance(r2, (int, float)):
mlflow.log_metric("R2", r2)
if isinstance(mse, (int, float)):
mlflow.log_metric("MSE", mse)

# Save the metrics for Domino stats
with open('dominostats.json', 'w') as f:
f.write(json.dumps({"R2": r2,
"MSE": mse}))
f.write(json.dumps({"R2": r2, "MSE": mse}))

#Write results to dataframe for viz
#results = pd.DataFrame({'Actuals':test.quality.reset_index()['quality'], 'Predictions': preds.as_data_frame()['predict']})
results = pd.DataFrame({'Actuals':test.quality.reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})
# Write results to a dataframe for visualization
results = pd.DataFrame({'Actuals': test.DEFAULT.reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})

print('Creating visualizations...')
#Scatterplot
fig1, ax1 = plt.subplots(figsize=(10,6))
plt.title('H2o Actuals vs Predictions Scatter Plot')
sns.regplot(
data=results,
x = 'Actuals',
y = 'Predictions',
order = 3)
# Scatter plot
fig1, ax1 = plt.subplots(figsize=(10, 6))
plt.title('H2O Actuals vs Predictions Scatter Plot')
sns.regplot(data=results, x='Actuals', y='Predictions', order=3)
plt.savefig('/mnt/artifacts/actual_v_pred_scatter.png')
mlflow.log_figure(fig1, 'actual_v_pred_scatter.png')

#Histogram
fig2, ax2 = plt.subplots(figsize=(10,6))
plt.title('h2o Actuals vs Predictions Histogram')
plt.xlabel('Quality')
sns.histplot(results, bins=6, multiple = 'dodge', palette = 'coolwarm')
# Histogram
fig2, ax2 = plt.subplots(figsize=(10, 6))
plt.title('H2O Actuals vs Predictions Histogram')
plt.xlabel('Default Payment')
sns.histplot(results, bins=6, multiple='dodge', palette='coolwarm')
plt.savefig('/mnt/artifacts/actual_v_pred_hist.png')
mlflow.log_figure(fig2, 'actual_v_pred_hist.png')

#Saving trained model to serialized pickle object
h2o.save_model(best_gbm, path ='/mnt/code/models')

mlflow.end_run()
# Save the trained model
h2o.save_model(best_model, path='/mnt/code/models')

print('Script complete!')
mlflow.end_run()
print('Script complete!')

0 comments on commit 9ce6a3f

Please sign in to comment.