
Commit

Updated dataset path in the /scripts dir as it is now static
Petter Olsson committed Nov 17, 2024
1 parent b704e2f commit 5767504
Showing 6 changed files with 137 additions and 125 deletions.
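All six scripts make the same change: the dataset path is no longer assembled from the DOMINO_PROJECT_NAME environment variable but points at a fixed location. A minimal Python sketch of the before/after pattern, for illustration only (the fallback project name here is a placeholder, not part of the commit):

import os

# Old pattern: path derived from the Domino project name at run time
old_path = '/mnt/data/{}/credit_card_default.csv'.format(
    os.environ.get('DOMINO_PROJECT_NAME', 'example-project'))  # placeholder fallback

# New pattern: the dataset now lives at a static, project-independent location
new_path = '/mnt/data/mlops-best-practices/credit_card_default.csv'

print(old_path)
print(new_path)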
118 changes: 65 additions & 53 deletions scripts/.ipynb_checkpoints/R_model_train-checkpoint.R
@@ -1,67 +1,79 @@
library(mlflow)
library(jsonlite)

print("Reading in data")
project_name <- Sys.getenv('DOMINO_PROJECT_NAME')
path <- paste('/mnt/data/',project_name,'/credit_card_default.csv')
path <- paste('/mnt/data/mlops-best-practices/credit_card_default.csv')
path <- gsub(" ", "", path, fixed = TRUE)
data <- read.csv(file=path)
data <- read.csv(file = path)
head(data)

#mlflow_set_experiment(experiment_name=paste(Sys.getenv('DOMINO_PROJECT_NAME'), Sys.getenv('DOMINO_STARTING_USERNAME')))
mlflow_set_experiment(experiment_name = paste0(Sys.getenv('DOMINO_PROJECT_NAME'), " ", Sys.getenv('DOMINO_STARTING_USERNAME'), " ", Sys.getenv('MLFLOW_NAME')))

#data$is_red <- as.integer(data$type != 'white')

data <-na.omit(data)
dim(data)[1]-sum(complete.cases(data))

train <-data[sample(nrow(data), round(dim(data)[1]*0.75)),]
# test <- data[(round(dim(data)[1]*0.75)+1):dim(data)[1], 2:dim(data)[2]]
test <- data[(data$id %in% train$id)==FALSE,]
train <- subset(train, select = -c(DEFAULT) )
test <- subset(test, select = -c(DEFAULT) )

train_matrix <- as.matrix(train)
test_matrix <- as.matrix(test)
label_matrix <- as.matrix(train$DEFAULT)
test_lab_matrix <- as.matrix(test$DEFAULT)

dim(train)+dim(test)
# Verify the renaming
print("Columns in data after renaming:")
print(colnames(data))

with(mlflow_start_run(), {
mlflow_set_tag("Model_Type", "R")
print("Training Model")

lm_model <- lm(formula = DEFAULT ~., data = train)
lm_model


RSQUARE = function(y_actual,y_predict){
cor(y_actual,y_predict)^2
}
# Define MLflow experiment
mlflow_set_experiment(experiment_name = paste0(Sys.getenv('DOMINO_PROJECT_NAME'), " ", Sys.getenv('DOMINO_STARTING_USERNAME'), " ", Sys.getenv('MLFLOW_NAME')))

preds_lm <- predict(lm_model, newdata = test)
# Remove missing values
data <- na.omit(data)
print(paste("Number of rows with missing values removed:", dim(data)[1] - sum(complete.cases(data))))

rsquared_lm <-round(RSQUARE(preds_lm, test$DEFAULT),3)
print(rsquared_lm[1])
# Split data into training and testing sets
set.seed(123) # Set seed for reproducibility
train <- data[sample(nrow(data), round(dim(data)[1] * 0.75)), ]
test <- data[!(rownames(data) %in% rownames(train)), ]

#mse
mse_lm<- round(mean((test_lab_matrix - preds_lm)^2),3)
print(mse_lm)
# Verify that the train and test sets include the "DEFAULT" column
if (!("DEFAULT" %in% colnames(train))) {
stop("Column 'DEFAULT' is not present in the training set.")
}

mlflow_log_metric("R2", rsquared_lm[1])
mlflow_log_metric("MSE", mse_lm)
# Define target and feature columns
target_variable <- "DEFAULT"
features <- setdiff(names(data), target_variable)

diagnostics = list("R2" = rsquared_lm[1],
"MSE"=mse_lm)
library(jsonlite)
fileConn<-file("dominostats.json")
writeLines(toJSON(diagnostics), fileConn)
close(fileConn)
train_matrix <- as.matrix(train[, features])
test_matrix <- as.matrix(test[, features])
label_matrix <- as.matrix(train[[target_variable]])
test_lab_matrix <- as.matrix(test[[target_variable]])

save(lm_model, file="/mnt/code/models/R_linear_model.Rda")
})
dim(train) + dim(test)

# install.packages("SHAPforxgboost")
# install.packages("SHAPforxgboost")
# library("SHAPforxgboost")
# shap_values <- shap.values(xgb_model = mod, X_train = dataX)
# Start MLflow run
with(mlflow_start_run(), {
mlflow_set_tag("Model_Type", "R")
print("Training Model")

# Train the model (update formula for new dataset)
lm_model <- lm(formula = as.formula(paste(target_variable, "~ .")), data = train)
print(lm_model)

# Define RSQUARE function
RSQUARE <- function(y_actual, y_predict) {
cor(y_actual, y_predict)^2
}

# Predict and calculate metrics
preds_lm <- predict(lm_model, newdata = test)

rsquared_lm <- round(RSQUARE(test[[target_variable]], preds_lm), 3)
print(rsquared_lm)

# Mean Squared Error
mse_lm <- round(mean((test_lab_matrix - preds_lm)^2), 3)
print(mse_lm)

# Log metrics to MLflow
mlflow_log_metric("R2", rsquared_lm)
mlflow_log_metric("MSE", mse_lm)

# Save diagnostics to JSON
diagnostics <- list("R2" = rsquared_lm, "MSE" = mse_lm)
fileConn <- file("dominostats.json")
writeLines(toJSON(diagnostics), fileConn)
close(fileConn)

# Save model
save(lm_model, file = "/mnt/code/models/R_linear_model.Rda")
})
136 changes: 68 additions & 68 deletions scripts/.ipynb_checkpoints/h2o_model_train-checkpoint.py
@@ -13,112 +13,112 @@
import mlflow
import mlflow.h2o


#Set train test split to 70
# Set train-test split to 70
n = 70

#read in data then split into train and test

path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
# Read in the new dataset
path = str('/mnt/data/mlops-best-practices/credit_card_default.csv')
data = pd.read_csv(path)
print('Read in {} rows of data'.format(data.shape[0]))

#Find all pearson correlations of numerical variables with quality
#corr_values = data.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
corr_values = data.corr(numeric_only=True).sort_values(by = 'DEFAULT')['DEFAULT'].drop('DEFAULT',axis=0)
# Find all Pearson correlations of numerical variables with the target variable, assuming it's 'DEFAULT'
corr_values = data.corr(numeric_only=True).sort_values(by='DEFAULT')['DEFAULT'].drop('DEFAULT', axis=0)

#Keep all variables with above a 8% pearson correlation
important_feats=corr_values[abs(corr_values)>0.08]
# Keep all variables with above an 8% Pearson correlation
important_feats = corr_values[abs(corr_values) > 0.08]

#Get data set up for model training and evaluation
# Prepare the dataset for model training and evaluation

#Drop NA rows
data = data.dropna(how='any',axis=0)
#Split df into inputs and target
#data = data[list(important_feats.keys())+['quality']]
data = data[list(important_feats.keys())+['DEFAULT']]
# Drop NA rows
data = data.dropna(how='any', axis=0)

train = data[0:round(len(data)*n/100)]
# Select important features and the target variable
data = data[list(important_feats.keys()) + ['DEFAULT']]

# Split the data into training and testing sets
train = data[0:round(len(data) * n / 100)]
test = data[train.shape[0]:]

print('H2O version -{}'.format(h2o.__version__))

#initailize local h2o
# Initialize local H2O
h2o.init()

# create a new MLFlow experiemnt
#mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME'))
# Set up a new MLFlow experiment
mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME') + " " + os.environ.get('MLFLOW_NAME'))

#Convert data to h2o frames
# Convert data to H2O frames
hTrain = h2o.H2OFrame(train)
hTest = h2o.H2OFrame(test)

# Identify predictors and response
x = hTrain.columns
#y = "quality"
y = "DEFAULT"
x.remove(y)

# Isolate target variable
hTrain[y] = hTrain[y]
hTest[y] = hTest[y]

# Train the model and log metrics with MLFlow
with mlflow.start_run():
# Set MLFlow tag to differentiate the model approaches
mlflow.set_tag("Model_Type", "H20 Automl")

mlflow.set_tag("Model_Type", "H2O AutoML")

# Run AutoML for 5 base models (limited to 1 min max runtime)
print('Training autoML model...')
print('Training AutoML model...')
aml = H2OAutoML(max_models=10, max_runtime_secs=30, sort_metric="r2")
aml.train(x=x, y=y, training_frame=hTrain)

# sns.histplot(np.array(aml.leader.predict(hTest)))
print('Evaluating model on validation data...')
best_gbm = aml.leader #get_best_model(criterion = 'mse', algorithm = 'gbm')
preds = best_gbm.predict(hTest)
print(best_gbm.r2(xval=True))
#View performance metrics and save them to domino stats!
r2 = round(best_gbm.r2(xval=True),3)
mse = round(best_gbm.mse(xval=True),3)
print("R2 Score: ", r2)
print("MSE: ", mse)
# Save the metrics in MLFlow
mlflow.log_metric("R2", r2)
mlflow.log_metric("MSE", mse)

#Code to write R2 value and MSE to dominostats value for population in Domino Jobs View
# Evaluate the model on the validation data
best_model = aml.leader
preds = best_model.predict(hTest)

# Try to get R² and MSE, handle NoneType if not available
r2 = best_model.r2(xval=True)
mse = best_model.mse(xval=True)

if r2 is not None:
r2 = round(r2, 3)
else:
print("R² metric not available for this model.")
r2 = "N/A"

if mse is not None:
mse = round(mse, 3)
else:
print("MSE metric not available for this model.")
mse = "N/A"

print("R2 Score:", r2)
print("MSE:", mse)

# Log metrics in MLFlow if available
if isinstance(r2, (int, float)):
mlflow.log_metric("R2", r2)
if isinstance(mse, (int, float)):
mlflow.log_metric("MSE", mse)

# Save the metrics for Domino stats
with open('dominostats.json', 'w') as f:
f.write(json.dumps({"R2": r2,
"MSE": mse}))
f.write(json.dumps({"R2": r2, "MSE": mse}))

#Write results to dataframe for viz
#results = pd.DataFrame({'Actuals':test.quality.reset_index()['quality'], 'Predictions': preds.as_data_frame()['predict']})
results = pd.DataFrame({'Actuals':test.quality.reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})
# Write results to a dataframe for visualization
results = pd.DataFrame({'Actuals': test.DEFAULT.reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})

print('Creating visualizations...')
#Scatterplot
fig1, ax1 = plt.subplots(figsize=(10,6))
plt.title('H2o Actuals vs Predictions Scatter Plot')
sns.regplot(
data=results,
x = 'Actuals',
y = 'Predictions',
order = 3)
# Scatter plot
fig1, ax1 = plt.subplots(figsize=(10, 6))
plt.title('H2O Actuals vs Predictions Scatter Plot')
sns.regplot(data=results, x='Actuals', y='Predictions', order=3)
plt.savefig('/mnt/artifacts/actual_v_pred_scatter.png')
mlflow.log_figure(fig1, 'actual_v_pred_scatter.png')

#Histogram
fig2, ax2 = plt.subplots(figsize=(10,6))
plt.title('h2o Actuals vs Predictions Histogram')
plt.xlabel('Quality')
sns.histplot(results, bins=6, multiple = 'dodge', palette = 'coolwarm')
# Histogram
fig2, ax2 = plt.subplots(figsize=(10, 6))
plt.title('H2O Actuals vs Predictions Histogram')
plt.xlabel('Default Payment')
sns.histplot(results, bins=6, multiple='dodge', palette='coolwarm')
plt.savefig('/mnt/artifacts/actual_v_pred_hist.png')
mlflow.log_figure(fig2, 'actual_v_pred_hist.png')

#Saving trained model to serialized pickle object
h2o.save_model(best_gbm, path ='/mnt/code/models')

mlflow.end_run()
# Save the trained model
h2o.save_model(best_model, path='/mnt/code/models')

print('Script complete!')
mlflow.end_run()
print('Script complete!')
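The reworked H2O script now guards against H2O returning None for the cross-validated R2 and MSE before rounding, logging, and writing them out. A standalone sketch of that guard pattern, using hypothetical values in place of best_model.r2(xval=True) and best_model.mse(xval=True):

import json

def summarize_metric(value, ndigits=3):
    # Round the metric if the backend returned one; otherwise fall back to "N/A"
    return round(value, ndigits) if value is not None else "N/A"

# Hypothetical stand-ins for H2O's cross-validated metrics, which can be None
r2 = summarize_metric(0.412)
mse = summarize_metric(None)

diagnostics = {"R2": r2, "MSE": mse}
print(diagnostics)  # {'R2': 0.412, 'MSE': 'N/A'}

# Only numeric values would go to mlflow.log_metric; "N/A" strings stay in the JSON report
with open('dominostats.json', 'w') as f:
    f.write(json.dumps(diagnostics))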
2 changes: 1 addition & 1 deletion scripts/.ipynb_checkpoints/sklearn_model_train-checkpoint.py
@@ -20,7 +20,7 @@
scipy.linalg.pinv2 = np.linalg.pinv

#Read in data
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/mlops-best-practices/credit_card_default.csv')
df = pd.read_csv(path)
print('Read in {} rows of data'.format(df.shape[0]))

2 changes: 1 addition & 1 deletion scripts/R_model_train.R
@@ -3,7 +3,7 @@ library(jsonlite)

print("Reading in data")
project_name <- Sys.getenv('DOMINO_PROJECT_NAME')
path <- paste('/mnt/data/', project_name, '/credit_card_default.csv')
path <- paste('/mnt/data/mlops-best-practices/credit_card_default.csv')
path <- gsub(" ", "", path, fixed = TRUE)
data <- read.csv(file = path)
head(data)
2 changes: 1 addition & 1 deletion scripts/h2o_model_train.py
@@ -17,7 +17,7 @@
n = 70

# Read in the new dataset
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/mlops-best-practices/credit_card_default.csv')
data = pd.read_csv(path)
print('Read in {} rows of data'.format(data.shape[0]))

2 changes: 1 addition & 1 deletion scripts/sklearn_model_train.py
@@ -20,7 +20,7 @@
scipy.linalg.pinv2 = np.linalg.pinv

#Read in data
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/mlops-best-practices/credit_card_default.csv')
df = pd.read_csv(path)
print('Read in {} rows of data'.format(df.shape[0]))

