
Commit

Updated dataset path in the /scripts dir as it is now static
Petter Olsson committed Nov 17, 2024
1 parent b704e2f commit 5767504
Showing 6 changed files with 137 additions and 125 deletions.
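All six scripts make the same change: the dataset path is no longer assembled from the DOMINO_PROJECT_NAME environment variable but points at a fixed location. A minimal Python sketch of the before/after pattern, for illustration only (the fallback project name here is a placeholder, not part of the commit):

import os

# Old pattern: path derived from the Domino project name at run time
old_path = '/mnt/data/{}/credit_card_default.csv'.format(
    os.environ.get('DOMINO_PROJECT_NAME', 'example-project'))  # placeholder fallback

# New pattern: the dataset now lives at a static, project-independent location
new_path = '/mnt/data/mlops-best-practices/credit_card_default.csv'

print(old_path)
print(new_path)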
118 changes: 65 additions & 53 deletions scripts/.ipynb_checkpoints/R_model_train-checkpoint.R
@@ -1,67 +1,79 @@
library(mlflow)
library(jsonlite)

print("Reading in data")
project_name <- Sys.getenv('DOMINO_PROJECT_NAME')
path <- paste('/mnt/data/',project_name,'/credit_card_default.csv')
path <- paste('/mnt/data/mlops-best-practices/credit_card_default.csv')
path <- gsub(" ", "", path, fixed = TRUE)
data <- read.csv(file=path)
data <- read.csv(file = path)
head(data)

#mlflow_set_experiment(experiment_name=paste(Sys.getenv('DOMINO_PROJECT_NAME'), Sys.getenv('DOMINO_STARTING_USERNAME')))
mlflow_set_experiment(experiment_name = paste0(Sys.getenv('DOMINO_PROJECT_NAME'), " ", Sys.getenv('DOMINO_STARTING_USERNAME'), " ", Sys.getenv('MLFLOW_NAME')))

#data$is_red <- as.integer(data$type != 'white')

data <-na.omit(data)
dim(data)[1]-sum(complete.cases(data))

train <-data[sample(nrow(data), round(dim(data)[1]*0.75)),]
# test <- data[(round(dim(data)[1]*0.75)+1):dim(data)[1], 2:dim(data)[2]]
test <- data[(data$id %in% train$id)==FALSE,]
train <- subset(train, select = -c(DEFAULT) )
test <- subset(test, select = -c(DEFAULT) )

train_matrix <- as.matrix(train)
test_matrix <- as.matrix(test)
label_matrix <- as.matrix(train$DEFAULT)
test_lab_matrix <- as.matrix(test$DEFAULT)

dim(train)+dim(test)
# Verify the renaming
print("Columns in data after renaming:")
print(colnames(data))

with(mlflow_start_run(), {
mlflow_set_tag("Model_Type", "R")
print("Training Model")

lm_model <- lm(formula = DEFAULT ~., data = train)
lm_model


RSQUARE = function(y_actual,y_predict){
cor(y_actual,y_predict)^2
}
# Define MLflow experiment
mlflow_set_experiment(experiment_name = paste0(Sys.getenv('DOMINO_PROJECT_NAME'), " ", Sys.getenv('DOMINO_STARTING_USERNAME'), " ", Sys.getenv('MLFLOW_NAME')))

preds_lm <- predict(lm_model, newdata = test)
# Remove missing values
data <- na.omit(data)
print(paste("Number of rows with missing values removed:", dim(data)[1] - sum(complete.cases(data))))

rsquared_lm <-round(RSQUARE(preds_lm, test$DEFAULT),3)
print(rsquared_lm[1])
# Split data into training and testing sets
set.seed(123) # Set seed for reproducibility
train <- data[sample(nrow(data), round(dim(data)[1] * 0.75)), ]
test <- data[!(rownames(data) %in% rownames(train)), ]

#mse
mse_lm<- round(mean((test_lab_matrix - preds_lm)^2),3)
print(mse_lm)
# Verify that the train and test sets include the "DEFAULT" column
if (!("DEFAULT" %in% colnames(train))) {
stop("Column 'DEFAULT' is not present in the training set.")
}

mlflow_log_metric("R2", rsquared_lm[1])
mlflow_log_metric("MSE", mse_lm)
# Define target and feature columns
target_variable <- "DEFAULT"
features <- setdiff(names(data), target_variable)

diagnostics = list("R2" = rsquared_lm[1],
"MSE"=mse_lm)
library(jsonlite)
fileConn<-file("dominostats.json")
writeLines(toJSON(diagnostics), fileConn)
close(fileConn)
train_matrix <- as.matrix(train[, features])
test_matrix <- as.matrix(test[, features])
label_matrix <- as.matrix(train[[target_variable]])
test_lab_matrix <- as.matrix(test[[target_variable]])

save(lm_model, file="/mnt/code/models/R_linear_model.Rda")
})
dim(train) + dim(test)

# install.packages("SHAPforxgboost")
# install.packages("SHAPforxgboost")
# library("SHAPforxgboost")
# shap_values <- shap.values(xgb_model = mod, X_train = dataX)
# Start MLflow run
with(mlflow_start_run(), {
mlflow_set_tag("Model_Type", "R")
print("Training Model")

# Train the model (update formula for new dataset)
lm_model <- lm(formula = as.formula(paste(target_variable, "~ .")), data = train)
print(lm_model)

# Define RSQUARE function
RSQUARE <- function(y_actual, y_predict) {
cor(y_actual, y_predict)^2
}

# Predict and calculate metrics
preds_lm <- predict(lm_model, newdata = test)

rsquared_lm <- round(RSQUARE(test[[target_variable]], preds_lm), 3)
print(rsquared_lm)

# Mean Squared Error
mse_lm <- round(mean((test_lab_matrix - preds_lm)^2), 3)
print(mse_lm)

# Log metrics to MLflow
mlflow_log_metric("R2", rsquared_lm)
mlflow_log_metric("MSE", mse_lm)

# Save diagnostics to JSON
diagnostics <- list("R2" = rsquared_lm, "MSE" = mse_lm)
fileConn <- file("dominostats.json")
writeLines(toJSON(diagnostics), fileConn)
close(fileConn)

# Save model
save(lm_model, file = "/mnt/code/models/R_linear_model.Rda")
})
136 changes: 68 additions & 68 deletions scripts/.ipynb_checkpoints/h2o_model_train-checkpoint.py
@@ -13,112 +13,112 @@
import mlflow
import mlflow.h2o


#Set train test split to 70
# Set train-test split to 70
n = 70

#read in data then split into train and test

path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
# Read in the new dataset
path = str('/mnt/data/mlops-best-practices/credit_card_default.csv')
data = pd.read_csv(path)
print('Read in {} rows of data'.format(data.shape[0]))

#Find all pearson correlations of numerical variables with quality
#corr_values = data.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
corr_values = data.corr(numeric_only=True).sort_values(by = 'DEFAULT')['DEFAULT'].drop('DEFAULT',axis=0)
# Find all Pearson correlations of numerical variables with the target variable, assuming it's 'DEFAULT'
corr_values = data.corr(numeric_only=True).sort_values(by='DEFAULT')['DEFAULT'].drop('DEFAULT', axis=0)

#Keep all variables with above a 8% pearson correlation
important_feats=corr_values[abs(corr_values)>0.08]
# Keep all variables with above an 8% Pearson correlation
important_feats = corr_values[abs(corr_values) > 0.08]

#Get data set up for model training and evaluation
# Prepare the dataset for model training and evaluation

#Drop NA rows
data = data.dropna(how='any',axis=0)
#Split df into inputs and target
#data = data[list(important_feats.keys())+['quality']]
data = data[list(important_feats.keys())+['DEFAULT']]
# Drop NA rows
data = data.dropna(how='any', axis=0)

train = data[0:round(len(data)*n/100)]
# Select important features and the target variable
data = data[list(important_feats.keys()) + ['DEFAULT']]

# Split the data into training and testing sets
train = data[0:round(len(data) * n / 100)]
test = data[train.shape[0]:]

print('H2O version -{}'.format(h2o.__version__))

#initailize local h2o
# Initialize local H2O
h2o.init()

# create a new MLFlow experiemnt
#mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME'))
# Set up a new MLFlow experiment
mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME') + " " + os.environ.get('MLFLOW_NAME'))

#Convert data to h2o frames
# Convert data to H2O frames
hTrain = h2o.H2OFrame(train)
hTest = h2o.H2OFrame(test)

# Identify predictors and response
x = hTrain.columns
#y = "quality"
y = "DEFAULT"
x.remove(y)

# Isolate target variable
hTrain[y] = hTrain[y]
hTest[y] = hTest[y]

# Train the model and log metrics with MLFlow
with mlflow.start_run():
# Set MLFlow tag to differentiate the model approaches
mlflow.set_tag("Model_Type", "H20 Automl")

mlflow.set_tag("Model_Type", "H2O AutoML")

# Run AutoML for 5 base models (limited to 1 min max runtime)
print('Training autoML model...')
print('Training AutoML model...')
aml = H2OAutoML(max_models=10, max_runtime_secs=30, sort_metric="r2")
aml.train(x=x, y=y, training_frame=hTrain)

# sns.histplot(np.array(aml.leader.predict(hTest)))
print('Evaluating model on validation data...')
best_gbm = aml.leader #get_best_model(criterion = 'mse', algorithm = 'gbm')
preds = best_gbm.predict(hTest)
print(best_gbm.r2(xval=True))
#View performance metrics and save them to domino stats!
r2 = round(best_gbm.r2(xval=True),3)
mse = round(best_gbm.mse(xval=True),3)
print("R2 Score: ", r2)
print("MSE: ", mse)
# Save the metrics in MLFlow
mlflow.log_metric("R2", r2)
mlflow.log_metric("MSE", mse)

#Code to write R2 value and MSE to dominostats value for population in Domino Jobs View
# Evaluate the model on the validation data
best_model = aml.leader
preds = best_model.predict(hTest)

# Try to get R² and MSE, handle NoneType if not available
r2 = best_model.r2(xval=True)
mse = best_model.mse(xval=True)

if r2 is not None:
r2 = round(r2, 3)
else:
print("R² metric not available for this model.")
r2 = "N/A"

if mse is not None:
mse = round(mse, 3)
else:
print("MSE metric not available for this model.")
mse = "N/A"

print("R2 Score:", r2)
print("MSE:", mse)

# Log metrics in MLFlow if available
if isinstance(r2, (int, float)):
mlflow.log_metric("R2", r2)
if isinstance(mse, (int, float)):
mlflow.log_metric("MSE", mse)

# Save the metrics for Domino stats
with open('dominostats.json', 'w') as f:
f.write(json.dumps({"R2": r2,
"MSE": mse}))
f.write(json.dumps({"R2": r2, "MSE": mse}))

#Write results to dataframe for viz
#results = pd.DataFrame({'Actuals':test.quality.reset_index()['quality'], 'Predictions': preds.as_data_frame()['predict']})
results = pd.DataFrame({'Actuals':test.quality.reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})
# Write results to a dataframe for visualization
results = pd.DataFrame({'Actuals': test.DEFAULT.reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})

print('Creating visualizations...')
#Scatterplot
fig1, ax1 = plt.subplots(figsize=(10,6))
plt.title('H2o Actuals vs Predictions Scatter Plot')
sns.regplot(
data=results,
x = 'Actuals',
y = 'Predictions',
order = 3)
# Scatter plot
fig1, ax1 = plt.subplots(figsize=(10, 6))
plt.title('H2O Actuals vs Predictions Scatter Plot')
sns.regplot(data=results, x='Actuals', y='Predictions', order=3)
plt.savefig('/mnt/artifacts/actual_v_pred_scatter.png')
mlflow.log_figure(fig1, 'actual_v_pred_scatter.png')

#Histogram
fig2, ax2 = plt.subplots(figsize=(10,6))
plt.title('h2o Actuals vs Predictions Histogram')
plt.xlabel('Quality')
sns.histplot(results, bins=6, multiple = 'dodge', palette = 'coolwarm')
# Histogram
fig2, ax2 = plt.subplots(figsize=(10, 6))
plt.title('H2O Actuals vs Predictions Histogram')
plt.xlabel('Default Payment')
sns.histplot(results, bins=6, multiple='dodge', palette='coolwarm')
plt.savefig('/mnt/artifacts/actual_v_pred_hist.png')
mlflow.log_figure(fig2, 'actual_v_pred_hist.png')

#Saving trained model to serialized pickle object
h2o.save_model(best_gbm, path ='/mnt/code/models')

mlflow.end_run()
# Save the trained model
h2o.save_model(best_model, path='/mnt/code/models')

print('Script complete!')
mlflow.end_run()
print('Script complete!')
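The reworked H2O script now guards against H2O returning None for the cross-validated R2 and MSE before rounding, logging, and writing them out. A standalone sketch of that guard pattern, using hypothetical values in place of best_model.r2(xval=True) and best_model.mse(xval=True):

import json

def summarize_metric(value, ndigits=3):
    # Round the metric if the backend returned one; otherwise fall back to "N/A"
    return round(value, ndigits) if value is not None else "N/A"

# Hypothetical stand-ins for H2O's cross-validated metrics, which can be None
r2 = summarize_metric(0.412)
mse = summarize_metric(None)

diagnostics = {"R2": r2, "MSE": mse}
print(diagnostics)  # {'R2': 0.412, 'MSE': 'N/A'}

# Only numeric values would go to mlflow.log_metric; "N/A" strings stay in the JSON report
with open('dominostats.json', 'w') as f:
    f.write(json.dumps(diagnostics))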
2 changes: 1 addition & 1 deletion scripts/.ipynb_checkpoints/sklearn_model_train-checkpoint.py
@@ -20,7 +20,7 @@
scipy.linalg.pinv2 = np.linalg.pinv

#Read in data
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/mlops-best-practices/credit_card_default.csv')
df = pd.read_csv(path)
print('Read in {} rows of data'.format(df.shape[0]))

2 changes: 1 addition & 1 deletion scripts/R_model_train.R
@@ -3,7 +3,7 @@ library(jsonlite)

print("Reading in data")
project_name <- Sys.getenv('DOMINO_PROJECT_NAME')
path <- paste('/mnt/data/', project_name, '/credit_card_default.csv')
path <- paste('/mnt/data/mlops-best-practices/credit_card_default.csv')
path <- gsub(" ", "", path, fixed = TRUE)
data <- read.csv(file = path)
head(data)
2 changes: 1 addition & 1 deletion scripts/h2o_model_train.py
@@ -17,7 +17,7 @@
n = 70

# Read in the new dataset
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/mlops-best-practices/credit_card_default.csv')
data = pd.read_csv(path)
print('Read in {} rows of data'.format(data.shape[0]))

2 changes: 1 addition & 1 deletion scripts/sklearn_model_train.py
@@ -20,7 +20,7 @@
scipy.linalg.pinv2 = np.linalg.pinv

#Read in data
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/mlops-best-practices/credit_card_default.csv')
df = pd.read_csv(path)
print('Read in {} rows of data'.format(df.shape[0]))

