# 00.All_fTrainModel_StataData.R
# Loading the required libraries
library(caret)      # createDataPartition, confusionMatrix
library(xgboost)    # xgb.DMatrix, xgb.train, xgb.importance
library(Laurae)
library(DMwR)
library(Metrics)    # accuracy, recall, precision, f1
library(ggplot2)
library(ggplotify)
library(hrbrthemes)
library(parallel)
library(gridExtra)
library(lubridate)
library(reshape2)
library(haven)      # read_dta
library(dplyr)
# Reading the Stata .dta household data (includes ADM codes and year)
dfStata = read_dta("drought_adm2_merged.dta")
# Defining the columns to be kept
sIDs = c("adm0_code", "adm1_code", "adm2_code", "year")
sFeats = c("female", "child_out_sch", "pri_age_ch", "working", "children", "senior", "head_female", "head_education",
"head_age", "head_lit", "head_act", "head_mar", "head_ageg", "hhsize", "province", "region",
"region_urban", "surmonth", "urban", "pw", "own", "rooms", "area", "frame",
"car", "motorcycle", "bicycle", "radio", "cassette", "tvbw", "tvcolor", "vcr",
"computer", "cellphone", "freezer", "refrigerator", "freezrefrig", "gasstove", "vacuum", "washingmachine",
"sewingmachine", "fan", "mobile_ac", "mobile_gas_ac", "dishwasher", "neither", "pipedwater", "electricity",
"naturalgas", "phone", "internet", "bath", "kitchen", "cooler", "centralac", "centralheat",
"package", "gas_cooler", "sewage", "cookfuel", "heatfuel", "hotwater_fuel",
"male", "adults", "dependecy_r")
sTgts = c("pcer", "pcer_17", "pline685")
# Keeping only the columns listed above
dfStata = dfStata[,names(dfStata) %in% c(sIDs, sFeats, sTgts)]
# Dropping NAs and rows with empty ADM codes
dfStata = na.omit(dfStata)
dfStata = dfStata[!dfStata$adm0_code=="",,drop=FALSE]
dfStata = dfStata[!dfStata$adm1_code=="",,drop=FALSE]
dfStata = dfStata[!dfStata$adm2_code=="",,drop=FALSE]
# Dropping years after 2019 due to COVID-19
dfStata = dfStata[dfStata$year<=2019,,drop=FALSE]
# Flagging whether a household is below (1) or above (0) the poverty line
dfStata$pcer_pline = 1; dfStata$pcer_pline[dfStata$pcer >= dfStata$pline685] = 0
dfStata$pcer_17_pline = 1; dfStata$pcer_17_pline[dfStata$pcer_17 >= dfStata$pline685] = 0
sTgts = c("pcer_pline", "pcer_17_pline")
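# Optional sanity check (added sketch): reports the share of households below the
# poverty line for each target; a strong imbalance here is what motivates the
# scale_pos_weight used in the training step below
for (s in sTgts) message(s, ": ", round(mean(dfStata[[s]]), 3), " below the poverty line")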
# Adding the drought-related data
dfData = read.csv("Yearly_Summary_ADM2.csv")
dfData = dfData[,!(names(dfData) %in% c("ntl"))]
dfStataPlus = merge(dfStata, dfData,
by.x=c("adm0_code", "adm1_code", "adm2_code", "year"),
by.y=c("ADM0_CODE", "ADM1_CODE", "ADM2_CODE", "Year"),
all.x=TRUE)
dfStataPlus = na.omit(dfStataPlus)
sFeatsPlus = c(sFeats, "scpdsi", "asi", "era5land_t2m", "era5land_tp", "gws", "rtzsm", "sfsm",
"spei01", "spei03", "spei06", "spei12", "spei24", "spei48",
"spi01", "spi03", "spi06", "spi09", "spi12")
# Creating the training dataset (keep dfStata for survey features only, or dfStataPlus to include the drought features)
#dfDataset = dfStata
dfDataset = dfStataPlus
dfDataset = dfDataset[order(dfDataset$year),]
dfDataset = na.omit(dfDataset)
# Creating a 60/40 train-test split (stratified on the first column)
trn_idx = as.vector(createDataPartition(dfDataset[[1]], p=0.60, list=FALSE))
tst_idx = seq(1,NROW(dfDataset),1)[-trn_idx]
# Alternative: a temporal split, training on pre-2019 years
#trn_idx = which(dfDataset$year < 2019, arr.ind=TRUE)
#tst_idx = which(dfDataset$year >= 2019, arr.ind=TRUE)
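# Optional report (added sketch): sizes of the two partitions
message("Train: ", length(trn_idx), " rows | Test: ", length(tst_idx), " rows")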
# Training one model per target variable
for (sTgt in sTgts) {
# Progress message
message("Processing target variable: ", sTgt, " | ")
# Creating the training and test sets (swap in sFeats to use survey features only)
#dfDatasetTargt = dfDataset[,names(dfDataset) %in% c(sFeats,sTgt),drop=FALSE]
dfDatasetTargt = dfDataset[,names(dfDataset) %in% c(sFeatsPlus,sTgt),drop=FALSE]
names(dfDatasetTargt)[names(dfDatasetTargt)==sTgt] = "y"
dfTrn = dfDatasetTargt[trn_idx,,drop=FALSE]
dfTst = dfDatasetTargt[tst_idx,,drop=FALSE]
# Preparing the data for an XGBoost model
dfTrnXGB = xgb.DMatrix(data=as.matrix(subset(dfTrn, select=-c(y))), label=dfTrn$y)
dfTstXGB = xgb.DMatrix(data=as.matrix(subset(dfTst, select=-c(y))), label=dfTst$y)
watchlist = list(train=dfTrnXGB, validation=dfTstXGB)
scale_pos_weight = sum(dfTrn$y==0)/sum(dfTrn$y==1)
# Training using an eXtreme Gradient Boosting model
xgbTree_model = xgb.train(
  watchlist = watchlist,                # datasets monitored during training
  data = dfTrnXGB,                      # training data
  objective = "binary:logistic",        # logistic regression for binary classification
  eval_metric = "error",                # binary classification error rate
  scale_pos_weight = scale_pos_weight,  # balance of positive and negative weights
  nrounds = 10000,                      # maximum number of boosting rounds
  max_depth = 5,                        # maximum tree depth
  early_stopping_rounds = 150,          # stop if no improvement for 150 rounds
  eta = 0.10,                           # learning rate
  gamma = 0.001                         # minimum loss reduction to split
)
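# Optional report (added sketch): xgb.train populates best_iteration and best_score
# when early stopping is enabled
message("Best iteration: ", xgbTree_model$best_iteration,
        " | validation error: ", round(xgbTree_model$best_score, 4))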
# Saving the trained model to disk
#saveRDS(xgbTree_model, paste("xgb_model_featsSTATAonly_trgtVar_",sTgt,".rds",sep=""))
saveRDS(xgbTree_model, paste("xgb_model_featsSTATAplus_trgtVar_",sTgt,".rds",sep=""))
# Creating the data.frame to store the results
dfRes = data.frame(id=c(trn_idx, tst_idx),
set=c(rep("Train",length(trn_idx)),
rep("Test", length(tst_idx))))
# Storing the target and the predictions of the model in the training and test sets
dfResTrn = predict(xgbTree_model, newdata=dfTrnXGB)
dfResTst = predict(xgbTree_model, newdata=dfTstXGB)
dfRes = cbind(dfRes, c(dfTrn$y,dfTst$y))
names(dfRes)[NCOL(dfRes)] = paste("trgt_",sTgt,sep="")
dfRes = cbind(dfRes, c(dfResTrn,dfResTst))
names(dfRes)[NCOL(dfRes)] = paste("pred_",sTgt,sep="")
# Plotting the confusion matrices
cfTrn = caret::confusionMatrix(data=factor(round(dfResTrn,0)), reference=factor(dfTrn$y)); print(cfTrn)
objPlotsTrn = fourfoldplot(as.table(cfTrn), color=c("darkgreen","darkred"), main="Confusion Matrix - Training")
cfTst = caret::confusionMatrix(data=factor(round(dfResTst,0)), reference=factor(dfTst$y)); print(cfTst)
objPlotsTst = fourfoldplot(as.table(cfTst), color=c("darkgreen","darkred"), main="Confusion Matrix - Test")
# Storing the error metrics
errMetricsTrn = c(accuracy(dfTrn$y, round(dfResTrn,0)),
recall(dfTrn$y, round(dfResTrn,0)),
precision(dfTrn$y, round(dfResTrn,0)),
f1(dfTrn$y, round(dfResTrn,0)))
errMetricsTst = c(accuracy(dfTst$y, round(dfResTst,0)),
recall(dfTst$y, round(dfResTst,0)),
precision(dfTst$y, round(dfResTst,0)),
f1(dfTst$y, round(dfResTst,0)))
errMetrics = rbind(errMetricsTrn, errMetricsTst)
rownames(errMetrics) = c("Training", "Test")
colnames(errMetrics) = c("accuracy", "recall", "precision", "f1_score")
errMetrics[,4] = (2*errMetrics[,3]*errMetrics[,2])/(errMetrics[,3]+errMetrics[,2]) # Recomputing f1_score as the harmonic mean of precision and recall, overriding the value from Metrics::f1
print(round(errMetrics,3))
# Plotting the variable importance
dfVarImp = xgb.importance(model=xgbTree_model)
dfVarImp = dfVarImp[order(dfVarImp$Gain, decreasing=TRUE)]
pVarImp = ggplot(dfVarImp, aes(reorder(Feature, +Gain), Gain)) +
  geom_bar(stat="identity") +
  theme_bw() +
  coord_flip()
plot(pVarImp)
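# Optionally persisting the per-observation predictions stored in dfRes
# (added sketch; the output filename is an assumption, mirroring the model filename above)
write.csv(dfRes, paste("xgb_results_trgtVar_",sTgt,".csv",sep=""), row.names=FALSE)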
}