# 00.All_fTrainModel_StataData.R
# Loading the required libraries
library(caret)      # createDataPartition, confusionMatrix
library(xgboost)    # xgb.DMatrix, xgb.train, xgb.importance
library(Laurae)
library(DMwR)
library(Metrics)    # accuracy, recall, precision, f1
library(ggplot2)
library(ggplotify)
library(hrbrthemes)
library(parallel)
library(gridExtra)
library(lubridate)
library(reshape2)
library(haven)      # read_dta
library(dplyr)
# Reading the Stata .dta household data (includes ADM codes and year)
dfStata = read_dta("drought_adm2_merged.dta")
# Defining the columns to be kept
sIDs = c("adm0_code", "adm1_code", "adm2_code", "year")
sFeats = c("female", "child_out_sch", "pri_age_ch", "working", "children", "senior", "head_female", "head_education",
"head_age", "head_lit", "head_act", "head_mar", "head_ageg", "hhsize", "province", "region",
"region_urban", "surmonth", "urban", "pw", "own", "rooms", "area", "frame",
"car", "motorcycle", "bicycle", "radio", "cassette", "tvbw", "tvcolor", "vcr",
"computer", "cellphone", "freezer", "refrigerator", "freezrefrig", "gasstove", "vacuum", "washingmachine",
"sewingmachine", "fan", "mobile_ac", "mobile_gas_ac", "dishwasher", "neither", "pipedwater", "electricity",
"naturalgas", "phone", "internet", "bath", "kitchen", "cooler", "centralac", "centralheat",
"package", "gas_cooler", "sewage", "cookfuel", "heatfuel", "hotwater_fuel",
"male", "adults", "dependecy_r")
sTgts = c("pcer", "pcer_17", "pline685")
# Keeping only the columns listed above
dfStata = dfStata[,names(dfStata) %in% c(sIDs, sFeats, sTgts)]
# Dropping NAs and rows with empty ADM codes
dfStata = na.omit(dfStata)
dfStata = dfStata[!dfStata$adm0_code=="",,drop=FALSE]
dfStata = dfStata[!dfStata$adm1_code=="",,drop=FALSE]
dfStata = dfStata[!dfStata$adm2_code=="",,drop=FALSE]
# Dropping years after 2019 due to COVID-19
dfStata = dfStata[dfStata$year<=2019,,drop=FALSE]
# Flagging whether a household is below (1) or above (0) the poverty line
dfStata$pcer_pline = 1; dfStata$pcer_pline[dfStata$pcer >= dfStata$pline685] = 0
dfStata$pcer_17_pline = 1; dfStata$pcer_17_pline[dfStata$pcer_17 >= dfStata$pline685] = 0
sTgts = c("pcer_pline", "pcer_17_pline")
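# Optional sanity check (added sketch): reports the share of households below the
# poverty line for each target; a strong imbalance here is what motivates the
# scale_pos_weight used in the training step below
for (s in sTgts) message(s, ": ", round(mean(dfStata[[s]]), 3), " below the poverty line")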
# Adding the drought-related data
dfData = read.csv("Yearly_Summary_ADM2.csv")
dfData = dfData[,!(names(dfData) %in% c("ntl"))]
dfStataPlus = merge(dfStata, dfData,
by.x=c("adm0_code", "adm1_code", "adm2_code", "year"),
by.y=c("ADM0_CODE", "ADM1_CODE", "ADM2_CODE", "Year"),
all.x=TRUE)
dfStataPlus = na.omit(dfStataPlus)
sFeatsPlus = c(sFeats, "scpdsi", "asi", "era5land_t2m", "era5land_tp", "gws", "rtzsm", "sfsm",
"spei01", "spei03", "spei06", "spei12", "spei24", "spei48",
"spi01", "spi03", "spi06", "spi09", "spi12")
# Creating the training dataset (keep dfStata for survey features only, or dfStataPlus to include the drought features)
#dfDataset = dfStata
dfDataset = dfStataPlus
dfDataset = dfDataset[order(dfDataset$year),]
dfDataset = na.omit(dfDataset)
# Creating a 60/40 train-test split (stratified on the first column)
trn_idx = as.vector(createDataPartition(dfDataset[[1]], p=0.60, list=FALSE))
tst_idx = seq(1,NROW(dfDataset),1)[-trn_idx]
# Alternative: a temporal split, training on pre-2019 years
#trn_idx = which(dfDataset$year < 2019, arr.ind=TRUE)
#tst_idx = which(dfDataset$year >= 2019, arr.ind=TRUE)
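# Optional report (added sketch): sizes of the two partitions
message("Train: ", length(trn_idx), " rows | Test: ", length(tst_idx), " rows")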
# Training one model per target variable
for (sTgt in sTgts) {
# Progress message
message("Processing target variable: ", sTgt, " | ")
# Creating the training and test sets (swap in sFeats to use survey features only)
#dfDatasetTargt = dfDataset[,names(dfDataset) %in% c(sFeats,sTgt),drop=FALSE]
dfDatasetTargt = dfDataset[,names(dfDataset) %in% c(sFeatsPlus,sTgt),drop=FALSE]
names(dfDatasetTargt)[names(dfDatasetTargt)==sTgt] = "y"
dfTrn = dfDatasetTargt[trn_idx,,drop=FALSE]
dfTst = dfDatasetTargt[tst_idx,,drop=FALSE]
# Preparing the data for an XGBoost model
dfTrnXGB = xgb.DMatrix(data=as.matrix(subset(dfTrn, select=-c(y))), label=dfTrn$y)
dfTstXGB = xgb.DMatrix(data=as.matrix(subset(dfTst, select=-c(y))), label=dfTst$y)
watchlist = list(train=dfTrnXGB, validation=dfTstXGB)
scale_pos_weight = sum(dfTrn$y==0)/sum(dfTrn$y==1)
# Training using an eXtreme Gradient Boosting model
xgbTree_model = xgb.train(
  watchlist = watchlist,                # datasets monitored during training
  data = dfTrnXGB,                      # training data
  objective = "binary:logistic",        # logistic regression for binary classification
  eval_metric = "error",                # binary classification error rate
  scale_pos_weight = scale_pos_weight,  # balance of positive and negative weights
  nrounds = 10000,                      # maximum number of boosting rounds
  max_depth = 5,                        # maximum tree depth
  early_stopping_rounds = 150,          # stop if no improvement for 150 rounds
  eta = 0.10,                           # learning rate
  gamma = 0.001                         # minimum loss reduction to split
)
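# Optional report (added sketch): xgb.train populates best_iteration and best_score
# when early stopping is enabled
message("Best iteration: ", xgbTree_model$best_iteration,
        " | validation error: ", round(xgbTree_model$best_score, 4))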
# Saving the trained model to disk
#saveRDS(xgbTree_model, paste("xgb_model_featsSTATAonly_trgtVar_",sTgt,".rds",sep=""))
saveRDS(xgbTree_model, paste("xgb_model_featsSTATAplus_trgtVar_",sTgt,".rds",sep=""))
# Creating the data.frame to store the results
dfRes = data.frame(id=c(trn_idx, tst_idx),
set=c(rep("Train",length(trn_idx)),
rep("Test", length(tst_idx))))
# Storing the target and the predictions of the model in the training and test sets
dfResTrn = predict(xgbTree_model, newdata=dfTrnXGB)
dfResTst = predict(xgbTree_model, newdata=dfTstXGB)
dfRes = cbind(dfRes, c(dfTrn$y,dfTst$y))
names(dfRes)[NCOL(dfRes)] = paste("trgt_",sTgt,sep="")
dfRes = cbind(dfRes, c(dfResTrn,dfResTst))
names(dfRes)[NCOL(dfRes)] = paste("pred_",sTgt,sep="")
# Plotting the confusion matrices
cfTrn = caret::confusionMatrix(data=factor(round(dfResTrn,0)), reference=factor(dfTrn$y)); print(cfTrn)
objPlotsTrn = fourfoldplot(as.table(cfTrn), color=c("darkgreen","darkred"), main="Confusion Matrix - Training")
cfTst = caret::confusionMatrix(data=factor(round(dfResTst,0)), reference=factor(dfTst$y)); print(cfTst)
objPlotsTst = fourfoldplot(as.table(cfTst), color=c("darkgreen","darkred"), main="Confusion Matrix - Test")
# Storing the error metrics
errMetricsTrn = c(accuracy(dfTrn$y, round(dfResTrn,0)),
recall(dfTrn$y, round(dfResTrn,0)),
precision(dfTrn$y, round(dfResTrn,0)),
f1(dfTrn$y, round(dfResTrn,0)))
errMetricsTst = c(accuracy(dfTst$y, round(dfResTst,0)),
recall(dfTst$y, round(dfResTst,0)),
precision(dfTst$y, round(dfResTst,0)),
f1(dfTst$y, round(dfResTst,0)))
errMetrics = rbind(errMetricsTrn, errMetricsTst)
rownames(errMetrics) = c("Training", "Test")
colnames(errMetrics) = c("accuracy", "recall", "precision", "f1_score")
errMetrics[,4] = (2*errMetrics[,3]*errMetrics[,2])/(errMetrics[,3]+errMetrics[,2]) # Recomputing f1_score as the harmonic mean of precision and recall, overriding the value from Metrics::f1
print(round(errMetrics,3))
# Plotting the variable importance
dfVarImp = xgb.importance(model=xgbTree_model)
dfVarImp = dfVarImp[order(dfVarImp$Gain, decreasing=TRUE)]
pVarImp = ggplot(dfVarImp, aes(reorder(Feature, +Gain), Gain)) +
  geom_bar(stat="identity") +
  theme_bw() +
  coord_flip()
plot(pVarImp)
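# Optionally persisting the per-observation predictions stored in dfRes
# (added sketch; the output filename is an assumption, mirroring the model filename above)
write.csv(dfRes, paste("xgb_results_trgtVar_",sTgt,".csv",sep=""), row.names=FALSE)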
}