# -*- coding: utf-8 -*-
"""
Created on Wed Jun 22 09:19:26 2022
There are a few essential aspects of a marketing campaign, namely customer
segmentation, promotional strategy, etc. A correctly identified strategy may
help to expand and grow the bank's revenue.
1) Develop a deep learning model using TensorFlow which comprises only Dense,
   Dropout, and Batch Normalization layers.
2) The accuracy of the model must be more than 70%.
3) Display the training loss and accuracy on TensorBoard.
4) Create modules (classes) for repeated functions to ease the training and
   testing process.
@author: Amirah Heng
"""
import os
import pickle
import datetime
import numpy as np
import pandas as pd
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, must precede the IterativeImputer import
from sklearn.impute import IterativeImputer
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from module_for_customer_market_train import EDA, DataPreprocessing
from module_for_customer_market_train import ModelCreation, ModelEvaluation
#%% STEP 0) STATICS (paths and constants)
CSV_PATH = os.path.join(os.getcwd(), 'Market_Train.csv')
JOB_TYPE_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'job.pkl')
MARITAL_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'marital.pkl')
EDUCATION_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'education.pkl')
DEFAULT_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'default.pkl')
HOUSING_LOAN_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'housing_loan.pkl')
PERSONAL_LOAN_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'personal_loan.pkl')
COMMUNICATION_TYPE_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'communication_type.pkl')
MONTH_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'month.pkl')
PREV_CAMPAIGN_OUTCOME_PICKLE_PATH = os.path.join(os.getcwd(), 'model', 'prev_campaign_outcome.pkl')
OHE_PATH = os.path.join(os.getcwd(), 'model', 'ohe.pkl')
SS_PATH = os.path.join(os.getcwd(), 'model', 'ss.pkl')
MODEL_SAVE_PATH = os.path.join(os.getcwd(), 'model', 'model.h5')
log_dir = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
LOG_FOLDER_PATH = os.path.join(os.getcwd(), 'log', log_dir)
#%% STEP 1) DATA LOADING
df = pd.read_csv(CSV_PATH)
#%% STEP 2) DATA INSPECTION
df.info()  # inspect datatypes and spot NaNs
df = df.drop(labels='id', axis=1)  # id is just an identifier, not a feature
# column names (without id and days_since_prev_campaign_contact) used to
# restore the DataFrame after imputation
column_names = ['customer_age', 'job_type', 'marital', 'education', 'default',
                'balance', 'housing_loan', 'personal_loan', 'communication_type',
                'day_of_month', 'month', 'last_contact_duration',
                'num_contacts_in_campaign', 'num_contacts_prev_campaign',
                'prev_campaign_outcome', 'term_deposit_subscribed']
# split the columns into continuous and categorical groups
target = 'term_deposit_subscribed'  # prediction target
# continuous data columns
con_column = ['customer_age', 'balance', 'day_of_month', 'last_contact_duration',
              'num_contacts_in_campaign', 'days_since_prev_campaign_contact',
              'num_contacts_prev_campaign']
# categorical data columns
cat_column = ['job_type', 'marital', 'education', 'default', 'housing_loan',
              'personal_loan', 'communication_type', 'month', 'prev_campaign_outcome']
# plot the distributions of the features against the target
EDA().plot_data(con_column, cat_column, target, df)
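# EDA is defined in module_for_customer_market_train (not shown here). A
# minimal sketch of what plot_data is assumed to do: histogram each continuous
# column and count-plot each categorical column against the target.
# plot_data_sketch is a hypothetical stand-in, not the module's actual code.
def plot_data_sketch(con_column, cat_column, target, df):
    import matplotlib.pyplot as plt
    import seaborn as sns
    for con in con_column:
        sns.histplot(data=df, x=con, kde=True)  # distribution of continuous data
        plt.show()
    for cat in cat_column:
        sns.countplot(data=df, x=cat, hue=target)  # category counts vs target
        plt.show()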
# check for NaN values
df.isna().any()
df.isna().sum()  # customer_age: 619, marital: 150, balance: 399, personal_loan: 149,
                 # last_contact_duration: 311, num_contacts_in_campaign: 112,
                 # days_since_prev_campaign_contact: 25831
df.count()  # 31,647 rows in total
# days_since_prev_campaign_contact is ~82% NaN (25,831 of 31,647), so it will be dropped
temp = df.describe().T  # balance has many outliers
#%% STEP 3) DATA CLEANING
# drop days_since_prev_campaign_contact since ~82% of its values are NaN
df = df.drop(labels='days_since_prev_campaign_contact', axis=1)
con_column.remove('days_since_prev_campaign_contact')
# label-encode the categorical data, saving one encoder pickle per column
pickle_path = [JOB_TYPE_PICKLE_PATH,
               MARITAL_PICKLE_PATH,
               EDUCATION_PICKLE_PATH,
               DEFAULT_PICKLE_PATH,
               HOUSING_LOAN_PICKLE_PATH,
               PERSONAL_LOAN_PICKLE_PATH,
               COMMUNICATION_TYPE_PICKLE_PATH,
               MONTH_PICKLE_PATH,
               PREV_CAMPAIGN_OUTCOME_PICKLE_PATH]
dp = DataPreprocessing()
dp.label_encoder_data(cat_column, df, pickle_path)
df.isna().sum()  # NaN counts are unchanged; encoding leaves NaNs for the imputer
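# DataPreprocessing.label_encoder_data lives in the external module. A minimal
# sketch of what it is assumed to do: fit a LabelEncoder per categorical column
# (skipping NaNs so the imputer can fill them later) and pickle each fitted
# encoder. label_encoder_sketch is hypothetical, not the module's actual code.
def label_encoder_sketch(cat_column, df, pickle_path):
    from sklearn.preprocessing import LabelEncoder
    for col, path in zip(cat_column, pickle_path):
        le = LabelEncoder()
        mask = df[col].notna()  # encode only the non-NaN entries
        df.loc[mask, col] = le.fit_transform(df.loc[mask, col])
        with open(path, 'wb') as f:
            pickle.dump(le, f)  # save the encoder for deployment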
# fill the remaining NaN values with an iterative (model-based) imputer
ii = IterativeImputer()
df_II = ii.fit_transform(df)  # returns a numpy array
df = pd.DataFrame(df_II)
df.columns = column_names  # restore the column names
# floor the imputed values back to integers
for col in column_names:
    df[col] = np.floor(df[col]).astype('int')
# confirm no NaN values remain
df.isna().any()
df.isna().sum()  # no more NaN values
#%% STEP 4) FEATURES SELECTION
logreg = LogisticRegression()
# logistic regression score between each continuous feature and the target
for con in con_column:
    logreg.fit(np.expand_dims(df[con], axis=-1), df['term_deposit_subscribed'])
    print(con, ':', logreg.score(np.expand_dims(df[con], axis=-1),
                                 df['term_deposit_subscribed']))
# Cramer's V for the association between each categorical feature and the target
for cat in cat_column:
    confusion_matrix = pd.crosstab(df[cat], df['term_deposit_subscribed']).to_numpy()
    print("{}: Cramer's V is {}".format(cat, dp.cramers_corrected_stat(confusion_matrix)))
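# cramers_corrected_stat comes from the external module. For reference, a
# common implementation of the bias-corrected Cramer's V (Bergsma, 2013) looks
# like the sketch below; the module's actual code may differ.
def cramers_corrected_stat_sketch(confusion_matrix):
    import scipy.stats as ss
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))  # bias correction
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))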
# all continuous features score highly (>0.8) against term_deposit_subscribed,
# hence they are selected as the features -> X
X = df[con_column]
y = df['term_deposit_subscribed']
#%% STEP 5) DATA PRE-PROCESSING
Std_scaler = StandardScaler()
scaled_X = Std_scaler.fit_transform(X)
# one-hot encode the target since term_deposit_subscribed is categorical
ohe = OneHotEncoder(sparse=False)
y = ohe.fit_transform(np.expand_dims(y, axis=-1))
# train-test split on the scaled features (not the raw X)
x_train, x_test, y_train, y_test = train_test_split(scaled_X, y,
                                                    test_size=0.3,
                                                    random_state=123)
#%% STEP 6) MODEL DEVELOPMENT
md = ModelCreation()
nb_features = np.shape(X)[1:]  # input shape tuple, e.g. (6,)
nb_class = len(np.unique(y))   # 2 classes after one-hot encoding
model = md.two_layer_model(nb_features, nb_class, nodenum=32)
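# ModelCreation.two_layer_model is defined in the external module. Given the
# brief (only Dense, Dropout, and Batch Normalization layers), a plausible
# sketch is shown below; the layer sizes, dropout rate, and activations are
# assumptions, not the module's actual code.
def two_layer_model_sketch(nb_features, nb_class, nodenum=32, drop_rate=0.2):
    from tensorflow.keras import Sequential
    from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
    model = Sequential()
    model.add(Input(shape=nb_features))  # nb_features is a shape tuple
    model.add(Dense(nodenum, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(drop_rate))
    model.add(Dense(nodenum, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(drop_rate))
    model.add(Dense(nb_class, activation='softmax'))  # one unit per class
    model.summary()
    return model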
# visualise the neural network model
plot_model(model, show_shapes=True, show_layer_names=True)
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# callbacks: TensorBoard logging and early stopping on the training loss
tensorboard_callback = TensorBoard(log_dir=LOG_FOLDER_PATH)
early_stopping_callback = EarlyStopping(monitor='loss', patience=5)
#%% STEP 7) MODEL TRAINING
# model fitting with validation on the test set
hist = model.fit(x=x_train, y=y_train, batch_size=64, epochs=100,
                 validation_data=(x_test, y_test),
                 callbacks=[tensorboard_callback, early_stopping_callback])
# accuracy reaches ~90%
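# To view the training loss and accuracy curves, launch TensorBoard from the
# project root and point it at the log folder, then open http://localhost:6006:
#   tensorboard --logdir log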
#%% STEP 8) MODEL EVALUATION
me = ModelEvaluation()
# plot the training history
me.plot_evaluation(hist)
# evaluate the model on the test set
me.model_evaluation(model, x_test, y_test)
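# ModelEvaluation.model_evaluation lives in the external module. A minimal
# sketch of the evaluation it is assumed to perform: argmax back from one-hot,
# then a classification report and confusion matrix. model_evaluation_sketch
# is hypothetical, not the module's actual code.
def model_evaluation_sketch(model, x_test, y_test):
    from sklearn.metrics import classification_report, confusion_matrix
    y_pred = np.argmax(model.predict(x_test), axis=1)  # predicted class index
    y_true = np.argmax(y_test, axis=1)                 # true class index
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))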
#%% STEP 9) MODEL SAVING
# save the one-hot encoder
with open(OHE_PATH, 'wb') as file:
    pickle.dump(ohe, file)
# save the standard scaler
with open(SS_PATH, 'wb') as file:
    pickle.dump(Std_scaler, file)
# save the neural network model
model.save(MODEL_SAVE_PATH)
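# Deployment sketch (an assumption about how the saved artefacts would be
# reloaded at inference time, reusing the path constants above; new_X is a
# hypothetical placeholder for unseen input data):
# from tensorflow.keras.models import load_model
# loaded_model = load_model(MODEL_SAVE_PATH)
# with open(SS_PATH, 'rb') as file:
#     loaded_scaler = pickle.load(file)
# predictions = np.argmax(loaded_model.predict(loaded_scaler.transform(new_X)), axis=1)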
#%% Discussion
# The deep learning model achieved good performance with ~90% accuracy.
# The continuous features 'customer_age', 'balance', 'day_of_month',
# 'last_contact_duration', 'num_contacts_in_campaign', and
# 'num_contacts_prev_campaign' showed the highest correlation (above 0.8)
# with term_deposit_subscribed; 'days_since_prev_campaign_contact' was
# dropped earlier because ~82% of its values were missing.
# Results could be improved further by training on more data and by
# addressing underfitting in the model.