"""
Created on Sat Sep 28 16:39:06 2019
@ISQS6339-001-2019-GROUP-3
Author: Dinesh Poudel
Aman Panwar
Anurag Sharma
"""
# Import libraries
import pandas as pd
import numpy as np
import os
import scipy.stats as ss
# Set the working directory for the input and output files
os.chdir("C:\\Users\\Dinesh Poudel\\Desktop\\Risk Analysis")
# Load all four datasets
df_application=pd.read_csv("application_train.csv")
df_bureau=pd.read_csv("bureau.csv")
df_credit_balance=pd.read_csv("credit_card_balance.csv")
df_previous=pd.read_csv("previous_application.csv")
# Select only the needed columns from the loan applicants' data
df= df_application[['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE','CODE_GENDER',
'FLAG_OWN_CAR', 'FLAG_OWN_REALTY','AMT_INCOME_TOTAL',
'AMT_CREDIT','AMT_GOODS_PRICE','NAME_INCOME_TYPE',
'NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS',
'REGION_POPULATION_RELATIVE','DAYS_BIRTH','DAYS_EMPLOYED',
'FLAG_CONT_MOBILE','OCCUPATION_TYPE','CNT_FAM_MEMBERS',
'REGION_RATING_CLIENT','ORGANIZATION_TYPE',
'AMT_REQ_CREDIT_BUREAU_YEAR']]
# Select only the needed columns from the bureau data
df1 = df_bureau[['SK_ID_CURR', 'CREDIT_ACTIVE']]
# One-hot encode CREDIT_ACTIVE: one indicator column per unique category
df1 = pd.get_dummies(df1, columns=['CREDIT_ACTIVE'])
# Shorten the dummy column names, e.g. CREDIT_ACTIVE_Active -> CREDIT_1_Active
df1 = df1.rename(columns={x: x.split('_')[0] + '_1_' + x.split('_')[2]
                          for x in df1.columns[1:]})
# Aggregate per applicant: each column now holds the count of that credit status
df1 = df1.groupby('SK_ID_CURR', as_index=False).sum()
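# Illustration of the three steps above (hypothetical numbers): an applicant with
# three 'Active' and one 'Closed' bureau records collapses to a single row where
# CREDIT_1_Active = 3 and CREDIT_1_Closed = 1, i.e. the dummies become counts.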
# Select only the needed columns from the credit card balance data
df2 = df_credit_balance[['SK_ID_CURR', 'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL']]
# Sum balance and credit limit per applicant
df2 = df2.groupby('SK_ID_CURR', as_index=False)[['AMT_BALANCE',
                                                 'AMT_CREDIT_LIMIT_ACTUAL']].sum()
# Select only the needed columns from the previous application data
df3 = df_previous[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]
# One-hot encode NAME_CONTRACT_STATUS, then shorten the dummy column names,
# e.g. NAME_CONTRACT_STATUS_Approved -> Approved_1_CONTRACT_STATUS
df3 = pd.get_dummies(df3, columns=['NAME_CONTRACT_STATUS'])
df3 = df3.rename(columns={x: x.split('_')[3] + '_1_' + x.split('_')[1] + '_' +
                          x.split('_')[2] for x in df3.columns[1:]})
# Aggregate per applicant: each column now counts applications with that status
df3 = df3.groupby('SK_ID_CURR', as_index=False).sum()
# Left-merge bureau data onto the loan application data
dft = df.merge(df1, on='SK_ID_CURR', how='left', indicator=True)
# For applicants with no bureau history (left_only rows), assign the sentinel -1
dft.loc[dft['_merge'] == 'left_only',
        dft.columns[dft.columns.str.contains('CREDIT_1_')]] = -1
# Left-merge credit card balance data onto the merged dataframe
dft = dft.merge(df2, on='SK_ID_CURR', how='left', indicator='_merge2')
# For applicants with no credit card balance history, assign the sentinel -2
dft.loc[dft['_merge2'] == 'left_only',
        ['AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL']] = -2
# Left-merge previous application data onto the merged dataframe
dft = dft.merge(df3, on='SK_ID_CURR', how='left', indicator='_merge3')
# For applicants with no previous applications, assign the sentinel -3
dft.loc[dft['_merge3'] == 'left_only',
        dft.columns[dft.columns.str.contains('CONTRACT_STATUS')]] = -3
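# Using a distinct negative sentinel per merge (-1, -2, -3) records which
# auxiliary dataset was missing for an applicant directly in the final table,
# rather than leaving NaNs behind.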
### Check which columns contain NaN values
nan_cols = [c for c in dft.columns if dft[c].isnull().any()]
print(dft.isnull().sum())
# Fill missing categorical values with an explicit 'Unknown' category
dft['OCCUPATION_TYPE'] = dft['OCCUPATION_TYPE'].fillna('Unknown')
# Fill missing numerical values with the column mean
dft['AMT_GOODS_PRICE'] = dft['AMT_GOODS_PRICE'].fillna(dft['AMT_GOODS_PRICE'].mean())
dft['CNT_FAM_MEMBERS'] = dft['CNT_FAM_MEMBERS'].fillna(dft['CNT_FAM_MEMBERS'].mean())
# Fill missing bureau-inquiry counts with 0
dft['AMT_REQ_CREDIT_BUREAU_YEAR'] = dft['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(0)
# Write the merged dataset to CSV
dft.to_csv("our_output_file.csv", index=False)
# Keep only the rows matched in all four datasets (the inner-join subset)
dft_70 = dft[(dft != 'left_only').all(axis=1)]
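# A more explicit, equivalent form of the filter above (a sketch; it relies on
# the indicator column names chosen in the merges, and 'right_only' cannot
# occur in a left join):
# dft_70 = dft[(dft['_merge'] == 'both') & (dft['_merge2'] == 'both')
#              & (dft['_merge3'] == 'both')]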
#################################################################################
# Correlation analysis
# List of all column names
all_col_dft = list(dft.columns.values)
# List of the continuous variables
num_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
            'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
            'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL']
# Convert the target column to a NumPy array for the correlation calculations
target_arr = dft['TARGET'].to_numpy()
# Point-biserial correlation between each continuous variable and the binary target
list_cor_num = []
for col in num_cols:
    a = dft[col].to_numpy()
    r_pb, p_val = ss.pointbiserialr(a, target_arr)
    list_cor_num.append(r_pb)
    print(col + ':' + str(r_pb))
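# Note: the point-biserial coefficient is mathematically identical to Pearson's r
# with the binary TARGET coded as 0/1, so each value above lies in [-1, 1] and
# its sign gives the direction of the association with the target.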
# List of the categorical variables: everything except the ID, the target,
# and the continuous columns
dft_cat_cols = list(set(all_col_dft) - set(['SK_ID_CURR', 'TARGET'] + num_cols))
# Bias-corrected Cramer's V between each categorical variable and the target
list_cor_cat = []
for cat_col in dft_cat_cols:
    contingency = pd.crosstab(dft['TARGET'], dft[cat_col])
    chi2 = ss.chi2_contingency(contingency)[0]
    n = contingency.sum().sum()
    phi2 = chi2 / n
    r, k = contingency.shape
    # Bias correction (Bergsma, 2013) for phi^2 and the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    v = np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
    list_cor_cat.append(v)
    print(cat_col + ':' + str(v))
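# For reference, uncorrected Cramer's V is sqrt(phi^2 / min(k - 1, r - 1)); the
# correction above shrinks phi^2 and the effective table dimensions to offset
# the upward bias the raw statistic shows for small samples and tables with
# many categories.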
##############################################################
# Repeat the analysis for dft_70, the inner-join subset
##############################################################
# List of all column names
all_col_dft_70 = list(dft_70.columns.values)
# Same continuous variables as above
num_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
            'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
            'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL']
# Convert the target column to a NumPy array for the correlation calculations
target_arr = dft_70['TARGET'].to_numpy()
# Point-biserial correlation between each continuous variable and the binary target
list_cor_num = []
for col in num_cols:
    a = dft_70[col].to_numpy()
    r_pb, p_val = ss.pointbiserialr(a, target_arr)
    list_cor_num.append(r_pb)
    print(col + ':' + str(r_pb))
# List of the categorical variables: everything except the ID, the target,
# and the continuous columns
dft_cat_cols = list(set(all_col_dft_70) - set(['SK_ID_CURR', 'TARGET'] + num_cols))
# Bias-corrected Cramer's V between each categorical variable and the target
list_cor_cat = []
for cat_col in dft_cat_cols:
    contingency = pd.crosstab(dft_70['TARGET'], dft_70[cat_col])
    chi2 = ss.chi2_contingency(contingency)[0]
    n = contingency.sum().sum()
    phi2 = chi2 / n
    r, k = contingency.shape
    # Bias correction (Bergsma, 2013) for phi^2 and the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    v = np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
    list_cor_cat.append(v)
    print(cat_col + ':' + str(v))
##############################
# Correlation matrix for the left-join dataframe
# (numeric_only=True skips the object and merge-indicator columns)
f = dft.corr(numeric_only=True)
# Correlation matrix for the inner-join dataframe
g = dft_70.corr(numeric_only=True)
##########################
##END
##########################