-
Notifications
You must be signed in to change notification settings - Fork 5
/
compute_model_0_12.Rmd
163 lines (154 loc) · 4.89 KB
/
compute_model_0_12.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
---
title: "R Notebook"
params:
  actual_date: '2017-10-01'
output:
  html_document: default
  html_notebook: default
---
```{r}
# Print the `actual_date` report parameter so the knitted output records which
# reference date this run was rendered with.
params[["actual_date"]]
```
```{r setup}
library("opensignauxfaibles")
library("dplyr")
# Connect to the database and load the "wholesample" table through the
# project helpers (defined outside this file).
database_signauxfaibles <- database_connect()
table_wholesample <- collect_wholesample(
  db = database_signauxfaibles,
  table = "wholesample"
)

# Three snapshots of the same table, all restricted to effectif >= 10:
#   - sample_train : periode 2015-01-01, used to fit the models
#   - sample_test  : periode 2016-01-01, used to compare the models
#   - sample_actual: periode given by the `actual_date` report parameter,
#                    used to produce the exported predictions
sample_train <- dplyr::filter(
  table_wholesample,
  periode == "2015-01-01",
  effectif >= 10
)

sample_test <- dplyr::filter(
  table_wholesample,
  periode == "2016-01-01",
  effectif >= 10
)

sample_actual <- dplyr::filter(
  table_wholesample,
  periode == params$actual_date,
  effectif >= 10
)
```
```{r check-na}
# Run the NA diagnostic helper on the full table and print its result.
# NOTE(review): detect_na() is not defined in this file — presumably it comes
# from opensignauxfaibles; confirm what it reports.
detect_na(table_wholesample)
```
```{r formulas}
# Candidate specifications for `outcome_0_12`, from the simplest to the
# richest. Most entries extend the previous one with a new group of
# regressors; the list names are used as model labels when the models are
# compared.
#
# NOTE(review): `dettecumulee` is the only specification using the `_12m`
# debt variables; every later specification uses the unsuffixed ones instead
# of extending it. Confirm this is intentional.
formulas_0_12 <- list(
  # Headcount class only.
  effectif = outcome_0_12 ~ cut_effectif,

  # + headcount growth.
  growth_effectif = outcome_0_12 ~
    cut_effectif + cut_growthrate + lag_effectif_missing,

  # + "apart" variables.
  apart = outcome_0_12 ~
    cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees,

  # + contributions due.
  cotisation_effectif = outcome_0_12 ~
    cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif,

  # + cumulated debt (12-month variants).
  dettecumulee = outcome_0_12 ~
    cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation_12m + indicatrice_dettecumulee_12m,

  # Cumulated debt (unsuffixed variants) + debt-growth indicator.
  croissancedettecumulee = outcome_0_12 ~
    cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee,

  # + number of debits.
  nb_debits = outcome_0_12 ~
    cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits,

  # + payment delays.
  delais = outcome_0_12 ~
    cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits +
    delai + delai_sup_6mois,

  # + NAF level-1 label.
  codenaf = outcome_0_12 ~
    cut_effectif + cut_growthrate + lag_effectif_missing +
    apart_last12_months + apart_consommee + apart_share_heuresconsommees +
    log_cotisationdue_effectif +
    log_ratio_dettecumulee_cotisation + indicatrice_dettecumulee +
    indicatrice_croissance_dettecumulee +
    nb_debits +
    delai + delai_sup_6mois +
    libelle_naf_niveau1
)
```
```{r compare-auc}
# Compare the candidate specifications out of sample: each formula is fitted
# as a binomial GLM on `sample_train`, scored on `sample_test`, and summarised
# by the AUC of the resulting ROC curve. `plyr::ldply()` stacks the
# per-formula results into a single data frame.
plyr::ldply(
  .data = formulas_0_12,
  .fun = function(model_formula) {
    fitted_model <- glm(
      formula = model_formula,
      family = "binomial",
      data = sample_train
    )

    # Predicted probabilities (type.predict = "response") on the test
    # sample, fed to an unsmoothed ROC curve of outcome vs. prediction.
    roc_curve <- fitted_model %>%
      broom::augment(newdata = sample_test, type.predict = "response") %>%
      pROC::roc(outcome_0_12 ~ .fitted, data = ., smooth = FALSE)

    tibble::as_tibble(roc_curve$auc)
  }
)
```
```{r compute-prediction}
# Fit the `dettecumulee` specification on the training sample.
model_0_12 <- glm(
  formula = formulas_0_12$dettecumulee,
  data = sample_train,
  family = "binomial"
)

# Score the sample for the report's reference date; the predicted probability
# (augment()'s `.fitted` column) is exposed as `prediction_0_12`.
output_prediction_0_12 <- model_0_12 %>%
  broom::augment(newdata = sample_actual, type.predict = "response") %>%
  dplyr::rename(prediction_0_12 = .fitted)
```
```{r export-top100}
# Export the 100 highest predictions for the report's reference date.
#
# The exclusion filters are evaluated at `params$actual_date`, the same date
# used to build `sample_actual`, so rendering the notebook with a different
# `actual_date` keeps predictions and exclusions aligned. With the default
# parameter value the result is unchanged.
output_prediction_0_12 %>%
  # Remove accounts returned by the two project filters.
  # NOTE(review): presumably accounts already in collective proceedings and
  # accounts followed by the CCSV — confirm against the helpers' definitions.
  dplyr::anti_join(
    y = compute_filter_proccollectives(
      db = database_signauxfaibles,
      .date = params$actual_date
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::anti_join(
    y = compute_filter_ccsv(
      db = database_signauxfaibles,
      .date = params$actual_date
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  # Keep only rows that received a prediction, highest prediction first.
  dplyr::filter(!is.na(prediction_0_12)) %>%
  dplyr::arrange(dplyr::desc(prediction_0_12)) %>%
  # Put the identifying and most informative columns first, then all others.
  dplyr::select(
    raison_sociale, siret,
    libelle_naf_niveau1,
    code_departement,
    prediction_0_12,
    effectif,
    apart_last12_months, apart_consommee, apart_share_heuresconsommees,
    nb_debits, delai, delai_sup_6mois, everything()
  ) %>%
  dplyr::slice(1:100) %>%
  # The output path is passed positionally: readr 1.4.0 deprecated `path =`
  # in favour of `file =`, and the positional form works with both.
  readr::write_csv("output/table_predictions_2017_11_29.csv")
```
```{r}
# Export every prediction for the report's reference date, highest first,
# with all columns.
#
# The exclusion filters are evaluated at `params$actual_date`, the same date
# used to build `sample_actual`, so rendering the notebook with a different
# `actual_date` keeps predictions and exclusions aligned. With the default
# parameter value the result is unchanged.
output_prediction_0_12 %>%
  # Remove accounts returned by the two project filters.
  # NOTE(review): presumably accounts already in collective proceedings and
  # accounts followed by the CCSV — confirm against the helpers' definitions.
  dplyr::anti_join(
    y = compute_filter_proccollectives(
      db = database_signauxfaibles,
      .date = params$actual_date
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::anti_join(
    y = compute_filter_ccsv(
      db = database_signauxfaibles,
      .date = params$actual_date
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  # Keep only rows that received a prediction, highest prediction first.
  dplyr::filter(!is.na(prediction_0_12)) %>%
  dplyr::arrange(dplyr::desc(prediction_0_12)) %>%
  # The output path is passed positionally: readr 1.4.0 deprecated `path =`
  # in favour of `file =`, and the positional form works with both.
  readr::write_csv("output/table_predictions_2017_11_29_all.csv")
```