-
Notifications
You must be signed in to change notification settings - Fork 5
/
compute_model_12_24.Rmd
154 lines (146 loc) · 4.79 KB
/
compute_model_12_24.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
---
title: "R Notebook"
params:
  actual_date: '2017-07-01'
output:
  html_document: default
  html_notebook: default
---
```{r setup}
library("opensignauxfaibles")
library("dplyr")
# Open the database connection and fetch the "wholesample" table through the
# project helpers.
database_signauxfaibles <- database_connect()
table_wholesample <- collect_wholesample(
  db = database_signauxfaibles,
  table = "wholesample"
)

# One sample per value of `periode`: two fixed dates, plus the date supplied
# by the notebook parameter `actual_date`.
sample_train <- dplyr::filter(table_wholesample, periode == "2014-01-01")
sample_test <- dplyr::filter(table_wholesample, periode == "2015-01-01")
sample_actual <- dplyr::filter(
  table_wholesample,
  periode == params$actual_date
)
```
```{r check-na}
# Run the project's NA check on the full sample and print its result.
# NOTE(review): presumably reports missing values per column; confirm against
# the definition of detect_na().
detect_na(table_wholesample)
```
```{r formulas}
# Nested model specifications for the `outcome_12_24` response.
#
# Every model contains all predictors of the previous one plus the terms
# listed under its own name below. The additions are declared once and
# accumulated with update(), instead of re-typing the full right-hand side
# for each model.
added_terms_12_24 <- list(
  "growth_effectif" = . ~ . + cut_growthrate + lag_effectif_missing,
  "apart" = . ~ . + apart_last12_months + apart_consommee +
    apart_share_heuresconsommees,
  "cotisation_effectif" = . ~ . + log_cotisationdue_effectif,
  "dettecumulee" = . ~ . + log_ratio_dettecumulee_cotisation +
    indicatrice_dettecumulee,
  "croissancedettecumulee" = . ~ . + indicatrice_croissance_dettecumulee,
  "nb_debits" = . ~ . + nb_debits,
  "delais" = . ~ . + delai + delai_sup_6mois,
  "codenaf" = . ~ . + libelle_naf_niveau1
)

# Start from the smallest model and apply each addition in turn, keeping every
# intermediate formula. The result is a list of formulas: one per model, in
# increasing order of size.
formulas_12_24 <- Reduce(
  f = function(previous, extra) update(previous, extra),
  x = added_terms_12_24,
  init = outcome_12_24 ~ cut_effectif,
  accumulate = TRUE
)
names(formulas_12_24) <- c("effectif", names(added_terms_12_24))
```
```{r compare-auc}
# For every formula: fit a binomial GLM on the training sample, predict
# response-scale values on the test sample, and report the AUC of the
# resulting ROC curve. ldply() stacks the per-formula results into one
# data frame.
plyr::ldply(
  .data = formulas_12_24,
  .fun = function(model_formula) {
    fitted_model <- glm(
      formula = model_formula,
      family = "binomial",
      data = sample_train
    )
    scored_test <- broom::augment(
      fitted_model,
      newdata = sample_test,
      type.predict = "response"
    )
    roc_curve <- pROC::roc(
      outcome_12_24 ~ .fitted,
      data = scored_test,
      smooth = FALSE
    )
    tibble::as_tibble(roc_curve$auc)
  }
)
```
```{r compute-prediction}
# Fit the "dettecumulee" specification on the training sample, predict on the
# `sample_actual` rows (response scale), and expose the fitted values under
# the name `prediction_12_24`.
output_prediction_12_24 <- dplyr::rename(
  broom::augment(
    glm(
      formula = formulas_12_24$dettecumulee,
      data = sample_train,
      family = "binomial"
    ),
    newdata = sample_actual,
    type.predict = "response"
  ),
  prediction_12_24 = .fitted
)
```
```{r export-top100}
# Export the 100 rows with the highest `prediction_12_24` to a CSV file.
#
# Rows whose `numero_compte` appears in the result of
# compute_filter_proccollectives() or compute_filter_ccsv() (both evaluated
# for "2017-08-01") are removed first. `copy = TRUE` lets the anti-joins run
# even if those results live in the database rather than in memory. Rows
# without a prediction are dropped before the top 100 are taken.
output_prediction_12_24 %>%
  dplyr::anti_join(
    y = compute_filter_proccollectives(
      db = database_signauxfaibles,
      .date = "2017-08-01"
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::anti_join(
    y = compute_filter_ccsv(
      db = database_signauxfaibles,
      .date = "2017-08-01"
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::arrange(dplyr::desc(prediction_12_24)) %>%
  dplyr::filter(!is.na(prediction_12_24)) %>%
  dplyr::select(
    raison_sociale, siren, siret,
    libelle_naf_niveau1, code_departement, siege,
    prediction_12_24,
    effectif,
    apart_last12_months, apart_consommee, apart_share_heuresconsommees,
    nb_debits, delai, delai_sup_6mois
  ) %>%
  dplyr::slice(seq_len(100)) %>%
  # The destination is passed positionally: readr renamed this argument from
  # `path` to `file`, so naming it would tie the chunk to one side of the
  # rename (error on old versions, deprecation warning on new ones).
  readr::write_csv("data/table_predictions_12_24_2017_09_18.csv")
```
```{r}
# Export every row that has a prediction, sorted by decreasing
# `prediction_12_24`, to a CSV file.
#
# Rows whose `numero_compte` appears in the result of
# compute_filter_proccollectives() or compute_filter_ccsv() (both evaluated
# for "2017-08-01") are removed first. `copy = TRUE` lets the anti-joins run
# even if those results live in the database rather than in memory.
output_prediction_12_24 %>%
  dplyr::anti_join(
    y = compute_filter_proccollectives(
      db = database_signauxfaibles,
      .date = "2017-08-01"
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::anti_join(
    y = compute_filter_ccsv(
      db = database_signauxfaibles,
      .date = "2017-08-01"
    ),
    by = "numero_compte",
    copy = TRUE
  ) %>%
  dplyr::arrange(dplyr::desc(prediction_12_24)) %>%
  dplyr::filter(!is.na(prediction_12_24)) %>%
  # The destination is passed positionally: readr renamed this argument from
  # `path` to `file`, so naming it would tie the chunk to one side of the
  # rename (error on old versions, deprecation warning on new ones).
  readr::write_csv("data/table_predictions_12_24_2017_09_18_all.csv")
```