Merge pull request #6 from HanzhangRen/master

Hand-pick 62 variables
citp · Apr 21, 2024 · 2d32099 · 2d32099
2 parents eceb143 + f538452
commit 2d32099
Show file tree

Hide file tree

Showing 4 changed files with 78 additions and 12 deletions.
diff --git a/description.md b/description.md
@@ -1,3 +1,3 @@
 # Description of submission
 
-Univariate logistic regression.
+Logistic regression with 62 hand-picked variables.
diff --git a/model.rds b/model.rds
diff --git a/submission.R b/submission.R
@@ -29,17 +29,83 @@ clean_df <- function(df, background_df = NULL){
   # Returns:
   # data frame: The cleaned dataframe with only the necessary columns and processed variables.
 
-  ## This script contains a bare minimum working example
-  # Create new age variable
-  df$age <- 2024 - df$birthyear_bg
-
   # Selecting variables for modelling
 
-  keepcols = c('nomem_encr', # ID variable required for predictions,
-               'age')        # newly created variable
+  keepcols = c("nomem_encr", # ID variable required for predictions,
+               "outcome_available", # Is there an outcome to predict?
+               "cf20m024",
+               "cf20m025",
+               "cf20m030",
+               "cf20m031",
+               "cf20m129",
+               "cf20m130",
+               "cf20m166",
+               "cf20m454",
+               "cf20m455",
+               "cf20m513",
+               "cf20m514",
+               "cf20m515",
+               "cf20m516",
+               "cf20m517",
+               "cf20m518",
+               "cf20m519",
+               "cf20m520",
+               "cf20m521",
+               "ca20g012",
+               "ca20g013",
+               "cv20l109",
+               "cv20l110",
+               "cv20l111",
+               "cv20l112",
+               "cv20l113",
+               "cv20l114",
+               "cv20l115",
+               "cv20l124",
+               "cv20l125",
+               "cv20l126",
+               "cv20l127",
+               "cv20l128",
+               "cv20l129",
+               "cv20l130",
+               "cv10c135",
+               "cv10c136",
+               "cv10c137",
+               "cv10c138",
+               "cv20l143",
+               "cv20l144",
+               "cv20l145",
+               "cv20l151",
+               "cv20l152",
+               "cv20l153",
+               "cv20l154",
+               "cr18k101",
+               "cr18k102",
+               "cr18k103",
+               "cr18k104",
+               "cr18k105",
+               "cr20m162",
+               "cw20m440",
+               "cw20m529",
+               "cw20m587",
+               "cw20m592",
+               "birthyear_bg",
+               "gender_bg",
+               "migration_background_bg",
+               "nettohh_f_2020",
+               "oplmet_2020",
+               "sted_2020",
+               "woning_2020")
 
   ## Keeping data with variables selected
   df <- df[ , keepcols ]
+
+  df <- filter(df, outcome_available == 1) %>% 
+    # Mean impute everything
+    mutate_all(as.numeric) %>% 
+    recipe(. ~ ., .) %>%
+    step_impute_mean(everything()) %>%
+    prep() %>%
+    bake(new_data = NULL)
 
   return(df)
 }
@@ -64,7 +130,7 @@ predict_outcomes <- function(df, background_df = NULL, model_path = "./model.rds
   # Returns:
   # dataframe: A dataframe containing the identifiers and their corresponding predictions.
 
-  ## This script contains a bare minimum working example
+  # Test for presence of nomem_encr
   if( !("nomem_encr" %in% colnames(df)) ) {
     warning("The identifier variable 'nomem_encr' should be in the dataset")
   }

diff --git a/training.R b/training.R
@@ -18,11 +18,11 @@ train_save_model <- function(cleaned_df, outcome_df) {
   # cleaned_df (dataframe): The cleaned data from clean_df function to be used for training the model.
   # outcome_df (dataframe): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv).
 
-  ## This script contains a bare minimum working example
-  set.seed(1) # not useful here because logistic regression deterministic
+  set.seed(1)
 
   # Combine cleaned_df and outcome_df
   model_df <- merge(cleaned_df, outcome_df, by = "nomem_encr") %>%
+    select(-nomem_encr, -outcome_available) %>% 
     mutate(new_child = factor(new_child))
 
   # Logistic regression model
@@ -41,10 +41,10 @@ train_save_model <- function(cleaned_df, outcome_df) {
   #     truth = new_child, estimate = .pred_class,
   #     event_level = "second"
   #   )) %>%
-  #   map(~ .x$.estimate) %>%
+  #   map_dbl(~ .x$.estimate) %>%
   #   mean()
 
-  model <- fit(model_to_fit, new_child ~ age, model_df)
+  model <- fit(model_to_fit, new_child ~ ., model_df)
 
   # Save the model
   saveRDS(model, "model.rds")