Adapt missing data sims (#443)

openpharma · Jun 3, 2024 · 7c478d1 · 7c478d1
1 parent 239fe10
commit 7c478d1
Show file tree

Hide file tree

Showing 36 changed files with 1,676 additions and 311 deletions.
diff --git a/.gitignore b/.gitignore
@@ -37,7 +37,6 @@ src/*.so
 src/*.dll
 testthat-problems.rds
 README.md
-**/sascfg_personal.py
 .Rprofile
 /doc/
 /Meta/

diff --git a/simulations/missing-data-benchmarks/.gitignore b/simulations/missing-data-benchmarks/.gitignore
@@ -1,2 +1,3 @@
 **/fit_results.[Rr]ds
 slurm.out
+.future
diff --git a/simulations/missing-data-benchmarks/R/customizations/experiment.R b/simulations/missing-data-benchmarks/R/customizations/experiment.R
@@ -0,0 +1,108 @@
+stopifnot(any(grepl("simChef", search())))
+
+# Custom new method which only kicks off the future jobs.
+# This is modified from the fit() method, see
+# https://github.com/Yu-Group/simChef/blob/main/R/experiment.R#L976C3-L980C62
+Experiment$set(
+  "public",
+  "onlystart",
+  function(n_reps = 1,
+           future.globals = NULL,
+           future.packages = NULL,
+           future.seed = TRUE,
+           verbose = 1, ...) {
+    parallel_strategy <- "reps"
+
+    dgp_list <- private$.get_obj_list("dgp")
+    method_list <- private$.get_obj_list("method")
+
+    if (length(dgp_list) == 0) {
+      private$.throw_empty_list_error("dgp", "generate data from")
+    }
+
+    if (length(method_list) == 0) {
+      private$.throw_empty_list_error("method", "fit methods in")
+    }
+
+    private$.update_fit_params()
+
+    checkpoint <- FALSE
+    n_reps_cached <- 0
+    n_reps_total <- n_reps
+    fit_results <- data.frame()
+
+    if (verbose >= 1) {
+      inform(sprintf("Fitting %s...", self$name))
+      start_time <- Sys.time()
+    }
+
+    if (is.null(future.packages)) {
+      future.packages <- private$.future.packages
+    }
+
+    if (is.null(future.globals)) {
+      future.globals <- private$.future.globals
+    }
+
+    dgp_params_list <- private$.combine_vary_params("dgp")
+    method_params_list <- private$.combine_vary_params("method")
+
+    # if new_fit_params is not NULL after the if statement below, then not all
+    # combos of (dgp_params_list, method_params_list) need to be rerun so need
+    # to check cache ids when fitting
+    new_fit_params <- NULL
+
+    duplicate_param_names <- private$.get_duplicate_param_names()
+
+    # simulations
+    n_reps <- min(n_reps, n_reps_total - n_reps_cached)
+
+    new_fit_results <- local({
+
+      # create an env with objs/funcs that the future workers need
+      workenv <- rlang::new_environment(
+        data = list(
+          verbose = verbose,
+          dgp_list = dgp_list,
+          method_list = method_list,
+          new_fit_params = new_fit_params,
+          dgp_params_list = dgp_params_list,
+          method_params_list = method_params_list,
+          duplicate_param_names = duplicate_param_names,
+          do_call_wrapper = function(name,
+                                     fun,
+                                     params,
+                                     verbose,
+                                     call) {
+            tryCatch(
+              do_call_handler(
+                name, fun, params, verbose, call
+              ),
+              error = identity
+            )
+          }
+        ),
+        parent = rlang::ns_env()
+      )
+
+      # get the experiment compute fun
+      compute_fun <- compute_rep
+
+      environment(compute_fun) <- workenv
+
+      # compute the experiment
+      compute_fun(n_reps,
+                  future.globals,
+                  future.packages,
+                  future.seed)
+    })
+
+    gc()
+
+    new_fit_results %>%
+      dplyr::mutate(
+        .rep = as.character(as.numeric(.rep) + n_reps_cached)
+      ) %>%
+      simplify_tibble()
+  }
+)
diff --git a/simulations/missing-data-benchmarks/R/customizations/format-fit-results.R b/simulations/missing-data-benchmarks/R/customizations/format-fit-results.R
@@ -0,0 +1,44 @@
+# format the fit results
+format_fit_results <- function(fit_results) {
+  fit_results %>%
+    transmute(
+      effect_size = str_extract(.dgp_name, "^([^_])+"),
+      rep = .rep,
+      dgp_name = .dgp_name,
+      method_name = .method_name,
+      converged = pmap(
+        .l = list(fit, .method_name, converged),
+        .f = function(f, method_name, conv_status) {
+          get_convergence(method_name, f, conv_status)
+        }
+      ),
+      converged = unlist(converged),
+      fit_time = fit_time,
+      emmeans_output = pmap(
+        .l = list(fit, .dgp_name, .method_name, data, converged),
+        .f = function(f, dgp_name, method_name, dt, conv_status) {
+          get_emmeans_output(method_name, f, dt, conv_status)
+        }
+      ),
+      covmat_estimates = pmap(
+        .l = list(fit, .method_name, converged),
+        .f = function(f, method_name, conv_status) {
+          get_cov_mat_estimate(method_name, f, conv_status)
+        }
+      )
+    )
+}
+
+# The whole process
+format_fit_and_save <- function(experiment) {
+  gc()
+  fit_results <- experiment$get_cached_results("fit")
+  formatted_fit_results <- format_fit_results(fit_results)
+  formatted_fit_results_file <- file.path(experiment$get_save_dir(), "formatted_fit_results.rds")
+  saveRDS(formatted_fit_results, file = formatted_fit_results_file)
+  formatted_fit_results
+}
+
+
+
+
diff --git a/simulations/missing-data-benchmarks/R/eval/helpers.R b/simulations/missing-data-benchmarks/R/eval/helpers.R
@@ -242,19 +242,26 @@ format_emmeans_df <- function(emmeans_df) {
 
 # extract model fits output at each visit from mmrm fit
 get_mmrm_emmeans_output <- function(fit) {
-  # extract emmeans output
-  marginal_means <- emmeans(
-    fit,
-    spec = trt.vs.ctrl ~ trt | visit_num,
-    weights = "proportional"
-  )
-  emmeans_df <- as.data.frame(marginal_means$contrasts)
+  result <- try({
+    # extract emmeans output
+    marginal_means <- emmeans(
+      fit,
+      spec = trt.vs.ctrl ~ trt | visit_num,
+      weights = "proportional"
+    )
+    emmeans_df <- as.data.frame(marginal_means$contrasts)
 
-  # compute lower and upper 95% CI
-  emmeans_df <- get_95_ci(emmeans_df)
+    # compute lower and upper 95% CI
+    emmeans_df <- get_95_ci(emmeans_df)
 
-  # format to resemble SAS output
-  format_emmeans_df(emmeans_df)
+    # format to resemble SAS output
+    format_emmeans_df(emmeans_df)
+  })
+  if (!inherits(result, "try-error")) {
+    result
+  } else {
+    NA
+  }
 }
 
 # extract model fit outputs at each visit from glmmTMB fit
@@ -323,3 +330,49 @@ get_emmeans_output <- function(method, fit, dt, converged) {
     NA
   }
 }
+
+# get_cov_mat_estimate ----
+
+# extract covariance matrix estimate from mmrm fit
+get_mmrm_cov_mat_estimate <- function(fit) {
+  mmrm::VarCorr(fit)
+}
+
+# extract covariance matrix estimate from glmmTMB fit
+get_glmm_cov_mat_estimate <- function(fit) {
+  mat_with_attrs <- glmmTMB::VarCorr(fit)[[c("cond", "participant")]]
+  dim <- nrow(mat_with_attrs)
+  ind <- seq_len(dim)
+  mat_with_attrs[ind, ind]
+}
+
+# extract covariance matrix estimate from nlme fit
+get_nlme_cov_mat_estimate <- function(fit) {
+  mat_with_attrs <- nlme::getVarCov(fit)
+  dim <- nrow(mat_with_attrs)
+  ind <- seq_len(dim)
+  mat_with_attrs[ind, ind]
+}
+
+# extract covariance matrix estimate from proc mixed fit
+get_mixed_cov_mat_estimate <- function(fit) {
+  attr(fit, "covmat")
+}
+
+# general function for covariance matrix estimate
+get_cov_mat_estimate <- function(method, fit, converged) {
+  if (get_convergence(method, fit, converged)) {
+    if (str_detect(method, "mmrm")) {
+      get_mmrm_cov_mat_estimate(fit)
+    } else if (str_detect(method, "glmmtmb")) {
+      get_glmm_cov_mat_estimate(fit)
+    } else if (str_detect(method, "nlme")) {
+      get_nlme_cov_mat_estimate(fit)
+    } else if (str_detect(method, "proc_mixed")) {
+      get_mixed_cov_mat_estimate(fit)
+    }
+  } else {
+    NA
+  }
+}
+
diff --git a/simulations/missing-data-benchmarks/R/format-replicate-results/format-results.R b/simulations/missing-data-benchmarks/R/format-replicate-results/format-results.R
@@ -9,12 +9,26 @@ library(tidyr)
 library(emmeans)
 
 # load the required helper functions
-source("R/format-replicate-results/helpers.R")
+# This is just an example how to use these functions.
+
+source("R/customizations/format-fit-results.R")
 source("R/eval/helpers.R")
 
 # load a fit_results.rds file
-fit_results <- readRDS("R/format-replicate-results/fit_results_100.rds")
+fit_results <- readRDS("R/format-replicate-results/fit_results_test.rds")
 
 # format the results
-formatted_fit_results_df <- fit_results %>%
-  format_fit_results(missingness = "none", sample_size = 200)
+formatted_fit_results_df <- format_fit_results(fit_results)
+saveRDS(formatted_fit_results_df, file = "R/format-replicate-results/df_test.rds")
+
+# formatted_fit_results_df <- readRDS("df_test.rds")
+head(formatted_fit_results_df)
+
+# to obtain the estimated covariance matrices:
+formatted_fit_results_df$covmat_estimates[1:2]
+
+# to obtain the emmeans by visit:
+formatted_fit_results_df |>
+  select(- covmat_estimates, - fit_time, - converged) |>
+  unnest(cols = emmeans_output)
+