From af26e5b5c60a041ac24fcca656efbed632ead18c Mon Sep 17 00:00:00 2001
From: lkook <lucasheinrich.kook@uzh.ch>
Date: Fri, 6 Oct 2023 16:34:57 +0200
Subject: [PATCH] add support2 application

---
 .gitignore             |   1 +
 inst/Makefile          |   5 +-
 inst/code/case-study.R | 165 +++++++++++++++++++++++++++++++++++++++++
 inst/data/README.md    |   2 +
 4 files changed, 172 insertions(+), 1 deletion(-)
 create mode 100644 inst/code/case-study.R
 create mode 100644 inst/data/README.md

diff --git a/.gitignore b/.gitignore
index 1e40415..af4cbe8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,4 +43,5 @@ inst/ignore
 inst/results*
 *.pdf
 *.sh
+*.sav
 
diff --git a/inst/Makefile b/inst/Makefile
index 796db58..e1fae6b 100644
--- a/inst/Makefile
+++ b/inst/Makefile
@@ -21,6 +21,9 @@ identifiability:
 invariance-example:
 	(cd .. && Rscript --vanilla inst/code/invariance-example.R)
 
+case-study:
+	(cd .. && R CMD BATCH --vanilla inst/code/case-study.R)
+
 sim-correct:
 	(cd .. && Rscript --vanilla inst/code/run-simulation.R 1 10 10 100 0 $(ROW))
 
@@ -59,7 +62,7 @@ vis-wald:
 vis-larger:
 	(cd .. && Rscript --vanilla inst/code/vis-simulation.R 6)
 
-all: intro faithfulness gcm-smooth gcm-penalized identifiability invariance-example
+all: intro faithfulness gcm-smooth gcm-penalized identifiability invariance-example case-study
 
 vis: vis-main vis-app vis-hidden vis-link vis-larger
 
diff --git a/inst/code/case-study.R b/inst/code/case-study.R
new file mode 100644
index 0000000..ff29bf3
--- /dev/null
+++ b/inst/code/case-study.R
@@ -0,0 +1,165 @@
+# Case study: SUPPORT2 data
+# LK 2023
+
+# Dependencies ------------------------------------------------------------
+
+library("tramicp")
+library("tidyverse")
+library("forcats")
+library("survival")
+library("tram")
+
+bpath <- file.path("inst", "results", "case-study")
+
+# Options -----------------------------------------------------------------
+
+### Don't fit/test interactions for TRAMICP-Wald
+options("wald_test_interactions" = FALSE)
+
+# Load data ---------------------------------------------------------------
+
+load("inst/data/support2.sav")
+
+### Preprocessing
+nrow(support2)
+dat <- support2 |>
+  mutate(surv = Surv(d.time, death)) |>
+  select(all_of(c("surv", "num.co", "sex", "race", "scoma", "dzgroup", "ca",
+                  "diabetes", "dementia", "age"))) %>%
+  mutate(num.co = fct_lump_min(factor(num.co), 100),
+         scoma = factor(scoma), age = sqrt(age)) %>%
+  mutate_if(is.factor, ~ factor(.x, levels = levels(.x),
+                                labels = make.names(levels(.x)),
+                                ordered = FALSE)) %>%
+  na.omit()
+nrow(dat) # 43 patients removed due to missingness
+
+# The set of all predictors is not invariant ------------------------------
+
+summary(coxph(surv ~ scoma + dzgroup + ca + sex + race + num.co +
+                age + diabetes + dementia, data = dat))
+
+# Evidence of age and cancer being direct causes of time-to-death ---------
+
+fm <- surv ~ scoma + dzgroup + ca + age + diabetes + dementia + sex + race
+fe <- ~ num.co
+if (file.exists(file.path(bpath, "gcm.rds"))) {
+  gcm <- readRDS(file.path(bpath, "gcm.rds"))
+} else {
+  gcm <- coxphICP(formula = fm, data = dat, env = fe, test = "gcm.test")
+  saveRDS(gcm, file.path(bpath, "gcm.rds"))
+}
+gcm
+(wald <- coxphICP(formula = fm, data = dat, env = fe, type = "wald"))
+
+# Multiple environments ---------------------------------------------------
+
+tfe <- ~ num.co + race
+tfm <- surv ~ scoma + dzgroup + ca + age + diabetes + dementia + sex
+if (file.exists(file.path(bpath, "tgcm.rds"))) {
+  tgcm <- readRDS(file.path(bpath, "tgcm.rds"))
+} else {
+  tgcm <- coxphICP(formula = tfm, data = dat, env = tfe, test = "gcm.test")
+  saveRDS(tgcm, file.path(bpath, "tgcm.rds"))
+}
+tgcm
+(twald <- coxphICP(formula = tfm, data = dat, env = tfe, type = "wald"))
+
+
+# Incorporating prior knowledge about direct causes -----------------------
+
+ffm <- surv ~ scoma + dzgroup + ca + sex + race
+fmand <- ~ age + diabetes + dementia
+if (file.exists(file.path(bpath, "fgcm.rds"))) {
+  fgcm <- readRDS(file.path(bpath, "fgcm.rds"))
+} else {
+  fgcm <- coxphICP(formula = ffm, data = dat, env = fe, test = "gcm.test",
+                         mandatory = fmand)
+  saveRDS(fgcm, file.path(bpath, "fgcm.rds"))
+}
+fgcm
+(fwald <- coxphICP(formula = ffm, data = dat, env = fe, type = "wald",
+                   mandatory = fmand))
+
+# Create Table ------------------------------------------------------------
+
+tab <- lapply(list(gcm, wald, tgcm, twald, fgcm, fwald),
+              \(mod) {
+                tmp <- pvalues(gcm)
+                tmp[] <- NA
+                pvals <- pvalues(mod)
+                tmp[names(pvals)] <- pvals
+                tmp
+              }) |> bind_rows()
+tab
+knitr::kable(tab, booktabs = TRUE, format = "latex", digits = 3L)
+
+# Create Figure -----------------------------------------------------------
+
+pdat <- data.frame(set = names(pvalues(gcm, "set")),
+                   gcm = pvalues(gcm, "set"),
+                   wald = pvalues(wald, "set")) |>
+  mutate(caage = grepl("age", set) & grepl("ca", set),
+         invariant = gcm > 0.05 | wald > 0.05)
+
+p1 <- ggplot(pdat, aes(x = gcm, y = wald, shape = caage)) +
+  geom_point(size = 2, alpha = 0.8) +
+  annotate("rect", xmin = 0, xmax = 0.05, ymin = 0, ymax = 1, fill = "darkblue", alpha = 0.1) +
+  annotate("rect", ymin = 0, ymax = 0.05, xmin = 0, xmax = 1, fill = "darkred", alpha = 0.1) +
+  theme_bw() +
+  labs(x = "TRAM-GCM p-value", y = "TRAM-Wald p-value", shape = "Set contains both ca and age") +
+  scale_shape_manual(values = c("TRUE" = 1, "FALSE" = 4),
+                     labels = c("TRUE" = "yes", "FALSE" = "no")) +
+  theme(legend.position = "top", text = element_text(size = 13.5)) +
+  geom_abline(intercept = 0, slope = 1, lty = 2) +
+  coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 0.25))
+
+p2 <- ggplot(pdat, aes(x = gcm, y = wald, shape = caage)) +
+  geom_point(size = 2, alpha = 0.4) +
+  theme_bw() +
+  annotate("rect", xmin = 0, xmax = 0.05, ymin = 0, ymax = 1, fill = "darkblue", alpha = 0.1) +
+  annotate("rect", ymin = 0, ymax = 0.05, xmin = 0, xmax = 1, fill = "darkred", alpha = 0.1) +
+  labs(x = "TRAM-GCM p-value", y = "TRAM-Wald p-value", shape = "Set contains both ca and age") +
+  scale_shape_manual(values = c("TRUE" = 1, "FALSE" = 4),
+                     labels = c("TRUE" = "yes", "FALSE" = "no")) +
+  theme(legend.position = "top", text = element_text(size = 12.5)) +
+  geom_abline(intercept = 0, slope = 1, lty = 2) +
+  scale_x_log10() + scale_y_log10()
+
+library("ggpubr")
+ggarrange(p1, p2, common.legend = TRUE)
+ggsave("inst/figures/casestudy.pdf", height = 4, width = 7.5)
+
+# Appendix: Informative censoring -----------------------------------------
+
+### Informative right-censoring in random fractions
+if (!file.exists(file.path(bpath, "censoring.csv"))) {
+  probs <- seq(1, 9, 1)/10
+  eventobs <- which(dat$surv[, 2] == 1)
+  nsim <- 10
+  set.seed(241068)
+  out <- lapply(seq_along(probs), \(idx) {
+    lapply(1:nsim, \(iter) {
+      dat$nd <- dat$surv[, 2]
+      to_censor <- sample(eventobs, floor(probs[idx] * length(eventobs)))
+      dat$nd[to_censor] <- 0
+      dat$nsurv <- Surv(dat$surv[, 1], dat$nd)
+      cfm <- nsurv ~ scoma + dzgroup + ca + age + diabetes + dementia + sex + race
+      cat(paste0("\nTotal fraction of censored observations is ", round(mean(dat$nd == 0) * 100, 2), "%\n"))
+      cicp <- coxphICP(formula = cfm, data = dat, env = fe, type = "wald")
+      tibble(frac = probs[idx], output = paste0(cicp$cand, collapse = "+"),
+             bind_rows(pvalues(cicp, "set")))
+    }) |> bind_rows()
+  }) |> bind_rows()
+  write_csv(out, file.path(bpath, "censoring.csv"))
+} else {
+  out <- read_csv(file.path(bpath, "censoring.csv"))
+}
+
+### Generate tables
+o2 <- out |> group_by(frac) |> count(output) |>
+  pivot_wider(names_from = output, values_from = n, values_fill = 0)
+o2
+
+o2 |> select(frac, Empty, ca, `ca+age`) |>
+  knitr::kable(booktabs = TRUE, format = "latex", digits = 3L)
diff --git a/inst/data/README.md b/inst/data/README.md
new file mode 100644
index 0000000..0428218
--- /dev/null
+++ b/inst/data/README.md
@@ -0,0 +1,2 @@
+The SUPPORT2 dataset `support2.sav` can be downloaded
+[at this link](https://hbiostat.org/data/repo/support2.sav).