From af26e5b5c60a041ac24fcca656efbed632ead18c Mon Sep 17 00:00:00 2001 From: lkook Date: Fri, 6 Oct 2023 16:34:57 +0200 Subject: [PATCH] add support2 application --- .gitignore | 1 + inst/Makefile | 5 +- inst/code/case-study.R | 165 +++++++++++++++++++++++++++++++++++++++++ inst/data/README.md | 2 + 4 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 inst/code/case-study.R create mode 100644 inst/data/README.md diff --git a/.gitignore b/.gitignore index 1e40415..af4cbe8 100644 --- a/.gitignore +++ b/.gitignore @@ -43,4 +43,5 @@ inst/ignore inst/results* *.pdf *.sh +*.sav diff --git a/inst/Makefile b/inst/Makefile index 796db58..e1fae6b 100644 --- a/inst/Makefile +++ b/inst/Makefile @@ -21,6 +21,9 @@ identifiability: invariance-example: (cd .. && Rscript --vanilla inst/code/invariance-example.R) +case-study: + (cd .. && R CMD BATCH --vanilla inst/code/case-study.R) + sim-correct: (cd .. && Rscript --vanilla inst/code/run-simulation.R 1 10 10 100 0 $(ROW)) @@ -59,7 +62,7 @@ vis-wald: vis-larger: (cd .. && Rscript --vanilla inst/code/vis-simulation.R 6) -all: intro faithfulness gcm-smooth gcm-penalized identifiability invariance-example +all: intro faithfulness gcm-smooth gcm-penalized identifiability invariance-example case-study vis: vis-main vis-app vis-hidden vis-link vis-larger diff --git a/inst/code/case-study.R b/inst/code/case-study.R new file mode 100644 index 0000000..ff29bf3 --- /dev/null +++ b/inst/code/case-study.R @@ -0,0 +1,165 @@ +# Case study: SUPPORT2 data +# LK 2023 + +# Dependencies ------------------------------------------------------------ + +library("tramicp") +library("tidyverse") +library("forcats") +library("survival") +library("tram") + +bpath <- file.path("inst", "results", "case-study") + +# Options ----------------------------------------------------------------- + +### Don't fit/test interactions for TRAMICP-Wald +options("wald_test_interactions" = FALSE) + +# Load data --------------------------------------------------------------- + +load("inst/data/support2.sav") + +### Preprocessing +nrow(support2) +dat <- support2 |> + mutate(surv = Surv(d.time, death)) |> + select(all_of(c("surv", "num.co", "sex", "race", "scoma", "dzgroup", "ca", + "diabetes", "dementia", "age"))) %>% + mutate(num.co = fct_lump_min(factor(num.co), 100), + scoma = factor(scoma), age = sqrt(age)) %>% + mutate_if(is.factor, ~ factor(.x, levels = levels(.x), + labels = make.names(levels(.x)), + ordered = FALSE)) %>% + na.omit() +nrow(dat) # 43 patients removed due to missingness + +# The set of all predictors is not invariant ------------------------------ + +summary(coxph(surv ~ scoma + dzgroup + ca + sex + race + num.co + + age + diabetes + dementia, data = dat)) + +# Evidence of age and cancer being direct causes of time-to-death --------- + +fm <- surv ~ scoma + dzgroup + ca + age + diabetes + dementia + sex + race +fe <- ~ num.co +if (file.exists(file.path(bpath, "gcm.rds"))) { + gcm <- readRDS(file.path(bpath, "gcm.rds")) +} else { + gcm <- coxphICP(formula = fm, data = dat, env = fe, test = "gcm.test") + saveRDS(gcm, file.path(bpath, "gcm.rds")) +} +gcm +(wald <- coxphICP(formula = fm, data = dat, env = fe, type = "wald")) + +# Multiple environments --------------------------------------------------- + +tfe <- ~ num.co + race +tfm <- surv ~ scoma + dzgroup + ca + age + diabetes + dementia + sex +if (file.exists(file.path(bpath, "tgcm.rds"))) { + tgcm <- readRDS(file.path(bpath, "tgcm.rds")) +} else { + tgcm <- coxphICP(formula = tfm, data = dat, env = tfe, test = "gcm.test") + saveRDS(tgcm, file.path(bpath, "tgcm.rds")) +} +tgcm +(twald <- coxphICP(formula = tfm, data = dat, env = tfe, type = "wald")) + + +# Incorporating prior knowledge about direct causes ----------------------- + +ffm <- surv ~ scoma + dzgroup + ca + sex + race +fmand <- ~ age + diabetes + dementia +if (file.exists(file.path(bpath, "fgcm.rds"))) { + fgcm <- readRDS(file.path(bpath, "fgcm.rds")) +} else { + fgcm <- coxphICP(formula = ffm, data = dat, env = fe, test = "gcm.test", + mandatory = fmand) + saveRDS(fgcm, file.path(bpath, "fgcm.rds")) +} +fgcm +(fwald <- coxphICP(formula = ffm, data = dat, env = fe, type = "wald", + mandatory = fmand)) + +# Create Table ------------------------------------------------------------ + +tab <- lapply(list(gcm, wald, tgcm, twald, fgcm, fwald), + \(mod) { + tmp <- pvalues(gcm) + tmp[] <- NA + pvals <- pvalues(mod) + tmp[names(pvals)] <- pvals + tmp + }) |> bind_rows() +tab +knitr::kable(tab, booktabs = TRUE, format = "latex", digits = 3L) + +# Create Figure ----------------------------------------------------------- + +pdat <- data.frame(set = names(pvalues(gcm, "set")), + gcm = pvalues(gcm, "set"), + wald = pvalues(wald, "set")) |> + mutate(caage = grepl("age", set) & grepl("ca", set), + invariant = gcm > 0.05 | wald > 0.05) + +p1 <- ggplot(pdat, aes(x = gcm, y = wald, shape = caage)) + + geom_point(size = 2, alpha = 0.8) + + annotate("rect", xmin = 0, xmax = 0.05, ymin = 0, ymax = 1, fill = "darkblue", alpha = 0.1) + + annotate("rect", ymin = 0, ymax = 0.05, xmin = 0, xmax = 1, fill = "darkred", alpha = 0.1) + + theme_bw() + + labs(x = "TRAM-GCM p-value", y = "TRAM-Wald p-value", shape = "Set contains both ca and age") + + scale_shape_manual(values = c("TRUE" = 1, "FALSE" = 4), + labels = c("TRUE" = "yes", "FALSE" = "no")) + + theme(legend.position = "top", text = element_text(size = 13.5)) + + geom_abline(intercept = 0, slope = 1, lty = 2) + + coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 0.25)) + +p2 <- ggplot(pdat, aes(x = gcm, y = wald, shape = caage)) + + geom_point(size = 2, alpha = 0.4) + + theme_bw() + + annotate("rect", xmin = 0, xmax = 0.05, ymin = 0, ymax = 1, fill = "darkblue", alpha = 0.1) + + annotate("rect", ymin = 0, ymax = 0.05, xmin = 0, xmax = 1, fill = "darkred", alpha = 0.1) + + labs(x = "TRAM-GCM p-value", y = "TRAM-Wald p-value", shape = "Set contains both ca and age") + + scale_shape_manual(values = c("TRUE" = 1, "FALSE" = 4), + labels = c("TRUE" = "yes", "FALSE" = "no")) + + theme(legend.position = "top", text = element_text(size = 12.5)) + + geom_abline(intercept = 0, slope = 1, lty = 2) + + scale_x_log10() + scale_y_log10() + +library("ggpubr") +ggarrange(p1, p2, common.legend = TRUE) +ggsave("inst/figures/casestudy.pdf", height = 4, width = 7.5) + +# Appendix: Informative censoring ----------------------------------------- + +### Informative right-censoring in random fractions +if (!file.exists(file.path(bpath, "censoring.csv"))) { + probs <- seq(1, 9, 1)/10 + eventobs <- which(dat$surv[, 2] == 1) + nsim <- 10 + set.seed(241068) + out <- lapply(seq_along(probs), \(idx) { + lapply(1:nsim, \(iter) { + dat$nd <- dat$surv[, 2] + to_censor <- sample(eventobs, floor(probs[idx] * length(eventobs))) + dat$nd[to_censor] <- 0 + dat$nsurv <- Surv(dat$surv[, 1], dat$nd) + cfm <- nsurv ~ scoma + dzgroup + ca + age + diabetes + dementia + sex + race + cat(paste0("\nTotal fraction of censored observations is ", round(mean(dat$nd == 0) * 100, 2), "%\n")) + cicp <- coxphICP(formula = cfm, data = dat, env = fe, type = "wald") + tibble(frac = probs[idx], output = paste0(cicp$cand, collapse = "+"), + bind_rows(pvalues(cicp, "set"))) + }) |> bind_rows() + }) |> bind_rows() + write_csv(out, file.path(bpath, "censoring.csv")) +} else { + out <- read_csv(file.path(bpath, "censoring.csv")) +} + +### Generate tables +o2 <- out |> group_by(frac) |> count(output) |> + pivot_wider(names_from = output, values_from = n, values_fill = 0) +o2 + +o2 |> select(frac, Empty, ca, `ca+age`) |> + knitr::kable(booktabs = TRUE, format = "latex", digits = 3L) diff --git a/inst/data/README.md b/inst/data/README.md new file mode 100644 index 0000000..0428218 --- /dev/null +++ b/inst/data/README.md @@ -0,0 +1,2 @@ +The SUPPORT2 dataset `support2.sav` can be downloaded +[at this link](https://hbiostat.org/data/repo/support2.sav).