Merge pull request #201 from jr-leary7/dev

Dev
jr-leary7 · Sep 21, 2024 · 233e72a · 233e72a
2 parents 880bf7f + aaef7b6
commit 233e72a
Show file tree

Hide file tree

Showing 6 changed files with 47 additions and 23 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,11 +1,13 @@
 # Changes in version 0.8.1
 
++ Added small-sample bias correction method to GEE sandwich variance-covariance matrix, results in smaller Wald test statistics. 
++ Parallelized `getResultsDE()` using `future` backend. 
 + Added a function called `chooseCandidateGenes()` to identify good genes for trajectory DE testing based on mean / SD expression and sparsity. 
 
 # Changes in version 0.8.0
 
 + Added implicit regularization of selected basis functions to the GLMM mode using a NB LASSO. 
-+ Switched candidate knot subsampling to a uniform sequence of candidate1 knots across pseudotime's support. 
++ Switched candidate knot subsampling to a uniform sequence of candidate knots across pseudotime's support. 
 
 # Changes in version 0.7.9
 

diff --git a/R/getResultsDE.R b/R/getResultsDE.R
@@ -11,6 +11,7 @@
 #' @param test.dyn.res The nested list returned by \code{\link{testDynamic}}. Defaults to NULL.
 #' @param p.adj.method (Optional) The method used to adjust \emph{p}-values for multiple hypothesis testing. Defaults to "holm".
 #' @param fdr.cutoff (Optional) The FDR threshold for determining statistical significance. Defaults to 0.01.
+#' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 2L.
 #' @return A data.frame containing differential expression results & test statistics for each gene.
 #' @export
 #' @seealso \code{\link{testDynamic}}
@@ -21,19 +22,22 @@
 
 getResultsDE <- function(test.dyn.res = NULL,
                          p.adj.method = "holm",
-                         fdr.cutoff = 0.01) {
+                         fdr.cutoff = 0.01, 
+                         n.cores = 2L) {
   # check inputs
   if (is.null(test.dyn.res)) { stop("Please provide a result list.") }
   if (!p.adj.method %in% stats::p.adjust.methods) { stop("Please choose a valid p-value adjustment method.") }
+  # set up parallel processing 
+  future::plan(future::multisession, workers = n.cores)
   # iterates first over genes, then over lineages per-gene & coerces to final data.frame after unlisting everything
-  result_df <- purrr::map_dfr(test.dyn.res,
-                              function(x) {
-                                purrr::map_dfr(x,
-                                               function(y) {
-                                                 as.data.frame(rbind(y[seq(13)])) %>%
-                                                   dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z))))
-                                               })
-                              }) %>%
+  result_df <- furrr::future_map_dfr(test.dyn.res,
+                                     function(x) {
+                                       purrr::map_dfr(x,
+                                                      function(y) {
+                                                        as.data.frame(rbind(y[seq(13)])) %>%
+                                                          dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z))))
+                                                      })
+                                     }) %>%
                dplyr::arrange(dplyr::desc(Test_Stat)) %>%
                dplyr::mutate(P_Val_Adj = stats::p.adjust(P_Val, method = p.adj.method),
                              Gene_Dynamic_Lineage = dplyr::if_else(P_Val_Adj < fdr.cutoff, 1, 0, missing = 0)) %>%
@@ -44,5 +48,7 @@ getResultsDE <- function(test.dyn.res = NULL,
                                Lineage,
                                Test_Stat,
                                P_Val)
+  # shut down parallel processing 
+  future::plan(future::sequential)
   return(result_df)
 }
diff --git a/R/pull_sumy.R b/R/pull_sumy.R
@@ -67,7 +67,7 @@ pull.marge.sumy <- function(mod.obj,
     res <- list(marge_pred_df = NA,
                 marge_sumy_df = NA,
                 ll_marge = NA_real_,
-                marge_fit_notes = NA_character_)
+                marge_fit_notes = mod.obj[1])
   } else {
     # pull model summary
     if (is.gee) {

diff --git a/R/testDynamic.R b/R/testDynamic.R
@@ -19,25 +19,27 @@
 #' @importFrom stats predict logLik deviance offset
 #' @importFrom geeM geem
 #' @importFrom glmmTMB glmmTMB nbinom2
-#' @param expr.mat Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL.
+#' @param expr.mat Either a \code{SingleCellExperiment}, \code{Seurat}, or \code{CellDataSet} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL.
 #' @param pt Either the output from \code{\link[slingshot]{SlingshotDataSet}} object from which pseudotime can be generated, or a data.frame containing the pseudotime or latent time estimates for each cell (can be multiple columns / lineages). Defaults to NULL.
 #' @param genes A character vector of genes to model. If not provided, defaults to all genes in \code{expr.mat}. Defaults to NULL.
 #' @param n.potential.basis.fns (Optional) The maximum number of possible basis functions. See the parameter \code{M} in \code{\link{marge2}}. Defaults to 5.
 #' @param size.factor.offset (Optional) An offset to be included in the final model fit. Can be generated easily with \code{\link{createCellOffset}}. Defaults to NULL.
 #' @param is.gee Should a GEE framework be used instead of the default GLM? Defaults to FALSE.
 #' @param cor.structure If the GEE framework is used, specifies the desired working correlation structure. Must be one of "ar1", "independence", or "exchangeable". Defaults to "ar1".
+#' @param gee.bias.correct Should a small-sample bias correction be use on the sandwich variance-covariance matrix prior to test statistic estimation? Defaults to TRUE. 
 #' @param id.vec If a GEE or GLMM framework is being used, a vector of subject IDs to use as input to \code{\link[geeM]{geem}} or \code{\link[glmmTMB]{glmmTMB}}. Defaults to NULL.
 #' @param is.glmm Should a GLMM framework be used instead of the default GLM? Defaults to FALSE.
 #' @param parallel.exec A boolean indicating whether a parallel \code{\link[foreach]{foreach}} loop should be used to generate results more quickly. Defaults to TRUE.
-#' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 2.
+#' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 4L.
 #' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
 #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to FALSE.
 #' @param verbose (Optional) A boolean indicating whether a progress bar should be printed to the console. Defaults to TRUE.
 #' @param random.seed (Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.
 #' @details
 #' \itemize{
-#' \item If \code{expr.mat} is a \code{Seurat} object, counts will be extracted from the output of \code{\link[SeuratObject]{DefaultAssay}}. If using this functionality, check to ensure the specified assay is correct before running the function. If the input is a \code{SingleCellExperiment} object, the raw counts will be extracted with \code{\link[BiocGenerics]{counts}}.
+#' \item If \code{expr.mat} is a \code{Seurat} object, counts will be extracted from the output of \code{\link[SeuratObject]{DefaultAssay}}. If using this functionality, check to ensure the specified assay is correct before running the function. If the input is a \code{SingleCellExperiment} or \code{CellDataSet} object, the raw counts will be extracted with \code{\link[BiocGenerics]{counts}}.
 #' \item If using the GEE or GLMM model architectures, ensure that the observations are sorted by subject ID (this is assumed by the underlying fit implementations). If they are not, the models will error out.
+#' \item If \code{gee.bias.correct} is set to TRUE, a degrees-of-freedom correct will be used to inflate the variance-covariance matrix prior to estimating the Wald test statistic. This is useful when the number of subjects is small and / or the number of per-subject observations is very large. Doing so will lead to shrunken test statistics and thus more conservative test results. 
 #' }
 #' @return A list of lists, where each element is a gene and each gene contains sublists for each element. Each gene-lineage sublist contains a gene name, lineage number, default \code{marge} vs. null model test results, model statistics, and fitted values. Use \code{\link{getResultsDE}} to tidy the results.
 #' @seealso \code{\link{getResultsDE}}
@@ -62,11 +64,12 @@ testDynamic <- function(expr.mat = NULL,
                         size.factor.offset = NULL,
                         is.gee = FALSE,
                         cor.structure = "ar1",
+                        gee.bias.correct = TRUE, 
                         is.glmm = FALSE,
                         glmm.adaptive = FALSE,
                         id.vec = NULL,
                         parallel.exec = TRUE,
-                        n.cores = 2,
+                        n.cores = 4L,
                         approx.knot = TRUE,
                         verbose = TRUE,
                         random.seed = 312) {
@@ -84,7 +87,9 @@ testDynamic <- function(expr.mat = NULL,
                                      slot = "counts",
                                      assay = Seurat::DefaultAssay(expr.mat))
     expr.mat <- expr.mat[genes, ]
-  } else if (inherits(expr.mat, "dgCMatrix")) {
+  } else if (inherits(expr.mat, "cell_data_set")) {
+    expr.mat <- BiocGenerics::counts(expr.mat)[genes, ]
+  } else if (inherits(expr.mat, "dgCMatrix") || inherits(expr.mat, "dgRMatrix")) {
     expr.mat <- expr.mat[genes, ]
   } else {
     expr.mat <- expr.mat[genes, ]
@@ -140,7 +145,7 @@ testDynamic <- function(expr.mat = NULL,
                                 is_read_only = TRUE)
 
   # build list of objects to prevent from being sent to parallel workers
-  necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm",
+  necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "gee.bias.correct", 
                       "n_lineages", "id.vec", "cor.structure", "is.gee", "glmm.adaptive", "size.factor.offset")
   if (any(ls(envir = .GlobalEnv) %in% necessary_vars)) {
     no_export <- c(ls(envir = .GlobalEnv)[-which(ls(envir = .GlobalEnv) %in% necessary_vars)],
@@ -333,7 +338,7 @@ testDynamic <- function(expr.mat = NULL,
      if (is.gee) {
        test_res <- waldTestGEE(mod.1 = marge_mod, 
                                mod.0 = null_mod, 
-                               bias.correct = ifelse(length(unique(id.vec)) < 30, TRUE, FALSE))
+                               bias.correct = gee.bias.correct)
      } else {
        test_res <- modelLRT(mod.1 = marge_mod,
                             mod.0 = null_mod,

diff --git a/man/getResultsDE.Rd b/man/getResultsDE.Rd
diff --git a/man/testDynamic.Rd b/man/testDynamic.Rd