From 17d0db935a242df65a7e6b6e1999acc759b76246 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Tue, 20 Jun 2023 10:10:43 +0200 Subject: [PATCH 01/35] Updated detectDoublets --- R/CRMetrics.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 778b464..bc1ac86 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -884,8 +884,15 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, n.cores = self$n.cores, verbose = self$verbose, args = list()) { + # Checks method %<>% tolower() %>% match.arg(c("scrublet","doubletdetection")) if (!is.list(args)) stop("'args' must be a list.") + if (inherits(cms, "list")) stop("'cms' must be a list") + if (!all(sapply(cms, inherits, "Matrix"))) { + warning("All samples in 'cms' must be a matrix, trying to convert to dgCMatrix...") + cms %<>% lapply(as, "CsparseMatrix") + if (!all(sapply(cms, inherits, "Matrix"))) stop("Could not convert automatically.") + } # Prepare arguments if (method == "doubletdetection") { @@ -1077,7 +1084,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param verbose logical Print messages or not (default = self$verbose). #' @param n.cores integer Number of cores for the calculations (default = self$n.cores). #' @param arg.buildGraph list A list with additional arguments for the `buildGraph` function in Conos (default = list()) - #' @param arg.findCommunities list A list with additional arguments for the `findCommunities` function in Conos (default = list(n.iterations = 1)) # Should be updated when Conos issue #123 is resolved + #' @param arg.findCommunities list A list with additional arguments for the `findCommunities` function in Conos (default = list()) #' @param arg.embedGraph list A list with additional arguments for the `embedGraph` function in Conos (default = list(method = "UMAP)) #' @return Conos object #' @examples @@ -1110,7 +1117,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, verbose = self$verbose, n.cores = self$n.cores, arg.buildGraph = list(), - arg.findCommunities = list(n.iterations = 1), + arg.findCommunities = list(), arg.embedGraph = list(method = "UMAP")) { checkPackageInstalled("conos", cran = TRUE) if (is.null(cms)) { From ffff0a144d883748509d9b3d4efd7d0286a5b82f Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Tue, 20 Jun 2023 10:11:18 +0200 Subject: [PATCH 02/35] Updated as(...) to circumvent deprecated statements --- R/inner_functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/inner_functions.R b/R/inner_functions.R index 0fd30b1..6ebc1e9 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -82,9 +82,9 @@ read10x <- function(data.path, mat.path <- tmp.dir %>% .[grepl("mtx", .)] if (grepl("gz", mat.path)) { - mat <- as(Matrix::readMM(gzcon(file(mat.path, "rb"))), "dgCMatrix") + mat <- as(Matrix::readMM(gzcon(file(mat.path, "rb"))), "CsparseMatrix") } else { - mat <- as(Matrix::readMM(mat.path), "dgCMatrix") + mat <- as(Matrix::readMM(mat.path), "CsparseMatrix") } # Add features From 41800711372f0d1c716a0d24a531ec3874c52494 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Tue, 20 Jun 2023 10:50:52 +0200 Subject: [PATCH 03/35] Omitted checkPackageInstalled now in sccore --- R/inner_functions.R | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/R/inner_functions.R b/R/inner_functions.R index 6ebc1e9..4bd3d1f 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -3,6 +3,7 @@ #' @importFrom Matrix sparseMatrix #' @importFrom methods as #' @importFrom utils globalVariables read.table +#' @importFrom sccore checkPackageInstalled NULL utils::globalVariables(c(".","value","variable","V1","V2","metric")) @@ -547,31 +548,3 @@ filterVector <- function(num.vec, checkDataPath <- function(data.path) { if (is.null(data.path)) stop("'data.path' cannot be NULL.") } - -#' @title Check whether a package is installed -#' @description This function is shamelessly stolen from github.com/kharchenkolab/Cacoa -#' @keywords internal -checkPackageInstalled <- function(pkgs, details='to run this function', install.help=NULL, bioc=FALSE, cran=FALSE) { - pkgs <- pkgs[!sapply(pkgs, requireNamespace, quietly=TRUE)] - if (length(pkgs) == 0) { - return(NULL) - } - - if (length(pkgs) > 1) { - pkgs <- paste0("c('", paste0(pkgs, collapse="', '"), "')") - error.text <- paste("Packages", pkgs, "must be installed", details) - } else { - pkgs <- paste0("'", pkgs, "'") - error.text <- paste(pkgs, "package must be installed", details) - } - - if (!is.null(install.help)) { - error.text <- paste0(error.text, ". Please, run `", install.help, "` to do it.") - } else if (bioc) { - error.text <- paste0(error.text, ". Please, run `BiocManager::install(", pkgs, ")", "` to do it.") - } else if (cran) { - error.text <- paste0(error.text, ". Please, run `install.packages(", pkgs, ")", "` to do it.") - } - - stop(error.text) -} From d8b0744bde3ecfb8f3c2a6fb7b93d6ebfa25358f Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Tue, 20 Jun 2023 10:59:05 +0200 Subject: [PATCH 04/35] Updated getConosDepth to getDepth using UMI counts for depth instead of Conos adjusted depth --- R/CRMetrics.R | 44 ++++++++++++++++-------------------------- tests/testthat/tests.R | 4 ++-- 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index bc1ac86..7cd42e1 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -1,6 +1,6 @@ #' @import dplyr magrittr ggplot2 ggrepel #' @importFrom R6 R6Class -#' @importFrom sccore plapply +#' @importFrom sccore plapply checkPackageInstalled #' @importFrom Matrix t #' @importFrom ggpubr stat_compare_means #' @importFrom cowplot plot_grid @@ -10,6 +10,7 @@ #' @importFrom tibble add_column #' @importFrom ggpmisc stat_poly_eq #' @importFrom scales comma +#' @importFrom sparseMatrixSats colSums2 rowSums2 #' @importFrom utils globalVariables NULL @@ -603,7 +604,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Depth if (depth) { - depths <- self$getConosDepth() %>% + depths <- self$getDepth() %>% filterVector("depth.cutoff", depth.cutoff, self$con$samples %>% names(), sep) if (length(depth.cutoff) > 1) { main <- "Cells with low depth with sample-specific cutoff" @@ -689,7 +690,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(tmp)," samples. Please adjust.")) - depths <- self$getConosDepth() + depths <- self$getDepth() # Preparations tmp <- depths %>% @@ -1137,14 +1138,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, do.call(con$embedGraph, arg.embedGraph) self$con <- con - if (!is.null(self$depth)) { - warning("Overwriting previous depth vector") - self$depth <- self$getConosDepth(force = TRUE) - } - if (!is.null(self$mito.fraction)) { - warning("Overwriting previous mito.fraction vector") - self$mito.frac <- self$getMitoFraction(force = TRUE) - } + invisible(con) }, @@ -1223,7 +1217,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Depth if (!is.null(depth.cutoff)) { - depth.filter <- self$getConosDepth() %>% + depth.filter <- self$getDepth() %>% filterVector("depth.cutoff", depth.cutoff, samples, sep) } else { depth.filter <- NULL @@ -1372,7 +1366,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Prepare data if (depth) { checkPackageInstalled("conos", cran = TRUE) - depths <- self$getConosDepth() %>% + depths <- self$getDepth() %>% filterVector("depth.cutoff", depth.cutoff, depth.cutoff %>% names(), sep) %>% {ifelse(!., "depth", "")} } else { @@ -1503,8 +1497,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' crm$doPreprocessing() #' crm$createEmbedding() #' - #' # Get Conos depth - #' crm$getConosDepth() + #' # Get depth + #' crm$getDepth() #' } else { #' message("Package 'conos' not available.") #' } @@ -1512,21 +1506,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' message("Package 'pagoda2' not available.") #' } #' } - getConosDepth = function() { - tmp <- self$con$samples %>% - lapply(`[[`, "depth") %>% - Reduce(c, .) + getDepth = function() { + checkPackageInstalled("conos", cran = TRUE) - return(tmp) + self$con$samples %>% + lapply(`[[`, "misc") %>% + lapply(`[[`, "rawCounts") %>% + lapply(\(x) `names<-`(sparseMatrixStats::rowSums2(x), rownames(x))) %>% + Reduce(c, .) %>% + .[lapply(self$con$samples, conos::getCellNames) %>% unlist()] }, - # getUmiDepth = function() { - # checkPackageInstalled("sparseMatrixStats", bioc = TRUE) - # tmp <- self$cms %>% - # lapply(\(cm) setNames(cm %>% sparseMatrixStats::colSums2(), cm %>% colnames())) %>% - # Reduce(c, .) - # }, - #' @description Calculate the fraction of mitochondrial genes. #' @param species character Species to calculate the mitochondrial fraction for (default = "human"). #' @param force logical Force update of stored vector (default = FALSE) diff --git a/tests/testthat/tests.R b/tests/testthat/tests.R index 741bacc..5a13bde 100644 --- a/tests/testthat/tests.R +++ b/tests/testthat/tests.R @@ -45,6 +45,6 @@ test_that("Check embedding object", { test_that("Check depth vector", { skip_if_not_installed("pagoda2") skip_if_not_installed("conos") - crm$getConosDepth() - expect_equal(length(crm$getConosDepth()), 1.2e4) + crm$getDepth() + expect_equal(length(crm$getDepth()), 1.2e4) }) From c1c0db9149a5bc32f05490a0f24df1aad2b88192 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Tue, 20 Jun 2023 14:43:04 +0200 Subject: [PATCH 05/35] Updated addSummaryMetrics and class initialization to allow for multiple data.paths --- R/CRMetrics.R | 15 +++++++++------ R/inner_functions.R | 23 ++++++++++++++++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 7cd42e1..b374a3c 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -91,11 +91,14 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Check that last character is slash if (!is.null(data.path)) { - length.path <- nchar(data.path) - last.char <- data.path %>% - substr(length.path, length.path) - - if (last.char != "/") data.path <- paste0(data.path,"/") else data.path <- data.path + data.path %<>% + sapply(\(path) { + length.path <- nchar(path) + last.char <- path %>% + substr(length.path, length.path) + + if (last.char != "/") paste0(path,"/") else path + }) } # Write stuff to object @@ -112,7 +115,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, full.names = FALSE)) } } else { - if (is(metadata, "data.frame")) { + if (inherits(metadata, "data.frame")) { self$metadata <- metadata %>% arrange(sample) } else { diff --git a/R/inner_functions.R b/R/inner_functions.R index 4bd3d1f..6983aa0 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -247,15 +247,27 @@ addSummaryMetrics <- function(data.path, n.cores = 1, verbose = TRUE) { samples.tmp <- list.dirs(data.path, recursive = FALSE, full.names = FALSE) - samples <- intersect(samples.tmp, metadata$sample %>% unique()) + samples <- intersect(samples.tmp, metadata$sample) + doubles <- table(samples.tmp) %>% + .[. > 1] %>% + names() + + if (length(doubles) > 0) stop(paste0("One or more samples are present twice in 'data.path'. Sample names must be unique. Affected sample(s): ",paste(doubles, collapse = " "))) if (length(samples) != length(samples.tmp)) message("'metadata' doesn't contain the following sample(s) derived from 'data.path' (dropped): ",setdiff(samples.tmp, samples) %>% paste(collapse = " ")) if (verbose) message(paste0(Sys.time()," Adding ",length(samples)," samples")) # extract and combine metrics summary for all samples - metrics <- samples %>% + metrics <- data.path %>% + lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% + .[. %in% samples] %>% + data.frame(sample = ., path = path)) %>% + bind_rows() %>% + t() %>% + data.frame() %>% + as.list() %>% plapply(\(s) { - tmp <- read.table(dir(paste(data.path,s,"outs", sep = "/"), glob2rx("*ummary.csv"), full.names = TRUE), header = TRUE, sep = ",", colClasses = numeric()) %>% + tmp <- read.table(dir(paste(s[2],s[1],"outs", sep = "/"), glob2rx("*ummary.csv"), full.names = TRUE), header = TRUE, sep = ",", colClasses = numeric()) %>% mutate(., across(.cols = grep("%", .), ~ as.numeric(gsub("%", "", .x)) / 100), across(.cols = grep(",", .), @@ -265,13 +277,14 @@ addSummaryMetrics <- function(data.path, if ("Sample.ID" %in% colnames(tmp)) tmp %<>% select(-c("Sample.ID","Genome","Pipeline.version")) tmp %>% - mutate(sample = s) %>% + mutate(sample = s[1]) %>% pivot_longer(cols = -c(sample), names_to = "metric", values_to = "value") %>% mutate(metric = metric %>% gsub(".", " ", ., fixed = TRUE) %>% tolower()) }, n.cores = n.cores) %>% - bind_rows() + bind_rows() %>% + arrange(sample) if (verbose) message(paste0(Sys.time()," Done!")) return(metrics) } From 15ce35b115825c84ba9271ea30618fe6fb76a9f0 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 22 Jun 2023 12:27:11 +0200 Subject: [PATCH 06/35] Added plotting palette --- R/CRMetrics.R | 89 +++++++++++++++++++++++++++++---------------- R/inner_functions.R | 23 ++++++++---- 2 files changed, 73 insertions(+), 39 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index b374a3c..83ce613 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -76,7 +76,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, theme = theme_bw(), n.cores = 1, sep.meta = ",", - raw.meta = FALSE) { + raw.meta = FALSE, + pal = NULL) { if ('CRMetrics' %in% class(data.path)) { # copy constructor for (n in ls(data.path)) { @@ -106,6 +107,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, self$data.path <- data.path self$verbose <- verbose self$theme <- theme + self$pal <- pal # Metadata if (is.null(metadata)) { @@ -245,6 +247,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param exact logical Whether to calculate exact p values (default = FALSE). #' @param metadata data.frame Metadata for samples (default = self$metadata). #' @param second.comp.group character Second comparison metric, must match a column name of metadata (default = NULL). + #' @param pal character Plotting palette (default = NULL) #' @return ggplot2 object #' @examples #' sample.names <- c("sample1", "sample2") @@ -273,8 +276,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, h.adj = 0.05, exact = FALSE, metadata = self$metadata, - second.comp.group = NULL) { - comp.group %<>% checkCompGroup(comp.group, self$verbose) + second.comp.group = NULL, + pal = self$pal) { + comp.group %<>% checkCompGroup("sample", self$verbose) if (!is.null(second.comp.group)) { second.comp.group %<>% checkCompGroup(second.comp.group, self$verbose) } else { @@ -296,6 +300,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, g %<>% addPlotStatsSamples(comp.group, metadata, h.adj, exact, second.comp.group) } + if (!is.null(pal)) + g <- g + scale_fill_manual(values = pal) return(g) }, @@ -313,6 +319,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param se logical For regression lines, show SE (default = FALSE) #' @param group.reg.lines logical For regression lines, if FALSE show one line, if TRUE show line per group defined by second.comp.group (default = FALSE) #' @param secondary.testing logical Whether to show post hoc testing (default = TRUE) + #' @param pal character Plotting palette (default = NULL) #' @return ggplot2 object #' @examples #' \donttest{ @@ -345,7 +352,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, plot.geom = "bar", se = FALSE, group.reg.lines = FALSE, - secondary.testing = TRUE) { + secondary.testing = TRUE, + pal = self$pal) { # Checks comp.group %<>% checkCompGroup("sample", self$verbose) if (is.null(plot.geom)) { @@ -359,14 +367,14 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, unique() } else { # check if selected metrics are available - difs <- setdiff(metrics, self$summary.metrics$metric %>% unique()) + difs <- setdiff(metrics, summary.metrics$metric %>% unique()) if ("samples per group" %in% difs) difs <- difs[difs != "samples per group"] if (length(difs) > 0) stop(paste0("The following 'metrics' are not valid: ",paste(difs, collapse=" "))) } # if samples per group is one of the metrics to plot use the plotSamples function to plot if ("samples per group" %in% metrics){ - sample.plot <- self$plotSamples(comp.group, h.adj, exact, metadata, second.comp.group) + sample.plot <- self$plotSamples(comp.group, h.adj, exact, metadata, second.comp.group, pal) metrics <- metrics[metrics != "samples per group"] } @@ -377,18 +385,22 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, filter(metric == met) %>% merge(metadata, by = "sample") + # Create ggplot object + g <- tmp %>% + ggplot(aes(!!sym(comp.group), value)) + + self$theme + + # Add geom + palette if (is.null(second.comp.group)) { - g <- tmp %>% - ggplot(aes(x = !!sym(comp.group), y = value)) + - plotGeom(plot.geom, col = comp.group) + - labs(y = met, x = element_blank()) + - self$theme + g %<>% + plotGeom(plot.geom, col = comp.group, pal) + g <- g + + labs(y = met, x = element_blank()) } else { - g <- tmp %>% - ggplot(aes(!!sym(comp.group), value)) + - plotGeom(plot.geom, col = second.comp.group) + - labs(y = met, x = comp.group) + - self$theme + g %<>% + plotGeom(plot.geom, col = second.comp.group, pal) + g <- g + + labs(y = met, x = comp.group) } if (is.numeric(metadata[[comp.group]])) { @@ -459,8 +471,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param metadata data.frame Metadata for samples (default = self$metadata). #' @param metrics character Metrics to plot. NULL plots both plots (default = NULL). #' @param plot.geom character How to plot the data (default = "violin"). - #' @param data.path character Path to cellranger count data (default = self$data.path). + #' @param data.path character Path to Cell Ranger count data (default = self$data.path). #' @param hline logical Whether to show median as horizontal line (default = TRUE) + #' @param pal character Plotting palette (default = NULL) #' @return ggplot2 object #' @examples #' \donttest{ @@ -486,9 +499,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, detailed.metrics = self$detailed.metrics, metadata = self$metadata, metrics = NULL, - plot.geom = "violin", - data.path = self$data.path, - hline = TRUE){ + plot.geom = "violin", + hline = TRUE, + pal = self$pal){ # Checks if (is.null(detailed.metrics)) stop("'detailed.metrics' not calculated. Please run 'addDetailedMetrics()'.") comp.group %<>% checkCompGroup("sample", self$verbose) @@ -513,8 +526,10 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, filter(metric == met) %>% merge(metadata, by = "sample") - g <- ggplot(tmp, aes(x = sample, y = value)) + - plotGeom(plot.geom, col = comp.group) + + g <- ggplot(tmp, aes(x = sample, y = value)) + g %<>% + plotGeom(plot.geom, comp.group, pal) + g <- g + {if (plot.geom == "violin") scale_y_log10()} + {if (hline) geom_hline(yintercept = median(tmp$value))} + labs(y = met, x = element_blank()) + @@ -554,6 +569,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param species character Species to calculate the mitochondrial fraction for (default = c("human","mouse")). #' @param size numeric Dot size (default = 0.3) #' @param sep character Separator for creating unique cell names (default = "!!") + #' @param pal character Plotting palette (default = NULL) #' @param ... Plotting parameters passed to `sccore::embeddingPlot`. #' @return ggplot2 object #' @examples @@ -593,6 +609,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, species = c("human","mouse"), size = 0.3, sep = "!!", + pal = self$pal, ...) { checkPackageInstalled("conos", cran = TRUE) if (sum(depth, mito.frac, !is.null(doublet.method)) > 1) stop("Only one filter allowed. For multiple filters, use plotFilteredCells(type = 'embedding').") @@ -629,7 +646,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, label <- "labels" } doublets %<>% setNames(rownames(dres)) - g <- self$con$plotGraph(colors = doublets, title = paste(doublet.method,label, collapse = " "), size = size, ...) + g <- self$con$plotGraph(colors = doublets, title = paste(doublet.method,label, collapse = " "), size = size, palette = pal, ...) } # Mitochondrial fraction @@ -644,7 +661,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, g <- self$con$plotGraph(colors = mf * 1, title = main, size = size, ...) } - if (!exists("g")) g <- self$con$plotGraph(...) + if (!exists("g")) g <- self$con$plotGraph(palette = pal, ...) return(g) }, @@ -652,6 +669,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param cutoff numeric The depth cutoff to color the cells in the embedding (default = 1e3). #' @param samples character Sample names to include for plotting (default = $metadata$sample). #' @param sep character Separator for creating unique cell names (default = "!!") + #' @param keep.col character Color for density of cells that are kept (default = "E7CDC2") + #' @param filter.col Character Color for density of cells to be filtered (default = "A65141") #' @return ggplot2 object #' @examples #' \donttest{ @@ -1850,6 +1869,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Plot the results from the CellBender estimations #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) + #' @param pal character Plotting palette (default = NULL) #' @return A ggplot2 object #' @examples #' \dontrun{ @@ -1860,7 +1880,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' crm$plotCbTraining() #' } plotCbTraining = function(data.path = self$data.path, - samples = self$metadata$sample) { + samples = self$metadata$sample, + pal = self$pal) { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) paths <- getH5Paths(data.path, samples, "cellbender") @@ -2011,13 +2032,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, arrange(desc(Freq)) %>% mutate(Freq = Freq / length(samples), Var1 = factor(Var1, levels = Var1)) - -ggplot(amb, aes(Var1, Freq, fill = Var1)) + - geom_bar(stat = "identity") + - self$theme + - labs(x = "", y = "Proportion") + - theme(axis.text.x = element_text(angle = 90)) + - guides(fill = "none") + + g <- ggplot(amb, aes(Var1, Freq, fill = Var1)) + + geom_bar(stat = "identity") + + self$theme + + labs(x = "", y = "Proportion") + + theme(axis.text.x = element_text(angle = 90)) + + guides(fill = "none") + + if (!is.null(pal)) g <- g + scale_fill_manual(values = pal) + + return(g) }, #' @description Add summary metrics from a list of count matrices diff --git a/R/inner_functions.R b/R/inner_functions.R index 6983aa0..0ad84b8 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -291,21 +291,30 @@ addSummaryMetrics <- function(data.path, #' @title Plot the data as points, as bars as a histogram, or as a violin #' @description Plot the data as points, barplot, histogram or violin +#' @param g ggplot2 object #' @param plot.geom The plot.geom to use, "point", "bar", "histogram", or "violin". +#' @param pal character Palette (default = NULL) #' @keywords internal #' @return geom -plotGeom <- function(plot.geom, - col){ +plotGeom <- function(g, plot.geom, col, pal = NULL) { if (plot.geom == "point"){ - geom <- geom_quasirandom(size = 1, groupOnX = TRUE, aes(col = !!sym(col))) + g <- g + + geom_quasirandom(size = 1, groupOnX = TRUE, aes(col = !!sym(col))) + + if (is.null(pal)) scale_color_hue() else scale_color_manual(values = pal) } else if (plot.geom == "bar"){ - geom <- geom_bar(stat = "identity", position = "dodge", aes(fill = !!sym(col))) + g <- g + + geom_bar(stat = "identity", position = "dodge", aes(fill = !!sym(col))) + + if (is.null(pal)) scale_fill_hue() else scale_fill_manual(values = pal) } else if (plot.geom == "histogram"){ - geom <- geom_histogram(binwidth = 25, aes(fill = !!sym(col))) + g <- g + + geom_histogram(binwidth = 25, aes(fill = !!sym(col))) + + if (is.null(pal)) scale_fill_hue() else scale_fill_manual(values = pal) } else if (plot.geom == "violin"){ - geom <- geom_violin(show.legend = TRUE, aes(fill = !!sym(col))) + g <- g + + geom_violin(show.legend = TRUE, aes(fill = !!sym(col))) + + if (is.null(pal)) scale_fill_hue() else scale_fill_manual(values = pal) } - return(geom) + return(g) } #' @title Calculate percentage of filtered cells From 87dc6f7d6d09bd444d88762520ec5ee2b3d538fb Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 22 Jun 2023 12:28:26 +0200 Subject: [PATCH 07/35] Allowed for multiple data.paths --- R/CRMetrics.R | 9 ++++++--- R/inner_functions.R | 31 +++++++++++++++++++------------ 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 83ce613..cfe5df5 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -1720,7 +1720,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Preparations checkDataPath(data.path) inputs <- getH5Paths(data.path, samples, "raw") - outputs <- sapply(samples, \(sample) paste0(data.path,sample,"/outs/cellbender.h5")) %>% + outputs <- data.path %>% + pathsToList(samples) %>% + sapply(\(sample) paste0(sample[2],sample[1],"/outs/cellbender.h5")) %>% setNames(samples) if (is.null(expected.cells)) expected.cells <- self$getExpectedCells(samples) @@ -2125,9 +2127,10 @@ runSoupX = function(data.path = self$data.path, # Create SoupX objects if (verbose) message(paste0(Sys.time()," Loading data")) - soupx.list <- samples %>% + soupx.list <- data.path %>% + pathsToList(samples) %>% plapply(\(sample) { - arg <- list(dataDir = paste(data.path,sample,"outs", sep = "/")) %>% + arg <- list(dataDir = paste(sample[2],sample[1],"outs", sep = "/")) %>% append(arg.load10X) out <- do.call(SoupX::load10X, arg) return(out) diff --git a/R/inner_functions.R b/R/inner_functions.R index 0ad84b8..295c0ff 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -67,10 +67,11 @@ read10x <- function(data.path, checkPackageInstalled("data.table", cran = TRUE) if (is.null(sample.names)) sample.names <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) - full.path <- sample.names %>% + full.path <- data.path %>% + pathsToList(sample.names) %>% sapply(\(sample) { if (raw) pat <- glob2rx("raw_*_bc_matri*") else pat <- glob2rx("filtered_*_bc_matri*") - dir(paste(data.path,sample,"outs", sep = "/"), pattern = pat, full.names = TRUE) %>% + dir(paste(sample[2],sample[1],"outs", sep = "/"), pattern = pat, full.names = TRUE) %>% .[!grepl(".h5", .)] }) @@ -259,13 +260,7 @@ addSummaryMetrics <- function(data.path, if (verbose) message(paste0(Sys.time()," Adding ",length(samples)," samples")) # extract and combine metrics summary for all samples metrics <- data.path %>% - lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% - .[. %in% samples] %>% - data.frame(sample = ., path = path)) %>% - bind_rows() %>% - t() %>% - data.frame() %>% - as.list() %>% + pathsToList(metadata$sample) %>% plapply(\(s) { tmp <- read.table(dir(paste(s[2],s[1],"outs", sep = "/"), glob2rx("*ummary.csv"), full.names = TRUE), header = TRUE, sep = ",", colClasses = numeric()) %>% mutate(., across(.cols = grep("%", .), @@ -483,12 +478,13 @@ getH5Paths <- function(data.path, match.arg(c("raw","filtered","cellbender","cellbender_filtered")) # Get H5 paths - paths <- samples %>% + paths <- data.path %>% + pathsToList(samples) %>% sapply(\(sample) { if (grepl("cellbender", type)) { - paste0(data.path,"/",sample,"/outs/",type,".h5") + paste0(sample[2],"/",sample[1],"/outs/",type,".h5") } else { - dir(paste0(data.path,sample,"/outs"), glob2rx(paste0(type,"*.h5")), full.names = TRUE) + dir(paste0(sample[2],sample[1],"/outs"), glob2rx(paste0(type,"*.h5")), full.names = TRUE) } }) %>% setNames(samples) @@ -570,3 +566,14 @@ filterVector <- function(num.vec, checkDataPath <- function(data.path) { if (is.null(data.path)) stop("'data.path' cannot be NULL.") } + +pathsToList <- function(data.path, samples) { + tmp <- data.path %>% + lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% + {if (!is.null(samples)) .[. %in% samples] else . } %>% + data.frame(sample = ., path = path)) %>% + bind_rows() %>% + t() %>% + data.frame() %>% + as.list() +} \ No newline at end of file From 0393f867d0ea843daf604b71e60a9083db3a3820 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 22 Jun 2023 12:29:37 +0200 Subject: [PATCH 08/35] Included color arguments to function calls --- R/CRMetrics.R | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index cfe5df5..bce3cf7 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -703,7 +703,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } plotDepth = function(cutoff = 1e3, samples = self$metadata$sample, - sep = "!!"){ + sep = "!!", + keep.col = "E7CDC2", + filter.col = "A65141"){ # Checks checkPackageInstalled("conos", cran = TRUE) if (is.null(self$con)) { @@ -755,11 +757,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (all(tmp.plot$x < plot.cutoff)) { g <- g + - geom_area(fill = "#A65141") + geom_area(fill = filter.col) } else { g <- g + - geom_area(fill = "#A65141") + - geom_area(data = tmp.plot %>% filter(x > plot.cutoff), aes(x), fill = "#E7CDC2") + geom_area(fill = filter.col) + + geom_area(data = tmp.plot %>% filter(x > plot.cutoff), aes(x), fill = keep.col) } return(g) @@ -774,6 +776,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param species character Species to calculate the mitochondrial fraction for (default = "human") #' @param samples character Sample names to include for plotting (default = $metadata$sample) #' @param sep character Separator for creating unique cell names (default = "!!") + #' @param keep.col character Color for density of cells that are kept (default = "E7CDC2") + #' @param filter.col Character Color for density of cells to be filtered (default = "A65141") #' @return ggplot2 object #' @examples #' \donttest{ @@ -807,7 +811,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, plotMitoFraction = function(cutoff = 0.05, species = c("human","mouse"), samples = self$metadata$sample, - sep = "!!"){ + sep = "!!", + keep.col = "E7CDC2", + filter.col = "A65141"){ # Checks checkPackageInstalled("conos", cran = TRUE) if (is.null(self$con)) { @@ -858,11 +864,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (all(tmp.plot$x < plot.cutoff)) { g <- g + - geom_area(fill = "#A65141") + geom_area(fill = filter.col) } else { g <- g + - geom_area(fill = "#A65141") + - geom_area(data = tmp.plot %>% filter(x < plot.cutoff), aes(x), fill = "#E7CDC2") + geom_area(fill = filter.col) + + geom_area(data = tmp.plot %>% filter(x < plot.cutoff), aes(x), fill = keep.col) } return(g) @@ -1337,6 +1343,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param species character Species to calculate the mitochondrial fraction for (default = c("human","mouse")). #' @param size numeric Dot size (default = 0.3) #' @param sep character Separator for creating unique cell names (default = "!!") + #' @param cols character Colors used for plotting (default = c("grey80","red","blue","green","yellow","black","pink","purple")) #' @param ... Plotting parameters passed to `sccore::embeddingPlot`. #' @return ggplot2 object or data frame #' @examples @@ -1378,6 +1385,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, species = c("human","mouse"), size = 0.3, sep = "!!", + cols = c("grey80","red","blue","green","yellow","black","pink","purple"), ...) { type %<>% tolower() %>% @@ -1457,7 +1465,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Embedding plot if (type == "embedding"){ g <- self$con$plotGraph(groups = tmp$filter %>% setNames(rownames(tmp)), mark.groups = FALSE, show.legend = TRUE, shuffle.colors = TRUE, title = "Cells to filter", size = size, ...) + - scale_color_manual(values = c("grey80","red","blue","green","yellow","black","pink","purple")[colstart:(tmp$filter %>% levels() %>% length())]) + scale_color_manual(values = cols[colstart:(tmp$filter %>% levels() %>% length())]) } # Bar plot if (type == "bar") { @@ -1916,6 +1924,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, self$theme + labs(col = "") + facet_wrap(~sample, scales = "free_y") + + if (!is.null(pal)) g <- g + scale_color_manual(values = pal) return(g) }, @@ -1923,6 +1933,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Plot the CellBender assigned cell probabilities #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) + #' @param low.col character Color for low probabilities (default = "gray") + #' @param high.col character Color for high probabilities (default = "red") #' @return A ggplot2 object #' @examples #' \dontrun{ @@ -1933,7 +1945,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' crm$plotCbCellProbs() #' } plotCbCellProbs = function(data.path = self$data.path, - samples = self$metadata$sample) { + samples = self$metadata$sample, + low.col = "gray", + high.col = "red") { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) paths <- getH5Paths(data.path, samples, "cellbender") @@ -1950,7 +1964,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, ggplot(cell.prob, aes(cell, prob, col = prob)) + geom_point() + - scale_color_gradient(low="gray", high="red") + + scale_color_gradient(low=low.col, high=high.col) + self$theme + labs(x = "Cells", y = "Cell probability", col = "") + facet_wrap(~sample, scales = "free_x") From ead47a3628b2c6fc5231796a6356cb6df8fbd13d Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 22 Jun 2023 12:30:04 +0200 Subject: [PATCH 09/35] Minor fixes --- R/CRMetrics.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index bce3cf7..bdd1ab8 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -1179,6 +1179,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param samples.to.exclude character Sample names to exclude (default = NULL) #' @param verbose logical Show progress (default = self$verbose) #' @param sep character Separator for creating unique cell names (default = "!!") + #' @param raw boolean Filter on raw, unfiltered count matrices. Usually not intended (default = FALSE) #' @return list of filtered count matrices #' @examples #' \donttest{ @@ -1217,16 +1218,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, species = c("human","mouse"), samples.to.exclude = NULL, verbose = self$verbose, - sep = "!!") { + sep = "!!", + raw = FALSE) { # Preparations species %<>% tolower() %>% match.arg(c("human","mouse")) # Extract CMs - cms <- self$cms + if (!raw) cms <- self$cms else cms <- self$cms.raw - if (is.null(cms)) stop("$cms is NULL. filterCms depends on this object. Aborting") + if (is.null(cms)) stop(if (raw) "$cms.raw" else "$cms"," is NULL. filterCms depends on this object. Aborting") # Exclude samples if (!is.null(samples.to.exclude)) { From 5cbebacfa769832be7d18fc6cff5c773e9f53a84 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 22 Jun 2023 12:30:24 +0200 Subject: [PATCH 10/35] Updated documentation, DESCRIPTION, NAMESPACE, NEWS --- DESCRIPTION | 6 +- NAMESPACE | 3 + NEWS.md | 9 +++ R/CRMetrics.R | 45 +++++++----- man/CRMetrics.Rd | 129 ++++++++++++++++++++++++----------- man/checkPackageInstalled.Rd | 18 ----- man/plotGeom.Rd | 6 +- 7 files changed, 140 insertions(+), 76 deletions(-) delete mode 100644 man/checkPackageInstalled.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 532a904..bf5a0a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: CRMetrics Title: Cell Ranger Output Filtering and Metrics Visualization -Version: 0.2.3 -Authors@R: c(person("Rasmus", "Rydbirk", email = "rasmus.rydbirk@bric.ku.dk", role = c("aut", "cre")), person("Fabienne", "Kick", email="fabienne.kick@bric.ku.dk", role="aut"), person("Henrietta","Holze", email="@bric.ku.dk", role="aut"), person("Xian", "Xin", email="xian.xin@bric.ku.dk", role="ctb")) +Version: 0.3.0 +Authors@R: c(person("Rasmus", "Rydbirk", email = "rrydbirk@bmb.sdu.dk", role = c("aut", "cre")), person("Fabienne", "Kick", email="fabienne.kick@bric.ku.dk", role="aut"), person("Henrietta","Holze", email="@bric.ku.dk", role="aut"), person("Xian", "Xin", email="xian.xin@bric.ku.dk", role="ctb")) Description: Sample and cell filtering as well as visualisation of output metrics from 'Cell Ranger' by Grace X.Y. Zheng et al. (2017) . 'CRMetrics' allows for easy plotting of output metrics across multiple samples as well as comparative plots including statistical assessments of these. 'CRMetrics' allows for easy removal of ambient RNA using 'SoupX' by Matthew D Young and Sam Behjati (2020) or 'CellBender' by Stephen J Fleming et al. (2022) . Furthermore, it is possible to preprocess data using 'Pagoda2' by Nikolas Barkas et al. (2021) or 'Seurat' by Yuhan Hao et al. (2021) followed by embedding of cells using 'Conos' by Nikolas Barkas et al. (2019) . Finally, doublets can be detected using 'scrublet' by Samuel L. Wolock et al. (2019) or 'DoubletDetection' by Gayoso et al. (2020) . In the end, cells are filtered based on user input for use in downstream applications. License: GPL-3 Encoding: UTF-8 @@ -38,7 +38,7 @@ Suggests: SoupX, testthat (>= 3.0.0) Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.2 +RoxygenNote: 7.2.3 URL: https://github.com/khodosevichlab/CRMetrics BugReports: https://github.com/khodosevichlab/CRMetrics/issues Maintainer: Rasmus Rydbirk diff --git a/NAMESPACE b/NAMESPACE index bbcd781..d2e5822 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,7 +16,10 @@ importFrom(ggpmisc,stat_poly_eq) importFrom(ggpubr,stat_compare_means) importFrom(methods,as) importFrom(scales,comma) +importFrom(sccore,checkPackageInstalled) importFrom(sccore,plapply) +importFrom(sparseMatrixStats,colSums2) +importFrom(sparseMatrixStats,rowSums2) importFrom(stats,chisq.test) importFrom(stats,fisher.test) importFrom(stats,relevel) diff --git a/NEWS.md b/NEWS.md index 04ea146..f4d70d4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,12 @@ +# CRMetrics 0.3.0 + +* Updated createEmbedding upon resolve of https://github.com/kharchenkolab/conos/issues/123 +* Updated DESCRIPTION +* Added checks for detectDoublets +* Added possibility for multiple paths for "data.path" argument in initialize +* Changed depth calculation from Conos depth to raw depth (= colSums) +* Added plotting palette to object + # CRMetrics 0.2.3 * Updated tests and examples to pass CRAN checks diff --git a/R/CRMetrics.R b/R/CRMetrics.R index bdd1ab8..2ed2dd1 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -10,7 +10,7 @@ #' @importFrom tibble add_column #' @importFrom ggpmisc stat_poly_eq #' @importFrom scales comma -#' @importFrom sparseMatrixSats colSums2 rowSums2 +#' @importFrom sparseMatrixStats colSums2 rowSums2 #' @importFrom utils globalVariables NULL @@ -18,48 +18,61 @@ utils::globalVariables(c("Valid Barcodes","Fraction Reads in Cells")) #' CRMetrics class object #' -#' @description Functions to analyze Cell Ranger count data +#' @description Functions to analyze Cell Ranger count data. To initialize a new object, 'data.path' or 'cms' is needed. 'metadata' is also recommended, but not required. #' @export CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, public = list( - #' @field metadata (default = NULL) + #' @field metadata data.frame or character Path to metadata file or name of metadata data.frame object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL) metadata = NULL, - #' @field data.path (default = NULL) + #' @field data.path character Path(s) to Cell Ranger count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL) data.path = NULL, - #' @field summary.metrics (default = NULL) + #' @field cms list List with count matrices (default = NULL) + cms = NULL, + + #' @field cms.preprocessed list List with preprocessed count matrices after $doPreprocessing() (default = NULL) + cms.preprocessed = NULL, + + #' @field cms.raw list List with raw, unfiltered count matrices, i.e., including all CBs detected also empty droplets (default = NULL) + cms.raw = NULL, + + #' @field summary.metrics data.frame Summary metrics from Cell Ranger (default = NULL) summary.metrics = NULL, - #' @field detailed.metrics (default = NULL) + #' @field detailed.metrics data.frame Detailed metrics, i.e., no. genes and UMIs per cell (default = NULL) detailed.metrics = NULL, - #' @field comp.group (default = NULL) + #' @field comp.group character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL) comp.group = NULL, - #' @field verbose (default = TRUE) + #' @field verbose logical Print messages or not (default = TRUE) verbose = TRUE, - #' @field theme (default = NULL) - theme = NULL, + #' @field theme ggplot2 theme (default: theme_bw()) + theme = ggplot2::theme_bw(), - #' @field n.cores (default = 1) + #' @field pal Plotting palette (default = NULL) + pal = NULL, + + #' @field n.cores numeric Number of cores for calculations (default = 1) n.cores = 1, #' Initialize a CRMetrics object #' @description To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also recommended, but not required. #' @param data.path character Path to directory with Cell Ranger count data, one directory per sample (default = NULL). - #' @param metadata data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL). + #' @param metadata data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL) #' @param cms list List with count matrices (default = NULL) #' @param sample.names character Sample names. Only relevant is cms is provided (default = NULL) #' @param unique.names logical Create unique cell names. Only relevant if cms is provided (default = TRUE) #' @param sep.cells character Sample-cell separator. Only relevant if cms is provided and `unique.names=TRUE` (default = "!!") - #' @param comp.group character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL). - #' @param verbose logical Print messages or not (default = TRUE). - #' @param theme ggplot2 theme (default: theme_bw()). - #' @param n.cores integer Number of cores for the calculations (default = self$n.cores). + #' @param comp.group character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL) + #' @param verbose logical Print messages or not (default = TRUE) + #' @param theme ggplot2 theme (default: theme_bw()) + #' @param n.cores integer Number of cores for the calculations (default = self$n.cores) #' @param sep.meta character Separator for metadata file (default = ",") #' @param raw.meta logical Keep metadata in its raw format. If FALSE, classes will be converted using "type.convert" (default = FALSE) + #' @param pal character Plotting palette (default = NULL) #' @return CRMetrics object #' @examples #' \dontrun{ diff --git a/man/CRMetrics.Rd b/man/CRMetrics.Rd index cd3d015..1b28776 100644 --- a/man/CRMetrics.Rd +++ b/man/CRMetrics.Rd @@ -4,7 +4,7 @@ \alias{CRMetrics} \title{CRMetrics class object} \description{ -Functions to analyze Cell Ranger count data +Functions to analyze Cell Ranger count data. To initialize a new object, 'data.path' or 'cms' is needed. 'metadata' is also recommended, but not required. } \examples{ @@ -396,7 +396,7 @@ message("Package 'pagoda2' not available.") } ## ------------------------------------------------ -## Method `CRMetrics$getConosDepth` +## Method `CRMetrics$getDepth` ## ------------------------------------------------ \donttest{ @@ -418,8 +418,8 @@ crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), crm$doPreprocessing() crm$createEmbedding() -# Get Conos depth -crm$getConosDepth() +# Get depth +crm$getDepth() } else { message("Package 'conos' not available.") } @@ -644,21 +644,29 @@ crm$plotCbCells() \section{Public fields}{ \if{html}{\out{
}} \describe{ -\item{\code{metadata}}{(default = NULL)} +\item{\code{metadata}}{data.frame or character Path to metadata file or name of metadata data.frame object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL)} -\item{\code{data.path}}{(default = NULL)} +\item{\code{data.path}}{character Path(s) to Cell Ranger count data, one directory per sample. If multiple paths, do c("path1","path2") (default = NULL)} -\item{\code{summary.metrics}}{(default = NULL)} +\item{\code{cms}}{list List with count matrices (default = NULL)} + +\item{\code{cms.preprocessed}}{list List with preprocessed count matrices after $doPreprocessing() (default = NULL)} + +\item{\code{cms.raw}}{list List with raw, unfiltered count matrices, i.e., including all CBs detected also empty droplets (default = NULL)} + +\item{\code{summary.metrics}}{data.frame Summary metrics from Cell Ranger (default = NULL)} + +\item{\code{detailed.metrics}}{data.frame Detailed metrics, i.e., no. genes and UMIs per cell (default = NULL)} -\item{\code{detailed.metrics}}{(default = NULL)} +\item{\code{comp.group}}{character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL)} -\item{\code{comp.group}}{(default = NULL)} +\item{\code{verbose}}{logical Print messages or not (default = TRUE)} -\item{\code{verbose}}{(default = TRUE)} +\item{\code{theme}}{ggplot2 theme (default: theme_bw())} -\item{\code{theme}}{(default = NULL)} +\item{\code{pal}}{Plotting palette (default = NULL)} -\item{\code{n.cores}}{(default = 1) +\item{\code{n.cores}}{numeric Number of cores for calculations (default = 1) Initialize a CRMetrics object} } \if{html}{\out{
}} @@ -681,7 +689,7 @@ Initialize a CRMetrics object} \item \href{#method-CRMetrics-filterCms}{\code{CRMetrics$filterCms()}} \item \href{#method-CRMetrics-selectMetrics}{\code{CRMetrics$selectMetrics()}} \item \href{#method-CRMetrics-plotFilteredCells}{\code{CRMetrics$plotFilteredCells()}} -\item \href{#method-CRMetrics-getConosDepth}{\code{CRMetrics$getConosDepth()}} +\item \href{#method-CRMetrics-getDepth}{\code{CRMetrics$getDepth()}} \item \href{#method-CRMetrics-getMitoFraction}{\code{CRMetrics$getMitoFraction()}} \item \href{#method-CRMetrics-prepareCellbender}{\code{CRMetrics$prepareCellbender()}} \item \href{#method-CRMetrics-saveCellbenderScript}{\code{CRMetrics$saveCellbenderScript()}} @@ -717,7 +725,8 @@ To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also rec theme = theme_bw(), n.cores = 1, sep.meta = ",", - raw.meta = FALSE + raw.meta = FALSE, + pal = NULL )}\if{html}{\out{}} } @@ -726,7 +735,7 @@ To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also rec \describe{ \item{\code{data.path}}{character Path to directory with Cell Ranger count data, one directory per sample (default = NULL).} -\item{\code{metadata}}{data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL).} +\item{\code{metadata}}{data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL)} \item{\code{cms}}{list List with count matrices (default = NULL)} @@ -736,17 +745,19 @@ To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also rec \item{\code{sep.cells}}{character Sample-cell separator. Only relevant if cms is provided and \code{unique.names=TRUE} (default = "!!")} -\item{\code{comp.group}}{character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL).} +\item{\code{comp.group}}{character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL)} -\item{\code{verbose}}{logical Print messages or not (default = TRUE).} +\item{\code{verbose}}{logical Print messages or not (default = TRUE)} -\item{\code{theme}}{ggplot2 theme (default: theme_bw()).} +\item{\code{theme}}{ggplot2 theme (default: theme_bw())} -\item{\code{n.cores}}{integer Number of cores for the calculations (default = self$n.cores).} +\item{\code{n.cores}}{integer Number of cores for the calculations (default = self$n.cores)} \item{\code{sep.meta}}{character Separator for metadata file (default = ",")} \item{\code{raw.meta}}{logical Keep metadata in its raw format. If FALSE, classes will be converted using "type.convert" (default = FALSE)} + +\item{\code{pal}}{character Plotting palette (default = NULL)} } \if{html}{\out{}} } @@ -891,7 +902,8 @@ Plot the number of samples. h.adj = 0.05, exact = FALSE, metadata = self$metadata, - second.comp.group = NULL + second.comp.group = NULL, + pal = self$pal )}\if{html}{\out{}} } @@ -907,6 +919,8 @@ Plot the number of samples. \item{\code{metadata}}{data.frame Metadata for samples (default = self$metadata).} \item{\code{second.comp.group}}{character Second comparison metric, must match a column name of metadata (default = NULL).} + +\item{\code{pal}}{character Plotting palette (default = NULL)} } \if{html}{\out{}} } @@ -962,7 +976,8 @@ Plot all summary stats or a selected list. plot.geom = "bar", se = FALSE, group.reg.lines = FALSE, - secondary.testing = TRUE + secondary.testing = TRUE, + pal = self$pal )}\if{html}{\out{}} } @@ -994,6 +1009,8 @@ Plot all summary stats or a selected list. \item{\code{group.reg.lines}}{logical For regression lines, if FALSE show one line, if TRUE show line per group defined by second.comp.group (default = FALSE)} \item{\code{secondary.testing}}{logical Whether to show post hoc testing (default = TRUE)} + +\item{\code{pal}}{character Palette (default = self$pol)} } \if{html}{\out{}} } @@ -1038,8 +1055,8 @@ Plot detailed metrics from the detailed.metrics object metadata = self$metadata, metrics = NULL, plot.geom = "violin", - data.path = self$data.path, - hline = TRUE + hline = TRUE, + pal = self$pal )}\if{html}{\out{}} } @@ -1056,9 +1073,11 @@ Plot detailed metrics from the detailed.metrics object \item{\code{plot.geom}}{character How to plot the data (default = "violin").} -\item{\code{data.path}}{character Path to cellranger count data (default = self$data.path).} - \item{\code{hline}}{logical Whether to show median as horizontal line (default = TRUE)} + +\item{\code{pal}}{character Plotting palette (default = NULL)} + +\item{\code{data.path}}{character Path to Cell Ranger count data (default = self$data.path).} } \if{html}{\out{}} } @@ -1108,6 +1127,7 @@ Plot cells in embedding using Conos and color by depth and doublets. species = c("human", "mouse"), size = 0.3, sep = "!!", + pal = self$pal, ... )}\if{html}{\out{}} } @@ -1133,6 +1153,8 @@ Plot cells in embedding using Conos and color by depth and doublets. \item{\code{sep}}{character Separator for creating unique cell names (default = "!!")} +\item{\code{pal}}{character Plotting palette (default = NULL)} + \item{\code{...}}{Plotting parameters passed to \code{sccore::embeddingPlot}.} } \if{html}{\out{}} @@ -1181,7 +1203,13 @@ message("Package 'pagoda2' not available.") \subsection{Method \code{plotDepth()}}{ Plot the sequencing depth in histogram. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{CRMetrics$plotDepth(cutoff = 1000, samples = self$metadata$sample, sep = "!!")}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{CRMetrics$plotDepth( + cutoff = 1000, + samples = self$metadata$sample, + sep = "!!", + keep.col = "E7CDC2", + filter.col = "A65141" +)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -1192,6 +1220,10 @@ Plot the sequencing depth in histogram. \item{\code{samples}}{character Sample names to include for plotting (default = $metadata$sample).} \item{\code{sep}}{character Separator for creating unique cell names (default = "!!")} + +\item{\code{keep.col}}{character Color for density of cells that are kept (default = "E7CDC2")} + +\item{\code{filter.col}}{Character Color for density of cells to be filtered (default = "A65141")} } \if{html}{\out{}} } @@ -1244,7 +1276,9 @@ Plot the mitochondrial fraction in histogram. cutoff = 0.05, species = c("human", "mouse"), samples = self$metadata$sample, - sep = "!!" + sep = "!!", + keep.col = "E7CDC2", + filter.col = "A65141" )}\if{html}{\out{}} } @@ -1258,6 +1292,10 @@ Plot the mitochondrial fraction in histogram. \item{\code{samples}}{character Sample names to include for plotting (default = $metadata$sample)} \item{\code{sep}}{character Separator for creating unique cell names (default = "!!")} + +\item{\code{keep.col}}{character Color for density of cells that are kept (default = "E7CDC2")} + +\item{\code{filter.col}}{Character Color for density of cells to be filtered (default = "A65141")} } \if{html}{\out{}} } @@ -1452,7 +1490,7 @@ Create Conos embedding. verbose = self$verbose, n.cores = self$n.cores, arg.buildGraph = list(), - arg.findCommunities = list(n.iterations = 1), + arg.findCommunities = list(), arg.embedGraph = list(method = "UMAP") )}\if{html}{\out{}} } @@ -1468,7 +1506,7 @@ Create Conos embedding. \item{\code{arg.buildGraph}}{list A list with additional arguments for the \code{buildGraph} function in Conos (default = list())} -\item{\code{arg.findCommunities}}{list A list with additional arguments for the \code{findCommunities} function in Conos (default = list(n.iterations = 1)) # Should be updated when Conos issue #123 is resolved} +\item{\code{arg.findCommunities}}{list A list with additional arguments for the \code{findCommunities} function in Conos (default = list())} \item{\code{arg.embedGraph}}{list A list with additional arguments for the \code{embedGraph} function in Conos (default = list(method = "UMAP))} } @@ -1524,7 +1562,8 @@ Filter cells based on depth, mitochondrial fraction and doublets from the count species = c("human", "mouse"), samples.to.exclude = NULL, verbose = self$verbose, - sep = "!!" + sep = "!!", + raw = FALSE )}\if{html}{\out{}} } @@ -1546,6 +1585,8 @@ Filter cells based on depth, mitochondrial fraction and doublets from the count \item{\code{verbose}}{logical Show progress (default = self$verbose)} \item{\code{sep}}{character Separator for creating unique cell names (default = "!!")} + +\item{\code{raw}}{boolean Filter on raw, unfiltered count matrices. Usually not intended (default = FALSE)} } \if{html}{\out{}} } @@ -1647,6 +1688,7 @@ Plot filtered cells in an embedding, in a bar plot, on a tile or export the data species = c("human", "mouse"), size = 0.3, sep = "!!", + cols = c("grey80", "red", "blue", "green", "yellow", "black", "pink", "purple"), ... )}\if{html}{\out{}} } @@ -1672,6 +1714,8 @@ Plot filtered cells in an embedding, in a bar plot, on a tile or export the data \item{\code{sep}}{character Separator for creating unique cell names (default = "!!")} +\item{\code{cols}}{character Colors used for plotting (default = c("grey80","red","blue","green","yellow","black","pink","purple"))} + \item{\code{...}}{Plotting parameters passed to \code{sccore::embeddingPlot}.} } \if{html}{\out{}} @@ -1717,12 +1761,12 @@ message("Package 'pagoda2' not available.") } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-CRMetrics-getConosDepth}{}}} -\subsection{Method \code{getConosDepth()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CRMetrics-getDepth}{}}} +\subsection{Method \code{getDepth()}}{ Extract sequencing depth from Conos object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{CRMetrics$getConosDepth()}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{CRMetrics$getDepth()}\if{html}{\out{
}} } \subsection{Returns}{ @@ -1749,8 +1793,8 @@ crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), crm$doPreprocessing() crm$createEmbedding() -# Get Conos depth -crm$getConosDepth() +# Get depth +crm$getDepth() } else { message("Package 'conos' not available.") } @@ -2100,7 +2144,8 @@ Plot the results from the CellBender estimations \subsection{Usage}{ \if{html}{\out{
}}\preformatted{CRMetrics$plotCbTraining( data.path = self$data.path, - samples = self$metadata$sample + samples = self$metadata$sample, + pal = self$pal )}\if{html}{\out{
}} } @@ -2110,6 +2155,8 @@ Plot the results from the CellBender estimations \item{\code{data.path}}{character Path to Cell Ranger outputs (default = self$data.path)} \item{\code{samples}}{character Sample names to include (default = self$metadata$sample)} + +\item{\code{pal}}{character Plotting palette (default = NULL)} } \if{html}{\out{}} } @@ -2139,7 +2186,9 @@ Plot the CellBender assigned cell probabilities \subsection{Usage}{ \if{html}{\out{
}}\preformatted{CRMetrics$plotCbCellProbs( data.path = self$data.path, - samples = self$metadata$sample + samples = self$metadata$sample, + low.col = "gray", + high.col = "red" )}\if{html}{\out{
}} } @@ -2149,6 +2198,10 @@ Plot the CellBender assigned cell probabilities \item{\code{data.path}}{character Path to Cell Ranger outputs (default = self$data.path)} \item{\code{samples}}{character Sample names to include (default = self$metadata$sample)} + +\item{\code{low.col}}{character Color for low probabilities (default = "gray")} + +\item{\code{high.col}}{character Color for high probabilities (default = "red")} } \if{html}{\out{}} } diff --git a/man/checkPackageInstalled.Rd b/man/checkPackageInstalled.Rd deleted file mode 100644 index ce09def..0000000 --- a/man/checkPackageInstalled.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/inner_functions.R -\name{checkPackageInstalled} -\alias{checkPackageInstalled} -\title{Check whether a package is installed} -\usage{ -checkPackageInstalled( - pkgs, - details = "to run this function", - install.help = NULL, - bioc = FALSE, - cran = FALSE -) -} -\description{ -This function is shamelessly stolen from github.com/kharchenkolab/Cacoa -} -\keyword{internal} diff --git a/man/plotGeom.Rd b/man/plotGeom.Rd index 44d8193..7c432be 100644 --- a/man/plotGeom.Rd +++ b/man/plotGeom.Rd @@ -4,10 +4,14 @@ \alias{plotGeom} \title{Plot the data as points, as bars as a histogram, or as a violin} \usage{ -plotGeom(plot.geom, col) +plotGeom(g, plot.geom, col, pal = NULL) } \arguments{ +\item{g}{ggplot2 object} + \item{plot.geom}{The plot.geom to use, "point", "bar", "histogram", or "violin".} + +\item{pal}{character Palette (default = NULL)} } \value{ geom From 52c2b86f1790599a78a2621c2be45024d01ad0bb Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 22 Jun 2023 13:11:48 +0200 Subject: [PATCH 11/35] Hotfix, col arguments --- R/CRMetrics.R | 16 ++++++++-------- man/CRMetrics.Rd | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 2ed2dd1..1a31c9b 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -682,8 +682,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param cutoff numeric The depth cutoff to color the cells in the embedding (default = 1e3). #' @param samples character Sample names to include for plotting (default = $metadata$sample). #' @param sep character Separator for creating unique cell names (default = "!!") - #' @param keep.col character Color for density of cells that are kept (default = "E7CDC2") - #' @param filter.col Character Color for density of cells to be filtered (default = "A65141") + #' @param keep.col character Color for density of cells that are kept (default = "#E7CDC2") + #' @param filter.col Character Color for density of cells to be filtered (default = "#A65141") #' @return ggplot2 object #' @examples #' \donttest{ @@ -717,8 +717,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, plotDepth = function(cutoff = 1e3, samples = self$metadata$sample, sep = "!!", - keep.col = "E7CDC2", - filter.col = "A65141"){ + keep.col = "#E7CDC2", + filter.col = "#A65141"){ # Checks checkPackageInstalled("conos", cran = TRUE) if (is.null(self$con)) { @@ -789,8 +789,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param species character Species to calculate the mitochondrial fraction for (default = "human") #' @param samples character Sample names to include for plotting (default = $metadata$sample) #' @param sep character Separator for creating unique cell names (default = "!!") - #' @param keep.col character Color for density of cells that are kept (default = "E7CDC2") - #' @param filter.col Character Color for density of cells to be filtered (default = "A65141") + #' @param keep.col character Color for density of cells that are kept (default = "#E7CDC2") + #' @param filter.col Character Color for density of cells to be filtered (default = "#A65141") #' @return ggplot2 object #' @examples #' \donttest{ @@ -825,8 +825,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, species = c("human","mouse"), samples = self$metadata$sample, sep = "!!", - keep.col = "E7CDC2", - filter.col = "A65141"){ + keep.col = "#E7CDC2", + filter.col = "#A65141"){ # Checks checkPackageInstalled("conos", cran = TRUE) if (is.null(self$con)) { diff --git a/man/CRMetrics.Rd b/man/CRMetrics.Rd index 1b28776..916c7ad 100644 --- a/man/CRMetrics.Rd +++ b/man/CRMetrics.Rd @@ -1010,7 +1010,7 @@ Plot all summary stats or a selected list. \item{\code{secondary.testing}}{logical Whether to show post hoc testing (default = TRUE)} -\item{\code{pal}}{character Palette (default = self$pol)} +\item{\code{pal}}{character Plotting palette (default = NULL)} } \if{html}{\out{}} } @@ -1207,8 +1207,8 @@ Plot the sequencing depth in histogram. cutoff = 1000, samples = self$metadata$sample, sep = "!!", - keep.col = "E7CDC2", - filter.col = "A65141" + keep.col = "#E7CDC2", + filter.col = "#A65141" )}\if{html}{\out{}} } @@ -1221,9 +1221,9 @@ Plot the sequencing depth in histogram. \item{\code{sep}}{character Separator for creating unique cell names (default = "!!")} -\item{\code{keep.col}}{character Color for density of cells that are kept (default = "E7CDC2")} +\item{\code{keep.col}}{character Color for density of cells that are kept (default = "#E7CDC2")} -\item{\code{filter.col}}{Character Color for density of cells to be filtered (default = "A65141")} +\item{\code{filter.col}}{Character Color for density of cells to be filtered (default = "#A65141")} } \if{html}{\out{}} } @@ -1277,8 +1277,8 @@ Plot the mitochondrial fraction in histogram. species = c("human", "mouse"), samples = self$metadata$sample, sep = "!!", - keep.col = "E7CDC2", - filter.col = "A65141" + keep.col = "#E7CDC2", + filter.col = "#A65141" )}\if{html}{\out{}} } @@ -1293,9 +1293,9 @@ Plot the mitochondrial fraction in histogram. \item{\code{sep}}{character Separator for creating unique cell names (default = "!!")} -\item{\code{keep.col}}{character Color for density of cells that are kept (default = "E7CDC2")} +\item{\code{keep.col}}{character Color for density of cells that are kept (default = "#E7CDC2")} -\item{\code{filter.col}}{Character Color for density of cells to be filtered (default = "A65141")} +\item{\code{filter.col}}{Character Color for density of cells to be filtered (default = "#A65141")} } \if{html}{\out{}} } From dd7a36d83aae95c8667de3a5cb381014426ad283 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Mon, 26 Jun 2023 13:42:19 +0200 Subject: [PATCH 12/35] Check for palette length and no. genes in plotCbAmbGenes --- R/CRMetrics.R | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 1a31c9b..8fc3130 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -260,7 +260,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param exact logical Whether to calculate exact p values (default = FALSE). #' @param metadata data.frame Metadata for samples (default = self$metadata). #' @param second.comp.group character Second comparison metric, must match a column name of metadata (default = NULL). - #' @param pal character Plotting palette (default = NULL) + #' @param pal character Plotting palette (default = self$pal) #' @return ggplot2 object #' @examples #' sample.names <- c("sample1", "sample2") @@ -332,7 +332,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param se logical For regression lines, show SE (default = FALSE) #' @param group.reg.lines logical For regression lines, if FALSE show one line, if TRUE show line per group defined by second.comp.group (default = FALSE) #' @param secondary.testing logical Whether to show post hoc testing (default = TRUE) - #' @param pal character Plotting palette (default = NULL) + #' @param pal character Plotting palette (default = self$pal) #' @return ggplot2 object #' @examples #' \donttest{ @@ -486,7 +486,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param plot.geom character How to plot the data (default = "violin"). #' @param data.path character Path to Cell Ranger count data (default = self$data.path). #' @param hline logical Whether to show median as horizontal line (default = TRUE) - #' @param pal character Plotting palette (default = NULL) + #' @param pal character Plotting palette (default = self$pal) #' @return ggplot2 object #' @examples #' \donttest{ @@ -622,7 +622,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, species = c("human","mouse"), size = 0.3, sep = "!!", - pal = self$pal, + pal = NULL, ...) { checkPackageInstalled("conos", cran = TRUE) if (sum(depth, mito.frac, !is.null(doublet.method)) > 1) stop("Only one filter allowed. For multiple filters, use plotFilteredCells(type = 'embedding').") @@ -1894,7 +1894,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Plot the results from the CellBender estimations #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) - #' @param pal character Plotting palette (default = NULL) + #' @param pal character Plotting palette (default = self$pal) #' @return A ggplot2 object #' @examples #' \dontrun{ @@ -2031,6 +2031,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param cutoff numeric Cutoff of ambient gene expression to use to extract ambient genes per sample #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) + #' @param pal character Plotting palette (default = self$pal) #' @return A ggplot2 object #' @examples #' \dontrun{ @@ -2042,7 +2043,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } plotCbAmbGenes = function(cutoff = 0.005, data.path = self$data.path, - samples = self$metadata$sample) { + samples = self$metadata$sample, + pal = self$pal) { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) paths <- getH5Paths(data.path, samples, "cellbender") @@ -2057,21 +2059,27 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, filter(exp >= cutoff) }) %>% setNames(samples) %>% - bind_rows() %>% - {table(.$gene.names)} %>% + bind_rows() %$% + table(gene.names) %>% as.data.frame() %>% arrange(desc(Freq)) %>% mutate(Freq = Freq / length(samples), - Var1 = factor(Var1, levels = Var1)) + gene.names = factor(gene.names, levels = gene.names)) - g <- ggplot(amb, aes(Var1, Freq, fill = Var1)) + + g <- ggplot(amb, aes(gene.names, Freq, fill = gene.names)) + geom_bar(stat = "identity") + self$theme + labs(x = "", y = "Proportion") + theme(axis.text.x = element_text(angle = 90)) + guides(fill = "none") - if (!is.null(pal)) g <- g + scale_fill_manual(values = pal) + if (!is.null(pal)) { + gene.len <- amb$gene.names %>% + unique() %>% + length() + + if (length(pal) < gene.len) warning(paste0("Palette has ",length(pal)," colors but there are ",gene.len," genes, omitting palette.")) else g <- g + scale_fill_manual(values = pal) + } return(g) }, From 6a0220a2d8b501025a38aee942db06dc9210d1fa Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Mon, 26 Jun 2023 13:44:16 +0200 Subject: [PATCH 13/35] Updated documentation --- man/CRMetrics.Rd | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/man/CRMetrics.Rd b/man/CRMetrics.Rd index 916c7ad..236c565 100644 --- a/man/CRMetrics.Rd +++ b/man/CRMetrics.Rd @@ -920,7 +920,7 @@ Plot the number of samples. \item{\code{second.comp.group}}{character Second comparison metric, must match a column name of metadata (default = NULL).} -\item{\code{pal}}{character Plotting palette (default = NULL)} +\item{\code{pal}}{character Plotting palette (default = self$pal)} } \if{html}{\out{}} } @@ -1010,7 +1010,7 @@ Plot all summary stats or a selected list. \item{\code{secondary.testing}}{logical Whether to show post hoc testing (default = TRUE)} -\item{\code{pal}}{character Plotting palette (default = NULL)} +\item{\code{pal}}{character Plotting palette (default = self$pal)} } \if{html}{\out{}} } @@ -1075,7 +1075,7 @@ Plot detailed metrics from the detailed.metrics object \item{\code{hline}}{logical Whether to show median as horizontal line (default = TRUE)} -\item{\code{pal}}{character Plotting palette (default = NULL)} +\item{\code{pal}}{character Plotting palette (default = self$pal)} \item{\code{data.path}}{character Path to Cell Ranger count data (default = self$data.path).} } @@ -1127,7 +1127,7 @@ Plot cells in embedding using Conos and color by depth and doublets. species = c("human", "mouse"), size = 0.3, sep = "!!", - pal = self$pal, + pal = NULL, ... )}\if{html}{\out{}} } @@ -2156,7 +2156,7 @@ Plot the results from the CellBender estimations \item{\code{samples}}{character Sample names to include (default = self$metadata$sample)} -\item{\code{pal}}{character Plotting palette (default = NULL)} +\item{\code{pal}}{character Plotting palette (default = self$pal)} } \if{html}{\out{}} } @@ -2274,7 +2274,8 @@ Plot the most abundant estimated ambient genes from the CellBender calculations \if{html}{\out{
}}\preformatted{CRMetrics$plotCbAmbGenes( cutoff = 0.005, data.path = self$data.path, - samples = self$metadata$sample + samples = self$metadata$sample, + pal = self$pal )}\if{html}{\out{
}} } @@ -2286,6 +2287,8 @@ Plot the most abundant estimated ambient genes from the CellBender calculations \item{\code{data.path}}{character Path to Cell Ranger outputs (default = self$data.path)} \item{\code{samples}}{character Sample names to include (default = self$metadata$sample)} + +\item{\code{pal}}{character Plotting palette (default = self$pal)} } \if{html}{\out{}} } From 3240bb0939f9077c22ba3e05f9fc4a339724ea51 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Tue, 27 Jun 2023 13:29:01 +0200 Subject: [PATCH 14/35] Bugfix for `filterCms` where "species" was not forwarded to `getMitoFraction` internally --- NEWS.md | 7 ++++--- R/CRMetrics.R | 36 ++++++++++++++++++++++-------------- man/CRMetrics.Rd | 5 +---- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index f4d70d4..3c3c0cd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,12 @@ # CRMetrics 0.3.0 -* Updated createEmbedding upon resolve of https://github.com/kharchenkolab/conos/issues/123 +* Updated `createEmbedding` upon resolve of https://github.com/kharchenkolab/conos/issues/123 * Updated DESCRIPTION -* Added checks for detectDoublets -* Added possibility for multiple paths for "data.path" argument in initialize +* Added checks for `detectDoublets` +* Added possibility for multiple paths for "data.path" argument in `initialize` * Changed depth calculation from Conos depth to raw depth (= colSums) * Added plotting palette to object +* Fixed bug for `filterCms` where "species"" was not forwarded to `getMitoFraction` internally # CRMetrics 0.2.3 diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 8fc3130..724f197 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -1184,8 +1184,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }, #' @description Filter cells based on depth, mitochondrial fraction and doublets from the count matrix. - #' @param min.transcripts.per.cell numeric Minimal transcripts per cell (default = 100) - #' @param depth.cutoff numeric Depth cutoff (default = NULL). + #' @param depth.cutoff numeric Depth (transcripts per cell) cutoff (default = NULL). #' @param mito.cutoff numeric Mitochondrial fraction cutoff (default = NULL). #' @param doublets character Doublet detection method to use (default = NULL). #' @param species character Species to calculate the mitochondrial fraction for (default = "human"). @@ -1224,8 +1223,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' message("Package 'pagoda2' not available.") #' } #' } - filterCms = function(min.transcripts.per.cell = 100, - depth.cutoff = NULL, + filterCms = function(depth.cutoff = NULL, mito.cutoff = NULL, doublets = NULL, species = c("human","mouse"), @@ -1233,6 +1231,16 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, verbose = self$verbose, sep = "!!", raw = FALSE) { + + if (verbose) { + filters <- c() + if (!is.null(depth.cutoff)) filters %<>% c(paste0("depth.cutoff = ",depth.cutoff)) + if (!is.null(mito.cutoff)) filters %<>% c(paste0("mito.cutoff = ",mito.cutoff," and species = ",species)) + if (!is.null(doublets)) filters %<>% c(paste0("doublet method = ",doublets)) + + message(paste0("Filtering based on: ",paste(filters, collapse="; "))) + } + # Preparations species %<>% tolower() %>% @@ -1255,9 +1263,6 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, samples <- cms %>% names() - # Apply min.transcripts.per.cell - if (min.transcripts.per.cell > 0) cms %<>% lapply(\(cm) cm[,sparseMatrixStats::colSums2(cm) >= min.transcripts.per.cell]) - # Depth if (!is.null(depth.cutoff)) { depth.filter <- self$getDepth() %>% @@ -1268,7 +1273,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Mitochondrial fraction if (!is.null(mito.cutoff)) { - mito.filter <- self$getMitoFraction() %>% + mito.filter <- self$getMitoFraction(species = species) %>% filterVector("mito.cutoff", mito.cutoff, samples, sep) %>% !. # NB, has to be negative } else { @@ -1287,10 +1292,10 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, } # Get cell index - cell.idx <- cms %>% - sapply(colnames) %>% - unlist() %>% - unname() + cell.idx <- list(names(depth.filter), + names(mito.filter), + names(doublets.filter)) %>% + Reduce(intersect, .) # Create split vector split.vec <- strsplit(cell.idx, sep) %>% @@ -1302,13 +1307,16 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, doublets = doublets.filter) %>% .[!sapply(., is.null)] %>% lapply(\(filter) filter[cell.idx]) %>% # Ensure same order of cells - bind_cols() %>% + bind_cols() %>% apply(1, all) %>% split(split.vec) if (verbose) { + cells.total <- cms %>% + sapply(ncol) %>% + sum() cells.remove <- sum(!filter.list %>% unlist()) - cells.total <- length(cell.idx) + if (!any(is.null(depth.filter), is.null(mito.filter))) cells.remove <- cells.remove + cells.total - nrow(self$con$embedding) cells.percent <- cells.remove / cells.total * 100 message(paste0(Sys.time()," Removing ",cells.remove," of ", cells.total," cells (",formatC(cells.percent, digits = 3),"%)")) } diff --git a/man/CRMetrics.Rd b/man/CRMetrics.Rd index 236c565..a924cc4 100644 --- a/man/CRMetrics.Rd +++ b/man/CRMetrics.Rd @@ -1555,7 +1555,6 @@ message("Package 'pagoda2' not available.") Filter cells based on depth, mitochondrial fraction and doublets from the count matrix. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{CRMetrics$filterCms( - min.transcripts.per.cell = 100, depth.cutoff = NULL, mito.cutoff = NULL, doublets = NULL, @@ -1570,9 +1569,7 @@ Filter cells based on depth, mitochondrial fraction and doublets from the count \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{min.transcripts.per.cell}}{numeric Minimal transcripts per cell (default = 100)} - -\item{\code{depth.cutoff}}{numeric Depth cutoff (default = NULL).} +\item{\code{depth.cutoff}}{numeric Depth (transcripts per cell) cutoff (default = NULL).} \item{\code{mito.cutoff}}{numeric Mitochondrial fraction cutoff (default = NULL).} From 52acac5dc8af5f5dd2b6842a14759eb17c7a4605 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Tue, 27 Jun 2023 14:26:07 +0200 Subject: [PATCH 15/35] Added missing "species" argument internally for `plotMitoFraction` --- R/CRMetrics.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 724f197..ff41077 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -835,11 +835,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (length(cutoff) > 1 & length(self$con$samples) != length(cutoff)) stop(paste0("'cutoff' has a length of ",length(cutoff),", but the conos object contains ",length(tmp)," samples. Please adjust.")) - mf <- self$getMitoFraction() + mf <- self$getMitoFraction(species = species) mf.zero <- sum(mf == 0) / length(mf) * 100 - if (mf.zero > 95) warning(paste0(mf.zero,"% of all cells does not express mitochondrial genes. Plotting may behave unexpected.")) + if (mf.zero > 95) warning(paste0(mf.zero,"% of all cells do not express mitochondrial genes. Plotting may behave unexpected.")) # Preparations tmp <- mf %>% From b00df1e6cb5b217b81cdbfe60ef2ba7dc00be517 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 28 Jun 2023 09:16:56 +0200 Subject: [PATCH 16/35] Fix to `pathsToList` --- R/inner_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/inner_functions.R b/R/inner_functions.R index 295c0ff..54b749b 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -568,7 +568,7 @@ checkDataPath <- function(data.path) { } pathsToList <- function(data.path, samples) { - tmp <- data.path %>% + data.path %>% lapply(\(path) list.dirs(path, recursive = F, full.names = F) %>% {if (!is.null(samples)) .[. %in% samples] else . } %>% data.frame(sample = ., path = path)) %>% From 1c599628efd2fbf753df1a78b279f393571d5da2 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 28 Jun 2023 09:17:14 +0200 Subject: [PATCH 17/35] Updated initialize to only include 10x output folders --- R/CRMetrics.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index ff41077..7fbd22b 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -125,9 +125,11 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Metadata if (is.null(metadata)) { if (!is.null(data.path)) { - self$metadata <- data.frame(sample = list.dirs(data.path, - recursive = FALSE, - full.names = FALSE)) + self$metadata <- list.dirs(data.path, + recursive = FALSE, + full.names = FALSE) %>% + .[pathsToList(data.path, .) %>% sapply(\(path) file.exists(paste0(path[2],"/",path[1],"/outs")))] %>% + {data.frame(sample = .)} } } else { if (inherits(metadata, "data.frame")) { From 8f6ad2193fbae51279e9d2d1220fcb4c0c0d3ae3 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 28 Jun 2023 12:21:56 +0200 Subject: [PATCH 18/35] Updated `getDepth` to not be dependent on Conos --- R/CRMetrics.R | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 7fbd22b..533c999 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -1561,15 +1561,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' message("Package 'pagoda2' not available.") #' } #' } - getDepth = function() { - checkPackageInstalled("conos", cran = TRUE) + getDepth = function(cms = self$cms) { + checkPackageInstalled("sparseMatrixStats", bioc = TRUE) - self$con$samples %>% - lapply(`[[`, "misc") %>% - lapply(`[[`, "rawCounts") %>% - lapply(\(x) `names<-`(sparseMatrixStats::rowSums2(x), rownames(x))) %>% - Reduce(c, .) %>% - .[lapply(self$con$samples, conos::getCellNames) %>% unlist()] + cms %>% + lapply(\(cm) `names<-`(sparseMatrixStats::colSums2(cm), colnames(cm))) %>% + Reduce(c, .) }, #' @description Calculate the fraction of mitochondrial genes. From e049b3717d9c11913fc5cf9b030ad56c6690d07e Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 28 Jun 2023 13:16:50 +0200 Subject: [PATCH 19/35] Bug fix `getMitoFraction` --- R/CRMetrics.R | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 533c999..acb4a20 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -1562,8 +1562,6 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } #' } getDepth = function(cms = self$cms) { - checkPackageInstalled("sparseMatrixStats", bioc = TRUE) - cms %>% lapply(\(cm) `names<-`(sparseMatrixStats::colSums2(cm), colnames(cm))) %>% Reduce(c, .) @@ -1602,23 +1600,29 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' message("Package 'pagoda2' not available.") #' } #' } - getMitoFraction = function(species = c("human", "mouse")) { - checkPackageInstalled("conos", cran = TRUE) + getMitoFraction = function(species = c("human", "mouse"), cms = self$cms) { + # Checks species %<>% match.arg(c("human", "mouse")) - + if (is.null(cms)) stop("Cms is NULL, aborting.") if (species=="human") symb <- "MT-" else if (species=="mouse") symb <- "mt-" else stop("Species must either be 'human' or 'mouse'.") - tmp <- self$con$samples %>% - lapply(`[[`, "counts") %>% + + # Calculate + tmp <- cms %>% lapply(\(cm) { - tmp.mat <- cm[,grep(symb, colnames(cm))] - if (is(tmp.mat, "numeric")) { + tmp.mat <- cm[grep(symb, rownames(cm)),] + + if (inherits(tmp.mat, "numeric")) { nom <- tmp.mat } else { - nom <- sparseMatrixStats::rowSums2(tmp.mat) + nom <- sparseMatrixStats::colSums2(tmp.mat) } - (nom / sparseMatrixStats::rowSums2(cm)) %>% - setNames(cm %>% rownames()) + + out <- (nom / sparseMatrixStats::colSums2(cm)) %>% + `names<-`(colnames(cm)) + out[is.na(out)] <- 0 + + return(out) }) %>% Reduce(c, .) @@ -2115,7 +2119,7 @@ addSummaryFromCms = function(cms = self$cms, n.cores = self$n.cores, verbose = self$verbose) { checkPackageInstalled("sparseMatrixStats", bioc = TRUE) - if (!is.null(self$summary.metrics)) warning("Overvriting existing summary metrics") + if (!is.null(self$summary.metrics)) warning("Overwriting existing summary metrics \n") if (verbose) message(paste0(Sys.time()," Calculating ",length(cms)," summaries using ", if (n.cores < length(cms)) n.cores else length(cms)," cores")) From 7835d7a6331496fb3a73c04136dce66c71ebd62a Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 29 Jun 2023 14:38:54 +0200 Subject: [PATCH 20/35] Minor fixes --- R/CRMetrics.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index acb4a20..4386c0d 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -676,7 +676,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, g <- self$con$plotGraph(colors = mf * 1, title = main, size = size, ...) } - if (!exists("g")) g <- self$con$plotGraph(palette = pal, ...) + if (!exists("g")) g <- self$con$plotGraph(palette = pal, size = size, ...) return(g) }, @@ -1297,6 +1297,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, cell.idx <- list(names(depth.filter), names(mito.filter), names(doublets.filter)) %>% + .[!sapply(., is.null)] %>% Reduce(intersect, .) # Create split vector From b939db91d627a99d7e39313b373c69c52db2dbbf Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Tue, 4 Jul 2023 11:21:35 +0200 Subject: [PATCH 21/35] Moved adding list of CMs from `addDetailedMetrics` to `addCms`, updated documentation --- NEWS.md | 1 + R/CRMetrics.R | 142 +++++++++++++++++++++++------------------------ man/CRMetrics.Rd | 66 +++++++++++----------- 3 files changed, 107 insertions(+), 102 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3c3c0cd..1b3e716 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,7 @@ * Changed depth calculation from Conos depth to raw depth (= colSums) * Added plotting palette to object * Fixed bug for `filterCms` where "species"" was not forwarded to `getMitoFraction` internally +* Moved adding list of CMs to CRMetrics object from addDetailedMetrics() to addCms() since this is more logical # CRMetrics 0.2.3 diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 4386c0d..54f41ca 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -165,14 +165,8 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }, #' @description Function to read in detailed metrics. This is not done upon initialization for speed. + #' @param cms list List of (sparse) count matrices (default = self$cms) #' @param min.transcripts.per.cell numeric Minimal number of transcripts per cell (default = 100) - #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE` (default = FALSE) - #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE) - #' @param sep character Separator for cell names (default = "!!"). - #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE) - #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE) - #' @param data.path character Path to cellranger count data (default = self$data.path). - #' @param sample.names character Vector containing sample names (default = self$metadata$sample). #' @param n.cores integer Number of cores for the calculations (default = self$n.cores). #' @param verbose logical Print messages or not (default = self$verbose). #' @return Count matrices @@ -191,41 +185,23 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' #' # Run function #' crm$addDetailedMetrics() - addDetailedMetrics = function(min.transcripts.per.cell = 100, - raw = FALSE, - symbol = TRUE, - sep = "!!", - cellbender = FALSE, - unique.names = TRUE, - data.path = self$data.path, - sample.names = self$metadata$sample, + addDetailedMetrics = function(cms = self$cms, + min.transcripts.per.cell = 100, n.cores = self$n.cores, verbose = self$verbose) { - # Read data - if (is.null(self$cms)) { - if (cellbender) { - self$cms <- read10xH5(data.path = data.path, sample.names = sample.names, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) - } else { - self$cms <- read10x(data.path = data.path, sample.names = sample.names, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose) - } - } else { - message("CMs already present. To overwrite, set $cms = NULL and rerun this function.") - } - - # Check for unrealistic large samples - size.check <- self$cms %>% + # Checks + if (is.null(self$detailed.metrics)) stop("Detailed metrics already present. To overwrite, set $detailed.metrics = NULL and rerun this function") + + size.check <- cms %>% sapply(dim) %>% apply(2, prod) %>% {. > 2^31-1} if (any(size.check)) warning(message(paste0("Unrealistic large samples detected that are larger than what can be handled in R. Consider removing ",paste(size.check[size.check] %>% names(), collapse = " "),". If kept, you may experience errors."))) # Calculate metrics - if (is.null(self$detailed.metrics)) { - if (min.transcripts.per.cell > 0) cms <- self$cms %>% lapply(\(cm) cm[,sparseMatrixStats::colSums2(cm) > min.transcripts.per.cell]) - self$detailed.metrics <- addDetailedMetricsInner(cms = cms, verbose = verbose, n.cores = n.cores) - } else { - message("Detailed metrics already present. To overwrite, set $detailed.metrics = NULL and rerun this function") - } + if (min.transcripts.per.cell > 0) cms %<>% lapply(\(cm) cm[,sparseMatrixStats::colSums2(cm) > min.transcripts.per.cell]) + + self$detailed.metrics <- addDetailedMetricsInner(cms = cms, verbose = verbose, n.cores = n.cores) }, #' @description Add comparison group for statistical testing. @@ -1532,6 +1508,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }, #' @description Extract sequencing depth from Conos object. + #' @param cms list List of (sparse) count matrices (default = self$cms) #' @return data frame #' @examples #' \donttest{ @@ -1570,7 +1547,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Calculate the fraction of mitochondrial genes. #' @param species character Species to calculate the mitochondrial fraction for (default = "human"). - #' @param force logical Force update of stored vector (default = FALSE) + #' @param cms list List of (sparse) count matrices (default = self$cms) #' @return data frame #' @examples #' \donttest{ @@ -1837,12 +1814,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, }, #' @description Add a list of count matrices to the CRMetrics object. - #' @param cms list List of count matrices - #' @param sample.names character Vector of sample names. If NULL, sample.names are extracted from cms (default = NULL) - #' @param unique.names logical Create unique cell names (default = TRUE) + #' @param cms list List of (sparse) count matrices (default = NULL) + #' @param data.path character Path to cellranger count data (default = self$data.path). + #' @param sample.names character Vector of sample names. If NULL, sample.names are extracted from cms (default = self$metadata$sample) + #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE) + #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE` (default = FALSE) + #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE) + #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE) #' @param sep character Separator used to create unique cell names (default = "!!") #' @param n.cores integer Number of cores to use (default = self$n.cores) - #' @return A ggplot2 object + #' @param verbose boolean Print progress (default = self$verbose) + #' @return Add list of (sparse) count matrices to R6 class object #' @examples #' \dontrun{ #' crm <- CRMetrics$new(data.path = "/path/to/count/data/") @@ -1858,49 +1840,67 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' #' crm$addCms(cms = testdata.cms) #' } - addCms = function(cms, - sample.names = NULL, + addCms = function(cms = NULL, + data.path = self$data.path, + sample.names = self$metadata$sample, + cellbender = FALSE, + raw = FALSE, + symbol = TRUE, unique.names = TRUE, sep = "!!", - n.cores = self$n.cores) { - # Checks begin - if (!is.list(cms)) stop("'cms' must be a list of count matrices") - - sample.class <- sapply(cms, class) %>% - unlist() %>% - sapply(\(x) grepl("Matrix", x)) - if (!any(sample.class)) { - warning(paste0("Some samples are not a matrix (maybe they only contain 1 cell). Removing the following samples: ",paste(sample.class[!sample.class] %>% names(), collapse = " "))) - cms %<>% .[sample.class] - } + n.cores = self$n.cores, + verbose = self$verbose) { + # Check + if (is.null(cms) && is.null(data.path)) stop("Either 'cms' or 'data.path' must be provided.") + if (!is.null(self$cms)) stop("CMs already present. To overwrite, set $cms = NULL and rerun this function.") - sample.cells <- sapply(cms, ncol) %>% unlist() - if (any(sample.cells == 0)) { - warning(paste0("Some samples does not contain cells. Removing the following samples: ",paste(sample.cells[sample.cells == 0] %>% names(), collapse=" "))) - cms %<>% .[sample.cells > 0] + if (!is.null(cms)) { + # Add from cms argument + + ## Checks + if (!is.list(cms)) stop("'cms' must be a list of count matrices") + + if (verbose) message(paste0("Adding list of ",length(cms)," count matrices.")) + + sample.class <- sapply(cms, class) %>% + unlist() %>% + sapply(\(x) grepl("Matrix", x)) + if (!any(sample.class)) { + warning(paste0("Some samples are not a matrix (maybe they only contain 1 cell). Removing the following samples: ",paste(sample.class[!sample.class] %>% names(), collapse = " "))) + cms %<>% .[sample.class] + } + + sample.cells <- sapply(cms, ncol) %>% unlist() + if (any(sample.cells == 0)) { + warning(paste0("Some samples does not contain cells. Removing the following samples: ",paste(sample.cells[sample.cells == 0] %>% names(), collapse=" "))) + cms %<>% .[sample.cells > 0] + } + + if (is.null(sample.names)) sample.names <- names(cms) + if (is.null(sample.names)) stop("Either 'cms' must be named or 'sample.names' cannot be NULL") + if (length(sample.names) != length(cms)) stop("Length of 'sample.names' does not match length of 'cms'.") + + ## Create unique names + if (unique.names) cms %<>% createUniqueCellNames(sample.names, sep) + } else { + # Add from data.path argument + if (cellbender) { + cms <- read10xH5(data.path = data.path, sample.names = sample.names, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + } else { + cms <- read10x(data.path = data.path, sample.names = sample.names, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + } } - if (is.null(sample.names)) sample.names <- names(cms) - if (is.null(sample.names)) stop("Either cms must be named or names cannot be NULL") - if (length(sample.names) != length(cms)) stop("Length of 'sample.names' does not match length of 'cms'.") - # Checks end - - if (unique.names) cms %<>% createUniqueCellNames(sample.names, sep) - self$cms <- cms if (!is.null(self$metadata)) { - if (length(cms) != nrow(self$metadata)) { - warning("Overwriting metadata") - self$metadata <- data.frame(sample = sample.names) - } - } else { + warning("Overwriting metadata") self$metadata <- data.frame(sample = sample.names) } - if (!is.null(self$detailed.metrics)) warning("Consider updating detailed metrics by setting $detailed.metrics <- NULL and running $addDetailedMetrics()") - if (!is.null(self$con)) warning("Consider updating embedding by setting $cms.preprocessed <- NULL and $con <- NULL, and running $doPreprocessing() and $createEmbedding()") - if (!is.null(self$doublets)) warning("Consider updating doublet scores by setting $doublets <- NULL and running $detectDoublets()") + if (!is.null(self$detailed.metrics)) warning("Consider updating detailed metrics by setting $detailed.metrics <- NULL and running $addDetailedMetrics(). ") + if (!is.null(self$con)) warning("Consider updating embedding by setting $cms.preprocessed <- NULL and $con <- NULL, and running $doPreprocessing() and $createEmbedding(). ") + if (!is.null(self$doublets)) warning("Consider updating doublet scores by setting $doublets <- NULL and running $detectDoublets(). ") }, #' @description Plot the results from the CellBender estimations diff --git a/man/CRMetrics.Rd b/man/CRMetrics.Rd index a924cc4..ca8da86 100644 --- a/man/CRMetrics.Rd +++ b/man/CRMetrics.Rd @@ -782,14 +782,8 @@ crm <- CRMetrics$new(data.path = "/path/to/count/data/") Function to read in detailed metrics. This is not done upon initialization for speed. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{CRMetrics$addDetailedMetrics( + cms = self$cms, min.transcripts.per.cell = 100, - raw = FALSE, - symbol = TRUE, - sep = "!!", - cellbender = FALSE, - unique.names = TRUE, - data.path = self$data.path, - sample.names = self$metadata$sample, n.cores = self$n.cores, verbose = self$verbose )}\if{html}{\out{
}} @@ -798,21 +792,9 @@ Function to read in detailed metrics. This is not done upon initialization for s \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{min.transcripts.per.cell}}{numeric Minimal number of transcripts per cell (default = 100)} - -\item{\code{raw}}{logical Add raw count matrices from Cell Ranger output. Cannot be combined with \code{cellbender=TRUE} (default = FALSE)} - -\item{\code{symbol}}{character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE)} - -\item{\code{sep}}{character Separator for cell names (default = "!!").} - -\item{\code{cellbender}}{logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE)} - -\item{\code{unique.names}}{logical Make cell names unique based on \code{sep} parameter (default = TRUE)} +\item{\code{cms}}{list List of (sparse) count matrices (default = self$cms)} -\item{\code{data.path}}{character Path to cellranger count data (default = self$data.path).} - -\item{\code{sample.names}}{character Vector containing sample names (default = self$metadata$sample).} +\item{\code{min.transcripts.per.cell}}{numeric Minimal number of transcripts per cell (default = 100)} \item{\code{n.cores}}{integer Number of cores for the calculations (default = self$n.cores).} @@ -1763,9 +1745,16 @@ message("Package 'pagoda2' not available.") \subsection{Method \code{getDepth()}}{ Extract sequencing depth from Conos object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{CRMetrics$getDepth()}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{CRMetrics$getDepth(cms = self$cms)}\if{html}{\out{
}} } +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{cms}}{list List of (sparse) count matrices (default = self$cms)} +} +\if{html}{\out{
}} +} \subsection{Returns}{ data frame } @@ -1811,7 +1800,7 @@ message("Package 'pagoda2' not available.") \subsection{Method \code{getMitoFraction()}}{ Calculate the fraction of mitochondrial genes. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{CRMetrics$getMitoFraction(species = c("human", "mouse"))}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{CRMetrics$getMitoFraction(species = c("human", "mouse"), cms = self$cms)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -1819,7 +1808,7 @@ Calculate the fraction of mitochondrial genes. \describe{ \item{\code{species}}{character Species to calculate the mitochondrial fraction for (default = "human").} -\item{\code{force}}{logical Force update of stored vector (default = FALSE)} +\item{\code{cms}}{list List of (sparse) count matrices (default = self$cms)} } \if{html}{\out{
}} } @@ -2085,31 +2074,46 @@ crm$getTotalDroplets() Add a list of count matrices to the CRMetrics object. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{CRMetrics$addCms( - cms, - sample.names = NULL, + cms = NULL, + data.path = self$data.path, + sample.names = self$metadata$sample, + cellbender = FALSE, + raw = FALSE, + symbol = TRUE, unique.names = TRUE, sep = "!!", - n.cores = self$n.cores + n.cores = self$n.cores, + verbose = self$verbose )}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{cms}}{list List of count matrices} +\item{\code{cms}}{list List of (sparse) count matrices (default = NULL)} -\item{\code{sample.names}}{character Vector of sample names. If NULL, sample.names are extracted from cms (default = NULL)} +\item{\code{data.path}}{character Path to cellranger count data (default = self$data.path).} + +\item{\code{sample.names}}{character Vector of sample names. If NULL, sample.names are extracted from cms (default = self$metadata$sample)} + +\item{\code{cellbender}}{logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE)} -\item{\code{unique.names}}{logical Create unique cell names (default = TRUE)} +\item{\code{raw}}{logical Add raw count matrices from Cell Ranger output. Cannot be combined with \code{cellbender=TRUE} (default = FALSE)} + +\item{\code{symbol}}{character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE)} + +\item{\code{unique.names}}{logical Make cell names unique based on \code{sep} parameter (default = TRUE)} \item{\code{sep}}{character Separator used to create unique cell names (default = "!!")} \item{\code{n.cores}}{integer Number of cores to use (default = self$n.cores)} + +\item{\code{verbose}}{boolean Print progress (default = self$verbose)} } \if{html}{\out{
}} } \subsection{Returns}{ -A ggplot2 object +Add list of (sparse) count matrices to R6 class object } \subsection{Examples}{ \if{html}{\out{
}} From 01eb14d44dc83e68e2241ed0ae55c5e2d905dbbf Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 5 Jul 2023 11:13:43 +0200 Subject: [PATCH 22/35] Added to `detectDoublets` possibility to export Python script --- NEWS.md | 1 + R/CRMetrics.R | 152 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 100 insertions(+), 53 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1b3e716..c19a0dc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,7 @@ * Added plotting palette to object * Fixed bug for `filterCms` where "species"" was not forwarded to `getMitoFraction` internally * Moved adding list of CMs to CRMetrics object from addDetailedMetrics() to addCms() since this is more logical +* `detectDoublets` can now export Python script with argument `export = TRUE` # CRMetrics 0.2.3 diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 54f41ca..1dd94cc 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -872,11 +872,14 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Detect doublet cells. #' @param method character Which method to use, either `scrublet` or `doubletdetection` (default="scrublet"). #' @param cms list List containing the count matrices (default=self$cms). + #' @param sample.names character Vector of sample names. If NULL, sample.names are extracted from cms (default = self$metadata$sample) #' @param env character Environment to run python in (default="r-reticulate"). #' @param conda.path character Path to conda environment (default=system("whereis conda")). #' @param n.cores integer Number of cores to use (default = self$n.cores) #' @param verbose logical Print messages or not (default = self$verbose) #' @param args list A list with additional arguments for either `DoubletDetection` or `scrublet`. Please check the respective manuals. + #' @param export boolean Export CMs in order to detect doublets outside R (default = FALSE) + #' @param data.path character Path to write data, only relevant if `export = TRUE`. Last character must be `/` (default = self$data.path) #' @return data.frame #' @examples #' \dontrun{ @@ -898,21 +901,25 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' conda.path = "/opt/software/miniconda/4.12.0/condabin/conda") #' } detectDoublets = function(method = c("scrublet","doubletdetection"), - cms = self$cms, + cms = self$cms, + sample.names = self$metadata$sample, env = "r-reticulate", conda.path = system("whereis conda"), n.cores = self$n.cores, verbose = self$verbose, - args = list()) { + args = list(), + export = FALSE, + data.path = self$data.path) { # Checks method %<>% tolower() %>% match.arg(c("scrublet","doubletdetection")) if (!is.list(args)) stop("'args' must be a list.") - if (inherits(cms, "list")) stop("'cms' must be a list") + if (!inherits(cms, "list")) stop("'cms' must be a list") if (!all(sapply(cms, inherits, "Matrix"))) { warning("All samples in 'cms' must be a matrix, trying to convert to dgCMatrix...") cms %<>% lapply(as, "CsparseMatrix") if (!all(sapply(cms, inherits, "Matrix"))) stop("Could not convert automatically.") } + if (export && is.null(data.path)) stop("When 'export = TRUE', 'data.path' must be provided.") # Prepare arguments if (method == "doubletdetection") { @@ -967,56 +974,95 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, args.std[[i]] <- as.integer(args.std[[i]]) } - # Prep environment - if (verbose) message("Loading prerequisites...") - checkPackageInstalled("reticulate", cran = TRUE) - reticulate::use_condaenv(condaenv = env, conda = conda.path, required = TRUE) - if (!reticulate::py_module_available(method)) stop(paste0("'",method,"' is not installed in your current conda environment.")) - reticulate::source_python(paste(system.file(package="CRMetrics"), paste0(method,".py"), sep ="/")) - - if (verbose) message("Identifying doublets using '",method,"'...") - - # Calculate - tmp <- cms %>% - names() %>% - lapply(\(cm) { - if (verbose) message(paste0("Running sample '",cm,"'...")) - args.out <- list(cm = Matrix::t(cms[[cm]])) %>% append(args.std) - - if (method == "doubletdetection") { - tmp.out <- do.call("doubletdetection_py", args.out) - } else { - tmp.out <- do.call("scrublet_py", args.out) - } - - tmp.out %<>% - setNames(c("labels","scores","output")) - }) %>% - setNames(cms %>% names()) - - df <- tmp %>% - names() %>% - lapply(\(name) { - tmp[[name]] %>% - .[c("labels","scores")] %>% - bind_rows() %>% - as.data.frame() %>% - mutate(sample = name) %>% - `rownames<-`(cms[[name]] %>% colnames()) - }) %>% - bind_rows() - - df[is.na(df)] <- FALSE - - df %<>% mutate(labels = as.logical(labels)) - - output <- tmp %>% lapply(`[[`, 3) %>% - setNames(tmp %>% names()) - - res <- list(result = df, - output = output) - if (verbose) message("Detected ",sum(df$labels, na.rm = TRUE)," possible doublets out of ",nrow(df)," cells.") - self$doublets[[method]] <- res + if (!export) { + # Prep environment + if (verbose) message(paste0(Sys.time()," Loading prerequisites...")) + checkPackageInstalled("reticulate", cran = TRUE) + reticulate::use_condaenv(condaenv = env, conda = conda.path, required = TRUE) + if (!reticulate::py_module_available(method)) stop(paste0("'",method,"' is not installed in your current conda environment.")) + reticulate::source_python(paste(system.file(package="CRMetrics"), paste0(method,".py"), sep ="/")) + + if (verbose) message(paste0(Sys.time()," Identifying doublets using '",method,"'...")) + + # Calculate + tmp <- cms %>% + names() %>% + lapply(\(cm) { + if (verbose) message(paste0(Sys.time()," Running sample '",cm,"'...")) + args.out <- list(cm = Matrix::t(cms[[cm]])) %>% append(args.std) + + if (method == "doubletdetection") { + tmp.out <- do.call("doubletdetection_py", args.out) + } else { + tmp.out <- do.call("scrublet_py", args.out) + } + + tmp.out %<>% + setNames(c("labels","scores","output")) + }) %>% + setNames(cms %>% names()) + + df <- tmp %>% + names() %>% + lapply(\(name) { + tmp[[name]] %>% + .[c("labels","scores")] %>% + bind_rows() %>% + as.data.frame() %>% + mutate(sample = name) %>% + `rownames<-`(cms[[name]] %>% colnames()) + }) %>% + bind_rows() + + df[is.na(df)] <- FALSE + + df %<>% mutate(labels = as.logical(labels)) + + output <- tmp %>% lapply(`[[`, 3) %>% + setNames(tmp %>% names()) + + res <- list(result = df, + output = output) + if (verbose) message(paste0(Sys.time()," Detected ",sum(df$labels, na.rm = TRUE)," possible doublets out of ",nrow(df)," cells.")) + self$doublets[[method]] <- res + } else { + # Check for existing files + files <- setdiff(sample.names %>% sapply(paste0, ".mtx"), dir(data.path)) %>% + strsplit(".mtx",) %>% + sapply('[[', 1) + + diff <- length(sample.names) - length(files) + + # Save data + if (verbose) message(paste0(Sys.time()," Saving ",length(cms)," CMs")) + if (diff > 0) message("Existing save files already found, skipping ",diff," samples: ",paste(c("",setdiff(sample.names, files)), collapse = "\n")) + + for (i in files) { + cms[[i]] %>% + Matrix::t() %>% + Matrix::writeMM(paste0(data.path,i,".mtx")) + } + + if (verbose) message(paste0(Sys.time()," Done! Python script is saved as ",data.path,toupper(method),".py")) + # Create Python script + args.std %<>% + `names<-`(sapply(names(.), paste0, "X")) %>% + lapply(\(x) { + if (is.null(x)) return("None") + if (is.logical(x) && x) return("True") + if (is.logical(x) && !x) return("False") + if (is.character(x)) return(paste0('"',x,'"')) + return(x) + }) %>% + append(list(data.path = data.path, + method = method)) + + tmp <- readLines(paste(system.file(package="CRMetrics"), paste0(method,"_manual.py"), sep ="/")) + for (i in names(args.std)) { + tmp %<>% gsub(pattern = i, replace = args.std[i], x = .) + } + writeLines(tmp, con=paste0(data.path,toupper(method),".py")) + } }, #' @description Perform conos preprocessing. From 1cb62df8fb42380089edce71862e41e9a8ca4d7e Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 5 Jul 2023 11:51:21 +0200 Subject: [PATCH 23/35] Added py templates --- inst/doubletdetection_manual.py | 23 +++++++++++++++++++++++ inst/scrublet_manual.py | 21 +++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 inst/doubletdetection_manual.py create mode 100644 inst/scrublet_manual.py diff --git a/inst/doubletdetection_manual.py b/inst/doubletdetection_manual.py new file mode 100644 index 0000000..ba971bf --- /dev/null +++ b/inst/doubletdetection_manual.py @@ -0,0 +1,23 @@ +print("Importing libraries") + +# Setup +import doubletdetection +from scipy import io +import pandas as pd +import glob + +flist = glob.glob("data.path*.mtx") + +for x in flist: + print("Creating "+x+".method") + + # Load + cm = io.mmread(x) + + # Run + clf = doubletdetection.BoostClassifier(boost_rate = boost_rateX, clustering_algorithm = clustering_algorithmX, clustering_kwargs = clustering_kwargsX, n_components = n_componentsX, n_iters = n_itersX, n_jobs = n_jobsX, n_top_var_genes = n_top_var_genesX, normalizer = normalizerX, pseudocount = pseudocountX, random_state = random_stateX, replace = replaceX, standard_scaling = standard_scalingX) + labels = clf.fit(cm).predict(p_thresh = p_threshX, voter_thresh = voter_threshX) + scores = clf.doublet_score() + + # Save + pd.DataFrame(labels, scores).to_csv(x+".method") diff --git a/inst/scrublet_manual.py b/inst/scrublet_manual.py new file mode 100644 index 0000000..301af41 --- /dev/null +++ b/inst/scrublet_manual.py @@ -0,0 +1,21 @@ +print("Importing libraries") + +import scrublet +from scipy import io +import pandas as pd +import glob + +flist = glob.glob("data.path*.mtx") + +for x in flist: + print("Creating "+x+".method") + + # Load + cm = io.mmread(x) + + # Run + tmp = scrublet.Scrublet(cm, total_counts = total_countsX, sim_doublet_ratio = sim_doublet_ratioX, n_neighbors = n_neighborsX, expected_doublet_rate = expected_doublet_rateX, stdev_doublet_rate = stdev_doublet_rateX, random_state = random_stateX) + doublet_scores, predicted_doublets = tmp.scrub_doublets(synthetic_doublet_umi_subsampling = synthetic_doublet_umi_subsamplingX, use_approx_neighbors = use_approx_neighborsX, distance_metric = distance_metricX, get_doublet_neighbor_parents = get_doublet_neighbor_parentsX, min_counts = min_countsX, min_cells = min_cellsX, min_gene_variability_pctl = min_gene_variability_pctlX, log_transform = log_transformX, mean_center = mean_centerX, normalize_variance = normalize_varianceX, n_prin_comps = n_prin_compsX, svd_solver = svd_solverX) + + # Save + pd.DataFrame(predicted_doublets, doublet_scores).to_csv(x+".method") From 2bba67cefdba9abdb01d246d37bac5d11b488de8 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 5 Jul 2023 14:22:20 +0200 Subject: [PATCH 24/35] Added `addDoublets` function, changed `sample.names` argument name to `samples` across functions --- R/CRMetrics.R | 140 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 38 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 1dd94cc..8a98aed 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -63,7 +63,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param data.path character Path to directory with Cell Ranger count data, one directory per sample (default = NULL). #' @param metadata data.frame or character Path to metadata file (comma-separated) or name of metadata dataframe object. Metadata must contain a column named 'sample' containing sample names that must match folder names in 'data.path' (default = NULL) #' @param cms list List with count matrices (default = NULL) - #' @param sample.names character Sample names. Only relevant is cms is provided (default = NULL) + #' @param samples character Sample names. Only relevant is cms is provided (default = NULL) #' @param unique.names logical Create unique cell names. Only relevant if cms is provided (default = TRUE) #' @param sep.cells character Sample-cell separator. Only relevant if cms is provided and `unique.names=TRUE` (default = "!!") #' @param comp.group character A group present in the metadata to compare the metrics by, can be added with addComparison (default = NULL) @@ -81,7 +81,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, initialize = function(data.path = NULL, metadata = NULL, cms = NULL, - sample.names = NULL, + samples = NULL, unique.names = TRUE, sep.cells = "!!", comp.group = NULL, @@ -152,7 +152,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, # Add CMs if (!is.null(cms)) { self$addCms(cms = cms, - sample.names = sample.names, + samples = samples, unique.names = unique.names, sep = sep.cells, n.cores = self$n.cores) @@ -181,7 +181,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Run function #' crm$addDetailedMetrics() @@ -219,7 +219,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Add metadata #' crm$metadata$sex <- c("male","female") @@ -241,7 +241,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param pal character Plotting palette (default = self$pal) #' @return ggplot2 object #' @examples - #' sample.names <- c("sample1", "sample2") + #' samples <- c("sample1", "sample2") #' #' # Simulate data #' testdata.cms <- lapply(seq_len(2), \(x) { @@ -251,10 +251,10 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' sapply(seq_len(1e3), \(x) paste0("cell",x))) #' return(out) #' }) - #' names(testdata.cms) <- sample.names + #' names(testdata.cms) <- samples #' #' # Create metadata - #' metadata <- data.frame(sample = sample.names, + #' metadata <- data.frame(sample = samples, #' sex = c("male","female"), #' condition = c("a","b")) #' @@ -324,7 +324,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Add summary metrics #' crm$addSummaryFromCms() @@ -478,7 +478,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Add detailed metrics #' crm$addDetailedMetrics() @@ -577,7 +577,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Create embedding #' crm$doPreprocessing() @@ -677,7 +677,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Create embedding #' crm$doPreprocessing() @@ -784,7 +784,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Create embedding #' crm$doPreprocessing() @@ -872,7 +872,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Detect doublet cells. #' @param method character Which method to use, either `scrublet` or `doubletdetection` (default="scrublet"). #' @param cms list List containing the count matrices (default=self$cms). - #' @param sample.names character Vector of sample names. If NULL, sample.names are extracted from cms (default = self$metadata$sample) + #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample) #' @param env character Environment to run python in (default="r-reticulate"). #' @param conda.path character Path to conda environment (default=system("whereis conda")). #' @param n.cores integer Number of cores to use (default = self$n.cores) @@ -893,7 +893,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' #' # Detect doublets @@ -902,7 +902,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } detectDoublets = function(method = c("scrublet","doubletdetection"), cms = self$cms, - sample.names = self$metadata$sample, + samples = self$metadata$sample, env = "r-reticulate", conda.path = system("whereis conda"), n.cores = self$n.cores, @@ -919,6 +919,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, cms %<>% lapply(as, "CsparseMatrix") if (!all(sapply(cms, inherits, "Matrix"))) stop("Could not convert automatically.") } + if (length(data.path) > 1) data.path <- data.path[1] if (export && is.null(data.path)) stop("When 'export = TRUE', 'data.path' must be provided.") # Prepare arguments @@ -1027,15 +1028,15 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, self$doublets[[method]] <- res } else { # Check for existing files - files <- setdiff(sample.names %>% sapply(paste0, ".mtx"), dir(data.path)) %>% + files <- setdiff(samples %>% sapply(paste0, ".mtx"), dir(data.path)) %>% strsplit(".mtx",) %>% sapply('[[', 1) - diff <- length(sample.names) - length(files) + diff <- length(samples) - length(files) # Save data if (verbose) message(paste0(Sys.time()," Saving ",length(cms)," CMs")) - if (diff > 0) message("Existing save files already found, skipping ",diff," samples: ",paste(c("",setdiff(sample.names, files)), collapse = "\n")) + if (diff > 0) message("Existing save files already found, skipping ",diff," samples: ",paste(c("",setdiff(samples, files)), collapse = "\n")) for (i in files) { cms[[i]] %>% @@ -1090,7 +1091,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Perform preprocessing #' crm$doPreprocessing(preprocess = "pagoda2") @@ -1167,7 +1168,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Create embedding #' crm$doPreprocessing() @@ -1231,7 +1232,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Create embedding #' crm$doPreprocessing() @@ -1367,7 +1368,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Select metrics #' crm$selectMetrics() @@ -1408,7 +1409,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Create embedding #' crm$doPreprocessing() @@ -1570,7 +1571,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Create embedding #' crm$doPreprocessing() @@ -1609,7 +1610,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Create embedding #' crm$doPreprocessing() @@ -1812,7 +1813,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Get summary #' crm$addSummaryFromCms() @@ -1843,7 +1844,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize - #' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) + #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Add summary #' crm$addSummaryFromCms() @@ -1862,7 +1863,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @description Add a list of count matrices to the CRMetrics object. #' @param cms list List of (sparse) count matrices (default = NULL) #' @param data.path character Path to cellranger count data (default = self$data.path). - #' @param sample.names character Vector of sample names. If NULL, sample.names are extracted from cms (default = self$metadata$sample) + #' @param samples character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample) #' @param cellbender logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE) #' @param raw logical Add raw count matrices from Cell Ranger output. Cannot be combined with `cellbender=TRUE` (default = FALSE) #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE) @@ -1888,7 +1889,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' } addCms = function(cms = NULL, data.path = self$data.path, - sample.names = self$metadata$sample, + samples = self$metadata$sample, cellbender = FALSE, raw = FALSE, symbol = TRUE, @@ -1922,18 +1923,18 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, cms %<>% .[sample.cells > 0] } - if (is.null(sample.names)) sample.names <- names(cms) - if (is.null(sample.names)) stop("Either 'cms' must be named or 'sample.names' cannot be NULL") - if (length(sample.names) != length(cms)) stop("Length of 'sample.names' does not match length of 'cms'.") + if (is.null(samples)) samples <- names(cms) + if (is.null(samples)) stop("Either 'cms' must be named or 'samples' cannot be NULL") + if (length(samples) != length(cms)) stop("Length of 'samples' does not match length of 'cms'.") ## Create unique names - if (unique.names) cms %<>% createUniqueCellNames(sample.names, sep) + if (unique.names) cms %<>% createUniqueCellNames(samples, sep) } else { # Add from data.path argument if (cellbender) { - cms <- read10xH5(data.path = data.path, sample.names = sample.names, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) } else { - cms <- read10x(data.path = data.path, sample.names = sample.names, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + cms <- read10x(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) } } @@ -1941,7 +1942,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (!is.null(self$metadata)) { warning("Overwriting metadata") - self$metadata <- data.frame(sample = sample.names) + self$metadata <- data.frame(sample = samples) } if (!is.null(self$detailed.metrics)) warning("Consider updating detailed metrics by setting $detailed.metrics <- NULL and running $addDetailedMetrics(). ") @@ -2158,7 +2159,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' }) #' #' # Initialize -#' crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +#' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Add summary #' crm$addSummaryFromCms() @@ -2360,5 +2361,68 @@ plotCbCells = function(data.path = self$data.path, guides(fill = "none") + labs(x = "", y = "") + facet_wrap(~variable, scales = "free_y", nrow = 2, ncol = 2) +}, + +addDoublets = function(method = c("doubletdetection","scrublet"), + data.path = self$data.path, + samples = self$metadata$sample, + cms = self$cms, + verbose = self$verbose) { + # Check + if (length(data.path) > 1) data.path <- data.path[1] + + unk <- setdiff(method, c("doubletdetection","scrublet")) + + if (length(unk) > 0) stop(paste0("Method(s) not recognised: ",paste(unk, collapse = " "))) + + ex.res <- self$doublets + + if (!is.null(names(ex.res))) { + warning(paste0("Results for method(s) '",paste(intersect(names(ex.res), method), collapse = " "),"' found, skipping. Set $doublets$ <- NULL and rerun this function to load.")) + method <- setdiff(method, names(ex.res)) + } + + if (length(method) == 0) stop("No method left to load data for.") + + # Load data + files <- dir(data.path, full.names = TRUE) + + files.list <- method %>% + {`names<-`(lapply(., \(met) files[grepl(paste0(".",met), files, fixed = TRUE)]), .)} + + not.found <- names(files.list)[sapply(files.list, length) == 0] + if (length(not.found) > 0) { + warning("No results found for method(s): ",paste(not.found, collapse = " ")) + files.list %<>% .[setdiff(names(.), not.found)] + } + + if (length(files.list) == 0) stop("No method left to load data for.") + + res.list <- files.list %>% + names() %>% + lapply(\(met) { + len <- length(files.list[[met]]) + if (len > 0) { + if (verbose) message(paste0("Found ",len," results for ",met)) + tmp <- files.list[[met]] %>% + lapply(read.delim, sep = ",", header = TRUE) %>% + bind_rows() %>% + `names<-`(c("scores","labels")) + + tmp[is.na(tmp)] <- 0 + + tmp %<>% + mutate(labels = as.logical(labels)) %>% + `rownames<-`(cms %>% lapply(colnames) %>% unlist() %>% unname()) + + if (verbose) message(paste0("Detected ",sum(tmp$labels, na.rm = TRUE)," possible doublets out of ",nrow(tmp)," cells.")) + out <- list(results = tmp) + return(out) + } + }) %>% + `names<-`(method) + + if (!is.null(ex.res)) res.list <- append(ex.res, res.list) + return(invisible(self$doublets <- res.list)) } )) From a086cce3b4c1a92faa6a1056d96f563fe8ee1bef Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 5 Jul 2023 14:23:22 +0200 Subject: [PATCH 25/35] Updated README and NEWS --- NEWS.md | 8 +++++--- README.md | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index c19a0dc..9fecbea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,11 +4,13 @@ * Updated DESCRIPTION * Added checks for `detectDoublets` * Added possibility for multiple paths for "data.path" argument in `initialize` -* Changed depth calculation from Conos depth to raw depth (= colSums) -* Added plotting palette to object -* Fixed bug for `filterCms` where "species"" was not forwarded to `getMitoFraction` internally +* Changed depth calculation from Conos depth to raw depth, i.e., column sums +* Added plotting palette to object through argument `pal` +* Fixed bug for `filterCms` where `species` was not forwarded to `getMitoFraction` internally * Moved adding list of CMs to CRMetrics object from addDetailedMetrics() to addCms() since this is more logical * `detectDoublets` can now export Python script with argument `export = TRUE` +* Added `addDoublets` function to add doublet results generated with exported Python scripts +* Updated vignette # CRMetrics 0.2.3 diff --git a/README.md b/README.md index b4c9316..2cfe21a 100644 --- a/README.md +++ b/README.md @@ -9,26 +9,26 @@ CRMetrics ================ -14/10/2022 +05-07-2023 Cell Ranger output filtering and metrics visualisation # Installation -To install the latest version, use - ``` r -install.packages("devtools") -devtools::install_github("khodosevichlab/CRMetrics") +install.packages("remotes") +remotes::install_github("khodosevichlab/CRMetrics") # CRAN version +remotes::install_github("khodosevichlab/CRMetrics", ref = "dev") # developer version ``` # Initialization A CRMetrics object can be initialized in different ways using -`CRMetrics$new()`. The most important arguments are: +`CRMetrics$new()`. Either `data.path` or `cms` must be provided. The most important arguments are: - `data.path`: A path to a directory containing sample-wise - directories with outputs from `cellranger count`. Can also be `NULL` + directories with outputs from `cellranger count`. Can also be `NULL`. + Can also be a vector of multiple paths. - `cms`: A list with count matrices. Must be named with sample IDs. Can also be `NULL` - `metadata`: Can either be 1) a `data.frame`, or 2) a path to a table @@ -43,12 +43,12 @@ A CRMetrics object can be initialized in different ways using For usage, please see the [vignette](http://kkh.bric.ku.dk/rasmus/CRMetrics/walkthrough.html) -([code](https://github.com/khodosevichlab/CRMetrics/blob/main/inst/docs/walkthrough.Rmd). +/ [code](https://github.com/khodosevichlab/CRMetrics/blob/main/inst/docs/walkthrough.Rmd). # Python integrations -CRMetrics makes use of several Python packages, most of them through the -`reticulate` package in R. For these to work, we included an [example +CRMetrics makes use of several Python packages, some of them through the +`reticulate` package in R, please see the included [example workflow](https://github.com/khodosevichlab/CRMetrics/blob/main/inst/docs/walkthrough.md#using-python-modules) in the vignette. From 30a30a6781aaf0648995dcaee46f24aa3b3fab18 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Wed, 5 Jul 2023 14:31:33 +0200 Subject: [PATCH 26/35] Updated walkthrough --- inst/docs/walkthrough.Rmd | 397 +++++++++++++++++++------------------- 1 file changed, 202 insertions(+), 195 deletions(-) diff --git a/inst/docs/walkthrough.Rmd b/inst/docs/walkthrough.Rmd index e0c1f6f..43eee0e 100644 --- a/inst/docs/walkthrough.Rmd +++ b/inst/docs/walkthrough.Rmd @@ -1,52 +1,50 @@ --- title: "CRMetrics - Cell Ranger Filtering and Metrics Visualization" output: - rmarkdown::html_document: + html_document: toc: true toc_float: true -vignette: > - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{"CRMetrics - Cell Ranger Filtering and Metrics Visualization"} - %\usepackage[UTF-8]{inputenc} --- -# Preparations +# Prologue + +## Preparations We have selected a [publicly available dataset](https://www.ncbi.nlm.nih.gov/geo/) from GEO with accession number GSE179590 which can be downloaded [here](http://kkh.bric.ku.dk/fabienne/crmetrics_testdata.tar.gz). You can download the zipped data using wget or curl, e.g. `wget http://kkh.bric.ku.dk/fabienne/crmetrics_testdata.tar.gz`, and then unpack using `tar -xvf crmetrics_testdata.tar.gz` -# Using Python modules +## Using Python modules -We have included several Python modules in this package. If you work on a server using RStudio Server, there may be some preparational steps needed for getting the doublet detection to work. If you are on your own machine, it should be enough to install `reticulate` and the relevant Python module(s). +CRMetrics utilizes several Python packages. Of these, the packages for doublet detection, `scrublet` and `DoubletDetection`, can be run from R using `reticulate`. However, CRMetrics can provide a Python script to run the analyses directly in Python instead through the `export` parameter. -First, you should install `reticulate`: +In order to run the analyses in R, first you should install `reticulate`: ```{r, eval=FALSE} install.packages("reticulate") library(reticulate) ``` - -Then you are ready to create a conda environment. In this example, we're on a server and we load `miniconda` using modules. The `conda` parameter should point to wherever your conda binary is located (in terminal, try `whereis conda`) +Also, you need to install Conda. Then, you are ready to create a Conda virtual environment. In this example, we're on a server and we load `miniconda` using modules. You may need to include the `conda` parameter to point to wherever your Conda binary is located (in terminal, try `whereis conda`). In this example, we install a virtual environment for `scrublet`. ```{r, eval=FALSE} -conda_create("r-reticulate", +conda_create("scrublet", conda = "/opt/software/miniconda/4.12.0/condabin/conda", python_version = 3.8) ``` + There is a known problem with openBLAS which may be different between R and Python. If this is the case, you will receive the error `floating point exception` and R will crash when you try to run a Python script using `reticulate`. -In Python, the problem lies within numpy. numba requires numpy < 1.23, so force reinstall from scratch with no binaries in the `r-reticulate` conda environment from terminal +In Python, the problem lies within `numpy`. `numba` requires `numpy` < 1.23, so force reinstall from scratch with no binaries in the `scrublet` Conda environment from terminal `module load miniconda/4.12.0` -`conda activate r-reticulate` +`conda activate scrublet` `python -m pip install numpy==1.22.0 --force-reinstall --no-binary numpy` -Finally, restart your R session. +Then, follow the instructions to install [`scrublet`](https://github.com/swolock/scrublet). Finally, restart your R session. Please note, if at any point you receive an error that you can't change the current Python instance, please remove any Python-dependent object in your environment and restart your R session. -# Initializing a CRMetrics class +# Main use Load the library @@ -57,7 +55,7 @@ library(dplyr) ``` There are two ways to initialize a new object of class `CRMetrics`, either by providing `data.path` or `cms`. -`data.path` is the path to a directory containing sample-wise directories with the Cell Ranger count outputs. +`data.path` is the path to a directory containing sample-wise directories with the Cell Ranger count outputs. Optionally, it can be a vector of multiple paths. `cms` is a (named, optional) list of (sparse, optional) count matrices. Please note, if `data.path` is not provided, some functionality is lost, e.g. ambient RNA removal. @@ -67,7 +65,7 @@ Optionally, metadata can be provided, either as a file or as a data.frame. For a If `cms` is provided, it is recommended to add summary metrices afterwards: ```{r cms, eval=FALSE} -crm <- CRMetrics$new(cms = cms, n.cores = 1) +crm <- CRMetrics$new(cms = cms, n.cores = 10) crm$addSummaryFromCms() ``` @@ -79,12 +77,12 @@ Here, the folder with our test data is stored in `/data/ExtData/CRMetrics_testda crm <- CRMetrics$new(data.path = "/data/ExtData/CRMetrics_testdata/", metadata = "/data/ExtData/CRMetrics_testdata/metadata.csv", sep.meta = ",", - n.cores = 1, + n.cores = 10, verbose = FALSE) ``` ```{r load-obj, include=FALSE} -crm <- qs::qread("/data/ExtData/CRMetrics_testdata/crm.qs", nthreads = 2) +crm <- qs::qread("/data/ExtData/CRMetrics_testdata/crm.qs", nthreads = 10) ``` We can review our metadata @@ -93,150 +91,7 @@ We can review our metadata crm$metadata ``` -# Remove ambient RNA - -We have added functionality to remove ambient RNA from our samples. This approach should be used with caution since it induces changes to the UMI counts (NB: it does not overwrite the outputs from Cell Ranger). -We have included preparative steps for [CellBender](https://github.com/broadinstitute/CellBender/) as well as incorporated [SoupX](https://github.com/constantAmateur/SoupX) into CRMetrics. - -## CellBender - -### Installation - -To install, follow [these instructions](https://cellbender.readthedocs.io/en/latest/installation/index.html#manual-installation). It is highly recommended to run `CellBender` using GPU acceleration. -If you are more comfortable installing through `reticulate` in R, these lines should be run: - -```{r cellbender-install, eval=FALSE} -library(reticulate) -conda_create("cellbender", - conda = "/opt/software/miniconda/4.12.0/condabin/conda", - python_version = 3.7) -conda_install("cellbender", - conda = "/opt/software/miniconda/4.12.0/condabin/conda", - forge = FALSE, - channel = "anaconda", - packages = "pytables") -conda_install("cellbender", - conda = "/opt/software/miniconda/4.12.0/condabin/conda", - packages = c("pytorch","torchvision","torchaudio"), - channel = "pytorch") -``` - -Then, clone the `CellBender` repository as instructed in the manual. Here, we clone to `/apps/` through `cd /apps/; git clone https://github.com/broadinstitute/CellBender.git` and then `CellBender` can be installed: - -```{r cellbender-install-2, eval=FALSE} -conda_install("cellbender", - conda = "/opt/software/miniconda/4.12.0/condabin/conda", - pip = TRUE, - pip_options = "-e", - packages = "/apps/CellBender/") -``` - -### Analysis - -For `CellBender`, we need to specify expected number of cells and total droplets included (please see the [manual](https://cellbender.readthedocs.io/en/latest/usage/index.html) for additional information). As hinted in the manual, the number of total droplets included could be expected number of cells multiplied by 3 (which we set as default). First, we plot these measures: - -```{r cbprep, cache=TRUE} -crm$prepareCellbender(shrinkage = 100, # Subsamples every 100th datapoint for faster plotting - show.expected.cells = TRUE, - show.total.droplets = TRUE) -``` - -We could change the total droplets included for any sample. Let us first look at the vector. - -```{r totdrops} -droplets <- crm$getTotalDroplets() -droplets -``` - -Then we change the total droplets for SRR15054424. - -```{r change-topdrops} -droplets["SRR15054424"] <- 2e4 -``` - -We plot this change. - -```{r cbprep-totdrops, cache=TRUE} -crm$prepareCellbender(shrinkage = 100, - show.expected.cells = TRUE, - show.total.droplets = TRUE, - total.droplets = droplets) -``` - -We could also multiply expected cells by 2.5 for all samples and save this in our CRMetrics object. - -```{r cb-totdrops-multiply, eval=FALSE} -crm$cellbender$total.droplets <- crm$getTotalDroplets(multiplier = 2.5) -``` - -Finally, we save a script for running `CellBender` on all our samples. Here, we use our modified total droplet vector. If `total.droplets` is not specified, it will use the stored vector at `crm$cellbender$total.droplets`. - -```{r cb-save, eval=FALSE} -crm$saveCellbenderScript(file = "/apps/cellbender_script.sh", - fpr = 0.01, - epochs = 150, - use.gpu = TRUE, - total.droplets = droplets) -``` - -We can run this script in the terminal. Here, we load our miniconda module: `module load miniconda\4.12.0`, we activate the environment: `conda activate cellbender` and we run the bash script: `sh /apps/cellbender_script.sh` - -### Plotting - -We can plot the changes in cell numbers following CellBender estimations. - -```{r cb-plotcells, cache=TRUE} -crm$plotCbCells() -``` - -We can plot the CellBender training results. - -```{r cb-plottraining, fig.width = 12, fig.height = 10, cache=TRUE} -crm$plotCbTraining() -``` - -We can plot the cell probabilities. - -```{r cb-plotcellprobs, fig.width = 12, fig.height = 10, cache=TRUE} -crm$plotCbCellProbs() -``` - -We can plot the identified ambient genes per sample. - -```{r cb-plotambexp, fig.width = 12, fig.height = 10, cache=TRUE} -crm$plotCbAmbExp(cutoff = 0.005) -``` - -Lastly, we can plot the proportion of samples expressing ambient genes. We see that *MALAT1* is identified as an ambient gene in all samples [which is expected](https://kb.10xgenomics.com/hc/en-us/articles/360004729092-Why-do-I-see-high-levels-of-Malat1-in-my-gene-expression-data-). - -```{r cb-plotambgenes, cache=TRUE} -crm$plotCbAmbGenes(cutoff = 0.005) -``` - -## SoupX - -The implementation of SoupX uses the automated estimation of contamination and correction. Please note, SoupX depends on Seurat for import of data. -Since this calculation takes several minutes, it is not run in this vignette. - -```{r runsoupx, eval=FALSE} -crm$runSoupX() -``` - -Then, we can plot the corrections. - -```{r plotsoupx, cache=TRUE} -crm$plotSoupX() -``` - -In the end, we add the SoupX adjusted CMs to our object. - -```{r add-adj-cms} -crm$addCms(cms = crm$soupx$cms.adj, - unique.names = TRUE, - sep = "!!") -``` - -# Plot summary statistics +## Plot summary statistics We can investigate which metrics are available and choose the ones we would like to plot @@ -244,7 +99,7 @@ We can investigate which metrics are available and choose the ones we would like crm$selectMetrics() ``` -## Samples per condition +### Samples per condition First, we can plot the number of samples per condition. Here, we investigate how the distribution of the sex differs between the type of MS of the samples where RRMS is short for relapsing remitting MS, and SPMS is short for secondary progressive MS. @@ -255,7 +110,7 @@ crm$plotSummaryMetrics(comp.group = "sex", plot.geom = "bar") ``` -## Metrics per sample +### Metrics per sample In one plot, we can illustrate selected metric summary stats. If no comparison group is set, it defaults to `sample`. @@ -266,7 +121,7 @@ crm$plotSummaryMetrics(comp.group = "sample", plot.geom = "bar") ``` -## Metrics per condition +### Metrics per condition We can do the same, but set the comparison group to `type`. This will add statistics to the plots. Additionally, we can add a second comparison group for coloring. @@ -278,7 +133,7 @@ crm$plotSummaryMetrics(comp.group = "type", second.comp.group = "sex") ``` -## Metrics per condition with >2 levels +### Metrics per condition with >2 levels For the sake of the example, we change the `RIN` values to `low` (RIN<6), `medium` (67). This will provide us with three comparisons groups to exemplify how to use automated statistics for such situations. @@ -296,7 +151,7 @@ crm$plotSummaryMetrics(comp.group = "RIN", secondary.testing = TRUE) ``` -## Metrics per condition with numeric covariate +### Metrics per condition with numeric covariate We can choose a numeric comparison group, in this case `age`, which will add regression lines to the plots. @@ -320,7 +175,7 @@ crm$plotSummaryMetrics(comp.group = "age", We see that there is no significant effect of the numeric vector on neither of the MS types. -# Add detailed metrics +## Add detailed metrics We can read in count matrices to assess detailed metrics. Otherwise, if count matrices have already been added earlier, this step prepares data for plotting UMI and gene counts. @@ -338,7 +193,7 @@ crm$plotDetailedMetrics(comp.group = "type", plot.geom = "violin") ``` -# Embed cells using Conos +## Embed cells using Conos In order to plot our cells in our embedding, we need to perform preprocessing of the raw count matrices. To do this, either `pagoda2` (default) or `Seurat` can be used. @@ -359,7 +214,7 @@ crm$plotEmbedding() ``` -# Cell depth +## Cell depth We can plot cell depth, both in the embedding or as histograms per sample. @@ -396,29 +251,197 @@ crm$plotEmbedding(depth = TRUE, depth.cutoff = depth_cutoff_vec) ``` -# Doublet detection +## Mitochondrial fraction + +We can also investigate the mitochondrial fraction in our cells + +```{r plot-emb-mf, cache=TRUE} +crm$plotEmbedding(mito.frac = TRUE, + mito.cutoff = 0.05, + species = "human") +``` + +Similar as for depth, we can plot the distribution of the mitochondrial fraction per sample and include sample-wise cutoffs (not shown here). + +```{r plot-mf, cache=TRUE} +crm$plotMitoFraction(cutoff = 0.05) +``` + +# Remove ambient RNA -For doublet detection, we included the possibility to do so using the Python modules `scrublet` and `DoubletDetection`. First, we should install these packages: +We have added functionality to remove ambient RNA from our samples. This approach should be used with caution since it induces changes to the UMI counts (NB: it does not overwrite the outputs from Cell Ranger). +We have included preparative steps for [CellBender](https://github.com/broadinstitute/CellBender/) as well as incorporated [SoupX](https://github.com/constantAmateur/SoupX) into CRMetrics. + +## CellBender + +### Installation -```{r install-dd-software, eval=FALSE} +To install, follow [these instructions](https://cellbender.readthedocs.io/en/latest/installation/index.html#manual-installation). It is highly recommended to run `CellBender` using GPU acceleration. +If you are more comfortable installing through `reticulate` in R, these lines should be run: + +```{r cellbender-install, eval=FALSE} library(reticulate) -conda_install(envname = "r-reticulate", +conda_create("cellbender", + conda = "/opt/software/miniconda/4.12.0/condabin/conda", + python_version = 3.7) +conda_install("cellbender", + conda = "/opt/software/miniconda/4.12.0/condabin/conda", + forge = FALSE, + channel = "anaconda", + packages = "pytables") +conda_install("cellbender", + conda = "/opt/software/miniconda/4.12.0/condabin/conda", + packages = c("pytorch","torchvision","torchaudio"), + channel = "pytorch") +``` + +Then, clone the `CellBender` repository as instructed in the manual. Here, we clone to `/apps/` through `cd /apps/; git clone https://github.com/broadinstitute/CellBender.git` and then `CellBender` can be installed: + +```{r cellbender-install-2, eval=FALSE} +conda_install("cellbender", conda = "/opt/software/miniconda/4.12.0/condabin/conda", pip = TRUE, - packages = c("scrublet","doubletdetection")) + pip_options = "-e", + packages = "/apps/CellBender/") +``` + +### Analysis + +For `CellBender`, we need to specify expected number of cells and total droplets included (please see the [manual](https://cellbender.readthedocs.io/en/latest/usage/index.html) for additional information). As hinted in the manual, the number of total droplets included could be expected number of cells multiplied by 3 (which we set as default). First, we plot these measures: + +```{r cbprep, cache=TRUE} +crm$prepareCellbender(shrinkage = 100, # Subsamples every 100th datapoint for faster plotting + show.expected.cells = TRUE, + show.total.droplets = TRUE) +``` + +We could change the total droplets included for any sample. Let us first look at the vector. + +```{r totdrops} +droplets <- crm$getTotalDroplets() +droplets +``` + +Then we change the total droplets for SRR15054424. + +```{r change-topdrops} +droplets["SRR15054424"] <- 2e4 +``` + +We plot this change. + +```{r cbprep-totdrops, cache=TRUE} +crm$prepareCellbender(shrinkage = 100, + show.expected.cells = TRUE, + show.total.droplets = TRUE, + total.droplets = droplets) +``` + +We could also multiply expected cells by 2.5 for all samples and save this in our CRMetrics object. + +```{r cb-totdrops-multiply, eval=FALSE} +crm$cellbender$total.droplets <- crm$getTotalDroplets(multiplier = 2.5) +``` + +Finally, we save a script for running `CellBender` on all our samples. To only select specific samples, use the `samples` parameter. Here, we use our modified total droplet vector. If `total.droplets` is not specified, it will use the stored vector at `crm$cellbender$total.droplets`. + +```{r cb-save, eval=FALSE} +crm$saveCellbenderScript(file = "/apps/cellbender_script.sh", + fpr = 0.01, + epochs = 150, + use.gpu = TRUE, + total.droplets = droplets) +``` + +We can run this script in the terminal. Here, we activate the environment: `conda activate cellbender` and we run the bash script: `sh /apps/cellbender_script.sh` + +### Plotting + +We can plot the changes in cell numbers following CellBender estimations. + +```{r cb-plotcells, cache=TRUE} +crm$plotCbCells() +``` + +We can plot the CellBender training results. + +```{r cb-plottraining, fig.width = 12, fig.height = 10, cache=TRUE} +crm$plotCbTraining() +``` + +We can plot the cell probabilities. + +```{r cb-plotcellprobs, fig.width = 12, fig.height = 10, cache=TRUE} +crm$plotCbCellProbs() +``` + +We can plot the identified ambient genes per sample. + +```{r cb-plotambexp, fig.width = 12, fig.height = 10, cache=TRUE} +crm$plotCbAmbExp(cutoff = 0.005) +``` + +Lastly, we can plot the proportion of samples expressing ambient genes. We see that *MALAT1* is identified as an ambient gene in all samples [which is expected](https://kb.10xgenomics.com/hc/en-us/articles/360004729092-Why-do-I-see-high-levels-of-Malat1-in-my-gene-expression-data-). + +```{r cb-plotambgenes, cache=TRUE} +crm$plotCbAmbGenes(cutoff = 0.005) +``` + +Then, we can add the filtered CMs to our object. Additionally, it is recommended to generate new summary metrics from the adjusted CMs which can then be plotted. + +```{r} +crm$cms <- NULL +crm$addCms(cellbender = T) +crm$addSummaryFromCms() +crm$addDetailedMetrics() +``` + +## SoupX + +The implementation of SoupX uses the automated estimation of contamination and correction. Please note, SoupX depends on Seurat for import of data. +Since this calculation takes several minutes, it is not run in this vignette. + +```{r runsoupx, eval=FALSE} +crm$runSoupX() +``` + +Then, we can plot the corrections. + +```{r plotsoupx, cache=TRUE} +crm$plotSoupX() ``` +Then, we can add the SoupX adjusted CMs to our object. Additionally, it is recommended to generate new summary metrics from the adjusted CMs which can then be plotted. + +```{r add-adj-cms} +crm$addCms(cms = crm$soupx$cms.adj, + unique.names = TRUE, + sep = "!!") +crm$addSummaryFromCms() +crm$addDetailedMetrics() +``` + +# Doublet detection + +For doublet detection, we included the possibility to do so using the Python modules `scrublet` and `DoubletDetection`. Here, we use virtual environments where we installed each method. + `scrublet` is the default method, which is fast. `DoubletDetection` is significantly slower, but performs better according to [this](https://www.sciencedirect.com/science/article/pii/S2405471220304592) review. Here, we show how to run `scrublet` and `DoubletDetection` to compare in the next section. Since this takes some time, the results have been precalculated and are not run in this vignette. ```{r run-dd, eval=FALSE} -crm$detectDoublets(env = "r-reticulate", +crm$detectDoublets(env = "scrublet", conda.path = "/opt/software/miniconda/4.12.0/condabin/conda", method = "scrublet") -crm$detectDoublets(env = "r-reticulate", +crm$detectDoublets(env = "doubletdetection", conda.path = "/opt/software/miniconda/4.12.0/condabin/conda", method = "doubletdetection") ``` +It is also possible to generate a Python script to run each method from the terminal. To do this, set `export = TRUE` and run `python .py` inside the virtual environment to generate data. Then, load data with: + +```{r} +crm$addDoublets() +``` + We can plot the estimated doublets in the embedding. ```{r plot-scrublet-embedding, cache=TRUE} @@ -496,22 +519,6 @@ crm$con$plotGraph(groups = plot.vec, scale_color_manual(values = c("grey80","red","blue","black")) ``` -# Mitochondrial fraction - -We can also investigate the mitochondrial fraction in our cells - -```{r plot-emb-mf, cache=TRUE} -crm$plotEmbedding(mito.frac = TRUE, - mito.cutoff = 0.05, - species = "human") -``` - -Similar as for depth, we can plot the distribution of the mitochondrial fraction per sample and include sample-wise cutoffs (not shown here). - -```{r plot-mf, cache=TRUE} -crm$plotMitoFraction(cutoff = 0.05) -``` - # Plot filtered cells We can plot all the cells to be filtered in our embedding From 707a191f24ce97ddef8c675db18c53f3e46ea780 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 6 Jul 2023 11:24:22 +0200 Subject: [PATCH 27/35] Minor bugfixes to `initialize`, `addDetailedMetrics`, `addCms` --- R/CRMetrics.R | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 8a98aed..8156508 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -130,6 +130,18 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, full.names = FALSE) %>% .[pathsToList(data.path, .) %>% sapply(\(path) file.exists(paste0(path[2],"/",path[1],"/outs")))] %>% {data.frame(sample = .)} + } else { + if (is.null(names(cms))) { + if (is.null(samples)) { + stop("Either `samples` must be provided, or `cms` must be named.") + } else { + sample.out <- samples + } + } else { + sample.out <- names(cms) + } + self$metadata <- data.frame(sample = sample.out) %>% + arrange(sample) } } else { if (inherits(metadata, "data.frame")) { @@ -155,7 +167,9 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, samples = samples, unique.names = unique.names, sep = sep.cells, - n.cores = self$n.cores) + n.cores = self$n.cores, + add.metadata = FALSE, + verbose = verbose) } checkCompMeta(comp.group, self$metadata) @@ -190,7 +204,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, n.cores = self$n.cores, verbose = self$verbose) { # Checks - if (is.null(self$detailed.metrics)) stop("Detailed metrics already present. To overwrite, set $detailed.metrics = NULL and rerun this function") + if (!is.null(self$detailed.metrics)) stop("Detailed metrics already present. To overwrite, set $detailed.metrics = NULL and rerun this function") size.check <- cms %>% sapply(dim) %>% @@ -1895,6 +1909,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, symbol = TRUE, unique.names = TRUE, sep = "!!", + add.metadata = TRUE, n.cores = self$n.cores, verbose = self$verbose) { # Check @@ -1934,14 +1949,17 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (cellbender) { cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) } else { - cms <- read10x(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + cms <- read10x(data.path = data.path, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) } } self$cms <- cms - if (!is.null(self$metadata)) { - warning("Overwriting metadata") + if (add.metadata) { + if (!is.null(self$metadata)) { + warning("Overwriting metadata") + } + self$metadata <- data.frame(sample = samples) } From 4cd4d3c87f556ceeffd6367e17bc59589e8e6c07 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 6 Jul 2023 11:24:56 +0200 Subject: [PATCH 28/35] Changed "sample.names" to "samples" in inner functions --- R/inner_functions.R | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/R/inner_functions.R b/R/inner_functions.R index 54b749b..86a7421 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -39,7 +39,7 @@ checkCompMeta <- function(comp.group, #' @title Load 10x count matrices #' @description Load gene expression count data #' @param data.path Path to cellranger count data. -#' @param sample.names Vector of sample names (default = NULL) +#' @param samples Vector of sample names (default = NULL) #' @param raw logical Add raw count matrices (default = FALSE) #' @param symbol The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE). #' @param sep Separator for cell names (default = "!!"). @@ -57,7 +57,7 @@ checkCompMeta <- function(comp.group, #' } #' @export read10x <- function(data.path, - sample.names = NULL, + sampes = NULL, raw = FALSE, symbol = TRUE, sep = "!!", @@ -65,10 +65,10 @@ read10x <- function(data.path, n.cores = 1, verbose = TRUE) { checkPackageInstalled("data.table", cran = TRUE) - if (is.null(sample.names)) sample.names <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) + if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) full.path <- data.path %>% - pathsToList(sample.names) %>% + pathsToList(samples) %>% sapply(\(sample) { if (raw) pat <- glob2rx("raw_*_bc_matri*") else pat <- glob2rx("filtered_*_bc_matri*") dir(paste(sample[2],sample[1],"outs", sep = "/"), pattern = pat, full.names = TRUE) %>% @@ -102,9 +102,9 @@ read10x <- function(data.path, colnames(mat) <- barcodes %>% pull(V1) return(mat) }, n.cores = n.cores) %>% - setNames(sample.names) + setNames(samples) - if (unique.names) tmp %<>% createUniqueCellNames(sample.names, sep) + if (unique.names) tmp %<>% createUniqueCellNames(samples, sep) if (verbose) message(paste0(Sys.time()," Done!")) @@ -375,7 +375,7 @@ labelsFilter <- function(filter.data) { #' @title Read 10x HDF5 files #' @param data.path character -#' @param sample.names character vector, select specific samples for processing (default = NULL) +#' @param samples character vector, select specific samples for processing (default = NULL) #' @param type name of H5 file to search for, "raw" and "filtered" are Cell Ranger count outputs, "cellbender" is output from CellBender after running script from saveCellbenderScript #' @param symbol logical Use gene SYMBOLs (TRUE) or ENSEMBL IDs (FALSE) (default = TRUE) #' @param sep character Separator for creating unique cell names from sample IDs and cell IDs (default = "!!") @@ -389,7 +389,7 @@ labelsFilter <- function(filter.data) { #' } #' @export read10xH5 <- function(data.path, - sample.names = NULL, + samples = NULL, type = c("raw","filtered","cellbender","cellbender_filtered"), symbol = TRUE, sep = "!!", @@ -398,9 +398,9 @@ read10xH5 <- function(data.path, unique.names = FALSE) { checkPackageInstalled("rhdf5", bioc = TRUE) - if (is.null(sample.names)) sample.names <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) + if (is.null(samples)) samples <- list.dirs(data.path, full.names = FALSE, recursive = FALSE) - full.path <- getH5Paths(data.path, sample.names, type) + full.path <- getH5Paths(data.path, samples, type) if (verbose) message(paste0(Sys.time()," Loading ",length(full.path)," count matrices using ", if (n.cores < length(full.path)) n.cores else length(full.path)," cores")) out <- full.path %>% @@ -435,9 +435,9 @@ read10xH5 <- function(data.path, return(tmp) }, n.cores = n.cores) %>% - setNames(sample.names) + setNames(samples) - if (unique.names) out %<>% createUniqueCellNames(sample.names, sep) + if (unique.names) out %<>% createUniqueCellNames(samples, sep) if (verbose) message(paste0(Sys.time()," Done!")) @@ -447,20 +447,20 @@ read10xH5 <- function(data.path, #' @title Create unique cell names #' @description Create unique cell names from sample IDs and cell IDs #' @param cms list List of count matrices, should be named (optional) -#' @param sample.names character Optional, list of sample names +#' @param samples character Optional, list of sample names #' @param sep character Separator between sample IDs and cell IDs (default = "!!") #' @keywords internal createUniqueCellNames <- function(cms, - sample.names, + samples, sep = "!!") { - names(cms) <- sample.names + names(cms) <- samples - sample.names %>% + samples %>% lapply(\(sample) { cms[[sample]] %>% `colnames<-`(., paste0(sample,sep,colnames(.))) }) %>% - setNames(sample.names) + setNames(samples) } #' @title Get H5 file paths From 70f068edb5a551947a3cb85f5e58627ed7327858 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 6 Jul 2023 11:25:20 +0200 Subject: [PATCH 29/35] Updated documentation --- R/CRMetrics.R | 19 +++- man/CRMetrics.Rd | 169 +++++++++++++++++++++++++---------- man/createUniqueCellNames.Rd | 4 +- man/read10x.Rd | 6 +- man/read10xH5.Rd | 4 +- 5 files changed, 144 insertions(+), 58 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 8156508..264d39c 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -236,7 +236,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) #' #' # Add metadata - #' crm$metadata$sex <- c("male","female") + #' crm$metadata <- data.frame(sex = c("male","female")) #' #' # Add comparison group #' crm$addComparison(comp.group = "sex") @@ -1883,6 +1883,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, #' @param symbol character The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE) #' @param unique.names logical Make cell names unique based on `sep` parameter (default = TRUE) #' @param sep character Separator used to create unique cell names (default = "!!") + #' @param add.metadata boolean Add metadata from cms or not (default = TRUE) #' @param n.cores integer Number of cores to use (default = self$n.cores) #' @param verbose boolean Print progress (default = self$verbose) #' @return Add list of (sparse) count matrices to R6 class object @@ -2381,7 +2382,21 @@ plotCbCells = function(data.path = self$data.path, facet_wrap(~variable, scales = "free_y", nrow = 2, ncol = 2) }, -addDoublets = function(method = c("doubletdetection","scrublet"), +#' @description Add doublet results created from exported Python script +#' @param method character Which method to use, either `scrublet` or `doubletdetection` (default is both). +#' @param data.path character Path to Cell Ranger outputs (default = self$data.path) +#' @param samples character Sample names to include (default = self$metadata$sample) +#' @param cms list List containing the count matrices (default = self$cms). +#' @param verbose boolean Print progress (default = self$verbose) +#' @return List of doublet results +#' @examples +#' \dontrun{ +#' crm <- CRMetrics$new(data.path = "/path/to/count/data/") +#' crm$detectDoublets(export = TRUE) +#' ## Run Python script +#' crm$addDoublets() +#' } +addDoublets = function(method = c("scrublet","doubletdetection"), data.path = self$data.path, samples = self$metadata$sample, cms = self$cms, diff --git a/man/CRMetrics.Rd b/man/CRMetrics.Rd index ca8da86..9b060a7 100644 --- a/man/CRMetrics.Rd +++ b/man/CRMetrics.Rd @@ -30,7 +30,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Run function crm$addDetailedMetrics() @@ -49,10 +49,10 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add metadata -crm$metadata$sex <- c("male","female") +crm$metadata <- data.frame(sex = c("male","female")) # Add comparison group crm$addComparison(comp.group = "sex") @@ -61,7 +61,7 @@ crm$addComparison(comp.group = "sex") ## Method `CRMetrics$plotSamples` ## ------------------------------------------------ -sample.names <- c("sample1", "sample2") +samples <- c("sample1", "sample2") # Simulate data testdata.cms <- lapply(seq_len(2), \(x) { @@ -71,10 +71,10 @@ dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)), sapply(seq_len(1e3), \(x) paste0("cell",x))) return(out) }) -names(testdata.cms) <- sample.names +names(testdata.cms) <- samples # Create metadata -metadata <- data.frame(sample = sample.names, +metadata <- data.frame(sample = samples, sex = c("male","female"), condition = c("a","b")) @@ -99,7 +99,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add summary metrics crm$addSummaryFromCms() @@ -122,7 +122,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add detailed metrics crm$addDetailedMetrics() @@ -148,7 +148,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -180,7 +180,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -213,7 +213,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -244,7 +244,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Detect doublets @@ -268,7 +268,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Perform preprocessing crm$doPreprocessing(preprocess = "pagoda2") @@ -294,7 +294,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -324,7 +324,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -355,7 +355,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Select metrics crm$selectMetrics() @@ -378,7 +378,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -412,7 +412,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -445,7 +445,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -494,7 +494,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Get summary crm$addSummaryFromCms() @@ -516,7 +516,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add summary crm$addSummaryFromCms() @@ -605,7 +605,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add summary crm$addSummaryFromCms() @@ -640,6 +640,17 @@ crm$saveCellbenderScript() ## Run CellBender script crm$plotCbCells() } + +## ------------------------------------------------ +## Method `CRMetrics$addDoublets` +## ------------------------------------------------ + +\dontrun{ +crm <- CRMetrics$new(data.path = "/path/to/count/data/") +crm$detectDoublets(export = TRUE) +## Run Python script +crm$addDoublets() +} } \section{Public fields}{ \if{html}{\out{
}} @@ -704,6 +715,7 @@ Initialize a CRMetrics object} \item \href{#method-CRMetrics-runSoupX}{\code{CRMetrics$runSoupX()}} \item \href{#method-CRMetrics-plotSoupX}{\code{CRMetrics$plotSoupX()}} \item \href{#method-CRMetrics-plotCbCells}{\code{CRMetrics$plotCbCells()}} +\item \href{#method-CRMetrics-addDoublets}{\code{CRMetrics$addDoublets()}} \item \href{#method-CRMetrics-clone}{\code{CRMetrics$clone()}} } } @@ -717,7 +729,7 @@ To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also rec data.path = NULL, metadata = NULL, cms = NULL, - sample.names = NULL, + samples = NULL, unique.names = TRUE, sep.cells = "!!", comp.group = NULL, @@ -739,7 +751,7 @@ To initialize new object, 'data.path' or 'cms' is needed. 'metadata' is also rec \item{\code{cms}}{list List with count matrices (default = NULL)} -\item{\code{sample.names}}{character Sample names. Only relevant is cms is provided (default = NULL)} +\item{\code{samples}}{character Sample names. Only relevant is cms is provided (default = NULL)} \item{\code{unique.names}}{logical Create unique cell names. Only relevant if cms is provided (default = TRUE)} @@ -817,7 +829,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Run function crm$addDetailedMetrics() @@ -860,10 +872,10 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add metadata -crm$metadata$sex <- c("male","female") +crm$metadata <- data.frame(sex = c("male","female")) # Add comparison group crm$addComparison(comp.group = "sex") @@ -911,7 +923,7 @@ ggplot2 object } \subsection{Examples}{ \if{html}{\out{
}} -\preformatted{sample.names <- c("sample1", "sample2") +\preformatted{samples <- c("sample1", "sample2") # Simulate data testdata.cms <- lapply(seq_len(2), \(x) { @@ -921,10 +933,10 @@ dimnames(out) <- list(sapply(seq_len(2e3), \(x) paste0("gene",x)), sapply(seq_len(1e3), \(x) paste0("cell",x))) return(out) }) -names(testdata.cms) <- sample.names +names(testdata.cms) <- samples # Create metadata -metadata <- data.frame(sample = sample.names, +metadata <- data.frame(sample = samples, sex = c("male","female"), condition = c("a","b")) @@ -1012,7 +1024,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add summary metrics crm$addSummaryFromCms() @@ -1079,7 +1091,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add detailed metrics crm$addDetailedMetrics() @@ -1159,7 +1171,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -1227,7 +1239,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -1299,7 +1311,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -1329,11 +1341,14 @@ Detect doublet cells. \if{html}{\out{
}}\preformatted{CRMetrics$detectDoublets( method = c("scrublet", "doubletdetection"), cms = self$cms, + samples = self$metadata$sample, env = "r-reticulate", conda.path = system("whereis conda"), n.cores = self$n.cores, verbose = self$verbose, - args = list() + args = list(), + export = FALSE, + data.path = self$data.path )}\if{html}{\out{
}} } @@ -1344,6 +1359,8 @@ Detect doublet cells. \item{\code{cms}}{list List containing the count matrices (default=self$cms).} +\item{\code{samples}}{character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample)} + \item{\code{env}}{character Environment to run python in (default="r-reticulate").} \item{\code{conda.path}}{character Path to conda environment (default=system("whereis conda")).} @@ -1353,6 +1370,10 @@ Detect doublet cells. \item{\code{verbose}}{logical Print messages or not (default = self$verbose)} \item{\code{args}}{list A list with additional arguments for either \code{DoubletDetection} or \code{scrublet}. Please check the respective manuals.} + +\item{\code{export}}{boolean Export CMs in order to detect doublets outside R (default = FALSE)} + +\item{\code{data.path}}{character Path to write data, only relevant if \code{export = TRUE}. Last character must be \code{/} (default = self$data.path)} } \if{html}{\out{
}} } @@ -1372,7 +1393,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Detect doublets @@ -1447,7 +1468,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Perform preprocessing crm$doPreprocessing(preprocess = "pagoda2") @@ -1512,7 +1533,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -1587,7 +1608,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -1640,7 +1661,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Select metrics crm$selectMetrics() @@ -1717,7 +1738,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -1773,7 +1794,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -1830,7 +1851,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Create embedding crm$doPreprocessing() @@ -2008,7 +2029,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Get summary crm$addSummaryFromCms() @@ -2054,7 +2075,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add summary crm$addSummaryFromCms() @@ -2076,12 +2097,13 @@ Add a list of count matrices to the CRMetrics object. \if{html}{\out{
}}\preformatted{CRMetrics$addCms( cms = NULL, data.path = self$data.path, - sample.names = self$metadata$sample, + samples = self$metadata$sample, cellbender = FALSE, raw = FALSE, symbol = TRUE, unique.names = TRUE, sep = "!!", + add.metadata = TRUE, n.cores = self$n.cores, verbose = self$verbose )}\if{html}{\out{
}} @@ -2094,7 +2116,7 @@ Add a list of count matrices to the CRMetrics object. \item{\code{data.path}}{character Path to cellranger count data (default = self$data.path).} -\item{\code{sample.names}}{character Vector of sample names. If NULL, sample.names are extracted from cms (default = self$metadata$sample)} +\item{\code{samples}}{character Vector of sample names. If NULL, samples are extracted from cms (default = self$metadata$sample)} \item{\code{cellbender}}{logical Add CellBender filtered count matrices in HDF5 format. Requires that "cellbender" is in the names of the files (default = FALSE)} @@ -2106,6 +2128,8 @@ Add a list of count matrices to the CRMetrics object. \item{\code{sep}}{character Separator used to create unique cell names (default = "!!")} +\item{\code{add.metadata}}{boolean Add metadata from cms or not (default = TRUE)} + \item{\code{n.cores}}{integer Number of cores to use (default = self$n.cores)} \item{\code{verbose}}{boolean Print progress (default = self$verbose)} @@ -2350,7 +2374,7 @@ return(out) }) # Initialize -crm <- CRMetrics$new(cms = testdata.cms, sample.names = c("sample1", "sample2"), n.cores = 1) +crm <- CRMetrics$new(cms = testdata.cms, samples = c("sample1", "sample2"), n.cores = 1) # Add summary crm$addSummaryFromCms() @@ -2481,6 +2505,53 @@ crm$plotCbCells() } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-CRMetrics-addDoublets}{}}} +\subsection{Method \code{addDoublets()}}{ +Add doublet results created from exported Python script +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{CRMetrics$addDoublets( + method = c("scrublet", "doubletdetection"), + data.path = self$data.path, + samples = self$metadata$sample, + cms = self$cms, + verbose = self$verbose +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{method}}{character Which method to use, either \code{scrublet} or \code{doubletdetection} (default is both).} + +\item{\code{data.path}}{character Path to Cell Ranger outputs (default = self$data.path)} + +\item{\code{samples}}{character Sample names to include (default = self$metadata$sample)} + +\item{\code{cms}}{list List containing the count matrices (default = self$cms).} + +\item{\code{verbose}}{boolean Print progress (default = self$verbose)} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +List of doublet results +} +\subsection{Examples}{ +\if{html}{\out{
}} +\preformatted{\dontrun{ +crm <- CRMetrics$new(data.path = "/path/to/count/data/") +crm$detectDoublets(export = TRUE) +## Run Python script +crm$addDoublets() +} +} +\if{html}{\out{
}} + +} + } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/createUniqueCellNames.Rd b/man/createUniqueCellNames.Rd index 5045e4a..baf85b5 100644 --- a/man/createUniqueCellNames.Rd +++ b/man/createUniqueCellNames.Rd @@ -4,12 +4,12 @@ \alias{createUniqueCellNames} \title{Create unique cell names} \usage{ -createUniqueCellNames(cms, sample.names, sep = "!!") +createUniqueCellNames(cms, samples, sep = "!!") } \arguments{ \item{cms}{list List of count matrices, should be named (optional)} -\item{sample.names}{character Optional, list of sample names} +\item{samples}{character Optional, list of sample names} \item{sep}{character Separator between sample IDs and cell IDs (default = "!!")} } diff --git a/man/read10x.Rd b/man/read10x.Rd index 68ae4e7..d046e25 100644 --- a/man/read10x.Rd +++ b/man/read10x.Rd @@ -6,7 +6,7 @@ \usage{ read10x( data.path, - sample.names = NULL, + sampes = NULL, raw = FALSE, symbol = TRUE, sep = "!!", @@ -18,8 +18,6 @@ read10x( \arguments{ \item{data.path}{Path to cellranger count data.} -\item{sample.names}{Vector of sample names (default = NULL)} - \item{raw}{logical Add raw count matrices (default = FALSE)} \item{symbol}{The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE).} @@ -29,6 +27,8 @@ read10x( \item{n.cores}{Number of cores for the calculations (default = 1).} \item{verbose}{Print messages (default = TRUE).} + +\item{samples}{Vector of sample names (default = NULL)} } \value{ data frame diff --git a/man/read10xH5.Rd b/man/read10xH5.Rd index 16193a7..81f0b07 100644 --- a/man/read10xH5.Rd +++ b/man/read10xH5.Rd @@ -6,7 +6,7 @@ \usage{ read10xH5( data.path, - sample.names = NULL, + samples = NULL, type = c("raw", "filtered", "cellbender", "cellbender_filtered"), symbol = TRUE, sep = "!!", @@ -18,7 +18,7 @@ read10xH5( \arguments{ \item{data.path}{character} -\item{sample.names}{character vector, select specific samples for processing (default = NULL)} +\item{samples}{character vector, select specific samples for processing (default = NULL)} \item{type}{name of H5 file to search for, "raw" and "filtered" are Cell Ranger count outputs, "cellbender" is output from CellBender after running script from saveCellbenderScript} From a00680f6647fc19fe5f49b8691e12cb3d12e27e0 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 6 Jul 2023 11:34:39 +0200 Subject: [PATCH 30/35] Updated maintainer --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index bf5a0a5..b17c9a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,5 +41,5 @@ Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 URL: https://github.com/khodosevichlab/CRMetrics BugReports: https://github.com/khodosevichlab/CRMetrics/issues -Maintainer: Rasmus Rydbirk +Maintainer: Rasmus Rydbirk Config/testthat/edition: 3 From 4636e92149707f9b30e93bc9fd87467ed51760f7 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 6 Jul 2023 13:51:42 +0200 Subject: [PATCH 31/35] Minor bugs --- R/CRMetrics.R | 20 +++++++++++++------- man/CRMetrics.Rd | 5 ++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index 264d39c..a5d8929 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -175,7 +175,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, checkCompMeta(comp.group, self$metadata) # Add summary metrics - if (is.null(cms)) self$summary.metrics <- addSummaryMetrics(data.path, self$metadata, verbose) + if (is.null(cms)) self$summary.metrics <- addSummaryMetrics(data.path = data.path, metadata = self$metadata, n.cores = self$n.cores, verbose = verbose) }, #' @description Function to read in detailed metrics. This is not done upon initialization for speed. @@ -1958,15 +1958,15 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (add.metadata) { if (!is.null(self$metadata)) { - warning("Overwriting metadata") + warning("Overwriting metadata\n") } self$metadata <- data.frame(sample = samples) } - if (!is.null(self$detailed.metrics)) warning("Consider updating detailed metrics by setting $detailed.metrics <- NULL and running $addDetailedMetrics(). ") - if (!is.null(self$con)) warning("Consider updating embedding by setting $cms.preprocessed <- NULL and $con <- NULL, and running $doPreprocessing() and $createEmbedding(). ") - if (!is.null(self$doublets)) warning("Consider updating doublet scores by setting $doublets <- NULL and running $detectDoublets(). ") + if (!is.null(self$detailed.metrics)) warning("Consider updating detailed metrics by setting $detailed.metrics <- NULL and running $addDetailedMetrics().\n") + if (!is.null(self$con)) warning("Consider updating embedding by setting $cms.preprocessed <- NULL and $con <- NULL, and running $doPreprocessing() and $createEmbedding().\n") + if (!is.null(self$doublets)) warning("Consider updating doublet scores by setting $doublets <- NULL and running $detectDoublets().\n") }, #' @description Plot the results from the CellBender estimations @@ -2342,6 +2342,7 @@ plotSoupX = function(plot.df = self$soupx$plot.df) { #' @description Plot CellBender cell estimations against the estimated cell numbers from Cell Ranger #' @param data.path character Path to Cell Ranger outputs (default = self$data.path) #' @param samples character Sample names to include (default = self$metadata$sample) +#' @param pal character Plotting palette (default = self$pal) #' @return A ggplot2 object #' @examples #' \dontrun{ @@ -2352,7 +2353,8 @@ plotSoupX = function(plot.df = self$soupx$plot.df) { #' crm$plotCbCells() #' } plotCbCells = function(data.path = self$data.path, - samples = self$metadata$sample) { + samples = self$metadata$sample, + pal = self$pal) { checkDataPath(data.path) checkPackageInstalled("rhdf5", bioc = TRUE) paths <- getH5Paths(data.path, samples, "cellbender_filtered") @@ -2373,13 +2375,17 @@ plotCbCells = function(data.path = self$data.path, "Expected cells", "Relative difference to exp. cells"))) - ggplot(df, aes(sample, value, fill = sample)) + + g <- ggplot(df, aes(sample, value, fill = sample)) + geom_bar(stat = "identity") + self$theme + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + guides(fill = "none") + labs(x = "", y = "") + facet_wrap(~variable, scales = "free_y", nrow = 2, ncol = 2) + + if (!is.null(pal)) g <- g + scale_fill_manual(values = pal) + + return(g) }, #' @description Add doublet results created from exported Python script diff --git a/man/CRMetrics.Rd b/man/CRMetrics.Rd index 9b060a7..6945894 100644 --- a/man/CRMetrics.Rd +++ b/man/CRMetrics.Rd @@ -2475,7 +2475,8 @@ Plot CellBender cell estimations against the estimated cell numbers from Cell Ra \subsection{Usage}{ \if{html}{\out{
}}\preformatted{CRMetrics$plotCbCells( data.path = self$data.path, - samples = self$metadata$sample + samples = self$metadata$sample, + pal = self$pal )}\if{html}{\out{
}} } @@ -2485,6 +2486,8 @@ Plot CellBender cell estimations against the estimated cell numbers from Cell Ra \item{\code{data.path}}{character Path to Cell Ranger outputs (default = self$data.path)} \item{\code{samples}}{character Sample names to include (default = self$metadata$sample)} + +\item{\code{pal}}{character Plotting palette (default = self$pal)} } \if{html}{\out{
}} } From 404f03fded335884248bdcd6dee1e5ca40d54bd3 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 6 Jul 2023 16:24:40 +0200 Subject: [PATCH 32/35] Minor bugs --- R/CRMetrics.R | 14 +++++++------- R/inner_functions.R | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/CRMetrics.R b/R/CRMetrics.R index a5d8929..cf961e7 100644 --- a/R/CRMetrics.R +++ b/R/CRMetrics.R @@ -1,4 +1,4 @@ -#' @import dplyr magrittr ggplot2 ggrepel +#' @import dplyr magrittr ggplot2 ggrepel ggpmisc #' @importFrom R6 R6Class #' @importFrom sccore plapply checkPackageInstalled #' @importFrom Matrix t @@ -8,7 +8,6 @@ #' @importFrom tidyr pivot_longer replace_na #' @importFrom ggbeeswarm geom_quasirandom #' @importFrom tibble add_column -#' @importFrom ggpmisc stat_poly_eq #' @importFrom scales comma #' @importFrom sparseMatrixStats colSums2 rowSums2 #' @importFrom utils globalVariables @@ -205,6 +204,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, verbose = self$verbose) { # Checks if (!is.null(self$detailed.metrics)) stop("Detailed metrics already present. To overwrite, set $detailed.metrics = NULL and rerun this function") + if (is.null(cms)) stop("No CMs found, run $addCms first.") size.check <- cms %>% sapply(dim) %>% @@ -411,12 +411,12 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (is.numeric(metadata[[comp.group]])) { if (!group.reg.lines) { g <- g + - ggpmisc::stat_poly_eq(color = "black", aes(label = paste(after_stat(rr.label), after_stat(p.value.label), sep = "*\", \"*"))) + - ggpmisc::stat_poly_line(color = "black", se = se) + stat_poly_eq(color = "black", aes(label = paste(after_stat(rr.label), after_stat(p.value.label), sep = "*\", \"*"))) + + stat_poly_line(color = "black", se = se) } else { g <- g + - ggpmisc::stat_poly_eq(aes(label = paste(after_stat(rr.label), after_stat(p.value.label), sep = "*\", \"*"), col = !!sym(second.comp.group))) + - ggpmisc::stat_poly_line(aes(col = !!sym(second.comp.group)), se = se) + stat_poly_eq(aes(label = paste(after_stat(rr.label), after_stat(p.value.label), sep = "*\", \"*"), col = !!sym(second.comp.group))) + + stat_poly_line(aes(col = !!sym(second.comp.group)), se = se) } } @@ -1950,7 +1950,7 @@ CRMetrics <- R6Class("CRMetrics", lock_objects = FALSE, if (cellbender) { cms <- read10xH5(data.path = data.path, samples = samples, symbol = symbol, type = "cellbender_filtered", sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) } else { - cms <- read10x(data.path = data.path, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) + cms <- read10x(data.path = data.path, samples = samples, raw = raw, symbol = symbol, sep = sep, n.cores = n.cores, verbose = verbose, unique.names = unique.names) } } diff --git a/R/inner_functions.R b/R/inner_functions.R index 86a7421..3745f5c 100644 --- a/R/inner_functions.R +++ b/R/inner_functions.R @@ -57,7 +57,7 @@ checkCompMeta <- function(comp.group, #' } #' @export read10x <- function(data.path, - sampes = NULL, + samples = NULL, raw = FALSE, symbol = TRUE, sep = "!!", From 09765184328ccb1632d8d52236fa64b76ce427fe Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Thu, 6 Jul 2023 16:25:12 +0200 Subject: [PATCH 33/35] Updated documentation --- NAMESPACE | 2 +- inst/docs/walkthrough.Rmd | 254 ++++++++++++++++++++++++-------------- man/read10x.Rd | 6 +- 3 files changed, 167 insertions(+), 95 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d2e5822..9bc5de4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(read10x) export(read10xH5) import(dplyr) import(ggplot2) +import(ggpmisc) import(ggrepel) import(magrittr) importFrom(Matrix,sparseMatrix) @@ -12,7 +13,6 @@ importFrom(Matrix,t) importFrom(R6,R6Class) importFrom(cowplot,plot_grid) importFrom(ggbeeswarm,geom_quasirandom) -importFrom(ggpmisc,stat_poly_eq) importFrom(ggpubr,stat_compare_means) importFrom(methods,as) importFrom(scales,comma) diff --git a/inst/docs/walkthrough.Rmd b/inst/docs/walkthrough.Rmd index 43eee0e..b9406b5 100644 --- a/inst/docs/walkthrough.Rmd +++ b/inst/docs/walkthrough.Rmd @@ -6,17 +6,18 @@ output: toc_float: true --- -# Prologue +# Introduction ## Preparations -We have selected a [publicly available dataset](https://www.ncbi.nlm.nih.gov/geo/) from GEO with accession number GSE179590 which can be downloaded [here](http://kkh.bric.ku.dk/fabienne/crmetrics_testdata.tar.gz). You can download the zipped data using wget or curl, e.g. -`wget http://kkh.bric.ku.dk/fabienne/crmetrics_testdata.tar.gz`, and then unpack using -`tar -xvf crmetrics_testdata.tar.gz` +We have selected a [publicly available dataset](https://www.ncbi.nlm.nih.gov/geo/) from GEO with accession number GSE179590 which can be downloaded [here](http://kkh.bric.ku.dk/fabienne/crmetrics_testdata.tar.gz). +You can download the zipped data using wget or curl, e.g. `wget http://kkh.bric.ku.dk/fabienne/crmetrics_testdata.tar.gz`, and then unpack using `tar -xvf crmetrics_testdata.tar.gz` ## Using Python modules -CRMetrics utilizes several Python packages. Of these, the packages for doublet detection, `scrublet` and `DoubletDetection`, can be run from R using `reticulate`. However, CRMetrics can provide a Python script to run the analyses directly in Python instead through the `export` parameter. +CRMetrics utilizes several Python packages. +Of these, the packages for doublet detection, `scrublet` and `DoubletDetection`, can be run from R using `reticulate`. +However, CRMetrics can provide a Python script to run the analyses directly in Python instead through the `export` parameter. In order to run the analyses in R, first you should install `reticulate`: @@ -25,7 +26,11 @@ install.packages("reticulate") library(reticulate) ``` -Also, you need to install Conda. Then, you are ready to create a Conda virtual environment. In this example, we're on a server and we load `miniconda` using modules. You may need to include the `conda` parameter to point to wherever your Conda binary is located (in terminal, try `whereis conda`). In this example, we install a virtual environment for `scrublet`. +Also, you need to install Conda. +Then, you are ready to create a Conda virtual environment. +In this example, we're on a server and we load `miniconda` using modules. +You may need to include the `conda` parameter to point to wherever your Conda binary is located (in terminal, try `whereis conda`). +In this example, we install a virtual environment for `scrublet`. ```{r, eval=FALSE} conda_create("scrublet", @@ -33,14 +38,13 @@ conda_create("scrublet", python_version = 3.8) ``` +There is a known problem with openBLAS which may be different between R and Python. +If this is the case, you will receive the error `floating point exception` and R will crash when you try to run a Python script using `reticulate`. +In Python, the problem lies within `numpy`. +`numba` requires `numpy` \< 1.23, so force reinstall from scratch with no binaries in the `scrublet` Conda environment from terminal `module load miniconda/4.12.0` `conda activate scrublet` `python -m pip install numpy==1.22.0 --force-reinstall --no-binary numpy` -There is a known problem with openBLAS which may be different between R and Python. If this is the case, you will receive the error `floating point exception` and R will crash when you try to run a Python script using `reticulate`. -In Python, the problem lies within `numpy`. `numba` requires `numpy` < 1.23, so force reinstall from scratch with no binaries in the `scrublet` Conda environment from terminal -`module load miniconda/4.12.0` -`conda activate scrublet` -`python -m pip install numpy==1.22.0 --force-reinstall --no-binary numpy` - -Then, follow the instructions to install [`scrublet`](https://github.com/swolock/scrublet). Finally, restart your R session. +Then, follow the instructions to install [`scrublet`](https://github.com/swolock/scrublet). +Finally, restart your R session. Please note, if at any point you receive an error that you can't change the current Python instance, please remove any Python-dependent object in your environment and restart your R session. @@ -54,22 +58,32 @@ library(magrittr) library(dplyr) ``` -There are two ways to initialize a new object of class `CRMetrics`, either by providing `data.path` or `cms`. -`data.path` is the path to a directory containing sample-wise directories with the Cell Ranger count outputs. Optionally, it can be a vector of multiple paths. -`cms` is a (named, optional) list of (sparse, optional) count matrices. +There are two ways to initialize a new object of class `CRMetrics`, either by providing `data.path` or `cms`. +`data.path` is the path to a directory containing sample-wise directories with the Cell Ranger count outputs. +Optionally, it can be a vector of multiple paths. +`cms` is a (named, optional) list of (sparse, optional) count matrices. -Please note, if `data.path` is not provided, some functionality is lost, e.g. ambient RNA removal. +Please note, if `data.path` is not provided, some functionality is lost, e.g. ambient RNA removal. -Optionally, metadata can be provided, either as a file or as a data.frame. For a file, the separator can be set with the parameter `sep.meta` (most often, either `,` (comma) or `\t` (tab) is used). In either format, the columns must be named and one column must be named `sample` and contain sample names. In combination with `data.path`, the sample names must match the sample directory names. Unmatched directory names are dropped. +Optionally, metadata can be provided, either as a file or as a data.frame. +For a file, the separator can be set with the parameter `sep.meta` (most often, either `,` (comma) or `\t` (tab) is used). +In either format, the columns must be named and one column must be named `sample` and contain sample names. +In combination with `data.path`, the sample names must match the sample directory names. +Unmatched directory names are dropped. -If `cms` is provided, it is recommended to add summary metrices afterwards: +If `cms` is provided, it is recommended to add summary metrics afterwards: ```{r cms, eval=FALSE} -crm <- CRMetrics$new(cms = cms, n.cores = 10) +crm <- CRMetrics$new(cms = cms, + n.cores = 10, + pal = grDevices::rainbow(8), + theme = ggplot2::theme_bw()) crm$addSummaryFromCms() ``` -Please note, some functionality depends on aggregation of sample and cell IDs using the `sep.cell` parameter. The default is `!!` which creates cell names in the format of `!!`. If another separator is used, this needs to be provided in relevant function calls. +Please note, some functionality depends on aggregation of sample and cell IDs using the `sep.cell` parameter. +The default is `!!` which creates cell names in the format of `!!`. +If another separator is used, this needs to be provided in relevant function calls. Here, the folder with our test data is stored in `/data/ExtData/CRMetrics_testdata/` and we provide metadata in a comma-separated file. @@ -78,10 +92,12 @@ crm <- CRMetrics$new(data.path = "/data/ExtData/CRMetrics_testdata/", metadata = "/data/ExtData/CRMetrics_testdata/metadata.csv", sep.meta = ",", n.cores = 10, - verbose = FALSE) + verbose = FALSE, + pal = grDevices::rainbow(8), + theme = ggplot2::theme_bw()) ``` -```{r load-obj, include=FALSE} +```{r load-obj, echo = F} crm <- qs::qread("/data/ExtData/CRMetrics_testdata/crm.qs", nthreads = 10) ``` @@ -101,43 +117,51 @@ crm$selectMetrics() ### Samples per condition -First, we can plot the number of samples per condition. Here, we investigate how the distribution of the sex differs between the type of MS of the samples where RRMS is short for relapsing remitting MS, and SPMS is short for secondary progressive MS. +First, we can plot the number of samples per condition. +Here, we investigate how the distribution of the sex differs between the type of MS of the samples where RRMS is short for relapsing remitting MS, and SPMS is short for secondary progressive MS. -```{r plot-summary-metrics, warning=FALSE, cache=TRUE} +```{r plot-summary-metrics, eval=FALSE} crm$plotSummaryMetrics(comp.group = "sex", metrics = "samples per group", second.comp.group = "type", plot.geom = "bar") ``` +![](img/fig1.png) ### Metrics per sample -In one plot, we can illustrate selected metric summary stats. If no comparison group is set, it defaults to `sample`. +In one plot, we can illustrate selected metric summary stats. +If no comparison group is set, it defaults to `sample`. -```{r plot-sum-metrics-selected, fig.width=12, fig.height=12, warning=FALSE, cache=TRUE} +```{r plot-sum-metrics-selected, fig.width=12, fig.height=12, eval=FALSE} metrics.to.plot <- crm$selectMetrics(ids = c(1:4,6,18,19)) crm$plotSummaryMetrics(comp.group = "sample", metrics = metrics.to.plot, plot.geom = "bar") ``` +![](img/fig2.png) ### Metrics per condition -We can do the same, but set the comparison group to `type`. This will add statistics to the plots. Additionally, we can add a second comparison group for coloring. +We can do the same, but set the comparison group to `type`. +This will add statistics to the plots. +Additionally, we can add a second comparison group for coloring. -```{r plot-sum-metrics-comp, fig.width=12, fig.height=10, warning=FALSE, cache=TRUE} +```{r plot-sum-metrics-comp, fig.width=12, fig.height=10, eval=FALSE} crm$plotSummaryMetrics(comp.group = "type", metrics = metrics.to.plot, plot.geom = "point", stat.test = "non-parametric", second.comp.group = "sex") ``` +![](img/fig3.png) ### Metrics per condition with >2 levels -For the sake of the example, we change the `RIN` values to `low` (RIN<6), `medium` (67). This will provide us with three comparisons groups to exemplify how to use automated statistics for such situations. +For the sake of the example, we change the `RIN` values to `low` (RIN\<6), `medium` (6\7). +This will provide us with three comparisons groups to exemplify how to use automated statistics for such situations. -```{r plot-sum-metrics-multilevel, fig.width=12, fig.height=10, cache=TRUE} +```{r plot-sum-metrics-multilevel, fig.width=12, fig.height=10, eval=FALSE} crm$metadata$RIN %<>% as.character() %>% {c("medium","high","high","medium","high","high","low","high")} %>% @@ -150,52 +174,60 @@ crm$plotSummaryMetrics(comp.group = "RIN", second.comp.group = "type", secondary.testing = TRUE) ``` +![](img/fig4.png) ### Metrics per condition with numeric covariate We can choose a numeric comparison group, in this case `age`, which will add regression lines to the plots. -```{r plot-sum-metrics-num-cov, fig.height=10, fig.width=12, cache=TRUE} +```{r plot-sum-metrics-num-cov, fig.height=10, fig.width=12, eval=FALSE} crm$plotSummaryMetrics(comp.group = "age", metrics = metrics.to.plot, plot.geom = "point", second.comp.group = "type", se = FALSE) ``` +![](img/fig5.png) If the numeric vector has a significant effect on one of the metrics we can investigate it closer by performing regression analyses for both conditions of `type`. -```{r plot-sum-metrics-sec-comp, cache=TRUE} +```{r plot-sum-metrics-sec-comp, eval=FALSE} crm$plotSummaryMetrics(comp.group = "age", - metrics = "Mean Reads per Cell", + metrics = "mean reads per cell", plot.geom = "point", second.comp.group = "type", group.reg.lines = TRUE) ``` +![](img/fig6.png) We see that there is no significant effect of the numeric vector on neither of the MS types. ## Add detailed metrics -We can read in count matrices to assess detailed metrics. Otherwise, if count matrices have already been added earlier, this step prepares data for plotting UMI and gene counts. +We can read in count matrices to assess detailed metrics. +Otherwise, if count matrices have already been added earlier, this step prepares data for plotting UMI and gene counts. ```{r add-detailed-metrics, eval=FALSE} +crm$addCms(add.metadata = FALSE) crm$addDetailedMetrics() ``` -We plot the detailed metrics. The horizontal lines indicates the median values for all samples. +We plot the detailed metrics. +The horizontal lines indicates the median values for all samples. -```{r plot-detailed-metrics, cache=TRUE} +```{r plot-detailed-metrics, eval=FALSE} metrics.to.plot <- crm$detailed.metrics$metric %>% unique() crm$plotDetailedMetrics(comp.group = "type", metrics = metrics.to.plot, plot.geom = "violin") ``` +![](img/fig7.png) ## Embed cells using Conos -In order to plot our cells in our embedding, we need to perform preprocessing of the raw count matrices. To do this, either `pagoda2` (default) or `Seurat` can be used. +In order to plot our cells in our embedding, we need to perform preprocessing of the raw count matrices. +To do this, either `pagoda2` (default) or `Seurat` can be used. ```{r preprocessing, eval=FALSE} crm$doPreprocessing() @@ -209,27 +241,29 @@ crm$createEmbedding() We can now plot our cells. -```{r plot-embedding, cache=TRUE} +```{r plot-embedding, eval=FALSE} crm$plotEmbedding() ``` - +![](img/fig8.png) ## Cell depth We can plot cell depth, both in the embedding or as histograms per sample. -```{r plot-embedding-depth, cache=TRUE} +```{r plot-embedding-depth, eval=FALSE} crm$plotEmbedding(depth = TRUE, depth.cutoff = 1e3) ``` +![](img/fig9.png) -```{r plot-depth, cache=TRUE, warning=FALSE} +```{r plot-depth, eval=FALSE} crm$plotDepth() ``` +![](img/fig10.png) We can see that the depth distribution varies between samples. We can create a cutoff vector specifying the depth cutoff per sample. It should be a named vector containing sample names. -```{r plot-sw-depth, cache=TRUE} +```{r plot-sw-depth} depth_cutoff_vec <- c(2.5e3, 2e3, 1e3, 1.5e3, 1.5e3, 2e3, 2.5e3, 2e3) %>% setNames(crm$detailed.metrics$sample %>% unique() %>% @@ -240,43 +274,49 @@ depth_cutoff_vec Let's plot the updated cutoffs: -```{r plot-upd-depth, warning=FALSE, cache=TRUE} +```{r plot-upd-depth, eval=FALSE} crm$plotDepth(cutoff = depth_cutoff_vec) ``` +![](img/fig11.png) Also, we can do this in the embedding: -```{r plot-embedding-depth-upd, cache=TRUE} +```{r plot-embedding-depth-upd, eval=FALSE} crm$plotEmbedding(depth = TRUE, depth.cutoff = depth_cutoff_vec) ``` +![](img/fig12.png) ## Mitochondrial fraction We can also investigate the mitochondrial fraction in our cells -```{r plot-emb-mf, cache=TRUE} +```{r plot-emb-mf, eval=FALSE} crm$plotEmbedding(mito.frac = TRUE, mito.cutoff = 0.05, species = "human") ``` +![](img/fig13.png) Similar as for depth, we can plot the distribution of the mitochondrial fraction per sample and include sample-wise cutoffs (not shown here). -```{r plot-mf, cache=TRUE} +```{r plot-mf, eval=FALSE} crm$plotMitoFraction(cutoff = 0.05) ``` +![](img/fig14.png) # Remove ambient RNA -We have added functionality to remove ambient RNA from our samples. This approach should be used with caution since it induces changes to the UMI counts (NB: it does not overwrite the outputs from Cell Ranger). -We have included preparative steps for [CellBender](https://github.com/broadinstitute/CellBender/) as well as incorporated [SoupX](https://github.com/constantAmateur/SoupX) into CRMetrics. +We have added functionality to remove ambient RNA from our samples. +This approach should be used with caution since it induces changes to the UMI counts (NB: it does not overwrite the outputs from Cell Ranger). +We have included preparative steps for [CellBender](https://github.com/broadinstitute/CellBender/) as well as incorporated [SoupX](https://github.com/constantAmateur/SoupX) into CRMetrics. ## CellBender ### Installation -To install, follow [these instructions](https://cellbender.readthedocs.io/en/latest/installation/index.html#manual-installation). It is highly recommended to run `CellBender` using GPU acceleration. +To install, follow [these instructions](https://cellbender.readthedocs.io/en/latest/installation/index.html#manual-installation). +It is highly recommended to run `CellBender` using GPU acceleration. If you are more comfortable installing through `reticulate` in R, these lines should be run: ```{r cellbender-install, eval=FALSE} @@ -295,7 +335,8 @@ conda_install("cellbender", channel = "pytorch") ``` -Then, clone the `CellBender` repository as instructed in the manual. Here, we clone to `/apps/` through `cd /apps/; git clone https://github.com/broadinstitute/CellBender.git` and then `CellBender` can be installed: +Then, clone the `CellBender` repository as instructed in the manual. +Here, we clone to `/apps/` through `cd /apps/; git clone https://github.com/broadinstitute/CellBender.git` and then `CellBender` can be installed: ```{r cellbender-install-2, eval=FALSE} conda_install("cellbender", @@ -307,15 +348,17 @@ conda_install("cellbender", ### Analysis -For `CellBender`, we need to specify expected number of cells and total droplets included (please see the [manual](https://cellbender.readthedocs.io/en/latest/usage/index.html) for additional information). As hinted in the manual, the number of total droplets included could be expected number of cells multiplied by 3 (which we set as default). First, we plot these measures: +For `CellBender`, we need to specify expected number of cells and total droplets included (please see the [manual](https://cellbender.readthedocs.io/en/latest/usage/index.html) for additional information). +As hinted in the manual, the number of total droplets included could be expected number of cells multiplied by 3 (which we set as default). +First, we plot these measures: -```{r cbprep, cache=TRUE} -crm$prepareCellbender(shrinkage = 100, # Subsamples every 100th datapoint for faster plotting - show.expected.cells = TRUE, - show.total.droplets = TRUE) +```{r cbprep, eval=FALSE} +crm$prepareCellbender() ``` +![](img/fig15.png) -We could change the total droplets included for any sample. Let us first look at the vector. +We could change the total droplets included for any sample. +Let us first look at the vector. ```{r totdrops} droplets <- crm$getTotalDroplets() @@ -330,12 +373,13 @@ droplets["SRR15054424"] <- 2e4 We plot this change. -```{r cbprep-totdrops, cache=TRUE} +```{r cbprep-totdrops, eval=FALSE} crm$prepareCellbender(shrinkage = 100, show.expected.cells = TRUE, show.total.droplets = TRUE, total.droplets = droplets) ``` +![](img/fig16.png) We could also multiply expected cells by 2.5 for all samples and save this in our CRMetrics object. @@ -359,47 +403,52 @@ We can run this script in the terminal. Here, we activate the environment: `cond We can plot the changes in cell numbers following CellBender estimations. -```{r cb-plotcells, cache=TRUE} +```{r cb-plotcells, eval=FALSE} crm$plotCbCells() ``` +![](img/fig17.png) We can plot the CellBender training results. -```{r cb-plottraining, fig.width = 12, fig.height = 10, cache=TRUE} +```{r cb-plottraining, fig.width = 12, fig.height = 10, eval=FALSE} crm$plotCbTraining() ``` +![](img/fig18.png) We can plot the cell probabilities. -```{r cb-plotcellprobs, fig.width = 12, fig.height = 10, cache=TRUE} +```{r cb-plotcellprobs, fig.width = 12, fig.height = 10, eval=FALSE} crm$plotCbCellProbs() ``` +![](img/fig19.png) We can plot the identified ambient genes per sample. -```{r cb-plotambexp, fig.width = 12, fig.height = 10, cache=TRUE} +```{r cb-plotambexp, fig.width = 12, fig.height = 10, eval=FALSE} crm$plotCbAmbExp(cutoff = 0.005) ``` +![](img/fig20.png) Lastly, we can plot the proportion of samples expressing ambient genes. We see that *MALAT1* is identified as an ambient gene in all samples [which is expected](https://kb.10xgenomics.com/hc/en-us/articles/360004729092-Why-do-I-see-high-levels-of-Malat1-in-my-gene-expression-data-). -```{r cb-plotambgenes, cache=TRUE} -crm$plotCbAmbGenes(cutoff = 0.005) +```{r cb-plotambgenes, eval=FALSE} +crm$plotCbAmbGenes(cutoff = 0.005, pal = grDevices::rainbow(17)) ``` +![](img/fig21.png) Then, we can add the filtered CMs to our object. Additionally, it is recommended to generate new summary metrics from the adjusted CMs which can then be plotted. -```{r} +```{r, eval = FALSE} crm$cms <- NULL crm$addCms(cellbender = T) crm$addSummaryFromCms() +crm$detailed.metrics <- NULL crm$addDetailedMetrics() ``` ## SoupX -The implementation of SoupX uses the automated estimation of contamination and correction. Please note, SoupX depends on Seurat for import of data. -Since this calculation takes several minutes, it is not run in this vignette. +The implementation of SoupX uses the automated estimation of contamination and correction. Please note, SoupX depends on Seurat for import of data. Since this calculation takes several minutes, it is not run in this vignette. ```{r runsoupx, eval=FALSE} crm$runSoupX() @@ -407,25 +456,33 @@ crm$runSoupX() Then, we can plot the corrections. -```{r plotsoupx, cache=TRUE} +```{r plotsoupx, eval=FALSE} crm$plotSoupX() ``` +![](img/fig22.png) -Then, we can add the SoupX adjusted CMs to our object. Additionally, it is recommended to generate new summary metrics from the adjusted CMs which can then be plotted. +Then, we can add the SoupX adjusted CMs to our object. +Additionally, it is recommended to generate new summary metrics from the adjusted CMs which can then be plotted. ```{r add-adj-cms} +crm$cms <- NULL crm$addCms(cms = crm$soupx$cms.adj, unique.names = TRUE, sep = "!!") crm$addSummaryFromCms() +crm$detailed.metrics <- NULL crm$addDetailedMetrics() ``` # Doublet detection -For doublet detection, we included the possibility to do so using the Python modules `scrublet` and `DoubletDetection`. Here, we use virtual environments where we installed each method. +For doublet detection, we included the possibility to do so using the Python modules `scrublet` and `DoubletDetection`. +Here, we use virtual environments where we installed each method. -`scrublet` is the default method, which is fast. `DoubletDetection` is significantly slower, but performs better according to [this](https://www.sciencedirect.com/science/article/pii/S2405471220304592) review. Here, we show how to run `scrublet` and `DoubletDetection` to compare in the next section. Since this takes some time, the results have been precalculated and are not run in this vignette. +`scrublet` is the default method, which is fast. +`DoubletDetection` is significantly slower, but performs better according to [this](https://www.sciencedirect.com/science/article/pii/S2405471220304592) review. +Here, we show how to run `scrublet` and `DoubletDetection` to compare in the next section. +Since this takes some time, the results have been precalculated and are not run in this vignette. ```{r run-dd, eval=FALSE} crm$detectDoublets(env = "scrublet", @@ -436,34 +493,40 @@ crm$detectDoublets(env = "doubletdetection", method = "doubletdetection") ``` -It is also possible to generate a Python script to run each method from the terminal. To do this, set `export = TRUE` and run `python .py` inside the virtual environment to generate data. Then, load data with: +It is also possible to generate a Python script to run each method from the terminal. +To do this, set `export = TRUE` and run `python .py` inside the virtual environment to generate data. +Then, load data with: -```{r} +```{r, eval = FALSE} crm$addDoublets() ``` We can plot the estimated doublets in the embedding. -```{r plot-scrublet-embedding, cache=TRUE} +```{r plot-scrublet-embedding, eval=FALSE} crm$plotEmbedding(doublet.method = "scrublet") crm$plotEmbedding(doublet.method = "doubletdetection") ``` +![](img/fig23a.png) +![](img/fig23b.png) And we can plot the scores for the doublet estimations. -```{r plot-scrublet-scores, cache=TRUE} +```{r plot-scrublet-scores, eval=FALSE} crm$plotEmbedding(doublet.method = "scrublet", doublet.scores = TRUE) crm$plotEmbedding(doublet.method = "doubletdetection", doublet.scores = TRUE) ``` +![](img/fig24a.png) +![](img/fig24b.png) ## Differences between methods We can compare how much `scrublet` and `DoubletDetection` overlap in their doublets estimates. First, let us plot a bar plot of the number of doublets per sample. -```{r compare-dd-res, cache=TRUE} +```{r compare-dd-res, eval=FALSE} scrub.res <- crm$doublets$scrublet$result %>% select(labels, sample) %>% mutate(method = "scrublet") @@ -485,12 +548,14 @@ ggplot(plot.df, aes(sample, count, fill = method)) + geom_bar(stat = "identity", position = position_dodge()) + crm$theme + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + - labs(x = "", y = "No. doublets", fill = "Method", title = "Doublets per sample") + labs(x = "", y = "No. doublets", fill = "Method", title = "Doublets per sample") + + scale_fill_manual(values = crm$pal) ``` +![](img/fig25.png) We can also show the total number of doublets detected per method. -```{r plot-dd-per-method, cache=TRUE} +```{r plot-dd-per-method, eval=FALSE} plot.df %>% group_by(method) %>% summarise(count = sum(count)) %>% @@ -498,12 +563,14 @@ plot.df %>% geom_bar(stat = "identity") + crm$theme + guides(fill = "none") + - labs(x = "", y = "No. doublets", title = "Total doublets per method") + labs(x = "", y = "No. doublets", title = "Total doublets per method") + + scale_fill_manual(values = crm$pal) ``` +![](img/fig26.png) Finally, let's plot an embedding showing the method-wise estimations as well as overlaps. -```{r plot-dd-emb-per-method, cache=TRUE} +```{r plot-dd-emb-per-method, eval=FALSE} plot.vec <- data.frame(scrublet = scrub.res$labels %>% as.numeric(), doubletdetection = dd.res$labels %>% as.numeric()) %>% apply(1, \(x) if (x[1] == 0 & x[2] == 0) "Kept" else if (x[1] > x[2]) "scrublet" else if (x[1] < x[2]) "DoubletDetection" else "Both") %>% @@ -518,12 +585,13 @@ crm$con$plotGraph(groups = plot.vec, size = 0.3) + scale_color_manual(values = c("grey80","red","blue","black")) ``` +![](img/fig27.png) # Plot filtered cells We can plot all the cells to be filtered in our embedding -```{r plot-filtered-cells-emb, cache=TRUE} +```{r plot-filtered-cells-emb, eval=FALSE} crm$plotFilteredCells(type = "embedding", depth = TRUE, depth.cutoff = depth_cutoff_vec, @@ -532,10 +600,11 @@ crm$plotFilteredCells(type = "embedding", mito.cutoff = 0.05, species = "human") ``` +![](img/fig28.png) And we can plot the cells to be filtered per sample where `combination` means a cell that has at least two filter labels, e.g. `mito` and `depth`. -```{r plot-filtered-cells-bar, cache=TRUE} +```{r plot-filtered-cells-bar, eval=FALSE} crm$plotFilteredCells(type = "bar", doublet.method = "scrublet", depth = TRUE, @@ -544,10 +613,12 @@ crm$plotFilteredCells(type = "bar", mito.cutoff = 0.05, species = "human") ``` +![](img/fig29.png) -Finally, we can create a tile plot with an overview of sample quality for the different filters. NB, this is experimental and has not been validated across datasets. +Finally, we can create a tile plot with an overview of sample quality for the different filters. +NB, this is experimental and has not been validated across datasets. -```{r plot-filtered-cells-tile, cache=TRUE} +```{r plot-filtered-cells-tile, eval=FALSE} crm$plotFilteredCells(type = "tile", doublet.method = "doubletdetection", depth = TRUE, @@ -556,10 +627,11 @@ crm$plotFilteredCells(type = "tile", mito.cutoff = 0.05, species = "human") ``` +![](img/fig30.png) We can also extract the raw numbers for plotting in other ways than those included here -```{r export-filtered-cells} +```{r export-filtered-cells, eval=FALSE} filter.data <- crm$plotFilteredCells(type = "export") filter.data %>% head() ``` @@ -568,19 +640,19 @@ filter.data %>% head() Finally, we can filter the count matrices to create a cleaned list to be used in downstream applications. -```{r filter, eval = FALSE} +```{r filter} crm$filterCms(depth.cutoff = depth_cutoff_vec, mito.cutoff = 0.05, - doublets = "doubletdetection", + doublets = "scrublet", samples.to.exclude = NULL, - species = "human") + species = "human", + verbose = TRUE) ``` -The filtered list of count matrices is stored in $cms.filtered which can be saved on disk afterwards. +The filtered list of count matrices is stored in \$cms.filtered which can be saved on disk afterwards. ```{r save, eval=FALSE} -library(qs) -qsave(crm$cms.filtered, "/data/ExtData/CRMetrics_testdata/cms_filtered.qs", +qs::qsave(crm$cms.filtered, "/data/ExtData/CRMetrics_testdata/cms_filtered.qs", nthreads = 10) ``` diff --git a/man/read10x.Rd b/man/read10x.Rd index d046e25..5c6c2c3 100644 --- a/man/read10x.Rd +++ b/man/read10x.Rd @@ -6,7 +6,7 @@ \usage{ read10x( data.path, - sampes = NULL, + samples = NULL, raw = FALSE, symbol = TRUE, sep = "!!", @@ -18,6 +18,8 @@ read10x( \arguments{ \item{data.path}{Path to cellranger count data.} +\item{samples}{Vector of sample names (default = NULL)} + \item{raw}{logical Add raw count matrices (default = FALSE)} \item{symbol}{The type of gene IDs to use, SYMBOL (TRUE) or ENSEMBLE (default = TRUE).} @@ -27,8 +29,6 @@ read10x( \item{n.cores}{Number of cores for the calculations (default = 1).} \item{verbose}{Print messages (default = TRUE).} - -\item{samples}{Vector of sample names (default = NULL)} } \value{ data frame From 9231873af78a51c4dc73f0376c3c19b781288ca3 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Fri, 7 Jul 2023 08:41:05 +0200 Subject: [PATCH 34/35] Updated NEWS --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 9fecbea..f59c145 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # CRMetrics 0.3.0 +* Several minor bug fixes * Updated `createEmbedding` upon resolve of https://github.com/kharchenkolab/conos/issues/123 * Updated DESCRIPTION * Added checks for `detectDoublets` From 93811e6029f51561b6dc46524c536adc799de822 Mon Sep 17 00:00:00 2001 From: rrydbirk Date: Fri, 7 Jul 2023 08:49:47 +0200 Subject: [PATCH 35/35] Added preprint citation to README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2cfe21a..7963ed9 100644 --- a/README.md +++ b/README.md @@ -54,4 +54,6 @@ in the vignette. # Cite -To cite this work, please run `citation(CRMetrics)`. +To cite this work, please run `citation("CRMetrics")` or cite our preprint: + +Fabienne Lorena Kick, Henrietta Holze, Rasmus Rydbirk, Konstantin Khodosevich: CRMetrics - an R package for Cell Ranger Filtering and Metrics Visualisation, 06 July 2023, PREPRINT (Version 1) available at Research Square [https://doi.org/10.21203/rs.3.rs-2853524/v1]