Add pseudocount to WMS fix V13 by removing samples with only 1 featur…

…e - gingival dataset
waldronlab · Jan 17, 2025 · a7f268e · a7f268e
1 parent 8480221
commit a7f268e
Show file tree

Hide file tree

Showing 3 changed files with 427 additions and 113 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: MicrobiomeBenchmarkDataAnalyses
 Title: Benchmarking analyses with the MicrobiomeBenchmarkData package
-Version: 0.99.27
+Version: 0.99.28
 Authors@R: 
     person("Samuel David", "Gamboa-Tuz", email = "[email protected]",
         role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6863-7943"))

diff --git a/vignettes/articles/HMP_2012_16S_gingival_V13.Rmd b/vignettes/articles/HMP_2012_16S_gingival_V13.Rmd
@@ -0,0 +1,339 @@
+---
+title: "Subgingival vs supgragingival plaque (V13 - OTU)"
+subtitle: "HMP_2012_16S_gingival_V13"
+author: "Samuel Gamboa"
+date: "`r format(Sys.time(), '%d %B, %Y')`"
+output:
+    html_document:
+        toc: yes
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  warning = FALSE
+)
+```
+
+```{r packages, message=FALSE, warning=FALSE}
+library(MicrobiomeBenchmarkDataAnalyses)
+library(MicrobiomeBenchmarkData)
+library(dplyr)
+library(purrr)
+library(phyloseq)
+library(mia)
+library(benchdamic)
+library(ggplot2)
+library(ggpubr)
+```
+
+## Data
+
+Import data:
+
+```{r import data, message=FALSE}
+dat_name <- 'HMP_2012_16S_gingival_V13'
+conditions_col <- 'body_subsite'
+conditions <- c(condB = 'subgingival_plaque', condA = 'supragingival_plaque')
+tse <- getBenchmarkData(dat_name, dryrun = FALSE)[[1]]
+tse
+```
+
+Filter low-abundant features:
+
+```{r filter features}
+tse_subset <- filterTaxa(tse, min_ab = 1, min_per = 0.2)
+tse_subset <- tse_subset[,which(!colSums(assay(tse_subset)) <= 1)] ## Needed for metagenomeSeq
+tse_subset
+```
+
+Counts per body subsite:
+
+```{r samples per body subsite}
+col_data <- as_tibble(colData(tse_subset)) 
+count(col_data, body_subsite)
+```
+
+Number of subjects:
+
+```{r unique subjects}
+length(unique(col_data$subject_id))
+```
+
+## Prior information
+
+```{r feature annotations}
+row_data <- as.data.frame(rowData(tse_subset))
+prior_info <- row_data[, c('genus', 'taxon_annotation')]
+prior_info$taxon_name <- rownames(row_data)
+prior_info$new_names <- paste0(prior_info$taxon_name, '|', prior_info$genus)
+prior_info <- 
+    dplyr::relocate(prior_info, taxon_name, new_names, genus, taxon_annotation)
+head(prior_info)
+```
+
+## Convert to phyloseq
+
+```{r convert to phyloseq}
+ps <- convertToPhyloseq(tse_subset)
+sample_data(ps)[[conditions_col]] <- 
+    factor(sample_data(ps)[[conditions_col]], levels = conditions)
+ps
+```
+
+## Differential abundance analysis
+
+Set normalization, weights, and DA method options:
+
+```{r norm, weights, methods}
+ps <- runNormalizations(set_norm_list(), ps, verbose = FALSE)
+zw <- weights_ZINB(ps, design = conditions_col)
+DA_methods <- set_DA_methods_list(conditions_col, conditions)
+for (i in seq_along(DA_methods)) {
+    if (grepl("Seurat", names(DA_methods)[i])) {
+        names(DA_methods[[i]]$contrast) <- NULL
+    } else {
+        next
+    }
+}
+names(DA_methods)
+```
+
+Run DA analysis:
+
+```{r run DA, warning=FALSE, message=FALSE}
+tim <- system.time({
+    DA_output <- imap(DA_methods, ~ {
+        message("Running method ", .y, " - ", Sys.time())
+        tryCatch(
+            error = function(e) NULL,
+            runDA(list(.x), ps, weights = zw, verbose = FALSE)
+        )
+    }) |> 
+        list_flatten(name_spec = "{outer}") |> 
+        discard(is.null)
+    DA_output <- map2(DA_output, names(DA_output), ~ {
+        .x$name <- .y
+        .x
+    })
+})
+tim
+```
+
+## Enrichment analysis
+
+### Define a threshold for LEFSE with CLR
+
+Define threshold variables:
+
+```{r define variables to use in createEnrichment}
+direction <- get_direction_cols(DA_output, conditions_col, conditions)
+
+adjThr<- rep(0.1, length(DA_output))
+names(adjThr) <- names(DA_output)
+
+esThr <- rep(0, length(DA_output))
+names(esThr) <- names(DA_output)
+esThr[grep("lefse.TSS", names(esThr))] <- 2
+esThr[grep("lefse.CLR", names(esThr))] <-
+    median(DA_output$LEfSe.CLR$statInfo$abs_score)
+
+slotV <- ifelse(grepl("lefse", names(DA_output)), "statInfo", "pValMat")
+colNameV <- ifelse(grepl("lefse", names(DA_output)), "LDA_scores", "adjP")
+typeV <- ifelse(grepl("lefse", names(DA_output)), "logfc", "pvalue")
+```
+
+Run enrichment:
+
+```{r perform enrichment}
+enrichment <- createEnrichment(
+    object = DA_output,
+    priorKnowledge = prior_info,
+    enrichmentCol = "taxon_annotation",
+    namesCol = "new_names",
+    slot = slotV, colName = colNameV, type = typeV,
+    direction = direction,
+    threshold_pvalue = adjThr,
+    threshold_logfc = esThr,
+    top = NULL, # No top feature selected
+    alternative = "greater",
+    verbose = FALSE 
+)
+```
+
+Extract summary of the enrichment analysis:
+
+```{r enrichment summary}
+enrichmentSummary <- purrr::map(enrichment,  ~ {
+    .x$summaries |> 
+        purrr::map(function(x) {
+            x |> 
+                tibble::rownames_to_column(var = "direction") |> 
+                tidyr::pivot_longer(
+                    names_to = "annotation", values_to = "n",
+                    cols = 2
+                )
+                
+        }) |> 
+        dplyr::bind_rows() |> 
+        dplyr::relocate(pvalue)
+}) |> 
+    dplyr::bind_rows(.id = "method") |> 
+    dplyr::mutate(
+        sig = dplyr::case_when(
+            pvalue < 0.05 & pvalue > 0.01 ~ "*",
+            pvalue < 0.01 & pvalue > 0.001 ~ "**",
+            pvalue < 0.001 ~ "***",
+            TRUE ~ ""
+        ) 
+    ) |> 
+    dplyr::mutate(
+        direction = dplyr::case_when(
+            direction == "DOWN Abundant" ~ "Subgingival",
+            direction == "UP Abundant" ~ "Supragingival",
+            TRUE ~ direction 
+        )
+    )
+head(enrichmentSummary)
+```
+
+## Plots
+
+### Enrichment plot
+
+```{r enrichment plot}
+nn <- unique(enrichmentSummary$annotation)
+nn <- nn[!is.na(nn)]
+colorPalette <- palette.colors(palette = "Okabe-Ito")[2:(length(nn) + 1)]
+enPlot <- enrichmentSummary |> 
+    dplyr::left_join(getMethodClass(), by = "method") |> 
+    mutate(
+        direction = factor(direction, levels = c("Supragingival", "Subgingival")),
+        annotation = factor(annotation, levels = nn)
+    ) |> 
+    mutate(
+        annotation = case_when(
+            annotation == "aerobic" ~ "Aerobic",
+            annotation == "anaerobic" ~ "Anaerobic",
+            annotation == "facultative_anaerobic" ~ "Facultative anaerobic",
+            TRUE ~ annotation
+        )
+    ) |> 
+    ggplot(aes(method, n)) +
+    geom_col(
+        aes(fill = annotation),
+        position = position_dodge2(width = 0.9)
+    ) +
+    geom_text(
+        aes(label = sig, color = annotation),
+        position = position_dodge2(width = 0.9)
+    ) +
+    facet_grid(
+        direction ~ method_class, scales = "free_x", space = "free"
+    ) +
+    scale_fill_manual(values = colorPalette, name = "Annotation") +
+    scale_color_manual(values = colorPalette, name = "Annotation") +
+    labs(
+        x = "DA method", y = "Number of DAFs"
+    ) +
+    theme_bw() +
+    theme(
+        axis.text.x = element_text(angle = 45, hjust = 1),
+        legend.position = "bottom",
+        strip.background = element_rect(fill = "white")
+    )
+```
+
+### Putative true positives - putative false positives
+
+Calculate TP - FP ratio (no threshold):
+
+```{r positives object}
+positives <- map(1:length(DA_output), .f = function(i) {
+    positives <- createPositives(
+        object = DA_output[i],
+        priorKnowledge = prior_info, 
+        enrichmentCol = "taxon_annotation", namesCol = "new_names",
+        slot = slotV[i], colName = colNameV[i], type = typeV[i],
+        direction = direction[i],
+        threshold_pvalue = 1,
+        threshold_logfc = 0,
+        top = seq.int(from = 0, to = 50, by = 5),
+        alternative = "greater",
+        verbose = FALSE,
+        TP = list(c("DOWN Abundant", "anaerobic"), c("UP Abundant", "aerobic")),
+        FP = list(c("DOWN Abundant", "aerobic"), c("UP Abundant", "anaerobic"))
+    ) |> 
+        dplyr::left_join(getMethodClass(), by = 'method')
+}) |> bind_rows() |> 
+    mutate(diff = jitter(TP - FP, amount = 1.5, factor = 2))
+```
+
+Positives plot:
+
+```{r positives plot, fig.height = 4, fig.width=12}
+# positives <- positives |> 
+#     mutate(diff = jitter(TP - FP, amount = 1.5, factor = 2)) |> 
+#     mutate(
+#         base_method = case_when(
+#             grepl("lefse", base_method) ~ sub("lefse", "LEfSe", base_method),
+#             grepl("wilcox", base_method) ~ sub("wilcox", "Wilcox", base_method),
+#             TRUE ~ base_method
+#         ),
+#         method = case_when(
+#             grepl("lefse", method) ~ sub("lefse", "LEfSe", method),
+#             grepl("wilcox", method) ~ sub("wilcox", "Wilcox", method),
+#             TRUE ~ method
+#         )
+#     )
+vec <- positives$color
+names(vec) <- positives$base_method
+posPlot <- positives |> 
+    ggplot(aes(top, diff)) +
+    geom_line(
+        aes(
+            group = method, color = base_method, linetype = norm,
+        ),
+    ) +
+    geom_point(
+        aes(
+            color = base_method, shape = norm
+        ),
+    ) +
+    facet_wrap(~method_class, nrow = 1) +
+    labs(
+        x = "Top DAFs", y = "TP - FP"
+    ) +
+    scale_shape(name = "Normalization") +
+    scale_linetype(name = "Normalization") +
+    scale_color_manual(values = vec, name = "Base DA method") +
+    theme_bw() +
+    theme(
+        legend.position = "bottom",
+        strip.background = element_rect(fill = "white")
+    )
+```
+
+### Combined plots
+
+```{r combined plots, fig.height=9, fig.width=10}
+pp <- ggarrange(
+    plotlist = list(enPlot, posPlot), ncol = 1, heights = c(1.5, 1)
+)
+pp
+```
+
+```{r export plots, eval=TRUE, echo=FALSE}
+## Nice figure to explore. Not included in the paper
+# ggsave(
+#     filename = "FigureX.pdf", plot = pp, dpi = 300,
+#     height = 9, width = 10, 
+# )
+```
+
+# Session Info
+
+```{r session information}
+sessioninfo::session_info()
+```