Update vignette with all data for V1-3

waldronlab · Jan 9, 2025 · ca5c265 · ca5c265
1 parent 7cf4e5e
commit ca5c265
Show file tree

Hide file tree

Showing 9 changed files with 775 additions and 4 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: MicrobiomeBenchmarkDataAnalyses
 Title: Benchmarking analyses with the MicrobiomeBenchmarkData package
-Version: 0.99.20
+Version: 0.99.21
 Authors@R: 
     person("Samuel David", "Gamboa-Tuz", email = "[email protected]",
         role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6863-7943"))
@@ -50,3 +50,4 @@ Depends:
     R (>= 4.2)
 URL: https://github.com/waldronlab/MicrobiomeBenchmarkDataAnalyses
 BugReports: https://github.com/waldronlab/MicrobiomeBenchmarkDataAnalyses/issues
+Config/Needs/website: rmarkdown
diff --git a/vignettes/articles/.gitignore b/vignettes/articles/.gitignore
@@ -0,0 +1,2 @@
+*.html
+*.R
diff --git a/vignettes/articles/Figure1.pdf b/vignettes/articles/Figure1.pdf
diff --git a/vignettes/articles/HMP_2012_16S_gingival_V13_whole.Rmd b/vignettes/articles/HMP_2012_16S_gingival_V13_whole.Rmd
@@ -0,0 +1,359 @@
+---
+title: " HMP_2012_16S_gingival_V35 - subgingival vs supgragingival plaque - whole"
+output: 
+    html_document:
+        toc: true
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  message = FALSE,
+  warning = FALSE
+)
+```
+
+```{r load packages, message=FALSE, warning=FALSE}
+library(MicrobiomeBenchmarkDataAnalyses)
+library(MicrobiomeBenchmarkData)
+library(dplyr)
+library(purrr)
+library(phyloseq)
+library(mia)
+library(benchdamic)
+library(ggplot2)
+library(ggpubr)
+```
+
++ Analyses were carried out in a subset of subjects: subjects with samples
+taken for both subgingival and supragingival body subsites in the same visit.
+Only one visit was considered per subject.
+
++ All analyses were performed at the OTU level.
+
+## Data
+
+```{r import data}
+dat_name <- 'HMP_2012_16S_gingival_V13'
+conditions_col <- 'body_subsite'
+conditions <- c(condB = 'subgingival_plaque', condA = 'supragingival_plaque')
+tse <- getBenchmarkData(dat_name, dryrun = FALSE)[[1]]
+tse_subset <- filterTaxa(tse,min_ab = 1, min_per = 0.2)
+tse_subset
+```
+
+Unique subjects:
+
+```{r extract subjects}
+col_data <- tse_subset |> 
+    colData() |> 
+    as.data.frame() |> 
+    tibble::rownames_to_column("sample_name") |> 
+    as_tibble()
+col_data |> 
+    count(body_subsite)
+```
+```{r}
+subjects <- col_data |> 
+    pull(subject_id) |> 
+    unique()
+length(subjects)
+```
+
+# Prior information
+
+```{r prior information}
+row_data <- as.data.frame(rowData(tse_subset))
+prior_info <- row_data[, c('genus', 'taxon_annotation')]
+prior_info$taxon_name <- rownames(row_data)
+prior_info$new_names <- paste0(prior_info$taxon_name, '|', prior_info$genus)
+prior_info <- 
+    dplyr::relocate(prior_info, taxon_name, new_names, genus, taxon_annotation)
+head(prior_info)
+```
+
+Convert to phyloseq
+
+```{r convert to phyloseq}
+ps <- convertToPhyloseq(tse_subset)
+sample_data(ps)[[conditions_col]] <- 
+    factor(sample_data(ps)[[conditions_col]], levels = conditions)
+ps
+```
+
+## Differential abundance analysis
+
+Perform normalization, calculate weights, and select DA methods:
+
+```{r norm, weights, methods}
+ps2 <- runNormalizations(set_norm_list()[2:4], ps, verbose = FALSE)
+zw <- weights_ZINB(ps, design = conditions_col)
+DA_methods <- set_DA_methods_list(conditions_col, conditions)
+DA_methods <- DA_methods[grep("metagenomeSeq", names(DA_methods), invert = TRUE)]
+
+for (i in seq_along(DA_methods)) {
+    ## This was a nacessary change for when the version increased:
+    if (grepl("Seurat", names(DA_methods)[i])) {
+        names(DA_methods[[i]]$contrast) <- NULL
+    } else {
+        next
+    }
+}
+names(DA_methods)
+```
+
+Run all of the differential analysis (DA) methods:
+
+```{r run DA, warning=FALSE}
+tim <- system.time({
+    DA_output <- vector("list", length(DA_methods))
+    for (i in seq_along(DA_output)) {
+        message(
+            "Running method ", i, ": ", names(DA_methods)[i], " - ", Sys.time()
+        )
+        DA_output[[i]] <- tryCatch(
+            error = function(e) NULL,
+            runDA(DA_methods[i], ps, weights = zw, verbose = FALSE) 
+        )
+    }
+    DA_output <- purrr::list_flatten(DA_output, name_spec = "{inner}")
+    DA_output <- purrr::discard(DA_output, is.null)
+})
+tim
+```
+
+## Enrichment analysis
+
+### Define a threshold for LEFSE with CLR
+
+Lefser uses two values to define differential abundant taxa, p-value and LDA.
+
+```{r LEFSER TSS scores hist}
+DA_output$lefse.TSS$statInfo$abs_score |> hist()
+```
+
+```{r LEFSE CLR scores hist}
+DA_output$lefse.CLR$statInfo$abs_score |> hist()
+```
+
+```{r check median scores}
+c(
+    lefse.TSS = median(DA_output$lefse.TSS$statInfo$abs_score),
+    lefse.CLR = median(DA_output$lefse.CLR$statInfo$abs_score)
+)
+```
+
+Create variables of thresholds:
+
+```{r define variables to use in createEnrichment}
+direction <- get_direction_cols(DA_output, conditions_col, conditions)
+
+adjThr<- rep(0.1, length(DA_output))
+names(adjThr) <- names(DA_output)
+
+esThr <- rep(0, length(DA_output))
+names(esThr) <- names(DA_output)
+esThr[grep("lefse.TSS", names(esThr))] <- 2
+esThr[grep("lefse.CLR", names(esThr))] <- 0.06
+
+slotV <- ifelse(grepl("lefse", names(DA_output)), "statInfo", "pValMat")
+colNameV <- ifelse(grepl("lefse", names(DA_output)), "LDA_scores", "adjP")
+typeV <- ifelse(grepl("lefse", names(DA_output)), "logfc", "pvalue")
+```
+
+Run enrichment:
+
+```{r perform enrichment}
+enrichment <- createEnrichment(
+    object = DA_output,
+    priorKnowledge = prior_info,
+    enrichmentCol = "taxon_annotation",
+    namesCol = "new_names",
+    slot = slotV, colName = colNameV, type = typeV,
+    direction = direction,
+    threshold_pvalue = adjThr,
+    threshold_logfc = esThr,
+    top = NULL, # No top feature selected
+    alternative = "greater",
+    verbose = FALSE 
+)
+```
+
+Extract summary of the enrichment analysis:
+
+```{r enrichment summary}
+enrichmentSummary <- purrr::map(enrichment,  ~ {
+    .x$summaries |> 
+        purrr::map(function(x) {
+            pos <- which(colnames(x) != "pvalue")
+            x |> 
+                tibble::rownames_to_column(var = "direction") |> 
+                tidyr::pivot_longer(
+                    names_to = "annotation", values_to = "n",
+                    cols = 2
+                )
+                
+        }) |> 
+        dplyr::bind_rows() |> 
+        dplyr::relocate(pvalue)
+}) |> 
+    dplyr::bind_rows(.id = "method") |> 
+    dplyr::mutate(
+        sig = dplyr::case_when(
+            pvalue < 0.05 & pvalue > 0.01 ~ "*",
+            pvalue < 0.01 & pvalue > 0.001 ~ "**",
+            pvalue < 0.001 ~ "***",
+            TRUE ~ ""
+        ) 
+    ) |> 
+    dplyr::mutate(
+        direction = dplyr::case_when(
+            direction == "DOWN Abundant" ~ "Subgingival",
+            direction == "UP Abundant" ~ "Supragingival",
+            TRUE ~ direction 
+        )
+    )
+head(enrichmentSummary)
+```
+
+## Plots
+
+### Enrichment plot
+
+```{r enrichment plot}
+enPlot <- enrichmentSummary |> 
+    dplyr::left_join(get_meth_class(), by = "method") |> 
+    mutate(
+        direction = factor(
+            direction, levels = c("Supragingival", "Subgingival")
+        )
+    ) |> 
+    mutate(
+        method = case_when(
+            grepl("lefse", method) ~ sub("lefse", "LEfSe", method),
+            grepl("wilcox", method) ~ sub("wilcox", "Wilcox", method),
+            TRUE ~ method
+        )
+    ) |> 
+    mutate(
+        annotation = case_when(
+            annotation == "aerobic" ~ "Aerobic",
+            annotation == "anaerobic" ~ "Anaerobic",
+            annotation == "facultative_anaerobic" ~ "Facultative anaerobic",
+            TRUE ~ annotation
+        )
+    ) |> 
+    ggplot(aes(method, n)) +
+    geom_col(
+        aes(fill = annotation),
+        position = position_dodge2(width = 0.9)
+    ) +
+    geom_text(
+        aes(label = sig, color = annotation),
+        position = position_dodge2(width = 0.9)
+    ) +
+    facet_grid(
+        direction ~ method_class, scales = "free_x", space = "free"
+    ) +
+    scale_fill_viridis_d(option = "D", name = "Biological data") +
+    scale_color_viridis_d(option = "D", name = "Biological data") +
+    labs(
+        x = "DA method", y = "Number of DA taxa"
+    ) +
+    theme_minimal() +
+    theme(
+        axis.text.x = element_text(angle = 45, hjust = 1),
+        legend.position = "bottom"
+    )
+```
+
+### Putative true positives - putative false positives
+
+Calculate TP - FP ratio (no threshold)
+
+```{r positives object}
+positives <- map(1:length(DA_output), .f = function(i) {
+    positives <- createPositives(
+        object = DA_output[i],
+        priorKnowledge = prior_info, 
+        enrichmentCol = "taxon_annotation", namesCol = "new_names",
+        slot = slotV[i], colName = colNameV[i], type = typeV[i],
+        direction = direction[i],
+        threshold_pvalue = 1,
+        threshold_logfc = 0,
+        top = seq.int(from = 0, to = 50, by = 5),
+        alternative = "greater",
+        verbose = FALSE,
+        TP = list(c("DOWN Abundant", "anaerobic"), c("UP Abundant", "aerobic")),
+        FP = list(c("DOWN Abundant", "aerobic"), c("UP Abundant", "anaerobic"))
+    ) |> 
+        dplyr::left_join(get_meth_class(), by = 'method')
+}) |> bind_rows()
+```
+
+Positives plot:
+
+```{r positives plot, fig.height = 4, fig.width=12}
+# names(vec) <- positives$base_method
+positives <- positives |> 
+    mutate(diff = jitter(TP - FP, amount = 1.5, factor = 2)) |> 
+    mutate(
+        base_method = case_when(
+            grepl("lefse", base_method) ~ sub("lefse", "LEfSe", base_method),
+            grepl("wilcox", base_method) ~ sub("wilcox", "Wilcox", base_method),
+            TRUE ~ base_method 
+        ),
+        method = case_when(
+            grepl("lefse", method) ~ sub("lefse", "LEfSe", method),
+            grepl("wilcox", method) ~ sub("wilcox", "Wilcox", method),
+            TRUE ~ method
+        )
+    )
+vec <- positives$color
+names(vec) <- positives$base_method
+posPlot <- positives |> 
+    ggplot(aes(top, diff)) +
+    geom_line(
+        aes(
+            group = method, color = base_method, linetype = norm,
+        ),
+    ) +
+    geom_point(
+        aes(
+            color = base_method, shape = norm
+        ),
+    ) +
+    facet_wrap(~method_class, nrow = 1) +
+    labs(
+        x = "Top DA features", y = "TP - FP"
+    ) +
+    scale_shape(name = "Normalization") +
+    scale_linetype(name = "Normalization") +
+    scale_color_manual(values = vec, name = "Base method") +
+    theme_minimal() +
+    theme(legend.position = "bottom")
+```
+
+### Combined plots
+
+```{r combined plots, fig.height=9, fig.width=10}
+pp <- ggarrange(
+    plotlist = list(enPlot, posPlot), ncol = 1, heights = c(1.5, 1)
+)
+pp
+```
+
+```{r export plots, eval=TRUE, echo=FALSE}
+## Export the figure
+ggsave(
+    filename = "Figure1.pdf", plot = pp, dpi = 300,
+    height = 9, width = 11, 
+)
+```
+
+# Session Info
+
+```{r session information}
+sessioninfo::session_info()
+```
diff --git a/vignettes/articles/HMP_2012_16S_gingival_V35.Rmd b/vignettes/articles/HMP_2012_16S_gingival_V35.Rmd
@@ -1,5 +1,5 @@
 ---
-title: " HMP_2012_16S_gingival_V35 - subgingival vs supgragingival plaque"
+title: " HMP_2012_16S_gingival_V35 - subgingival vs supgragingival plaque - downsampling"
 output: 
     html_document:
         toc: true

diff --git a/vignettes/articles/HMP_2012_16S_gingival_V35_subset.Rmd b/vignettes/articles/HMP_2012_16S_gingival_V35_subset.Rmd
@@ -29,7 +29,7 @@ conditions <- c(condB = 'subgingival_plaque', condA = 'supragingival_plaque')
 
 tse <- getBenchmarkData(dat_name, dryrun = FALSE)[[1]]
 
-tse <- filterTaxa(tse)
+tse <- filterTaxa(tse, min_ab = 1, min_per = 0.2)
 
 colData(tse)[[conditions_col]] <- 
     factor(colData(tse)[[conditions_col]], levels = conditions)