diff --git a/vignettes/articles/HMP_2012_16S_gingival_V13.Rmd b/vignettes/articles/HMP_2012_16S_gingival_V13_downsampling.Rmd similarity index 100% rename from vignettes/articles/HMP_2012_16S_gingival_V13.Rmd rename to vignettes/articles/HMP_2012_16S_gingival_V13_downsampling.Rmd diff --git a/vignettes/articles/HMP_2012_16S_gingival_V35.Rmd b/vignettes/articles/HMP_2012_16S_gingival_V35_downsampling.Rmd similarity index 100% rename from vignettes/articles/HMP_2012_16S_gingival_V35.Rmd rename to vignettes/articles/HMP_2012_16S_gingival_V35_downsampling.Rmd diff --git a/vignettes/articles/HMP_2012_WMS_gingival.Rmd b/vignettes/articles/HMP_2012_WMS_gingival.Rmd index de1d866..73ec867 100644 --- a/vignettes/articles/HMP_2012_WMS_gingival.Rmd +++ b/vignettes/articles/HMP_2012_WMS_gingival.Rmd @@ -1,85 +1,120 @@ --- -title: "HMP_2012_WMS_gingiva - Supragingival vs subgingival plaque" +title: " HMP_2012_16S_gingival_V35 - subgingival vs supgragingival plaque - whole" output: html_document: toc: true --- -```{r, include=FALSE} -knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + message = FALSE, + warning = FALSE +) ``` - -```{r setup} +```{r load packages, message=FALSE, warning=FALSE} library(MicrobiomeBenchmarkDataAnalyses) library(MicrobiomeBenchmarkData) -library(mia) -library(phyloseq) -library(benchdamic) library(dplyr) library(purrr) +library(phyloseq) +library(mia) +library(benchdamic) library(ggplot2) -library(gridExtra) +library(ggpubr) ``` -# Import data ++ Analyses were carried out in a subset of subjects: subjects with samples +taken for both subgingival and supragingival body subsites in the same visit. +Only one visit was considered per subject. -```{r import data, message=FALSE, warning=FALSE} -dat_name <-'HMP_2012_WMS_gingival' ++ All analyses were performed at the OTU level. + +## Data + +Get data: + +```{r import data} +dat_name <- "HMP_2012_WMS_gingival" conditions_col <- 'body_subsite' conditions <- c(condB = 'subgingival_plaque', condA = 'supragingival_plaque') - tse <- getBenchmarkData(dat_name, dryrun = FALSE)[[1]] -tse <- filterTaxa(tse) - -colData(tse)[[conditions_col]] <- - factor(colData(tse)[[conditions_col]], levels = conditions) tse ``` +```{r} +tse_subset <- filterTaxa(tse, min_ab = 1, min_per = 0.2) +tse_subset -# Prior knowledge +``` -```{r prior info} -row_data <- as.data.frame(rowData(tse)) +Unique subjects per body subsite: + +```{r extract subjects} +col_data <- tse_subset |> + colData() |> + as.data.frame() |> + tibble::rownames_to_column("sample_name") |> + as_tibble() +col_data |> + count(body_subsite) +``` + +Unique subjects in the dataset: + +```{r} +subjects <- col_data |> + pull(subject_id) |> + unique() +length(subjects) +``` + +# Prior information + +```{r prior information} +row_data <- as.data.frame(rowData(tse_subset)) prior_info <- row_data[, c('genus', 'taxon_annotation')] -prior_info$taxon_name <- rownames(prior_info) +prior_info$taxon_name <- rownames(row_data) prior_info$new_names <- paste0(prior_info$taxon_name, '|', prior_info$genus) +prior_info <- + dplyr::relocate(prior_info, taxon_name, new_names, genus, taxon_annotation) head(prior_info) ``` -# Convert to phyloseq +Convert to phyloseq: -```{r, message=FALSE, warning=FALSE} -ps <- makePhyloseqFromTreeSummarizedExperiment(tse) -phyloseq::sample_data(ps)$body_subsite <- - factor(phyloseq::sample_data(ps)$body_subsite) +```{r convert to phyloseq, warning=FALSE} +ps <- convertToPhyloseq(tse_subset) +sample_data(ps)[[conditions_col]] <- + factor(sample_data(ps)[[conditions_col]], levels = conditions) ps ``` ## Differential abundance analysis -Select methods for DA: +Set normalization, weights, and DA methods parameters: -```{r, message=FALSE, warning=FALSE} +```{r norm, weights, methods} ps <- runNormalizations(set_norm_list(), ps, verbose = FALSE) zw <- weights_ZINB(ps, design = conditions_col) DA_methods <- set_DA_methods_list(conditions_col, conditions) for (i in seq_along(DA_methods)) { + ## This was a nacessary change for when the version increased: if (grepl("Seurat", names(DA_methods)[i])) { names(DA_methods[[i]]$contrast) <- NULL } else { next } } -# These methods throw an error, so they must be removed -# DA_methods <- DA_methods[!names(DA_methods) == 'DA_ALDEx2.1'] -# DA_methods <- DA_methods[!names(DA_methods) == 'DA_corncob.1'] -DA_methods <- DA_methods[!names(DA_methods) == 'DA_edgeR.1'] +DA_methods <- DA_methods[grep("edgeR",names(DA_methods), invert = TRUE)] names(DA_methods) ``` -```{r, warning=FALSE, message=TRUE} +Run DA analysis: + +```{r run DA, warning=FALSE} tim <- system.time({ DA_output <- vector("list", length(DA_methods)) for (i in seq_along(DA_output)) { @@ -97,88 +132,240 @@ tim <- system.time({ tim ``` -# Enrichment +## Enrichment analysis -Get direction +### Define a threshold for LEFSE with CLR -```{r} +Lefser uses two values to define differential abundant taxa, p-value and LDA. + +```{r LEFSER TSS scores hist} +DA_output$lefse.TSS$statInfo$abs_score |> hist() +``` + +```{r LEFSE CLR scores hist} +DA_output$lefse.CLR$statInfo$abs_score |> hist() +``` + +```{r check median scores} +c( + lefse.TSS = median(DA_output$lefse.TSS$statInfo$abs_score), + lefse.CLR = median(DA_output$lefse.CLR$statInfo$abs_score) +) +``` + +Create variables of thresholds: + +```{r define variables to use in createEnrichment} direction <- get_direction_cols(DA_output, conditions_col, conditions) + +adjThr<- rep(0.1, length(DA_output)) +names(adjThr) <- names(DA_output) + +esThr <- rep(0, length(DA_output)) +names(esThr) <- names(DA_output) +esThr[grep("lefse.TSS", names(esThr))] <- 2 +esThr[grep("lefse.CLR", names(esThr))] <- round(median( + DA_output$lefse.CLR$statInfo$abs_score +), 2) + +slotV <- ifelse(grepl("lefse", names(DA_output)), "statInfo", "pValMat") +colNameV <- ifelse(grepl("lefse", names(DA_output)), "LDA_scores", "adjP") +typeV <- ifelse(grepl("lefse", names(DA_output)), "logfc", "pvalue") ``` -## Enrichment (adjP <= 0.1) +Run enrichment: -```{r} +```{r perform enrichment} enrichment <- createEnrichment( object = DA_output, priorKnowledge = prior_info, enrichmentCol = "taxon_annotation", - namesCol = "taxon_name", - slot = "pValMat", colName = "adjP", type = "pvalue", + namesCol = "new_names", + slot = slotV, colName = colNameV, type = typeV, direction = direction, - threshold_pvalue = 0.1, - threshold_logfc = 0, + threshold_pvalue = adjThr, + threshold_logfc = esThr, top = NULL, # No top feature selected alternative = "greater", verbose = FALSE ) ``` -## Plot enrichment +Extract summary of the enrichment analysis: -```{r, warning=FALSE, message=FALSE, fig.height=6, fig.width=10} -enrich_plot <- plot_enrichment( - enrichment = enrichment, - enrichment_col = "taxon_annotation", - levels_to_plot = c("aerobic", "anaerobic", "facultative_anaerobic"), - conditions = conditions -) -enrich_plot2 <- plot_enrichment_2( - enrich_plot, - dir = c(up = 'Sup Plq', down = 'Sub Plq') -) -enrich_plot2 +```{r enrichment summary} +enrichmentSummary <- purrr::map(enrichment, ~ { + .x$summaries |> + purrr::map(function(x) { + pos <- which(colnames(x) != "pvalue") + x |> + tibble::rownames_to_column(var = "direction") |> + tidyr::pivot_longer( + names_to = "annotation", values_to = "n", + cols = 2 + ) + }) |> + dplyr::bind_rows() +}) |> + purrr::discard(~!nrow(.x)) |> + dplyr::bind_rows(.id = "method") |> + dplyr::relocate(method, pvalue) |> + dplyr::mutate( + sig = dplyr::case_when( + pvalue < 0.05 & pvalue > 0.01 ~ "*", + pvalue < 0.01 & pvalue > 0.001 ~ "**", + pvalue < 0.001 ~ "***", + TRUE ~ "" + ) + ) |> + dplyr::mutate( + direction = dplyr::case_when( + direction == "DOWN Abundant" ~ "Subgingival", + direction == "UP Abundant" ~ "Supragingival", + TRUE ~ direction + ) + ) +head(enrichmentSummary) ``` -# Putative true positives - putative false positives +## Plots -## Calculate TP - FP ratio (no threshold) +### Enrichment plot -```{r positives} -positives <- createPositives( - object = DA_output, - priorKnowledge = prior_info, - enrichmentCol = "taxon_annotation", namesCol = "taxon_name", - slot = "pValMat", colName = "rawP", type = "pvalue", - direction = direction, - threshold_pvalue = 1, - threshold_logfc = 0, - top = seq.int(from = 0, to = 50, by = 5), - alternative = "greater", - verbose = FALSE, - TP = list(c("DOWN Abundant", "anaerobic"), c("UP Abundant", "aerobic")), - FP = list(c("DOWN Abundant", "aerobic"), c("UP Abundant", "anaerobic")) -) |> - left_join(get_meth_class(), by = 'method') +```{r enrichment plot} +enPlot <- enrichmentSummary |> + dplyr::left_join(get_meth_class(), by = "method") |> + mutate( + direction = factor( + direction, levels = c("Supragingival", "Subgingival") + ) + ) |> + mutate( + method = case_when( + grepl("lefse", method) ~ sub("lefse", "LEfSe", method), + grepl("wilcox", method) ~ sub("wilcox", "Wilcox", method), + TRUE ~ method + ) + ) |> + mutate( + annotation = case_when( + annotation == "aerobic" ~ "Aerobic", + annotation == "anaerobic" ~ "Anaerobic", + annotation == "facultative_anaerobic" ~ "Facultative anaerobic", + TRUE ~ annotation + ) + ) |> + ggplot(aes(method, n)) + + geom_col( + aes(fill = annotation), + position = position_dodge2(width = 0.9) + ) + + geom_text( + aes(label = sig, color = annotation), + position = position_dodge2(width = 0.9) + ) + + facet_grid( + direction ~ method_class, scales = "free_x", space = "free" + ) + + scale_fill_viridis_d(option = "D", name = "Biological data") + + scale_color_viridis_d(option = "D", name = "Biological data") + + labs( + x = "DA method", y = "Number of DA taxa" + ) + + theme_minimal() + + theme( + axis.text.x = element_text(angle = 45, hjust = 1), + legend.position = "bottom" + ) +``` + +### Putative true positives - putative false positives + +Calculate TP - FP ratio (no threshold) + +```{r positives object} +positives <- map(1:length(DA_output), .f = function(i) { + positives <- createPositives( + object = DA_output[i], + priorKnowledge = prior_info, + enrichmentCol = "taxon_annotation", namesCol = "new_names", + slot = slotV[i], colName = colNameV[i], type = typeV[i], + direction = direction[i], + threshold_pvalue = 1, + threshold_logfc = 0, + top = seq.int(from = 0, to = 50, by = 5), + alternative = "greater", + verbose = FALSE, + TP = list(c("DOWN Abundant", "anaerobic"), c("UP Abundant", "aerobic")), + FP = list(c("DOWN Abundant", "aerobic"), c("UP Abundant", "anaerobic")) + ) |> + dplyr::left_join(get_meth_class(), by = 'method') +}) |> + bind_rows() ``` -## Plot TP - FP +Positives plot: + +```{r positives plot, fig.height = 4, fig.width=12} +# names(vec) <- positives$base_method +positives <- positives |> + mutate(diff = jitter(TP - FP, amount = 1.5, factor = 2)) |> + mutate( + base_method = case_when( + grepl("lefse", base_method) ~ sub("lefse", "LEfSe", base_method), + grepl("wilcox", base_method) ~ sub("wilcox", "Wilcox", base_method), + TRUE ~ base_method + ), + method = case_when( + grepl("lefse", method) ~ sub("lefse", "LEfSe", method), + grepl("wilcox", method) ~ sub("wilcox", "Wilcox", method), + TRUE ~ method + ) + ) +vec <- positives$color +names(vec) <- positives$base_method +posPlot <- positives |> + ggplot(aes(top, diff)) + + geom_line( + aes( + group = method, color = base_method, linetype = norm, + ), + ) + + geom_point( + aes( + color = base_method, shape = norm + ), + ) + + facet_wrap(~method_class, nrow = 1) + + labs( + x = "Top DA features", y = "TP - FP" + ) + + scale_shape(name = "Normalization") + + scale_linetype(name = "Normalization") + + scale_color_manual(values = vec, name = "Base method") + + theme_minimal() + + theme(legend.position = "bottom") +``` + +### Combined plots + +```{r combined plots, fig.height=9, fig.width=10} +pp <- ggarrange( + plotlist = list(enPlot, posPlot), ncol = 1, heights = c(1.5, 1) +) +pp +``` -```{r, fig.height = 10, fig.width=15} -positive_plots <- plot_positives(positives) |> - map( ~ { - .x + - theme( - axis.title = element_text(size = 17), - axis.text = element_text(size = 15), - legend.text = element_text(size = 13), - strip.text = element_text(size = 17) - ) - }) -grid.arrange(grobs = positive_plots, ncol = 3) +```{r export plots, eval=TRUE, echo=FALSE} +## Export the figure +# ggsave( +# filename = "Figure1.pdf", plot = pp, dpi = 300, +# height = 9, width = 11, +# ) ``` # Session Info -```{r session info} +```{r session information} sessioninfo::session_info() ```