feat: reports with more verbose description of results, improved plots

MPUSP · Apr 10, 2024 · bfc0a32 · bfc0a32
1 parent 5e3bb29
commit bfc0a32
Show file tree

Hide file tree

Showing 2 changed files with 92 additions and 30 deletions.
diff --git a/bin/counts_summary.Rmd b/bin/counts_summary.Rmd
@@ -84,7 +84,7 @@ print("Import of counts table complete.")
 list_samples <- unique(df_counts$sample)
 figwidth <- 9
 figheight <- round(1 + (length(list_samples) / 4))
-figheight2 <- 3 * figheight
+figheight2 <- 1.6 * figheight
 
 # output sample table
 test <- df_counts %>%
@@ -155,6 +155,9 @@ save_plot <- function(pl, path = "", width = 6, height = 6) {
 
 ## Total number of mapped reads per sample
 
+- figure shows the total number of mapped reads used for fitness score calculation
+- all other reads that were filtered out during preprocessing (low quality, low mapping score) are not included
+
 ```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
 plot_total_mapped_reads <- df_counts %>%
     dplyr::group_by(sample) %>%
@@ -171,6 +174,9 @@ print(plot_total_mapped_reads)
 
 ## Number of individual barcodes per sample
 
+- figure shows the number of individual, unique barcodes per sample
+- depnding on `bowtie` settings, a certain number of mismatches in barcodes are allowed
+
 ```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
 plot_individual_barcodes <- df_counts %>%
     dplyr::group_by(sample) %>%
@@ -187,6 +193,9 @@ print(plot_individual_barcodes)
 
 ## Number of missing barcodes per sample
 
+- figure shows the number of missing barcodes per sample
+- this number is determined from the number of total encountered barcodes across all samples
+
 ```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
 plot_missing_barcodes <- df_counts %>%
     dplyr::group_by(sample) %>%
@@ -203,38 +212,60 @@ print(plot_missing_barcodes)
 
 ## Number of barcodes per gene, per sample
 
+- figure shows the number of barcodes per gene, per sample
+- the x-axis shows the number of genes with N barcodes, broken down by sample
+- barcodes without mapped reads for the respective sample are removed
+- for example a colored bar with label `≤ 2` shows number of genes with less or equal than 2 barcodes
+
 ```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
-plot_barcodes_gene_sample <- df_counts %>%
+df_barcodes_per_gene <- df_counts %>%
+    dplyr::filter(n_reads > 0) %>%
     dplyr::group_by(sample, Gene) %>%
-    dplyr::summarize(`unique barcodes` = length(unique(sgRNA))) %>%
+    dplyr::summarize(`unique barcodes` = length(unique(sgRNA)), .groups = "drop") %>%
+    dplyr::mutate(`unique barcodes` = cut(`unique barcodes`, breaks = pretty(`unique barcodes`))) %>%
+    dplyr::group_by(sample) %>%
     dplyr::count(`unique barcodes`) %>%
-    ggplot(aes(x = sample, y = n, fill = factor(`unique barcodes`))) +
+    dplyr::mutate(`unique barcodes` = as.numeric(gsub("\\([0-9]*,|\\]", "", `unique barcodes`))) %>%
+    dplyr::ungroup() %>%
+    dplyr::arrange(sample, n)
+
+plot_barcodes_gene_sample <- df_barcodes_per_gene %>%
+    ggplot(aes(
+        x = sample, y = n,
+        fill = factor(`unique barcodes`),
+        label = paste0("≤ ", `unique barcodes`)
+    )) +
     geom_col(alpha = 0.7) +
-    geom_text(aes(
-        x = sample,
-        y = unlist((tapply(n, sample, function(x) cumsum(rev(x)) - (rev(x) / 2)))),
-        label = rev(`unique barcodes`)
-    ), color = "white") +
+    geom_text(position = position_stack(vjust = 0.5), color = "white") +
     labs(x = "") +
     coord_flip() +
-    scale_fill_manual(values = colorRampPalette(custom_colors[1:5])(9)) +
+    scale_fill_manual(values = colorRampPalette(
+        custom_colors[1:5])(length(unique(df_barcodes_per_gene[["unique barcodes"]])))
+    ) +
     custom_theme(legend.position = "none")
 
 save_plot(plot_barcodes_gene_sample, width = figwidth, height = figheight)
 print(plot_barcodes_gene_sample)
 ```
 
-## Read count distribution, per sample (capped at 1000 barcodes)
+## Read count distribution, violin plot
+
+- figure shows the read count distribution per sample and barcode
+- read count per barcode is only shown for the first 1000 barcodes to reduce processing time
+- barcodes without mapped reads for the respective sample are removed
+- read count is log 10 transformed (0 -> 1, 1 -> 10, 2 -> 100, ...)
 
 ```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
 plot_count_dist <- df_counts %>%
+    dplyr::filter(n_reads > 0) %>%
     dplyr::group_by(sample) %>%
     dplyr::slice(1:1000) %>%
     ggplot(aes(x = sample, y = log10(n_reads))) +
     geom_violin(
         trim = FALSE, fill = custom_colors[1],
         alpha = 0.7, col = "white"
     ) +
+    labs(y = expression("log"[10] * " reads per barcode")) +
     coord_flip() +
     stat_summary(fun.data = mean_sdl, geom = "pointrange", size = 0.5, col = grey(0.3)) +
     custom_theme()
@@ -243,14 +274,18 @@ save_plot(plot_count_dist, width = figwidth, height = figheight)
 print(plot_count_dist)
 ```
 
-## Number of reads per barcode, per sample
+## Read count distribution, histogram
+
+- figure shows the same data as above, but with full set of barcodes per sample
+- barcodes without mapped reads for the respective sample are removed
+- read count is log 10 transformed (0 -> 1, 1 -> 10, 2 -> 100, ...)
 
 ```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
 plot_reads_per_bc <- df_counts %>%
-    ggplot(aes(x = log2(n_reads))) +
+    ggplot(aes(x = log10(n_reads))) +
     geom_histogram(fill = custom_colors[1], alpha = 0.7, bins = 30) +
-    labs(y = "", x = expression("log"[2] * " reads per barcode")) +
-    facet_wrap(~sample, ncol = 2) +
+    labs(y = "", x = expression("log"[10] * " reads per barcode")) +
+    facet_wrap(~sample, ncol = 4) +
     custom_theme()
 
 save_plot(plot_reads_per_bc, width = figwidth, height = figheight2)
@@ -259,6 +294,12 @@ print(plot_reads_per_bc)
 
 ## Top 10 most abundant barcodes, per sample
 
+- figure shows top 10 barcodes ranked by read count
+- ideally, initial time point samples show high equality of barcode abundance
+- extremely high abundance of very few barcodes is a sign of few mutants outcompeting the population
+- this can happen when libraries are (pre-) cultivated for too long periods of time
+- can lead to downstream problems as it reduces library diversity (depletes low abundant mutants)
+
 ```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
 plot_top10_barcodes <- df_counts %>%
     dplyr::group_by(sample) %>%
@@ -268,7 +309,7 @@ plot_top10_barcodes <- df_counts %>%
     ggplot(aes(x = factor(rank), y = n_reads)) +
     geom_col(fill = custom_colors[1], alpha = 0.7, width = 1) +
     labs(y = "n reads", x = "barcodes ranked by abundance") +
-    facet_wrap(~sample, ncol = 2) +
+    facet_wrap(~sample, ncol = 4) +
     custom_theme()
 
 save_plot(plot_top10_barcodes, width = figwidth, height = figheight2)
@@ -277,6 +318,12 @@ print(plot_top10_barcodes)
 
 ## Cumulative read count distribution and barcode diversity
 
+- figure shows the barcode diversity by plotting fraction of reads (%) vs fraction of barcodes (%)
+- the ideal library has high diversity and equal distribution of barcodes for initial time points
+- such a distribution would follow the diagonal dashed grey line
+- if reads per barcode (red line) are not well distributed, `% of reads` (y-axis) shows a steep ascent
+- this means very few barcodes contribute to almost all reads
+
 ```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
 df_auc <- df_counts %>%
     dplyr::arrange(sample, desc(n_reads)) %>%
@@ -292,39 +339,49 @@ df_auc <- df_counts %>%
 
 plot_cumulative_read_count <- df_auc %>%
     ggplot(aes(x = pc_barcodes, y = pc_reads, group = sample)) +
-    geom_line(linewidth = 1.0, color = custom_colors[1]) +
+    geom_step(linewidth = 1.0, color = custom_colors[1]) +
     geom_abline(
         slope = 1, intercept = 0, linewidth = 1.0,
         linetype = 2, color = grey(0.5)
     ) +
-    facet_wrap(~sample, ncol = 2) +
     lims(x = c(0, 100), y = c(0, 100)) +
     labs(x = "% of barcodes", y = "% of reads") +
+    facet_wrap(~sample, ncol = 4) +
     custom_theme()
 
 save_plot(plot_cumulative_read_count, width = figwidth, height = figheight2)
 print(plot_cumulative_read_count)
 ```
 
+- this table shows the area under curve (AUC) for the line plot above
+- an AUC of 0.5 is ideal, an AUC approaching 1.0 is not optimal
+- the 'Gini index' is a score between 0 and 1 measuring population equality
+- it is defined as the `(AUC - AUC_optimal) / AUC_optimal`
+- a Gini score of 0 describes a perfectly equal, a Gini score of 1.0 a perfectly unequal distribution
+
 ```{r, warning = FALSE}
 calc_auc <- function(x, y) {
     sum(diff(x) * (head(y, -1) + tail(y, -1))) / 2
 }
 df_auc %>%
     dplyr::summarize(
-        auc = calc_auc(pc_barcodes, pc_reads),
-        gini = (auc - 5000) / 5000
+        auc = calc_auc(pc_barcodes, pc_reads)/100^2,
+        auc_optimal = 0.5,
+        gini = (auc - 0.5) / 0.5
     ) %>%
     mutate(quality = case_when(
-        gini <= 0.25 ~ "OK",
-        gini > 0.25 & gini <= 0.5 ~ "inequal read distribution",
-        gini > 0.5 ~ "extremely inequal read distribution"
+        gini <= 0.33 ~ "low inequality (G < 0.33)",
+        gini > 0.33 & gini <= 0.66 ~ "intermediate inequality (0.33 < G < 0.66)",
+        gini > 0.66 ~ "high inequality (G > 0.66)"
     ))
 ```
 
 
 ## Sample and replicate correlation coefficent (R)
 
+- figure shows heat map with correlation coefficient R (-1 < R < 1)
+- correlation coefficient shows how strongly read abundance is correlated
+
 ```{r, fig.width = 7.5, fig.height = 7, warning = FALSE}
 df_correlation <- df_counts %>%
     tidyr::pivot_wider(names_from = "sample", values_from = "n_reads") %>%
@@ -356,6 +413,12 @@ print(plot_replicate_correlation)
 
 ## Sample and replicate similarity with PCA
 
+- this figure shows sample similarity similar to above figure
+- uses principal component analysis (PCA) to reduce the multidimensional data to 3 main dimensions
+- plotted are principal component 1 (x-axis), 2 (y-axis) and 3 (point size)
+- replicates for same samples should cluster together
+- outliers should be easily visible
+
 ```{r, fig.width = 7, fig.height = 7, warning = FALSE}
 pca_result <- df_counts %>%
     tidyr::pivot_wider(names_from = "sample", values_from = "n_reads") %>%
@@ -388,7 +451,7 @@ print(plot_replicate_pca)
 
 The template for this report is located in `./nf-core-crispriscreen/bin/counts_summary.Rmd`.
 
-Date: 2022-04-28
+Date: 2024-04-10
 
 Author: Michael Jahn, PhD
 

diff --git a/bin/fitness_summary.Rmd b/bin/fitness_summary.Rmd
@@ -47,14 +47,14 @@ input_pattern <- "result.Rdata"
 # check input files and compile small table
 input_files <- grep(list.files(wd), pattern = input_pattern, value = TRUE)
 if (length(input_files) == 1) {
+    eval_chunks <- TRUE
     df_input <- bind_cols(
         data.frame(input_files),
         file.info(paste(wd, input_files, sep = "/"))
     ) %>%
         mutate(size = paste(size / 1e6, "MB"))
     rownames(df_input) <- NULL
     df_input[c(1, 2, 3, 4, 5, 10)]
-    eval_chunks <- TRUE
 } else {
     print(paste0("Required input file(s) '", input_pattern, "' were not found."))
     print("Further processing of the analysis is omitted.")
@@ -78,7 +78,7 @@ print("Import of fitness table complete.")
 # list of samples + generic options
 list_conditions <- unique(DESeq_result_table$condition)
 figwidth <- 9
-figheight <- round(2 + (length(list_conditions)))
+figheight <- round(3.5 + (length(list_conditions)))
 
 # output sample table
 df_conditions <- DESeq_result_table %>%
@@ -216,13 +216,13 @@ print(plot_sgrna_correlation_hist)
 ## Barcode efficiency
 
 - Plots show the _efficiency_ for each barcode in relation to other barcodes for the same gene
-- efficiency here means the ability to of a mutant to cause an effect
+- efficiency here means the ability of a mutant to cause an effect
 - correlation is broken down by position/index of the barcode
 - this representation can reveal if certain positions (i.e. distant from TSS) are less effective in causing a fitness effect
+- indices/positions with less than 3% of all barcodes are removed
 
 ```{r, fig.width = figwidth, fig.height = 4.5, warning = FALSE, eval = eval_chunks}
 plot_sgrna_efficiency <- DESeq_result_table %>%
-    # filter indices that have lower frequency than 3%
     group_by(sgRNA_index) %>%
     mutate(n_obs = n()) %>%
     ungroup() %>%
@@ -253,7 +253,6 @@ print(plot_sgrna_efficiency)
 
 ```{r, fig.width = figwidth, fig.height = 4.5, warning = FALSE, eval = eval_chunks}
 plot_sgrna_efficiency_hist <- DESeq_result_table %>%
-    # filter indices that have lower frequency than 3%
     group_by(sgRNA_index) %>%
     mutate(n_obs = n()) %>%
     ungroup() %>%
@@ -319,7 +318,7 @@ print(plot_gene_fitness)
 - This plot shows the global distribution of effect size versus significance score
 - Effect size is the weighted mean fitness of all barcodes
 - Significance is the adjusted p-value from 2-sided Wilcoxon rank sum test
-- This plot is useful to judge how many genes are _significant_ enriched or depleted
+- This plot is useful to judge how many genes are _significantly_ enriched or depleted
 
 ```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE, eval = eval_chunks}
 plot_fitness_vs_pval <- DESeq_result_table %>%