Skip to content

Commit

Permalink
feat: reports with more verbose description of results, improved plots
Browse files Browse the repository at this point in the history
  • Loading branch information
m-jahn committed Apr 10, 2024
1 parent 5e3bb29 commit bfc0a32
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 30 deletions.
111 changes: 87 additions & 24 deletions bin/counts_summary.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ print("Import of counts table complete.")
list_samples <- unique(df_counts$sample)
figwidth <- 9
figheight <- round(1 + (length(list_samples) / 4))
figheight2 <- 3 * figheight
figheight2 <- 1.6 * figheight
# output sample table
test <- df_counts %>%
Expand Down Expand Up @@ -155,6 +155,9 @@ save_plot <- function(pl, path = "", width = 6, height = 6) {

## Total number of mapped reads per sample

- figure shows the total number of mapped reads used for fitness score calculation
- all other reads that were filtered out during preprocessing (low quality, low mapping score) are not included

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
plot_total_mapped_reads <- df_counts %>%
dplyr::group_by(sample) %>%
Expand All @@ -171,6 +174,9 @@ print(plot_total_mapped_reads)

## Number of individual barcodes per sample

- figure shows the number of individual, unique barcodes per sample
- depnding on `bowtie` settings, a certain number of mismatches in barcodes are allowed

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
plot_individual_barcodes <- df_counts %>%
dplyr::group_by(sample) %>%
Expand All @@ -187,6 +193,9 @@ print(plot_individual_barcodes)

## Number of missing barcodes per sample

- figure shows the number of missing barcodes per sample
- this number is determined from the number of total encountered barcodes across all samples

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
plot_missing_barcodes <- df_counts %>%
dplyr::group_by(sample) %>%
Expand All @@ -203,38 +212,60 @@ print(plot_missing_barcodes)

## Number of barcodes per gene, per sample

- figure shows the number of barcodes per gene, per sample
- the x-axis shows the number of genes with N barcodes, broken down by sample
- barcodes without mapped reads for the respective sample are removed
- for example a colored bar with label `≤ 2` shows number of genes with less or equal than 2 barcodes

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
plot_barcodes_gene_sample <- df_counts %>%
df_barcodes_per_gene <- df_counts %>%
dplyr::filter(n_reads > 0) %>%
dplyr::group_by(sample, Gene) %>%
dplyr::summarize(`unique barcodes` = length(unique(sgRNA))) %>%
dplyr::summarize(`unique barcodes` = length(unique(sgRNA)), .groups = "drop") %>%
dplyr::mutate(`unique barcodes` = cut(`unique barcodes`, breaks = pretty(`unique barcodes`))) %>%
dplyr::group_by(sample) %>%
dplyr::count(`unique barcodes`) %>%
ggplot(aes(x = sample, y = n, fill = factor(`unique barcodes`))) +
dplyr::mutate(`unique barcodes` = as.numeric(gsub("\\([0-9]*,|\\]", "", `unique barcodes`))) %>%
dplyr::ungroup() %>%
dplyr::arrange(sample, n)
plot_barcodes_gene_sample <- df_barcodes_per_gene %>%
ggplot(aes(
x = sample, y = n,
fill = factor(`unique barcodes`),
label = paste0("≤ ", `unique barcodes`)
)) +
geom_col(alpha = 0.7) +
geom_text(aes(
x = sample,
y = unlist((tapply(n, sample, function(x) cumsum(rev(x)) - (rev(x) / 2)))),
label = rev(`unique barcodes`)
), color = "white") +
geom_text(position = position_stack(vjust = 0.5), color = "white") +
labs(x = "") +
coord_flip() +
scale_fill_manual(values = colorRampPalette(custom_colors[1:5])(9)) +
scale_fill_manual(values = colorRampPalette(
custom_colors[1:5])(length(unique(df_barcodes_per_gene[["unique barcodes"]])))
) +
custom_theme(legend.position = "none")
save_plot(plot_barcodes_gene_sample, width = figwidth, height = figheight)
print(plot_barcodes_gene_sample)
```

## Read count distribution, per sample (capped at 1000 barcodes)
## Read count distribution, violin plot

- figure shows the read count distribution per sample and barcode
- read count per barcode is only shown for the first 1000 barcodes to reduce processing time
- barcodes without mapped reads for the respective sample are removed
- read count is log 10 transformed (0 -> 1, 1 -> 10, 2 -> 100, ...)

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
plot_count_dist <- df_counts %>%
dplyr::filter(n_reads > 0) %>%
dplyr::group_by(sample) %>%
dplyr::slice(1:1000) %>%
ggplot(aes(x = sample, y = log10(n_reads))) +
geom_violin(
trim = FALSE, fill = custom_colors[1],
alpha = 0.7, col = "white"
) +
labs(y = expression("log"[10] * " reads per barcode")) +
coord_flip() +
stat_summary(fun.data = mean_sdl, geom = "pointrange", size = 0.5, col = grey(0.3)) +
custom_theme()
Expand All @@ -243,14 +274,18 @@ save_plot(plot_count_dist, width = figwidth, height = figheight)
print(plot_count_dist)
```

## Number of reads per barcode, per sample
## Read count distribution, histogram

- figure shows the same data as above, but with full set of barcodes per sample
- barcodes without mapped reads for the respective sample are removed
- read count is log 10 transformed (0 -> 1, 1 -> 10, 2 -> 100, ...)

```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
plot_reads_per_bc <- df_counts %>%
ggplot(aes(x = log2(n_reads))) +
ggplot(aes(x = log10(n_reads))) +
geom_histogram(fill = custom_colors[1], alpha = 0.7, bins = 30) +
labs(y = "", x = expression("log"[2] * " reads per barcode")) +
facet_wrap(~sample, ncol = 2) +
labs(y = "", x = expression("log"[10] * " reads per barcode")) +
facet_wrap(~sample, ncol = 4) +
custom_theme()
save_plot(plot_reads_per_bc, width = figwidth, height = figheight2)
Expand All @@ -259,6 +294,12 @@ print(plot_reads_per_bc)

## Top 10 most abundant barcodes, per sample

- figure shows top 10 barcodes ranked by read count
- ideally, initial time point samples show high equality of barcode abundance
- extremely high abundance of very few barcodes is a sign of few mutants outcompeting the population
- this can happen when libraries are (pre-) cultivated for too long periods of time
- can lead to downstream problems as it reduces library diversity (depletes low abundant mutants)

```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
plot_top10_barcodes <- df_counts %>%
dplyr::group_by(sample) %>%
Expand All @@ -268,7 +309,7 @@ plot_top10_barcodes <- df_counts %>%
ggplot(aes(x = factor(rank), y = n_reads)) +
geom_col(fill = custom_colors[1], alpha = 0.7, width = 1) +
labs(y = "n reads", x = "barcodes ranked by abundance") +
facet_wrap(~sample, ncol = 2) +
facet_wrap(~sample, ncol = 4) +
custom_theme()
save_plot(plot_top10_barcodes, width = figwidth, height = figheight2)
Expand All @@ -277,6 +318,12 @@ print(plot_top10_barcodes)

## Cumulative read count distribution and barcode diversity

- figure shows the barcode diversity by plotting fraction of reads (%) vs fraction of barcodes (%)
- the ideal library has high diversity and equal distribution of barcodes for initial time points
- such a distribution would follow the diagonal dashed grey line
- if reads per barcode (red line) are not well distributed, `% of reads` (y-axis) shows a steep ascent
- this means very few barcodes contribute to almost all reads

```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
df_auc <- df_counts %>%
dplyr::arrange(sample, desc(n_reads)) %>%
Expand All @@ -292,39 +339,49 @@ df_auc <- df_counts %>%
plot_cumulative_read_count <- df_auc %>%
ggplot(aes(x = pc_barcodes, y = pc_reads, group = sample)) +
geom_line(linewidth = 1.0, color = custom_colors[1]) +
geom_step(linewidth = 1.0, color = custom_colors[1]) +
geom_abline(
slope = 1, intercept = 0, linewidth = 1.0,
linetype = 2, color = grey(0.5)
) +
facet_wrap(~sample, ncol = 2) +
lims(x = c(0, 100), y = c(0, 100)) +
labs(x = "% of barcodes", y = "% of reads") +
facet_wrap(~sample, ncol = 4) +
custom_theme()
save_plot(plot_cumulative_read_count, width = figwidth, height = figheight2)
print(plot_cumulative_read_count)
```

- this table shows the area under curve (AUC) for the line plot above
- an AUC of 0.5 is ideal, an AUC approaching 1.0 is not optimal
- the 'Gini index' is a score between 0 and 1 measuring population equality
- it is defined as the `(AUC - AUC_optimal) / AUC_optimal`
- a Gini score of 0 describes a perfectly equal, a Gini score of 1.0 a perfectly unequal distribution

```{r, warning = FALSE}
calc_auc <- function(x, y) {
sum(diff(x) * (head(y, -1) + tail(y, -1))) / 2
}
df_auc %>%
dplyr::summarize(
auc = calc_auc(pc_barcodes, pc_reads),
gini = (auc - 5000) / 5000
auc = calc_auc(pc_barcodes, pc_reads)/100^2,
auc_optimal = 0.5,
gini = (auc - 0.5) / 0.5
) %>%
mutate(quality = case_when(
gini <= 0.25 ~ "OK",
gini > 0.25 & gini <= 0.5 ~ "inequal read distribution",
gini > 0.5 ~ "extremely inequal read distribution"
gini <= 0.33 ~ "low inequality (G < 0.33)",
gini > 0.33 & gini <= 0.66 ~ "intermediate inequality (0.33 < G < 0.66)",
gini > 0.66 ~ "high inequality (G > 0.66)"
))
```


## Sample and replicate correlation coefficent (R)

- figure shows heat map with correlation coefficient R (-1 < R < 1)
- correlation coefficient shows how strongly read abundance is correlated

```{r, fig.width = 7.5, fig.height = 7, warning = FALSE}
df_correlation <- df_counts %>%
tidyr::pivot_wider(names_from = "sample", values_from = "n_reads") %>%
Expand Down Expand Up @@ -356,6 +413,12 @@ print(plot_replicate_correlation)

## Sample and replicate similarity with PCA

- this figure shows sample similarity similar to above figure
- uses principal component analysis (PCA) to reduce the multidimensional data to 3 main dimensions
- plotted are principal component 1 (x-axis), 2 (y-axis) and 3 (point size)
- replicates for same samples should cluster together
- outliers should be easily visible

```{r, fig.width = 7, fig.height = 7, warning = FALSE}
pca_result <- df_counts %>%
tidyr::pivot_wider(names_from = "sample", values_from = "n_reads") %>%
Expand Down Expand Up @@ -388,7 +451,7 @@ print(plot_replicate_pca)

The template for this report is located in `./nf-core-crispriscreen/bin/counts_summary.Rmd`.

Date: 2022-04-28
Date: 2024-04-10

Author: Michael Jahn, PhD

Expand Down
11 changes: 5 additions & 6 deletions bin/fitness_summary.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,14 @@ input_pattern <- "result.Rdata"
# check input files and compile small table
input_files <- grep(list.files(wd), pattern = input_pattern, value = TRUE)
if (length(input_files) == 1) {
eval_chunks <- TRUE
df_input <- bind_cols(
data.frame(input_files),
file.info(paste(wd, input_files, sep = "/"))
) %>%
mutate(size = paste(size / 1e6, "MB"))
rownames(df_input) <- NULL
df_input[c(1, 2, 3, 4, 5, 10)]
eval_chunks <- TRUE
} else {
print(paste0("Required input file(s) '", input_pattern, "' were not found."))
print("Further processing of the analysis is omitted.")
Expand All @@ -78,7 +78,7 @@ print("Import of fitness table complete.")
# list of samples + generic options
list_conditions <- unique(DESeq_result_table$condition)
figwidth <- 9
figheight <- round(2 + (length(list_conditions)))
figheight <- round(3.5 + (length(list_conditions)))
# output sample table
df_conditions <- DESeq_result_table %>%
Expand Down Expand Up @@ -216,13 +216,13 @@ print(plot_sgrna_correlation_hist)
## Barcode efficiency

- Plots show the _efficiency_ for each barcode in relation to other barcodes for the same gene
- efficiency here means the ability to of a mutant to cause an effect
- efficiency here means the ability of a mutant to cause an effect
- correlation is broken down by position/index of the barcode
- this representation can reveal if certain positions (i.e. distant from TSS) are less effective in causing a fitness effect
- indices/positions with less than 3% of all barcodes are removed

```{r, fig.width = figwidth, fig.height = 4.5, warning = FALSE, eval = eval_chunks}
plot_sgrna_efficiency <- DESeq_result_table %>%
# filter indices that have lower frequency than 3%
group_by(sgRNA_index) %>%
mutate(n_obs = n()) %>%
ungroup() %>%
Expand Down Expand Up @@ -253,7 +253,6 @@ print(plot_sgrna_efficiency)

```{r, fig.width = figwidth, fig.height = 4.5, warning = FALSE, eval = eval_chunks}
plot_sgrna_efficiency_hist <- DESeq_result_table %>%
# filter indices that have lower frequency than 3%
group_by(sgRNA_index) %>%
mutate(n_obs = n()) %>%
ungroup() %>%
Expand Down Expand Up @@ -319,7 +318,7 @@ print(plot_gene_fitness)
- This plot shows the global distribution of effect size versus significance score
- Effect size is the weighted mean fitness of all barcodes
- Significance is the adjusted p-value from 2-sided Wilcoxon rank sum test
- This plot is useful to judge how many genes are _significant_ enriched or depleted
- This plot is useful to judge how many genes are _significantly_ enriched or depleted

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE, eval = eval_chunks}
plot_fitness_vs_pval <- DESeq_result_table %>%
Expand Down

0 comments on commit bfc0a32

Please sign in to comment.