Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add summary plots to guide notebook for Ewing annotation #1008

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ plot_density <- function(classification_df,
gene_exp_column,
annotation_column) {
# pull out gene set name to create the plot title
geneset_name <- stringr::str_remove(gene_exp_column, "_sum|mean-")
geneset_name <- stringr::str_remove(gene_exp_column, "_sum|mean-|auc_")

plot <- ggplot(classification_df) +
aes(x = !!sym(gene_exp_column)) + # marker gene set column
Expand Down Expand Up @@ -361,13 +361,15 @@ plot_density <- function(classification_df,

# UMAPs ------------------------------------------------------------------------

# creates a faceted UMAP where each panel shows the cell type of interest in color and
# creates a faceted UMAP where each panel shows the cell type of interest in red and
# all other cells in grey
# adapted from `faceted_umap()` function in `celltypes_qc.rmd` from `scpca-nf`
# https://github.com/AlexsLemonade/scpca-nf/blob/main/templates/qc_report/celltypes_qc.rmd#L143

plot_faceted_umap <- function(classification_df,
annotation_column) {
annotation_column,
legend_title = "Cell type") {

ggplot(classification_df, aes(x = UMAP1, y = UMAP2, color = {{ annotation_column }})) +
# set points for all "other" points
geom_point(
Expand All @@ -379,18 +381,17 @@ plot_faceted_umap <- function(classification_df,
size = 0.1
) +
# set points for desired cell type
geom_point(size = 0.1, alpha = 0.5) +
geom_point(size = 0.1, alpha = 0.5, color = "red") +
facet_wrap(
vars({{ annotation_column }}),
ncol = 3
) +
scale_color_brewer(palette = "Dark2") +
# remove axis numbers and background grid
scale_x_continuous(labels = NULL, breaks = NULL) +
scale_y_continuous(labels = NULL, breaks = NULL) +
guides(
color = guide_legend(
title = "Cell type",
title = legend_title,
# more visible points in legend
override.aes = list(
alpha = 1,
Expand All @@ -400,6 +401,7 @@ plot_faceted_umap <- function(classification_df,
) +
theme(
aspect.ratio = 1,
panel.border = element_rect(color = "black", fill = NA)
strip.background = element_rect(fill = "transparent", linewidth = 0.5),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ theme_set(

# set seed
set.seed(2024)

# quiet messages
options(readr.show_col_types = FALSE)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ugh it is so tempting to put this in my ~/.Rprofile 😂

ComplexHeatmap::ht_opt(message = FALSE)
```


Expand Down Expand Up @@ -94,9 +98,15 @@ output_file <- file.path(results_dir, glue::glue("{params$library_id}_celltype-a
setup_functions <- file.path(module_base, "template_notebooks", "utils", "setup-functions.R")
source(setup_functions)

# source in validation functions calculate_mean_markers()
# source in validation functions
# calculate_mean_markers(), plot_faceted_umap()
validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R")
source(validation_functions)

# source in plotting functions
# expression_umap(), cluster_density_plot(), and annotated_exp_heatmap()
plotting_functions <- file.path(module_base, "template_notebooks", "utils", "plotting-functions.R")
source(plotting_functions)
```

```{r}
Expand Down Expand Up @@ -151,15 +161,150 @@ gene_exp_df <- cell_types |>
purrr::reduce(dplyr::inner_join, by = "barcodes")

all_info_df <- all_results_df |>
dplyr::left_join(gene_exp_df, by = "barcodes")
dplyr::left_join(gene_exp_df, by = "barcodes") |>
dplyr::arrange(cluster)
```

## Summary of workflow results

TODO: Insert plots that will summarize findings from each of the workflows
- UMAPs of SingleR, clusters, AUC values and custom gene set means
- Density plots by cluster of AUC values and custom gene set means
- Maybe heatmaps with cluster annotation of AUC scores and custom gene set means
### `aucell-singler-annotation` assignments

The below UMAP shows the top cell types assigned by `SingleR` in the `aucell-singler-annotation.sh` workflow.
The top 7 cell types are shown and all other cell types are grouped together as "All remaining cell types".

```{r, fig.height = 5}
plot_faceted_umap(all_info_df, singler_lumped, legend_title = "SingleR cell types") +
theme(strip.text = element_text(size = 8))
```

### Cluster assignments

The below UMAP shows the cluster assignments output by `evaluate-clusters.sh` using the following parameters:

- Leiden with modularity
- Nearest neighbors: `r params$cluster_nn`
- Resolution: `r params$cluster_res`

```{r, fig.height = 5}
plot_faceted_umap(all_info_df, cluster, legend_title = "Cluster")
```

### `AUCell` results

The below plots show the AUC values determined by `AUCell` and output from `run-aucell-ews-signatures.sh`.
The first plot shows the individual AUC values on the UMAP.

```{r}
# get the individual thresholds determined by AUCell for each msigdb geneset
# we want this so we can show the threshold on the density plots
auc_threshold_df <- all_info_df |>
tidyr::pivot_longer(starts_with("threshold_auc_"), names_to = "geneset", values_to = "threshold") |>
dplyr::mutate(
geneset = stringr::str_remove(geneset, "threshold_auc_")
) |>
dplyr::select(barcodes, geneset, threshold)

# reformat auc data for density plots and UMAPs showing AUC values
auc_df <- all_info_df |>
tidyr::pivot_longer(starts_with("auc_"), names_to = "geneset", values_to = "auc_value") |>
dplyr::mutate(
geneset = stringr::str_remove(geneset, "auc_")
) |>
dplyr::select(barcodes, UMAP1, UMAP2, geneset, auc_value, cluster) |>
dplyr::left_join(auc_threshold_df, by = c("barcodes", "geneset")) |>
dplyr::mutate(
in_geneset = auc_value > threshold
)
```


```{r, fig.height=10, fig.width=10}
expression_umap(auc_df, auc_value, geneset)
```

The below plot shows the distribution of AUC values for each gene set colored based on if the AUC value is above the threshold determined by `AUCell`, indicated with a dotted line.


```{r, fig.height=5}

ggplot(auc_df, aes(x = auc_value, color = in_geneset, fill = in_geneset)) +
geom_density(alpha = 0.5, bw = 0.01) +
facet_wrap(vars(geneset)) +
ggplot2::geom_vline(data = auc_df,
mapping = aes(xintercept = threshold),
lty = 2) +
theme(
aspect.ratio = 1,
strip.background = element_rect(fill = "transparent", linewidth = 0.5),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)
)
```


Here we look at the AUC values for each gene set across all clusters.

```{r, fig.height=10}
auc_columns <- colnames(all_info_df)[which(startsWith(colnames(all_info_df), "auc_"))]
cluster_density_plot(all_info_df, auc_columns, "AUC")
```

The heatmap below shows the AUC values for all cells and all gene sets.
The annotations shown are the `SingleR` cell types and clusters.

```{r}
annotated_exp_heatmap(
all_info_df,
exp_columns = auc_columns,
cell_type_column = "singler_lumped",
cluster_column = "cluster",
legend_title = "AUC"
)
```

### Mean expression of custom gene sets

Below we look at the mean expression of all genes in each of the custom gene sets we have defined both on a UMAP and in a density plot.

```{r, fig.height=10, fig.width=10}
# format expression data for UMAP and density plot
exp_df <- all_info_df |>
tidyr::pivot_longer(ends_with("_mean"), names_to = "geneset", values_to = "mean") |>
dplyr::mutate(tumor_cell_classification = dplyr::if_else(singler_lumped == "tumor", "tumor", "other"))

expression_umap(exp_df, mean, geneset)
```

```{r, fig.height=8}
ggplot(exp_df, aes(x = mean, color = tumor_cell_classification, fill = tumor_cell_classification)) +
geom_density(alpha = 0.5, bw = 0.2) +
facet_wrap(vars(geneset)) +
theme(
aspect.ratio = 1,
strip.background = element_rect(fill = "transparent", linewidth = 0.5),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)
)
```

Here we look at the mean expression for each gene set across all clusters.

```{r, fig.height=7}
mean_exp_columns <- colnames(all_info_df)[which(endsWith(colnames(all_info_df), "_mean"))]
cluster_density_plot(all_info_df, mean_exp_columns, "mean gene expression")
```

The heatmap below shows the mean expression values for all cells and all gene sets.
The annotations shown are the `SingleR` cell types and clusters.

```{r}
annotated_exp_heatmap(
all_info_df,
exp_columns = mean_exp_columns,
cell_type_column = "singler_lumped",
cluster_column = "cluster",
legend_title = "Mean\ngene expression"
)
```


## Re-cluster tumor cells {.manual-exploration}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# These functions are used in `celltype-exploration.Rmd`
# They are used for creating the summary plots in the report


# source in helper plotting functions
# plot_density() and plot_gene_heatmap()
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
module_base <- file.path(repository_base, "analyses", "cell-type-ewings")
validation_helper_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R")
source(validation_helper_functions)


#' Plot expression for a set of genes or gene sets on the UMAP
#' UMAP is faceted by gene or gene set and color corresponds to expression value (mean, sum, AUC, etc.)
#'
#' @param df Data frame with UMAP embeddings and gene set expression value
#' @param color_column Column to use for coloring the points in the UMAP.
#' This should correspond to an expression value, like counts, mean, sum, or AUC
#' @param facet_column Column to use for faceting the UMAP
#' This should correspond to the column with a gene or gene set name
#'
#' @return UMAP faceted by facet_column and colored by color_column
#'
expression_umap <- function(
df,
color_column,
facet_column) {

ggplot(df, aes(x = UMAP1, y = UMAP2, color = {{color_column}})) +
geom_point(size = 0.1, alpha = 0.5) +
scale_color_viridis_c() +
facet_wrap(vars({{facet_column}})) +
# make sure there's a box around every facet
theme(
aspect.ratio = 1,
strip.background = element_rect(fill = "transparent", linewidth = 0.5),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5)
) +
# remove axis numbers and background grid
scale_x_continuous(labels = NULL, breaks = NULL) +
scale_y_continuous(labels = NULL, breaks = NULL)

}


#' Density plot showing expression across clusters
#'
#' @param df Data frame with clusters for each cell and expresion values to be plotted
#' Must contain the `cluster` column
#' @param expression_columns Vector of columns present in the data frame that contain some
#' expression value to show on the x-axis of the density plot, such as AUC values or mean expression
#' @param x_label label to use for labeling all x-axes
#'
#' @return Faceted density plot with clusters as rows and each column from expression_columns
#' as a single panel

cluster_density_plot <- function(
df,
expression_columns,
x_label){

# create density plot for each column and combine into one figure
expression_columns |>
purrr::map(\(column){
plot_density(
df,
column,
"cluster"
) +
labs(y = "Cluster",
x = x_label) +
theme(text = element_text(size = 8))
}) |>
patchwork::wrap_plots(ncol = 2)

}

#' Heatmap with genes or gene sets as rows and cells as columns
#' Values in the heatmap correspond to an expression value or AUC
#' Two annotations will be included, one for cell type and one for clusters
#'
#' @param df Data frame containing exp_columns, cell_type_column, and cluster_column
#' @param exp_columns Vector of column names that contain values to be shown in the heatmap
#' @param cell_type_column Column indicating cell types for each cell in the df
#' @param cluster_column Column indicating cluster assignments for each cell in the df
#' @param legend_title Title to use for heatmap legend
#'
#' @return Heatmap where each exp_column is a row and each cell in the df is a column

annotated_exp_heatmap <- function(
df,
exp_columns,
cell_type_column,
cluster_column,
legend_title
){


# get annotation colors for cell type and cluster
cell_types <- unique(df[[cell_type_column]])
num_cell_types <- length(cell_types)
cell_type_colors <- palette.colors(palette = "Dark2") |>
head(n = num_cell_types) |>
purrr::set_names(cell_types)

clusters <- unique(df[[cluster_column]])
num_clusters <- length(clusters)
# use a larger palette for clusters to account for any cases with a large number of clusters
cluster_colors <- palette.colors(palette = "alphabet") |>
head(n = num_clusters) |>
purrr::set_names(clusters)

# create annotation for heatmap
annotation <- ComplexHeatmap::columnAnnotation(
cell_type = df[[cell_type_column]],
cluster = df[[cluster_column]],
col = list(
cell_type = cell_type_colors,
cluster = cluster_colors
)
)

# build matrix for heatmap cells x gene set/ expression columns
heatmap_mtx <- df |>
dplyr::select(barcodes, all_of(exp_columns)) |>
tibble::column_to_rownames("barcodes") |>
as.matrix() |>
t()
# remove any extra strings from the expression column names
rownames(heatmap_mtx) <- stringr::str_remove(rownames(heatmap_mtx), "auc_|_mean")

# plot heatmap of marker genes
plot_gene_heatmap(heatmap_mtx,
row_title = "",
legend_title = legend_title,
annotation = annotation,
cluster_columns = FALSE
)
}
Loading