diff --git a/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.Rmd new file mode 100644 index 000000000..61ad9b9b0 --- /dev/null +++ b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.Rmd @@ -0,0 +1,291 @@ +--- +title: "Explore consensus cell types" +author: Ally Hawkins +date: "`r Sys.Date()`" +output: + html_notebook: + toc: true + toc_depth: 3 + code_folding: show +--- + +This notebook summarizes the findings from assigning consensus cell type labels to all ScPCA samples. +All results from the `cell-type-consensus` module in `OpenScPCA-nf` must be saved to `results` prior to rendering this notebook. + +```{r packages} +suppressPackageStartupMessages({ + # load required packages + library(ggplot2) +}) + +# Set default ggplot theme +theme_set( + theme_classic() +) +``` + +## Functions + +```{r} +# function to read in project data frames with all cells in a project +# output is a summarized table with total cells per sample, total cells per annotation, and number of cell types +summarize_celltypes <- function(file, id){ + + # read in data + df <- readr::read_tsv(file) + + # get total cell count and number of assigned cell types per library + total_cells_df <- df |> + dplyr::group_by(library_id) |> + dplyr::summarize( + total_cells_per_library = length(library_id), + num_celltypes = length(unique(consensus_annotation)) + ) + + summary_df <- df |> + dplyr::group_by(library_id, consensus_annotation, consensus_ontology) |> + dplyr::summarize(total_cells_per_annotation = length(consensus_annotation)) |> + dplyr::left_join(total_cells_df, by = "library_id") |> + dplyr::mutate( + # add percentage + percent_cells_annotation = round((total_cells_per_annotation / total_cells_per_library) * 100, 2) + ) |> + dplyr::ungroup() + + return(summary_df) + +} +``` + +## Data setup + + +```{r base paths} +# The base path for the OpenScPCA repository, found by its (hidden) .git directory +repository_base <- rprojroot::find_root(rprojroot::is_git_root) +module_base <- file.path(repository_base, "analyses", "cell-type-consensus") + +# results directory with cell-type-consensus +results_dir <- file.path(module_base, "results", "cell-type-consensus") + +# diagnoses table used for labeling plots +diagnoses_file <- file.path(module_base, "sample-info", "project-diagnoses.tsv") +``` + +```{r} +# list all results files +results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE) + +# get project ids from file list +project_ids <- stringr::str_remove(basename(results_files), "_consensus-cell-types.tsv.gz") +names(results_files) <- project_ids + +# remove cell line projects from file list +cell_line_projects <- c("SCPCP000020", "SCPCP000024") +project_ids <- setdiff(project_ids, cell_line_projects) # remove cell line projects +results_files <- results_files[project_ids] +``` + + +```{r, message=FALSE} +# read in diagnoses +diagnoses_df <- readr::read_tsv(diagnoses_file) + + +# read in results and prep data frame for plotting +all_results_df <- results_files |> + purrr::imap(summarize_celltypes) |> + dplyr::bind_rows(.id = "project_id") |> + # add in diagnoses + dplyr::left_join(diagnoses_df, by = "project_id") |> + dplyr::mutate( + # create a label for plotting + project_label = glue::glue("{project_id}:{diagnosis}") + ) + +``` + +## Is it all just Unknown? + +The first thing we will look at is how many of the cells in each sample are categorized as "Unknown", which means no consensus between `SingleR` and `CellAssign` was identified. + +```{r, fig.height=7} +unknown_only <- all_results_df |> + dplyr::filter(consensus_annotation == "Unknown") + +ggplot(unknown_only, aes(x = project_label, y = percent_cells_annotation)) + + ggforce::geom_sina(size = 0.1) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), + plot.margin = margin(10,10,10,10)) + + labs( + x = "", + y = "Percent of cells annotated as Unknown" + ) + +``` + +It looks like we do have some samples that aren't just all "Unknown"! +It definitely varies by project, but for most projects we at least see some proportion of samples with assigned cell types. + +Let's look at how many samples actually have some cells outside of unknown identified. +To do this, we will identify all libraries that only have cells called as "Unknown". + +```{r} +high_tumor_df <- unknown_only |> + dplyr::mutate(no_cells_identified = percent_cells_annotation == 100) |> + dplyr::group_by(project_label) |> + dplyr::summarize(all_unknown = sum(no_cells_identified), + classified_cells = sum(!no_cells_identified), + percentage_unknown = round(all_unknown/(all_unknown + classified_cells)*100, 2), + # add number of libraries for plotting + total_libraries = length(library_id)) |> + # set order for plots + dplyr::mutate(project_label = forcats::fct_reorder(project_label, total_libraries, .desc = TRUE)) +``` + + +Which projects have the highest proportion of samples with all "Unknown"? + +```{r} +# table with percentage of samples +high_tumor_df |> + dplyr::select(project_label, percentage_unknown) |> + dplyr::arrange(desc(percentage_unknown)) + +``` + +It looks like all projects do have cell types identified that are not "Unknown". +However, `SCPCP000011` (retinoblastoma), has a fairly high percentage of samples without any consensus labels. + +## Number of cell types observed + +Below we look at the number of cell types observed in each project for all samples. +This does not include cells labeled as "Unknown". + +```{r, fig.height=10} +num_celltypes_df <- all_results_df |> + # add a new line for facet labels + dplyr::mutate(facet_label = glue::glue("{project_id}\n{diagnosis}")) |> + # remove unknown as a cell type + dplyr::filter(consensus_annotation != "Unknown") |> + dplyr::select(facet_label, library_id, num_celltypes) |> + unique() + +ggplot(num_celltypes_df, aes(x = num_celltypes)) + + geom_histogram(binwidth = 1, center = 0) + + facet_wrap(vars(facet_label), + ncol = 3) + + labs( + x = "Number of cell types" + ) + + theme_bw() +``` + +## Distribution of consensus cell types + +Now we look at the distribution of the cell types in each sample. +For these plots, we will pull out the top 9 cell types for each project. +All other cells will be labeled with "All remaining cell types". + +The top cell types are determined by counting how many libraries each cell type is found in within a project and taking the most frequent types. + +```{r} +plot_df <- all_results_df |> + dplyr::group_by(project_id) |> + dplyr::mutate( + # get most frequently observed cell types across libraries in that project + top_celltypes = forcats::fct_lump_n(consensus_annotation, 9, other_level = "All remaining cell types", ties.method = "first") |> + # sort by frequency + forcats::fct_infreq() |> + # make sure all remaining and unknown are last, use this to assign colors in specific order + forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf) + ) + +# get all unique cell types ordered by frequency +unique_celltypes <- plot_df |> + dplyr::filter(!top_celltypes %in% c("All remaining cell types", "Unknown")) |> + dplyr::pull(top_celltypes) |> + unique() |> + sort() |> + as.character() + +# get color palette +colors <- c( + palette.colors(palette = "alphabet"), + "black", # 1 extra since alphabet is 26 and we have 27, this will be plasma cell which shows up once + "grey60", + "grey95" +) +names(colors) <- c(unique_celltypes, "All remaining cell types", "Unknown") +``` + + +```{r, fig.height=60, fig.width=10} +project_labels <- unique(all_results_df$project_label) + +# stacked bar chart showing the distribution of the top 9 cell types for each project, including Unknown +project_labels |> + purrr::map(\(label){ + + project_df <- plot_df |> + dplyr::filter(project_label == label) |> + dplyr::mutate( + # relevel factors for specific project + top_celltypes = forcats::fct_infreq(top_celltypes) |> + forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf) + ) + + # make a stacked bar chart with top cell types + ggplot(project_df) + + aes( + x = library_id, + y = percent_cells_annotation, + fill = top_celltypes + ) + + geom_col() + + scale_y_continuous(expand = c(0,0)) + + scale_fill_manual(values = colors, name = "cell type") + + ggtitle(label) + + theme(axis.text.x = element_blank()) + + }) |> + patchwork::wrap_plots(ncol = 1) +``` + + +This looks really promising! +A few observations: + +- Cell types identified tend to line up with expectations for the type of tumor. +For example, leukemia libraries have T and B cells, brain tumors have macrophages, and solid tumors have fibroblasts and muscle cells. +- Projects that I would expect to be more difficult to classify (sarcomas, wilms, RB) have fewer cells classified then things like brain and leukemia. +Notably many of the solid tumor projects (4, 5, 12-16, and 23) have a handful of PDX samples where I would expect to see fewer normal cells. + +## Most frequently observed cell types + +The last thing we will do is look at the most frequently observed cell types across all samples. +The below table is ordered by the number of libraries the cell type is observed. + +```{r} +all_results_df |> + dplyr::filter(consensus_annotation != "Unknown") |> + dplyr::group_by(consensus_annotation) |> + dplyr::summarize( + total_libraries = dplyr::n(), + min_percentage = min(percent_cells_annotation), + mean_percentage = round(mean(percent_cells_annotation), 2), + median_percentage = median(percent_cells_annotation), + max_percentage = max(percent_cells_annotation) + ) |> + dplyr::arrange(desc(total_libraries)) + +``` + + +## Session info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` + diff --git a/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html new file mode 100644 index 000000000..e280b8148 --- /dev/null +++ b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html @@ -0,0 +1,2247 @@ + + + + +
+ + + + + + + + + + +This notebook summarizes the findings from assigning consensus cell
+type labels to all ScPCA samples. All results from the
+cell-type-consensus
module in OpenScPCA-nf
+must be saved to results
prior to rendering this
+notebook.
suppressPackageStartupMessages({
+ # load required packages
+ library(ggplot2)
+})
+
+# Set default ggplot theme
+theme_set(
+ theme_classic()
+)
+
+
+
+
+# function to read in project data frames with all cells in a project
+# output is a summarized table with total cells per sample, total cells per annotation, and number of cell types
+summarize_celltypes <- function(file, id){
+
+ # read in data
+ df <- readr::read_tsv(file)
+
+ # get total cell count and number of assigned cell types per library
+ total_cells_df <- df |>
+ dplyr::group_by(library_id) |>
+ dplyr::summarize(
+ total_cells_per_library = length(library_id),
+ num_celltypes = length(unique(consensus_annotation))
+ )
+
+ summary_df <- df |>
+ dplyr::group_by(library_id, consensus_annotation, consensus_ontology) |>
+ dplyr::summarize(total_cells_per_annotation = length(consensus_annotation)) |>
+ dplyr::left_join(total_cells_df, by = "library_id") |>
+ dplyr::mutate(
+ # add percentage
+ percent_cells_annotation = round((total_cells_per_annotation / total_cells_per_library) * 100, 2)
+ ) |>
+ dplyr::ungroup()
+
+ return(summary_df)
+
+}
+
+
+
+
+# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+repository_base <- rprojroot::find_root(rprojroot::is_git_root)
+module_base <- file.path(repository_base, "analyses", "cell-type-consensus")
+
+# results directory with cell-type-consensus
+results_dir <- file.path(module_base, "results", "cell-type-consensus")
+
+# diagnoses table used for labeling plots
+diagnoses_file <- file.path(module_base, "sample-info", "project-diagnoses.tsv")
+
+
+
+
+
+
+
+
+# list all results files
+results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE)
+
+# get project ids from file list
+project_ids <- stringr::str_remove(basename(results_files), "_consensus-cell-types.tsv.gz")
+names(results_files) <- project_ids
+
+# remove cell line projects from file list
+cell_line_projects <- c("SCPCP000020", "SCPCP000024")
+project_ids <- setdiff(project_ids, cell_line_projects) # remove cell line projects
+results_files <- results_files[project_ids]
+
+
+
+
+
+
+
+
+# read in diagnoses
+diagnoses_df <- readr::read_tsv(diagnoses_file)
+
+
+# read in results and prep data frame for plotting
+all_results_df <- results_files |>
+ purrr::imap(summarize_celltypes) |>
+ dplyr::bind_rows(.id = "project_id") |>
+ # add in diagnoses
+ dplyr::left_join(diagnoses_df, by = "project_id") |>
+ dplyr::mutate(
+ # create a label for plotting
+ project_label = glue::glue("{project_id}:{diagnosis}")
+ )
+
+
+
+
+
+The first thing we will look at is how many of the cells in each
+sample are categorized as “Unknown”, which means no consensus between
+SingleR
and CellAssign
was identified.
unknown_only <- all_results_df |>
+ dplyr::filter(consensus_annotation == "Unknown")
+
+ggplot(unknown_only, aes(x = project_label, y = percent_cells_annotation)) +
+ ggforce::geom_sina(size = 0.1) +
+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
+ plot.margin = margin(10,10,10,10)) +
+ labs(
+ x = "",
+ y = "Percent of cells annotated as Unknown"
+ )
+
+
+
+
+NA
+
+
+
+
+It looks like we do have some samples that aren’t just all “Unknown”! +It definitely varies by project, but for most projects we at least see +some proportion of samples with assigned cell types.
+Let’s look at how many samples actually have some cells outside of +unknown identified. To do this, we will identify all libraries that only +have cells called as “Unknown”.
+ + + + +high_tumor_df <- unknown_only |>
+ dplyr::mutate(no_cells_identified = percent_cells_annotation == 100) |>
+ dplyr::group_by(project_label) |>
+ dplyr::summarize(all_unknown = sum(no_cells_identified),
+ classified_cells = sum(!no_cells_identified),
+ percentage_unknown = round(all_unknown/(all_unknown + classified_cells)*100, 2),
+ # add number of libraries for plotting
+ total_libraries = length(library_id)) |>
+ # set order for plots
+ dplyr::mutate(project_label = forcats::fct_reorder(project_label, total_libraries, .desc = TRUE))
+
+
+
+
+Which projects have the highest proportion of samples with all +“Unknown”?
+ + + + +# table with percentage of samples
+high_tumor_df |>
+ dplyr::select(project_label, percentage_unknown) |>
+ dplyr::arrange(desc(percentage_unknown))
+
+
+
+
+NA
+
+
+
+
+It looks like all projects do have cell types identified that are not
+“Unknown”. However, SCPCP000011
(retinoblastoma), has a
+fairly high percentage of samples without any consensus labels.
Below we look at the number of cell types observed in each project +for all samples. This does not include cells labeled as “Unknown”.
+ + + + +num_celltypes_df <- all_results_df |>
+ # add a new line for facet labels
+ dplyr::mutate(facet_label = glue::glue("{project_id}\n{diagnosis}")) |>
+ # remove unknown as a cell type
+ dplyr::filter(consensus_annotation != "Unknown") |>
+ dplyr::select(facet_label, library_id, num_celltypes) |>
+ unique()
+
+ggplot(num_celltypes_df, aes(x = num_celltypes)) +
+ geom_histogram(binwidth = 1, center = 0) +
+ facet_wrap(vars(facet_label),
+ ncol = 3) +
+ labs(
+ x = "Number of cell types"
+ ) +
+ theme_bw()
+
+
+
+
+Now we look at the distribution of the cell types in each sample. For +these plots, we will pull out the top 9 cell types for each project. All +other cells will be labeled with “All remaining cell types”.
+The top cell types are determined by counting how many libraries each +cell type is found in within a project and taking the most frequent +types.
+ + + + +plot_df <- all_results_df |>
+ dplyr::group_by(project_id) |>
+ dplyr::mutate(
+ # get most frequently observed cell types across libraries in that project
+ top_celltypes = forcats::fct_lump_n(consensus_annotation, 9, other_level = "All remaining cell types", ties.method = "first") |>
+ # sort by frequency
+ forcats::fct_infreq() |>
+ # make sure all remaining and unknown are last, use this to assign colors in specific order
+ forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf)
+ )
+
+
+
+Warning: There was 1 warning in `dplyr::mutate()`.
+ℹ In argument: `top_celltypes = forcats::fct_relevel(...)`.
+ℹ In group 19: `project_id = "SCPCP000021"`.
+Caused by warning:
+! 1 unknown level in `f`: All remaining cell types
+
+
+
+# get all unique cell types ordered by frequency
+unique_celltypes <- plot_df |>
+ dplyr::filter(!top_celltypes %in% c("All remaining cell types", "Unknown")) |>
+ dplyr::pull(top_celltypes) |>
+ unique() |>
+ sort() |>
+ as.character()
+
+# get color palette
+colors <- c(
+ palette.colors(palette = "alphabet"),
+ "black", # 1 extra since alphabet is 26 and we have 27, this will be plasma cell which shows up once
+ "grey60",
+ "grey95"
+)
+names(colors) <- c(unique_celltypes, "All remaining cell types", "Unknown")
+
+
+
+
+
+
+
+
+project_labels <- unique(all_results_df$project_label)
+
+# stacked bar chart showing the distribution of the top 9 cell types for each project, including Unknown
+project_labels |>
+ purrr::map(\(label){
+
+ project_df <- plot_df |>
+ dplyr::filter(project_label == label) |>
+ dplyr::mutate(
+ # relevel factors for specific project
+ top_celltypes = forcats::fct_infreq(top_celltypes) |>
+ forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf)
+ )
+
+ # make a stacked bar chart with top cell types
+ ggplot(project_df) +
+ aes(
+ x = library_id,
+ y = percent_cells_annotation,
+ fill = top_celltypes
+ ) +
+ geom_col() +
+ scale_y_continuous(expand = c(0,0)) +
+ scale_fill_manual(values = colors, name = "cell type") +
+ ggtitle(label) +
+ theme(axis.text.x = element_blank())
+
+ }) |>
+ patchwork::wrap_plots(ncol = 1)
+
+
+
+
+This looks really promising! A few observations:
+The last thing we will do is look at the most frequently observed +cell types across all samples. The below table is ordered by the number +of libraries the cell type is observed.
+ + + + +all_results_df |>
+ dplyr::filter(consensus_annotation != "Unknown") |>
+ dplyr::group_by(consensus_annotation) |>
+ dplyr::summarize(
+ total_libraries = dplyr::n(),
+ min_percentage = min(percent_cells_annotation),
+ mean_percentage = round(mean(percent_cells_annotation), 2),
+ median_percentage = median(percent_cells_annotation),
+ max_percentage = max(percent_cells_annotation)
+ ) |>
+ dplyr::arrange(desc(total_libraries))
+
+
+
+
+NA
+
+
+
+
+# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+
+
+
+
+
+