diff --git a/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.Rmd b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.Rmd new file mode 100644 index 000000000..61ad9b9b0 --- /dev/null +++ b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.Rmd @@ -0,0 +1,291 @@ +--- +title: "Explore consensus cell types" +author: Ally Hawkins +date: "`r Sys.Date()`" +output: + html_notebook: + toc: true + toc_depth: 3 + code_folding: show +--- + +This notebook summarizes the findings from assigning consensus cell type labels to all ScPCA samples. +All results from the `cell-type-consensus` module in `OpenScPCA-nf` must be saved to `results` prior to rendering this notebook. + +```{r packages} +suppressPackageStartupMessages({ + # load required packages + library(ggplot2) +}) + +# Set default ggplot theme +theme_set( + theme_classic() +) +``` + +## Functions + +```{r} +# function to read in project data frames with all cells in a project +# output is a summarized table with total cells per sample, total cells per annotation, and number of cell types +summarize_celltypes <- function(file, id){ + + # read in data + df <- readr::read_tsv(file) + + # get total cell count and number of assigned cell types per library + total_cells_df <- df |> + dplyr::group_by(library_id) |> + dplyr::summarize( + total_cells_per_library = length(library_id), + num_celltypes = length(unique(consensus_annotation)) + ) + + summary_df <- df |> + dplyr::group_by(library_id, consensus_annotation, consensus_ontology) |> + dplyr::summarize(total_cells_per_annotation = length(consensus_annotation)) |> + dplyr::left_join(total_cells_df, by = "library_id") |> + dplyr::mutate( + # add percentage + percent_cells_annotation = round((total_cells_per_annotation / total_cells_per_library) * 100, 2) + ) |> + dplyr::ungroup() + + return(summary_df) + +} +``` + +## Data setup + + +```{r base paths} +# The base path for the OpenScPCA 
repository, found by its (hidden) .git directory +repository_base <- rprojroot::find_root(rprojroot::is_git_root) +module_base <- file.path(repository_base, "analyses", "cell-type-consensus") + +# results directory with cell-type-consensus +results_dir <- file.path(module_base, "results", "cell-type-consensus") + +# diagnoses table used for labeling plots +diagnoses_file <- file.path(module_base, "sample-info", "project-diagnoses.tsv") +``` + +```{r} +# list all results files +results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE) + +# get project ids from file list +project_ids <- stringr::str_remove(basename(results_files), "_consensus-cell-types.tsv.gz") +names(results_files) <- project_ids + +# remove cell line projects from file list +cell_line_projects <- c("SCPCP000020", "SCPCP000024") +project_ids <- setdiff(project_ids, cell_line_projects) # remove cell line projects +results_files <- results_files[project_ids] +``` + + +```{r, message=FALSE} +# read in diagnoses +diagnoses_df <- readr::read_tsv(diagnoses_file) + + +# read in results and prep data frame for plotting +all_results_df <- results_files |> + purrr::imap(summarize_celltypes) |> + dplyr::bind_rows(.id = "project_id") |> + # add in diagnoses + dplyr::left_join(diagnoses_df, by = "project_id") |> + dplyr::mutate( + # create a label for plotting + project_label = glue::glue("{project_id}:{diagnosis}") + ) + +``` + +## Is it all just Unknown? + +The first thing we will look at is how many of the cells in each sample are categorized as "Unknown", which means no consensus between `SingleR` and `CellAssign` was identified. 
+ +```{r, fig.height=7} +unknown_only <- all_results_df |> + dplyr::filter(consensus_annotation == "Unknown") + +ggplot(unknown_only, aes(x = project_label, y = percent_cells_annotation)) + + ggforce::geom_sina(size = 0.1) + + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), + plot.margin = margin(10,10,10,10)) + + labs( + x = "", + y = "Percent of cells annotated as Unknown" + ) + +``` + +It looks like we do have some samples that aren't just all "Unknown"! +It definitely varies by project, but for most projects we at least see some proportion of samples with assigned cell types. + +Let's look at how many samples actually have some cells outside of unknown identified. +To do this, we will identify all libraries that only have cells called as "Unknown". + +```{r} +high_tumor_df <- unknown_only |> + dplyr::mutate(no_cells_identified = percent_cells_annotation == 100) |> + dplyr::group_by(project_label) |> + dplyr::summarize(all_unknown = sum(no_cells_identified), + classified_cells = sum(!no_cells_identified), + percentage_unknown = round(all_unknown/(all_unknown + classified_cells)*100, 2), + # add number of libraries for plotting + total_libraries = length(library_id)) |> + # set order for plots + dplyr::mutate(project_label = forcats::fct_reorder(project_label, total_libraries, .desc = TRUE)) +``` + + +Which projects have the highest proportion of samples with all "Unknown"? + +```{r} +# table with percentage of samples +high_tumor_df |> + dplyr::select(project_label, percentage_unknown) |> + dplyr::arrange(desc(percentage_unknown)) + +``` + +It looks like all projects do have cell types identified that are not "Unknown". +However, `SCPCP000011` (retinoblastoma), has a fairly high percentage of samples without any consensus labels. + +## Number of cell types observed + +Below we look at the number of cell types observed in each project for all samples. +This does not include cells labeled as "Unknown". 
+ +```{r, fig.height=10} +num_celltypes_df <- all_results_df |> + # add a new line for facet labels + dplyr::mutate(facet_label = glue::glue("{project_id}\n{diagnosis}")) |> + # remove unknown as a cell type + dplyr::filter(consensus_annotation != "Unknown") |> + dplyr::select(facet_label, library_id, num_celltypes) |> + unique() + +ggplot(num_celltypes_df, aes(x = num_celltypes)) + + geom_histogram(binwidth = 1, center = 0) + + facet_wrap(vars(facet_label), + ncol = 3) + + labs( + x = "Number of cell types" + ) + + theme_bw() +``` + +## Distribution of consensus cell types + +Now we look at the distribution of the cell types in each sample. +For these plots, we will pull out the top 9 cell types for each project. +All other cells will be labeled with "All remaining cell types". + +The top cell types are determined by counting how many libraries each cell type is found in within a project and taking the most frequent types. + +```{r} +plot_df <- all_results_df |> + dplyr::group_by(project_id) |> + dplyr::mutate( + # get most frequently observed cell types across libraries in that project + top_celltypes = forcats::fct_lump_n(consensus_annotation, 9, other_level = "All remaining cell types", ties.method = "first") |> + # sort by frequency + forcats::fct_infreq() |> + # make sure all remaining and unknown are last, use this to assign colors in specific order + forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf) + ) + +# get all unique cell types ordered by frequency +unique_celltypes <- plot_df |> + dplyr::filter(!top_celltypes %in% c("All remaining cell types", "Unknown")) |> + dplyr::pull(top_celltypes) |> + unique() |> + sort() |> + as.character() + +# get color palette +colors <- c( + palette.colors(palette = "alphabet"), + "black", # 1 extra since alphabet is 26 and we have 27, this will be plasma cell which shows up once + "grey60", + "grey95" +) +names(colors) <- c(unique_celltypes, "All remaining cell types", "Unknown") +``` + + +```{r, 
fig.height=60, fig.width=10} +project_labels <- unique(all_results_df$project_label) + +# stacked bar chart showing the distribution of the top 9 cell types for each project, including Unknown +project_labels |> + purrr::map(\(label){ + + project_df <- plot_df |> + dplyr::filter(project_label == label) |> + dplyr::mutate( + # relevel factors for specific project + top_celltypes = forcats::fct_infreq(top_celltypes) |> + forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf) + ) + + # make a stacked bar chart with top cell types + ggplot(project_df) + + aes( + x = library_id, + y = percent_cells_annotation, + fill = top_celltypes + ) + + geom_col() + + scale_y_continuous(expand = c(0,0)) + + scale_fill_manual(values = colors, name = "cell type") + + ggtitle(label) + + theme(axis.text.x = element_blank()) + + }) |> + patchwork::wrap_plots(ncol = 1) +``` + + +This looks really promising! +A few observations: + +- Cell types identified tend to line up with expectations for the type of tumor. +For example, leukemia libraries have T and B cells, brain tumors have macrophages, and solid tumors have fibroblasts and muscle cells. +- Projects that I would expect to be more difficult to classify (sarcomas, wilms, RB) have fewer cells classified then things like brain and leukemia. +Notably many of the solid tumor projects (4, 5, 12-16, and 23) have a handful of PDX samples where I would expect to see fewer normal cells. + +## Most frequently observed cell types + +The last thing we will do is look at the most frequently observed cell types across all samples. +The below table is ordered by the number of libraries the cell type is observed. 
+ +```{r} +all_results_df |> + dplyr::filter(consensus_annotation != "Unknown") |> + dplyr::group_by(consensus_annotation) |> + dplyr::summarize( + total_libraries = dplyr::n(), + min_percentage = min(percent_cells_annotation), + mean_percentage = round(mean(percent_cells_annotation), 2), + median_percentage = median(percent_cells_annotation), + max_percentage = max(percent_cells_annotation) + ) |> + dplyr::arrange(desc(total_libraries)) + +``` + + +## Session info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` + diff --git a/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html new file mode 100644 index 000000000..e280b8148 --- /dev/null +++ b/analyses/cell-type-consensus/exploratory-notebooks/02-explore-consensus-results.nb.html @@ -0,0 +1,2247 @@ + + + + + + + + + + + + + + + +Explore consensus cell types + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ +
+ + +

This notebook summarizes the findings from assigning consensus cell +type labels to all ScPCA samples. All results from the +cell-type-consensus module in OpenScPCA-nf +must be saved to results prior to rendering this +notebook.

+ + + + +
suppressPackageStartupMessages({
+  # load required packages
+  library(ggplot2)
+})
+
+# Set default ggplot theme
+theme_set(
+  theme_classic()
+)
+ + + + +
+

Functions

+ + + + +
# function to read in project data frames with all cells in a project
+# output is a summarized table with total cells per sample, total cells per annotation, and number of cell types 
+summarize_celltypes <- function(file, id){
+  
+  # read in data
+  df <- readr::read_tsv(file) 
+  
+  # get total cell count and number of assigned cell types per library
+  total_cells_df <- df |> 
+    dplyr::group_by(library_id) |> 
+    dplyr::summarize(
+      total_cells_per_library = length(library_id),
+      num_celltypes = length(unique(consensus_annotation))
+    )
+  
+  summary_df <- df |> 
+    dplyr::group_by(library_id, consensus_annotation, consensus_ontology) |> 
+    dplyr::summarize(total_cells_per_annotation = length(consensus_annotation)) |>
+    dplyr::left_join(total_cells_df, by = "library_id") |> 
+    dplyr::mutate(
+      # add percentage 
+      percent_cells_annotation = round((total_cells_per_annotation / total_cells_per_library) * 100, 2)
+    ) |> 
+    dplyr::ungroup()
+  
+  return(summary_df)
+  
+}
+ + + + +
+
+

Data setup

+ + + + +
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+repository_base <- rprojroot::find_root(rprojroot::is_git_root)
+module_base <- file.path(repository_base, "analyses", "cell-type-consensus")
+
+# results directory with cell-type-consensus 
+results_dir <- file.path(module_base, "results", "cell-type-consensus")
+
+# diagnoses table used for labeling plots 
+diagnoses_file <- file.path(module_base, "sample-info", "project-diagnoses.tsv")
+ + + + + + + + +
# list all results files 
+results_files <- list.files(results_dir, pattern = "_consensus-cell-types\\.tsv.\\gz$", full.names = TRUE)
+
+# get project ids from file list  
+project_ids <- stringr::str_remove(basename(results_files), "_consensus-cell-types.tsv.gz")
+names(results_files) <- project_ids
+
+# remove cell line projects from file list
+cell_line_projects <- c("SCPCP000020", "SCPCP000024")
+project_ids <- setdiff(project_ids, cell_line_projects) # remove cell line projects
+results_files <- results_files[project_ids]
+ + + + + + + + +
# read in diagnoses
+diagnoses_df <- readr::read_tsv(diagnoses_file)
+
+
+# read in results and prep data frame for plotting 
+all_results_df <- results_files |> 
+  purrr::imap(summarize_celltypes) |> 
+  dplyr::bind_rows(.id = "project_id") |> 
+  # add in diagnoses 
+  dplyr::left_join(diagnoses_df, by = "project_id") |> 
+  dplyr::mutate(
+    # create a label for plotting
+    project_label = glue::glue("{project_id}:{diagnosis}")
+  )
+
+ + + + +
+
+

Is it all just Unknown?

+

The first thing we will look at is how many of the cells in each +sample are categorized as “Unknown”, which means no consensus between +SingleR and CellAssign was identified.

+ + + + +
unknown_only <- all_results_df |> 
+  dplyr::filter(consensus_annotation == "Unknown")
+
+ggplot(unknown_only, aes(x = project_label, y = percent_cells_annotation)) +
+  ggforce::geom_sina(size = 0.1) +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
+        plot.margin = margin(10,10,10,10)) +
+  labs(
+    x = "", 
+    y = "Percent of cells annotated as Unknown"
+  )
+ + + + +

+ + + + +
NA
+ + + + +

It looks like we do have some samples that aren’t just all “Unknown”! +It definitely varies by project, but for most projects we at least see +some proportion of samples with assigned cell types.

+

Let’s look at how many samples actually have some cells outside of +unknown identified. To do this, we will identify all libraries that only +have cells called as “Unknown”.

+ + + + +
high_tumor_df <- unknown_only |> 
+  dplyr::mutate(no_cells_identified = percent_cells_annotation == 100) |> 
+  dplyr::group_by(project_label) |> 
+  dplyr::summarize(all_unknown = sum(no_cells_identified),
+                   classified_cells = sum(!no_cells_identified),
+                   percentage_unknown = round(all_unknown/(all_unknown + classified_cells)*100, 2),
+                   # add number of libraries for plotting 
+                   total_libraries = length(library_id)) |>
+  # set order for plots 
+  dplyr::mutate(project_label = forcats::fct_reorder(project_label, total_libraries, .desc = TRUE))
+ + + + +

Which projects have the highest proportion of samples with all +“Unknown”?

+ + + + +
# table with percentage of samples 
+high_tumor_df |> 
+  dplyr::select(project_label, percentage_unknown) |> 
+  dplyr::arrange(desc(percentage_unknown))
+ + + + +
+ +
+ + + + +
NA
+ + + + +

It looks like all projects do have cell types identified that are not +“Unknown”. However, SCPCP000011 (retinoblastoma), has a +fairly high percentage of samples without any consensus labels.

+
+
+

Number of cell types observed

+

Below we look at the number of cell types observed in each project +for all samples. This does not include cells labeled as “Unknown”.

+ + + + +
num_celltypes_df <- all_results_df |> 
+  # add a new line for facet labels 
+  dplyr::mutate(facet_label = glue::glue("{project_id}\n{diagnosis}")) |>
+  # remove unknown as a cell type 
+  dplyr::filter(consensus_annotation != "Unknown") |> 
+  dplyr::select(facet_label, library_id, num_celltypes) |> 
+  unique()
+
+ggplot(num_celltypes_df, aes(x = num_celltypes)) +
+  geom_histogram(binwidth = 1, center = 0) +
+  facet_wrap(vars(facet_label), 
+             ncol = 3) +
+  labs(
+    x = "Number of cell types"
+  ) +
+  theme_bw()
+ + + + +

+ + + + +
+
+

Distribution of consensus cell types

+

Now we look at the distribution of the cell types in each sample. For +these plots, we will pull out the top 9 cell types for each project. All +other cells will be labeled with “All remaining cell types”.

+

The top cell types are determined by counting how many libraries each +cell type is found in within a project and taking the most frequent +types.

+ + + + +
plot_df <- all_results_df |> 
+    dplyr::group_by(project_id) |> 
+    dplyr::mutate(
+      # get most frequently observed cell types across libraries in that project 
+      top_celltypes = forcats::fct_lump_n(consensus_annotation, 9, other_level = "All remaining cell types", ties.method = "first") |> 
+        # sort by frequency 
+        forcats::fct_infreq() |> 
+        # make sure all remaining and unknown are last, use this to assign colors in specific order
+        forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf)
+    )
+ + + +
Warning: There was 1 warning in `dplyr::mutate()`.
+ℹ In argument: `top_celltypes = forcats::fct_relevel(...)`.
+ℹ In group 19: `project_id = "SCPCP000021"`.
+Caused by warning:
+! 1 unknown level in `f`: All remaining cell types
+ + + +
# get all unique cell types ordered by frequency 
+unique_celltypes <- plot_df |> 
+  dplyr::filter(!top_celltypes %in% c("All remaining cell types", "Unknown")) |> 
+  dplyr::pull(top_celltypes) |> 
+  unique() |>
+  sort() |> 
+  as.character()
+
+# get color palette
+colors <- c(
+  palette.colors(palette = "alphabet"),
+  "black", # 1 extra since alphabet is 26 and we have 27, this will be plasma cell which shows up once 
+  "grey60", 
+  "grey95"
+)
+names(colors) <- c(unique_celltypes, "All remaining cell types", "Unknown")
+ + + + + + + + +
project_labels <- unique(all_results_df$project_label)
+
+# stacked bar chart showing the distribution of the top 9 cell types for each project, including Unknown
+project_labels |> 
+  purrr::map(\(label){
+    
+    project_df <- plot_df |> 
+      dplyr::filter(project_label == label) |> 
+      dplyr::mutate(
+        # relevel factors for specific project 
+        top_celltypes = forcats::fct_infreq(top_celltypes) |> 
+          forcats::fct_relevel("All remaining cell types", "Unknown", after = Inf)
+      )
+    
+    # make a stacked bar chart with top cell types 
+    ggplot(project_df) + 
+      aes(
+        x = library_id, 
+        y = percent_cells_annotation, 
+        fill = top_celltypes
+      ) +
+      geom_col() + 
+      scale_y_continuous(expand = c(0,0)) +
+      scale_fill_manual(values = colors, name = "cell type") +
+      ggtitle(label) +
+      theme(axis.text.x = element_blank())
+  
+    }) |>
+  patchwork::wrap_plots(ncol = 1)
+ + + + +

+ + + + +

This looks really promising! A few observations:

+ +
+
+

Most frequently observed cell types

+

The last thing we will do is look at the most frequently observed +cell types across all samples. The below table is ordered by the number +of libraries the cell type is observed.

+ + + + +
all_results_df |> 
+  dplyr::filter(consensus_annotation != "Unknown") |> 
+  dplyr::group_by(consensus_annotation) |> 
+  dplyr::summarize(
+    total_libraries = dplyr::n(),
+    min_percentage = min(percent_cells_annotation),
+    mean_percentage = round(mean(percent_cells_annotation), 2),
+    median_percentage = median(percent_cells_annotation),
+    max_percentage = max(percent_cells_annotation)
+  ) |> 
+  dplyr::arrange(desc(total_libraries))
+ + + + +
+ +
+ + + + +
NA
+ + + + +
+
+

Session info

+ + + + +
# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+ + + + + +
+ +
---
title: "Explore consensus cell types"
author: Ally Hawkins
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    code_folding: show
---

This notebook summarizes the findings from assigning consensus cell type labels to all ScPCA samples. 
All results from the `cell-type-consensus` module in `OpenScPCA-nf` must be saved to `results` prior to rendering this notebook. 

```{r packages}
suppressPackageStartupMessages({
  # load required packages
  library(ggplot2)
})

# Set default ggplot theme
theme_set(
  theme_classic()
)
```

## Functions 

```{r}
# Summarize the consensus cell type assignments for a single project.
#
# Reads a TSV file and collapses it to one row per combination of
# library_id, consensus_annotation, and consensus_ontology.
#
# Arguments:
#   file: path to the TSV file to read; it must contain the columns
#     library_id, consensus_annotation, and consensus_ontology
#   id: not used in the body; accepting a second argument lets the function be
#     passed directly to purrr::imap(), which supplies the element name second
#
# Returns a data frame with these columns:
#   library_id, consensus_annotation, consensus_ontology: the grouping columns
#   total_cells_per_annotation: number of rows (cells) in that combination
#   total_cells_per_library: number of rows (cells) in the library
#   num_celltypes: number of distinct consensus_annotation values in the
#     library; every distinct label is counted, including "Unknown" if present
#   percent_cells_annotation: total_cells_per_annotation as a percentage of
#     total_cells_per_library, rounded to 2 decimal places
summarize_celltypes <- function(file, id){

  # one row per cell
  cells_df <- readr::read_tsv(file)

  # library-level totals: number of cells and number of distinct annotations
  library_totals_df <- dplyr::summarize(
    dplyr::group_by(cells_df, library_id),
    total_cells_per_library = dplyr::n(),
    num_celltypes = dplyr::n_distinct(consensus_annotation)
  )

  # number of cells for each annotation within each library
  annotation_counts_df <- cells_df |>
    dplyr::group_by(library_id, consensus_annotation, consensus_ontology) |>
    dplyr::summarize(total_cells_per_annotation = dplyr::n())

  # attach the library totals and express each annotation as a percentage
  annotation_counts_df |>
    dplyr::left_join(library_totals_df, by = "library_id") |>
    dplyr::mutate(
      percent_cells_annotation = round((total_cells_per_annotation / total_cells_per_library) * 100, 2)
    ) |>
    dplyr::ungroup()

}
```

## Data setup


```{r base paths}
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# every other path in this chunk is built relative to the module directory
module_base <- file.path(repository_base, "analyses", "cell-type-consensus")

# results directory with cell-type-consensus
# (this is where the results files are listed from)
results_dir <- file.path(module_base, "results", "cell-type-consensus")

# diagnoses table used for labeling plots
diagnoses_file <- file.path(module_base, "sample-info", "project-diagnoses.tsv")
```

```{r}
# suffix shared by all consensus cell type results files
# both dots are escaped so they only match a literal "." and the pattern is
# anchored to the end of the file name
results_suffix_pattern <- "_consensus-cell-types\\.tsv\\.gz$"

# list all results files
results_files <- list.files(results_dir, pattern = results_suffix_pattern, full.names = TRUE)

# fail early with a clear message if the results have not been saved yet
if (length(results_files) == 0) {
  stop(
    "No consensus cell type results files were found in ", results_dir,
    call. = FALSE
  )
}

# get project ids from file list by stripping the suffix from each file name
project_ids <- stringr::str_remove(basename(results_files), results_suffix_pattern)
names(results_files) <- project_ids

# remove cell line projects from file list
cell_line_projects <- c("SCPCP000020", "SCPCP000024")
project_ids <- setdiff(project_ids, cell_line_projects)
# subsetting by name keeps the project id names on the remaining files
results_files <- results_files[project_ids]
```


```{r, message=FALSE}
# table with one diagnosis label per project id
diagnoses_df <- readr::read_tsv(diagnoses_file)

# summarize each project's results file
# the list keeps the names of results_files, which are the project ids
project_summaries <- purrr::imap(results_files, summarize_celltypes)

# stack all projects into a single data frame for plotting
all_results_df <- project_summaries |>
  # the list names are stored in the project_id column
  dplyr::bind_rows(.id = "project_id") |>
  # attach the diagnosis for each project
  dplyr::left_join(diagnoses_df, by = "project_id") |>
  # label used on plot axes and titles
  dplyr::mutate(project_label = glue::glue("{project_id}:{diagnosis}"))

```

## Is it all just Unknown?

The first thing we will look at is how many of the cells in each sample are categorized as "Unknown", which means no consensus between `SingleR` and `CellAssign` was identified. 

```{r, fig.height=7}
# keep only the rows that hold the "Unknown" cells of each library
unknown_only <- dplyr::filter(all_results_df, consensus_annotation == "Unknown")

# one point per library, showing the percentage of its cells that are "Unknown"
ggplot(unknown_only, aes(x = project_label, y = percent_cells_annotation)) +
  ggforce::geom_sina(size = 0.1) +
  labs(
    x = "",
    y = "Percent of cells annotated as Unknown"
  ) +
  theme(
    # rotate the project labels so they do not overlap
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.margin = margin(10,10,10,10)
  )
  
```

It looks like we do have some samples that aren't just all "Unknown"!
It definitely varies by project, but for most projects we at least see some proportion of samples with assigned cell types. 

Let's look at how many samples actually have some cells assigned a label other than "Unknown".
To do this, we will identify all libraries that only have cells called as "Unknown". 

```{r}
# Per-project summary of how many libraries contain only "Unknown" cells.
#
# This starts from all_results_df rather than only the "Unknown" rows so that:
# - a library is flagged as all "Unknown" only when it has no other annotation,
#   instead of comparing a rounded percentage to 100 (e.g., 99.997% rounds to 100)
# - libraries that have no "Unknown" cells at all are still counted in the totals
high_tumor_df <- all_results_df |>
  # one row per library, flagging libraries where every annotation is "Unknown"
  dplyr::group_by(project_label, library_id) |>
  dplyr::summarize(
    # %in% is used instead of == so a missing annotation can never produce NA
    no_cells_identified = all(consensus_annotation %in% "Unknown"),
    .groups = "drop"
  ) |>
  dplyr::group_by(project_label) |>
  dplyr::summarize(
    # number of libraries where all cells are "Unknown"
    all_unknown = sum(no_cells_identified),
    # number of libraries with at least one cell that is not "Unknown"
    classified_cells = sum(!no_cells_identified),
    percentage_unknown = round(all_unknown/(all_unknown + classified_cells)*100, 2),
    # add number of libraries for plotting
    total_libraries = dplyr::n()
  ) |>
  # set order for plots
  dplyr::mutate(project_label = forcats::fct_reorder(project_label, total_libraries, .desc = TRUE))
```


Which projects have the highest proportion of samples with all "Unknown"? 

```{r}
# projects ranked from the highest to the lowest percentage of libraries
# where all cells are "Unknown"
high_tumor_df |>
  dplyr::arrange(dplyr::desc(percentage_unknown)) |>
  dplyr::select(project_label, percentage_unknown)

```

It looks like all projects do have cell types identified that are not "Unknown". 
However, `SCPCP000011` (retinoblastoma) has a fairly high percentage of samples without any consensus labels.

## Number of cell types observed

Below we look at the number of cell types observed in each project for all samples. 
This does not include cells labeled as "Unknown". 

```{r, fig.height=10}
# Number of cell types per library, not counting "Unknown".
# The count is recomputed here from the annotation rows because the
# num_celltypes column in all_results_df counts every distinct label in the
# library, which includes "Unknown" whenever a library has unknown cells.
# Libraries that only contain "Unknown" cells have no rows left after the
# filter, so they do not appear in the histogram.
num_celltypes_df <- all_results_df |>
  # add a new line for facet labels
  dplyr::mutate(facet_label = glue::glue("{project_id}\n{diagnosis}")) |>
  # remove unknown as a cell type
  dplyr::filter(consensus_annotation != "Unknown") |>
  # count the remaining distinct cell types in each library
  dplyr::group_by(facet_label, library_id) |>
  dplyr::summarize(
    num_celltypes = dplyr::n_distinct(consensus_annotation),
    .groups = "drop"
  )

# one histogram per project showing how many cell types each library has
ggplot(num_celltypes_df, aes(x = num_celltypes)) +
  geom_histogram(binwidth = 1, center = 0) +
  facet_wrap(vars(facet_label),
             ncol = 3) +
  labs(
    x = "Number of cell types"
  ) +
  theme_bw()
```

## Distribution of consensus cell types 

Now we look at the distribution of the cell types in each sample. 
For these plots, we will pull out the top 9 cell types for each project. 
All other cells will be labeled with "All remaining cell types". 

The top cell types are determined by counting how many libraries each cell type is found in within a project and taking the most frequent types. 

```{r}
# labels that are always placed last in the factor levels and color assignments
last_levels <- c("All remaining cell types", "Unknown")

plot_df <- all_results_df |>
    dplyr::group_by(project_id) |>
    dplyr::mutate(
      # get most frequently observed cell types across libraries in that project
      top_celltypes = forcats::fct_lump_n(consensus_annotation, 9, other_level = "All remaining cell types", ties.method = "first") |>
        # sort by frequency
        forcats::fct_infreq() |>
        # make sure all remaining and unknown are last, use this to assign colors in specific order
        # only levels that exist in the project are moved: a project with few
        # cell types has nothing lumped into "All remaining cell types", and
        # naming a missing level makes fct_relevel() warn
        forcats::fct_relevel(\(lvls) intersect(last_levels, lvls), after = Inf)
    ) |>
    # drop the grouping so later steps are not silently run per project
    dplyr::ungroup()

# get all unique cell types, sorted by the order of the factor levels
unique_celltypes <- plot_df |>
  dplyr::filter(!top_celltypes %in% last_levels) |>
  dplyr::pull(top_celltypes) |>
  unique() |>
  sort() |>
  as.character()

# get color palette
# alphabet has 26 colors, so black is added to allow up to 27 cell types
celltype_palette <- c(
  unname(palette.colors(palette = "alphabet")),
  "black"
)

# stop with a clear message instead of failing when assigning names below
if (length(unique_celltypes) > length(celltype_palette)) {
  stop(
    "There are ", length(unique_celltypes), " cell types to color but only ",
    length(celltype_palette), " colors are available.",
    call. = FALSE
  )
}

# take only as many colors as there are cell types so that the two grey
# colors always line up with "All remaining cell types" and "Unknown"
colors <- c(
  celltype_palette[seq_along(unique_celltypes)],
  "grey60",
  "grey95"
)
names(colors) <- c(unique_celltypes, last_levels)
```


```{r, fig.height=60, fig.width=10}
project_labels <- unique(all_results_df$project_label)

# Build the stacked bar chart for a single project.
# label: one value of the project_label column
# Returns a ggplot object with one bar per library, filled by top cell type.
plot_project_celltypes <- function(label) {

  # rows for this project only
  project_df <- dplyr::filter(plot_df, project_label == label)

  # reorder the levels by frequency within this project, keeping the
  # remaining and unknown categories at the end
  project_df <- dplyr::mutate(
    project_df,
    top_celltypes = forcats::fct_relevel(
      forcats::fct_infreq(top_celltypes),
      "All remaining cell types", "Unknown",
      after = Inf
    )
  )

  ggplot(
    project_df,
    aes(
      x = library_id,
      y = percent_cells_annotation,
      fill = top_celltypes
    )
  ) +
    geom_col() +
    scale_y_continuous(expand = c(0,0)) +
    scale_fill_manual(values = colors, name = "cell type") +
    ggtitle(label) +
    # library ids are hidden since there are too many to read
    theme(axis.text.x = element_blank())
}

# stacked bar chart showing the distribution of the top 9 cell types for each project, including Unknown
# the plots for all projects are stacked in a single column
purrr::map(project_labels, plot_project_celltypes) |>
  patchwork::wrap_plots(ncol = 1)
```


This looks really promising!
A few observations: 

- Cell types identified tend to line up with expectations for the type of tumor. 
For example, leukemia libraries have T and B cells, brain tumors have macrophages, and solid tumors have fibroblasts and muscle cells. 
- Projects that I would expect to be more difficult to classify (sarcomas, Wilms, RB) have fewer cells classified than projects like brain and leukemia.
Notably many of the solid tumor projects (4, 5, 12-16, and 23) have a handful of PDX samples where I would expect to see fewer normal cells. 

## Most frequently observed cell types 

The last thing we will do is look at the most frequently observed cell types across all samples. 
The table below is ordered by the number of libraries in which each cell type is observed.

```{r}
# summary of each cell type across all libraries, not including "Unknown"
# each row of all_results_df is one annotation within one library, so the
# percentages are summarized over the libraries where the cell type is found
dplyr::filter(all_results_df, consensus_annotation != "Unknown") |>
  dplyr::group_by(consensus_annotation) |>
  dplyr::summarize(
    # number of rows for the cell type
    total_libraries = dplyr::n(),
    min_percentage = min(percent_cells_annotation),
    mean_percentage = round(mean(percent_cells_annotation), 2),
    median_percentage = median(percent_cells_annotation),
    max_percentage = max(percent_cells_annotation)
  ) |>
  # most frequently observed cell types first
  dplyr::arrange(dplyr::desc(total_libraries))
  
```


## Session info 

```{r session info}
# print the R version, attached packages, and other environment information
# so the conditions used to render this notebook are recorded
utils::sessionInfo()
```


+ + + +
+ + + + + + + + + + + + + + + + diff --git a/analyses/cell-type-consensus/exploratory-notebooks/README.md b/analyses/cell-type-consensus/exploratory-notebooks/README.md new file mode 100644 index 000000000..f1794ee6f --- /dev/null +++ b/analyses/cell-type-consensus/exploratory-notebooks/README.md @@ -0,0 +1,9 @@ +# Exploratory notebooks + +This folder contains exploratory notebooks for this module. + +1. `01-reference-exploration.Rmd`: This notebook was used to explore possible consensus label assignments between cell types in the `PanglaoDB` and `BlueprintEncodeData` references. +Observations made in this notebook were used to define the set of possible consensus labels to be included in [`references/consensus-cell-type-reference.tsv`](../references/consensus-cell-type-reference.tsv). + +2. `02-explore-consensus-results.Rmd`: This notebook summarizes the consensus labels assigned to all ScPCA samples. +Prior to rendering this notebook, results from the `cell-type-consensus` module in `OpenScPCA-nf` using the `2024-11-25` release were downloaded. 
diff --git a/analyses/cell-type-consensus/renv.lock b/analyses/cell-type-consensus/renv.lock index 1912a8caa..520a536e5 100644 --- a/analyses/cell-type-consensus/renv.lock +++ b/analyses/cell-type-consensus/renv.lock @@ -478,6 +478,19 @@ ], "Hash": "6b868847b365672d6c1677b1608da9ed" }, + "RcppEigen": { + "Package": "RcppEigen", + "Version": "0.3.4.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "stats", + "utils" + ], + "Hash": "4ac8e423216b8b70cb9653d1b3f71eb9" + }, "RcppTOML": { "Package": "RcppTOML", "Version": "0.2.2", @@ -1109,6 +1122,22 @@ ], "Hash": "bd1297f9b5b1fc1372d19e2c4cd82215" }, + "forcats": { + "Package": "forcats", + "Version": "1.0.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "tibble" + ], + "Hash": "1a0a9a3d5083d0d573c4214576f1e690" + }, "fs": { "Package": "fs", "Version": "1.6.5", @@ -1141,6 +1170,35 @@ ], "Hash": "ed33b16c6d24f7ced1d68877ac2509ee" }, + "ggforce": { + "Package": "ggforce", + "Version": "0.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "MASS", + "R", + "Rcpp", + "RcppEigen", + "cli", + "ggplot2", + "grDevices", + "grid", + "gtable", + "lifecycle", + "polyclip", + "rlang", + "scales", + "stats", + "systemfonts", + "tidyselect", + "tweenr", + "utils", + "vctrs", + "withr" + ], + "Hash": "384b388bd9155468d2c851846ee69f9f" + }, "ggplot2": { "Package": "ggplot2", "Version": "3.5.1", @@ -1703,6 +1761,16 @@ ], "Hash": "bd54ba8a0a5faded999a7aab6e46b374" }, + "polyclip": { + "Package": "polyclip", + "Version": "1.10-7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5879bf5aae702ffef0a315c44328f984" + }, "prettyunits": { "Package": "prettyunits", "Version": "1.2.0", @@ -2044,6 +2112,22 @@ "Repository": "RSPM", "Hash": "de342ebfebdbf40477d0758d05426646" }, + "systemfonts": { + "Package": "systemfonts", + "Version": "1.2.1", 
+ "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cpp11", + "grid", + "jsonlite", + "lifecycle", + "tools", + "utils" + ], + "Hash": "f8b2924480a2679e2bad9750646112fe" + }, "tibble": { "Package": "tibble", "Version": "3.2.1", @@ -2112,6 +2196,21 @@ ], "Hash": "3ec7e3ddcacc2d34a9046941222bf94d" }, + "tweenr": { + "Package": "tweenr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11", + "farver", + "magrittr", + "rlang", + "vctrs" + ], + "Hash": "82fac2b73e6a1f3874fc000aaf96d8bc" + }, "tzdb": { "Package": "tzdb", "Version": "0.4.0", diff --git a/analyses/cell-type-consensus/sample-info/README.md b/analyses/cell-type-consensus/sample-info/README.md new file mode 100644 index 000000000..57a542c35 --- /dev/null +++ b/analyses/cell-type-consensus/sample-info/README.md @@ -0,0 +1,6 @@ +# Sample info + +This folder contains any files with sample metadata needed for this module. + +1. `project-diagnoses.tsv`: This file contains a summarized label to use for all diagnoses in a given ScPCA project. +The contents of the `diagnosis` column is used to provide labels for plots in [`exploratory-notebooks/02-explore-consensus-results.Rmd`](../exploratory-notebooks/02-explore-consensus-results.Rmd). 
diff --git a/analyses/cell-type-consensus/sample-info/project-diagnoses.tsv b/analyses/cell-type-consensus/sample-info/project-diagnoses.tsv new file mode 100644 index 000000000..8a7bd45ce --- /dev/null +++ b/analyses/cell-type-consensus/sample-info/project-diagnoses.tsv @@ -0,0 +1,24 @@ +project_id diagnosis +SCPCP000001 high-grade glioma +SCPCP000002 low-grade glioma +SCPCP000003 acute lymphoblastic leukemia +SCPCP000004 neuroblastoma +SCPCP000005 rhabdomyosarcoma +SCPCP000006 wilms +SCPCP000007 acute myeloid leukemia +SCPCP000008 acute lymphoblastic leukemia +SCPCP000009 brain +SCPCP000010 brain +SCPCP000011 retinoblastoma +SCPCP000012 other solid tumors +SCPCP000013 non-rhabdo soft tissue sarcoma +SCPCP000014 wilms +SCPCP000015 ewing sarcoma +SCPCP000016 rhabdoid tumor +SCPCP000017 osteosarcoma +SCPCP000018 osteosarcoma +SCPCP000020 neuroblastoma +SCPCP000021 high-grade glioma +SCPCP000022 leukemia +SCPCP000023 osteosarcoma +SCPCP000024 neuroblastoma \ No newline at end of file