-
Notifications
You must be signed in to change notification settings - Fork 18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Workflow to assign consensus cell types to ScPCA samples #977
Changes from all commits
79a3e51
84094b5
73d9498
2a2f514
82632b7
f3b6f78
7965691
e36f4c7
3f9183d
39b162d
2bbacc3
6f4f1a0
6b7c342
939103f
bcb16a5
965e445
d7b93ed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,3 +53,4 @@ jobs: | |
run: | | ||
cd ${MODULE_PATH} | ||
# run module script(s) here | ||
./assign-consensus-celltypes.sh |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
#!/bin/bash | ||
|
||
# This script is used to create a single table with cell type assignments all cells from all ScPCA samples | ||
# The existing cell type annotations from SingleR and CellAssign are saved to a TSV file for each sample | ||
# Then all TSV files are combined into a single file and consensus cell types are assigned | ||
|
||
# Usage: ./assign-consensus-celltypes.sh | ||
|
||
|
||
set -euo pipefail | ||
|
||
# navigate to where script lives | ||
cd $(dirname "$0") | ||
#module_dir=$(pwd) | ||
|
||
data_dir="../../data/current" | ||
# path to save consensus results | ||
scpca_consensus_assignments_file="results/scpca-consensus-celltype-assignments.tsv.gz" | ||
# directory to store all individual tsv files | ||
celltype_tsv_dir="results/original-celltype-assignments" | ||
mkdir -p ${celltype_tsv_dir} | ||
|
||
# define reference input files | ||
panglao_ref_file="references/panglao-cell-type-ontologies.tsv" | ||
consensus_ref_file="references/consensus-cell-type-reference.tsv" | ||
|
||
# run script to export tsv file on all processed objects | ||
for sce_file in $data_dir/SCPCP*/SCPCS*/*_processed.rds; do | ||
|
||
# define library ID | ||
library_id=$(basename $sce_file | sed 's/_processed.rds$//') | ||
|
||
echo "Grabbing cell types for ${library_id}" | ||
# get celltypes as tsv file | ||
Rscript scripts/03-save-coldata.R \ | ||
--sce_file $sce_file \ | ||
--output_file ${celltype_tsv_dir}/${library_id}_celltype-assignments.tsv | ||
|
||
done | ||
|
||
echo "Combining TSVs and adding consensus labels" | ||
# run script to combine all tsv files and assign consensus cell types | ||
Rscript scripts/04-combine-celltype-tables.R \ | ||
--celltype_tsv_dir $celltype_tsv_dir \ | ||
--panglao_ref_file $panglao_ref_file \ | ||
--consensus_ref_file $consensus_ref_file \ | ||
--output_file $scpca_consensus_assignments_file |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# This script is used to grab the colData from a SCE object and save it as a TSV file | ||
|
||
library(optparse) | ||
|
||
option_list <- list( | ||
make_option( | ||
opt_str = c("--sce_file"), | ||
type = "character", | ||
help = "Path to RDS file containing a processed SingleCellExperiment object from scpca-nf" | ||
), | ||
make_option( | ||
opt_str = c("--output_file"), | ||
type = "character", | ||
help = "Path to file where colData will be saved, must end in `.tsv`" | ||
) | ||
) | ||
|
||
# Parse options | ||
opt <- parse_args(OptionParser(option_list = option_list)) | ||
|
||
# Set up ----------------------------------------------------------------------- | ||
|
||
# make sure input files exist | ||
stopifnot( | ||
"sce file does not exist" = file.exists(opt$sce_file) | ||
) | ||
|
||
# load SCE | ||
suppressPackageStartupMessages({ | ||
library(SingleCellExperiment) | ||
}) | ||
|
||
# Extract colData -------------------------------------------------------------- | ||
|
||
# read in sce | ||
sce <- readr::read_rds(opt$sce_file) | ||
|
||
# extract ids | ||
library_id <- metadata(sce)$library_id | ||
# account for multiplexed libraries that have multiple samples | ||
# for now just combine sample ids into a single string and don't worry about demultiplexing | ||
sample_id <- metadata(sce)$sample_id |> | ||
paste0(collapse = ";") | ||
project_id <- metadata(sce)$project_id | ||
|
||
# check if cell line since cell lines don't have any cell type assignments | ||
# account for having more than one sample and a list of sample types | ||
# all sample types should be the same theoretically | ||
is_cell_line <- all(metadata(sce)$sample_type == "cell line") | ||
|
||
# only create and write table for non-cell line samples | ||
if(!is_cell_line){ | ||
|
||
# get df with ids, barcodes, and cell type assignments | ||
celltype_df <- colData(sce) |> | ||
as.data.frame() |> | ||
dplyr::mutate( | ||
project_id = project_id, | ||
sample_id = sample_id, | ||
library_id = library_id | ||
) |> | ||
dplyr::select( | ||
project_id, | ||
sample_id, | ||
library_id, | ||
barcodes, | ||
contains("celltype") # get both singler and cellassign with ontology | ||
) | ||
|
||
# save tsv | ||
readr::write_tsv(celltype_df, opt$output_file) | ||
|
||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# This script is used to combine all TSV files containing cell types into a single TSV file | ||
# The output TSV file will include the following added columns: | ||
# panglao_ontology: CL term assigned to panglao term | ||
# panglao_annotation: human readable value associated with the CL term for panglao term | ||
# blueprint_annotation_fine: Fine-grained annotation from blueprint associated with singler_celltype_ontology | ||
# consensus_annotation: human readable name associated with the consensus label | ||
# consensus_ontology: CL ontology term for the consensus cell type | ||
|
||
project_root <- rprojroot::find_root(rprojroot::has_dir(".github")) | ||
|
||
library(optparse) | ||
|
||
option_list <- list( | ||
make_option( | ||
opt_str = c("--celltype_tsv_dir"), | ||
type = "character", | ||
help = "Path to directory containing TSV files with cell type annotations from single samples. | ||
All TSV files in this directory will be combined into a single file." | ||
), | ||
make_option( | ||
opt_str = c("--panglao_ref_file"), | ||
default = file.path(project_root, "references", "panglao-cell-type-ontologies.tsv"), | ||
type = "character", | ||
help = "Path to file with panglao assignments and associated cell ontology ids" | ||
), | ||
make_option( | ||
opt_str = c("--consensus_ref_file"), | ||
default = file.path(project_root, "references", "consensus-cell-type-reference.tsv"), | ||
type = "character", | ||
help = "Path to file containing the reference for assigning consensus cell type labels" | ||
), | ||
make_option( | ||
opt_str = c("--output_file"), | ||
type = "character", | ||
help = "Path to file where combined TSV file will be saved. | ||
File name must end in either `.tsv` or `.tsv.gz` to save a compressed TSV file" | ||
) | ||
) | ||
|
||
# Parse options | ||
opt <- parse_args(OptionParser(option_list = option_list)) | ||
|
||
# Prep ref files --------------------------------------------------------------- | ||
|
||
# make sure reference files exist | ||
stopifnot( | ||
"panglao reference file does not exist" = file.exists(opt$panglao_ref_file), | ||
"cell type consensus reference file does not exist" = file.exists(opt$consensus_ref_file), | ||
"output file must end in `.tsv` or `.tsv.gz`" = stringr::str_detect(opt$output_file, ".tsv|.tsv.gz") | ||
) | ||
|
||
# read in ref files | ||
# change names for panglao ref to match what's in the consensus file | ||
panglao_ref_df <- readr::read_tsv(opt$panglao_ref_file) |> | ||
dplyr::rename( | ||
panglao_ontology = ontology_id, | ||
panglao_annotation = human_readable_value, | ||
original_panglao_name = panglao_cell_type | ||
) | ||
|
||
consensus_ref_df <- readr::read_tsv(opt$consensus_ref_file) |> | ||
# select columns to use for joining and consensus assigmments | ||
dplyr::select( | ||
panglao_ontology, | ||
original_panglao_name, | ||
blueprint_ontology, | ||
consensus_annotation, | ||
consensus_ontology | ||
) | ||
|
||
# grab singler ref from celldex | ||
blueprint_ref <- celldex::BlueprintEncodeData() | ||
|
||
# get ontologies and human readable name into data frame for blueprint | ||
# in scpca-nf we don't include the fine label so this lets us add it in | ||
blueprint_df <- data.frame( | ||
blueprint_ontology = blueprint_ref$label.ont, | ||
blueprint_annotation_fine = blueprint_ref$label.fine | ||
Comment on lines
+79
to
+80
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are there any cases where the ontology maps to more than one fine label? I feel like this is the case, which would result in duplicate rows when joining. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No. The ontology and fine label are 1:1. I checked this by making sure the length of the unique values in this data frame are equivalent and equal to the number of rows in the dataframe. Also from the celldex vignette:
This tells me that the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I read the last sentence a bit differently: they map to Cell Ontology but that does not mean that every fine label has a distinct ontology label. In the case of Blueprint, it turns out that this is true, but it is not always; the HumanPrimaryCellAtlas data has more fine labels than ontologies.
So I guess this is fine, but I guess I'm not sure what the value of it is? If we are just getting an alternative to the ontology label with a different naming convention, do we need it at all? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Perhaps the better approach here is to directly map the ontology ids to the name associated with them ourselves and remove the |
||
) |> | ||
unique() |> | ||
tidyr::drop_na() | ||
|
||
# get list of all TSV files | ||
all_files <- list.files(path = opt$celltype_tsv_dir, | ||
pattern = "*.tsv", | ||
full.names = TRUE) | ||
|
||
# read in TSV files and combine into a single df | ||
all_cells_df <- all_files |> | ||
purrr::map(readr::read_tsv) |> | ||
dplyr::bind_rows() |> | ||
# add columns for panglao ontology and consensus | ||
# first add panglao ontology | ||
dplyr::left_join(panglao_ref_df, by = c("cellassign_celltype_annotation" = "original_panglao_name")) |> | ||
# now add in all the blueprint columns | ||
dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |> | ||
# then add consensus labels | ||
dplyr::left_join(consensus_ref_df, | ||
by = c("singler_celltype_ontology" = "blueprint_ontology", | ||
"cellassign_celltype_annotation" = "original_panglao_name", | ||
"panglao_ontology")) |> | ||
# use unknown for NA annotation but keep ontology ID as NA | ||
dplyr::mutate(consensus_annotation = dplyr::if_else(is.na(consensus_annotation), "Unknown", consensus_annotation)) | ||
|
||
# export file | ||
readr::write_tsv(all_cells_df, opt$output_file) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems like the correct solution (for now at least).