-
Notifications
You must be signed in to change notification settings - Fork 18
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Workflow to assign consensus cell types to ScPCA samples #977
Changes from 9 commits
79a3e51
84094b5
73d9498
2a2f514
82632b7
f3b6f78
7965691
e36f4c7
3f9183d
39b162d
2bbacc3
6f4f1a0
6b7c342
939103f
bcb16a5
965e445
d7b93ed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,3 +53,4 @@ jobs: | |
run: | | ||
cd ${MODULE_PATH} | ||
# run module script(s) here | ||
./assign-consensus-celltypes.sh |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/bin/bash | ||
|
||
# This script is used to create a single table with cell type assignments all cells from all ScPCA samples | ||
# The existing cell type annotations from SingleR and CellAssign are saved to a TSV file for each sample | ||
# Then all TSV files are combined into a single file and consensus cell types are assigned | ||
|
||
# Usage: ./assign-consensus-celltypes.sh | ||
|
||
|
||
set -euo pipefail | ||
|
||
# navigate to where script lives | ||
cd $(dirname "$0") | ||
module_dir=$(pwd) | ||
allyhawkins marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
data_dir="${module_dir}/../../data/current" | ||
# path to save consensus results | ||
scpca_consensus_assignments_file="${module_dir}/results/scpca-consensus-celltype-assignments.tsv" | ||
# directory to store all individual tsv files | ||
celltype_tsv_dir="${module_dir}/results/original-celltype-assignments" | ||
mkdir -p ${celltype_tsv_dir} | ||
|
||
# define scripts and notebook directories | ||
scripts_dir="${module_dir}/scripts" | ||
|
||
# define reference input files | ||
panglao_ref_file="${module_dir}/references/panglao-cell-type-ontologies.tsv" | ||
consensus_ref_file="${module_dir}/references/consensus-cell-type-reference.tsv" | ||
|
||
# run script to export tsv file on all processed objects | ||
for sce_file in $data_dir/*/SCPCS*/*_processed.rds; do | ||
allyhawkins marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# define library ID | ||
library_id=$(basename $sce_file | sed 's/_processed.rds$//') | ||
|
||
echo "Grabbing cell types for ${library_id}" | ||
# get celltypes as tsv file | ||
Rscript $scripts_dir/03-save-coldata.R \ | ||
--sce_file $sce_file \ | ||
--output_file ${celltype_tsv_dir}/${library_id}_celltype-assignments.tsv | ||
|
||
done | ||
|
||
echo "Combining TSVs and adding consensus labels" | ||
# run script to combine all tsv files and assign consensus cell types | ||
Rscript $scripts_dir/04-combine-celltype-tables.R \ | ||
--celltype_tsv_dir $celltype_tsv_dir \ | ||
--panglao_ref_file $panglao_ref_file \ | ||
--consensus_ref_file $consensus_ref_file \ | ||
--output_file $scpca_consensus_assignments_file |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# This script is used to grab the colData from a SCE object and save it as a TSV file | ||
|
||
project_root <- here::here() | ||
allyhawkins marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
library(optparse) | ||
|
||
option_list <- list( | ||
make_option( | ||
opt_str = c("--sce_file"), | ||
type = "character", | ||
help = "Path to RDS file containing a processed SingleCellExperiment object from scpca-nf" | ||
), | ||
make_option( | ||
opt_str = c("--output_file"), | ||
type = "character", | ||
help = "Path to file where colData will be saved, must end in `.tsv`" | ||
) | ||
) | ||
|
||
# Parse options | ||
opt <- parse_args(OptionParser(option_list = option_list)) | ||
|
||
# Set up ----------------------------------------------------------------------- | ||
|
||
# make sure input files exist | ||
stopifnot( | ||
"sce file does not exist" = file.exists(opt$sce_file) | ||
) | ||
|
||
# load SCE | ||
suppressPackageStartupMessages({ | ||
library(SingleCellExperiment) | ||
}) | ||
|
||
# Extract colData -------------------------------------------------------------- | ||
|
||
# read in sce | ||
sce <- readr::read_rds(opt$sce_file) | ||
|
||
# extract ids | ||
library_id <- metadata(sce)$library_id | ||
sample_id <- metadata(sce)$sample_id | ||
project_id <- metadata(sce)$project_id | ||
|
||
# check if cell line since cell lines don't have any cell type assignments | ||
is_cell_line <- metadata(sce)$sample_type == "cell line" | ||
|
||
# only create and write table for non-cell line samples | ||
if(!is_cell_line){ | ||
|
||
# get df with ids, barcodes, and cell type assignments | ||
celltype_df <- colData(sce) |> | ||
as.data.frame() |> | ||
dplyr::mutate( | ||
project_id = project_id, | ||
sample_id = sample_id, | ||
library_id = library_id | ||
) |> | ||
dplyr::select( | ||
ends_with("id"), | ||
"barcodes", | ||
contains("celltype") # get both singler and cellassign with ontology | ||
) | ||
|
||
# save tsv | ||
readr::write_tsv(celltype_df, opt$output_file) | ||
|
||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# This script is used to combine all TSV files containing cell types into a single TSV file | ||
# The output TSV file will include the following added columns: | ||
# panglao_ontology: CL term assigned to panglao term | ||
# panglao_annotation: human readable value associated with the CL term for panglao term | ||
# blueprint_annotation_fine: Fine-grained annotation from blueprint associated with singler_celltype_ontology | ||
# consensus_annotation: human readable name associated with the consensus label | ||
# consensus_ontology: CL ontology term for the consensus cell type | ||
|
||
project_root <- here::here() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't particularly like this here, as if an Rproject file gets added to the module, this will start to fail. It is generally better to use so either
or maybe safer:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just noting that this fails in CI with the following error:
see https://github.com/AlexsLemonade/OpenScPCA-analysis/actions/runs/12694163418/job/35383339441 I'm looking into it, but posting it here in case you have any obvious solutions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, this was the issue I was referring to with checking out by the API, which is what will happen with docker images that don't have You could add a step to install |
||
|
||
library(optparse) | ||
|
||
option_list <- list( | ||
make_option( | ||
opt_str = c("--celltype_tsv_dir"), | ||
type = "character", | ||
help = "Path to directory containing TSV files with cell type annotations from single samples. | ||
All TSV files in this directory will be combined into a single file." | ||
), | ||
make_option( | ||
opt_str = c("--panglao_ref_file"), | ||
default = file.path(project_root, "references", "panglao-cell-type-ontologies.tsv"), | ||
type = "character", | ||
help = "Path to file with panglao assignments and associated cell ontology ids" | ||
), | ||
make_option( | ||
opt_str = c("--consensus_ref_file"), | ||
default = file.path(project_root, "references", "consensus-cell-type-reference.tsv"), | ||
type = "character", | ||
help = "Path to file containing the reference for assigning consensus cell type labels" | ||
), | ||
make_option( | ||
opt_str = c("--output_file"), | ||
type = "character", | ||
help = "Path to file where combined TSV file will be saved, must end in `.tsv`" | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume the output file here is large: do we want to allow compressed file output? You don't have code to enforce the |
||
) | ||
|
||
# Parse options | ||
opt <- parse_args(OptionParser(option_list = option_list)) | ||
|
||
# Prep ref files --------------------------------------------------------------- | ||
|
||
# make sure reference files exist | ||
stopifnot( | ||
"panglao reference file does not exist" = file.exists(opt$panglao_ref_file), | ||
"cell type consensus reference file does not exist" = file.exists(opt$consensus_ref_file) | ||
) | ||
|
||
# read in ref files | ||
# change names for panglao ref to match what's in the consensus file | ||
panglao_ref_df <- readr::read_tsv(opt$panglao_ref_file) |> | ||
dplyr::rename( | ||
panglao_ontology = ontology_id, | ||
panglao_annotation = human_readable_value, | ||
original_panglao_name = panglao_cell_type | ||
) | ||
|
||
consensus_ref_df <- readr::read_tsv(opt$consensus_ref_file) |> | ||
# select columns to use for joining and consensus assigmments | ||
dplyr::select( | ||
panglao_ontology, | ||
original_panglao_name, | ||
blueprint_ontology, | ||
consensus_annotation, | ||
consensus_ontology | ||
) | ||
|
||
# grab singler ref from celldex | ||
blueprint_ref <- celldex::BlueprintEncodeData() | ||
|
||
# get ontologies and human readable name into data frame for blueprint | ||
# in scpca-nf we don't include the fine label so this lets us add it in | ||
blueprint_df <- data.frame( | ||
blueprint_ontology = blueprint_ref$label.ont, | ||
blueprint_annotation_fine = blueprint_ref$label.fine | ||
Comment on lines
+79
to
+80
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are there any cases where the ontology maps to more than one fine label? I feel like this is the case, which would result in duplicate rows when joining. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No. The ontology and fine label are 1:1. I checked this by making sure the length of the unique values in this data frame are equivalent and equal to the number of rows in the dataframe. Also from the celldex vignette:
This tells me that the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I read the last sentence a bit differently: they map to Cell Ontology but that does not mean that every fine label has a distinct ontology label. In the case of Blueprint, it turns out that this is true, but it is not always; the HumanPrimaryCellAtlas data has more fine labels than ontologies.
So I guess this is fine, but I guess I'm not sure what the value of it is? If we are just getting an alternative to the ontology label with a different naming convention, do we need it at all? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Perhaps the better approach here is to directly map the ontology ids to the name associated with them ourselves and remove the |
||
) |> | ||
unique() |> | ||
tidyr::drop_na() | ||
|
||
# get list of all TSV files | ||
all_files <- list.files(path = opt$celltype_tsv_dir, | ||
pattern = "*.tsv", | ||
full.names = TRUE) | ||
|
||
# read in TSV files and combine into a single df | ||
all_cells_df <- all_files |> | ||
purrr::map(readr::read_tsv) |> | ||
dplyr::bind_rows() |> | ||
# add columns for panglao ontology and consensus | ||
# first add panglao ontology | ||
dplyr::left_join(panglao_ref_df, by = c("cellassign_celltype_annotation" = "original_panglao_name")) |> | ||
# now add in all the blueprint columns | ||
dplyr::left_join(blueprint_df, by = c("singler_celltype_ontology" = "blueprint_ontology")) |> | ||
# then add consensus labels | ||
dplyr::left_join(consensus_ref_df, | ||
by = c("singler_celltype_ontology" = "blueprint_ontology", | ||
"cellassign_celltype_annotation" = "original_panglao_name", | ||
"panglao_ontology")) |> | ||
# use unknown for NA annotation but keep ontology ID as NA | ||
dplyr::mutate(consensus_annotation = dplyr::if_else(is.na(consensus_annotation), "Unknown", consensus_annotation)) | ||
|
||
# export file | ||
readr::write_tsv(all_cells_df, opt$output_file) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just on wording, I don't know if we want to call this a workflow, as I keep thinking that will mean something more like Nextflow. I think you can just call it a script and describe the whole thing as the module output.