-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #977 from allyhawkins/allyhawkins/assign-scpca-consensus
Workflow to assign consensus cell types to ScPCA samples
- Loading branch information
Showing
6 changed files
with
293 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,3 +53,4 @@ jobs: | |
run: | | ||
cd ${MODULE_PATH} | ||
# run module script(s) here | ||
./assign-consensus-celltypes.sh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
47 changes: 47 additions & 0 deletions
47
analyses/cell-type-consensus/assign-consensus-celltypes.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
#!/bin/bash

# This script creates a single table with cell type assignments for all cells
# from all ScPCA samples.
# The existing cell type annotations from SingleR and CellAssign are saved to a
# TSV file for each library.
# Then all TSV files are combined into a single file and consensus cell types
# are assigned.

# Usage: ./assign-consensus-celltypes.sh

set -euo pipefail

# navigate to where the script lives so the relative paths below resolve;
# quoted so that a checkout path containing spaces still works
cd "$(dirname "$0")"

data_dir="../../data/current"
# path to save consensus results
scpca_consensus_assignments_file="results/scpca-consensus-celltype-assignments.tsv.gz"
# directory to store all individual tsv files
celltype_tsv_dir="results/original-celltype-assignments"
mkdir -p "${celltype_tsv_dir}"

# define reference input files
panglao_ref_file="references/panglao-cell-type-ontologies.tsv"
consensus_ref_file="references/consensus-cell-type-reference.tsv"

# collect all processed objects up front
# nullglob makes an unmatched pattern expand to nothing, so a missing or empty
# data directory is reported here instead of passing the literal glob to Rscript
shopt -s nullglob
sce_files=("${data_dir}"/SCPCP*/SCPCS*/*_processed.rds)
shopt -u nullglob

if [[ ${#sce_files[@]} -eq 0 ]]; then
  echo "No processed SCE files found in ${data_dir}" >&2
  exit 1
fi

# run script to export tsv file on all processed objects
for sce_file in "${sce_files[@]}"; do

  # define library ID by stripping the directory and the file suffix
  library_id=$(basename "${sce_file}" _processed.rds)

  echo "Grabbing cell types for ${library_id}"
  # get celltypes as tsv file
  Rscript scripts/03-save-coldata.R \
    --sce_file "${sce_file}" \
    --output_file "${celltype_tsv_dir}/${library_id}_celltype-assignments.tsv"

done

echo "Combining TSVs and adding consensus labels"
# run script to combine all tsv files and assign consensus cell types
Rscript scripts/04-combine-celltype-tables.R \
  --celltype_tsv_dir "${celltype_tsv_dir}" \
  --panglao_ref_file "${panglao_ref_file}" \
  --consensus_ref_file "${consensus_ref_file}" \
  --output_file "${scpca_consensus_assignments_file}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/usr/bin/env Rscript

# This script grabs the colData from a processed SCE object and saves the
# project/sample/library ids, barcodes, and all cell type columns as a TSV file.
# No file is written when every sample in the library is a cell line.

library(optparse)

option_list <- list(
  make_option(
    opt_str = c("--sce_file"),
    type = "character",
    help = "Path to RDS file containing a processed SingleCellExperiment object from scpca-nf"
  ),
  make_option(
    opt_str = c("--output_file"),
    type = "character",
    help = "Path to file where colData will be saved, must end in `.tsv`"
  )
)

# Parse options
opt <- parse_args(OptionParser(option_list = option_list))

# Set up -----------------------------------------------------------------------

# make sure arguments were provided and the input file exists
# the NULL checks come first: file.exists(NULL) is logical(0), which
# stopifnot() accepts, so a missing argument would otherwise go unnoticed
stopifnot(
  "--sce_file must be provided" = !is.null(opt$sce_file),
  "--output_file must be provided" = !is.null(opt$output_file),
  "sce file does not exist" = file.exists(opt$sce_file),
  "output file must end in `.tsv`" = grepl("\\.tsv$", opt$output_file)
)

# load SCE
suppressPackageStartupMessages({
  library(SingleCellExperiment)
})

# Extract colData --------------------------------------------------------------

# read in sce
sce <- readr::read_rds(opt$sce_file)
sce_metadata <- metadata(sce)

# extract ids
library_id <- sce_metadata$library_id
# account for multiplexed libraries that have multiple samples
# for now just combine sample ids into a single string and don't worry about demultiplexing
sample_id <- paste0(sce_metadata$sample_id, collapse = ";")
project_id <- sce_metadata$project_id

# check if cell line since cell lines don't have any cell type assignments
# account for having more than one sample and a list of sample types
# all sample types should be the same theoretically
# all() on a zero-length comparison is TRUE, so require at least one sample type;
# a library with no recorded sample type is therefore not treated as a cell line
sample_types <- sce_metadata$sample_type
is_cell_line <- length(sample_types) > 0 &&
  isTRUE(all(sample_types == "cell line"))

# only create and write table for non-cell line samples
if (is_cell_line) {
  message("Skipping cell line library: ", library_id)
} else {
  # get df with ids, barcodes, and cell type assignments
  celltype_df <- colData(sce) |>
    as.data.frame() |>
    dplyr::mutate(
      project_id = project_id,
      sample_id = sample_id,
      library_id = library_id
    ) |>
    dplyr::select(
      project_id,
      sample_id,
      library_id,
      barcodes,
      contains("celltype") # get both singler and cellassign with ontology
    )

  # save tsv
  readr::write_tsv(celltype_df, opt$output_file)
}
|
108 changes: 108 additions & 0 deletions
108
analyses/cell-type-consensus/scripts/04-combine-celltype-tables.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
#!/usr/bin/env Rscript

# This script is used to combine all TSV files containing cell types into a single TSV file
# The output TSV file will include the following added columns:
# panglao_ontology: CL term assigned to panglao term
# panglao_annotation: human readable value associated with the CL term for panglao term
# blueprint_annotation_fine: Fine-grained annotation from blueprint associated with singler_celltype_ontology
# consensus_annotation: human readable name associated with the consensus label
# consensus_ontology: CL ontology term for the consensus cell type

project_root <- rprojroot::find_root(rprojroot::has_dir(".github"))

library(optparse)

# NOTE(review): the default reference paths below resolve to
# `<project_root>/references/`; the wrapper script passes paths relative to the
# module directory instead, so confirm these defaults point at existing files
option_list <- list(
  make_option(
    opt_str = c("--celltype_tsv_dir"),
    type = "character",
    help = "Path to directory containing TSV files with cell type annotations from single samples.
      All TSV files in this directory will be combined into a single file."
  ),
  make_option(
    opt_str = c("--panglao_ref_file"),
    default = file.path(project_root, "references", "panglao-cell-type-ontologies.tsv"),
    type = "character",
    help = "Path to file with panglao assignments and associated cell ontology ids"
  ),
  make_option(
    opt_str = c("--consensus_ref_file"),
    default = file.path(project_root, "references", "consensus-cell-type-reference.tsv"),
    type = "character",
    help = "Path to file containing the reference for assigning consensus cell type labels"
  ),
  make_option(
    opt_str = c("--output_file"),
    type = "character",
    help = "Path to file where combined TSV file will be saved.
      File name must end in either `.tsv` or `.tsv.gz` to save a compressed TSV file"
  )
)

# Parse options
opt <- parse_args(OptionParser(option_list = option_list))

# Prep ref files ---------------------------------------------------------------

# make sure required arguments were provided and input files exist
# NULL checks come first because zero-length conditions pass stopifnot()
# the extension regex is escaped and anchored so only a true `.tsv` or
# `.tsv.gz` suffix is accepted
stopifnot(
  "--celltype_tsv_dir must be provided" = !is.null(opt$celltype_tsv_dir),
  "--output_file must be provided" = !is.null(opt$output_file),
  "cell type TSV directory does not exist" = dir.exists(opt$celltype_tsv_dir),
  "panglao reference file does not exist" = file.exists(opt$panglao_ref_file),
  "cell type consensus reference file does not exist" = file.exists(opt$consensus_ref_file),
  "output file must end in `.tsv` or `.tsv.gz`" = stringr::str_detect(opt$output_file, "\\.tsv(\\.gz)?$")
)

# read in ref files
# change names for panglao ref to match what's in the consensus file
panglao_ref_df <- readr::read_tsv(opt$panglao_ref_file) |>
  dplyr::rename(
    panglao_ontology = ontology_id,
    panglao_annotation = human_readable_value,
    original_panglao_name = panglao_cell_type
  )

consensus_ref_df <- readr::read_tsv(opt$consensus_ref_file) |>
  # select columns to use for joining and consensus assignments
  dplyr::select(
    panglao_ontology,
    original_panglao_name,
    blueprint_ontology,
    consensus_annotation,
    consensus_ontology
  )

# grab singler ref from celldex
blueprint_ref <- celldex::BlueprintEncodeData()

# get ontologies and human readable name into data frame for blueprint
# in scpca-nf we don't include the fine label so this lets us add it in
blueprint_df <- data.frame(
  blueprint_ontology = blueprint_ref$label.ont,
  blueprint_annotation_fine = blueprint_ref$label.fine
) |>
  unique() |>
  tidyr::drop_na()

# Combine cell type tables -----------------------------------------------------

# get list of all TSV files
# `pattern` is a regular expression, not a glob, so match the file extension
# explicitly; compressed TSVs are included as well
all_files <- list.files(
  path = opt$celltype_tsv_dir,
  pattern = "\\.tsv(\\.gz)?$",
  full.names = TRUE
)

# fail early with a clear message rather than on a missing join column below
stopifnot(
  "no TSV files found in the cell type TSV directory" = length(all_files) > 0
)

# read in TSV files and combine into a single df
all_cells_df <- all_files |>
  purrr::map(readr::read_tsv) |>
  dplyr::bind_rows() |>
  # add columns for panglao ontology and consensus
  # first add panglao ontology
  dplyr::left_join(
    panglao_ref_df,
    by = c("cellassign_celltype_annotation" = "original_panglao_name")
  ) |>
  # now add in all the blueprint columns
  dplyr::left_join(
    blueprint_df,
    by = c("singler_celltype_ontology" = "blueprint_ontology")
  ) |>
  # then add consensus labels
  dplyr::left_join(
    consensus_ref_df,
    by = c(
      "singler_celltype_ontology" = "blueprint_ontology",
      "cellassign_celltype_annotation" = "original_panglao_name",
      "panglao_ontology"
    )
  ) |>
  # use unknown for NA annotation but keep ontology ID as NA
  dplyr::mutate(
    consensus_annotation = dplyr::if_else(
      is.na(consensus_annotation),
      "Unknown",
      consensus_annotation
    )
  )

# export file
readr::write_tsv(all_cells_df, opt$output_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters