title | author | date | output |
---|---|---|---|
Notebook_1_P28_29_preprocessing |
Laura Wolbeck |
2023-11-21 |
html_document |
This script performs the preprocessing, filtering and initial visualizations of the data using the R package CRMetrics. Finally, the UMAP embedding and clusters are estimated also within the CRMetrics package which uses pagoda2 and conos.
source("scRNA_helper_functions_preterm_2.R")
library(CRMetrics)
library(magrittr)
library(dplyr)
library(qs)
library(pagoda2)
library(conos)
library(ggplot2)
crm <- CRMetrics$new(data.path = "/data/Japan_preterm_counts/preterm_P28_29/HN00143770_result_10X/",
metadata = "/people/nrq364/preterm-2-NCU-UCPH_figures/metadata.csv",
sep.meta = ",",
n.cores = 50,
verbose = T)
add count matrices
crm$addDetailedMetrics()
median genes per cell across all samples
median(crm$summary.metrics[crm$summary.metrics$metric=="median genes per cell",]$value)
cells per samples and sum of all cells
sapply(crm$cms, function(d) dim(d)[2])
sum(sapply(crm$cms, function(d) dim(d)[2]))
plot sequenced cells per sample
crm$plotSummaryMetrics(comp.group = "sample", second.comp.group = "condition", metrics = "estimated number of cells",plot.geom = "bar") + xlab("") + ylab("number of cells") + theme(legend.position = "right", legend.title = element_blank())
ggsave("number_of_cells.pdf", width=10, height = 8, units = "cm")
plot genes per cell
crm$plotDetailedMetrics(comp.group = "condition",
metrics = "gene_count",
plot.geom = "violin",hline = F) + ylab("genes per cell") + theme(legend.position = "right", legend.title = element_blank()) + stat_summary(fun.y=median, size=8, geom = "text", label = "-") #+ theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12))
ggsave("VlnPlot_genes_per_cell.pdf", width=10, height = 8, units = "cm")
before filtering
crm$doPreprocessing(preprocess = "pagoda")
crm$createEmbedding()
in terminal run (or in terminal tab in R studio): module load miniconda/4.12.0 conda activate doublets
crm$detectDoublets(env = "doublets", conda.path = "/opt/software/miniconda/4.12.0/condabin/conda", method = "scrublet")
#Detected 925 possible doublets out of 29127 cells
plot <- crm$plotEmbedding(doublet.method = "scrublet", size = 0.2)+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
rasterize(plot, layers='Point', dpi=1000)
ggsave("Doublets_UMAP.pdf", width=7, height=4.3 )
crm$filterCms(depth.cutoff = 750,
mito.cutoff = 0.07,
doublets = "scrublet",
species = "mouse")
#Removing 3231 of 29127 cells (11.1%)
save filtered count matrix
qsave(crm$cms.filtered, "cms_P28_29_filtered.qs",
nthreads = 10)
pre-processing of each filtered dataset with pagoda2, followed by using conos to build the joint UMAP graph with forced alignment (alignment.strength = 0.3) to integrate samples well
cms <- qread("cms_P28_29_filtered.qs")
#vector with sample names
names = c("full_P28_1",
"full_P28_2",
"pre_P29_1",
"pre_P29_2")
con <- quickConos(cms,
names,
n.cores.p2=10,
n.cores.con=20, get.tsne = F, alignment.strength = 0.3)
con <- con$con
Rerun leiden clustering to find additional clusters
con$findCommunities(method = leiden.community, resolution = 5, min.group.size = 15)
save clusters and conos object
leiden22<- con$clusters$leiden$groups %>% factor
qsave(leiden22, "leiden22.qs")
qsave(con, "con.qs")
For annotation, some of the clusters were collapsed or further separated.