-
Notifications
You must be signed in to change notification settings - Fork 15
/
scrnaseq.Rmd
3295 lines (2697 loc) · 162 KB
/
scrnaseq.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
date: "`r format(Sys.time(), '%B, %Y')`"
geometry: "margin=2cm"
params:
# Author
author: !r Sys.info()[["user"]]
# Project ID
project_id: pbmc
# Path to input data
path_data: !r data.frame(name=c("pbmc_10x","pbmc_smartseq2"),
type=c("10x","smartseq2"),
path=c("test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/", "test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz"),
stats=c(NA, NA))
# Data type ("RNA", "Spatial")
assay_raw: "RNA"
# Downsample data to at most n cells per sample AFTER filtering (mainly for tests)
# NULL to deactivate
downsample_cells_n: NULL
# Downsample all samples equally according to the smallest sample
# TRUE/FALSE
# Overwritten by downsample_cells_n
downsample_cells_equally: FALSE
# Path to output directory
path_out: test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/
# Marker genes based on literature, translated to Ensembl IDs
# xlsx file, one list per column, first row as header and Ensembl IDs below
# NULL if no known marker genes should be plotted
file_known_markers: test_datasets/10x_SmartSeq2_pbmc_GSE132044/known_markers.xlsx
# Contaminant cells
# NULL if not cells are to be excluded
# Otherwise a file with one column, one cell name per line
file_contam_cells: NULL
# Annotation via biomaRt
mart_dataset: hsapiens_gene_ensembl
annot_version: 98
annot_main: !r c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession")
mart_attributes: !r c(c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession"),
c("chromosome_name", "start_position", "end_position", "percentage_gene_gc_content", "gene_biotype", "strand", "description"))
biomart_mirror: NULL
# Prefix for mitochondrial genes
mt: "^MT-"
# Name for doublet scores
# Doublet scores can be provided automatically via metadata.tsv.gz in path_data
# NULL if no doublet scores are provided in metadata
doublets: NULL
doublets_prediction: NULL
#####################
# Filter parameters #
#####################
# Filter for cells
cell_filter: !r list(nFeature_RNA=c(NA, NA), percent_mt=c(NA, NA))
# Filter for features
feature_filter: !r list(min_counts=1, min_cells=3) # feature has to be found by at least one count in one cell
# Samples to drop
# Cells from these samples will be dropped after initial QC
# Example: samples_to_drop = c("pbmc_smartseq2_NC", "pbmc_smartseq2_RNA"),
# where "pbmc_smartseq2" is the name of the dataset, and "NC" and "RNA" are the names of the subsamples
samples_to_drop: NULL
# Drop samples with too few cells
samples_min_cells: 10
############################
# Normalisation parameters #
############################
# Which normalisation should be used for analysis? ("RNA", "SCT")
norm: RNA
# Whether or not to remove cell cycle effects
cc_remove: !r FALSE
# Should all cell cycle effects be removed, or only the difference between profilerating cells (G2M and S phase)?
# Read https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html, for an explanation
cc_remove_all: !r FALSE
# Whether or not to re-score cell cycle effects after data
# from different samples have been merged/integrated
cc_rescore_after_merge: !r TRUE
# Additional (unwanted) variables that will be regressed out for visualisation and clustering
vars_to_regress: NULL
# When there are multiple datasets, how to combine them:
# - method:
# - "single": Default when there is only one dataset after filtering, no integration is needed
# - "merge": Merge (in other words, concatenate) data when no integration is needed,
# e.g. when samples were multiplexed on the same chip.
# - "integrate": Anchors are computed for all pairs of datasets. This will give all datasets the same weight
# during dataset integration but can be computationally intensive
#
# Additional options for the "integrate" method:
#
# - dimensions: Number of dimensions to consider for integration
# - reference: Use one or more datasets as reference and compute anchors for all other datasets.
# Separate multiple datasets by comma. This is computationally faster but less accurate.
# - use_reciprocal_pca: Compute anchors in PCA space. Even more computationally faster but again less accurate
# Recommended for big datasets.
# - k_filter: How many neighbors to use when filtering anchors (default: min(200, minimum number of cells in a sample))
# - k_weight: Number of neighbors to consider when weighting anchors (default: min(100, minimum number of cells in a sample))
# - k_anchor: How many neighbors to use when picking anchors (default: min(5, minimum number of cells in a sample))
# - k_score: How many neighbors to use when scoring anchors (default: min(30, minimum number of cells in a sample))
integrate_samples: !r list(method="integrate", dimensions=30, reference=NULL, use_reciprocal_pca=FALSE)
# TO DISCUSS from Seurat vignette:
# The results show that rpca-based integration is more conservative, and in this case, do not perfectly align a subset of cells (which are naive and memory T cells) across experiments. You can increase the strength of alignment by increasing the k.anchor parameter, which is set to 5 by default. Increasing this parameter to 20 will assist in aligning these populations.
# The number of PCs to use; adjust this parameter based on the Elbowplot
pc_n: 10
# k nearest neighbors to find clusters
# k nearest neighbors to construct the UMAP
# Scanpy uses 15 for both by default
# Seurat uses 20 for cluster_k, and 30 for umap_k by default
cluster_k: 20
umap_k: 30
# Cluster resolutions to compute; lower values will lead to fewer clusters of cells; can be multiple values;
# Empty vector if not needed
cluster_resolution_test: !r c(0.3, 0.7)
# Cluster resolution to use for analysis
cluster_resolution: 0.5
#######################################################
# Marker genes and genes with differential expression #
#######################################################
# Thresholds to define marker genes
marker_padj: 0.05
marker_log2FC: !r log2(2)
marker_pct: 0.25
# Additional (unwanted) variables to account for in statistical tests
latent_vars: NULL
# Contrasts to find differentially expressed genes (R data.frame or Excel file)
# Required columns:
# condition_column: Categorial column in the cell metadata; specify "orig.ident" for sample and "seurat_clusters" for cluster
# condition_group1: Condition levels in group 1, multiple levels concatenated by the plus character
# Empty string = all levels not in group2 (cannot be used if group2 is empty)
# condition_group2: Condition levels in group 2, multiple levels concatenated by the plus character
# Empty string = all levels not in group1 (cannot be used if group1 is empty)
#
# Optional columns:
# subset_column: Categorial column in the cell metadata to subset before testing (default: NA)
# Specify "orig.ident" for sample and "seurat_clusters" for cluster
# subset_group: Further subset levels (default: NA)
# For the individual analysis of multiple levels separate by semicolons
# For the joint analysis of multiple levels concatenate by the plus character
# For the individual analysis of all levels empty string ""
# assay: Seurat assay to test on; can also be a Seurat dimensionality reduction (default: "RNA")
# slot: In case assay is a Seurat assay object, which slot to use (default: "data")
# padj: Maximum adjusted p-value (default: 0.05)
# log2FC: Minimum absolute log2 fold change (default: 0)
# min_pct: Minimum percentage of cells expressing a gene to test (default: 0.1)
# test: Type of test; "wilcox", "bimod", "roc", "t", "negbinom", "poisson", "LR", "MAST", "DESeq2"; (default: "wilcox")
# downsample_cells_n: Downsample each group to at most n cells to speed up tests (default: NA)
# latent_vars: Additional variables to account for; multiple variables need to be concatenated by semicolons; will overwrite the default by param$latent_vars (default: none).
deg_contrasts: !r data.frame(condition_column=c("orig.ident", "orig.ident", "Phase"),
condition_group1=c("pbmc_10x", "pbmc_10x", "G1"),
condition_group2=c("pbmc_smartseq2_sample1", "pbmc_smartseq2_sample1", "G2M"),
subset_column=c(NA, "seurat_clusters", "seurat_clusters"),
subset_group=c(NA, "", "1;2"),
downsample_cells_n=c(NA, 50, 30))
# Enrichr site ("Enrichr", "FlyEnrichr", "WormEnrichr", "YeastEnrichr", "FishEnrichr")
enrichr_site: "Enrichr"
# P-value threshold for functional enrichment tests
enrichr_padj: 0.05
# Enrichr databases of interest
enrichr_dbs: !r c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018", "Azimuth_Cell_Types_2021", "CellMarker_Augmented_2021", "Descartes_Cell_Types_and_Tissue_2021")
######################
# General parameters #
######################
# Main colour to use for plots
col: "darkblue"
# Colour palette used for samples
col_palette_samples: "ggsci::pal_jama"
# Colour palette used for cluster
col_palette_clusters: "ggsci::pal_igv"
# Path to git repository
path_to_git: "."
# Debugging mode:
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions
debugging_mode: "default_debugging"
# Number of cores to use
cores: 8
output:
html_document:
toc: true
toc_depth: 2
toc_float: true
code_folding: "hide"
highlight: "tango"
theme: "paper"
bibliography: "`r file.path(params$path_out, 'references.bib')`"
title: "Single-cell RNA-seq data analysis of project `r params$project_id`"
author: "`r params$author`"
---
```{r setup, warning=FALSE, message=FALSE}
# R Options
options(stringsAsFactors=FALSE,
citation_format="pandoc",
dplyr.summarise.inform=FALSE,
knitr.table.format="html",
kableExtra_view_html=TRUE,
future.globals.maxSize=+Inf,
mc.cores=params$cores,
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
# Python3 needed for clustering, umap, other python packages
# Path to binary will be automatically found
# Set manually if it does not work
reticulate_python3_path = unname(Sys.which("python3"))
Sys.setenv(RETICULATE_PYTHON=reticulate_python3_path)
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for "leiden" clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat
# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix, sctransform, glmGamPoi (optional for speed but only available for R 4.0)
# Tables: kableExtra, DT
# Plots: ggsci
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast, limma (for a more efficient implementation of the Wilcoxon Rank Sum Test according to Seurat)
# Functional enrichment: enrichR
# Other: sessioninfo, cerebroApp, sceasy, ROpenSci/bibtex, knitcitations
# Knitr default options
knitr::opts_chunk$set(echo=TRUE, # output code
cache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source="fold-hide", # by default collapse code blocks
dev=c("png", "pdf"), # create figures in png and pdf; the first device (png) will be used for HTML output
dev.args=list(png=list(type="cairo"), # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
pdf=list(bg="white")), # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
dpi=96, # figure resolution
fig.retina=2 # retina multiplier
)
# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
invisible(knitcitations::citep(citation("knitr")))
```
```{r css, results="asis"}
cat(paste0('<link rel="stylesheet" href="', params$path_to_git, '/css/style.css">'))
```
```{r preparation_and_initial_checks}
# Copy input parameters to internal parameter list
param = params
if (is.null(param$samples_to_drop )) param$samples_to_drop = c()
if (is.null(param$vars_to_regress)) param$vars_to_regress = c()
if (is.null(param$latent_vars)) param$latent_vars = c()
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_io.R",
"functions_plotting.R",
"functions_analysis.R",
"functions_degs.R",
"functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))
# Debugging mode:
switch (param$debugging_mode,
default_debugging=on_error_default_debugging(),
terminal_debugger=on_error_start_terminal_debugger(),
print_traceback=on_error_just_print_traceback(),
on_error_default_debugging())
# Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)
# Create output directories
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "marker_degs"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "annotation"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "export"), recursive=TRUE, showWarnings=FALSE)
# Path for figures in png and pdf format (trailing "/" is needed)
knitr::opts_chunk$set(fig.path=paste0(file.path(param$path_out, "figures"), "/"))
# Path for annotation
param$file_annot = file.path(param$path_out, "annotation", paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))
# Path for cell cycle genes file
param$file_cc_genes = file.path(param$path_out, "annotation", "cell_cycle_markers.xlsx")
# Do checks
error_messages = c()
# Check parameters and parse entries (e.g. numbers) so that they are valid
param = check_parameters_scrnaseq(param)
error_messages = c(error_messages, param[["error_messages"]])
# Check installed packages
error_messages = c(error_messages, check_installed_packages_scrnaseq())
# Check python
error_messages = c(error_messages, check_python())
# Check pandoc
error_messages = c(error_messages, check_pandoc())
# Check enrichR
error_messages = c(error_messages, check_enrichr(param$enrichr_dbs, param$enrichr_site))
# Check ensembl
error_messages = c(error_messages, check_ensembl(biomart="ensembl",
dataset=param$mart_dataset,
mirror=param$biomart_mirror,
version=param$annot_version,
attributes=param$mart_attributes,
file_annot=param$file_annot,
file_cc_markers=param$file_cc_genes))
# Stop here if there are errors
if (length(error_messages)) stop(paste(c("", paste("*", error_messages)), collapse="\n"))
```
# Read data
The workflow can be run for pre-processed 10x data, as well as for pre-processed SmartSeq-2 or other data that are represented by a simple table with transcript counts per gene and cell.
## Read and print general metrics
Prior to this workflow, raw sequencing reads were mapped to the reference transcriptome. The resulting mapping statistics are printed below for a first estimation of sample quality.
<details>
<summary>Pre-processing of 10x data with Cell Ranger</summary>
We use the 10x Genomics Cell Ranger software to map 10x sequencing reads. The result is a count matrix that contains the number of unique transcripts per gene (rows) and cell (columns). To save storage space, the matrix is converted into a condensed format and described by the following 3 files:
<ul>
<li> “features.tsv.gz”: Tabular file with gene annotation (Ensembl gene ID and the gene symbol) to identify genes </li>
<li> “barcodes.tsv.gz”: File with cell barcodes to identify cells </li>
<li> “matrix.mtx.gz”: A condensed version of the count matrix </li>
</ul>
</details>
<details>
<summary>Pre-processing of SmartSeq-2 data</summary>
Sequencing reads from SmartSeq-2 (and other) experiments can be mapped with any suitable mapping software as long as a count matrix is generated that contains the number of mapped reads per gene (rows) and cell (columns). The first row of the count matrix should contain cell barcodes or names, the first column of the matrix should contain Ensembl gene IDs.
</details>
```{r general_metrics, message=TRUE}
# Are metrics provided?
if (!all(is.na(param$path_data$stats))) {
# Loop through all samples and read metrics
mapping_stats_list = list()
for (i in 1:nrow(param$path_data)) {
if (!is.na(param$path_data$stats[i])) {
mapping_stats_list[[param$path_data$name[i]]] = Read10XMetrics(param$path_data$stats[i])
}
}
# Join all mapping stats tables
mapping_stats = mapping_stats_list %>% purrr::reduce(dplyr::full_join, by="metric")
rownames(mapping_stats) = mapping_stats[["metric"]]
mapping_stats = mapping_stats %>% dplyr::select(-metric)
colnames(mapping_stats) = names(mapping_stats_list)
# Print table to HTML
knitr::kable(mapping_stats, align="l", caption="General metrics") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", fixed_thead=TRUE)
} else {
message("Mapping statistics cannot be shown. No valid file provided.")
}
```
## Read gene annotation
We read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names.
```{r read_ensembl_annotation}
# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
annot_ensembl = read.delim(param$file_annot)
} else {
annot_mart = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset=param$mart_dataset,
mirror=param$biomart_mirror,
version=param$annot_version))
annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes, useCache=FALSE)
write.table(annot_ensembl, file=param$file_annot, sep="\t", col.names=TRUE, row.names=FALSE, append=FALSE)
message("Gene annotation file was created at: ", param$file_annot)
# Note: depending on the attributes, there might be more than one row per gene
}
# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Create translation tables
ensembl = param$annot_main["ensembl"]
symbol = param$annot_main["symbol"]
entrez = param$annot_main["entrez"]
# Ensembl id to gene symbol (new)
ensembl_to_symbol = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol[, symbol] = ifelse(nchar(ensembl_to_symbol[, symbol]) == 0, NA, ensembl_to_symbol[, symbol])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])
# Ensembl id to seurat-compatible unique rowname (new)
ensembl_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname[, symbol] = ifelse(nchar(ensembl_to_seurat_rowname[, symbol]) == 0, ensembl_to_seurat_rowname[, ensembl], ensembl_to_seurat_rowname[, symbol])
ensembl_to_seurat_rowname[, symbol] = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])
# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)
# Ensembl to Entrez
ensembl_to_entrez = unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez[, entrez] = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])
# Seurat-compatible unique rowname to Entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})
# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
annot_ensembl = annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
rownames(annot_ensembl) = annot_ensembl[, ensembl]
```
```{r read_cc_genes}
# Use biomart to translate human cell cycle genes to the species of interest and save them in a file
if (file.exists(param$file_cc_genes)) {
# Load from file
genes_s = openxlsx::read.xlsx(param$file_cc_genes, sheet=1)
genes_g2m = openxlsx::read.xlsx(param$file_cc_genes, sheet=2)
} else {
# Obtain from Ensembl
# Note: both mart objects must point to the same mirror for biomarT::getLDS to work
mart_human = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset="hsapiens_gene_ensembl",
mirror=param$biomart_mirror,
version=param$annot_version))
mart_myspecies = suppressWarnings(GetBiomaRt(biomart="ensembl",
dataset=param$mart_dataset,
mirror=GetBiomaRtMirror(mart_human),
version=param$annot_version))
# S phase marker
genes_s = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"),
filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$s.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id", "external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_s) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
genes_s = genes_s %>% dplyr::arrange(Human_gene_name)
# G2/M marker
genes_g2m = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"),
filters="external_gene_name",
values=Seurat::cc.genes.updated.2019$g2m.genes,
mart=mart_human,
attributesL=c("ensembl_gene_id", "external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
colnames(genes_g2m) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
genes_g2m = genes_g2m %>% dplyr::arrange(Human_gene_name)
# Write to file
openxlsx::write.xlsx(list(S_phase=genes_s,G2M_phase=genes_g2m), file=param$file_cc_genes)
}
# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))
```
## Read scRNA-seq data
We next read the single-cell RNA-seq data into a Seurat object.
<details>
<summary>What is Seurat and which information is contained in a Seurat object?</summary>
Seurat is an R-package that is used for the analysis of single-cell RNA-seq data. We read pre-processed data as described above, and convert it to a count matrix in R. In the case of 10x data, the count matrix contains the number of unique RNA molecules (UMIs) per gene and cell. In the case of SmartSeq-2 data, the count matrix contains the number of reads per gene and cell.
In addition to the count matrix, the workflow stores additional information in the Seurat object, including but not limited to the normalized data, dimensionality reduction and cluster results.
</details>
```{r read_datasets}
# List of Seurat objects
sc = list()
datasets = param$path_data
for (i in seq(nrow(datasets))) {
name = datasets[i, "name"]
type = datasets[i, "type"]
path = datasets[i, "path"]
suffix = datasets[i, "suffix"]
# Read 10X or smartseq2
if (type == "10x") {
# Read 10X sparse matrix into a Seurat object
sc = c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
} else if (type == "smartseq2") {
# Read counts table into a Seurat object
sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
}
}
# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
sample_names = names(sc)
duplicated_sample_names_idx = which(sample_names %in% sample_names[duplicated(sample_names)])
for (i in duplicated_sample_names_idx) {
sample_names[i] = paste(head(sc[[i]][["orig.dataset", drop=TRUE]], 1), sample_names[i], sep=".")
sc[[i]][["orig.ident"]] = sample_names[i]
}
# Set up colors for samples and add them to the sc objects
sample_names = purrr::flatten_chr(purrr::map(sc, function(s) {
nms = unique(as.character(s[[]][["orig.ident"]]))
return(nms)
}))
param$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
sc = purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")
#
### Add conditions per sample here if necessary
#
sc
# Remove contaminant cells if needed
if (!is.null(param$file_contam_cells) && file.exists(param$file_contam_cells)) {
contam_cells = read.delim(param$file_contam_cells, header=FALSE)[,1]
message("Read ", length(contam_cells), " names of contaminant cells.")
n_contam_cells = 0
for (i in seq(sc)) {
bool_contam_cells = colnames(sc[[i]]) %in% contam_cells
n_contam_cells = n_contam_cells + sum(bool_contam_cells)
names_contam_cells = colnames(sc[[i]])[!bool_contam_cells]
sc[[i]] = subset(sc[[i]], cells=names_contam_cells)
}
message("Removed ", n_contam_cells, " contaminant cells from the dataset.")
sc
}
# Rename doublet scores
for (i in seq(sc)) {
if (!is.null(param$doublets)) {
sc[[i]]@meta.data = dplyr::rename(sc[[i]][[]],
DoubletScore=param$doublets,
IsDoublet=param$doublets_prediction)
}
}
```
The following first table shows available metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to ("orig.ident"), the number of mapped reads (“nCounts_RNA”), or the above mentioned number of unique genes detected ("`r paste0("nFeature_", param$assay_raw)`").
The second table shows available metadata (columns) of the first 5 genes (rows).
```{r metadata}
# Combine cell metadata of the Seurat objects into one big metadata
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s) return(s[[]])) %>% as.data.frame())
# Print cell metadata
knitr::kable(head(sc_cell_metadata), align="l", caption="Cell metadata, top 5 rows") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")
# Print gene metadata
knitr::kable(head(sc[[1]][[param$assay_raw]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%")
```
# Pre-processing
The steps below represent a standard pre-processing workflow for single-cell RNA-seq data, including quality control, the respective filtering of cells and genes, data normalization and scaling, and the detection of highly variable genes.
<details>
<summary>Why is pre-processing so important?</summary>
Cells may have been damaged during sample preparation and might be only partially captured in the sequencing. In addition, free-floating mRNA from damaged cells can be encapsulated, adding to the background noise. The low-quality cells and free-floating mRNA interfere with downstream analyses. Therefore, cells and genes are filtered based on defined quality metrics. Data normalization eliminates cell-specific biases such as the absolute number of reads per cell, allowing us to systematically compare cells afterwards. Subsequent scaling corrects for the fact that genes have different lengths, allowing us to compare genes afterwards. Lastly, highly variable genes are determined, reducing computational overhead and noise from genes that are not interesting.
</details>
## Quality control
We start the analysis by removing unwanted cells from the dataset(s). Three commonly used QC metrics include the number of unique genes detected in each cell ("`r paste0("nFeature_", param$assay_raw)`"), the total number of molecules detected in each cell ("`r paste0("nCount_", param$assay_raw)`"), and the percentage of counts that map to the mitochrondrial genome ("percent_mt"). If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown ("percent_ercc").
<details>
<summary>Which cells can be considered to be of low quality?</summary>
On the one hand, low-quality cells such as dying, degraded and damaged cells, and empty droplets will often have very few genes ("nFeature_RNA") and low total counts ("nCount_RNA"). On the other hand, cell doublets may show an aberrantly high number of genes. Since the total number of reads detected within a cell typically strongly correlates with the number of unique genes, we can focus on the number of unique genes for filtering. In addition, damaged cells often exhibit high mitochondrial ("percent_mt") or spike-in ("percent_ercc") content. As mitochondria have their own membranes, their RNA is often the last to degrade in damaged cells and can thus be found in high numbers. However, it is important to keep in mind that different cell types and cells isolated from various species may differ in their number of expressed genes and metabolism. For example, stem cells may express a higher number of unique genes, and metabolically active cells may express a higher number of mitochondrial genes.
</details>
<details>
<summary>Impact of low-quality cells on downstream analyses</summary>
First of all, low-quality cells of different cell types might cluster together due to similarities in their damage-induced gene expression profiles. This leads to artificial intermediate states or trajectories between subpopulations that would otherwise be distinct. Second, low-quality cells might mask relevant gene expression changes. The main differences captured by the first principal components will likely be based on cell quality rather than the underlying biology, reducing the power of dimensionality reduction. Likewise, highly variable genes might just be genes that differ between low- and high-quality cells. Lastly and equally important, the low number of total counts in low-quality cells might lead to artificial upregulation of genes due to wrong scaling effects during the normalisation process.
</details>
```{r qc_criteria_create}
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$cell_filter)) {
return(param$cell_filter[[s]])
} else {
return(param$cell_filter)
}
})
param$feature_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$feature_filter)) {
return(param$feature_filter[[s]])
} else {
return(param$feature_filter)
}
})
```
```{r qc_calculate_cells}
# Calculate percentage of counts in mitochondrial genes for each Seurat object
sc = purrr::map(sc, function(s) {
mt_features = grep(pattern=param$mt, rownames(s), value=TRUE)
return(Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay=param$assay_raw))
})
# Calculate percentage of counts in ribosomal genes for each Seurat object
sc = purrr::map(sc, function(s) {
ribo_features = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
return(Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay=param$assay_raw))
})
# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
return(s)
})
# Combine (again) cell metadata of the Seurat objects into one big metadata, this time including mt and ercc
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ return(s[[]]) }) %>% as.data.frame())
```
```{r qc_calculate_features}
# Only RNA assay at the moment
# counts_median uses sapply on the counts matrix, which converts the sparse matrix into a normal matrix
# This might have to be adapted in future (Sparse Matrix Apply Function)
sc = purrr::map(list_names(sc), function(n) {
# Calculate percentage of counts per gene in a cell
counts_rna = Seurat::GetAssayData(sc[[n]], slot="counts", assay=param$assay_raw)
total_counts = sc[[n]][[paste0("nCount_", param$assay_raw), drop=TRUE]]
counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100
# Calculate feature filters
num_cells_expr = Matrix::rowSums(counts_rna >= 1)
num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
# Calculate median of counts_rna_perc per gene
counts_median = sparseMatrixStats::rowMedians(counts_rna_perc)
# Add all QC measures as metadata
sc[[n]][[param$assay_raw]] = Seurat::AddMetaData(sc[[n]][[param$assay_raw]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
return(sc[[n]])
})
```
```{r qc_plot_cells, fig.height=10}
# QC metrics to plot for cells
cell_qc_features = c(paste0(c("nFeature_", "nCount_"), param$assay_raw), "percent_mt")
if ("DoubletScore" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "DoubletScore")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "percent_ercc")
cell_qc_features = values_to_names(cell_qc_features)
# Get filter thresholds per QC metrics (for plotting)
cell_qc_thresholds = purrr::map(cell_qc_features, function(m) {
tresh = purrr::map_dfr(names(param$cell_filter), function(n) {
if (m %in% names(param$cell_filter[[n]])) {
t = data.frame(orig.ident=n, min=param$cell_filter[[n]][[m]][1], max=param$cell_filter[[n]][[m]][2]) %>%
tidyr::pivot_longer(cols=2:3, names_to=c("threshold")) %>%
dplyr::filter(!is.na(value))
t$threshold = factor(t$threshold, levels=c("min", "max"))
return(t)
}
})
})
# Now create plot per QC metric
p_list = list()
for (m in cell_qc_features) {
p_list[[m]]= ggplot(sc_cell_metadata[, c("orig.ident", m)], aes(x=orig.ident, y=!!sym(m), fill=orig.ident, group=orig.ident)) +
geom_violin(scale="width")
# Adds points for samples with less than three cells since geom_violin does not work here
p_list[[m]] = p_list[[m]] +
geom_point(data=sc_cell_metadata[, c("orig.ident", m)] %>% dplyr::filter(orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes(x=orig.ident, y=!!sym(m), fill=orig.ident), shape=21, size=2)
# Now add style
p_list[[m]] = p_list[[m]] +
AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1))
# Add filter threshold as segments to plot; min threshold lines are dashed and max threshold lines are twodashed
if (nrow(cell_qc_thresholds[[m]]) > 0) {
p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]],
aes(x=as.integer(as.factor(orig.ident))-0.5,
xend=as.integer(as.factor(orig.ident))+0.5,
y=value, yend=value, lty=threshold), colour="firebrick") +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
}
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values")
p
```
```{r qc_plot_correlation}
# Correlate QC metrics for cells
p_list = list()
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))
# nFeature vs nCount
m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p_list[[1]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=orig.ident)) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[1]] = p_list[[1]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[1]] = p_list[[1]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
# nFeature vs percent_mt
m = c("percent_mt", paste0(c("nFeature_"), param$assay_raw))
p_list[[2]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=orig.ident)) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[2]] = p_list[[2]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[2]] = p_list[[2]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
# nFeature vs percent_ercc (if available)
if ("percent_ercc" %in% names(cell_qc_features)) {
m = c("percent_ercc", paste0(c("nFeature_"), param$assay_raw))
p_list[[3]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=!!sym(m[1]), y=!!sym(m[2]), colour=orig.ident)) +
geom_point() +
scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
p_list[[3]] = p_list[[3]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
p_list[[3]] = p_list[[3]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
}
# Combine plots
p = patchwork::wrap_plots(p_list, ncol=length(p_list)) + patchwork::plot_annotation("Features plotted against each other")
if (length(p_list) == 1) {
p = p & theme(legend.position="bottom")
} else {
p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
p
```
## Genes with highest expression
We next investigate whether there are individual genes that are represented by an unusually high number of counts. For each cell, we first calculate the percentage of counts per gene. Subsequently, for each gene, we calculate the median value of these percentages in all cells. Genes with the highest median percentage of counts are plotted below.
```{r plot_highest_expression}
# Plot only samples that we intend to keep
sc_names = names(sc)[!(names(sc) %in% param$samples_to_drop)]
genes_highestExpr = lapply(sc_names, function(i) {
top_ten_exp = sc[[i]][[param$assay_raw]][["counts_median"]] %>% dplyr::arrange(dplyr::desc(counts_median)) %>% head(n=10)
return(rownames(top_ten_exp))
}) %>%
unlist() %>%
unique()
genes_highestExpr_counts = purrr::map_dfc(sc[sc_names], .f=function(s) s[[param$assay_raw]][["counts_median"]][genes_highestExpr, ])
genes_highestExpr_counts$gene = genes_highestExpr
genes_highestExpr_counts = genes_highestExpr_counts %>% tidyr::pivot_longer(cols=all_of(sc_names))
genes_highestExpr_counts$name = factor(genes_highestExpr_counts$name, levels=sc_names)
col = GenerateColours(num_colours=length(genes_highestExpr), names=genes_highestExpr, palette="ggsci::pal_simpsons")
p = ggplot(genes_highestExpr_counts, aes(x=name, y=value, col=gene, group=gene)) +
geom_point() +
AddStyle(title="Top 10 highest expressed genes per sample, added into one list",
xlab="Sample", ylab="Median % of raw counts\n per gene in a cell",
legend_position="bottom",
col=col)
if (length(unique(genes_highestExpr_counts$name))>1) p = p + geom_line()
p
```
## Filtering
Cells and genes are filtered based on the following thresholds:
```{r filter_print_cutoffs}
cell_filter_lst = param$cell_filter %>% unlist(recursive=FALSE)
is_numeric_filter = purrr::map_lgl(cell_filter_lst, function(f) return(is.numeric(f) & length(f)==2))
# numeric cell filters
if (length(cell_filter_lst[is_numeric_filter]) > 0) {
purrr::exec(rbind, !!!cell_filter_lst[is_numeric_filter]) %>%
knitr::kable(align="l", caption="Numeric filters applied to cells", col.names=c("Min", "Max")) %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
# categorial cell filters
if (length(cell_filter_lst[!is_numeric_filter]) > 0) {
purrr::exec(rbind, !!!cell_filter_lst[!is_numeric_filter] %>% purrr::map(paste, collapse=",")) %>%
knitr::kable(align="l", caption="Categorial filters applied to cells", col.names=c("Values")) %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
# gene filters
feature_filter_lst = param$feature_filter %>% unlist(recursive=FALSE)
if (length(feature_filter_lst) > 0) {
purrr::exec(rbind, !!!feature_filter_lst) %>%
knitr::kable(align="l", caption="Filters applied to genes", col.names=c("Value")) %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
```
The number of excluded cells and features is as follows:
```{r filter_cells}
# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
sc_cells_to_exclude = purrr::map(list_names(sc), function(n) {
filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
filter = param$cell_filter[[n]][[f]]
if (is.numeric(filter)) {
if (is.na(filter[1])) filter[1] = -Inf # Minimum
if (is.na(filter[2])) filter[2] = Inf # Maximum
idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
return(names(which(idx_exclude)))
} else if (is.character(filter)) {
idx_exclude = !sc[[n]][[f, drop=TRUE]] %in% filter
return(Cells(sc[[n]])[idx_exclude])
}
})
# Samples to drop
if (n %in% param$samples_to_drop) {
filter_result[["samples_to_drop"]] = colnames(sc[[n]])
} else {
filter_result[["samples_to_drop"]] = as.character(c())
}
# Minimum number of cells for a sample to keep
if (ncol(sc[[n]]) < param$samples_min_cells) {
filter_result[["samples_min_cells"]] = colnames(sc[[n]])
} else {
filter_result[["samples_min_cells"]] = as.character(c())
}
return(filter_result)
})
# Summarise
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
return(as.data.frame(purrr::map(s, length)))
})
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)
sc_cells_to_exclude_summary$Original = purrr::map_int(sc, ncol)
sc_cells_to_exclude_summary$Excluded = purrr::map_int(sc_cells_to_exclude, function(s) { return(purrr::flatten(s) %>% unique() %>% length())})
sc_cells_to_exclude_summary$PercKept = round((sc_cells_to_exclude_summary$Original - sc_cells_to_exclude_summary$Excluded) / sc_cells_to_exclude_summary$Original * 100, 2)
knitr::kable(sc_cells_to_exclude_summary,
align="l",
caption="Summary of excluded cells") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
# Add filter column to sc_cell_metadata for post-filtering QC
sc_cell_metadata$IS_FILTERED = rownames(sc_cell_metadata) %in% unlist(sc_cells_to_exclude)
# Now filter, drop the respective colours and adjust integration method
sc = purrr::map(list_names(sc), function(n) {
cells_to_keep = Cells(sc[[n]])
cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
if (length(cells_to_keep)==0) return(NULL)
else return(subset(sc[[n]], cells=cells_to_keep))
}) %>% purrr::discard(is.null)
if (length(sc)==1) param$integrate_samples[["method"]] = "single"
```
```{r filter_features}
# Only RNA assay at the moment
# Iterate over samples and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
# Make sure the sample contains more cells than the minimum threshold
if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
# Return gene names that do not pass the minimum threshold
else return(names(which(sc[[n]][[param$assay_raw]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})
# Which genes are to be filtered for all samples?
# Note: Make sure that no other sample is called "AllSamples"
sc_features_to_exclude$AllSamples = Reduce(f=intersect, x=sc_features_to_exclude)
# Summarise
sc_features_to_exclude_summary = purrr::map_dfr(names(sc), function(n){
df = data.frame(Original=nrow(sc[[n]]),
FailThreshold=length(sc_features_to_exclude[[n]]))
df$PercFailThreshold = round(df$FailThreshold / df$Original * 100, 2)
df$Kept = length(setdiff(rownames(sc[[n]]), sc_features_to_exclude[["AllSamples"]]))
df$PercKept = round(df$Kept / df$Original * 100, 2)
return(df)
})
rownames(sc_features_to_exclude_summary) = names(sc)
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Summary of excluded genes") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
# Now filter those genes that are to be filtered for all samples
sc = purrr::map(list_names(sc), function(n) {
assay_names = Seurat::Assays(sc[[n]])
features_to_keep = purrr::map(values_to_names(assay_names), function(a) {
features = rownames(sc[[n]][[a]])
keep = features[!features %in% sc_features_to_exclude$AllSamples]
return(keep)
})
return(subset(sc[[n]], features=purrr::flatten_chr(features_to_keep)))
})
```
After filtering, the size of the Seurat object is:
```{r filter_size_after}
sc
```
## Quality control post filtering
The updated QC plots are:
```{r qc_plot_cells_afterFiltering, fig.height=10}
# Now create plot per QC metric after filtering
p_list = list()
for (m in cell_qc_features) {
p_list[[m]] = ggplot(sc_cell_metadata[, c("orig.ident", m, "IS_FILTERED")] %>% dplyr::filter(!IS_FILTERED), aes(x=orig.ident, y=!!sym(m), fill=orig.ident, group=orig.ident)) +
geom_violin(scale="width")