-
Notifications
You must be signed in to change notification settings - Fork 15
/
scrnaseq_hto.Rmd
644 lines (540 loc) · 28.3 KB
/
scrnaseq_hto.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
---
date: "`r format(Sys.time(), '%B, %Y')`"
geometry: margin=2cm
output:
html_document:
toc: true
toc_depth: 4
toc_float: true
code_folding: hide
highlight: tango
theme: paper
params:
# Author
author: !r Sys.info()[["user"]]
# Project ID
project_id: HTO_testDataset
# Input directory
path_data: test_datasets/10x_pbmc_hto_GSE108313/counts
# Metrics summary
stats:
# Output directory
path_out: test_datasets/10x_pbmc_hto_GSE108313/demultiplexed
# HTO names
# HTOs have an ID that is included in the "features.tsv" input file
# We additionally ask for readable names that are used throughout this report (optional)
# This could look like this, where HTO1-3 are the IDs included raw dataset
# - without names: 'hto.names: c("HTO1", "HTO2", "HTO3")'
# - with names: 'hto.names: !r setNames(c("NameA", "NameB", "NameC"), c("HTO1", "HTO2", "HTO3"))'
#
hto_names: !r setNames(c("htoA","htoB","htoC","htoD","htoE","htoF","htoG","htoH"), c("htoA","htoB","htoC","htoD","htoE","htoF","htoG","htoH"))
# HTO regular expression (alternative to HTO names)
# Identify HTOs via a regular expression. Names cannot be provided here.
# If specified will overrule hto_names
hto_regex: NULL
# HTO normalisation: CLR (default) or LogNormalize
norm: "CLR"
# Prefix of mitochondrial genes
mt_names: "^MT-"
# Main color to use for plots
col: "palevioletred"
# Downsample data to at most N cells (mainly for tests); set to NULL to deactivate
downsample_cells_n: NULL
# Path to git
path_to_git: "."
# Debugging mode:
# "default_debugging" for default, "terminal_debugger" for debugging without X11, "print_traceback" for non-interactive sessions
debugging_mode: "default_debugging"
title: "Single-cell RNA-seq data analysis of project `r params$project_id`"
subtitle: "Demultiplexing with hashtag oligos"
author: "`r params$author`"
---
```{r setup, message=FALSE, warning=FALSE}
# R Options
options(stringsAsFactors=FALSE,
"citation_format"="pandoc",
dplyr.summarise.inform=FALSE,
knitr.table.format="html",
future.globals.maxSize=2000000000, mc.cores=4,
future.fork.enable=TRUE, future.plan="multicore",
future.rng.onMisuse="ignore")
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr
# Tables: kableExtra
# Plots: ggsci
# Knitr default options
knitr::opts_chunk$set(echo=TRUE, # output code
cache=FALSE, # do not cache results
message=TRUE, # show messages
warning=TRUE, # show warnings
tidy=FALSE, # do not auto-tidy-up code
fig.width=10, # default fig width in inches
class.source="fold-hide", # by default collapse code blocks
dev=c("png", "pdf"), # create figures in png and pdf; the first device (png) will be used for HTML output
dev.args=list(png=list(type="cairo"), # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
pdf=list(bg="white")), # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
dpi=96, # figure resolution (note: final = dpi*fig.retina; will not work with pandoc see below)
fig.retina=2 # retina multiplier for HTML output (note: final = dpi*fig.retina)
)
```
```{r preparation_and_initial_checks}
# Path for figures in png and pdf format
knitr::opts_chunk$set(fig.path=paste(params$path_out, "figures/", sep="/"))
# When using pandoc, setting the dpi via opts_chunk does not work (https://github.com/yihui/knitr/issues/901)
# As workaround, we define a function that takes care of that
#knitr::opts_hooks$set(dpi = function(options) {options$dpi = 40;options})
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("R/functions_io.R",
"R/functions_plotting.R",
"R/functions_util.R")
git_files_to_source = paste(params$path_to_git, git_files_to_source, sep="/")
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:",paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", params$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))
# Debugging mode:
# "default_debugging" for default, "terminal_debugger" for debugging without X11, "print_traceback" for non-interactive sessions
invisible(switch (params$debugging_mode,
default_debugging=on_error_default_debugging(),
terminal_debugger=on_error_start_terminal_debugger(),
print_traceback=on_error_just_print_traceback(),
on_error_default_debugging()))
# Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)
# Create output directories
if (!file.exists(params$path_out)) dir.create(params$path_out, recursive=TRUE, showWarnings=FALSE)
# Do checks
error_messages = c()
# Check installed packages
#error_messages = c(error_messages, check_installed_packages())
# Check parameters TODO
#error_messages = c(error_messages, check_parameters(param))
```
```{r initial_checks_failed, eval=length(error_messages)>0, include=length(error_messages)>0}
# Now format collected error messages as error boxes, print and exit
# Note: asis_output prints a content as is; knit_exist toggles knitr to exit after the current chunk
error_message_blocks = purrr::map(error_messages, format_error) %>% paste(collapse="")
knitr::knit_exit()
knitr::asis_output(error_message_blocks)
```
# Read input data
In this first section of the report, we begin by printing mapping statistics that have been produced prior to this workflow.
```{r mapping_stats, message=TRUE, results="asis"}
# Are statistics provided?
if (!is.null(params$stats)) {
cat("\n## Stats {.tabset}\n\n")
mapping_stats = read.delim(params$stats,sep=",", header=FALSE, check.names=FALSE) %>% t() %>% as.data.frame()
colnames(mapping_stats) = c("V1","Dataset")
rownames(mapping_stats) = mapping_stats$V1
mapping_stats$V1 = NULL
mapping_stats_general = mapping_stats[!grepl("^Antibody:",rownames(mapping_stats)),,drop=FALSE]
mapping_stats_antibodies = mapping_stats[grepl("^Antibody:",rownames(mapping_stats)),,drop=FALSE]
cat("\n### General\n\n")
print(knitr::kable(mapping_stats_general, align="l", caption="General statistics") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")))
cat("\n### Antibodies\n\n")
print(knitr::kable(mapping_stats_antibodies, align="l", caption="Antibody statistics") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")))
} else {
message("Mapping statistics cannot be shown. No valid file provided.")
}
```
## Dataset
Then, we read 10X data from the files produced by Cell Ranger:
* **barcodes.tsv.gz**: All cell barcodes
* **features.tsv.gz**: (Ensembl) ID, name, and type for each feature
* **matrix.mtx.gz**: Counts for all features
and setup a Seurat object. This object includes different data types in separate assays:
* **Gene Expresssion** in assay **RNA**
* **Antibody Capture** in **HTO** and **ADT**,
* **CRISPR Guide Capture** in **Crispr**, and
* **Custom** in **Custom**.
Note that "Antibody Capture" features can correspond to "HTO" and "ADT" and are distinguished based on provided HTO names.
```{r hto_read}
# Load the dataset with its assays and create a Seurat object; pass hto_names (or hto_regex if set) so that HTO will be a separate assay
sc = ReadSparseMatrix(params$path_data, project=params$project_id, row_name_column=1, hto_names=params$hto_names, hto_regex=params$hto_regex)
sc = sc[[1]]
# If requested: sample at most n cells
if (!is.null(params$downsample_cells_n)) {
sampled_barcodes = sample(Seurat::Cells(sc), min(params$downsample_cells_n, length(Seurat::Cells(sc))))
sc = subset(sc, cells=sampled_barcodes)
}
# Remember original assay names
original_assay_names = setdiff(Seurat::Assays(sc), "HTO")
# Set colours
hto_samples = rownames(sc[["HTO"]])
hto_colours = list()
hto_colours$col_hto_global = ggsci::pal_npg()(3)
names(hto_colours$col_hto_global) = c("Negative", "Doublet", "Singlet")
hto_colours$col_hto_collapsed = ggsci::pal_npg()(length(hto_samples) + 2)
hto_levels = c("Negative", "Doublet", hto_samples[1:length(hto_samples)])
names(hto_colours$col_hto_collapsed) = hto_levels
```
## HTO counts
Sometimes demultiplexing fails, because some HTOs generally do not perform well, or some cells end up with zero HTO counts. To assure the script can run through, we remove HTOs and cells with zero counts.
```{r hto_counts, results="asis"}
# Check for HTOs with 0 HTO counts, and remove them if necessary
hto_counts = Seurat::GetAssayData(sc, assay="HTO", slot="counts")
hto_counts_row = hto_counts %>% Matrix::rowSums() %>% as.data.frame()
colnames(hto_counts_row) = "HtoCounts"
if (any(hto_counts_row$HtoCounts == 0)) {
sc[["HTO"]] = subset(sc[["HTO"]], features=rownames(which(hto_counts_row$HtoCounts > 0)))
warning("Discarded ", sum(hto_counts_row$HtoCounts == 0), " HTOs with 0 HTO counts.")
}
# Check for cells with 0 HTO counts, and remove them if necessary
hto_counts_col = hto_counts %>% Matrix::colSums() %>% summary() %>% as.matrix() %>% as.data.frame()
colnames(hto_counts_col) = "HtoCounts"
if (any(hto_counts_col$HtoCounts == 0)) {
sc = subset(sc, cells=rownames(which(hto_counts_col[, "HtoCounts"] > 0)))
warning("Discarded ", sum(hto_counts_col$HtoCounts == 0), " cells with 0 HTO counts.")
}
# Check for HTOs how many cells they have at least 1 count
hto_counts_row$CellsWithAtLeast1HtoCount = sapply(1:nrow(hto_counts), function(x) sum(hto_counts[x,] > 0))
knitr::kable(hto_counts_row, align="l", caption="HTO counts per HTO") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
knitr::kable(hto_counts_col, align="l", caption="HTO counts per cell (summary)") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
```
# Demultiplexing with HTOs
This section of the report shows how cells are assigned to their sample-of-origin.
## Normalisation of HTO counts
We start the analysis by normalising raw HTO counts. If `LogNormalize` was chosen for normalisation, HTO counts for each cell are divided by the total counts for that cell, multiplied by 10,000 and then natural-log transformed. If `CLR` was chosen for normalisation, HTO counts for each cell are divided by the geometric mean of the counts for that cell and then natural-log transformed.
```{r hto_normalisation}
# HTO
sc = suppressWarnings(Seurat::NormalizeData(sc, assay="HTO", normalization.method=params$norm, verbose=FALSE))
```
## Classification of cells based on normalised HTO data
We assign cells to their sample-of-origin, annotate negative cells that cannot be assigned to any sample, and doublet cells that are assigned to two samples.
```{r hto_demux, results="asis"}
# Demultiplex HTOs
sc = Seurat::HTODemux(sc, assay="HTO", positive.quantile=0.95, verbose=FALSE)
# Sort Idents levels for nicer plotting
Seurat::Idents(sc) = factor(Seurat::Idents(sc), levels=hto_levels)
sc$hash.ID = factor(sc$hash.ID, levels=hto_levels)
sc$HTO_classification.global = factor(sc$HTO_classification.global, levels=c("Negative", "Doublet", "Singlet"))
# HTO classification results
hash_ID_table = sc[["hash.ID"]] %>%
dplyr::count(hash.ID) %>%
dplyr::rename(HTO=hash.ID) %>%
dplyr::mutate(Perc=round(n/sum(n)*100,2)) %>%
as.data.frame
p1 = ggplot(sc[["HTO_classification.global"]] %>% dplyr::count(HTO_classification.global),
aes(x="", y=n, fill=HTO_classification.global)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0, direction=-1) +
AddStyle(title="HTO global classification results",
fill=hto_colours$col_hto_global,
xlab="", ylab="")
p2 = ggplot(sc[["hash.ID"]] %>% dplyr::count(hash.ID),
aes(x="", y=n, fill=hash.ID)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0, direction=-1) +
AddStyle(title="HTO classification results",
fill=hto_colours$col_hto_collapsed,
xlab="", ylab="")
p = p1 + p2 +
patchwork::plot_annotation(title="HTO classification results")
p
knitr::kable(hash_ID_table, align="l", caption="HTO classification results") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
```
# Visualisation of HTO data
This section of the report visualises raw and normalised HTO data to understand whether the demultiplexing step has worked well.
```{r hto_plot_counts_raw, dpi=96}
# Distribution of HTO counts before and after normalisation
hto_t_raw = Seurat::GetAssayData(sc, assay="HTO", slot="counts") %>%
as.data.frame %>% t %>% as.data.frame
hto_t_raw_pseudo = hto_t_raw + 1
hto_t_norm = Seurat::GetAssayData(sc, assay="HTO", slot="data") %>%
as.data.frame %>% t %>% as.data.frame
p1 = ggplot(hto_t_raw_pseudo %>% tidyr::pivot_longer(tidyr::everything()),
aes(x=name, y=value, fill=name, group=name)) +
geom_violin() +
scale_y_continuous(trans="log2") +
AddStyle(title="HTO raw counts",
fill=hto_colours$col_hto_collapsed,
legend_position="none",
xlab="", ylab="") +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
p2 = ggplot(hto_t_norm %>% tidyr::pivot_longer(tidyr::everything()),
aes(x=name, y=value, fill=name, group=name)) +
geom_violin() +
AddStyle(title="HTO normalised counts",
fill=hto_colours$col_hto_collapsed,
legend_position="none",
xlab="", ylab="") +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
p = p1 + p2
p = p + patchwork::plot_annotation("HTO counts before and after normalisation")
p
```
Pairs of raw (top) and normalised (bottom) HTO counts are visualised to confirm mutal exclusivity in singlet cells. Data points correspond to measured HTO counts per HTO, colours correspond to the assigned samples-of-origin.
```{r hto_plot_counts_norm, fig.height=10}
n = DfAllColumnCombinations(x=hto_t_raw_pseudo, cell_classification=sc$hash.ID)
# Plot
p = ggplot(n, aes(x=value1, y=value2, color=cell_classification)) +
geom_point() +
scale_x_continuous(trans="log2") + scale_y_continuous(trans="log2") +
AddStyle(col=hto_colours$col_hto_collapsed)
p = p + facet_grid(name2~name1, drop=FALSE) +
theme(axis.title.x=element_blank(), strip.text.x=element_text(size=10, color="black"),
axis.title.y=element_blank(), strip.text.y=element_text(size=10, color="black"),
strip.background = element_rect(colour="white", fill="lightgrey"),
legend.position="bottom",
axis.text.x=element_text(angle=45, hjust=1, vjust=0.5)) +
patchwork::plot_annotation("Raw HTO counts")
p
```
```{r hto_plot_norm_scatter, fig.height=10}
n = DfAllColumnCombinations(x=hto_t_norm, cell_classification=sc$hash.ID)
n = n[n$value1 > 0 & n$value2 > 0, ]
p = ggplot(n, aes(x=value1, y=value2, color=cell_classification)) +
geom_point() +
scale_x_continuous(trans="log2") + scale_y_continuous(trans="log2") +
AddStyle(col=hto_colours$col_hto_collapsed)
p = p + facet_grid(name2~name1, drop=FALSE) +
theme(axis.title.x=element_blank(), strip.text.x=element_text(size=10, color="black"),
axis.title.y=element_blank(), strip.text.y=element_text(size=10, color="black"),
strip.background = element_rect(colour="white", fill="lightgrey"),
legend.position="bottom") +
patchwork::plot_annotation("Normalised HTO data")
suppressMessages(p)
```
The following ridge plots visualise the enrichment of assigned sample-of-origin for the respective normalised HTO counts.
```{r hto_plot_norm, fig.height=10, message=FALSE}
# Group cells based on HTO classification
p_list = RidgePlot(sc, assay="HTO", features=rownames(Seurat::GetAssay(sc, assay="HTO")),
same.y.lims=TRUE, cols=hto_colours$col_hto_collapsed, combine=FALSE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(legend_position="none")
p = patchwork::wrap_plots(p_list, ncol = 2) +
patchwork::plot_annotation("Normalised HTO data")
suppressMessages(p)
```
Lastly, we compare the counts and number of features between classified cells.
```{r hto_plot_features}
# Number of counts and features in the different cells
nfeature_metrics = grep("_HTO",
grep("nFeature_", colnames(sc[[]]), v=TRUE),
v=TRUE,
invert=TRUE)
ncounts_metrics = grep("_HTO",
grep("nCount_", colnames(sc[[]]), v=TRUE),
v=TRUE,
invert=TRUE)
p1 = VlnPlot(sc, features=ncounts_metrics, idents=levels(Seurat::Idents(sc)), pt.size=0) +
geom_violin(color=NA) +
AddStyle(title="Counts (RNA)",
fill=hto_colours$col_hto_collapsed,
legend_position="none",
xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
p2 = VlnPlot(sc, features=nfeature_metrics, idents=levels(Seurat::Idents(sc)), pt.size=0) +
geom_violin(color=NA) +
AddStyle(title="Features (RNA)",
fill=hto_colours$col_hto_collapsed,
legend_position="none",
xlab="") +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
p = patchwork::wrap_plots(list(p1, p2), ncol=2)
p = p + patchwork::plot_annotation("Number of counts and features for HTO-classified cells")
p
```
# Keeping singlets
This section of the report states the number of cells that remain after negative and doublet cells are removed.
```{r hto_subset}
sc.all = sc
sc = subset(sc, idents=c("Negative", "Doublet"), invert=TRUE)
sc
```
# Preliminary visualisation of RNA data
This section of the report provides first insights into your RNA dataset based on a preliminary pre-processing of the RNA data using the standard scRNA-seq workflow.
```{r prelim_analysis, results="hide"}
# Normalise, scale, pca umap and clustering for sc
sc = suppressWarnings(Seurat::NormalizeData(sc, assay="RNA", normalization.method = "LogNormalize", scale.factor=10000, verbose=FALSE))
sc = Seurat::FindVariableFeatures(sc, assay="RNA", selection.method="vst", verbose=FALSE)
sc = Seurat::ScaleData(sc, assay="RNA", features=Seurat::VariableFeatures(sc, assay="RNA"), verbose=FALSE)
sc = Seurat::RunPCA(sc, assay="RNA", features=VariableFeatures(object=sc, assay="RNA"), verbose=FALSE, npcs=min(50, ncol(sc)))
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:10, verbose=FALSE, umap.method="uwot"))
sc = Seurat::FindNeighbors(sc, dims=1:10, verbose=FALSE)
sc = Seurat::FindClusters(sc, algorithm=1, verbose=FALSE, method="igraph")
Idents(sc) = "hash.ID"
# Normalise, scale, pca umap and clustering for sc.all
sc.all = suppressWarnings(Seurat::NormalizeData(sc.all, assay="RNA", normalization.method = "LogNormalize", scale.factor=10000, verbose=FALSE))
sc.all = Seurat::FindVariableFeatures(sc.all, assay="RNA", selection.method="vst", verbose=FALSE)
sc.all = Seurat::ScaleData(sc.all, assay="RNA", features=Seurat::VariableFeatures(sc.all, assay="RNA"), verbose=FALSE)
sc.all = Seurat::RunPCA(sc.all, assay="RNA", features=VariableFeatures(object=sc.all, assay="RNA"), verbose=FALSE, npcs=min(50, ncol(sc.all)))
sc.all = suppressWarnings(Seurat::RunUMAP(sc.all, dims=1:10, verbose=FALSE, umap.method="uwot"))
sc.all = Seurat::FindNeighbors(sc.all, dims=1:10, verbose=FALSE)
sc.all = Seurat::FindClusters(sc.all, algorithm=1, verbose=FALSE, method="igraph")
Idents(sc.all) = "hash.ID"
# Set figure height
sc_fig_height = round((length(hto_samples))*1.25)
sc_qc_fig_height = round((length(levels(sc$seurat_clusters)))*0.75)
sc_all_fig_height = round((length(hto_samples)+2)*1.25)
sc_all_qc_fig_height = round((length(levels(sc.all$seurat_clusters)))*0.75)
# Set cluster colors
cluster_names = levels(sc$seurat_clusters)
sc_col_clusters = GenerateColours(num_colours=length(cluster_names), palette="ggsci::pal_d3", palette_options = list(palette="category20"))
names(sc_col_clusters) = cluster_names
cluster_names = levels(sc.all$seurat_clusters)
sc_all_col_clusters = GenerateColours(num_colours=length(cluster_names), palette="ggsci::pal_d3", palette_options = list(palette="category20"))
names(sc_all_col_clusters) = cluster_names
```
## Visualisation with UMAP {.tabset}
We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the assigned sample-of-origin.
Take care not to mis-read a UMAP:
* Parameters influence the plot (we use defaults here)
* Cluster sizes relative to each other mean nothing, since the method has a local notion of distance
* Distances between clusters might not mean anything
* You may need more than one plot
For a nice read to intuitively understand UMAP, see https://pair-code.github.io/understanding-umap/.
### Before filtering
```{r umap_before}
p = Seurat::DimPlot(sc.all, reduction="umap", group.by="hash.ID", cols=hto_colours$col_hto_collapsed) +
AddStyle(title="UMAP, cells coloured by HTO classification, including doublets and negatives",
legend_title="Cell classification",
legend_position="bottom")
p
```
* Note: HTO colors are the same before and after filtering
### After filtering
```{r umap_after}
# Plot UMAP after HTO filtering
p = Seurat::DimPlot(sc, reduction="umap", group.by="hash.ID", cols=hto_colours$col_hto_collapsed) +
AddStyle(title="UMAP, cells coloured by HTO classification, singlets only",
legend_title="Cell classification",
legend_position="bottom")
p
```
* Note: HTO colors are the same before and after filtering
### Before filtering (each hto)
```{r umap_cluster_before, fig.height=sc_all_fig_height}
p = Seurat::DimPlot(sc.all, reduction="umap", group.by="seurat_clusters", split.by="hash.ID", ncol=2, cols=sc_all_col_clusters) +
AddStyle(title="UMAP for each HTO, including doublets and negatives",
legend_title="Cluster",
legend_position="right")
p
```
* Important: Cluster colors are not neccessarily the same before and after filtering!
### After filtering (each hto)
```{r umap_cluster_after, fig.height=sc_fig_height}
p = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", split.by="hash.ID", ncol=2, cols=sc_col_clusters) +
AddStyle(title="UMAP for each HTO, singlets only",
legend_title="Cluster",
legend_position="right")
p
```
* Important: Cluster colors are not neccessarily the same before and after filtering!
## Distribution of cells in HTO and in clusters
Next, we examine how the HTOs are distributed in the individual clusters and how the clusters are distributed in each individual HTO. We include negatives and doublets as they often group into specific clusters.
```{r hto_cluster distribution}
p = ggplot(sc.all[[c("seurat_clusters","hash.ID")]], aes(x=seurat_clusters, fill=hash.ID)) +
geom_bar(position="fill", colour=NA) +
scale_x_discrete("Cluster") +
scale_y_continuous("Fraction of cells") +
scale_fill_manual(values=hto_colours$col_hto_collapsed) +
AddStyle(title="HTO cells in cluster",
legend_title="Cell classification",
legend_position="right")
p
p = ggplot(sc.all[[c("seurat_clusters","hash.ID")]], aes(x=hash.ID, fill=seurat_clusters)) +
geom_bar(position="fill", colour=NA) +
scale_x_discrete("Cluster") +
scale_y_continuous("Fraction of cells") +
scale_fill_manual(values=sc_all_col_clusters) +
AddStyle(title="Clusters in individual HTO",
legend_title="Cluster",
legend_position="right") +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
p
```
Here it also helps to inspect RNA counts and features per cluster and hto.
```{r hto_cluster_counts, fig.height=sc_all_qc_fig_height}
ncounts_metrics = grep("_HTO",
grep("nCount_", colnames(sc.all[[]]), v=TRUE),
v=TRUE,
invert=TRUE)
p = ggplot(sc.all[[c("hash.ID", "seurat_clusters", ncounts_metrics[1])]], aes_string(x="hash.ID", y=ncounts_metrics[1], fill="hash.ID", group="hash.ID")) +
geom_violin() +
xlim(levels(sc.all$hash.ID)) +
scale_y_continuous("Counts (RNA)") +
facet_wrap(~seurat_clusters, ncol=3, scales="free_y") +
#scale_fill_manual(values=hto_colours$col_hto_collapsed) +
AddStyle(fill=hto_colours$col_hto_collapsed,
legend_title="Classified cells",
legend_position="bottom",
xlab="") +
theme(axis.text.x = element_text(angle=45, hjust=1), legend.position="none")
p = p + patchwork::plot_annotation("Number of counts for each cluster and HTO")
p
```
```{r hto_cluster_features, fig.height=sc_all_qc_fig_height}
nfeature_metrics = grep("_HTO",
grep("nFeature_", colnames(sc.all[[]]), v=TRUE),
v=TRUE,
invert=TRUE)
p = ggplot(sc.all[[c("hash.ID", "seurat_clusters", nfeature_metrics[1])]], aes_string(x="hash.ID", y=nfeature_metrics[1], fill="hash.ID", group="hash.ID")) +
geom_violin() +
xlim(levels(sc.all$hash.ID)) +
facet_wrap(~seurat_clusters, ncol=3, scales="free_y") +
scale_y_continuous("Features (RNA)") +
#scale_fill_manual(values=hto_colours$col_hto_collapsed) +
AddStyle(fill=hto_colours$col_hto_collapsed,
legend_title="Classified cells",
legend_position="bottom",
xlab="") +
theme(axis.text.x = element_text(angle=45, hjust=1), legend.position="none")
p = p + patchwork::plot_annotation("Number of features for each cluster and HTO")
p
```
# Write out demultiplexed data
Finally, demultiplexed RNA data are written back to file. Results are included as metadata which may help for further analysis (e.g. filtering).
```{r hto_save_samples}
# Save each sample in a separate directory
Idents(sc) = "hash.ID"
sc$HTO_hash_ID = sc$hash.ID
samples = levels(Seurat::Idents(sc))
demux_samples_paths = c()
for (s in samples) {
p = ExportSeuratAssayData(sc[,Seurat::Idents(sc)==s],
dir=file.path(params$path_out, s),
assays=original_assay_names,
slot="counts",
include_cell_metadata_cols=c("HTO_classification",
"HTO_hash_ID"))
demux_samples_paths = c(demux_samples_paths, p)
}
message("Demultiplexed datasets are: ", paste(demux_samples_paths, collapse=", "))
# Export cell classification for Loupe
demux_samples_paths = c()
sc.all$HTO_hash_ID = sc.all$hash.ID
loupe_meta = as.data.frame([email protected])
loupe_meta = loupe_meta[,c("HTO_classification", "HTO_hash_ID")]
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
loupe_meta = cbind(Barcode=rownames(loupe_meta), loupe_meta[, idx_keep])
p = file.path(params$path_out, "cell_classification_for_loupe.csv")
write.table(x=loupe_meta, file=p, col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
demux_samples_paths = c(demux_samples_paths, p)
message(paste("Classification file for Loupe is:", demux_samples_paths))
```
# Credits
This Pre-Workflow was developed as part of the [scrnaseq](https://github.com/ktrns/scrnaseq) GitHub repository by Katrin Sameith and Andreas Petzold at the [Dresden-concept Genome Center, TU Dresden](https://genomecenter.tu-dresden.de/) (Dresden, Germany), in collaboration with Torsten Glomb and Oliver Dittrich at the [Research Core Unit Genomics, Hannover Medical School](https://www.mhh.de/genomics) (Hannover, Germany). The Seurat Vignettes were initially used as templates.
# Parameter table
The following parameters were used to run the workflow.
```{r parameters_table}
out = ScrnaseqParamsInfo(params=params)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")
```
# Software versions
This report was generated using the [scrnaseq](https://github.com/ktrns/scrnaseq) GitHub repository. Software versions were collected at run time.
```{r versions}
out = ScrnaseqSessionInfo(params$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")
```