-
Notifications
You must be signed in to change notification settings - Fork 0
/
advanced_settings.R
224 lines (181 loc) · 10.8 KB
/
advanced_settings.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
### Advanced settings
################################################################################
# Assign a value other than NULL only to the settings you want to change;
# leave everything else as NULL.
### Output directory
# NOTE(review): this setting is written to 'param', whereas the other active
# settings shown here are written to 'param_advset' - confirm this is intended.
param$path_out = NULL
### Dataset settings
# Assay (data type) to use: "RNA" or "Spatial"
param_advset$assay_raw = NULL
# Downsample data to at most n cells per sample AFTER filtering (mainly for tests)
# NULL to deactivate
param_advset$downsample_cells_n = NULL
# Downsample all samples equally, according to the smallest sample
# TRUE/FALSE (default: FALSE)
# Overwritten by downsample_cells_n, i.e. downsample_cells_n takes precedence
param_advset$downsample_cells_equally = NULL
### Filter
# Filter for cells
# NOTE(review): each entry looks like c(lower, upper) with NA meaning "no bound" - confirm.
# Alternative (commented out): no upper nCount_RNA bound, percent_mt up to 20
#param_advset$cell_filter = list(nFeature_RNA=c(20, NA), nCount_RNA=c(200, NA), percent_mt=c(0, 20))
# Active override (not NULL): nCount_RNA capped at 20000, percent_mt up to 18
param_advset$cell_filter = list(nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18))
# Filter for features
# Example (commented out):
#param_advset$feature_filter = list(min_counts=1, min_cells=5)
param_advset$feature_filter = NULL
# Samples to drop
# Cells from these samples will be dropped after initial QC
# Example: samples_to_drop = c("<name of dataset>_<names of subsample>")
param_advset$samples_to_drop = NULL
# Drop samples with too few cells
# NOTE(review): presumably the minimum number of cells a sample must have to be kept - confirm.
param_advset$samples_min_cells = NULL
### Normalization
# Which normalization should be used for analysis? ("RNA", "SCT")
param_advset$norm=NULL
# Whether or not to remove cell cycle effects
param_advset$cc_remove = NULL
# Should all cell cycle effects be removed, or only the difference between proliferating cells (G2M and S phase)?
# See https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html for an explanation
param_advset$cc_remove_all = NULL
# Whether or not to re-score cell cycle effects after data from different samples have been merged/integrated
param_advset$cc_rescore_after_merge = NULL
# Additional (unwanted) variables that will be regressed out for visualisation and clustering ("nCount_RNA", "percent_mt")
param_advset$vars_to_regress = NULL
# How to combine multiple datasets (method = "merge", "integrate" or "streamlined_integrate")
# "merge" (default): Concatenate data, e.g. when samples were multiplexed on the same chip.
# "integrate": Anchors are computed for all pairs of datasets. This gives all datasets the same weight during dataset integration but can be computationally intensive.
# "streamlined_integrate": Use the streamlined integration workflow in v5 (faster), i.e. perform the correction in low-dimensional space (IntegrateLayers function) rather than on gene expression levels.
# The steps of the integration workflow are basically the same for "integrate" and "streamlined_integrate" (https://github.com/satijalab/seurat/issues/8653)
# Additional options for the "integrate" method:
# - integration_function: "CCAIntegration" or "RPCAIntegration"
# - dimensions: Number of dimensions to consider for integration
# - reference: Use one or more datasets (separated by commas) as reference and compute anchors for all other datasets. Computationally faster but less accurate.
# - use_reciprocal_pca: Compute anchors in PCA space. Even faster but less accurate. Recommended for big datasets.
# - k_filter: How many neighbors to use when filtering anchors (default: min(200, minimum number of cells in a sample))
# - k_weight: Number of neighbors to consider when weighting anchors (default: min(100, minimum number of cells in a sample))
# - k_anchor: How many neighbors to use when picking anchors (default: min(10, minimum number of cells in a sample))
# - k_score: How many neighbors to use when scoring anchors (default: min(30, minimum number of cells in a sample))
# Examples (commented out; note that these lines write to 'param', not 'param_advset'):
#param$integrate_samples[["method"]]="integrate"
#param$integrate_samples[["integration_function"]]="RPCAIntegration"
# TO SET param$integrate_samples[["method"]]="streamlined_integrate"
# Similarity between samples ("homogene" or "heterogene")
# "heterogene" (default): If samples are biologically heterogeneous or under different treatments.
# "homogene": If samples (with roughly the same cell type composition) are technically noisy (i.e. have a batch effect) with only simple shifts in mean expression.
param_advset$experimental_groups = NULL
### Dimensional reduction
# NOTE(review): pc_n is undocumented here; presumably the number of principal components to use - confirm.
param_advset$pc_n = NULL
# cluster_k: k nearest neighbors to find clusters
# umap_k: k nearest neighbors to construct the UMAP
# Scanpy uses 15 for both by default
# Seurat uses 20 for cluster_k, and 30 for umap_k by default
param_advset$cluster_k = NULL
param_advset$umap_k = NULL
# Cluster resolutions to compute; multiple values possible (comma separated); empty vector if not needed
param_advset$cluster_resolution_test = NULL
# Cluster resolution to use for analysis (default: 0.6)
param_advset$cluster_resolution = NULL
### Set reference
# NOTE(review): file_annot and file_cc_genes are undocumented here; presumably paths to the
# gene annotation file and the cell cycle gene file - confirm.
param_advset$file_annot = NULL
param_advset$file_cc_genes = NULL
# Ensembl annotation version
# Default is Ensembl release 98, which corresponds to the 2020-A reference package of 10x Genomics Cell Ranger
# Ensembl release 110 corresponds to the 2024-A reference package of 10x Genomics Cell Ranger
# ATTENTION: Translating cc genes between human and mouse does not work; error in getLDS() since version 105 (https://github.com/grimbough/biomaRt/issues/66)
# This means versions > 105 do not work for mouse
param_advset$annot_version=NULL
# Example (commented out; note that it writes to 'param', not 'param_advset'):
#param$annot_main=c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession")
param_advset$annot_main=NULL
# Example (commented out):
#param_advset$mart_attributes=c(c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession"),
# c("chromosome_name", "start_position", "end_position", "percentage_gene_gc_content", "gene_biotype", "strand", "description"))
param_advset$mart_attributes=NULL
# NOTE(review): biomart_mirror is undocumented here; presumably the biomaRt mirror to query - confirm.
param_advset$biomart_mirror=NULL
# List of marker genes
param_advset$file_known_markers = NULL
### Marker genes and differential expression testing
# Thresholds to define marker genes
param_advset$marker_padj = NULL
param_advset$marker_log2FC = NULL
param_advset$marker_pct = NULL
# Additional (unwanted) variables to account for in statistical tests
param_advset$latent_vars = NULL
# Contrasts to find differentially expressed genes (R data.frame or Excel file)
# Required columns:
#   condition_column: Categorical column in the cell metadata; specify "orig.ident" for sample and "seurat_clusters" for cluster
#   condition_group1: Condition levels in group 1, multiple levels concatenated by the plus character
#                     Empty string = all levels not in group2 (cannot be used if group2 is empty)
#   condition_group2: Condition levels in group 2, multiple levels concatenated by the plus character
#                     Empty string = all levels not in group1 (cannot be used if group1 is empty)
#
# Optional columns:
#   subset_column: Categorical column in the cell metadata to subset before testing (default: NA)
#                  Specify "orig.ident" for sample and "seurat_clusters" for cluster
#   subset_group: Further subset levels (default: NA)
#                 For the individual analysis of multiple levels, separate them by semicolons
#                 For the joint analysis of multiple levels, concatenate them by the plus character
#                 For the individual analysis of all levels, use the empty string ""
#   assay: Seurat assay to test on; can also be a Seurat dimensionality reduction (default: "RNA")
#   slot: In case assay is a Seurat assay object, which slot to use (default: "data")
#   padj: Maximum adjusted p-value (default: 0.05)
#   log2FC: Minimum absolute log2 fold change (default: 0)
#   min_pct: Minimum percentage of cells expressing a gene to test (default: 0.1)
#   test: Type of test; "wilcox", "bimod", "roc", "t", "negbinom", "poisson", "LR", "MAST", "DESeq2"; (default: "wilcox")
#   downsample_cells_n: Downsample each group to at most n cells to speed up tests (default: NA)
#   latent_vars: Additional variables to account for; multiple variables need to be concatenated by semicolons; overrides the default given by param$latent_vars (default: none).
# Example with three contrasts (commented out):
#param_advset$deg_contrasts = data.frame(condition_column=c("orig.ident", "orig.ident", "Phase"),
# condition_group1=c("pbmc_10x", "pbmc_10x", "G1"),
# condition_group2=c("pbmc_smartseq2_sample1", "pbmc_smartseq2_sample1", "G2M"),
# subset_column=c(NA, "seurat_clusters", "seurat_clusters"),
# subset_group=c(NA, "", "1;2"),
# downsample_cells_n=c(NA, 50, 30))
# Example with a single contrast (commented out):
#param_advset$deg_contrasts = data.frame(condition_column=c("orig.ident"),
# condition_group1=c("sample1"),
# condition_group2=c("sample2"),
# subset_column=c("seurat_clusters"),
# subset_group=c(""),
# downsample_cells_n=c(50))
param_advset$deg_contrasts = NULL
# Enrichr site ("Enrichr", "FlyEnrichr", "WormEnrichr", "YeastEnrichr", "FishEnrichr")
param_advset$enrichr_site = NULL
# P-value threshold for functional enrichment tests
param_advset$enrichr_padj = NULL
# Enrichr libraries
param_advset$enrichr_dbs = NULL
# Cell type annotation database
param_advset$annotation_dbs = NULL
### Set colors
# See https://r-charts.com/color-palettes/ and https://nanx.me/ggsci/articles/ggsci.html for palette characteristics
# Colour palette used for samples
param_advset$col_palette_samples = NULL
# Colour palette used for clusters
param_advset$col_palette_clusters = NULL
# Colour palette used for annotated cell types
param_advset$col_palette_annotation = NULL
# Defined colours for samples
# NOTE(review): the *_ref variants below are undocumented here; presumably colours for a reference dataset - confirm.
param_advset$col_samples = NULL
param_advset$col_samples_ref = NULL
# Defined colours for seurat_clusters
param_advset$col_clusters = NULL
param_advset$col_clusters_ref = NULL
# Defined colours for annotated cell types
param_advset$col_annotation = NULL
param_advset$col_annotation_ref = NULL
# Feature plot colour - highlights
param_advset$col = NULL
# Feature plot colour - background
param_advset$col_bg = NULL
# Dot size for UMAP/t-SNE plots
param_advset$pt_size = NULL
### ccc analysis
# Available methods: "connectome", "logfc", "natmi", "sca", "cellphonedb", "cytotalk", "call_squidpy", "call_cellchat", "call_connectome", "call_sca", "call_italk", "call_natmi"
# Active override (not NULL): three of the available methods are selected
param_advset$liana_methods = c("logfc", "natmi", "cellphonedb")
# Threshold for the liana aggregate rank (default: 0.01)
param_advset$liana_agg_rank_threshold = NULL
### For dataset mapping
# Pre-annotated cell types; column in the reference dataset
param_advset$celltype = NULL # at the moment "annotation" required
# Reduction to use, 'umap' or 'tsne'; must exist in the reference dataset (default: 'umap')
param_advset$reduction = NULL
# Predicted score threshold (default: 0.9)
param_advset$predicted_score_threshold = NULL
# Minimum fraction of cells with the respective cell identity (default: 0.1)
param_advset$percent_predicted_cells_threshold = NULL
### For generation of clustifyr reference
# Path to the clustifyr reference data
param_advset$ref_data_path = NULL