-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoptim_snp_data.R
434 lines (385 loc) · 15.3 KB
/
optim_snp_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
##########
# Author: B. Anderson
# Date: Nov-Dec 2023
# Description: evaluate multiple SNP datasets to optimise assembly parameters
# Note: the minimum arguments are multiple VCF files (one per parameter combination)
# It is helpful to also indicate the association between the files and the parameter values (-p),
# specify which sample IDs are replicate pairs (-r), a samples and populations file to limit which
# to analyse (-s), as well as a custom stats file from ipyrad (if you used that) for autosomal output (-i)
# Note2: this script relies on genlight objects, which only uses biallelic SNPs (may underestimate heterozygosity)
##########
# load libraries
suppressMessages(library(adegenet))
suppressMessages(library(vcfR))
##########
# define functions
## a help function for when the script is called incorrectly or without arguments
help <- function(help_message) {
if (missing(help_message)) {
cat("A script to evaluate SNP datasets called under different parameter settings\n\n")
cat("Usage: Rscript optim_snp_data.R <optional args> VCF_file1 VCF_file2 ...\n")
cat("Arguments:\n")
cat("\t...\tPaths to VCF files in the order for plotting/comparing (e.g. sequentially increasing)\n")
cat("\t-i\tA custom-formatted ipyrad stats file with a header and columns containing specific values:\n")
cat("\t\tcolumn1 = sample ID; column2 = clustering threshold; column3 = autosomal heterozygosity\n")
cat("\t-o\tOutput prefix [default: \"out\"]\n")
cat("\t-p\tA text file with parameter values for each input VCF in the order provided,",
"one per line (e.g. \"85\" or \"M2\")\n")
cat("\t\tIf not provided, the parameter values will be displayed arbitrarily\n")
cat("\t-r\tA text file with sample IDs of replicate pairs (tab separated, one per line)\n")
cat("\t-s\tA text file with sample IDs and populations (tab separated, one per line)\n")
cat("\t\tIf a samples file is provided, only those specified samples will be analysed\n")
} else {
cat(help_message)
}
}
## a function to compute the number of loci shared by >=80% of samples
## -- based on the metric used in Paris et al. (2017) --
## returns a number
paris <- function(genl) {
mat <- as.matrix(genl)
keep_snps <- colSums(! is.na(mat)) / nrow(mat) >= 0.80
sub_genl <- genl[, keep_snps]
length(unique(sub_genl@chromosome))
}
## a function to run error rate assessments
## -- based on metrics used in Mastretta-Yanes et al. (2015) --
## arguments are a genlight and a table with the replicate pair sampleIDs, one pair per row (2 columns)
## returns a matrix with columns corresponding to locus, allele and snp error rates
mastretta_error <- function(genl, reps) {
errors <- matrix(nrow = nrow(reps), ncol = 3)
cat(paste0("Comparing replicate pair (of ", nrow(reps), "): "))
for (pair_row in seq_len(nrow(reps))) {
cat(paste0(pair_row, " "))
### subset the genl for that pair and make a matrix
pair_genl <- genl[match(c(reps[pair_row, ]), [email protected])]
comp_mat <- as.matrix(pair_genl)
na1 <- is.na(comp_mat[1, ])
na2 <- is.na(comp_mat[2, ])
na_both <- na1 & na2
pair_genl <- pair_genl[, !na_both] # remove NA in both
comp_mat <- as.matrix(pair_genl)
na1 <- is.na(comp_mat[1, ])
na2 <- is.na(comp_mat[2, ])
### locus error: loci found in one rep but not the other, divided by total potential shared
loci1 <- unique(pair_genl[1, !na1]@chromosome)
loci2 <- unique(pair_genl[2, !na2]@chromosome)
diffs <- length(setdiff(loci1, loci2)) + length(setdiff(loci2, loci1))
in_both <- intersect(loci1, loci2)
locus_error <- 100 * diffs / (diffs + length(in_both))
### allele error: whether shared loci differ in sequence (any SNP, including no call in one)
pair_genl <- pair_genl[, pair_genl@chromosome %in% in_both]
comp_mat <- as.matrix(pair_genl)
comp_mat[is.na(comp_mat)] <- 3 # so NA can be compared
diff_pos <- comp_mat[1, ] != comp_mat[2, ]
allele_errors <- length(unique(pair_genl[, diff_pos]@chromosome)) # so that multiple in an allele aren't counted
allele_error <- 100 * allele_errors / (length(in_both))
### snp error: differences between any shared SNP divided by total shared
comp_mat <- comp_mat[, colSums(comp_mat == 3) == 0] # remove any missing
diffs <- sum(comp_mat[1, ] != comp_mat[2, ])
snp_error <- 100 * diffs / ncol(comp_mat)
### capture the values
errors[pair_row, ] <- c(locus_error, allele_error, snp_error)
}
cat("\n")
errors
}
## a function to calculate within-population distances
## -- based on the suggestion in Mastretta-Yanes et al. (2015) --
## returns a vector of average distances
pop_dist <- function(vcfr, sampleids, pops) {
### load libraries
suppressMessages(library(ape))
suppressMessages(library(pofadinr))
### calculate distances for the VCF samples
dnabin <- suppressMessages(vcfR2DNAbin(vcfr, consensus = TRUE, extract.haps = FALSE))
temp <- as.character(dnabin)
temp[temp == "n"] <- "?" # replace missing with a character recognised by pofadinr
dnabin <- as.DNAbin(temp)
distances <- dist.snp(dnabin, model = "GENPOFAD")
dist_mat <- as.matrix(distances)
### extract distances between individuals of the same population
pop_names <- unique(pops)
dists <- vector()
index <- 1
for (pop in pop_names) {
samples <- sampleids[pops == pop]
if (length(samples) < 2) { # can't compute
next
}
sub_mat <- dist_mat[rownames(dist_mat) %in% samples, colnames(dist_mat) %in% samples]
sub_mat <- sub_mat[lower.tri(sub_mat)] # keep lower triangle
avg_dist <- mean(sub_mat)
dists[index] <- avg_dist
index <- index + 1
}
dists
}
## a function to calculate heterozygosity
## returns a dataframe with the observed heterozygosity for all samples
het_est <- function(genl) {
comp_mat <- as.matrix(genl)
as.data.frame(rowSums(comp_mat == 1, na.rm = TRUE) / (ncol(comp_mat) - rowSums(is.na(comp_mat))))
}
###########
# process the call
args <- commandArgs(trailingOnly = TRUE)
if (length(args) == 0) {
stop(help(), call. = FALSE)
} else {
catch_args <- vector("list")
extra <- 1
catch <- TRUE
ipyrad_present <- FALSE
ipyrad_file <- ""
out_pref <- "out"
params_present <- FALSE
params_file <- ""
reps_present <- FALSE
reps_file <- ""
samples_present <- FALSE
samples_file <- ""
for (index in seq_len(length(args))) {
if (args[index] == "-i") {
ipyrad_present <- TRUE
ipyrad_file <- args[index + 1]
catch <- FALSE
} else if (args[index] == "-o") {
out_pref <- args[index + 1]
catch <- FALSE
} else if (args[index] == "-p") {
params_present <- TRUE
params_file <- args[index + 1]
catch <- FALSE
} else if (args[index] == "-r") {
reps_present <- TRUE
reps_file <- args[index + 1]
catch <- FALSE
} else if (args[index] == "-s") {
samples_present <- TRUE
samples_file <- args[index + 1]
catch <- FALSE
} else {
if (catch) {
catch_args[extra] <- args[index]
extra <- extra + 1
} else {
catch <- TRUE
}
}
}
}
if (length(catch_args) < 2) {
stop(help("Missing VCF files!\n"), call. = FALSE)
}
# read in the input
if (ipyrad_present) {
het_stats <- read.table(ipyrad_file, sep = "\t", header = TRUE)
} else {
cat("No ipyrad file detected, so calculating heterozygosity only from SNPs\n")
}
if (params_present) {
params <- as.character(read.table(params_file, header = FALSE)[, 1])
} else {
params <- unlist(lapply(seq_len(length(catch_args)), function(x) paste0("P", x)))
}
if (reps_present) {
reps_table <- read.table(reps_file, sep = "\t", header = FALSE)
reps_table[] <- lapply(reps_table, as.character)
} else {
cat("No replicates designated, so skipping error rate calculations\n")
}
if (samples_present) {
samples_table <- read.table(samples_file, sep = "\t", header = FALSE)
sampleids <- as.character(samples_table[, 1])
pops <- as.character(samples_table[, 2]) # corresponding sequence to samples (not unique)
} else {
cat("No samples file detected, so skipping population metrics and using all samples\n")
}
##########
# execute the analyses
## initiate the matrices for holding the results
size_mat <- matrix(ncol = 3)
colnames(size_mat) <- c("param", "num_loci", "num_snps")
paris_mat <- matrix(ncol = 2)
colnames(paris_mat) <- c("param", "num_loci")
error_mat <- matrix(ncol = 4)
colnames(error_mat) <- c("param", "locus_err", "allele_err", "snp_err")
dists_pop <- matrix(ncol = 2)
colnames(dists_pop) <- c("param", "dist")
het_obs <- matrix(ncol = 3)
colnames(het_obs) <- c("sample", "param", "het")
## cycle through the VCF files
index <- 1
for (vcf_file in catch_args) {
cat(paste0("\nProcessing file ", as.character(index), " of ", as.character(length(catch_args)), "...\n"))
vcfr <- read.vcfR(vcf_file, verbose = FALSE)
genl <- suppressWarnings(vcfR2genlight(vcfr))
cat(paste0("Read in and converted the VCF to a genlight with ", nInd(genl), " samples, ",
length(unique(genl@chromosome)), " loci, and ",
nLoc(genl), " SNPs\n"))
## if there is a samples file, subset the genl
if (samples_present) {
keep_samples <- sampleids
keep_pops <- pops
genl <- genl[match(keep_samples, [email protected])]
cat(paste0("Subsetted the genlight to ", nInd(genl), " samples\n"))
}
## if there is a replicates file, calculate error rates
## and remove some samples afterward
if (reps_present) {
cat("Running error rate assessments using replicates\n")
# check that the rep pairs are present in the genlight
rep_mat <- matrix(ncol = 2)
for (pair_row in seq_len(nrow(reps_table))) {
sample1 <- reps_table[pair_row, 1]
sample2 <- reps_table[pair_row, 2]
if (all(sample1 %in% [email protected], sample2 %in% [email protected])) {
if (pair_row == 1) {
rep_mat[1, ] <- c(sample1, sample2)
} else {
rep_mat <- rbind(rep_mat, c(sample1, sample2))
}
} else {
c(paste0("No sample(s) found for replicate pair: ", sample1, " ", sample2, "\n"))
}
}
merror <- mastretta_error(genl, rep_mat)
out_mat <- as.matrix(cbind(rep(params[index], nrow(merror)),
merror))
if (index == 1) {
error_mat <- out_mat
} else {
error_mat <- rbind(error_mat, out_mat)
}
## remove one of the reps from the genlight
genl <- genl[-match(rep_mat[, 2], [email protected])]
cat(paste0("Subsetted the genlight to ", nInd(genl), " samples\n"))
## if there is a samples file, remove the reps (and corresponding pops) from those
if (samples_present) {
drop_indices <- keep_samples %in% rep_mat[, 2]
keep_samples <- keep_samples[!drop_indices]
keep_pops <- keep_pops[!drop_indices]
}
}
## summarise number of loci and SNPs
if (index == 1) {
size_mat[1, ] <- c(params[index], length(unique(genl@chromosome)), nLoc(genl))
} else {
size_mat <- rbind(size_mat, c(params[index], length(unique(genl@chromosome)), nLoc(genl)))
}
## calculate Paris
cat("Assessing how many loci are shared by at least 80% of samples\n")
paris_num <- paris(genl)
if (index == 1) {
paris_mat[1, ] <- c(params[index], paris_num)
} else {
paris_mat <- rbind(paris_mat, c(params[index], paris_num))
}
## calculate heterozygosity
cat("Calculating SNP heterozygosity\n")
het_df <- het_est(genl)
out_mat <- as.matrix(cbind(rownames(het_df),
rep(params[index], nrow(het_df)),
het_df[, 1]))
if (index == 1) {
het_obs <- out_mat
} else {
het_obs <- rbind(het_obs, out_mat)
}
## if there is a samples file, calculate within population distances
if (samples_present) {
suppressMessages(library(SNPfiltR))
cat("Calculating within population genetic (GENPOFAD) distances\n")
### make the vcfr smaller by removing samples
### note that vcfr objects have an extra row for format
### additionally, drop SNPs that are now monomorphic
indices <- match(keep_samples, [email protected])
indices <- c(0, indices)
indices <- indices + 1
vcfr <- vcfr[is.biallelic(vcfr), indices]
vcfr <- min_mac(vcfr, min.mac = 1)
dists <- pop_dist(vcfr, keep_samples, keep_pops)
out_mat <- as.matrix(cbind(rep(params[index], length(dists)),
dists))
if (index == 1) {
dists_pop <- out_mat
} else {
dists_pop <- rbind(dists_pop, out_mat)
}
}
## increment
index <- index + 1
}
##########
# save the results to text files
write.table(size_mat, file = paste0(out_pref, "_size.tab"), quote = FALSE, row.names = FALSE, sep = "\t")
write.table(paris_mat, file = paste0(out_pref, "_paris.tab"), quote = FALSE, row.names = FALSE, sep = "\t")
write.table(error_mat, file = paste0(out_pref, "_error.tab"), quote = FALSE, row.names = FALSE, sep = "\t")
write.table(het_obs, file = paste0(out_pref, "_snphet.tab"), quote = FALSE, row.names = FALSE, sep = "\t")
if (samples_present) {
write.table(dists_pop, file = paste0(out_pref, "_distspop.tab"), quote = FALSE, row.names = FALSE, sep = "\t")
}
##########
# plot
cat("\nPlotting...\n")
## start the pdf
pdf(paste0(out_pref, "_optim.pdf"), width = 10, height = 10)
## size
plot(NULL, xlim = c(1, nrow(size_mat)), ylim = c(0, max(as.numeric(size_mat[, 2])) * 1.05),
main = "Total number of loci recovered",
xaxt = "n", ylab = "Number of loci", xlab = "")
points(seq_len(nrow(size_mat)), as.numeric(size_mat[, 2]), pch = 16)
axis(1, at = seq_len(nrow(size_mat)), labels = size_mat[, 1])
plot(NULL, xlim = c(1, nrow(size_mat)), ylim = c(0, max(as.numeric(size_mat[, 3])) * 1.05),
main = "Total number of biallelic SNPs recovered",
xaxt = "n", ylab = "Number of biallelic SNPs", xlab = "")
points(seq_len(nrow(size_mat)), as.numeric(size_mat[, 3]), pch = 16)
axis(1, at = seq_len(nrow(size_mat)), labels = size_mat[, 1])
## Paris
plot(NULL, xlim = c(1, nrow(paris_mat)), ylim = c(0, max(as.numeric(paris_mat[, 2])) * 1.05),
main = "Total number of loci recovered in >= 80% of individuals",
xaxt = "n", ylab = "Number of loci", xlab = "")
points(seq_len(nrow(paris_mat)), as.numeric(paris_mat[, 2]), pch = 16)
axis(1, at = seq_len(nrow(paris_mat)), labels = paris_mat[, 1])
## heterozygosity
if (ipyrad_present) {
boxplot(as.numeric(het_stats[, 3]) * 100 ~ het_stats[, 2],
ylim = c(0, max(as.numeric(het_stats[, 3]) * 100) * 1.05),
main = "Autosomal heterozygosity\n(ambiguous bases in consensus sequences within samples)",
ylab = "Heterozygous sites (%)", xlab = "Clustering threshold (%)")
boxplot(as.numeric(het_obs[, 3]) * 100 ~ het_obs[, 2],
ylim = c(0, max(as.numeric(het_obs[, 3]) * 100) * 1.05),
main = "Observed SNP heterozygosity",
ylab = "Heterozygosity (%)", xlab = "")
} else {
boxplot(as.numeric(het_obs[, 3]) * 100 ~ het_obs[, 2],
ylim = c(0, max(as.numeric(het_obs[, 3]) * 100) * 1.05),
main = "Observed SNP heterozygosity",
ylab = "Heterozygosity (%)", xlab = "")
}
## error
if (reps_present) {
boxplot(as.numeric(error_mat[, 2]) ~ error_mat[, 1],
main = paste0("Locus error rates\n",
"(loci present in one rep but not the other, relative to total in either/both)"),
ylim = c(0, max(as.numeric(error_mat[, 2])) * 1.05),
ylab = "Locus error rate (%)", xlab = "")
boxplot(as.numeric(error_mat[, 3]) ~ error_mat[, 1],
main = "Allele error rates\n(loci present in both that differ in sequence, relative to total in both)",
ylim = c(0, max(as.numeric(error_mat[, 3])) * 1.05),
ylab = "Allele error rate (%)", xlab = "")
boxplot(as.numeric(error_mat[, 4]) ~ error_mat[, 1],
main = "SNP error rates\n(SNPs called in both that differ, relative to total called in both)",
ylim = c(0, max(as.numeric(error_mat[, 4])) * 1.05),
ylab = "SNP error rate (%)", xlab = "")
}
## within population distances
if (samples_present) {
boxplot(as.numeric(dists_pop[, 2]) ~ dists_pop[, 1],
main = "Within Population Genetic Distances",
ylim = c(0, max(as.numeric(dists_pop[, 2]), na.rm = TRUE) * 1.05),
ylab = "Average GENPOFAD distance", xlab = "")
}
## stop the pdf
invisible(dev.off)