-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMY_replicate_error.R
351 lines (292 loc) · 12.7 KB
/
MY_replicate_error.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
##########
# Authors: Rachel Binks and Ben Anderson, based on scripts from Mastretta-Yanes et al. 2015
# Date: June-July-Aug 2021
# Description: run Mastretta-Yanes et al. error estimation for technical replicates
##########
# Define any parameters that may need to be changed by users
min_present_rate <- 0.04 # the minimum proportion of samples a SNP must be in (for distances, not error)
# a helper function for when the script is called without arguments
help <- function(help_message) {
if (missing(help_message)) {
cat("A script to run Mastretta-Yanes et al. error estimation for technical",
"replicates and/or intrapopulation distances using a vcf file\n")
cat("Usage: Rscript MY_replicate_error.R -o output_pre -r reps_include -s sample_file -v vcf_file\n")
cat("Options:\n")
cat("\t-o\tThe output file name prefix [default \"output\"]\n")
cat("\t-r\tWhether the data includes technical replicates (yes [default] or no)\n")
cat("\t-s\tThe sample names file and pops, with the format:\n",
"\t\tvcf_label\tsample_name\tpopulation\n",
"\t\tNote: replicate names should have an \"_R\" after the same text for the other rep\n",
"\t\tNote: if sample names already match vcf, then you only need one column + pop\n")
cat("\t-v\tThe VCF file to be analysed\n")
cat("\t\tNote: only biallelic SNPs will be kept\n")
} else {
cat(help_message)
}
}
###########
# Read in and format the data
###########
# parse the command line
args <- commandArgs(trailingOnly = TRUE)
if (length(args) == 0) {
stop(help(), call. = FALSE)
} else {
catch_args <- vector("list")
i <- 1
outpre <- "output"
reps_present <- TRUE
samples_present <- FALSE
vcf_present <- FALSE
for (index in seq_len(length(args))) {
if (args[index] == "-o") {
outpre <- args[index + 1]
} else if (all(args[index] == "-r", args[index + 1] == "no")) {
reps_present <- FALSE
} else if (args[index] == "-s") {
samples_present <- TRUE
samples_file <- args[index + 1]
} else if (args[index] == "-v") {
vcf_present <- TRUE
vcf_file <- args[index + 1]
} else {
catch_args[i] <- args[index]
i <- i + 1
}
}
}
if (! samples_present) {
stop(help("Missing argument for sample/pops file!\n"), call. = FALSE)
}
if (! vcf_present) {
stop(help("Missing argument for vcf file!\n"), call. = FALSE)
}
# load required libraries
suppressMessages(library(vcfR))
suppressMessages(library(adegenet))
# read in the input files
sample_table <- read.table(samples_file, sep = "\t", header = FALSE)
myvcf <- read.vcfR(vcf_file, verbose = FALSE)
genl <- suppressWarnings(vcfR2genlight(myvcf)) # convert to genlight object
#genl # print a summary of the object
# assign correct sample names and population labels to the genlight object
# and filter the genlight to only retain samples present in the sample table
# format1 = id sample_name pop (if vcf ids need to be renamed)
# format2 = id pop (if vcf ids = sample names already)
len_original <- length([email protected])
original_names <- [email protected]
if (ncol(sample_table) != 2) {
[email protected] <- sample_table$V2[match([email protected], sample_table$V1)]
populations <- sample_table$V3[match([email protected], sample_table$V2)]
} else {
[email protected] <- sample_table$V1[match([email protected], sample_table$V1)]
populations <- sample_table$V2[match([email protected], sample_table$V1)]
}
if (length([email protected][is.na([email protected])]) > 0) { # if there are any NAs (non-matches)
original_names <- original_names[!is.na([email protected])] # to make sure vcf data matches
populations <- populations[!is.na([email protected])]
genl <- genl[!is.na([email protected]), ]
}
len_new <- length([email protected])
cat("Retained", length([email protected]), "SNPs from the VCF file\n")
cat("Removed", len_original - len_new, "samples not present in the input sample table\n")
######## Locus, allele and SNP error rates
# Locus error rate: if a locus is in one replicate but not the other relative to total present
## NOTE: previously, and with MY, this is relative to the total number of loci in the dataset
## I think it might make more sense to be relative to the total possible comparisons between the two
# allele error rate: if the locus is in both but differs (by at least one SNP) relative to total present, and
# SNP error rate: if a SNP is found in both and differs (there may be multiple SNPs per locus) relative to total
# If we use VCF input, then we get locus error rate from looking at "chromosome" field
# If we only output one SNP per locus, allele error rate == SNP error rate
# determine names of replicate pairs if reps are present, then run error rate calculations
if (reps_present) {
cat("Running error estimation\n")
reps <- grep("_R", [email protected], value = TRUE)
nonreps <- grep("_R", [email protected], value = TRUE, invert = TRUE)
samps <- nonreps[pmatch(sub("_R.*", "", reps), nonreps)]
pairs <- cbind(reps, samps)
pairs <- pairs[rowSums(is.na(pairs)) < 1, ] # remove rows with missing pairs
# calculate error rates for each pair
error_df <- as.data.frame(matrix(nrow = nrow(pairs), ncol = 3))
colnames(error_df) <- c("Locus_error", "Allele_error", "SNP_error")
for (pair_row in seq_len(nrow(pairs))) {
cat("Processing replicate pair", pair_row, "of", nrow(pairs), "\n")
pair_genl <- genl[match(c(pairs[pair_row, ]), [email protected])] # subset genlight for the pair
# locus
# use the "chromosome" tag to determine if a locus is present
na_pos <- NA.posi(pair_genl) # NA positions in the genlight
na_rep <- na_pos[[1]] # NA positions in the rep
rep_genl <- pair_genl[1, -na_rep] # remove NA
rep_loci <- unique(rep_genl@chromosome) # find unique loci names
na_samp <- na_pos[[2]] # NA positions in the samp
samp_genl <- pair_genl[2, -na_samp] # remove NA
samp_loci <- unique(samp_genl@chromosome) # find unique loci names
diffs <- length(setdiff(rep_loci, samp_loci)) + length(setdiff(samp_loci, rep_loci)) # differences
in_both <- intersect(rep_loci, samp_loci)
locus_error <- 100 * diffs / (diffs + length(in_both)) # diffs divided by possible comparisons
# allele
# loci in both
allele_genl <- pair_genl[, pair_genl@chromosome %in% in_both] # subset genlight for loci in both
chrom_list <- rep(NA, length = length(in_both))
comp_mat <- as.matrix(allele_genl)
comp_mat[is.na(comp_mat)] <- 3 # change NA to a number that won't match
index <- 1
allele_errors <- 0
for (snp in seq_len(ncol(comp_mat))) {
if (as.character(allele_genl@chromosome[snp]) %in% chrom_list) { # if we have seen this locus already
if (hit == 0) { # if we haven't found an error yet for it
if (comp_mat[1, snp] != comp_mat[2, snp]) {
allele_errors <- allele_errors + 1
hit <- 1
}
}
} else { # new locus
chrom_list[index] <- as.character(allele_genl@chromosome[snp]) # add it to the list
index <- index + 1
if (comp_mat[1, snp] != comp_mat[2, snp]) {
allele_errors <- allele_errors + 1
hit <- 1
} else {
hit <- 0
}
}
}
allele_error <- 100 * allele_errors / (length(in_both))
# SNP
# all SNPs comparable, but no NA as differences
comp_mat <- as.matrix(allele_genl)
comp_mat <- comp_mat[, colSums(is.na(comp_mat)) == 0] # completely remove any NA comparisons
snp_errors <- 0
for (snp in seq_len(ncol(comp_mat))) {
if (comp_mat[1, snp] != comp_mat[2, snp]) {
snp_errors <- snp_errors + 1
}
}
snp_error <- 100 * snp_errors / ncol(comp_mat)
# Now capture these values for that pair
error_df[pair_row, ] <- c(locus_error, allele_error, snp_error)
# report
cat("\tProcessed", length(union(rep_loci, samp_loci)), "total loci, of which",
diffs, "were present in one but not the other\n")
cat("\tProcessed", length(in_both), "loci present in both, of which",
allele_errors, "had at least one SNP difference\n")
cat("\tProcessed", ncol(comp_mat), "SNPs present in both, of which",
snp_errors, "differed\n")
}
# Write the resulting error rates to a file for plotting after
write.table(error_df, file = paste0(outpre, "_error_table.tab"), sep = "\t",
quote = FALSE, row.names = FALSE)
}
# This next step is not yet implemented, nor is it clear whether it should be (independent plotting)
##########
# create PCoA
##########
# create a PCoA to display how well populations/replicates are clustering
#cat("Running a PCoA based on the genlight\n")
# Remove monomorphic SNPs (?)
#gl_matrix <- as.matrix(genl)
#loc_list <- array(NA, length([email protected]))
#for (index in 1: length([email protected])) {
# row <- gl_matrix[, index]
# if (all(row == 0, na.rm = TRUE) | all(row == 2, na.rm = TRUE)
# | all(row == 1, na.rm = TRUE) | all(is.na(row))) {
# loc_list[index] <- [email protected][index]
# }
#}
#loc_list <- loc_list[!is.na(loc_list)]
#if (length(loc_list) > 0) {
# cat("Dropping", length(loc_list), "monomorphic or all NA loci\n")
# pca_genl <- genl[, is.na(match([email protected], loc_list))] # the ones that don't match
#} else {
# cat("There are no monomorphic loci\n")
#}
#cat("Retained", length([email protected]), "SNPs\n")
# run the PCA
#pca <- glPca(pca_genl, nf = 4, loadings = FALSE, parallel = TRUE, n.cores = 4)
# note the percentage variation from the first 8 eigenvalues
#perc_eig <- 100 * pca$eig / sum(pca$eig)
#sumvar <- sum(perc_eig[1:8])
# plot the PCoA for visual check of population/replicate clustering
###########
###########
# Remove replicates if they are present
if (reps_present) {
cat("Dropping", length(reps), "replicates\n")
original_names <- original_names[! [email protected] %in% reps]
populations <- populations[! [email protected] %in% reps]
genl <- genl[! [email protected] %in% reps, ]
}
# Calculate how many loci and SNPs are present in >= 80% of individuals
# this is to emulate some of the ideas in Paris et al. 2017
check_mat <- as.matrix(genl)
keep_snps <- colSums(! is.na(check_mat)) / nrow(check_mat) >= 0.80
check_genl <- genl[, keep_snps]
nsnps <- sum(keep_snps)
nloci <- length(unique(check_genl@chromosome))
cat("There were", nloci, "loci and", nsnps, "SNPs present in more than 80% of individuals\n")
# write to a table for plotting
count_df <- as.data.frame(matrix(ncol = 2, nrow = 1))
colnames(count_df) <- c("loci", "snps")
count_df[1, ] <- c(nloci, nsnps)
write.table(count_df, file = paste0(outpre, "_count80.tab"), sep = "\t", quote = FALSE, row.names = FALSE)
# Remove monomorphic SNPs
gl_matrix <- as.matrix(genl)
loc_list <- array(NA, length([email protected]))
for (index in seq_len(length([email protected]))) {
row <- gl_matrix[, index]
if (all(row == 0, na.rm = TRUE) | all(row == 2, na.rm = TRUE)
| all(row == 1, na.rm = TRUE) | all(is.na(row))) {
loc_list[index] <- [email protected][index]
}
}
loc_list <- loc_list[!is.na(loc_list)]
if (length(loc_list) > 0) {
cat("Dropping", length(loc_list), "monomorphic or all NA loci\n")
genl <- genl[, is.na(match([email protected], loc_list))] # the ones that don't match
} else {
cat("There are no monomorphic loci\n")
}
cat("Retained", length([email protected]), "SNPs\n")
##########
# Genetic distance and populations
##########
# If possible, assess genetic distances between individuals in the same populations
cat("Assessing intrapopulation Euclidean distances between individuals\n")
# Filter on missing data
check_mat <- as.matrix(genl)
keep_snps <- colSums(! is.na(check_mat)) / nrow(check_mat) >= min_present_rate
genl <- genl[, keep_snps]
cat("Kept", sum(keep_snps), "SNPs after filtering for a minimum sample coverage of",
min_present_rate, "\n")
# Calculate distances between individuals
# Use dist and Euclidean distance, which *should* omit calculations with missing values
dist_obj <- dist(as.matrix(genl))
# Normalize the distances
dist_mat <- as.matrix(dist_obj / max(dist_obj))
# Check that labels match (should)
if (! all(original_names == rownames(dist_mat))) {
cat("Whoops, names aren't matching as expected\n")
cat(populations)
populations <- populations[match(rownames(dist_mat), original_names)]
cat(populations)
}
# Extract distances between individuals of the same population
pops <- unique(populations)
dists_df <- as.data.frame(matrix(nrow = length(pops), ncol = 2))
colnames(dists_df) <- c("number_ind", "mean_intra_dist")
for (popnum in seq_len(length(pops))) {
pop <- pops[popnum]
pos <- populations %in% pop # find the positions which match that pop
dmat <- dist_mat[pos, pos] # extract a smaller matrix of samples from that pop
dists <- dmat[lower.tri(dmat)] # keep the lower triangle values
avg_dist <- mean(dists) # calculate an average within population distance
dists_df[popnum, ] <- c(nrow(dmat), avg_dist)
}
# Write the resulting file for plotting after
write.table(dists_df, file = paste0(outpre, "_dist_table.tab"), sep = "\t", quote = FALSE, row.names = FALSE)
# Report that the script has finished
if (reps_present) {
cat("Finished running MY replicate error assessment and intrapopulation Euclidean distances\n\n")
} else {
cat("Finished running intrapopulation Euclidean distances\n\n")
}