-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathR.step2
executable file
·192 lines (135 loc) · 8.42 KB
/
R.step2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env Rscript
library("optparse")
### BELOW IS CODE FROM https://www.r-bloggers.com/2015/09/passing-arguments-to-an-r-script-from-command-lines/ #
option_list = list(
make_option(c("-m", "--metadata"), type="character", default=NULL,
help="metadata file name", metavar="character"),
make_option(c("-i", "--within_G"), type="numeric", default=NULL,
help="minimum within-group prevalence", metavar="numeric"),
make_option(c("-o", "--without_G"), type="numeric", default=NULL,
help="maximum outside-of-group prevalence", metavar="numeric"),
make_option(c("-p", "--plasmid"), type="character", default=NULL,
help="plasmid type to evaluate", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
# if (is.null(opt$file)){
# print_help(opt_parser)
# stop("At least one argument must be supplied (input file).n", call.=FALSE)
# }
### END OF BORROWED CODE ###
library(DECIPHER)
### Add line to load Step1.RData ###
load("./roary/Step1.Rdata")
# Edit 6-4-2022
system(paste0("mkdir ", opt$plasmid))
system(paste0("mkdir ",opt$plasmid,"/unvalidated"))
# The imported variable mx.loci.bin is a filtered dataset
# If a plasmid only contained loci that occured fewer than three times in the entire dataset, that plasmid would be excluded
# This may cause a mismatch in dimensions when referencing the metadata file
# Also, if a plasmid was unassigned a plasmid type but still retained loci, this would also result in a mismatch
selected.plasmid <- opt$plasmid
withinmin <- opt$within_G
withoutmax <- opt$without_G
plasmid.meta <- read.csv(paste(getwd(), opt$metadata, sep = "/"), header = TRUE)
df.plasmid.meta <- as.data.frame(plasmid.meta[,-1])
rownames(df.plasmid.meta) <- plasmid.meta[,1]
meta.list <- list(rownames(df.plasmid.meta))
names(meta.list) <- meta.list
df.plasmid.meta.an <- df.plasmid.meta
# plasmid.meta.temp <- as.data.frame(matrix(0, ncol = ncol(mx.loci.bin), nrow = nrow(df.plasmid.meta.an)))
# colnames(plasmid.meta.temp) <- colnames(mx.loci.bin)
df.plasmid.meta.an[which(df.plasmid.meta.an != 0, arr.ind = TRUE)] <- "black"
df.plasmid.meta.an[which(df.plasmid.meta.an == 0, arr.ind = TRUE)] <- "white"
# Addition to allow for the use of a metadata sheet that includes data for more than just the inhouse set of sequences
df.plasmid.meta <- df.plasmid.meta[,which(colnames(df.plasmid.meta) %in% colnames(mx.loci.bin) == TRUE)]
# Divide the dataset into plasmids that are/are not members of the group of interest
# within.set <- as.matrix(mx.loci.bin[,colnames(df.plasmid.meta)[which(df.plasmid.meta[selected.plasmid,] != 0, arr.ind = TRUE)[,2]]])
# without.set <- as.matrix(mx.loci.bin[,colnames(df.plasmid.meta)[which(df.plasmid.meta[selected.plasmid,] == 0, arr.ind = TRUE)[,2]]])
within.set <- mx.loci.bin[,colnames(df.plasmid.meta[,which(df.plasmid.meta[selected.plasmid,] != 0)])]
without.set <- mx.loci.bin[,which(colnames(mx.loci.bin) %in% colnames(within.set) != TRUE, arr.ind = TRUE)]
# Prepare the dataframe
df.vals <- as.data.frame(matrix(0, nrow = nrow(within.set), ncol = 2))
rownames(df.vals) <- rownames(within.set)
# Obtain the prevalence of each loci within/without the group (remembering to account for groups where n=1)
ifelse(ncol(within.set) > 1, df.vals[,1] <- apply(within.set,1,mean), df.vals[,1] <- as.numeric(within.set[,1]))
df.vals[,2] <- apply(without.set,1,mean)
# Create the prevalence plot, using slider values for the abline
# loci_plot <- plot(df.vals[,2], df.vals[,1], xlim = c(0,1), ylim = c(0,1), main = paste(plasmid, " n=", ncol(within.set), sep = ""),xlab = "Frequency Outside Group", ylab = "Frequency Within Group", abline(v = without_G, h = within_G, col = 2))
# Subset out the loci that meet selection criterea, as dictated by the slider values
p1 <- df.vals[which(df.vals[,1] >= withinmin , arr.ind = TRUE),]
p2 <- p1[which(p1[,2] <= withoutmax , arr.ind = TRUE),]
# length(p2)
# p2
paste0(nrow(p2), " loci meet selection parameters")
png(file = paste0("./",selected.plasmid,"/",selected.plasmid,"_unvalidated_prevalence_plot.png"), width = 1800, height = 1800, res = 300)
plot(df.vals[,2], df.vals[,1], xlim = c(0,1), ylim = c(0,1), main = paste(opt$plasmid, " n=", ncol(within.set), sep = ""),xlab = "Frequency Outside Group", ylab = "Frequency Within Group", abline(v = opt$without_G, h = opt$within_G, col = 2))
mtext(paste0(nrow(p2), " loci meet selection parameters"), side = 3, cex=0.8)
dev.off()
# rownames(plasmid.meta.temp) <- rownames(df.plasmid.meta.an)
plasmid.meta.temp <- as.data.frame(matrix(0, ncol = ncol(mx.loci.bin), nrow = nrow(df.plasmid.meta)))
colnames(plasmid.meta.temp) <- colnames(mx.loci.bin)
rownames(plasmid.meta.temp) <- rownames(df.plasmid.meta)
plasmid.meta.temp[which(plasmid.meta.temp == 0, arr.ind = TRUE)] <- as.character("white")
for(i in 1:ncol(plasmid.meta.temp)){
if(colnames(plasmid.meta.temp)[i] %in% colnames(df.plasmid.meta.an) == TRUE){
plasmid.meta.temp[,i] <- df.plasmid.meta.an[,which(colnames(df.plasmid.meta.an) == colnames(plasmid.meta.temp)[i], arr.ind = TRUE)]
} else {
tempX <- 0
}
}
# dist.loci.bin <- dist(mx.loci.bin, method = "binary")
# dist.t.loci.bin <- dist(t(mx.loci.bin), method = "binary")
# hclust.loci.bin <- hclust(dist.loci.bin)
# hclust.t.loci.bin <- hclust(dist.t.loci.bin)
# heatmap(mx.loci.bin, Colv = as.dendrogram(hlcust.t.loci.bin), Rowv = as.dendrogram(hclust.loci.bin), scale = "none")
dend.color.bars <- plasmid.meta.temp[selected.plasmid,]
png(file = paste0("./",selected.plasmid,"/",selected.plasmid,"_Loci_highlight_PAM.png"), width = 3600, height = 3600, res = 300)
heatmap(mx.loci.bin, Colv = as.dendrogram(hclust.t.loci.bin), Rowv = as.dendrogram(hclust.loci.bin), scale = "none", main = paste0("Presence/Absence Matrix of Loci, group highlighted: ", selected.plasmid), ylab = "Loci", ColSideColors = as.character(dend.color.bars))
dev.off()
# for(i in 1:ncol(df.plasmid.meta.an)){
# plasmid.meta.temp[,match(colnames(df.plasmid.meta.an)[i], colnames(plasmid.meta.temp))] <- df.plasmid.meta.an[,colnames(df.plasmid.meta.an)[i]]
# }
# Generate a string of the loci of interest and obtain the current loci count
loci.of.interest <- paste((rownames(p2[which(p2[,1] > 0),])), collapse = ", ")
num.loci.of.interest <- length(rownames(p2[which(p2[,1] > 0),]))
loci.for.validation <- rownames(p2[which(p2[,1] > 0),])
common.for.validation <- df.name.key[loci.for.validation,1]
IG.for.validation <- loci.for.validation[which(loci.for.validation %in% IGindex[,1] == TRUE, arr.ind = TRUE)]
coding.coords1 <- df.annot.coords1[match(common.for.validation, df.annot.coords1[,2]),]
coding.coords1 <- na.omit(coding.coords1)
coding.coords1[,2] <- as.character(coding.coords1[,2])
coding.coords1[,5] <- as.character(coding.coords1[,5])
coding.coords.2 <- matrix(0, ncol = 1, nrow = nrow(coding.coords1))
for(i in 1:nrow(coding.coords.2)){
coding.coords.2[i,1] <- paste(as.character(coding.coords1[i,]), collapse = " ")
}
rev.coding.coords <- coding.coords.2[which(coding.coords1[,5] == "-", arr.ind = TRUE),1]
fwd.coding.coords <- coding.coords.2[which(coding.coords1[,5] == "+", arr.ind = TRUE),1]
writeLines(as.character(rev.coding.coords), con = paste0("./",selected.plasmid,"/unvalidated/rev.coding.coords"))
writeLines(as.character(fwd.coding.coords), con = paste0("./",selected.plasmid,"/unvalidated/fwd.coding.coords"))
set.IGindex <- IG.for.validation
###
# Escape for no intergenic regions
if(length(IG.for.validation) != 0){
set.IGindex <- IG.for.validation
for(j in 1:1){
writeout.IG.seq <- as.data.frame(matrix(0, ncol = 1, nrow = 2 * length(set.IGindex)))
for(i in 1:length(set.IGindex)){
v.cluster <- paste(piggy.dir,"/",as.character(set.IGindex[i]), "_aligned.fasta", sep = "")
dna <- readDNAStringSet(v.cluster)
dna <- RemoveGaps(dna, removeGaps = "all")
DNA <- AlignSeqs(dna)
consensus.result <- as.data.frame(ConsensusSequence(DNA))
writeout.IG.seq[(i*2)-1,1] <- paste(">", set.IGindex[i], sep = "")
writeout.IG.seq[(i*2),1] <- consensus.result[1,1]
writeout.IG.seq1 <- writeout.IG.seq
}
}
writeLines(as.character(writeout.IG.seq1[,1]), con = paste0("./",selected.plasmid,"/unvalidated/intergenic.fasta"))
} else {
writeLines(as.character(" "), con = paste0("./",selected.plasmid,"/unvalidated/intergenic.fasta"))
}
###
system(paste0("mkdir ./",selected.plasmid,"/data"))
save.image(paste0("./",selected.plasmid,"/data/Step2.Rdata"))