-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClassifier.R
176 lines (144 loc) · 5.46 KB
/
Classifier.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
library(DECIPHER)
value <- "Baseline" # name of the run
K <- 8 # k-mer size
S <- function(N) ceiling(32/(1 + exp(-1.445*(log(N) - log(44.33)))))
B <- 1000 # bootstrap replicates
LOOS <- TRUE # perform leave-out-one-sequence?
allkmers <- mkAllStrings(DNA_BASES, K)
# load the training sequences
train <- readDNAStringSet("/home/adithya/RDP/rdpDataset/RDPClassifier_16S_trainsetNo16_rawtrainingdata/trainset16_022016.fa")
# get the full classification (requires "Root;")
classes <- sapply(strsplit(names(train), "Root;", fixed=TRUE),
`[`,
2L)
# split sequence into k-mers
kmers <- lapply(as.character(train),
function(x)
substring(x, 1:(nchar(x) - K + 1), K:nchar(x)))
# index and sort all pure k-mers
kmers <- lapply(kmers, match, table=allkmers)
kmers <- lapply(kmers, function(x) sort(unique(x[!is.na(x)])))
# if performing actual classification, process test sequences
if (LOOS) {
testkmers <- kmers
} else {
# load the test sequences
test <- readDNAStringSet("~/Downloads/SRR5279174.fasta")
# eventually will need to collapse duplicates here
# also need to consider sequence orientation (+/-)
# (check both orientations with a few sequences, pick the best)
# split sequence into k-mers
testkmers <- lapply(as.character(test),
function(x)
substring(x, 1:(nchar(x) - K + 1), K:nchar(x)))
# index and sort all pure k-mers
testkmers <- lapply(testkmers, match, table=allkmers)
testkmers <- lapply(testkmers, function(x) sort(unique(x[!is.na(x)])))
}
# ultimately, will need to compute confidence for each class
# but for now only report confidence for the final class label
# count the number of representatives per class
t <- table(classes)
weight <- 1/t[classes] # weigh each class equally
# compute idf weights
counts <- numeric(length(allkmers))
for (i in seq_along(classes)) {
counts[kmers[[i]]] <- counts[kmers[[i]]] + weight[i]
}
counts <- length(t)/(1 + counts) # Max/(1 + F)
counts <- log(counts) # idf weight is log(Max/(1 + F))
# initialize vectors for the output
predicted <- character(length(testkmers))
predicted[] <- NA_character_
confidence <- numeric(length(testkmers))
confidence[] <- NA_real_
pBar <- txtProgressBar(style=3)
for (i in seq_along(testkmers)) {
if (length(testkmers[[i]])==0)
next # no k-mers to test
# sample the same k-mers for all genera
s <- S(length(testkmers[[i]])) # subsample size
sampling <- matrix(sample(testkmers[[i]], s*B, replace=TRUE), B, s)
# only match unique k-mers to improve speed
uSampling <- sort(unique(as.vector(sampling)))
m <- match(sampling, uSampling)
# lookup IDF weights for sampled k-mers
myweights <- sampling
myweights[] <- counts[sampling]
# record the matches to each genus
hits <- matrix(0,
nrow=length(classes),
ncol=B)
for (j in seq_along(classes)) {
matches <- .Call("intMatch",
uSampling,
kmers[[j]],
PACKAGE="DECIPHER")[m]
matches <- myweights*matches
# alternatively, the above two commands can be replaced by:
#matches <- myweights*(sampling %in% kmers[[j]]) # slightly slower
hits[j,] <- rowSums(matches)
}
if (LOOS)
hits[i,] <- -1 # eliminate this sequence from matching
# compute confidence from the number of hits per class
sumOfHits <- numeric(length(classes))
davg <- mean(rowSums(myweights))
for (j in seq_len(B)) {
mymax <- max(hits[,j])
w <- which(hits[,j]==mymax)
if (length(w) > 1) {
selected <- sample(w, 1)
} else {
selected <- w
}
# our formula:
sumOfHits[selected] <- sumOfHits[selected] + mymax/davg
# original SINTAX formula:
#sumOfHits[selected] <- sumOfHits[selected] + 1
}
t <- tapply(sumOfHits, classes, sum)
t <- t[which.max(t)]
# record the results for this sequence
predicted[i] <- names(t)
confidence[i] <- t/B*100 # scale confidence between [0, 100]
setTxtProgressBar(pBar, i/length(testkmers))
}
# save the results for all sequences
eval(parse(text=paste("p", value, " = predicted", sep="")))
eval(parse(text=paste("c", value, " = confidence", sep="")))
eval(parse(text=paste("save(p", value, ", c", value, ", file='~/Desktop/TEMP/temp", value, ".RData')", sep="")))
####################################
## Plotting
####################################
# create the plot once
plot(NA,
xlim=c(0, 1),
ylim=c(0, 1),
xlab="Fraction of classifiable sequences classified",
ylab="Classification rate")
# then run the code below once per dataset:
# load the desired results into memory, then:
predicted <- pBaseline16S
confidence <- cBaseline16S
t <- table(classes)
weight <- 1/t[classes] # weigh each class equally
singletons <- which(classes %in% names(t[t==1]))
nonsingletons <- which(classes %in% names(t[t > 1]))
thresholds <- seq(0, 100, 0.1)
OC <- MC <- CC <- numeric(length(thresholds))
for (k in seq_along(thresholds)) {
# OC = (# singletons classified) / (# singletons total)
OC[k] <- sum(confidence[singletons] >= thresholds[k])/length(singletons)
# MC = (# non-singletons mis-classified) / (# non-singletons classified)
w <- nonsingletons[which(confidence[nonsingletons] >= thresholds[k])]
MC[k] <- sum(weight[w[predicted[w] != classes[w]]])/sum(weight[w])
# CC = (# non-singletons classified) / (# non-singletons total)
CC[k] <- sum(weight[w])/sum(weight[nonsingletons])
}
lines(CC, OC, col="black")
lines(CC, MC, col="black")
legend("topleft",
legend=c("Original SINTAX", "+ Formula (F)", "F + IDF log"),
col=c("gray", "black", "green"),
lty=1)