-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsintaxTfidfBootstrapv9.R
223 lines (165 loc) · 6.54 KB
/
sintaxTfidfBootstrapv9.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# tfidf contains the tf and idf functions, whose values will be returned
# into the current namespace.
# About the functions :
# ----- tf(query, dataset, mode)
# This returns the term frequency of the kmers in the query seqeunce as
# a vector, where the names of the vector is the kmers and the values
# of the vector is the term frequency.
# >>>> Possible modes : rawsmooth, lognormal, doublenormK (default k = 0.5)
# ---- idf(query, dataset, mode)
# This returns the inverse document frequency as a vector that contains,
# the idf values for every query in our sequence against the dataset.
# The vector names are the names of the kmers and their respective
# idf values.
# >>>> Possible modes : log, logsmooth, logmax, probfreq
# Refer to tfidf wikipedia page for more information on the tfidf.
# source('tfidf.R')
args = (commandArgs(TRUE))
if(length(args)==0){
print("No arguments supplied.")
##supply default values
start = 1
end = 13212
k = 8
s = 32
mode = 1
}else{
for(i in 1:length(args)){
eval(parse(text=args[[i]]))
}
}
# loadfilename <- paste(c('tfidf',k,'mers.RData'),collapse = "")
if(mode %in% c(1,2,3,4,5,6)) {
loadfilename <- paste(c('pow0.',mode,'weightsIDF.RData'), collapse = '')
}else if (mode == 'log') {
loadfilename <- paste(c('logWeightsIDF.RData'), collapse = '')
}
cat('Loading Files ... ','\n')
load(loadfilename)
load('rdpDataframe.RData')
cat('Creating All Possible Kmers ... ' ,'\n')
# Find out which kmers in our idf vector is na, and remove them
weights <- weights[!is.na(weights)]
alldna <- names(weights)
rank <- rdp$genus
names(rank) <- rank
sequences <- rdp$sequences
uniqueRank <- unique(rank)
names(uniqueRank) <- uniqueRank
cat('Creating Impure Kmers', '\n')
impureKmers <- lapply(sequences,
function (x) {
substring(x,
1:(nchar(x) - k + 1L),
k:nchar(x))
})
names(impureKmers) <- rank
# Modify the kmer list to include only pure kmers
cat('Filtering pure kmers ', '\n')
mers <- lapply(impureKmers,
function(x) {
return(x[which(x %in% alldna)])
})
names(mers) <- rank
# MODIFICATION TO EXISTING SINTAX ALGORITHM
# -------------------------- THE ALGORTHM -----------------------------------
# Singleton sequences must be present in our query database but
# not in our
query_ranks <- rank
query_seqs <- mers
# Removing the singleton sequences from our reference database.
refernece_db_ranks <- rank
refernece_db_seqs <- mers
bs_confidence_vector <- vector(mode = 'integer', length=length(mers))
names(bs_confidence_vector) <- rdp$genus
tfidfVals <- eval(parse(text = paste(c('weights'), collapse = '')))
predictionVector <- vector(mode = 'character', length = length(rank))
for(i in start:end)
{
tfidfSeq <- tfidfVals
testSeq <- mers[[i]]
testRank <- rank[[i]]
# predictedRankFromPredictions <- predicted_RDP_sintax[i]
training_db_rank <- rank[-i]
training_db_seqs <- mers[-i]
confidenceVector <- vector(mode = 'integer', length = length(uniqueRank))
names(confidenceVector) <- uniqueRank
sequence_df <- data.frame(matrix(NA, nrow = 100, ncol = 5))
samp_matrix_w <- matrix(sample(length(testSeq),3200,replace = TRUE),nrow=100,ncol=32)
indexVector <- as.vector(samp_matrix_w)
lookUpWeights <- tfidfSeq[testSeq[indexVector]]
cat('testRank ', testRank, '\n')
for(j in 1:100) {
cat('bootstrapNo : ', j, '\n')
testSeq <- unlist(testSeq)
sampleKmerIndices <- samp_matrix_w[j,]#sample(length(testSeq), s, replace = FALSE)
bootstrappedKmers <- testSeq[sampleKmerIndices]
# The following is our overlap vector, which we can use to find the
# pb <- txtProgressBar(min = 0, max = length(training_db_seqs), char = '=')
# val <<-1
overlapVector <- sapply(training_db_seqs, k = bootstrappedKmers, FUN = function(X,k) {
t1 = unlist(X)
names(t1) <- t1
t2 = unlist(k)
names(t2) <- t2
# return(sum(tfidfSeq[intersect(t1,t2)]))
return(sum(lookUpWeights[intersect(t1,t2)]))
})
# This will return the overlap vector with the hi*di product
# the next step is to divide them into the
maxPos <- which(overlapVector == max(overlapVector))
if(length(maxPos) > 1) {
maxPos <- sample(maxPos)[1]
} else {
maxPos <- maxPos[1]
}
predicted <- training_db_rank[maxPos]
hi <- max(overlapVector)
# confidenceVector[predicted] <- confidenceVector[predicted] + 1
cat('Predicted In bootstrap : ', predicted,'\n')
cat('Score Value : ', hi,'\n')
# Now that we have the common kmers, we can use the same kmers to get the di from the tfidfSeq
# we can use the common kmers to get the values that we need and store it in a dataframe
di <- sum(tfidfSeq[bootstrappedKmers])
sequence_df[j,1] <- predicted
sequence_df[j,2] <- hi
sequence_df[j,3] <- di
}
# Once we have the values for all the bootstraps. we need to calculate te di/davg fore every bootstrap
# we do this by doing the following
davg <- sum(sequence_df[,3])/100
sequence_df[,4] <- sequence_df[,3]/davg
# In this version, our confidence is determined by the fact that we tie the
# goodness of our bootstrap with how good our prediction was in that bootstrap
sequence_df[,5] <- sequence_df[,4] * (sequence_df[,2]/sequence_df[,3])
# we run into an issue where our actual rank may not be present in one of the predicted ranks,
# if thats the case, then we need to control for it by assigning the value of tha bootstrap for that
# to 0.
uniquePredictions <- unique(sequence_df[,1])
cvec <- sapply(uniquePredictions, function(x) {
sum(sequence_df[which(sequence_df[,1] == x),5])
})
names(cvec) <- uniquePredictions
maxPos2 <- which(cvec == max(cvec))
if(length(maxPos2) > 1) {
maxPos2 <- sample(maxPos2)[1]
}else{
maxPos2 <- maxPos2[1]
}
confidence <- cvec[maxPos2]
prediction <- uniquePredictions[maxPos2]
bs_confidence_vector[i] <- confidence
predictionVector[i] <- prediction
cat('Query Seq : ', i,'\n')
cat('Final Prediction : ', testRank, '\n')
}
# Now we can use the existing bootstrap to begin
# our predictions using overlapping k-mers
# Now instead of using only part of the sequence,
# we will use full length sequences and find out
# the correct genus using the annotation.
savelink <- paste(c('confidence_',end,'_',mode,'_',s,'_',k,'v9.RData'), collapse = "")
savelink2 <- paste(c('predictions_',end,'_',mode,'_',s,'_',k,'v9.RData'), collapse = "")
save(bs_confidence_vector, file = savelink)
save(predictionVector, file = savelink2)
# ----------------------------------------------------------------------------------------