-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsintaxTfidfBootstrapv1.R
182 lines (132 loc) · 5.3 KB
/
sintaxTfidfBootstrapv1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# tfidf contains the tf and idf functions, whose values will be returned
# into the current namespace.
# About the functions :
# ----- tf(query, dataset, mode)
# This returns the term frequency of the kmers in the query seqeunce as
# a vector, where the names of the vector is the kmers and the values
# of the vector is the term frequency.
# >>>> Possible modes : rawsmooth, lognormal, doublenormK (default k = 0.5)
# ---- idf(query, dataset, mode)
# This returns the inverse document frequency as a vector that contains,
# the idf values for every query in our sequence against the dataset.
# The vector names are the names of the kmers and their respective
# idf values.
# >>>> Possible modes : log, logsmooth, logmax, probfreq
# Refer to tfidf wikipedia page for more information on the tfidf.
# source('tfidf.R')
args = (commandArgs(TRUE))
if(length(args)==0){
print("No arguments supplied.")
##supply default values
start = 1
end = 13212
k = 8
s = 32
}else{
for(i in 1:length(args)){
eval(parse(text=args[[i]]))
}
}
load('rdpDataframe.RData')
loadfilename <- paste(c('tfidf',k,'mers.RData'),collapse = "")
loadfilename2 <- paste(c(k,'mersPredictions.RData'), collapse = "")
load(loadfilename2)
load(loadfilename)
rank <- rdp$genus
names(rank) <- rank
sequences <- rdp$sequences
uniqueRank <- unique(rank)
names(uniqueRank) <- uniqueRank
mers <- lapply(sequences,
function (x) {
substring(x,
1:(nchar(x) - k + 1L),
k:nchar(x))
})
names(mers) <- rank
# MODIFICATION TO EXISTING SINTAX ALGORITHM
# -------------------------- THE ALGORTHM -----------------------------------
# Singleton sequences must be present in our query database but
# not in our
query_ranks <- rank
query_seqs <- mers
# Removing the singleton sequences from our reference database.
refernece_db_ranks <- rank
refernece_db_seqs <- mers
bs_confidence_vector <- vector(mode = 'integer', length=length(mers))
names(bs_confidence_vector) <- rdp$genus
tfidfVals <- eval(parse(text = paste(c('tfidf',k,'mers'), collapse = '')))
for(i in start:end)
{
tfidfSeq <- tfidfVals[[i]]
testSeq <- mers[[i]]
testRank <- rank[[i]]
# predictedRankFromPredictions <- predicted_RDP_sintax[i]
training_db_rank <- rank[-i]
training_db_seqs <- mers[-i]
confidenceVector <- vector(mode = 'integer', length = length(uniqueRank))
names(confidenceVector) <- uniqueRank
sequence_df <- data.frame(matrix(NA, nrow = 100, ncol = 5))
cat('testRank ', testRank, '\n')
for(j in 1:100) {
cat('bootstrapNo : ', j, '\n')
testSeq <- unlist(testSeq)
sampleKmerIndices <- sample(length(testSeq), s, replace = FALSE)
bootstrappedKmers <- testSeq[sampleKmerIndices]
# The following is our overlap vector, which we can use to find the
overlapVector <- sapply(training_db_seqs, k = bootstrappedKmers, FUN = function(X,k) {
t1 = unlist(X)
t2 = unlist(k)
# So instead of just getting the overlaps as 1s and 0s
# you would have to find the overlap in the tfidfseq
length(intersect(t1, t2))
})
# This will return the overlap vector with the hi*di product
# the next step is to divide them into the
maxPos <- which(overlapVector == max(overlapVector))
if(length(maxPos) > 1) {
maxPos <- sample(maxPos)[1]
} else {
maxPos <- maxPos[1]
}
predicted <- training_db_rank[maxPos]
hi <- max(overlapVector)
# confidenceVector[predicted] <- confidenceVector[predicted] + 1
cat('Predicted In bootstrap : ', predicted,'\n')
# Now that we have the common kmers, we can use the same kmers to get the di from the tfidfSeq
# we can use the common kmers to get the values that we need and store it in a dataframe
di <- sum(tfidfSeq[bootstrappedKmers])
sequence_df[j,1] <- predicted
sequence_df[j,2] <- hi
sequence_df[j,3] <- di
}
S
# Once we have the values for all the bootstraps. we need to calculate te di/davg fore every bootstrap
# we do this by doing the following
davg <- sum(sequence_df[,3])/100
sequence_df[,4] <- sequence_df[,3]/davg
# once we have found the di/davg for every bootstrap[, find get the averaged
sequence_df[,5] <- sequence_df[,4] * sequence_df[,2]
# we run into an issue where our actual rank may not be present in one of the predicted ranks,
# if thats the case, then we need to control for it by assigning the value of tha bootstrap for that
# to 0.
predictedRanks <- sequence_df[,1]
predictedRanks <- unique(predictedRanks)
if(predictions[i] %in% predictedRanks) {
# getting the threshold for our current sequence.
confidence <- sum(sequence_df[which(sequence_df[,1] == predictions[i]),5])/sum(sequence_df[,2])
} else if((predictions[i] %in% predictedRanks) == FALSE) {
confidence <- 0
}
bs_confidence_vector[i] <- confidence
cat('Query Seq : ', i,'\n')
cat('Final Prediction : ', testRank, '\n')
}
# Now we can use the existing bootstrap to begin
# our predictions using overlapping k-mers
# Now instead of using only part of the sequence,
# we will use full length sequences and find out
# the correct genus using the annotation.
savelink <- paste(c('SINTAXtfidf',k,'mers_',end,'.RData'), collapse = "")
save(bs_confidence_vector, file = savelink)
# ----------------------------------------------------------------------------------------