-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCreateTrainingSetsForEachSequence.R
155 lines (112 loc) · 4.46 KB
/
CreateTrainingSetsForEachSequence.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
args = (commandArgs(TRUE))
# Bi-directional sintax
# use 50 replicates for bootstrapping on each size
# use intersect.
if(length(args)==0){
print("No arguments supplied.")
##supply default values
start = 1
end = 13212
k = 8
s = 32
}else{
for(i in 1:length(args)){
eval(parse(text=args[[i]]))
}
}
loadfilename <- paste(c('tfidf',k,'mers.RData'),collapse = "")
load(loadfilename)
load('rdpDataframe.RData')
rank <- rdp$genus
names(rank) <- rank
sequences <- rdp$sequences
uniqueRank <- unique(rank)
names(uniqueRank) <- uniqueRank
mers <- lapply(sequences,
function (x) {
substring(x,
1:(nchar(x) - k + 1L),
k:nchar(x))
})
names(mers) <- rank
# In this program, we have to create a training set for each test sequence in our dataset.
# This means, for every test sequence in our dataset, we will have to create a new smaller
# dataset containing 2472 for 2472 genera in the dataset where each index corresponds to a
# sequence from a unique genus from the dataset.
# How do we do this?
# First, we take a test seqeuence from the dataset and make the rest of the dataset as our
# training set. Then, we segregate the training set based on the genus they represent.
# We can do this by picking out the corresponding indices for each genus.
# Once we have segregated, we train genus-by-genus. So for every genus that is represented
# by our training set, we train against the seqeunces in that genus and find the sequence
# that we hit most on that genus's set of sequence and pick that index and
# add it to our vector of indices for that sequence.
# Use the v4 sintax
tfidfVals <- eval(parse(text = paste(c('tfidf',k,'mers'), collapse = '')))
findBestIndices <- function(indices, test, train, samp_matrix_w) {
# we now have the training sequence from each genus
# Our objective at this point is to do 100 boostrap replicates
# on the new training set and find the index with the most number of
# hits.
# make a vector keep counts for which is the best vector
counts <- integer(length(train))
test <- unlist(test)
for(j in 1:100) {
# cat('Bootstrap No', j,'\n')
bootstrappedKmers <- test[samp_matrix_w[j,]]
overlapVector <- sapply(train, k = bootstrappedKmers, FUN = function(x,k) {
t1 <- unlist(x)
t2 <- unlist(k)
length(intersect(t1,t2))
})
maxPos <- which(overlapVector == max(overlapVector))
if(length(maxPos) > 1){
maxPos <- sample(maxPos)[1]
}
counts[maxPos] <- counts[maxPos] + 1
# cat('counts', counts)
}
names(counts) <- indices
maxIndices <- names(counts[which(counts == max(counts))])
if(length(maxIndices) > 1){
maxIndices <- sample(maxIndices)[1]
}
cat('predictedIndex = ',maxIndices,'\n')
return(maxIndices)
}
indexHolder <- as.list(integer(13212))
for(i in start:end) {
# Pick the test sequence
tfidfWeights <- tfidfVals[[i]]
testSeq <- mers[[i]]
testRank <- rank[[i]]
trainingSeqs <- mers[-i]
trainingRank <- rank[-i]
indexWithNames <- seq(1,13211)
names(indexWithNames) <- trainingRank
weights <- tfidfWeights[testSeq]
probs <- weights/sum(weights)
samp_matrix_w <- matrix(sample(length(testSeq),3200,replace = TRUE,prob=probs),nrow=100,ncol=32)
cat('Sequence No : ', i,'\n')
cat('Sequence Genus = ',testRank,'\n')
# Get the indices of seqeunces of every genus
uniqueTrainingRanks <- unique(trainingRank)
segIndex <- lapply(uniqueTrainingRanks, function(x) {
which(x == names(indexWithNames))
})
# Find the best index from processing every genus
genusIndices <- sapply(segIndex, FUN = function(x) {
indices <- unlist(x)
train <- trainingSeqs[indices]
test <- unlist(testSeq)
returnIndex <- findBestIndices(indices, test, train, samp_matrix_w)
cat('Returned Index = ', returnIndex,'\n')
return(returnIndex)
})
genusIndices <- strtoi(genusIndices)
genusIndices[which(genusIndices > i)] <- genusIndices[which(genusIndices > i)] + 1
names(genusIndices) <- uniqueTrainingRanks
indexHolder[[i]] <- genusIndices
# cat('Final Indices', genusIndices)
}
save(indexHolder, file = paste0(c('tSet_',end), collapse = ""))