-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPlot2.R
136 lines (104 loc) · 4.36 KB
/
Plot2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# CalculateOCRMCR takes the filename of the prediction and the confidence
# This should return the a vector with OC and MC for that version.
CalculateOCRMCR <- function(filename_pred, filename_conf){
# filename_pred <- ''
# filename_conf <- ''
load(filename_pred)
load(filename_conf)
load('rdpDataframe.RData')
# In this way, we calculate the MC based on how
# many actually happened in the genus.
# We need to make sure that the actual vector that we compare
# we need to get the indices of the predictions that we made and
# create our bootstrap.
# Get the indices
confidencesIndices <- which(confidences != '')
confidences <- confidences[confidencesIndices]
predictionsIndices <- which(predictions != '0')
predictions <- predictions[predictionsIndices]
# First we initialize the data that we need to work with.
thresholdValues <- seq(0.00,1.00,0.01)
predictions <- predictions
confidences <- confidences/100
actual <- rdp$genus[confidencesIndices]
singletonGenera <- names(which(table(actual) == 1))
knownGenera <- names(which(table(actual) != 1))
lengthOfKnown <- length(actual) - length(singletonGenera)
MCvector <- vector(mode = 'double', length = length(thresholdValues))
OCvector <- vector(mode = 'double', length = length(thresholdValues))
bootstrapValues <- confidences
unclassifiedVector <- vector(mode = 'double', length = length(thresholdValues))
# We need to create a dataframe with the following columns
# Actual, Prediction, Unclassified, OC, MC,
# This dataframe has to be set for every new threshold.
# It has 5 columns and length(predictions) rows
classDex <- matrix(0, nrow = length(predictions), ncol = 6)
classDex <- data.frame(classDex)
colnames(classDex) <- c('Actual','Prediction','Confidence','Unclassified','OC','MC')
classDex[,'Actual'] <- actual
# classDex[,'Prediction'] <- predictions
classDex[,'Confidence'] <- confidences
# Start going by each threshold and compare the confidence with teh thresholdvaluess
for(i in seq_along(thresholdValues)) {
# Calculate the new predictions for based on the theshold we are calculating from
cat('Threshold = ', thresholdValues[i],'\n')
newPredictions <- vector(mode = 'character', length = length(predictions))
newPredictions[] <- NA_character_
for(j in seq_along(bootstrapValues)) {
if(is.na(bootstrapValues[j]) == FALSE) {
if(bootstrapValues[j] >= currentThreshold) {
newPredictions[j] <- predictions[j]
}
else {
newPredictions[j] <- "unclassified"
}
}
}
unclassifiedVector[i] <- length(which(newPredictions == "unclassified"))/length(actual)
# Set our new predictions to the ones that have been modified
classDex[,'Prediction'] <- newPredictions
# Set our unclassifieds to true or false based on whether that predictions was called
# an unclassified or classified.
classDex[which(newPredictions == "unclassified"),'Unclassified'] <- TRUE
classDex[which(newPredictions != "unclassified"),'Unclassified'] <- FALSE
# Going through the predictions and comiing up
# with whether that was an MC or OC.
for(k in seq_along(predictions)) {
if(is.na(classDex[,'Prediction'] == FALSE){
if((classDex[,'Actual'] %in% singletonGenera)){
if(classDex[,'Unclassified'] == FALSE){
classDex[,'OC'] <- 1
}
}
else if((classDex[,'Unclassified'] == FALSE)) {
if(classDex[,'Prediction'] != classDex[,'Actual']){
classDex[,'MC'] <- 1
}
}
}
}
# The oc vector is calculated as normal.
OCvector[i] <- sum(classDex[,'OC'])/length(singletonGenera)
# Now to calculate the MC's
# In order to do the MC's we need to calculate
# We calculate the MC by taking into account the
# number of MC errors made for that genus present
# from that part of the dataset.
# In this case, we only need to pick out the genera
# Whose MC's are not 0. The genera that are not
# 0 would be the ones where the genus would
# be misclassified
# Scan through the dataframe by genus
# and find out what fraction of the selected
# genus is the genera.
uniqueGenera <- unique(classDex[,'Actual'])
mcv <- sapply(uniqueGenera, FUN = function(x) {
genSum <- sum(classDex[which(x == classDex[,'Actual']),'MC'])
genSum <- genSum/length(which(x == classDex[,'Actual']))
return(genSum)
})
mcv <- mcv/length(singletonGenera)
MCvector[i] <- mcv
}
return(list(mcv = MCvector, ocv = OCvector))
}