dec_tree.py
# Jessica Jones
# CS 429
# Project 1

import csv
import math

import numpy

import node
import mushroom

# info about mushroom dataset
target_attrib = mushroom.target_attrib
positive = mushroom.positive
negative = mushroom.negative
null_class = mushroom.unknown_class
unknown_val = mushroom.unknown_val
fields = mushroom.attributes
attribute_values = mushroom.attribute_values

# info for table of chi-squared critical values
chi_sq_path = './chi_squared_table.txt'
chi_sq_vals = ['dof', '50', '95', '99']
chi_squared_table = []

# print tree for debugging
def print_tree(root, level):
    if root.parent:
        parent_name = str(root.parent.split_attrib)
    else:
        parent_name = ''
    if root.label:
        print("#", root.count, " Level-", level, " Label-", root.label, " Parent-", parent_name, " ", str(root.attributes))
    else:
        print("#", root.count, " Level-", level, " Split attrib-", root.split_attrib, " Parent-", parent_name, " ", str(root.attributes))
    for child in root.children:
        print_tree(child, level + 1)

# get depth of tree
def get_max_depth(root, level):
    if root.label:
        return level
    else:
        max_depth = 0
        for child in root.children:
            depth = get_max_depth(child, level + 1)
            if depth > max_depth:
                max_depth = depth
        return max_depth

# get number of nodes in tree
def get_num_nodes(root):
    if root is None:
        return 0
    else:
        count = 1
        for child in root.children:
            count += get_num_nodes(child)
        return count

# when classifying an instance with a missing value for an attribute,
# returns the token value at a leaf; method from Quinlan 1986
def get_leaf_token(tree_node, instance, token):
    # return token value after reaching leaf
    if tree_node.label:
        if tree_node.label == positive:
            return token
        else:
            return -token
    # if not at leaf, find attribute node splits on. Multiply token
    # by percent of training examples that share instance's attribute value.
    # Then recurse.
    else:
        split_attrib = tree_node.split_attrib
        instance_value = instance[split_attrib]
        percent_match = len(search_examples(split_attrib, instance_value, tree_node.examples)) / len(tree_node.examples)
        token *= percent_match
        child_match = [child for child in tree_node.children if child.attributes[split_attrib] == instance_value][0]
        return get_leaf_token(child_match, instance, token)

# takes list of instances, classifies each and
# returns list of classifications
def classify_instance_list(root, target_attrib, instance_list):
    class_list = []
    for instance in instance_list:
        classification = classify_instance(root, instance)
        class_list.append(classification)
    return class_list

# classifies an instance using decision tree
def classify_instance(root, instance):
    # if root has label, return label
    if root.label:
        return root.label
    # else get root's split attribute
    else:
        split_attrib = root.split_attrib
        # if value of attribute unknown, call other function to get probable class
        # assume an instance does not have unknown values for > 1 attribute
        if instance[split_attrib] == unknown_val:
            return classify_unknown_attrib(root, instance)
        # check which of root's children's value of split attribute matches instance's value
        for child in root.children:
            if child.attributes[split_attrib] == instance[split_attrib]:
                # recurse
                return classify_instance(child, instance)

# for instances with an unknown attribute value,
# return probable class of instance by passing token value through tree
# method from Quinlan 1986
def classify_unknown_attrib(root, instance):
    # pos_token and neg_token sum values returned from leaves
    pos_token = 0
    neg_token = 0
    num_root = len(root.examples)
    # pass token to each child proportional to the number of
    # training examples at root that went to each child
    for child in root.children:
        num_children = len(child.examples)
        leaf_val = get_leaf_token(child, instance, num_children / num_root)
        # if token value is positive, returned from leaf labeled +
        if leaf_val > 0:
            pos_token += leaf_val
        else:
            neg_token -= leaf_val
    if pos_token > neg_token:
        return positive
    else:
        return negative
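
# Illustrative note (added for clarity, numbers are hypothetical): if an
# instance is missing the value of the attribute this node splits on, every
# child receives a token equal to its share of the node's training examples
# (say 0.6 and 0.4).  get_leaf_token shrinks the token further at any deeper
# split and returns it signed by the leaf's label, so pos_token and neg_token
# end up holding the estimated probability mass for each class and the larger
# one decides the classification.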

# return list of examples for whom attrib == value
def search_examples(attrib, value, examples):
    matches = [ex for ex in examples if ex[attrib] == value]
    # print(len(matches))
    return matches

# get the most common class from set of examples
def get_most_common_value(target_attrib, examples):
    num_examples = len(examples)
    num_positive = len(search_examples(target_attrib, positive, examples))
    if num_positive > (num_examples - num_positive):
        return positive
    else:
        return negative

# calculates classification error
def get_classification_error(prob_positive, prob_negative):
    classification_error = 1 - max(prob_positive, prob_negative)
    return classification_error

# calculates entropy
def get_entropy(prob_positive, prob_negative):
    entropy = 0
    if prob_positive:
        entropy += -prob_positive * math.log(prob_positive, 2)
    if prob_negative:
        entropy += -prob_negative * math.log(prob_negative, 2)
    return entropy
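
# Worked example (added for reference): entropy = -p(+)*log2 p(+) - p(-)*log2 p(-),
# so a 50/50 node gives get_entropy(0.5, 0.5) == 1.0, a pure node gives
# get_entropy(1.0, 0.0) == 0.0, and a 75/25 node gives roughly 0.811.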

# calculate resulting impurity for splitting on given attribute
def eval_attrib(examples, target_attrib, attrib_to_eval, criterion):
    num_examples = len(examples)
    impurity = 0
    # if value of attrib_to_eval is '?' for any example,
    # get number of '?' in positive and negative classes
    unknowns = search_examples(attrib_to_eval, unknown_val, examples)
    if unknowns:
        num_unknowns = len(unknowns)
        num_pos_unknowns = len(search_examples(target_attrib, positive, unknowns))
        num_neg_unknowns = len(search_examples(target_attrib, negative, unknowns))
    # for each valid value of attrib_to_eval,
    # get number of positive & negative instances, adjusting for
    # unknowns if necessary
    for val in attribute_values[attrib_to_eval]:
        examples_val = search_examples(attrib_to_eval, val, examples)
        if unknowns:
            raw_num_pos = len(search_examples(target_attrib, positive, examples_val))
            raw_num_neg = len(search_examples(target_attrib, negative, examples_val))
            adj_ratio = (raw_num_pos + raw_num_neg) / (num_examples - num_unknowns)
            # from Quinlan: pi = pi + pu * (pi + ni)/(sum(pi + ni))
            num_positive = raw_num_pos + num_pos_unknowns * adj_ratio
            num_negative = raw_num_neg + num_neg_unknowns * adj_ratio
        else:
            num_positive = len(search_examples(target_attrib, positive, examples_val))
            num_negative = len(search_examples(target_attrib, negative, examples_val))
        # calculate prob(+) and prob(-); need to avoid divide-by-zero error
        if (num_positive + num_negative):
            prob_positive = num_positive / (num_positive + num_negative)
        else:
            prob_positive = 0
        prob_negative = 1 - prob_positive
        # calculate entropy or classification error per criterion
        if criterion == 'entropy':
            impurity_val = get_entropy(prob_positive, prob_negative)
        else:
            impurity_val = get_classification_error(prob_positive, prob_negative)
        # add to weighted sum of entropy/classification error
        impurity += impurity_val * (num_positive + num_negative) / num_examples
    # return weighted sum
    return impurity

# decide which of available attributes to split on
def get_best_attrib(examples, target_attrib, attributes, criterion):
    start_impurity = 0
    num_positive = len(search_examples(target_attrib, str(positive), examples))
    prob_positive = num_positive / len(examples)
    if criterion == 'entropy':
        start_impurity = get_entropy(prob_positive, 1 - prob_positive)
    else:
        start_impurity = get_classification_error(prob_positive, 1 - prob_positive)
    best_attrib = None
    best_gain = 0
    for attrib in attributes:
        impurity = eval_attrib(examples, target_attrib, attrib, criterion)
        gain = start_impurity - impurity
        if gain > best_gain:
            best_gain = gain
            best_attrib = attrib
    return best_attrib

# get chi-squared value for attribute chosen for split;
# formula and variable names from:
# http://isites.harvard.edu/fs/docs/icb.topic539621.files/lec7.pdf
def get_chi_squared(examples, target_attrib, attrib_for_split):
    # p & n are number of +/- examples prior to splitting on node
    p = len(search_examples(target_attrib, positive, examples))
    n = len(search_examples(target_attrib, negative, examples))
    devX = 0
    for x in attribute_values[attrib_for_split]:
        # Dx is subset of examples with split_attrib == x
        Dx = search_examples(attrib_for_split, x, examples)
        if len(Dx) == 0:
            continue
        # p_hat & n_hat are number of +/- examples in Dx if
        # Dx has same distribution as all examples in node
        p_hat = p / (p + n) * len(Dx)
        n_hat = n / (p + n) * len(Dx)
        # px and nx are actual number of +/- examples in Dx
        px = len(search_examples(target_attrib, positive, Dx))
        nx = len(search_examples(target_attrib, negative, Dx))
        # devX is deviation from absence of pattern
        devX += ((px - p_hat) ** 2) / p_hat + ((nx - n_hat) ** 2) / n_hat
    return devX

# calculate threshold value for chi-squared test
def get_threshold(attrib_for_split, confidence):
    # get degrees of freedom
    deg = len(attribute_values[attrib_for_split]) - 1
    # get critical values for attribute's degrees of freedom
    critical_vals = [row for row in chi_squared_table if row['dof'] == str(deg)][0]
    return float(critical_vals[str(confidence)])
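
# Reference point (added for clarity): an attribute with four possible values
# has 3 degrees of freedom, and the standard 95% chi-squared critical value
# for 3 dof is about 7.815; the exact thresholds used here are read from
# chi_squared_table.txt.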

# helper method labels node with most common class of its examples
def label_most_common(tree_node, target_attrib):
    if get_most_common_value(target_attrib, tree_node.examples) == positive:
        tree_node.label = positive
        tree_node.attributes[target_attrib] = positive
    else:
        tree_node.label = negative
        tree_node.attributes[target_attrib] = negative
    return tree_node

# ID3 from Mitchell
def make_id3_tree(examples, target_attrib, attributes, parent, inherited_attributes, criterion, confidence):
    root = node.Node(parent)
    # Node.attributes represents conjunction of attribute tests which are true
    # for training examples at node
    root.attributes = inherited_attributes.copy()
    if parent:
        parent.children.append(root)
    root.examples = examples[:]
    # if all examples are +, return single-node tree Root with label +
    if len(search_examples(target_attrib, positive, examples)) == len(examples):
        root.label = positive
        root.attributes[target_attrib] = positive
    # if all examples are -, return single-node tree Root with label -
    elif len(search_examples(target_attrib, negative, examples)) == len(examples):
        root.label = negative
        root.attributes[target_attrib] = negative
    # if set of attributes to be tested is empty, return single-node
    # tree Root with label == most common value of target_attrib in examples
    elif not attributes:
        root = label_most_common(root, target_attrib)
    else:
        # else find attribute A w/highest info gain/accuracy
        attrib_for_split = get_best_attrib(examples, target_attrib, attributes, criterion)
        # perform chi-squared test on attribute A if confidence level entered
        if confidence:
            chi_sq = get_chi_squared(examples, target_attrib, attrib_for_split)
            threshold = get_threshold(attrib_for_split, confidence)
            if chi_sq < threshold:
                root = label_most_common(root, target_attrib)
        # if root not yet labeled, it either passed chi-squared test or wasn't tested
        if not root.label:
            # set decision attribute for root == A
            root.split_attrib = attrib_for_split
            # for each possible value vi of A
            for value in attribute_values[attrib_for_split]:
                # add a branch below root corresponding to A == vi;
                # let examples_vi be subset of examples with A == vi
                examples_val = search_examples(attrib_for_split, value, examples)
                # if |examples_vi| == 0, add leaf node w/most common value of target_attrib in examples
                if not examples_val:
                    child = node.Node(root)
                    root.children.append(child)
                    label = get_most_common_value(target_attrib, examples)
                    child.label = label
                    child.attributes = root.attributes.copy()
                    child.attributes[attrib_for_split] = value
                    child.attributes[target_attrib] = label
                # else add subtree make_id3_tree(examples_vi, target_attrib, (attributes - A), criterion, root)
                else:
                    attributes_child = attributes[:]
                    attributes_child.remove(attrib_for_split)
                    attribute_values_child = root.attributes.copy()
                    attribute_values_child[attrib_for_split] = value
                    make_id3_tree(examples_val, target_attrib, attributes_child, root, attribute_values_child, criterion, confidence)
    return root

# populate lists of dictionaries from csv files
def get_file_data(filename, filefields, data):
    with open(filename) as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            data.append(dict(zip(filefields, row)))

# output list of classes to csv
def output_csv(filepath, class_list):
    filename = filepath + 'output.txt'
    with open(filename, 'w') as csvfile:
        writer = csv.writer(csvfile)
        for item in class_list:
            writer.writerow([item])

# generate confusion matrix for instances w/known classes;
# rows are ground truth (+ then -), columns are predictions
def get_confusion_mtx(target_attrib, instances, predictions):
    mtx = numpy.zeros((2, 2))
    num_instances = len(instances)
    if num_instances != len(predictions):
        print('Unequal number of instances and predictions')
    else:
        for i in range(len(instances)):
            # find if true pos/neg, false pos/neg
            gt = instances[i][target_attrib]
            pred = predictions[i]
            # increment confusion mtx
            if gt == pred:
                if gt == positive:
                    mtx[0][0] += 1    # true positive
                else:
                    mtx[1][1] += 1    # true negative
            else:
                if gt == positive:
                    mtx[0][1] += 1    # false negative (true +, predicted -)
                else:
                    mtx[1][0] += 1    # false positive (true -, predicted +)
    return mtx

def get_tree(training_data, confidence, entropy):
    # attributes = attributes available to split on
    attributes = fields[:]
    attributes.remove(target_attrib)
    if entropy:
        criterion = 'entropy'
    else:
        criterion = 'class_error'
    tree_root = make_id3_tree(training_data, target_attrib, attributes, None, {}, criterion, int(confidence))
    return tree_root

def test_tree(tree_root, results_table, column, test_data):
    predictions = classify_instance_list(tree_root, target_attrib, test_data)
    conf_mtx = get_confusion_mtx(target_attrib, test_data, predictions)
    print(conf_mtx.astype(int))
    accuracy = numpy.sum(conf_mtx.diagonal()) / numpy.sum(conf_mtx)
    depth = get_max_depth(tree_root, 0)
    num_nodes = get_num_nodes(tree_root)
    results_table[1][column] = str(int(accuracy * 100)) + '%'
    results_table[2][column] = str(num_nodes)
    results_table[3][column] = str(depth)

def do_id3(train_file, test_file, validation_file, output_path):
    training_data = []
    get_file_data(train_file, fields, training_data)
    get_file_data(chi_sq_path, chi_sq_vals, chi_squared_table)
    test_data = []
    get_file_data(test_file, fields, test_data)
    validation_data = []
    get_file_data(validation_file, fields, validation_data)
    ce_results = [['0' for i in range(5)] for i in range(4)]
    ent_results = [['0' for i in range(5)] for i in range(4)]
    tab_labels = ['Confidence Level', 'Accuracy\t', 'Number Nodes\t', 'Tree Depth\t']
    for i in range(4):
        ce_results[i][0] = tab_labels[i]
        ent_results[i][0] = tab_labels[i]
    for i in range(1, 4):
        ce_results[0][i + 1] = str(chi_sq_vals[i])
        ent_results[0][i + 1] = str(chi_sq_vals[i])
    print('Using Classification Error as Criterion:')
    for i in range(1, 5):
        cl = ce_results[0][i]
        print('Confidence Level ', cl, '%')
        tree_root = get_tree(training_data, cl, False)
        test_tree(tree_root, ce_results, i, test_data)
    print('\nUsing Entropy as Criterion:')
    for i in range(1, 5):
        cl = ent_results[0][i]
        print('Confidence Level ', cl, '%')
        tree_root = get_tree(training_data, cl, True)
        test_tree(tree_root, ent_results, i, test_data)
    print('\n')
    print('Criterion = Classification Error')
    for row in ce_results:
        print('\t'.join(row))
    print('\nCriterion = Entropy')
    for row in ent_results:
        print('\t'.join(row))
    # handle validation data
    val_tree = get_tree(training_data, 0, True)
    predictions = classify_instance_list(val_tree, target_attrib, validation_data)
    if output_path:
        output_csv(output_path, predictions)
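
# Minimal usage sketch (added for illustration; the file names below are
# placeholders, not part of the original assignment).
if __name__ == '__main__':
    # build, prune, and evaluate trees on the mushroom data, then write
    # predictions for the unlabeled validation set to ./output.txt
    do_id3('./training.csv', './testing.csv', './validation.csv', './')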