scikitClassification.py
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 13 15:55 2018

@author: dshi, hbaud, vlefranc

Compare several scikit-learn classifiers on categorized tweets: train/test
scores and precision/recall/F-score per model, plus a cross-validated
box-plot comparison.
"""
import time
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
# from config import current_file
from classification import Classification

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

seed = 7

def compare_classifiers(labels='spam'):
    # Build the categorized data frame
    classif = Classification(labels)
    df_tweets_categorized = classif.create_dataframe()
    # df_tweets_categorized = classif.create_dataframe(False)
    # df_tweets_categorized = pd.read_csv(current_file, encoding="utf-8")
    # print(df_tweets_categorized.head())

    k_value = 7
    dict_classifiers = {
        "Logistic Regression": LogisticRegression(),
        "KNN": KNeighborsClassifier(n_neighbors=k_value, weights='distance', algorithm='auto'),
        # Note: despite the "Linear SVM" label, this SVC uses an RBF kernel.
        # class_weight={0: 5, 1: 1} penalizes errors on class 0 five times more.
        "Linear SVM": SVC(gamma='scale', class_weight={0: 5, 1: 1}, kernel='rbf'),
        "Random Forest": RandomForestClassifier(class_weight={0: 5, 1: 1}),
        "Naive Bayes": GaussianNB(),
        "LDA": LinearDiscriminantAnalysis(),
        # "CART": DecisionTreeClassifier()
    }
    dict_models = {}
    tweets = df_tweets_categorized
    HEADERS = df_tweets_categorized.columns.values.tolist()
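    # Assumed data layout (inferred from the HEADERS[1:-1] / HEADERS[-1] split
    # used below; not stated in the original): the first column is an
    # identifier and the last column holds the target label, e.g.
    #
    #   id  feature_1  ...  feature_n  spam
    #   42  0.3        ...  1          0
    #
    # so HEADERS[1:-1] selects the feature columns and HEADERS[-1] the target.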
    def split_dataset(dataset, train_percentage, feature_headers, target_header):
        """Split the dataset into train and test sets."""
        train_x, test_x, train_y, test_y = train_test_split(
            dataset[feature_headers],
            dataset[target_header],
            train_size=train_percentage)
        return train_x, test_x, train_y, test_y
    def evaluate(y, predicted_y):
        """Compute precision, recall and F-score, treating class(es) 0/1 as positive."""
        # acc = accuracy_score(y, predicted_y)
        if labels == 'type':
            cm = pd.DataFrame(confusion_matrix(y, predicted_y),
                              columns=[0, 1, 2, 3, 4, 5], index=[0, 1, 2, 3, 4, 5])
            # cm[column][line]: column = predicted label, line = actual label
            alpha = cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]  # true positives (classes 0 and 1)
            beta = alpha + cm[0][2] + cm[1][2] + cm[0][3] + cm[1][3] + cm[0][4] + cm[1][4] + cm[0][5] + cm[1][5]  # everything predicted as 0 or 1
            gamma = alpha + cm[2][0] + cm[3][0] + cm[4][0] + cm[5][0] + cm[2][1] + cm[3][1] + cm[4][1] + cm[5][1]  # everything actually 0 or 1
            precision = alpha / beta
            recall = alpha / gamma
        else:
            cm = pd.DataFrame(confusion_matrix(y, predicted_y), columns=[0, 1], index=[0, 1])
            precision = cm[0][0] / (cm[0][0] + cm[0][1])  # TP / (TP + FP)
            recall = cm[0][0] / (cm[0][0] + cm[1][0])     # TP / (TP + FN)
        f_score = 2 * precision * recall / (precision + recall) if precision > 0 and recall > 0 else 0
        return precision, recall, f_score
        # return "Precision = {} \nRecall = {} \nF score = {}".format(precision, recall, f_score)
    def classify(classifier_name, classifier, train_x, test_x, train_y, test_y, verbose=True):
        t_start = time.time()
        classifier.fit(train_x, train_y)
        t_end = time.time()
        t_diff = t_end - t_start
        train_score = classifier.score(train_x, train_y)
        test_score = classifier.score(test_x, test_y)
        dict_models[classifier_name] = {
            'model': classifier,
            'train_score': train_score,
            'test_score': test_score,
            'train_time': t_diff
        }
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))
        return classifier
    def display_dict_models(models, sort_by='test_score'):
        # Renamed the parameter from `dict` to `models` to avoid shadowing the builtin.
        cls = [key for key in models.keys()]
        test_s = [models[key]['test_score'] for key in cls]
        training_s = [models[key]['train_score'] for key in cls]
        training_t = [models[key]['train_time'] for key in cls]
        precision = [models[key]['precision'] for key in cls]
        recall = [models[key]['recall'] for key in cls]
        f_score = [models[key]['f_score'] for key in cls]
        columns = ['classifier', 'train_score', 'test_score', 'train_time', 'precision', 'recall', 'f_score']
        df_ = pd.DataFrame(data=np.zeros(shape=(len(cls), len(columns))),
                           columns=columns)
        for ii in range(0, len(cls)):
            df_.loc[ii, 'classifier'] = cls[ii]
            df_.loc[ii, 'train_score'] = training_s[ii]
            df_.loc[ii, 'test_score'] = test_s[ii]
            df_.loc[ii, 'train_time'] = training_t[ii]
            df_.loc[ii, 'precision'] = precision[ii]
            df_.loc[ii, 'recall'] = recall[ii]
            df_.loc[ii, 'f_score'] = f_score[ii]
        display(df_.sort_values(by=sort_by, ascending=False))
    def predict(dataset, classifier_name):
        # The parameter is the classifier's name; using it directly avoids the
        # original's implicit reliance on the enclosing loop variable.
        train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[1:-1], HEADERS[-1])
        clf = classify(classifier_name, dict_classifiers[classifier_name], train_x, test_x, train_y, test_y)
        pred_y = clf.predict(test_x)
        precision, recall, f_score = evaluate(test_y, pred_y)
        dict_models[classifier_name]['precision'] = precision
        dict_models[classifier_name]['recall'] = recall
        dict_models[classifier_name]['f_score'] = f_score
    def compare(dataset):
        """Evaluate each model in turn with 10-fold cross-validation."""
        results = []
        names = []
        # https://scikit-learn.org/stable/modules/model_evaluation.html
        scoring = 'accuracy'
        # scoring = 'f1_weighted'
        train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[1:-1], HEADERS[-1])
        print(scoring)
        for name in dict_classifiers.keys():
            model = dict_classifiers[name]
            # shuffle=True is required when passing random_state to KFold
            kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
            cv_results = cross_val_score(model, train_x, train_y, cv=kfold, scoring=scoring)
            if scoring == 'f1_weighted':
                # The labels are False when the tweet is relevant, so flip the
                # score (1 - f1) to study "relevant" as the positive class.
                cv_results = np.array([1 - f1_score for f1_score in cv_results])
            results.append(cv_results)
            names.append(name)
            msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
            print(msg)
        # boxplot algorithm comparison
        fig = plt.figure()
        fig.suptitle('Algorithm Comparison ({0}) - {1}'.format(labels, scoring))
        ax = fig.add_subplot(111)
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.show()
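    # Minimal sketch (an illustration, not from the original script): the
    # cross_val_score call above behaves roughly like this manual loop when
    # scoring='accuracy'.
    #
    #   from sklearn.base import clone
    #   kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    #   scores = []
    #   for tr_idx, va_idx in kf.split(train_x):
    #       m = clone(model).fit(train_x.iloc[tr_idx], train_y.iloc[tr_idx])
    #       scores.append(m.score(train_x.iloc[va_idx], train_y.iloc[va_idx]))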
    for classifier_name in dict_classifiers.keys():
        predict(tweets, classifier_name)

    print("\n==================================== Results based on {} ====================================".format(labels))
    display_dict_models(dict_models)
    print()
    compare(tweets)


if __name__ == "__main__":
    compare_classifiers('spam')
    compare_classifiers('type')