-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsvm.py
86 lines (71 loc) · 2.58 KB
/
svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# import numpy
import nltk
import pandas as pd
from sklearn import svm, cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
import re
# Portuguese stop words are shared by both vectorizers; fetch them once.
portuguese_stopwords = nltk.corpus.stopwords.words('portuguese')
tfidf_vectorizer = TfidfVectorizer(stop_words=portuguese_stopwords, strip_accents='ascii')
count_vectorizer = CountVectorizer(analyzer='word', stop_words=portuguese_stopwords, strip_accents='ascii')
def train_svm(data):
    """Fit a linear-kernel SVM over rows of (document, label) pairs.

    Each row of `data` is indexed as row[0] = raw document text and
    row[1] = its class label.  Returns the fitted classifier.
    """
    documents = (row[0] for row in data)
    labels = [row[1] for row in data]
    features = extract_features(documents)
    classifier = svm.SVC(kernel='linear', probability=True)
    classifier.fit(features, labels)
    return classifier
def extract_features(docs):
    """Preprocess each document and fit/transform the shared count vectorizer.

    Note: this FITS `count_vectorizer`, so it is a training-time helper;
    later calls to `count_vectorizer.transform` reuse the vocabulary
    learned here.
    """
    cleaned = [preprocess_text(doc) for doc in docs]
    return count_vectorizer.fit_transform(cleaned)
def preprocess_text(text):
    """Normalise a raw tweet: lower-case, strip accents, then run the
    cleanup pipeline (RT markers, @mentions, #hashtags, links, special
    characters, digits) in that exact order.
    """
    text = unidecode(text.lower())
    cleanup_pipeline = (
        remove_rt,
        remove_twitter_user_mentions,
        remove_hashtags,
        remove_links,
        remove_special_chars,
        remove_numbers,
    )
    for cleanup in cleanup_pipeline:
        text = cleanup(text)
    # TODO:
    # - remove "kkkkkk"
    return text
def remove_rt(text):
    """Strip retweet markers ("RT ") from the text.

    Bug fix: `preprocess_text` lower-cases the text BEFORE calling this
    function, so the original case-sensitive pattern 'RT ' could never
    match and the function was a no-op.  The pattern is now
    case-insensitive and word-bounded, so 'rt' embedded inside a word
    (e.g. 'start here') is left alone.
    """
    return re.sub(r'\bRT\b ', '', text, flags=re.IGNORECASE)
def remove_twitter_user_mentions(text):
    """Delete Twitter @mentions (e.g. '@someone') from the text."""
    mention_pattern = re.compile(r'(?:@[\w_]+)')
    return mention_pattern.sub('', text)
def remove_hashtags(text):
    """Delete #hashtags of two or more word characters from the text."""
    hashtag_pattern = re.compile(r'(?:\#+[\w_]+[\w\'_\-]*[\w_]+)')
    return hashtag_pattern.sub('', text)
def remove_links(text):
    """Delete http/https URLs (everything up to the next whitespace)."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)
def remove_special_chars(text):
    """Keep only word characters, whitespace and apostrophes."""
    special_pattern = re.compile(r'[^\w\s\']')
    return special_pattern.sub('', text)
def remove_numbers(text):
    """Delete every decimal digit from the text."""
    digit_pattern = re.compile(r'\d')
    return digit_pattern.sub('', text)
def build_classification_report(clf, test_data):
    """Run `clf` on held-out rows and return sklearn's text report.

    Documents are preprocessed and vectorized with the already-fitted
    `count_vectorizer` (the original local name `tfidf` was misleading —
    these are count features, not TF-IDF).
    """
    labels_true = [row[1] for row in test_data]
    cleaned_docs = [preprocess_text(row[0]) for row in test_data]
    features = count_vectorizer.transform(cleaned_docs)
    labels_pred = clf.predict(features)
    return classification_report(labels_true, labels_pred)
def cross_validation_report(clf, dataset):
    """Cross-validate `clf` over the whole dataset and return the fold scores.

    Bug fix: documents are now run through `preprocess_text` before
    vectorizing.  The shared `count_vectorizer` is fitted on preprocessed
    text (see `extract_features`), so transforming raw documents here
    produced a vocabulary mismatch, inconsistent with
    `build_classification_report`.

    NOTE(review): `sklearn.cross_validation` was removed in modern
    scikit-learn in favour of `sklearn.model_selection`; left as-is to
    match the file's imports.
    """
    docs = [preprocess_text(row[0]) for row in dataset]
    data = count_vectorizer.transform(docs)
    target = [row[1] for row in dataset]
    return cross_validation.cross_val_score(clf, data, target)
# Load the labelled dataset as a plain 2-D array of [document, label] rows.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and then
# removed in pandas 1.0; the returned array is identical.
data = pd.read_csv('data.csv', encoding='utf8').values
# NOTE(review): this trains an SVM at import time — an expensive side effect,
# and `clf` is immediately retrained inside the __main__ guard below.
# Left in place in case importers rely on module-level `clf`; consider
# moving it under the guard.
clf = train_svm(data)
if __name__ == '__main__':
    # Hold out 20% of rows for the classification report; the SVM is
    # (re)trained on the remaining 80%.
    train_data, test_data = train_test_split(data, test_size=0.2)
    # Single-argument print(...) calls behave identically under Python 2
    # and 3; the original `print 'x'` statements were Python-2-only syntax.
    print('Training SVM...')
    clf = train_svm(train_data)
    print('SVM trained')
    print('Building reports...')
    print('Classification report:')
    print(build_classification_report(clf, test_data))
    print('----------')
    # Cross-validation refits clones of clf internally, so it may use the
    # full dataset rather than the held-out split.
    print('Cross-validation report:')
    print(cross_validation_report(clf, data))