preprocess.py
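"""Text preprocessing helpers built on NLTK: tokenization, stopword and
punctuation removal, Porter stemming, and POS-aware WordNet lemmatization."""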
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
import pandas as pd
import numpy as np
import string
import time
STOPWORDS = stopwords.words('english')
# Map Penn Treebank POS tags (lowercased) to the WordNet tags expected by the
# lemmatizer; any tag not listed falls back to 'n' (noun).
POS_TAGS = {
    'nn': 'n',
    'nns': 'n',
    'nnp': 'n',
    'nnps': 'n',
    'vb': 'v',
    'vbd': 'v',
    'vbg': 'v',
    'vbn': 'v',
    'vbp': 'v',
    'vbz': 'v',
    'md': 'v',
    'jj': 'a',
    'jjr': 'a',
    'jjs': 'a',
    'rb': 'r',
    'rbr': 'r',
    'rbs': 'r',
    'wrb': 'r',
}
def tokenize(sentence):
    return tokenizer.tokenize(sentence)


def is_stopword(token):
    return (token in STOPWORDS) or (token in string.punctuation)


def _stem(token):
    return stemmer.stem(token)


def stem(tokens):
    return list(map(_stem, tokens))


def _lemmatize(token):
    # Lemmatize a single token, tagging it in isolation.
    tag = pos_tag([token])[0][-1].lower()
    tag = POS_TAGS.get(tag, 'n')
    return lemmatizer.lemmatize(token, tag)


def lemmatize(tokens):
    # Tag the whole token list at once, then look up the WordNet POS for each tag.
    tags = pos_tag(tokens)
    tags = [POS_TAGS.get(tag.lower(), 'n') for _, tag in tags]
    return [lemmatizer.lemmatize(token, tag) for token, tag in zip(tokens, tags)]
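# Note: `stem` applies the Porter stemmer token by token, while `lemmatize`
# POS-tags the whole token list first so each word is lemmatized with a
# WordNet part of speech; the __main__ block below benchmarks both.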
def preprocess(query, stemming=True, stopword=True, lower=True, strip=True, puncs=True):
    # Translation table that deletes every punctuation character.
    translator = str.maketrans(dict(zip(string.punctuation, [''] * len(string.punctuation))))
    tokens = []
    for token in query:
        if isinstance(token, (list, tuple, set)):
            # Recurse into nested collections of tokens.
            tokens.append(preprocess(token, stemming=stemming, stopword=stopword,
                                     lower=lower, strip=strip, puncs=puncs))
            continue
        elif stopword and is_stopword(token):
            continue
        if lower:
            token = token.lower()
        if puncs:
            token = token.translate(translator)
        if strip:
            token = token.strip()
        if stemming:
            token = _stem(token)
        if token:  # drop tokens that became empty after cleanup (e.g. '...')
            tokens.append(token)
    return tokens
def preprocess_sentence(sentence, return_token=False, delimiter=' ', **kwargs):
    # Tokenize a raw sentence and run it through `preprocess`; extra keyword
    # arguments are forwarded to `preprocess`.
    tokens = preprocess(tokenize(sentence), **kwargs)
    if return_token:
        return tokens
    return delimiter.join(tokens)
def unique_tokens(tokens):
    # Yield each token once, preserving order and flattening nested lists.
    used_tokens = set()
    for token in tokens:
        if isinstance(token, list):
            for t in unique_tokens(token):
                if t not in used_tokens:
                    yield t
                    used_tokens.add(t)
            continue  # a list is unhashable and should not be yielded itself
        if token not in used_tokens:
            yield token
            used_tokens.add(token)
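
# Shared NLTK components used by the functions above.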
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

if __name__ == "__main__":
    doc = ['Hello', 'sir ', 'reading!']
    print('Document:', doc)
    print('Preprocessed:', preprocess(doc))
    print()

    print('Benchmark stemming vs lemmatizing')
    print('Loading Data...')
    df = pd.read_csv('ted_talks.csv')
    tokens = df['description'].map(tokenizer.tokenize)
    print('Number of documents:', len(tokens))

    t1 = time.perf_counter()
    stemming = tokens.map(stem)
    t2 = time.perf_counter()
    print(f"The stemming calculation took {t2 - t1:.3f} seconds")

    t3 = time.perf_counter()
    lemmatizing = tokens.map(lemmatize)
    t4 = time.perf_counter()
    print(f"The lemmatizing calculation took {t4 - t3:.3f} seconds")

    idx = np.random.choice(stemming.index)
    print('Index: ', idx)
    print('Sentence: ', df.loc[idx, 'description'])
    print()
    print('The differences between these two methods for the above sentence: (stemmed | lemmatized)')
    for i in range(len(stemming[idx])):
        if stemming[idx][i] != lemmatizing[idx][i]:
            print(stemming[idx][i], '|', lemmatizing[idx][i])
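
    # Illustrative extra usage (not part of the original benchmark): the sample
    # sentence below is made up, just to show the sentence-level helper and the
    # unique-token generator defined above.
    sample = 'Reading, reading, and more reading makes better readers!'
    print()
    print('preprocess_sentence:', preprocess_sentence(sample))
    print('unique_tokens:', list(unique_tokens(preprocess(tokenize(sample)))))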