-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunit_tests.py
191 lines (154 loc) · 8.1 KB
/
unit_tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import codecs, re
from editdistance import eval as distance
from random import shuffle
from difflib import SequenceMatcher
from sklearn.externals import joblib
# Load the query corpus. The first record holds an integer ("border"): that
# many of the following records form the marked part of the corpus, and
# everything after them forms the unmarked part.
# The context manager closes the file even if reading or decoding raises.
with codecs.open('corpus2.csv', encoding='utf-8-sig') as f:
    # codecs.open() does no newline translation, so records are split on CRLF.
    lines = f.read().split('\r\n')
border = int(lines.pop(0))
marked = lines[:border]
not_marked = lines[border:]
# A token made of digits optionally followed by one lowercase Cyrillic letter
# in the range а-я (for example "12" or "12а").
pattern = re.compile('[0-9]+[а-я]?')


def mask(words):
    """Return a new list where each token fully matching `pattern` becomes '_NUMBER_'."""
    masked = []
    for word in words:
        if pattern.fullmatch(word):
            masked.append('_NUMBER_')
        else:
            masked.append(word)
    return masked


def tokenize(x):
    """Split the string *x* into runs of Cyrillic/Latin letters, digits and underscores, then mask numbers."""
    tokens = re.findall('[А-Яа-яЁё0-9A-Za-z_]+', x)
    return mask(tokens)


def only_numbers(query):
    """Join the items of *query* with spaces and return every run of digits, in order."""
    joined = ' '.join(query)
    return re.findall('[0-9]+', joined)
print("Checking the corpus for consistency")

# Corpus test 1: every marked record must contain at least three commas
# (the records are later split from the right into four fields).
# Offending records are printed before the verdict. Collecting them explicitly
# replaces min([...]) == True, which raised ValueError on an empty section.
short_marked = [line for line in marked if line.count(',') < 3]
for line in short_marked:
    print(line)
if not short_marked:
    print("Corpus test 1 succeeded")
else:
    print("Corpus test 1 failed (!)")

# Corpus test 2: every unmarked record must contain at least one comma.
short_not_marked = [line for line in not_marked if line.count(',') < 1]
for line in short_not_marked:
    print(line)
if not short_not_marked:
    print("Corpus test 2 succeeded")
else:
    print("Corpus test 2 failed (!)")
# Split every marked record from the right into four fields:
# raw query, query, suggestion and label.
marked_queries = [query.rsplit(',', 3) for query in marked]
tokenized_queries = [tokenize(query) for raw_query, query, suggestion, label in marked_queries]

# Corpus test 3: report records whose suggestion looks implausible for the query.
for counter, (raw_query, query, suggestion, label) in enumerate(marked_queries):
    # find_longest_match() takes half-open ranges a[alo:ahi] / b[blo:bhi], so
    # the upper bounds must be the full lengths. The previous len(...) - 1
    # silently left the last character of both strings out of the comparison.
    longest_common = SequenceMatcher(a=query, b=suggestion).find_longest_match(
        0, len(query), 0, len(suggestion)).size
    # A long query sharing fewer than 3 consecutive characters with its
    # suggestion (the '<?????>' placeholder is exempt).
    barely_overlaps = longest_common < 3 and len(query) > 6 and suggestion != '<?????>'
    # A suggestion shorter than 3 characters for a query longer than 4.
    too_short = len(query) > 4 and len(suggestion) < 3
    # Suggestions that are always reported.
    blacklisted = suggestion in ['', 'окок', 'jr']
    if barely_overlaps or too_short or blacklisted:
        print('Corpus test 3 failed (!) at query "%s", suggestion %s, number %d' % (query, suggestion, counter))
print("Corpus test 3 finished")
# Corpus test 4: the label must be 1 when the suggestion differs from the
# query and 0 when they are equal; every mismatching record is reported.
for counter, record in enumerate(marked_queries):
    raw_query, query, suggestion, label = record
    expected_label = 1 if query != suggestion else 0
    if expected_label == int(label):
        continue
    print('Corpus test 4 failed (!) at query "%s", suggestion %s, label %s, number %d' % (query, suggestion, label, counter))
print("Corpus test 4 finished")
# Corpus test 5: gather the distinct suggestions recorded for each query (in
# order of first appearance) and report queries that have more than one.
dic = {}
for raw_query, query, suggestion, label in marked_queries:
    known = dic.setdefault(query, [])
    if suggestion not in known:
        known.append(suggestion)
for key, suggestions in dic.items():
    if len(suggestions) > 1:
        print("(!) Conflicting suggestions for %s:" % key, *suggestions)
print("Corpus test 5 finished")
# Corpus test 6: the digits found in a query must equal the digits found in
# its suggestion; records with the '<?????>' placeholder suggestion are skipped.
# NOTE(review): only_numbers() joins its argument with spaces; here it receives
# a plain string, so digits end up compared one by one rather than as whole
# numbers - confirm this is intended.
for counter, (raw_query, query, suggestion, label) in enumerate(marked_queries):
    if suggestion == '<?????>' or only_numbers(query) == only_numbers(suggestion):
        continue
    print("Corpus test 6 failed (!) at query %s, suggestion %s, number %d" % (query, suggestion, counter))
print("Corpus test 6 has been completed.")
# Shuffle in place so the first 100 tokenized queries are a random sample.
shuffle(tokenized_queries)
from morpho_model import queries2morpho

# Morpho test 1: the extractor must return one result per input query.
morpho = queries2morpho(tokenized_queries[:100])
if len(morpho) == 100:
    print("Morpho test 1 succeeded")
else:
    print("Morpho test 1 failed (!): feature extractor returned %d examples instead of %d" % (len(morpho), 100))

# Morpho test 2: every result must have as many items as its query has tokens.
for query, morpho_query in zip(tokenized_queries[:100], morpho):
    if len(query) != len(morpho_query):
        # Uses the loop variable morpho_query; the previous code referenced an
        # undefined name here and raised NameError instead of reporting.
        print("Lengthes of %s and %s are different" % (' '.join(query), ' '.join(morpho_query)))
print("Morpho test 2 has been completed.")

# Morpho test 3: a fixed two-token query must yield a fixed expected output.
if queries2morpho([['дыбенко', '_NUMBER_']]) == [['Npmsny', '_NUMBER_']]:
    print("Morpho test 3 succeeded.")
else:
    print("Morpho test 3 failed (!)")
from prep_model0 import corpus2prep

# Prep0 test 1: the extractor receives space-joined queries and must return
# one result per input query.
prep = corpus2prep([' '.join(query) for query in tokenized_queries[:100]])
if len(prep) == 100:
    print("Prep0 test 1 succeeded")
else:
    print("Prep0 test 1 failed (!): feature extractor returned %d examples instead of %d" % (len(prep), 100))

# Prep0 test 2: every result must have as many items as its query has tokens.
for query, prep_query in zip(tokenized_queries[:100], prep):
    if len(query) != len(prep_query):
        # Uses the loop variable prep_query; the previous code referenced an
        # undefined name here and raised NameError instead of reporting.
        # Items are stringified so the message can be built whatever their type.
        print("Lengthes of %s and %s are different" % (' '.join(query), ' '.join(map(str, prep_query))))
print("Prep0 test has been completed.")

# NOTE(review): this check calls queries2morpho, not corpus2prep, so it repeats
# the morpho check under a Prep0 label - looks like a copy-paste; confirm what
# the intended Prep0 assertion is.
if queries2morpho([['дыбенко', '_NUMBER_']]) == [['Npmsny', '_NUMBER_']]:
    print("Prep0 test 3 succeeded.")
else:
    print("Prep0 test 3 failed (!)")
from gensim.models import Word2Vec

# Word2vec test 1: scoring the first 100 tokenized queries with the stored
# model must give exactly 100 results.
my_w2v1 = Word2Vec.load('word2vecs/my_w2v1')
w2v_scores = my_w2v1.score(tokenized_queries[:100])
if len(w2v_scores) != 100:
    print("Word2vec test 1 failed (!).")
else:
    print("Word2vec test 1 succeeded.")
from prep_model1 import corpus2prep

# Prep1 test: for one fixed query the extractor must return a single result
# that has three items. The extractor is run once and its output reused,
# where it used to be invoked twice on the same input.
prep1_features = corpus2prep(["гостиницы на ул ленина в екатеринбурге"])
if len(prep1_features) == 1 and len(prep1_features[0]) == 3:
    print("Prep1 test succeeded.")
else:
    print("Prep1 test failed (!)")
from metrics import count

# Metrics test 1: count() is applied to a tokenized misspelled sentence, a
# tokenized suggestion and the tokenized correct sentence; the result must
# equal (1, 1, 2).
# NOTE(review): the meaning of the three counters is defined in metrics.count,
# which is not visible here.
sentence = "Улица Довлатова праспект шахтёроф сройка".split()
suggestion = "Улица Далматова проспект шахтёроф тройка".split()
correct = "Улица Довлатова проспект шахтёров стройка".split()
if count(sentence, suggestion, correct) != (1, 1, 2):
    print("Metrics test 1 failed (!). It returned %s instead of %s" % (str(count(sentence, suggestion, correct)), "(1, 1, 2)"))
else:
    print("Metrics test 1 succeeded.")
print("Checking data")
from sklearn.externals import joblib

# Data test 1: X must be two-dimensional, y one-dimensional, and both must
# have the same size along their first axis.
X = joblib.load('X.pkl')
y = joblib.load('y.pkl')
rows_match = X.shape[0] == y.shape[0]
if not (rows_match and len(X.shape) == 2 and len(y.shape) == 1):
    print("Data test 1 failed. Shapes: ", X.shape, y.shape)
else:
    print("Data test 1 succeeded")
from reranking import tokenize, suggest
from random import choice

# Re-read the corpus and rebuild the marked records for the suggestion tests.
# The "U" flag was dropped from the mode: Python 3.11+ rejects it with
# ValueError, and it never had an effect here because codecs.open() always
# opens the underlying file in binary mode. The context manager also closes
# the file, which was previously left open.
with codecs.open("corpus2.csv", encoding="utf-8-sig") as f:
    corpus_lines = f.read().split('\r\n')
border = int(corpus_lines.pop(0))
marked_queries = [query.rsplit(',', 3) for query in corpus_lines[:border]]

# Tokenized (query, suggestion) pairs for records labelled '1' that carry a
# real suggestion (neither empty nor the '<?????>' placeholder).
misspelled = [
    (tokenize(query), tokenize(suggestion))
    for (raw_query, query, suggestion, label) in marked_queries
    if label == '1' and suggestion != '<?????>' and suggestion != ''
]
# Suggestion test #1: pick a random misspelled query containing a '_NUMBER_'
# token and check that the output of suggest() contains the query itself.
random_misspelled_query = choice([a for a, b in misspelled if '_NUMBER_' in a])
if random_misspelled_query in suggest(random_misspelled_query):
    print("Suggestion test #1 finished successfully at query", random_misspelled_query)
else:
    # Report the query that was actually tested; the previous code printed the
    # stale loop variable `query` left over from an earlier check.
    print("Suggestion test #1 failed (!) at query", random_misspelled_query)
# Suggestion test #2: pick a random misspelled query without a '_NUMBER_'
# token and check that the output of suggest() contains the query itself.
random_misspelled_query = choice([a for a, b in misspelled if '_NUMBER_' not in a])
if random_misspelled_query in suggest(random_misspelled_query):
    print("Suggestion test #2 finished successfully at query", random_misspelled_query)
else:
    # Report the query that was actually tested; the previous code printed the
    # stale loop variable `query` left over from an earlier check.
    print("Suggestion test #2 failed (!) at query", random_misspelled_query)
# Suggestion test #3: pick a random pair whose query and expected suggestion
# have different token counts, and check that at least one item returned by
# suggest() has the same length as the expected suggestion.
length_changing_pairs = [pair for pair in misspelled if len(pair[0]) != len(pair[1])]
random_misspelled_query, sug = choice(length_changing_pairs)
sugs = suggest(random_misspelled_query)
suggested_lengths = [len(s) for s in sugs]
if len(sug) not in suggested_lengths:
    print("Suggestion test #3 failed (!) at query ", random_misspelled_query, "with sugs", *sugs)
else:
    print("Suggestion test #3 succeded at query ", random_misspelled_query, "with sugs", *sugs)
from reranking import corpus2features, suggest

Scaler = joblib.load('Scaler.pkl')
random_misspelled_query = choice([a for a, b in misspelled])
sugs = suggest(random_misspelled_query)

# Feature extractor test #1: extraction for a random misspelled query, paired
# with each of its suggestions, must not raise.
try:
    features = corpus2features([random_misspelled_query] * len(sugs), sugs)
except Exception as error:
    print("Feature extractor test #1 failed at query %s (!) with the following error message: %s" % (random_misspelled_query, error))
else:
    # Report the query that was actually tested (the previous code printed the
    # stale loop variable `query`). The one-element tuple keeps %-formatting
    # correct whatever sequence type the query is.
    print("Feature extractor test #1 has been successfully completed at query %s. " % (random_misspelled_query,))

    # The remaining checks need `features`, so they run only when extraction
    # succeeded; otherwise they would fail on an undefined name.
    logreg = joblib.load('logreg.pkl')

    # Feature extractor test #2: the number of feature columns must match the
    # number of logistic-regression coefficients.
    if logreg.coef_.shape[1] == features.shape[1]:
        print("Feature extractor test #2 has been completed successfully at query %s: feature shape %s is consistent with logreg coefficients shape %s" % (random_misspelled_query, str(features.shape), str(logreg.coef_.shape)))

        # Scaler test: after scaling, the overall mean must lie within [-1, 1]
        # and the overall standard deviation within [0.5, 2].
        features = Scaler.transform(features)
        mean_value = features.mean()
        std_value = features.std()
        if mean_value > 1:
            print("Scaler test failed because mean value %f > 1" % mean_value)
        elif mean_value < -1:
            print("Scaler test failed because mean value %f < -1" % mean_value)
        elif std_value > 2:
            print("Scaler test failed because standard deviation value %f > 2" % std_value)
        elif std_value < 0.5:
            print("Scaler test failed because standard deviation value %f < 0.5" % std_value)
        else:
            print("Scaler test has been successfully completed with mean value %f and std value %f" % (mean_value, std_value))
    else:
        # The failure message used to claim the shapes were "consistent".
        print("Feature extractor test #2 has failed at query %s: feature shape %s is inconsistent with logreg coefficients shape %s" % (random_misspelled_query, str(features.shape), str(logreg.coef_.shape)))