# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from hdbscan import HDBSCAN
from nltk import tokenize, bigrams, trigrams, everygrams, FreqDist, corpus
from pattern.web import Google, SEARCH, download, plaintext, HTTPError, HTTP403Forbidden, URLError, URLTimeout, SearchEngineLimitError
from ssl import SSLError
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import semidbm
from cPickle import loads, load, UnpicklingError
from operator import itemgetter
from itertools import combinations
from unidecode import unidecode
import string
import time
import sys
import os
from os import listdir, getcwd
from os.path import isfile, join
from multiprocessing import Process, Manager
# Google provides everyone with 100 free searches per day, then $5 per 1,000 searches after that, with a limit of 10,000 per day.
# However, new users get $300 free in their first 60 days, so that's your first 60,000 words2map searches free.
# Today all words2map users share this key, but please get your own to reduce the shared load and make your word mapping sustainable:
# go to https://code.google.com/apis/console and, from that site, simply (1) in the API Manager Overview, find "Custom Search API" and enable it;
# (2) copy your new API key from "Credentials"; and (3) paste it in words2map.py in the global variable "GOOGLE_API_KEY"
GOOGLE_API_KEY = "AIzaSyB4f-UO51_qDWXIwSwR92aejZso6hHJEY4"
class Loader(dict):
    # loads ~20 MB of indexes for word2vec and index2word into RAM for fast dictionary reads
    def __init__(self, dbm_file):
        self._dbm = semidbm.open(dbm_file, 'r')
    def __iter__(self):
        return iter(self._dbm.keys())
    def __len__(self):
        return len(self._dbm)
    def __contains__(self, key):
        if isinstance(key, int):
            key = str(key)
        return key in self._dbm
    def __getitem__(self, key):
        if isinstance(key, int):
            key = str(key)
            return self._dbm[key]
        else:
            return loads(self._dbm[key])
    def keys(self):
        return self._dbm.keys()
    def values(self):
        return [self._dbm[key] for key in self._dbm.keys()]
    def itervalues(self):
        return (self._dbm[key] for key in self._dbm.keys())
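# A hedged usage sketch (illustrative, not part of the module's API): after load_model() attaches
# Loader instances, lookups read from disk via semidbm but behave like ordinary dict reads, e.g.
# model.vocab["apple"].index would return that word's word2vec index (assuming "apple" is in the
# 100,000-word vocabulary), and model.index2word[42] would return the word stored at index 42.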
def load_model(directory="{}/vectors".format(getcwd())):
    # current model contains 100,000 vectors of 300 elements, each element containing a 16 bit floating point number,
    # such that vectors total ~60 MB uncompressed; note that there is practically no loss of precision in saving
    # vectors with 16 bits versus 32 bits, while data consumption is halved
    print "Loading 100,000 word vectors..."
    model = load(open(join(directory, 'model.pickle')))
    model.vocab = Loader(join(directory, 'word_to_index'))
    model.index2word = Loader(join(directory, 'index_to_word'))
    model.syn0norm = np.memmap(join(directory, 'syn0norm.dat'), dtype='float16', mode='r', shape=(len(model.vocab.keys()), model.layer1_size))
    model.syn0 = model.syn0norm
    return model
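# Hedged usage sketch, assuming the ./vectors directory contains model.pickle, word_to_index,
# index_to_word, and syn0norm.dat as described above:
#   model = load_model()
#   vector = model["apple"]  # a 300-dimensional float16 vector served from the memory map, if "apple" is in the vocabulary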
def get_collocations(words):
    # returns n-grams up to trigrams that appear more than 3 times, with pruning of grams that are redundant
    minimum_frequency = 3
    ngrams = {"_".join(ngram): frequency/len(words) for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items() if frequency > minimum_frequency}
    collocations = dict(ngrams)
    for ngram, likelihood in dict(ngrams).iteritems():
        grams = ngram.split("_")
        if len(grams) != 1:
            gram_likelihoods = [ngrams[gram] for gram in grams]
            if likelihood < 0.5 * np.prod(gram_likelihoods)**(1 / len(grams)):
                collocations.pop(ngram, None)
            else:
                for gram in grams:
                    collocations.pop(gram, None)
    return sorted(collocations.items(), key=itemgetter(1), reverse=True)
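# Illustrative example with hypothetical counts: if "machine", "learning", and "machine_learning"
# all appear more than 3 times, the bigram is kept only when its likelihood is at least half the
# geometric mean of its parts' likelihoods; in that case the redundant unigrams "machine" and
# "learning" are pruned, otherwise the bigram itself is dropped.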
def evaluate_keyword(frequency, word_index, max_word_index=100000):
    # inspired by tf-idf (https://en.wikipedia.org/wiki/tf–idf)
    rarity = word_index / max_word_index  # intuition: rare words tend to have bigger indexes in word2vec, because they're more likely encountered later in training
    return frequency * rarity
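# Worked example with made-up numbers: a keyword with frequency 0.02 at word2vec index 80,000
# scores 0.02 * (80000 / 100000) = 0.016, while an equally frequent but more common word at
# index 5,000 scores only 0.02 * 0.05 = 0.001, so rarer keywords rank higher.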
def extract_keywords(url, model, all_keywords):
    minimum_word_index_for_unigrams = 20000  # minimum rarity of word to be considered a keyword
    try:
        text = plaintext(download(url))
        words = [word for word in tokenize.word_tokenize(text) if word.isalnum() and word.lower() not in corpus.stopwords.words('english') and word.lower() not in url]
        for collocation, frequency in get_collocations(words):
            word_index = get_index(collocation, model)
            if word_index:
                if collocation.count('_') == 0 and word_index < minimum_word_index_for_unigrams:
                    pass
                else:
                    all_keywords[collocation] = all_keywords.get(collocation, 0) + evaluate_keyword(frequency, word_index)
    except (URLError, URLTimeout, HTTPError, HTTP403Forbidden, SSLError, UnicodeEncodeError, ValueError):
        pass
def research_keywords(something_unknown, model, websites_to_scan=10, keyword_count=25, attempts=0, google_api_key=GOOGLE_API_KEY):
    # searches for something unknown on Google to find related websites and returns a ranked list of keywords from across all sites
    maximum_number_of_google_search_attempts = 3
    if attempts < maximum_number_of_google_search_attempts:
        all_keywords = Manager().dict()
        engine = Google(license=google_api_key, throttle=1.0, language="en")
        try:
            processes = []
            for page in range(int(websites_to_scan/10)):
                for website in engine.search(something_unknown, start=page+1, count=10, type=SEARCH, cached=False):
                    web_mining_process = Process(target=extract_keywords, args=(website.url, model, all_keywords))
                    processes.append(web_mining_process)
                    web_mining_process.start()
            for process in processes:
                process.join()
        except HTTP403Forbidden:
            print "\nToday's maximum number of free searches from Google shared by this API key across all words2map users has expired.\nPlease get your own key at https://code.google.com/apis/console\n\nFrom that site, simply:\n1. In the API Manager Overview, find \"Custom Search API\" and enable it\n2. Copy your new API key from \"Credentials\"\n3. Paste it in words2map.py in the global variable \"GOOGLE_API_KEY\"\n"
            sys.exit(1)
        except (URLError, URLTimeout, HTTPError, SSLError):
            print "\nUnable to reach Google Search for {}, trying one more time".format(something_unknown)
            return research_keywords(something_unknown, model, websites_to_scan=websites_to_scan, keyword_count=keyword_count, attempts=attempts+1, google_api_key=google_api_key)
        all_keywords = sorted(all_keywords.items(), key=itemgetter(1), reverse=True)
        print "\nKeywords about {} to combine vectors for:".format(something_unknown)
        top_keywords = []
        for i in range(keyword_count):
            try:
                keyword, score = all_keywords[i]
                top_keywords.append(all_keywords[i])
                print "{} {}".format(round(score, 3), unidecode(keyword).replace("_", " "))
            except IndexError:
                break
        return top_keywords
    else:
        print "After a few tries, it seems that Google is not returning results for us. If you haven't done so already, please try adding your own API key at https://code.google.com/apis/console\n\nFrom that site, simply:\n1. In the API Manager Overview, find \"Custom Search API\" and enable it\n2. Copy your new API key from \"Credentials\"\n3. Paste it in words2map.py in the global variable \"GOOGLE_API_KEY\"\n"
        sys.exit(1)
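# Hedged usage sketch (requires a working GOOGLE_API_KEY and network access; the query below is
# hypothetical and results vary with live search rankings):
#   model = load_model()
#   keywords = research_keywords("quantum computing", model)
#   # keywords is a list of (keyword, score) tuples sorted by descending score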
def load_words(filename):
    words = []
    filepath = getcwd() + "/words/" + filename
    with open(filepath) as fp:
        for line in fp:
            clean_line = line.split("\r")[0].split("\n")[0]
            words.append(clean_line)
    return words
def load_derived_vectors(filename):
    # loads derived vectors from a previous words2map as a standalone Gensim Word2Vec model (https://radimrehurek.com/gensim/models/word2vec.html)
    filepath = getcwd() + "/derived_vectors/" + filename
    model = Word2Vec.load_word2vec_format(filepath, binary=False)
    return model
def save_derived_vectors(words, vectors, filename=None):
    # saves vectors in human readable format, which can be easily and quickly reloaded back into a Gensim Word2Vec model (https://radimrehurek.com/gensim/models/word2vec.html)
    derived_vectors_directory = getcwd() + "/derived_vectors"
    if filename is None:
        files = [f for f in listdir(derived_vectors_directory) if isfile(join(derived_vectors_directory, f))]
        words2map_files = [int(f.split("_")[1].split(".txt")[0]) for f in files if "words2map_" in f and ".txt" in f]
        if words2map_files:
            map_number = max(words2map_files) + 1
        else:
            map_number = 0
        filename = "words2map_{}.txt".format(map_number)
    f = open("{}/{}".format(derived_vectors_directory, filename), 'w')
    f.write("{} {}\n".format(len(words), 300))
    for word, vector in zip(words, vectors):
        formatted_word = word.replace(" ", "_")
        formatted_vector = ' '.join([str(i) for i in vector])
        f.write("{} {}\n".format(formatted_word, formatted_vector))
    f.close()
    print "\nSaved word vectors as {}".format(filename)
    return filename
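# Hedged round-trip sketch: saving goes through the plain-text word2vec format, so the derived
# vectors can be reloaded and queried like any other Gensim model, e.g.:
#   filename = save_derived_vectors(words, vectors)  # auto-named words2map_<n>.txt if no filename is given
#   derived_model = load_derived_vectors(filename)
#   print k_nearest_neighbors(derived_model, word=words[0].replace(" ", "_"))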
def test_performance():
    # calculates average time to access a word vector after loading the model in RAM
    model = load_model()
    times = []
    for i in range(100000):
        word = model.index2word[i]
        start_time = time.time()
        vector = model[word]
        end_time = time.time()
        times.append(end_time - start_time)
    total_time = sum(times)
    average_time = np.mean(times)
    print "Looking up all 100,000 word vectors took {} seconds in total, i.e. about {} μs per word vector at runtime, after loading the model".format(round(total_time, 2), round(average_time * 1000000, 2))
def get_visualization_file_path(print_status):
    visualizations = getcwd() + "/visualizations"
    files = [f for f in listdir(visualizations) if isfile(join(visualizations, f))]
    words2map_files = [int(f.split("_")[1].split(".png")[0]) for f in files if "words2map_" in f and ".png" in f]
    if words2map_files:
        map_number = max(words2map_files) + 1
    else:
        map_number = 0
    if print_status:
        print "\nVisualization saved! Check out words2map_{}.png".format(map_number)
    return "{}/words2map_{}.png".format(visualizations, map_number)
def generate_clusters(words, vectors_in_2D, print_status=True):
    # HDBSCAN, i.e. hierarchical density-based spatial clustering of applications with noise (https://github.com/lmcinnes/hdbscan)
    vectors = vectors_in_2D
    sns.set_context('poster')
    sns.set_color_codes()
    plot_kwds = {'alpha': 0.5, 's': 500, 'linewidths': 0}
    clusters = HDBSCAN(min_cluster_size=2).fit_predict(vectors)
    palette = sns.color_palette("husl", np.unique(clusters).max() + 1)
    colors = [palette[cluster_index] if cluster_index >= 0 else (0.0, 0.0, 0.0) for cluster_index in clusters]
    fig = plt.figure(figsize=(30, 30))
    plt.scatter(vectors.T[0], vectors.T[1], c=colors, **plot_kwds)
    plt.axis('off')
    x_vals = [i[0] for i in vectors]
    y_vals = [i[1] for i in vectors]
    plt.ylim(min(y_vals)-0.3, max(y_vals)+0.3)
    plt.xlim(min(x_vals)-0.3, max(x_vals)+0.3)
    font_path = getcwd() + '/fonts/Comfortaa-Regular.ttf'
    font_property = matplotlib.font_manager.FontProperties(fname=font_path, size=24)
    for i, word in enumerate(words):
        if word is not None:
            if type(word) != type(""):
                word = unidecode(word).replace("_", " ")
            else:
                word = word.replace("_", " ")
            text_object = plt.annotate(word, xy=(x_vals[i], y_vals[i]+0.05), font_properties=font_property, color=colors[i], ha="center")
    plt.subplots_adjust(left=(500/3000), right=(2900/3000), top=1.0, bottom=(300/2700))
    plt.savefig(get_visualization_file_path(print_status), bbox_inches="tight")
    return clusters
def reduce_dimensionality(vectors, dimensions=2):
    # t-distributed stochastic neighbor embedding (https://lvdmaaten.github.io/tsne/)
    print "\nComputing t-SNE reduction of 300D word vectors to {}D".format(dimensions)
    tsne_model = TSNE(n_components=dimensions, n_iter=10000, metric="correlation", learning_rate=50, early_exaggeration=500.0, perplexity=40.0)
    np.set_printoptions(suppress=True)
    vectors_in_2D = tsne_model.fit_transform(np.asarray(vectors).astype('float64'))
    return vectors_in_2D
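# Hedged pipeline sketch: the 300D derived vectors are first embedded into 2D with t-SNE and then
# clustered and plotted, e.g.:
#   vectors_in_2D = reduce_dimensionality(vectors)
#   cluster_indexes = generate_clusters(words, vectors_in_2D)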
def k_nearest_neighbors(model, k=10, word=None, vector=None):
    if word:
        return model.most_similar(positive=[word], topn=k)
    elif vector is not None and any(vector):
        return model.most_similar(positive=[vector], topn=k)
    else:
        raise ValueError("Provide a word or vector as an argument to get k-nearest neighbors\ne.g. k_nearest_neighbors(model, k=25, word=\"humanity\")")
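# Hedged usage sketch (the query word below is just the example from the error message; it must
# exist in the model's vocabulary):
#   neighbors = k_nearest_neighbors(model, k=25, word="humanity")
#   # returns Gensim's most_similar output: a list of (word, cosine similarity) tuples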
def get_vector(word, model, lowercase=True):
    # returns vector of word as 300 dimensions, each containing a 16 bit floating point number,
    # trying first for the lowercase version if that's set to True, and returning None if neither exists
    if lowercase:
        formatted_word = word.replace(" ", "_").lower()
        try:
            vector = model[formatted_word]
            return np.asarray(vector)
        except (EOFError, KeyError, UnpicklingError):
            return get_vector(word, model, lowercase=False)
    else:
        formatted_word = word.replace(" ", "_")
        try:
            vector = model[formatted_word]
            return np.asarray(vector)
        except (EOFError, KeyError, UnpicklingError):
            return None
def get_index(word, model, lowercase=True):
    # returns index of word ranging between 0 and 99,999 (corresponding to the order that words were encountered during word2vec training),
    # trying first for the lowercase version if that's set to True, and returning None if neither exists
    if lowercase:
        formatted_word = word.replace(" ", "_").lower()
        try:
            word_index = model.vocab[formatted_word].index
            return word_index
        except (EOFError, KeyError, UnpicklingError):
            return get_index(word, model, lowercase=False)
    else:
        formatted_word = word.replace(" ", "_")
        try:
            word_index = model.vocab[formatted_word].index
            return word_index
        except (EOFError, KeyError, UnpicklingError):
            return None
def memory_efficient_vector(vector, significant_digits_per_dimension=2):
    # upon analysis of dimensions of original GoogleNews vectors, values seem to be significant to about 0.01
    # (analysis at https://docs.google.com/spreadsheets/d/1cgvgcoy8Es36GAZ67FyoheYKivUni1BNXykNi4GNPRc/edit?usp=sharing),
    # while at least 3x memory compression seems possible here in the future following the distribution of elements of the word2vec corpus
    # (see for data: https://docs.google.com/spreadsheets/d/1RT0TphDA3Rk1Mi5Rg6aj9IpuGn1hFcK92PeffCsOmM4/edit?usp=sharing)
    return [round(dimension, significant_digits_per_dimension) for dimension in vector]
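# Worked example with made-up values: rounding each dimension to 2 decimal places maps
# [0.01234, -0.56789, ...] to [0.01, -0.57, ...], which stays within the ~0.01 precision
# that the analysis above suggests the original GoogleNews dimensions carry.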
def add_vectors(vectors):
    # vector addition is done first by averaging the values for each dimension, and then unit normalizing the derived vector
    # (see e.g. a Udacity deep learning instructor's explanation: https://youtu.be/BD8wPsr_DAI)
    derived_vector = np.average(np.array(vectors), axis=0)
    return derived_vector / np.linalg.norm(derived_vector)
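# Worked example with made-up 2D numbers (real vectors are 300D): averaging [1.0, 0.0] and
# [0.0, 1.0] gives [0.5, 0.5], and unit normalizing that yields roughly [0.707, 0.707],
# i.e. the direction halfway between the two inputs.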
def derive_vector(word, model, rederive=True, google_api_key=GOOGLE_API_KEY):
    # extracts keywords from Google searches and adds their vectors; if rederive is True, a word that already has a vector
    # will still be rederived, which can help to normalize clustering, especially for data visualization
    if not rederive:
        vector = get_vector(word, model)
        if vector is not None:
            return memory_efficient_vector(vector)
    keywords = research_keywords(something_unknown=word, model=model, google_api_key=google_api_key)  # see top of file about API key
    vectors = [get_vector(keyword, model) for keyword, score in keywords]
    derived_vector = add_vectors(vectors)
    return memory_efficient_vector(derived_vector)
def clarify(words, model):
    # derives vectors for any set of words and visualizes structured clusters
    vectors = [derive_vector(word, model) for word in words]
    filename = save_derived_vectors(words, vectors)
    model = load_derived_vectors(filename)
    words = [word for word in model.vocab]
    vectors = [model[word] for word in words]
    vectors_in_2D = reduce_dimensionality(vectors)
    cluster_indexes = generate_clusters(words, vectors_in_2D)  # clusters of just one word have an index of -1, otherwise they have unique non-negative integer indexes
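# Hypothetical end-to-end sketch, not part of the original module: it assumes the pretrained
# vectors exist under ./vectors, a valid GOOGLE_API_KEY is set above, and that a newline-delimited
# word list (the filename below is only an example) exists under ./words.
if __name__ == "__main__":
    model = load_model()
    words = load_words("passions.csv")  # hypothetical input file
    clarify(words, model)  # derives vectors, saves them, reduces to 2D, and renders the cluster visualization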