-
Notifications
You must be signed in to change notification settings - Fork 0
/
embeddings.py
111 lines (85 loc) · 3.06 KB
/
embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Taken from my a5.
import math
import numpy as np
from tqdm import tqdm
class Embeddings:
    """
    In-memory table of word embedding vectors with similarity helpers.

    Vectors are read from a whitespace-separated text file in which each
    line holds a word followed by its vector components. Instances support
    ``embeddings[word]`` and ``word in embeddings``.
    """

    # Progress-bar total handed to tqdm. Presumably the line count of the
    # default filtered GloVe file (TODO confirm); it only affects the
    # progress display, not which lines are loaded.
    _EXPECTED_ROWS = 108947

    def __init__(self, glove_file = 'glove.42B.300d.filtered.txt'):
        """
        Load every embedding in `glove_file` into ``self.embeddings``.

        Parameters
        ----------
        glove_file : str
            Path to the embeddings text file. Each non-blank line is
            ``word v1 v2 ... vk`` separated by whitespace.

        Raises
        ------
        OSError
            If the file cannot be opened.
        ValueError
            If a vector component cannot be parsed as a float.
        """
        self.embeddings = {}
        # The context manager closes the handle even if parsing fails part
        # way through. The encoding is explicit so non-ASCII tokens decode
        # the same way on every platform.
        with open(glove_file, encoding='utf-8') as handle:
            for line in tqdm(handle, total=self._EXPECTED_ROWS, desc='Loading GloVe vectors'):
                row = line.split()
                if not row:
                    # Blank line (e.g. a trailing newline): nothing to load.
                    continue
                word = row[0]
                vals = np.array([float(x) for x in row[1:]])
                self.embeddings[word] = vals

    def __getitem__(self, word):
        """Return the vector stored for `word`; raises KeyError if absent."""
        return self.embeddings[word]

    def __contains__(self, word):
        """Return True if `word` has a stored vector."""
        return word in self.embeddings

    def vector_norm(self, vec):
        """
        Calculate the vector norm (aka length) of a vector.

        Parameters
        ----------
        vec : np.array
            An embedding vector.

        Returns
        -------
        float
            The length (L2 norm, Euclidean norm) of the input vector.
        """
        return math.sqrt(np.sum(vec ** 2))

    def cosine_similarity(self, v1, v2):
        """
        Calculate cosine similarity between v1 and v2; these could be
        either words or numpy vectors.

        Any argument that is a string (including ``str`` subclasses such as
        ``np.str_``) is replaced by its stored vector before the similarity
        is calculated.

        Parameters
        ----------
        v1, v2 : str or np.array
            The words or vectors for which to calculate similarity.

        Returns
        -------
        float
            The cosine similarity between v1 and v2. If either vector has
            zero length the denominator is 0 and the result is not a
            meaningful similarity.

        Raises
        ------
        KeyError
            If a word argument has no stored vector.
        """
        vec1 = self[v1] if isinstance(v1, str) else v1
        vec2 = self[v2] if isinstance(v2, str) else v2
        return (vec1 @ vec2) / (self.vector_norm(vec1) * self.vector_norm(vec2))

    def most_similar(self, vec, n = 5, exclude = ()):
        """
        Return the most similar words to `vec` and their similarities.

        As in the cosine similarity function, allow words or embeddings as input.

        Parameters
        ----------
        vec : str or np.array
            Input to calculate similarity against.
        n : int
            Number of results to return. Defaults to 5.
        exclude : iterable of str
            Do not include any words in this collection in what you return.
            Defaults to excluding nothing.

        Returns
        -------
        list of ('word', similarity_score) tuples
            The top n results, highest similarity first.

        Raises
        ------
        KeyError
            If `vec` is a word with no stored vector.
        """
        if isinstance(vec, str):
            vec = self[vec]
        # Build the exclusion set once so each membership test is O(1)
        # instead of a scan of the caller's list per vocabulary word.
        excluded = frozenset(exclude)
        scored = [
            (word, self.cosine_similarity(candidate, vec))
            for word, candidate in self.embeddings.items()
            if word not in excluded
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:n]
if __name__ == "__main__":
    # Running this module as a script loads the default embeddings file.
    embeddings = Embeddings()

    # Disabled demo: list the 20 nearest neighbours of a word, then print
    # the vocabulary size. Uncomment to try it.
    # word = 'mercury'
    # print(f'Most similar to {word}:')
    # for item in embeddings.most_similar(word, exclude=[word], n=20):
    #     print('\t', item[0], '\t', item[1])
    # print(len(embeddings.embeddings))