Neighbors.py
"""
Class to retrieve list of neighbors from embedding (can be counterfitted) and filtering
i.e. by discarding words that semantically different, like words for verbs etc.
"""
import numpy as np
import operator
from collections import defaultdict
import pickle
from numpy import linalg as LA
import time
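
# Note (assumption, not stated in the original file): load_embedding below parses a
# GloVe-style text file, i.e. one token per line followed by its space-separated
# float components, for example:
#   word 0.12 -0.05 0.98 ...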


class Neighbors(object):
    def __init__(self,
                 embedding,
                 in_memory=False,
                 norm=np.linalg.norm,
                 neighbors_filters=None):
        """
        Input:
            embedding: string
                path to the embedding file
            in_memory: boolean
                optional: keep the whole embedding matrix in memory so that
                neighbor search can be vectorized
            norm: function
                optional: distance function that takes a single input (the difference vector)
            neighbors_filters: list
                optional: list of functions used to filter the results; each function receives
                the original word, a list of candidate words and a granularity level, and returns
                a filtered list (for example discarding words that are tagged differently by nltk.pos_tag)
        """
        self.embedding = embedding
        self.in_memory = in_memory
        self.norm = norm
        self.neighbors_filters = neighbors_filters if neighbors_filters is not None else []
        self.word2index, self.index2word, self.index2embedding = self.load_embedding(self.embedding)
        # parameters for the in-memory initialization
        self.words_matrix = None  # matrix with all the embedded words as rows
        # keep the entire embedding matrix in memory so operations can be vectorized
        if in_memory:
            # this speeds up the search for a neighbor, but the matrix stored in memory
            # is big (one row per vocabulary word, typically >= 100k rows)
            self.words_matrix = np.vstack([v for v in self.index2embedding])

    def set_embedding(self, embedding):
        """
        Change on the fly the embedding used to find the neighbors.
        """
        self.embedding = embedding
        self.word2index, self.index2word, self.index2embedding = self.load_embedding(embedding)
        if self.in_memory:
            # keep the cached matrix consistent with the new embedding
            self.words_matrix = np.vstack([v for v in self.index2embedding])
        print("[logger]: Embedding has been changed successfully to {}".format(self.embedding))

    def nearest_neighbors(self,
                          word,
                          eps,
                          k=1,
                          dist=None):
        """
        Return the k nearest neighbors in the neighborhood of an embedded word.
        TODO: the in-memory search does not use eps yet (missing feature)
        Parameters
        -------
        word: string (or list of strings when the object was built with in_memory=True)
            input word(s)
        eps: float
            radius (in the same norm of the embedding) of the n-ball where neighbors are gathered
        k: integer
            optional: maximum number of neighbors to return, regardless of how many fall in the eps-radius
        dist: function
            optional: distance measure between two embeddings; defaults to the norm passed at construction time
        Returns
        -------
        list of [word, distance] pairs in the neighborhood, empty if no word is close enough
        (a dict mapping each input word to its neighbors when in_memory=True)
        """
        if dist is None:
            dist = self.norm
        neighbors = []
        if self.in_memory is False:
            self.words = list(self.index2word.values())
            w1 = self.index2embedding[self.word2index[word]]
            for w in self.words:
                w2 = self.index2embedding[self.word2index[w]]
                d = dist(w1 - w2)
                if d <= eps:
                    neighbors.append([w, d])
            if len(neighbors) == 0:
                print("[logger]: No neighbors in the radius {}".format(eps))
                return []
            neighbors = sorted(neighbors, key=operator.itemgetter(1, 0))
            # skip the first entry, which is the input word itself (distance 0)
            neighbors = neighbors[1:k+1]
        else:
            # vectorized search: this speeds up the process of finding a neighbor,
            # but the matrix kept in memory is big; here `word` is expected to be a
            # list/tuple of words and eps is not used yet
            indices = operator.itemgetter(*word)(self.word2index)
            words_vectors = np.vstack(operator.itemgetter(*indices)(self.index2embedding))
            # exploit numpy vectorization to compute all pairwise distances at once
            distances = self.norm(self.words_matrix[:, np.newaxis, :].repeat(len(words_vectors), 1) - words_vectors, axis=2)
            # discard the closest index (the word itself) and keep the next k
            argsort_ = distances.argsort(axis=0)[1:k+1]
            neighbors = {}
            for (a, w) in zip(argsort_.T, word):
                for aa in a:
                    try:
                        neighbors.setdefault(w, []).append(self.index2word[aa])
                    except KeyError:
                        print("[logger-WARNING]: index {} for word {} is not in index2word".format(aa, w))
                        neighbors.setdefault(w, []).append(self.index2word[2])  # append <UNK>
        return neighbors

    def filter(self, w, candidates, granularity):
        """
        Apply every registered filter, in order, to the list of candidate neighbors.
        """
        filtered_candidates = candidates
        for f in self.neighbors_filters:
            filtered_candidates = f(w, filtered_candidates, granularity)
        return filtered_candidates

    def index_to_word(self, word2index):
        """
        Build the inverse mapping index -> word, reserving indices 0, 1, 2 for the special tokens.
        """
        index2word = {value: key for key, value in word2index.items()}
        index2word[0] = '<PAD>'
        index2word[1] = '<START>'
        index2word[2] = '<UNK>'
        return index2word

    def load_embedding(self, emb):
        """
        Load word embeddings from file.
        Based on https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer/blob/master/GloVe-as-TensorFlow-Embedding-Tutorial.ipynb
        """
        word_to_index_dict = dict()
        index_to_embedding_array = []
        with open(emb, 'r', encoding="utf-8") as emb_file:
            for (i, line) in enumerate(emb_file):
                split = line.split(' ')
                word = split[0]
                representation = split[1:]
                representation = np.array(
                    [float(val) for val in representation]
                )
                # use i+3 because actual word indices start at 3, while indices 0, 1, 2
                # are reserved for <PAD>, <START> and <UNK>
                word_to_index_dict[word] = i + 3
                index_to_embedding_array.append(representation)
        _WORD_NOT_FOUND = [0.0] * len(representation)  # zero representation for unknown words
        _PAD = 0
        _START = 1
        _UNK = 2
        word_to_index_dict['<PAD>'] = _PAD
        word_to_index_dict['<START>'] = _START
        word_to_index_dict['<UNK>'] = _UNK
        word_to_index_dict = defaultdict(lambda: _UNK, word_to_index_dict)
        index_to_word_dict = self.index_to_word(word_to_index_dict)
        # three zero vectors for <PAD>, <START> and <UNK>
        index_to_embedding_array = np.array(3 * [_WORD_NOT_FOUND] + index_to_embedding_array)
        return word_to_index_dict, index_to_word_dict, index_to_embedding_array
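

# The sketch below is not part of the original module: it illustrates how the class
# could be used, with a part-of-speech filter in the style described in the __init__
# docstring and a nearest-neighbor query. The embedding path, the query word, the
# granularity value and the use of nltk are assumptions for illustration only.
if __name__ == "__main__":
    import nltk  # assumes nltk and its POS tagger data are installed

    def pos_filter(word, candidates, granularity):
        # keep only candidates whose nltk POS tag matches the original word's tag
        target_tag = nltk.pos_tag([word])[0][1]
        return [c for c in candidates if nltk.pos_tag([c])[0][1] == target_tag]

    neighbors = Neighbors("glove.6B.50d.txt",  # placeholder path to a GloVe-style embedding
                          in_memory=False,
                          neighbors_filters=[pos_filter])
    # words within an L2 radius of 3.0 around "nice", at most 5 of them
    close = neighbors.nearest_neighbors("nice", eps=3.0, k=5)
    close_words = [w for (w, _) in close]
    # "word" is a placeholder granularity value; the original code does not fix its domain
    filtered = neighbors.filter("nice", close_words, granularity="word")
    print(filtered)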