-
Notifications
You must be signed in to change notification settings - Fork 1
/
embedding_utils.py
108 lines (97 loc) · 4.05 KB
/
embedding_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import numpy as np
from collections import defaultdict
import pickle
from numpy import linalg as LA
import time
from Embedding import Embedding
def pad_sequences(X, maxlen, emb_size=50):
    '''
    Force every sequence in X to exactly maxlen rows, in place.

    Sequences longer than maxlen are truncated; shorter ones are extended
    with zero vectors of width emb_size. The (mutated) input list is returned.
    '''
    for idx, seq in enumerate(X):
        n_rows = len(seq)
        if n_rows > maxlen:
            X[idx] = seq[:maxlen]
        elif n_rows < maxlen:
            # zero-vector filler for the missing rows
            filler = np.zeros(shape=(maxlen - n_rows, emb_size))
            X[idx] = np.concatenate((seq, filler), axis=0)
    return X
def index_to_word(word2index):
    '''
    Build the inverse mapping index -> word from a word -> index dict.

    Indexes 0, 1 and 2 are always (re)assigned to the reserved tokens
    <PAD>, <START> and <UNK>, overriding any colliding entries.
    '''
    inverse = dict()
    for word, idx in word2index.items():
        inverse[idx] = word
    for reserved_idx, token in ((0, '<PAD>'), (1, '<START>'), (2, '<UNK>')):
        inverse[reserved_idx] = token
    return inverse
# Based on https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer/blob/master/GloVe-as-TensorFlow-Embedding-Tutorial.ipynb
def load_embedding(glove):
    '''
    Load GloVe-style word embeddings from a text file.

    Each line of the file is "<word> <float> <float> ...". Real words get
    indexes starting at 3; indexes 0, 1 and 2 are reserved for <PAD>,
    <START> and <UNK>, whose embeddings are zero vectors.

    Parameters
    ----------
    glove : str
        Path to the embedding text file.

    Returns
    -------
    tuple
        (word_to_index, index_to_word, embeddings): word_to_index is a
        defaultdict mapping unknown words to the <UNK> index (2),
        index_to_word is the inverse mapping, and embeddings is a 2-D
        numpy array aligned with the indexes.
    '''
    _PAD, _START, _UNK = 0, 1, 2
    word_to_index_dict = dict()
    index_to_embedding_array = []
    vector_dim = 0  # tracked so an empty file no longer raises NameError below
    with open(glove, 'r', encoding="utf-8") as glove_file:
        for (i, line) in enumerate(glove_file):
            split = line.split(' ')
            word = split[0]
            representation = np.array(
                [float(val) for val in split[1:]]
            )
            vector_dim = len(representation)
            # use +3 because actual word indexes start at 3 while indexes 0,1,2 are for
            # <PAD>, <START>, and <UNK>
            word_to_index_dict[word] = i + 3
            index_to_embedding_array.append(representation)
    _WORD_NOT_FOUND = [0.0] * vector_dim  # zero embedding for the reserved tokens
    word_to_index_dict['<PAD>'] = _PAD
    # BUG FIX: '<START>' was never added before, so looking it up fell
    # through to the <UNK> index via the defaultdict fallback.
    word_to_index_dict['<START>'] = _START
    word_to_index_dict['<UNK>'] = _UNK
    # any word not in the vocabulary maps to <UNK>
    word_to_index_dict = defaultdict(lambda: _UNK, word_to_index_dict)
    # inverse mapping (inlined from index_to_word; reserved indexes are set
    # explicitly to keep the same override semantics as that helper)
    index_to_word_dict = {idx: w for w, idx in word_to_index_dict.items()}
    index_to_word_dict[_PAD] = '<PAD>'
    index_to_word_dict[_START] = '<START>'
    index_to_word_dict[_UNK] = '<UNK>'
    # three zero vectors up front for <PAD>, <START> and <UNK>
    index_to_embedding_array = np.array(3 * [_WORD_NOT_FOUND] + index_to_embedding_array)
    return word_to_index_dict, index_to_word_dict, index_to_embedding_array
# Based on https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer/blob/master/GloVe-as-TensorFlow-Embedding-Tutorial.ipynb
def load_embedding_clear(glove):
    '''
    Load word embeddings from file without the reserved-token offset used by
    load_embedding: here real words are indexed from 0.

    Parameters
    ----------
    glove : str
        Path to the embedding text file ("<word> <float> <float> ..." per line).

    Returns
    -------
    tuple
        (word_to_index, index_to_word, embeddings): word_to_index is a
        defaultdict mapping unknown words to index 1, index_to_word is the
        inverse mapping, and embeddings is a 2-D numpy array aligned with
        the indexes.
    '''
    word_to_index_dict = dict()
    index_to_embedding_array = []
    with open(glove, 'r', encoding="utf-8") as glove_file:
        for (i, line) in enumerate(glove_file):
            split = line.split(' ')
            word = split[0]
            representation = np.array(
                [float(val) for val in split[1:]]
            )
            # words are indexed from 0 here — no reserved slots (the old
            # "+3" comment was copy-pasted from load_embedding and wrong)
            word_to_index_dict[word] = i
            index_to_embedding_array.append(representation)
    # NOTE(review): unknown words fall back to index 1, which belongs to a
    # real word in this "clear" variant — confirm this fallback is intended.
    word_to_index_dict = defaultdict(lambda: 1, word_to_index_dict)
    # inverse mapping (inlined from index_to_word, which always overrides
    # indexes 0-2 with the special tokens).
    # NOTE(review): that override clobbers the words at indexes 0,1,2 in the
    # inverse map — preserved for compatibility, but verify it is desired.
    index_to_word_dict = {idx: w for w, idx in word_to_index_dict.items()}
    index_to_word_dict[0] = '<PAD>'
    index_to_word_dict[1] = '<START>'
    index_to_word_dict[2] = '<UNK>'
    index_to_embedding_array = np.array(index_to_embedding_array)
    return word_to_index_dict, index_to_word_dict, index_to_embedding_array
def load_syn_dict(filename = 'data/syn_dict/syn_dict_glove300.pickle', N = 10):
    '''
    Load a cached synonyms dictionary, keeping only the first N neighbors per word.

    Parameters
    ----------
    filename : str
        Path to the pickled {word: [neighbors...]} dictionary.
    N : int
        Maximum number of neighbors to keep per word.

    Returns
    -------
    dict
        {word: neighbors[:N]}, or an empty dict if the file cannot be loaded.
    '''
    try:
        # 'with' guarantees the handle is closed (the old code leaked it)
        with open(filename, 'rb') as file:
            syn_dict = pickle.load(file)
        return {word: neighbors[:N] for word, neighbors in syn_dict.items()}
    except (OSError, pickle.PickleError):
        # narrowed from a bare 'except', which also hid programming errors
        print("ERROR: Could not load synonyms dictionary.")
        return dict()
def load_dist_dict(filename = 'data/syn_dict/dist_dict_glove300.pickle', N = 10):
    '''
    Load a cached dictionary of distances to nearest neighbors, keeping only
    the first N distances per word.

    Parameters
    ----------
    filename : str
        Path to the pickled {word: [distances...]} dictionary.
    N : int
        Maximum number of distances to keep per word.

    Returns
    -------
    dict
        {word: distances[:N]}, or an empty dict if the file cannot be loaded.
    '''
    try:
        # 'with' guarantees the handle is closed (the old code leaked it)
        with open(filename, 'rb') as file:
            dist_dict = pickle.load(file)
        return {word: distances[:N] for word, distances in dist_dict.items()}
    except (OSError, pickle.PickleError):
        # narrowed from a bare 'except', which also hid programming errors
        print("ERROR: Could not load distances to nearest neighbors dictionary")
        return dict()