-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathglove_utils.py
74 lines (66 loc) · 2.6 KB
/
glove_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import numpy as np
from collections import defaultdict
import pickle
from numpy import linalg as LA
import time
from embedding import Embedding
def index_to_word(word2index):
    '''
    Build the reverse (index -> word) lookup for a word -> index mapping.

    Indexes 0, 1 and 2 always map to the reserved tokens '<PAD>', '<START>'
    and '<UNK>', overriding whatever the input mapping assigned to them.
    '''
    reserved_tokens = ((0, '<PAD>'), (1, '<START>'), (2, '<UNK>'))
    # Swap keys and values; if two words share an index the later one wins.
    reverse_lookup = dict(zip(word2index.values(), word2index.keys()))
    for slot, token in reserved_tokens:
        reverse_lookup[slot] = token
    return reverse_lookup
# Based on https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer/blob/master/GloVe-as-TensorFlow-Embedding-Tutorial.ipynb
def load_embedding(glove):
    '''
    Load word embeddings from a text file.

    Every non-blank line must hold a word followed by the components of its
    vector, separated by single spaces. Blank lines are skipped.

    Args:
        glove: Path of the embeddings file; it is read as UTF-8 text.

    Returns:
        A tuple (word_to_index_dict, index_to_word_dict, index_to_embedding_array):
        - word_to_index_dict: defaultdict mapping each word to its row in the
          embedding array. Words from the file start at index 3; '<PAD>' is 0
          and '<UNK>' is 2. Unknown words resolve to 2 (and, as with any
          defaultdict, a lookup with [] also stores the missing key).
        - index_to_word_dict: the reverse mapping built by index_to_word().
        - index_to_embedding_array: numpy array whose first three rows are
          zero vectors (for <PAD>, <START>, <UNK>) followed by the vectors
          from the file, in file order.

    Raises:
        ValueError: If the file contains no embeddings, or if a vector
            component cannot be parsed as a float.
    '''
    unk_index = 2
    word_to_index_dict = dict()
    index_to_embedding_array = []
    with open(glove, 'r', encoding="utf-8") as glove_file:
        for line in glove_file:
            # Drop the newline and any trailing blanks so the last component
            # parses cleanly; skip lines that carry no data at all.
            line = line.rstrip()
            if not line:
                continue
            split = line.split(' ')
            word = split[0]
            representation = np.array(
                [float(val) for val in split[1:]]
            )
            # use +3 because actual word indexes start at 3 while indexes 0,1,2 are for
            # <PAD>, <START>, and <UNK>. Counting stored vectors (instead of file
            # lines) keeps indexes aligned with rows even when lines are skipped.
            word_to_index_dict[word] = len(index_to_embedding_array) + 3
            index_to_embedding_array.append(representation)

    if not index_to_embedding_array:
        raise ValueError("No embeddings found in file: {}".format(glove))

    # Zero vector used for the three reserved rows; sized like the last vector read.
    word_not_found = [0.0] * len(index_to_embedding_array[-1])

    # NOTE(review): '<START>' is not added to the word -> index map, so looking
    # it up yields the <UNK> index; confirm whether that is intended.
    word_to_index_dict['<PAD>'] = 0
    word_to_index_dict['<UNK>'] = unk_index
    word_to_index_dict = defaultdict(lambda: unk_index, word_to_index_dict)
    index_to_word_dict = index_to_word(word_to_index_dict)
    # three 0 vectors for <PAD>, <START> and <UNK>
    index_to_embedding_array = np.array(3 * [word_not_found] + index_to_embedding_array)
    return word_to_index_dict, index_to_word_dict, index_to_embedding_array
def load_syn_dict(filename = 'data/syn_dict/syn_dict_glove300.pickle', N = 10):
    '''
    Load cached synonyms dictionary.

    Args:
        filename: Path of the pickle file holding a mapping from each word to
            a sliceable sequence of its neighbors.
        N: Maximum number of neighbors kept per word (the first N entries).

    Returns:
        A dict mapping each word to its first N neighbors, or an empty dict
        if the file could not be read or decoded (an error is printed).
    '''
    # SECURITY: pickle.load can execute arbitrary code; only use trusted files.
    try:
        # The context manager guarantees the handle is closed, even on failure.
        with open(filename, 'rb') as file:
            syn_dict = pickle.load(file)
        return {word: neighbors[:N] for word, neighbors in syn_dict.items()}
    except Exception:
        # Best-effort loader: report and fall back to an empty dictionary, but
        # let KeyboardInterrupt / SystemExit propagate.
        print("ERROR: Could not load synonyms dictionary.")
        return dict()
def load_dist_dict(filename = 'data/syn_dict/dist_dict_glove300.pickle', N = 10):
    '''
    Load cached dictionary with distances to nearest neighbors.

    Args:
        filename: Path of the pickle file holding a mapping from each word to
            a sliceable sequence of distances.
        N: Maximum number of distances kept per word (the first N entries).

    Returns:
        A dict mapping each word to its first N distances, or an empty dict
        if the file could not be read or decoded (an error is printed).
    '''
    # SECURITY: pickle.load can execute arbitrary code; only use trusted files.
    try:
        # The context manager guarantees the handle is closed, even on failure.
        with open(filename, 'rb') as file:
            dist_dict = pickle.load(file)
        return {word: distances[:N] for word, distances in dist_dict.items()}
    except Exception:
        # Best-effort loader: report and fall back to an empty dictionary, but
        # let KeyboardInterrupt / SystemExit propagate.
        print("ERROR: Could not load distances to nearest neighbors dictionary")
        return dict()