'''
# flair's embed() expects Sentence objects rather than bare Tokens, so embed a
# one-word Sentence and read the vector back from its single token
from flair.embeddings import FlairEmbeddings
from flair.data import Sentence
flair_embedding_forward = FlairEmbeddings('multi-forward-fast')
sentence = Sentence('green')
flair_embedding_forward.embed(sentence)
print(sentence.tokens[0].embedding)
'''
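# A minimal sketch (kept commented out like the blocks above) of comparing two
# flair word embeddings with cosine similarity. It assumes only the standard
# flair Sentence/embed API and torch's cosine_similarity; the sentence text is
# made up for illustration.
'''
import torch
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings

embedder = FlairEmbeddings('multi-forward-fast')
sentence = Sentence('green machine')
embedder.embed(sentence)

# each token now carries a torch tensor in .embedding
vec_green, vec_machine = (tok.embedding for tok in sentence.tokens)
similarity = torch.nn.functional.cosine_similarity(vec_green, vec_machine, dim=0)
print(similarity.item())
'''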
'''
import utilsOs
import fastText
from fastText.util import util
import numpy as np


class FastTextNN:
    """ found in comments in https://github.com/facebookresearch/fastText/issues/384 """
    def __init__(self, ft_model=None, ft_matrix=None):
        if ft_model is None:
            ft_model = fastText.load_model(u'/data/rali5/Tmp/alfonsda/fasttextVectorModels/wiki.fr.bin')
        self.ft_model = ft_model
        self.ft_words = ft_model.get_words()
        self.word_frequencies = dict(zip(*ft_model.get_words(include_freq=True)))
        self.ft_matrix = ft_matrix
        if self.ft_matrix is None:
            # precompute the full embedding matrix, one row per vocabulary word
            self.ft_matrix = np.empty((len(self.ft_words), ft_model.get_dimension()))
            for i, word in enumerate(self.ft_words):
                self.ft_matrix[i, :] = ft_model.get_word_vector(word)

    def find_nearest_neighbor(self, query, vectors, n=10, cossims=None):
        """
        query is a 1d numpy array corresponding to the vector to which you want to
        find the closest vector
        vectors is a 2d numpy array corresponding to the vectors you want to consider
        cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency
        returns the indices of the closest n matches to query within vectors and the
        cosine similarity (cosine of the angle between the vectors)
        """
        if cossims is None:
            cossims = np.matmul(vectors, query, out=cossims)
        norms = np.sqrt((query**2).sum() * (vectors**2).sum(axis=1))
        cossims = cossims / norms
        # index 0 of the partial sort is the query word itself, so keep matches 1..n
        result_i = np.argpartition(-cossims, range(n + 1))[1:n + 1]
        return list(zip(result_i, cossims[result_i]))

    def nearest_words(self, word, n=10, word_freq=None):
        result = self.find_nearest_neighbor(self.ft_model.get_word_vector(word), self.ft_matrix, n=n)
        if word_freq:
            return [(self.ft_words[r[0]], r[1]) for r in result if self.word_frequencies[self.ft_words[r[0]]] >= word_freq]
        else:
            return [(self.ft_words[r[0]], r[1]) for r in result]


model = fastText.load_model(u'/data/rali5/Tmp/alfonsda/fasttextVectorModels/wiki.fr.bin')
print(model.get_dimension())
tok = model.get_word_vector('machine')
n = FastTextNN()
print(n.nearest_words('minuteor'))
'''
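# A minimal, self-contained NumPy sketch (commented out like the blocks above) of
# the same top-n cosine-similarity lookup that FastTextNN.find_nearest_neighbor
# performs, using a small random matrix as a stand-in for the fastText vocabulary
# matrix so it runs without the wiki.fr.bin model.
'''
import numpy as np

rng = np.random.default_rng(0)
vectors = rng.normal(size=(1000, 50))   # stand-in for the vocabulary matrix
query = vectors[42]                     # stand-in for get_word_vector(word)

# cosine similarity of the query against every row
cossims = vectors @ query
cossims /= np.sqrt((query**2).sum() * (vectors**2).sum(axis=1))

# top-n matches, skipping index 0 of the partial sort (the query itself)
n = 10
top = np.argpartition(-cossims, range(n + 1))[1:n + 1]
print(list(zip(top, cossims[top])))
'''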
'''
import os
os.environ['LD_LIBRARY_PATH'] = '/u/alfonsda/anaconda3/lib/'
import flair
import re


def getElemsNotInIterObj(elemDict, iterObj, replaceKeyForValue=False):
    """given a dict of elements and an iterable object, if the elements are
    found in the iterable object, it replaces the None value with the iterable element """
    # if it's not a dict, make it a dict mapping each token to (None, [its positions])
    if type(elemDict) is not dict:
        elemList, elemDict = list(elemDict), {}
        for index, key in enumerate(elemList):
            # if the token appears multiple times in the list, accumulate its positions
            if key in elemDict:
                elemDict[key] = (None, elemDict[key][1] + [index])
            else:
                elemDict[key] = (None, [index])
    # if the element is in the iterable object
    for keyElem, valueElem in elemDict.items():
        if valueElem[0] is None and keyElem in iterObj:
            # if the iterable object is a dict and we want the value, not the key
            if type(iterObj) is dict and replaceKeyForValue != False:
                elemDict[keyElem] = (iterObj[keyElem], valueElem[1])
            else:
                elemDict[keyElem] = (keyElem, valueElem[1])
        # if there is a numeric or ascii symbol character in the token we take it as is
        # (cast to str so non-string tokens do not crash the regex search)
        elif len(re.findall(re.compile(r'([0-9]|-|\+|\!|\#|\$|%|&|\'|\*|\?|\.|\^|_|`|\||~|:|@)'), str(keyElem))) > 0:
            elemDict[keyElem] = (keyElem, valueElem[1])
    return elemDict


#print(getElemsNotInIterObj(['abc', 3, 'abcd', 'acc'], ['abc', 'abfc', 'aebc'], replaceKeyForValue=False))
'''
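# A small usage sketch of getElemsNotInIterObj (commented out like the rest, and
# relying on the definition in the block above) showing the
# (matched element, [positions]) structure it returns; the input lists here are
# made up for illustration.
'''
tokens = ['abc', 'abc', 'x+y', 'zzz']
reference = ['abc', 'abfc', 'aebc']
result = getElemsNotInIterObj(tokens, reference)
# expected: {'abc': ('abc', [0, 1]),   found in the reference list
#            'x+y': ('x+y', [2]),      kept as is because of the '+' symbol
#            'zzz': (None, [3])}       not found anywhere
print(result)
'''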
'''
from pywiktionary import Wiktionary
wikt = Wiktionary(XSAMPA=True)
dump_file = "/data/rali5/Tmp/alfonsda/wikiDump/wiktionary/frwiktionary-20190201-pages-articles-multistream.xml"
pron = wikt.extract_IPA(dump_file)
'''
# keep only the strings where the digit between two letters is a 4
import re

l = ['asdfg3ss', 'asdfg4ss', 'asdfg1ss', 'asdfg2ss']
x = re.compile(r'[a-zA-Z]4[a-zA-Z]')
print([t for t in l if x.search(t)])