# semEval.py
import codecs
import re
import numpy as np
from scipy import spatial
import os
from joblib import Parallel, delayed
import multiprocessing
from sklearn.metrics import r2_score
file_list = os.listdir('data/vec')  # all embedding files to evaluate
embedding_path = "data/vec/ara_news_2008_1M-sentencesCleaned.txt.vec"  # default embedding (unused when looping over file_list)
def load_embedding(path):
    '''Load an embedding file into a dict mapping each word to its vector (a list of floats).'''
    embedding_dict = {}
    with codecs.open(path, "rb", "utf8", "ignore") as infile:
        for line in infile:
            try:
                parts = line.split()
                word = parts[0]
                nums = [float(p) for p in parts[1:]]
                embedding_dict[word] = nums
            except Exception:
                print(line)  # skip lines that cannot be parsed
                continue
    return embedding_dict
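
# Illustrative usage sketch (not part of the original pipeline; the file name below is
# hypothetical). The .vec file is expected to contain one word per line followed by its
# vector components, so the returned dict maps a word to a list of floats:
#
#   vectors = load_embedding("data/vec/example.vec")
#   vec = np.array(vectors.get("مثال", []))
#   print(vec.shape)
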
def clean_arabic_str(text):
    '''
    Clean an Arabic string: remove tashkeel (diacritics), collapse elongated (repeated)
    letters, normalize hamza forms of alef, and unify ta marbuta with ha.
    :param text: an Arabic word
    :type text: str
    :return: the cleaned text
    '''
    search = ["أ", "إ", "آ", "ة", "_", "-", "/", ".", "،", " و ", " يا ", '"', "ـ", "'", "ى", "\\", '\n', '\t',
              '"', '?', '؟', '!']
    replace = ["ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", " يا", "", "", "", "ي", "", ' ', ' ', ' ', ' ? ', ' ؟ ',
               ' ! ']
    # remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)
    # collapse letter elongation (any run of a repeated character becomes two)
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    for i in range(len(search)):
        text = text.replace(search[i], replace[i])
    # trim surrounding whitespace
    text = text.strip()
    return text
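
# Illustrative examples (assumption, not from the original script) of what the cleaning does:
#
#   clean_arabic_str("أَحمد")    # -> "احمد"  (fatha removed, "أ" normalized to "ا")
#   clean_arabic_str("جمييييل")  # -> "جميل"  (letter elongation collapsed)
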
def semEval(embedding_path, output):
    '''
    Evaluate an embedding on the SemEval STS track 1 (ar-ar) data.
    :param embedding_path: path to the embedding file
    :param output: name used for the result file written to eval/
    :return: correlation matrix between the gold standard and the predicted similarities (also written to a file)
    '''
    embedding_dict = load_embedding(embedding_path)
    result = []
    gold_standard = []
    with open("data/STS.gs.track1.ar-ar.txt", "r") as sts_results:  # load gold-standard scores
        for line in sts_results:
            line = line.strip()
            gold_standard.append(float(line))
    with codecs.open('data/STS.input.track1.ar-ar.txt', 'r', "utf-8") as infile:  # load track test data
        for index, line in enumerate(infile):
            line = line.strip().split("\t")  # split the pair into its two sentences
            sts_1 = line[0].split(" ")  # words of sentence one
            sts_2 = line[1].split(" ")  # words of sentence two
            print(index, "\n", sts_1, "\n", sts_2)
            sum_embedding_1 = 0.0
            sum_embedding_2 = 0.0
            for token in sts_1:
                token = clean_arabic_str(token).replace(" ", "_")  # clean the token to match the training format
                token = token.replace(".", "")
                try:
                    word_embedding = np.array(embedding_dict[token])
                    word_embedding = np.divide(word_embedding, len(sts_1))  # normalize the word embedding by the sentence length
                    sum_embedding_1 = np.add(sum_embedding_1, word_embedding)  # add the word vector to the sentence vector
                except KeyError:
                    print("not found: " + token)
            for token in sts_2:
                token = clean_arabic_str(token).replace(" ", "_")
                token = token.replace(".", "")
                try:
                    word_embedding = np.array(embedding_dict[token])
                    word_embedding = np.divide(word_embedding, len(sts_2))
                    sum_embedding_2 = np.add(sum_embedding_2, word_embedding)  # same as above
                except KeyError:
                    print("not found: " + token)
            result.append(float(1 - spatial.distance.cosine(sum_embedding_1, sum_embedding_2)))
    # sentence pairs with no known words produce nan similarities; drop them and their gold scores,
    # deleting from the end so earlier indices remain valid
    index_missing_elements = np.argwhere(np.isnan(result)).flatten()
    for a in sorted(index_missing_elements, reverse=True):
        print(int(a))
        del result[int(a)]
        del gold_standard[int(a)]
    result = np.array(result)
    gold_standard = np.array(gold_standard)
    print(np.argwhere(np.isnan(result)))
    # print(r2_score(gold_standard, result))
    correlation = np.corrcoef(gold_standard, result)  # 2x2 Pearson correlation matrix
    np.savetxt("eval/" + output + "Eval", correlation, fmt='%.18e', header="")
    return correlation
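
# Illustrative single-file call (assumption): np.corrcoef returns a 2x2 matrix, so the
# Pearson correlation between gold scores and predicted similarities is the off-diagonal entry.
#
#   corr = semEval(embedding_path, "ara_news_2008_1M-sentencesCleaned.txt.vec")
#   pearson_r = corr[0, 1]
#   print(pearson_r)
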
# for file_name in file_list:
#     semEval("data/vec/" + file_name, file_name)
num_cores = multiprocessing.cpu_count()
Parallel(n_jobs=num_cores)(delayed(semEval)("data/vec/" + file_name, file_name) for file_name in file_list)  # evaluate all embeddings in parallel (for the server)