# coding: utf-8
"""
This module holds various MT evaluation metrics.
"""
import numpy as np

import evaluate

from external_metrics import mscoco_rouge
from external_metrics import sacrebleu

WER_COST_DEL = 3
WER_COST_INS = 3
WER_COST_SUB = 4
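# Note: the non-uniform edit costs above (3 for deletion/insertion, 4 for
# substitution) only shape the alignment found by edit_distance()/get_alignment();
# the error counts reported by wer_single() are unweighted. The 3/3/4 weighting
# appears to follow the sclite-style convention (an assumption, not stated here).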


def chrf(references, hypotheses):
    """
    Character F-score (chrF) from sacrebleu.

    :param references: list of references (strings)
    :param hypotheses: list of hypotheses (strings)
    :return: chrF score as a percentage (0-100)
    """
    return (
        sacrebleu.corpus_chrf(hypotheses=hypotheses, references=references).score * 100
    )


def report_all(references, hypotheses):
    sacrebleu_metric = evaluate.load("sacrebleu")
    chrf_metric = evaluate.load("chrf")
    wrapped_refs = [[r] for r in references]
    bleu_result = sacrebleu_metric.compute(
        predictions=hypotheses, references=wrapped_refs, tokenize="none", force=True)
    chrf_result = chrf_metric.compute(predictions=hypotheses, references=wrapped_refs)
    chrf_plus = chrf_metric.compute(
        predictions=hypotheses, references=wrapped_refs, word_order=2)
    return {
        "bleu": bleu_result["score"],
        "bleu_comp": sacrebleu.raw_corpus_bleu(
            sys_stream=hypotheses, ref_streams=[references]).scores,
        "chrf": chrf_result["score"],
        "chrf++": chrf_plus["score"],
        "rouge": rouge(references, hypotheses),
    }
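# Illustrative call pattern (assumed, not from the original file):
#   report_all(references=["the cat sat"], hypotheses=["the cat sits"])
# returns a dict with the keys "bleu", "bleu_comp", "chrf", "chrf++", "rouge".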


def bleu(references, hypotheses):
    """
    Raw corpus BLEU from sacrebleu (without tokenization).

    :param references: list of references (strings)
    :param hypotheses: list of hypotheses (strings)
    :return: dict with keys "bleu1", "bleu2", ... (one entry per element of
        the sacrebleu score list)
    """
    bleu_scores = sacrebleu.raw_corpus_bleu(
        sys_stream=hypotheses, ref_streams=[references]
    ).scores
    scores = {}
    for n, score in enumerate(bleu_scores, start=1):
        scores["bleu" + str(n)] = score
    return scores


def token_accuracy(references, hypotheses, level="word"):
    """
    Compute the accuracy of hypothesis tokens: correct tokens / all tokens.
    Tokens are correct if they appear in the same position in the reference.

    :param references: list of references (strings)
    :param hypotheses: list of hypotheses (strings)
    :param level: segmentation level, either "word", "bpe", or "char"
    :return: token accuracy as a percentage (0-100)
    """
    correct_tokens = 0
    all_tokens = 0
    split_char = " " if level in ["word", "bpe"] else ""
    assert len(hypotheses) == len(references)
    for hyp, ref in zip(hypotheses, references):
        # "" is not a valid separator for str.split, so handle char level explicitly
        hyp_tokens = list(hyp) if split_char == "" else hyp.split(split_char)
        ref_tokens = list(ref) if split_char == "" else ref.split(split_char)
        # count hypothesis tokens (not characters) in the denominator
        all_tokens += len(hyp_tokens)
        # only the first min(len(hyp_tokens), len(ref_tokens)) positions are compared
        for h_i, r_i in zip(hyp_tokens, ref_tokens):
            if h_i == r_i:
                correct_tokens += 1
    return (correct_tokens / all_tokens) * 100 if all_tokens > 0 else 0.0
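# Sketch of expected behaviour (illustrative values, not from the original file):
#   token_accuracy(["a b c"], ["a x c"])  -> 66.67 (2 of 3 word positions match)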


def sequence_accuracy(references, hypotheses):
    """
    Compute the accuracy of exact sequence matches: correct sequences / all
    sequences, where a hypothesis counts as correct only if it is identical
    to its reference.

    :param references: list of references (strings)
    :param hypotheses: list of hypotheses (strings)
    :return: sequence accuracy as a percentage (0-100)
    """
    assert len(hypotheses) == len(references)
    correct_sequences = sum(
        1 for (hyp, ref) in zip(hypotheses, references) if hyp == ref
    )
    return (correct_sequences / len(hypotheses)) * 100 if hypotheses else 0.0


def rouge(references, hypotheses):
    """
    Average sentence-level ROUGE score from the external mscoco_rouge module.
    """
    rouge_score = 0
    n_seq = len(hypotheses)
    for h, r in zip(hypotheses, references):
        rouge_score += mscoco_rouge.calc_score(hypotheses=[h], references=[r]) / n_seq
    return rouge_score * 100


def wer_list(references, hypotheses):
    """
    Corpus-level word error rate (WER) with per-type error rates.

    :param references: list of references (strings)
    :param hypotheses: list of hypotheses (strings)
    :return: dict with "wer", "del_rate", "ins_rate", "sub_rate" (percentages)
    """
    total_error = total_del = total_ins = total_sub = total_ref_len = 0
    for r, h in zip(references, hypotheses):
        res = wer_single(r=r, h=h)
        total_error += res["num_err"]
        total_del += res["num_del"]
        total_ins += res["num_ins"]
        total_sub += res["num_sub"]
        total_ref_len += res["num_ref"]
    wer = (total_error / total_ref_len) * 100
    del_rate = (total_del / total_ref_len) * 100
    ins_rate = (total_ins / total_ref_len) * 100
    sub_rate = (total_sub / total_ref_len) * 100
    return {
        "wer": wer,
        "del_rate": del_rate,
        "ins_rate": ins_rate,
        "sub_rate": sub_rate,
    }
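# Note: all rates are relative to the total reference length, so an
# insertion-heavy hypothesis can yield a WER above 100%. A minimal
# illustrative call (values worked out by hand, not from the original file):
#   wer_list(references=["the cat sat"], hypotheses=["the cat sat down"])
#   -> {"wer": ~33.3, "del_rate": 0.0, "ins_rate": ~33.3, "sub_rate": 0.0}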


def wer_single(r, h):
    """
    Word error statistics for a single reference/hypothesis string pair.
    """
    r = r.strip().split()
    h = h.strip().split()
    edit_distance_matrix = edit_distance(r=r, h=h)
    alignment, alignment_out = get_alignment(r=r, h=h, d=edit_distance_matrix)
    num_cor = np.sum([s == "C" for s in alignment])
    num_del = np.sum([s == "D" for s in alignment])
    num_ins = np.sum([s == "I" for s in alignment])
    num_sub = np.sum([s == "S" for s in alignment])
    num_err = num_del + num_ins + num_sub
    num_ref = len(r)
    return {
        "alignment": alignment,
        "alignment_out": alignment_out,
        "num_cor": num_cor,
        "num_del": num_del,
        "num_ins": num_ins,
        "num_sub": num_sub,
        "num_err": num_err,
        "num_ref": num_ref,
    }


def edit_distance(r, h):
    """
    Original Code from https://github.com/zszyellow/WER-in-python/blob/master/wer.py
    This function calculates the edit distance between the reference sentence
    and the hypothesis sentence via dynamic programming.

    Attributes:
        r -> the list of words produced by splitting the reference sentence.
        h -> the list of words produced by splitting the hypothesis sentence.
    """
    # use a dtype wide enough to hold weighted costs for long sentences
    # (np.uint8 would overflow once the accumulated cost exceeds 255)
    d = np.zeros((len(r) + 1, len(h) + 1), dtype=np.int32)
    for i in range(len(r) + 1):
        for j in range(len(h) + 1):
            if i == 0:
                d[0][j] = j * WER_COST_INS
            elif j == 0:
                d[i][0] = i * WER_COST_DEL
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            if r[i - 1] == h[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                substitute = d[i - 1][j - 1] + WER_COST_SUB
                insert = d[i][j - 1] + WER_COST_INS
                delete = d[i - 1][j] + WER_COST_DEL
                d[i][j] = min(substitute, insert, delete)
    return d
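# Worked example (illustrative, not part of the original file): for
# r = ["a", "b"] and h = ["a", "c"], the returned matrix has d[2][2] == 4,
# i.e. one substitution at cost WER_COST_SUB, and get_alignment() recovers
# the step sequence ["C", "S"] from it.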


def get_alignment(r, h, d):
    """
    Original Code from https://github.com/zszyellow/WER-in-python/blob/master/wer.py
    This function backtracks through the dynamic-programming matrix to recover
    the list of edit steps (correct, substitution, insertion, deletion).

    Attributes:
        r -> the list of words produced by splitting the reference sentence.
        h -> the list of words produced by splitting the hypothesis sentence.
        d -> the matrix built when calculating the edit distance of h and r.
    """
    x = len(r)
    y = len(h)
    max_len = 3 * (x + y)
    alignlist = []
    align_ref = ""
    align_hyp = ""
    alignment = ""
    while True:
        if (x <= 0 and y <= 0) or (len(alignlist) > max_len):
            break
        elif x >= 1 and y >= 1 and d[x][y] == d[x - 1][y - 1] and r[x - 1] == h[y - 1]:
            # correct word: reference and hypothesis agree at this position
            align_hyp = " " + h[y - 1] + align_hyp
            align_ref = " " + r[x - 1] + align_ref
            alignment = " " * (len(r[x - 1]) + 1) + alignment
            alignlist.append("C")
            x = max(x - 1, 0)
            y = max(y - 1, 0)
        elif x >= 1 and y >= 1 and d[x][y] == d[x - 1][y - 1] + WER_COST_SUB:
            # substitution: pad both words to equal width for the printout
            ml = max(len(h[y - 1]), len(r[x - 1]))
            align_hyp = " " + h[y - 1].ljust(ml) + align_hyp
            align_ref = " " + r[x - 1].ljust(ml) + align_ref
            alignment = " " + "S" + " " * (ml - 1) + alignment
            alignlist.append("S")
            x = max(x - 1, 0)
            y = max(y - 1, 0)
        elif y >= 1 and d[x][y] == d[x][y - 1] + WER_COST_INS:
            # insertion: hypothesis word with no reference counterpart
            align_hyp = " " + h[y - 1] + align_hyp
            align_ref = " " + "*" * len(h[y - 1]) + align_ref
            alignment = " " + "I" + " " * (len(h[y - 1]) - 1) + alignment
            alignlist.append("I")
            x = max(x, 0)
            y = max(y - 1, 0)
        else:
            # deletion: reference word missing from the hypothesis
            align_hyp = " " + "*" * len(r[x - 1]) + align_hyp
            align_ref = " " + r[x - 1] + align_ref
            alignment = " " + "D" + " " * (len(r[x - 1]) - 1) + alignment
            alignlist.append("D")
            x = max(x - 1, 0)
            y = max(y, 0)
    align_ref = align_ref[1:]
    align_hyp = align_hyp[1:]
    alignment = alignment[1:]
    return (
        # steps were collected while backtracking, so reverse into reading order
        alignlist[::-1],
        {"align_ref": align_ref, "align_hyp": align_hyp, "alignment": alignment},
    )
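

if __name__ == "__main__":
    # Minimal self-check sketch (illustrative, not part of the original file):
    # exercises only the metrics that need no external scoring libraries.
    refs = ["the cat sat on the mat", "hello world"]
    hyps = ["the cat sat on mat", "hello there world"]
    print("sequence accuracy:", sequence_accuracy(refs, hyps))
    print("token accuracy:   ", token_accuracy(refs, hyps))
    print("WER breakdown:    ", wer_list(references=refs, hypotheses=hyps))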