metrics.py
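# A minimal module docstring, added as a sketch; the file names in the example
# invocation are hypothetical placeholders, not files shipped with the repo.
"""Evaluation metrics for generated text: corpus BLEU, iBLEU, and distinct-1/2.

Each of --ref, --gen, and --src is a plain-text file with one sentence per
line. Example invocation (hypothetical file names):

    python metrics.py --ref ref.txt --gen gen.txt --src src.txt --lowercase
"""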
import argparse
from nltk import wordpunct_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.probability import FreqDist
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu

# Uniform n-gram weights for BLEU-1 through BLEU-4.
BLEU_WEIGHTS_MEAN = [
    [1.0],
    [0.5, 0.5],
    [1 / 3, 1 / 3, 1 / 3],
    [0.25, 0.25, 0.25, 0.25],
]
# method0 performs no smoothing; pass this (or another SmoothingFunction
# method) through to the BLEU helpers below if smoothing is wanted.
SMOOTHING_FUNCTION = SmoothingFunction().method0


def calculate_ngram_diversity(corpus):
    """
    Calculates unigram and bigram diversity (distinct-1 and distinct-2).

    Args:
        corpus: flat list of tokens from the sampled sentences
    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score
    """
    bigram_finder = BigramCollocationFinder.from_words(corpus)
    try:
        # Number of distinct bigrams over the finder's total count.
        bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N
    except ZeroDivisionError:
        print("Division by zero in dist-2 calculation")
        bi_diversity = 0

    dist = FreqDist(corpus)
    try:
        # Number of distinct unigrams over the total token count.
        uni_diversity = len(dist) / len(corpus)
    except ZeroDivisionError:
        print("Division by zero in dist-1 calculation")
        uni_diversity = 0

    return uni_diversity, bi_diversity
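
# Illustrative check (hypothetical tokens): the stream ["a", "b", "a"] has
# 2 distinct unigrams over 3 tokens (dist-1 = 2/3) and 2 distinct bigrams,
# ("a", "b") and ("b", "a"); assuming NLTK's finder.N counts tokens, dist-2
# is also 2/3.
#
#     calculate_ngram_diversity(["a", "b", "a"])  # -> (0.666..., 0.666...)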


def i_corpus_bleu(
    list_of_references,
    hypotheses,
    inputs,
    alpha=0.9,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Corpus-level iBLEU (Sun and Zhou, 2012):

        iBLEU = alpha * BLEU(hypotheses, references)
                - (1 - alpha) * BLEU(hypotheses, inputs)

    The second term penalizes hypotheses that merely copy their inputs.
    """
    # corpus_bleu expects one list of references per hypothesis, so each
    # input sentence is wrapped in a singleton list.
    list_of_inputs = [[i] for i in inputs]
    bleu = corpus_bleu(
        list_of_references,
        hypotheses,
        weights=weights,
        smoothing_function=smoothing_function,
        auto_reweigh=auto_reweigh,
    )
    penalty = corpus_bleu(
        list_of_inputs,
        hypotheses,
        weights=weights,
        smoothing_function=smoothing_function,
        auto_reweigh=auto_reweigh,
    )
    return alpha * bleu - (1 - alpha) * penalty
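
# Sketch of the trade-off (hypothetical tokenized data): a hypothesis that
# matches both its reference and its source exactly scores BLEU = 1.0 on each
# term, so iBLEU = 0.9 * 1.0 - 0.1 * 1.0 = 0.8; matching only the reference
# would push the score toward the full 0.9.
#
#     refs = [[["the", "cat", "sat", "down"]]]
#     hyps = [["the", "cat", "sat", "down"]]
#     srcs = [["the", "cat", "sat", "down"]]
#     i_corpus_bleu(refs, hyps, srcs)  # -> 0.8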


def load_data(file, lower=False):
    """Read one sentence per line and tokenize each with wordpunct_tokenize."""
    with open(file, "r", encoding="utf8") as of:
        strs = [line.strip() for line in of]
    return [wordpunct_tokenize(seq.lower() if lower else seq) for seq in strs]


def load_ref_data(file, lower=False):
    """Like load_data, but wraps each tokenized line in a singleton list,
    matching the list-of-references-per-hypothesis format corpus_bleu expects.
    """
    with open(file, "r", encoding="utf8") as of:
        strs = [line.strip() for line in of]
    return [[wordpunct_tokenize(seq.lower() if lower else seq)] for seq in strs]


def metric(args):
    infer = load_data(args.gen, lower=args.lowercase)
    ori = load_data(args.src, lower=args.lowercase)
    golden = load_ref_data(args.ref, lower=args.lowercase)

    # BLEU-1..4 against the references, in percent.
    nltk_bleu = [corpus_bleu(golden, infer, weights=w) * 100 for w in BLEU_WEIGHTS_MEAN]
    print("BLEU", nltk_bleu)

    # iBLEU-1..4: BLEU minus a penalty for copying the source.
    nltk_ibleu = [
        i_corpus_bleu(golden, infer, ori, weights=w) * 100 for w in BLEU_WEIGHTS_MEAN
    ]
    print("iBLEU", nltk_ibleu)

    # distinct-1 / distinct-2 over the flattened generated tokens.
    tokens = [token for sentence in infer for token in sentence]
    dist_1, dist_2 = calculate_ngram_diversity(tokens)
    print("distinct", [dist_1 * 100, dist_2 * 100])

    # Total token counts of the generations and the references.
    print("length-gen", sum(len(sentence) for sentence in infer))
    print("length-ref", sum(len(sent) for ref in golden for sent in ref))


def eval_bleu(golden, infer, weights_list=None):
    """Corpus BLEU-1..4 (in percent) for tokenized references and hypotheses."""
    if weights_list is None:
        weights_list = BLEU_WEIGHTS_MEAN
    return [corpus_bleu(golden, infer, weights=w) * 100 for w in weights_list]
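
# eval_bleu packages the four BLEU computations from metric() for callers that
# import this module, e.g. eval_bleu(golden, infer) with the same tokenized
# structures metric() builds.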


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ref", default=None, type=str, help="reference file, one sentence per line")
    parser.add_argument("--gen", default=None, type=str, help="generated (hypothesis) file, one sentence per line")
    parser.add_argument("--src", default=None, type=str, help="source/input file, one sentence per line")
    parser.add_argument("--lowercase", action="store_true", help="lowercase text before tokenizing")
    args = parser.parse_args()
    metric(args)