Evaluation module #88

Open · wants to merge 4 commits into base: master
11 changes: 11 additions & 0 deletions .gitignore
@@ -1,3 +1,14 @@
# Pycharm
.idea/*
data/*
venv*/*
*.perl
*.en
*.de
*.chkpt
*.txt
recurrent_models/logs*/*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
12 changes: 9 additions & 3 deletions README.md
@@ -65,14 +65,21 @@ python preprocess.py -train_src data/multi30k/train.en.atok -train_tgt data/mult

### 2) Train the model
```bash
python train.py -data data/multi30k.atok.low.pt -save_model trained -save_mode best -proj_share_weight -label_smoothing
python train.py -data data/multi30k.atok.low.pt -save_model trained_en-de -save_mode best -proj_share_weight -label_smoothing
```
> If your source and target language share one common vocabulary, use the `-embs_share_weight` flag to let the model share the source/target word embeddings.
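
For example, the training command could be extended along these lines (a sketch; only valid if the preprocessed `.pt` file was built with one shared source/target vocabulary):
```bash
python train.py -data data/multi30k.atok.low.pt -save_model trained_en-de -save_mode best -proj_share_weight -embs_share_weight -label_smoothing
```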

### 3) Test the model
```bash
python translate.py -model trained.chkpt -vocab data/multi30k.atok.low.pt -src data/multi30k/test.en.atok -no_cuda
python translate.py -output pred_en-de.txt -model trained_en-de.chkpt -vocab data/multi30k.atok.low.pt -src data/multi30k/test.en.atok -no_cuda
```

### 4) Evaluate the model
```bash
python evaluate.py -hyp pred_en-de.txt -ref data/multi30k/test.de -out scores_en-de.txt
```
> This reports BLEU, GLEU and WER scores. Check out [NLPMetrics](https://github.com/gcunhase/NLPMetrics) for more evaluation metrics.
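
The output file `scores_en-de.txt` contains one line per metric and averaging mode, along these lines (numbers shown only as placeholders):
```
BLEU corpus: 0.xxxx
BLEU sent_average: 0.xxxx

GLEU corpus: 0.xxxx
GLEU sent_average: 0.xxxx

WER sent_average: x.xxxx
```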

---
# Performance
## Training
@@ -95,7 +102,6 @@ python translate.py -model trained.chkpt -vocab data/multi30k.atok.low.pt -src d
- coming soon.
---
# TODO
- Evaluation on the generated text.
- Attention weight plot.
---
# Acknowledgement
220 changes: 220 additions & 0 deletions evaluate.py
@@ -0,0 +1,220 @@

import nltk
import nltk.translate.gleu_score as gleu
import nltk.translate.bleu_score as bleu

import numpy
import os
import argparse

__author__ = "Gwena Cunha"


""" Class that provides methods to calculate similarity between two files
Each line can be composed of multiple sentences

python evaluate.py -hyp pred_en-de.txt -ref data/multi30k/test.de -out scores_en-de.txt
"""

# Constants
BLEU_NAME = "BLEU"
GOOGLE_BLEU_NAME = "GLEU" # "Google-BLEU"
WER_NAME = "WER"


def project_dir_name():
current_dir = os.path.abspath(os.path.dirname(__file__))
project_dir = os.path.abspath(current_dir + "/../") + "/"
return project_dir


class TextScore:

def __init__(self):
print("Initialize Machine Translation text score")

        # The Punkt tokenizer is needed to split each line into sentences
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')

def score_multiple_from_file(self, ref_file, hyp_file, scores_file, score_type=BLEU_NAME, average_prec="corpus"):
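        """ Runs every metric whose name appears in `score_type` and returns a list of
        (corpus_score, sentence_average_score) tuples, one tuple per metric requested.
        """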
        # Truncate scores_file if it already exists
open(scores_file, 'w').close()

scores = []
if BLEU_NAME in score_type:
scores.append(self.score_one_from_file(ref_file, hyp_file, scores_file, score_type=BLEU_NAME, average_prec=average_prec))

if GOOGLE_BLEU_NAME in score_type:
scores.append(self.score_one_from_file(ref_file, hyp_file, scores_file, score_type=GOOGLE_BLEU_NAME, average_prec=average_prec))

if WER_NAME in score_type:
scores.append(self.score_one_from_file(ref_file, hyp_file, scores_file, score_type=WER_NAME, average_prec=average_prec))

return scores

def score_one_from_file(self, ref_file, hyp_file, scores_file, score_type="BLEU", average_prec="corpus"):
""" Calculates score of file where each line is a text corresponding to the same hyp line
Doesn't treat cases of multiple references for the same hypotheses

:param ref_file: text file of reference sentences
:param hyp_file: text file of sentences generated by model
:param scores_file: text file with scores
        :param score_type: any combination of "BLEU", "GLEU" and "WER" (matched by substring)
:param average_prec: "corpus", "sent_average" or both ("corpus sent_average")
:return: final score
"""

        hf = open(hyp_file, "r")
        hypothesis = hf.read().split("\n")
        hf.close()
        num_sentences = len(hypothesis) - 1

        rf = open(ref_file, "r")
        reference = rf.read().split("\n")
        rf.close()

        sf = open(scores_file, "a+")

list_of_references = []
hypotheses = []
real_num_sentences = 0
for i in range(0, num_sentences):
if len(reference[i].strip()) != 0 or len(hypothesis[i].strip()) != 0:
                # Hypothesis lines end with </s>; keep only the text before it
hypothesis_i = hypothesis[i].split('</s>')[0]
                # Replace punctuation marks with spaces
table = str.maketrans("!?.,-|", 6*" ")
sentences_ref = reference[i].translate(table)
sentence_hyp = hypothesis_i.translate(table)
# Don't consider <unk> in hyp
sentence_hyp = sentence_hyp.replace('<unk>', ' ')
# Get sentences (if more than 1 per line)
sentences_ref = nltk.sent_tokenize(sentences_ref.lower())
sentence_hyp = nltk.sent_tokenize(sentence_hyp.lower())
                # Pair up the ref/hyp sentences from this line and collect them for scoring
                # (zip stops at the shorter list if the sentence counts differ)
for sent_ref, sent_hyp in zip(sentences_ref, sentence_hyp):
ref, hypo = sent_ref.split(), sent_hyp.split()
list_of_references.append([ref])
hypotheses.append(hypo)
real_num_sentences += 1

print("Sentences: " + str(real_num_sentences))
scores_str = ""
score_corpus, score_sent = None, None

# Corpus: only relevant for BLEU and GLEU (Google-BLEU)
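        # (NLTK provides corpus-level BLEU/GLEU; WER has no corpus-level variant here,
        # so it is only reported as a per-sentence average below.)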
if "corpus" in average_prec and (WER_NAME not in score_type):
score_corpus = self.corpus_score(list_of_references, hypotheses, score_type=score_type)
scores_str += score_type + " corpus: " + str(format(score_corpus, '.4f')) + "\n"
if "sent_average" in average_prec:
score_sent = self.sentence_average_score(list_of_references, hypotheses, score_type=score_type)
scores_str += score_type + " sent_average: " + str(format(score_sent, '.4f')) + "\n"

scores_str += "\n"
sf.write(scores_str)
sf.close()

return score_corpus, score_sent

def corpus_score(self, list_of_references, hypotheses, score_type="BLEU"):
""" Score specifically implemented for corpus

:param list_of_references: list of reference texts
:param hypotheses: hypotheses relative to reference
:param score_type: metric being used
:return: corpus score
"""

corpus_score = None
if BLEU_NAME in score_type:
corpus_score = bleu.corpus_bleu(list_of_references, hypotheses)
elif GOOGLE_BLEU_NAME in score_type:
corpus_score = gleu.corpus_gleu(list_of_references, hypotheses)

print("%s corpus score: %.4f" % (score_type, corpus_score))
return corpus_score

def sentence_average_score(self, list_of_references, hypotheses, score_type="BLEU"):
""" Averages score applied for every sentence

:param list_of_references: list of reference texts (separated into words)
:param hypotheses: hypotheses relative to reference (separated into words)
:param score_type: metric being used
:return: average sentences score
"""

sent_average_score = 0
if BLEU_NAME in score_type:
for ref, hyp in zip(list_of_references, hypotheses):
sent_average_score += bleu.sentence_bleu(ref, hyp) # gram: default is between 1 and 4
elif GOOGLE_BLEU_NAME in score_type:
for ref, hyp in zip(list_of_references, hypotheses):
sent_average_score += gleu.sentence_gleu(ref, hyp) # gram: default is between 1 and 4
elif WER_NAME in score_type:
for ref, hyp in zip(list_of_references, hypotheses):
sent_average_score += self.wer_score(ref[0], hyp) # Assumes only 1 reference

sent_average_score /= len(list_of_references)

print("%s sentence average score: %.4f" % (score_type, sent_average_score))
return sent_average_score

def wer_score(self, ref, hyp):
""" Calculation of WER with Levenshtein distance.

Time/space complexity: O(nm)

Source: https://martin-thoma.com/word-error-rate-calculation/

:param ref: reference text (separated into words)
:param hyp: hypotheses text (separated into words)
:return: WER score
"""

# Initialization
        d = numpy.zeros([len(ref) + 1, len(hyp) + 1], dtype=numpy.int32)  # int32 avoids overflow for sentences longer than 255 words
for i in range(len(ref) + 1):
for j in range(len(hyp) + 1):
if i == 0:
d[0][j] = j
elif j == 0:
d[i][0] = i

# print(d)

# Computation
for i in range(1, len(ref) + 1):
for j in range(1, len(hyp) + 1):
if ref[i - 1] == hyp[j - 1]:
d[i][j] = d[i - 1][j - 1]
else:
substitution = d[i - 1][j - 1] + 1
insertion = d[i][j - 1] + 1
deletion = d[i - 1][j] + 1
d[i][j] = min(substitution, insertion, deletion)

# print(d)
return d[len(ref)][len(hyp)]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='evaluate.py')

parser.add_argument('-hyp', required=True,
help='Path to candidate document')
parser.add_argument('-ref', required=True,
help='Path to reference document')
parser.add_argument('-out', required=True,
help='Path to output text file with calculated scores')

args = parser.parse_args()
# params = vars(args)

print("Test score file")

# Initialize handler for text scores
text_score = TextScore()
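    # Metric names are matched by substring, so concatenating them requests BLEU, GLEU and WER;
    # average_prec only needs to contain "corpus" and/or "sent_average".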
text_score.score_multiple_from_file(ref_file=args.ref, hyp_file=args.hyp, scores_file=args.out,
score_type=BLEU_NAME + GOOGLE_BLEU_NAME + WER_NAME,
average_prec="corpus, sent_average")
11 changes: 11 additions & 0 deletions train.py
@@ -15,6 +15,9 @@
from dataset import TranslationDataset, paired_collate_fn
from transformer.Models import Transformer
from transformer.Optim import ScheduledOptim
from timeit import default_timer as timer
import os


def cal_performance(pred, gold, smoothing=False):
''' Apply label smoothing if needed '''
@@ -194,6 +197,7 @@ def main():
parser = argparse.ArgumentParser()

parser.add_argument('-data', required=True)
parser.add_argument('-results_dir', required=False, default='./')

parser.add_argument('-epoch', type=int, default=10)
parser.add_argument('-batch_size', type=int, default=64)
@@ -223,6 +227,10 @@ def main():
opt.cuda = not opt.no_cuda
opt.d_word_vec = opt.d_model

    #========= Ensure Results Directory Exists =========#
    os.makedirs(opt.results_dir, exist_ok=True)

#========= Loading Dataset =========#
data = torch.load(opt.data)
opt.max_token_seq_len = data['settings'].max_token_seq_len
@@ -290,4 +298,7 @@ def prepare_dataloaders(data, opt):


if __name__ == '__main__':
start_time = timer()
main()
total_time = timer() - start_time
print("Program ran for {:.4f} hours".format(total_time/3600))