Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix #225: Empty lines in WMT21/dev Icelandic-English #226

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 2 additions & 10 deletions sacrebleu/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,9 @@
"en-zh": ["wmt22-news-systems-1.1/xml/wmttest2022.en-zh.all.xml"],
"fr-de": ["wmt22-news-systems-1.1/xml/wmttest2022.fr-de.all.xml"],
"ja-en": ["wmt22-news-systems-1.1/xml/wmttest2022.ja-en.all.xml"],
"liv-en": {
"path": "wmt22-news-systems-1.1/xml/wmttest2022.liv-en.all.xml",
# no translator because data is English-original
"refs": [""],
},
"liv-en": ["wmt22-news-systems-1.1/xml/wmttest2022.liv-en.all.xml"],
"ru-en": ["wmt22-news-systems-1.1/xml/wmttest2022.ru-en.all.xml"],
"ru-sah": {
"path": "wmt22-news-systems-1.1/xml/wmttest2022.ru-sah.all.xml",
# no translator because data is Yakut-original
"refs": [""],
},
"ru-sah": ["wmt22-news-systems-1.1/xml/wmttest2022.ru-sah.all.xml"],
"sah-ru": ["wmt22-news-systems-1.1/xml/wmttest2022.sah-ru.all.xml"],
"uk-cs": ["wmt22-news-systems-1.1/xml/wmttest2022.uk-cs.all.xml"],
"uk-en": ["wmt22-news-systems-1.1/xml/wmttest2022.uk-en.all.xml"],
Expand Down
16 changes: 9 additions & 7 deletions sacrebleu/dataset/wmt_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,11 @@ def _unwrap_wmt21_or_later(raw_file):
tree = ET.parse(raw_file)
# Find and check the documents (src, ref, hyp)
src_langs, ref_langs, translators = set(), set(), set()
for src_doc in tree.getroot().findall(".//src"):
for src_doc, ref_doc in zip(tree.getroot().findall(".//src"), tree.getroot().findall(".//ref")):
src_langs.add(src_doc.get("lang"))

for ref_doc in tree.getroot().findall(".//ref"):
ref_langs.add(ref_doc.get("lang"))
translator = ref_doc.get("translator")
# The "translator" attribute is sometimes set on "src" instead of "ref" (e.g. in "wmt21/dev" is-en), so fall back to the "src" value.
translator = ref_doc.get("translator", src_doc.get("translator"))
translators.add(translator)

assert (
Expand Down Expand Up @@ -80,10 +79,13 @@ def get_sents(doc):
}

ref_docs = doc.findall(".//ref")
src_docs = doc.findall(".//src")

trans_to_ref = {
ref_doc.get("translator"): get_sents(ref_doc) for ref_doc in ref_docs
}
trans_to_ref = {}
for src_doc, ref_doc in zip(src_docs, ref_docs):
# The "translator" attribute is sometimes set on "src" instead of "ref" (e.g. in "wmt21/dev" is-en), so fall back to the "src" value.
translator = ref_doc.get("translator", src_doc.get("translator"))
trans_to_ref[translator] = get_sents(ref_doc)

hyp_docs = doc.findall(".//hyp")
hyps = {
Expand Down
5 changes: 0 additions & 5 deletions test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,6 @@ def test_wmt22_references():
# make sure ref:B is the one used by default
assert wmt22._get_langpair_allowed_refs("cs-en") == ["ref:B"]

# similar check for another dataset: there should be no default ("A"),
# and the only ref found should be the unannotated one
assert "ref:A" not in wmt22.fieldnames("liv-en")
assert "ref" in wmt22.fieldnames("liv-en")

# and that ref:A is the default for all languages where it wasn't overridden
for langpair, langpair_data in wmt22.langpairs.items():
if type(langpair_data) == dict:
Expand Down