From 74711c57d2595a985c06df032655603dabc91516 Mon Sep 17 00:00:00 2001 From: Moritz Gunz Date: Wed, 11 Sep 2024 13:48:12 +0200 Subject: [PATCH] Drop optional n-best header in CTM (#542) This optional header field leads to issues in GLM mapping, resulting in high (almost 100 percent) deletion rates, because all lines with fewer than 7 columns are dropped from the file. --- returnn/search.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/returnn/search.py b/returnn/search.py index 472460a5..62917862 100644 --- a/returnn/search.py +++ b/returnn/search.py @@ -449,7 +449,12 @@ def run(self): d = eval(util.uopen(self.recog_words_file.get_path(), "rt").read()) assert isinstance(d, dict), "only search output file with dict format is supported" with util.uopen(self.out_ctm_file.get_path(), "wt") as out: - out.write(";; []\n") + # Do not print optional [n-best] header, some downstream evaluation pipelines + # use the number of headers for validation. Since we do not print n-best-list + # information this validation fails and discards the entire search outputs. + # + # See https://github.com/rwth-i6/i6_core/pull/542. + out.write(";; \n") for seg in corpus.segments(): seg_start = 0.0 if seg.start == float("inf") else seg.start seg_end = 0.0 if seg.end == float("inf") else seg.end @@ -541,7 +546,12 @@ def run(self): else: seq_order = d.keys() with util.uopen(self.out_ctm_file.get_path(), "wt") as out: - out.write(";; []\n") + # Do not print optional [n-best] header, some downstream evaluation pipelines + # use the number of headers for validation. Since we do not print n-best-list + # information this validation fails and discards the entire search outputs. + # + # See https://github.com/rwth-i6/i6_core/pull/542. + out.write(";; \n") for seg_fullname in seq_order: assert isinstance( seg_fullname, str