From da557e8b5d263997e6a3bb2385e7de01ea69e292 Mon Sep 17 00:00:00 2001 From: Moritz Gunz Date: Wed, 11 Sep 2024 05:06:09 -0400 Subject: [PATCH 1/2] Drop optional CTM header This optional header field causes issues in GLM mapping, leading to high (almost 100 percent) deletion rates, because all lines with fewer than 7 columns are dropped from the file. --- returnn/search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/returnn/search.py b/returnn/search.py index 472460a5..248de437 100644 --- a/returnn/search.py +++ b/returnn/search.py @@ -449,7 +449,7 @@ def run(self): d = eval(util.uopen(self.recog_words_file.get_path(), "rt").read()) assert isinstance(d, dict), "only search output file with dict format is supported" with util.uopen(self.out_ctm_file.get_path(), "wt") as out: - out.write(";; []\n") + out.write(";; \n") for seg in corpus.segments(): seg_start = 0.0 if seg.start == float("inf") else seg.start seg_end = 0.0 if seg.end == float("inf") else seg.end @@ -541,7 +541,7 @@ def run(self): else: seq_order = d.keys() with util.uopen(self.out_ctm_file.get_path(), "wt") as out: - out.write(";; []\n") + out.write(";; \n") for seg_fullname in seq_order: assert isinstance( seg_fullname, str From 4297d2b364ac9be527cd2b8ec8f2b68e55a08b01 Mon Sep 17 00:00:00 2001 From: Moritz Gunz Date: Wed, 11 Sep 2024 06:08:30 -0400 Subject: [PATCH 2/2] add comment why header field is not printed --- returnn/search.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/returnn/search.py b/returnn/search.py index 248de437..62917862 100644 --- a/returnn/search.py +++ b/returnn/search.py @@ -449,6 +449,11 @@ def run(self): d = eval(util.uopen(self.recog_words_file.get_path(), "rt").read()) assert isinstance(d, dict), "only search output file with dict format is supported" with util.uopen(self.out_ctm_file.get_path(), "wt") as out: + # Do not print optional [n-best] header, some downstream evaluation pipelines + # use the number of headers for 
validation. Since we do not print n-best-list + # information this validation fails and discards the entire search outputs. + # + # See https://github.com/rwth-i6/i6_core/pull/542. out.write(";; \n") for seg in corpus.segments(): seg_start = 0.0 if seg.start == float("inf") else seg.start @@ -541,6 +546,11 @@ def run(self): else: seq_order = d.keys() with util.uopen(self.out_ctm_file.get_path(), "wt") as out: + # Do not print optional [n-best] header, some downstream evaluation pipelines + # use the number of headers for validation. Since we do not print n-best-list + # information this validation fails and discards the entire search outputs. + # + # See https://github.com/rwth-i6/i6_core/pull/542. out.write(";; \n") for seg_fullname in seq_order: assert isinstance(