From da557e8b5d263997e6a3bb2385e7de01ea69e292 Mon Sep 17 00:00:00 2001
From: Moritz Gunz <moritz.gunz@gmail.com>
Date: Wed, 11 Sep 2024 05:06:09 -0400
Subject: [PATCH 1/2] Drop optional CTM header

This optional header field leads to issues in GLM mapping and causes high (almost 100 percent) deletion rates, because all lines with fewer than 7 columns are dropped from the file.
---
 returnn/search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/returnn/search.py b/returnn/search.py
index 472460a5..248de437 100644
--- a/returnn/search.py
+++ b/returnn/search.py
@@ -449,7 +449,7 @@ def run(self):
         d = eval(util.uopen(self.recog_words_file.get_path(), "rt").read())
         assert isinstance(d, dict), "only search output file with dict format is supported"
         with util.uopen(self.out_ctm_file.get_path(), "wt") as out:
-            out.write(";; <name> <track> <start> <duration> <word> <confidence> [<n-best>]\n")
+            out.write(";; <name> <track> <start> <duration> <word> <confidence>\n")
             for seg in corpus.segments():
                 seg_start = 0.0 if seg.start == float("inf") else seg.start
                 seg_end = 0.0 if seg.end == float("inf") else seg.end
@@ -541,7 +541,7 @@ def run(self):
         else:
             seq_order = d.keys()
         with util.uopen(self.out_ctm_file.get_path(), "wt") as out:
-            out.write(";; <name> <track> <start> <duration> <word> <confidence> [<n-best>]\n")
+            out.write(";; <name> <track> <start> <duration> <word> <confidence>\n")
             for seg_fullname in seq_order:
                 assert isinstance(
                     seg_fullname, str

From 4297d2b364ac9be527cd2b8ec8f2b68e55a08b01 Mon Sep 17 00:00:00 2001
From: Moritz Gunz <moritz.gunz@gmail.com>
Date: Wed, 11 Sep 2024 06:08:30 -0400
Subject: [PATCH 2/2] add comment why header field is not printed

---
 returnn/search.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/returnn/search.py b/returnn/search.py
index 248de437..62917862 100644
--- a/returnn/search.py
+++ b/returnn/search.py
@@ -449,6 +449,11 @@ def run(self):
         d = eval(util.uopen(self.recog_words_file.get_path(), "rt").read())
         assert isinstance(d, dict), "only search output file with dict format is supported"
         with util.uopen(self.out_ctm_file.get_path(), "wt") as out:
+            # Do not print the optional [n-best] header field: some downstream evaluation
+            # pipelines use the number of header fields for validation. Since we do not print
+            # n-best-list information, that validation would fail and discard all search output.
+            #
+            # See https://github.com/rwth-i6/i6_core/pull/542.
             out.write(";; <name> <track> <start> <duration> <word> <confidence>\n")
             for seg in corpus.segments():
                 seg_start = 0.0 if seg.start == float("inf") else seg.start
@@ -541,6 +546,11 @@ def run(self):
         else:
             seq_order = d.keys()
         with util.uopen(self.out_ctm_file.get_path(), "wt") as out:
+            # Do not print the optional [n-best] header field: some downstream evaluation
+            # pipelines use the number of header fields for validation. Since we do not print
+            # n-best-list information, that validation would fail and discard all search output.
+            #
+            # See https://github.com/rwth-i6/i6_core/pull/542.
             out.write(";; <name> <track> <start> <duration> <word> <confidence>\n")
             for seg_fullname in seq_order:
                 assert isinstance(