From 74711c57d2595a985c06df032655603dabc91516 Mon Sep 17 00:00:00 2001
From: Moritz Gunz <moritz.gunz@gmail.com>
Date: Wed, 11 Sep 2024 13:48:12 +0200
Subject: [PATCH] Drop optional n-best header in CTM (#542)

This optional header field leads to issues in GLM mapping, causing high (almost 100 percent) deletion rates, because all lines with fewer than 7 columns are dropped from the file.
---
 returnn/search.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/returnn/search.py b/returnn/search.py
index 472460a5..62917862 100644
--- a/returnn/search.py
+++ b/returnn/search.py
@@ -449,7 +449,12 @@ def run(self):
         d = eval(util.uopen(self.recog_words_file.get_path(), "rt").read())
         assert isinstance(d, dict), "only search output file with dict format is supported"
         with util.uopen(self.out_ctm_file.get_path(), "wt") as out:
-            out.write(";; <name> <track> <start> <duration> <word> <confidence> [<n-best>]\n")
+            # Do not print the optional [<n-best>] header field: some downstream evaluation
+            # pipelines use the number of header fields for validation. Since we do not print
+            # n-best-list information, this validation fails and the entire search output is discarded.
+            #
+            # See https://github.com/rwth-i6/i6_core/pull/542.
+            out.write(";; <name> <track> <start> <duration> <word> <confidence>\n")
             for seg in corpus.segments():
                 seg_start = 0.0 if seg.start == float("inf") else seg.start
                 seg_end = 0.0 if seg.end == float("inf") else seg.end
@@ -541,7 +546,12 @@ def run(self):
         else:
             seq_order = d.keys()
         with util.uopen(self.out_ctm_file.get_path(), "wt") as out:
-            out.write(";; <name> <track> <start> <duration> <word> <confidence> [<n-best>]\n")
+            # Do not print the optional [<n-best>] header field: some downstream evaluation
+            # pipelines use the number of header fields for validation. Since we do not print
+            # n-best-list information, this validation fails and the entire search output is discarded.
+            #
+            # See https://github.com/rwth-i6/i6_core/pull/542.
+            out.write(";; <name> <track> <start> <duration> <word> <confidence>\n")
             for seg_fullname in seq_order:
                 assert isinstance(
                     seg_fullname, str