From d0cf85ee3137959a1667bc4d84b74e7fce3911a8 Mon Sep 17 00:00:00 2001
From: Moritz Gunz <moritz.gunz@gmail.com>
Date: Tue, 3 Sep 2024 10:23:14 +0200
Subject: [PATCH 1/4] Parse `TextDict`s chunkwise to avoid `OverflowError`

---
 returnn/search.py | 26 ++++++++------------------
 text/convert.py   | 10 ++++------
 util.py           | 21 +++++++++++++++++++++
 3 files changed, 33 insertions(+), 24 deletions(-)
diff --git a/returnn/search.py b/returnn/search.py
index 472460a5..a4bff091 100644
--- a/returnn/search.py
+++ b/returnn/search.py
@@ -348,8 +348,7 @@ def tasks(self):
         yield Task("run", mini_task=True)
 
     def run(self):
-        d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(d, dict)  # seq_tag -> bpe string
+        d = util.parse_text_dict(self.search_py_output)
         assert not os.path.exists(self.out_word_search_results.get_path())
         with util.uopen(self.out_word_search_results, "wt") as out:
             out.write("{\n")
@@ -400,8 +399,7 @@ def tasks(self):
         yield Task("run", mini_task=True)
 
     def run(self):
-        d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(d, dict)  # seq_tag -> bpe string
+        d = util.parse_text_dict(self.search_py_output)
         assert not os.path.exists(self.out_search_results.get_path())
 
         def _transform_text(s: str):
@@ -446,8 +444,7 @@ def tasks(self):
     def run(self):
         corpus = Corpus()
         corpus.load(self.bliss_corpus.get_path())
-        d = eval(util.uopen(self.recog_words_file.get_path(), "rt").read())
-        assert isinstance(d, dict), "only search output file with dict format is supported"
+        d = util.parse_text_dict(self.recog_words_file)
         with util.uopen(self.out_ctm_file.get_path(), "wt") as out:
             out.write(";; <name> <track> <start> <duration> <word> <confidence> [<n-best>]\n")
             for seg in corpus.segments():
@@ -531,10 +528,7 @@ def tasks(self):
         yield Task("run", mini_task=True)
 
     def run(self):
-        # nan/inf should not be needed, but avoids errors at this point and will print an error below,
-        # that we don't expect an N-best list here.
-        d = eval(util.uopen(self.recog_words_file, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(d, dict), "only search output file with dict format is supported"
+        d = util.parse_text_dict(self.recog_words_file)
         if self.seq_order_file is not None:
             seq_order = eval(util.uopen(self.seq_order_file, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
             assert isinstance(seq_order, (dict, list, tuple))
@@ -647,8 +641,7 @@ def tasks(self):
 
     def run(self):
         """run"""
-        d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(d, dict)  # seq_tag -> bpe string
+        d = util.parse_text_dict(self.search_py_output)
         assert not os.path.exists(self.out_best_search_results.get_path())
         with util.uopen(self.out_best_search_results, "wt") as out:
             out.write("{\n")
@@ -686,8 +679,7 @@ def tasks(self):
 
     def run(self):
         """run"""
-        d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(d, dict)  # seq_tag -> bpe string
+        d = util.parse_text_dict(self.search_py_output)
         assert not os.path.exists(self.out_search_results.get_path())
         with util.uopen(self.out_search_results, "wt") as out:
             out.write("{\n")
@@ -727,8 +719,7 @@ def tasks(self):
 
     def run(self):
         """run"""
-        d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(d, dict)  # seq_tag -> bpe string
+        d = util.parse_text_dict(self.search_py_output)
         assert not os.path.exists(self.out_search_results.get_path())
         with util.uopen(self.out_search_results, "wt") as out:
             out.write("{\n")
@@ -786,8 +777,7 @@ def logsumexp(*args):
             lsp = numpy.log(sum(numpy.exp(a - a_max) for a in args))
             return a_max + lsp
 
-        d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(d, dict)  # seq_tag -> bpe string
+        d = util.parse_text_dict(self.search_py_output)
         assert not os.path.exists(self.out_search_results.get_path())
         with util.uopen(self.out_search_results, "wt") as out:
             out.write("{\n")
diff --git a/text/convert.py b/text/convert.py
index f9079feb..a832aa80 100644
--- a/text/convert.py
+++ b/text/convert.py
@@ -3,10 +3,10 @@
     "TextDictToStmJob",
 ]
 
-from typing import Optional, Union, Sequence, Dict, List, Tuple
+from typing import Union, Sequence, Dict, Tuple
 import re
 from sisyphus import Job, Path, Task
-from i6_core.util import uopen
+from i6_core.util import parse_text_dict, uopen
 
 
 class TextDictToTextLinesJob(Job):
@@ -30,8 +30,7 @@ def tasks(self):
     def run(self):
         # nan/inf should not be needed, but avoids errors at this point and will print an error below,
         # that we don't expect an N-best list here.
-        d = eval(uopen(self.text_dict, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(d, dict)  # seq_tag -> text
+        d = parse_text_dict(self.text_dict)
 
         with uopen(self.out_text_lines, "wt") as out:
             for seq_tag, entry in d.items():
@@ -83,8 +82,7 @@ def tasks(self):
     def run(self):
         # nan/inf should not be needed, but avoids errors at this point and will print an error below,
         # that we don't expect an N-best list here.
-        c = eval(uopen(self.text_dict, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
-        assert isinstance(c, dict)
+        c = parse_text_dict(self.text_dict)
 
         all_tags = [
             ("d%d" % i, "default%d" % i, "all other segments of category %d" % i)
diff --git a/util.py b/util.py
index 4bc8fe56..617096a2 100644
--- a/util.py
+++ b/util.py
@@ -383,3 +383,24 @@ def update_nested_dict(dict1: Dict[str, Any], dict2: Dict[str, Any]):
         else:
             dict1[k] = v
     return dict1
+
+
+def parse_text_dict(path: Union[str, tk.Path]) -> Dict[str, str]:
+    """
+    Loads the text dict at :param:`path` making sure not to trigger line counter overflow.
+    """
+
+    with uopen(path, "rt") as text_dict_file:
+        txt = text_dict_file.read()
+
+    # remove leading and trailing dict brackets
+    txt = txt.strip().strip("{}").strip()
+
+    lines = txt.splitlines()
+    result = {
+        k: v
+        # parse chunkwise to avoid line counter overflow when the text dict is very large
+        for chunk in chunks(lines, max(1, len(lines) // 1000))
+        for k, v in eval("\n".join(["{", *chunk, "}"]), {"nan": float("nan"), "inf": float("inf")}).items()
+    }
+    return result

From f25fb80a0678fc5260d13ad0bb905f31e4d09004 Mon Sep 17 00:00:00 2001
From: Moritz Gunz <moritz.gunz@gmail.com>
Date: Tue, 3 Sep 2024 11:52:51 +0200
Subject: [PATCH 2/4] Simplify implementation, simply strip newlines

---
 util.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/util.py b/util.py
index 617096a2..4a17ee7d 100644
--- a/util.py
+++ b/util.py
@@ -387,20 +387,15 @@ def update_nested_dict(dict1: Dict[str, Any], dict2: Dict[str, Any]):
 
 def parse_text_dict(path: Union[str, tk.Path]) -> Dict[str, str]:
     """
-    Loads the text dict at :param:`path` making sure not to trigger line counter overflow.
+    Loads the text dict at :param:`path`.
+
+    Works around https://github.com/rwth-i6/i6_core/issues/539 by stripping the newlines
+    from the text dict before parsing.
     """
 
     with uopen(path, "rt") as text_dict_file:
-        txt = text_dict_file.read()
-
-    # remove leading and trailing dict brackets
-    txt = txt.strip().strip("{}").strip()
-
-    lines = txt.splitlines()
-    result = {
-        k: v
-        # parse chunkwise to avoid line counter overflow when the text dict is very large
-        for chunk in chunks(lines, max(1, len(lines) // 1000))
-        for k, v in eval("\n".join(["{", *chunk, "}"]), {"nan": float("nan"), "inf": float("inf")}).items()
-    }
-    return result
+        # removing the newlines works around an overflow of the line number table in python3.10
+        txt = "".join(line for line in text_dict_file)
+    d = eval(txt, {"nan": float("nan"), "inf": float("inf")})
+    assert isinstance(d, dict), f"expected a text dict, but found {type(d)}"
+    return d

From 0721c944c6b348b24d9faa22a13c6476c60f55f5 Mon Sep 17 00:00:00 2001
From: Moritz Gunz <moritz.gunz@gmail.com>
Date: Tue, 3 Sep 2024 11:54:07 +0200
Subject: [PATCH 3/4] improve docs

---
 util.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/util.py b/util.py
index 4a17ee7d..eb2a7fc4 100644
--- a/util.py
+++ b/util.py
@@ -389,12 +389,11 @@ def parse_text_dict(path: Union[str, tk.Path]) -> Dict[str, str]:
     """
     Loads the text dict at :param:`path`.
 
-    Works around https://github.com/rwth-i6/i6_core/issues/539 by stripping the newlines
-    from the text dict before parsing.
+    Works around https://github.com/rwth-i6/i6_core/issues/539 (``OverflowError: line number table is too long``)
+    by stripping the newlines from the text dict before the ``eval``.
     """
 
     with uopen(path, "rt") as text_dict_file:
-        # removing the newlines works around an overflow of the line number table in python3.10
         txt = "".join(line for line in text_dict_file)
     d = eval(txt, {"nan": float("nan"), "inf": float("inf")})
     assert isinstance(d, dict), f"expected a text dict, but found {type(d)}"

From f11632908656d78f00bee5fdfeb0678028f00333 Mon Sep 17 00:00:00 2001
From: Moritz Gunz <moritz.gunz@gmail.com>
Date: Tue, 3 Sep 2024 11:56:54 +0200
Subject: [PATCH 4/4] file iter doesn't strip newlines

---
 util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util.py b/util.py
index eb2a7fc4..3abb0cad 100644
--- a/util.py
+++ b/util.py
@@ -394,7 +394,7 @@ def parse_text_dict(path: Union[str, tk.Path]) -> Dict[str, str]:
     """
 
     with uopen(path, "rt") as text_dict_file:
-        txt = "".join(line for line in text_dict_file)
+        txt = "".join(line.strip() for line in text_dict_file)
     d = eval(txt, {"nan": float("nan"), "inf": float("inf")})
     assert isinstance(d, dict), f"expected a text dict, but found {type(d)}"
     return d