Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Strip newlines when parsing TextDicts to avoid OverflowError #540

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 8 additions & 18 deletions returnn/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,7 @@ def tasks(self):
yield Task("run", mini_task=True)

def run(self):
d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(d, dict) # seq_tag -> bpe string
d = util.parse_text_dict(self.search_py_output)
assert not os.path.exists(self.out_word_search_results.get_path())
with util.uopen(self.out_word_search_results, "wt") as out:
out.write("{\n")
Expand Down Expand Up @@ -400,8 +399,7 @@ def tasks(self):
yield Task("run", mini_task=True)

def run(self):
d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(d, dict) # seq_tag -> bpe string
d = util.parse_text_dict(self.search_py_output)
assert not os.path.exists(self.out_search_results.get_path())

def _transform_text(s: str):
Expand Down Expand Up @@ -446,8 +444,7 @@ def tasks(self):
def run(self):
corpus = Corpus()
corpus.load(self.bliss_corpus.get_path())
d = eval(util.uopen(self.recog_words_file.get_path(), "rt").read())
assert isinstance(d, dict), "only search output file with dict format is supported"
d = util.parse_text_dict(self.recog_words_file)
with util.uopen(self.out_ctm_file.get_path(), "wt") as out:
# Do not print optional [n-best] header, some downstream evaluation pipelines
# use the number of headers for validation. Since we do not print n-best-list
Expand Down Expand Up @@ -536,10 +533,7 @@ def tasks(self):
yield Task("run", mini_task=True)

def run(self):
# nan/inf should not be needed, but avoids errors at this point and will print an error below,
# that we don't expect an N-best list here.
d = eval(util.uopen(self.recog_words_file, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(d, dict), "only search output file with dict format is supported"
d = util.parse_text_dict(self.recog_words_file)
if self.seq_order_file is not None:
seq_order = eval(util.uopen(self.seq_order_file, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(seq_order, (dict, list, tuple))
Expand Down Expand Up @@ -657,8 +651,7 @@ def tasks(self):

def run(self):
"""run"""
d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(d, dict) # seq_tag -> bpe string
d = util.parse_text_dict(self.search_py_output)
assert not os.path.exists(self.out_best_search_results.get_path())
with util.uopen(self.out_best_search_results, "wt") as out:
out.write("{\n")
Expand Down Expand Up @@ -696,8 +689,7 @@ def tasks(self):

def run(self):
"""run"""
d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(d, dict) # seq_tag -> bpe string
d = util.parse_text_dict(self.search_py_output)
assert not os.path.exists(self.out_search_results.get_path())
with util.uopen(self.out_search_results, "wt") as out:
out.write("{\n")
Expand Down Expand Up @@ -737,8 +729,7 @@ def tasks(self):

def run(self):
"""run"""
d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(d, dict) # seq_tag -> bpe string
d = util.parse_text_dict(self.search_py_output)
assert not os.path.exists(self.out_search_results.get_path())
with util.uopen(self.out_search_results, "wt") as out:
out.write("{\n")
Expand Down Expand Up @@ -796,8 +787,7 @@ def logsumexp(*args):
lsp = numpy.log(sum(numpy.exp(a - a_max) for a in args))
return a_max + lsp

d = eval(util.uopen(self.search_py_output, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(d, dict) # seq_tag -> bpe string
d = util.parse_text_dict(self.search_py_output)
assert not os.path.exists(self.out_search_results.get_path())
with util.uopen(self.out_search_results, "wt") as out:
out.write("{\n")
Expand Down
10 changes: 4 additions & 6 deletions text/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
"TextDictToStmJob",
]

from typing import Optional, Union, Sequence, Dict, List, Tuple
from typing import Union, Sequence, Dict, Tuple
import re
from sisyphus import Job, Path, Task
from i6_core.util import uopen
from i6_core.util import parse_text_dict, uopen


class TextDictToTextLinesJob(Job):
Expand All @@ -30,8 +30,7 @@ def tasks(self):
def run(self):
# nan/inf should not be needed, but avoids errors at this point and will print an error below,
# that we don't expect an N-best list here.
d = eval(uopen(self.text_dict, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(d, dict) # seq_tag -> text
d = parse_text_dict(self.text_dict)

with uopen(self.out_text_lines, "wt") as out:
for seq_tag, entry in d.items():
Expand Down Expand Up @@ -83,8 +82,7 @@ def tasks(self):
def run(self):
# nan/inf should not be needed, but avoids errors at this point and will print an error below,
# that we don't expect an N-best list here.
c = eval(uopen(self.text_dict, "rt").read(), {"nan": float("nan"), "inf": float("inf")})
assert isinstance(c, dict)
c = parse_text_dict(self.text_dict)

all_tags = [
("d%d" % i, "default%d" % i, "all other segments of category %d" % i)
Expand Down
15 changes: 15 additions & 0 deletions util.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,3 +383,18 @@ def update_nested_dict(dict1: Dict[str, Any], dict2: Dict[str, Any]):
else:
dict1[k] = v
return dict1


def parse_text_dict(path: Union[str, tk.Path]) -> Dict[str, str]:
    """
    Loads the text dict at :param:`path`.

    Works around https://github.com/rwth-i6/i6_core/issues/539 (``OverflowError: line number table is too long``)
    by collapsing the text dict onto a single line before the ``eval``.

    The stripped lines are joined with a single space instead of an empty string, so that tokens which were
    separated only by a line break (e.g. ``x if c`` followed by ``else y``) are not fused into one token.

    :param path: location of the text dict file, opened through ``uopen`` in text mode
    :return: the dict the file content evaluates to
    :raises AssertionError: if the file content does not evaluate to a dict
    """

    with uopen(path, "rt") as text_dict_file:
        # Inside the dict literal a line break is plain whitespace, so a space is the faithful replacement.
        # The result is still a single line, which is what avoids the line number table overflow.
        txt = " ".join(line.strip() for line in text_dict_file)
    # NOTE(review): ``eval`` executes arbitrary code; this must only be used on trusted files.
    # ``nan`` and ``inf`` are made available as names so that such bare tokens in the file evaluate to floats.
    d = eval(txt, {"nan": float("nan"), "inf": float("inf")})
    assert isinstance(d, dict), f"expected a text dict, but found {type(d)}"
    return d
Loading