Skip to content

Commit

Permalink
Adding another possibility to provide the num_words in order to compute this dynamically (#545)
Browse files: browse the repository at this point in the history

* Adding another possibility to provide the num_words in order to compute this dynamically

* Update lm/vocabulary.py

Co-authored-by: michelwi <[email protected]>

---------

Co-authored-by: Javier Jorge Cano <[email protected]>
Co-authored-by: Nick Rossenbach <[email protected]>
Co-authored-by: michelwi <[email protected]>
  • Loading branch information
4 people authored Nov 4, 2024
1 parent 1a2cee5 commit 5bfefda
Showing 1 changed file with 5 additions and 3 deletions.
8 changes: 5 additions & 3 deletions lm/vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ class VocabularyFromTextJob(Job):
Extract vocabulary from given text files based on frequency.
"""

def __init__(self, file_paths: List[tk.Path], num_words: int = 1_000_000):
def __init__(self, file_paths: List[tk.Path], num_words: Union[int, tk.Variable] = 1_000_000):
"""
:param file_paths: paths to the text files
:param num_words: expected size of the vocabulary
Expand All @@ -185,12 +185,14 @@ def run(self):
words = line.strip().split()
counter.update(words)

cutoff = min(self.num_words, len(counter))
num_words = self.num_words.get() if isinstance(self.num_words, tk.Variable) else self.num_words

cutoff = min(num_words, len(counter))

with open(self.out_vocabulary, "w") as vocabulary, open(
self.out_vocabulary_with_counts, "w"
) as vocabulary_with_counts:
for (word, count) in counter.most_common(cutoff):
for word, count in counter.most_common(cutoff):
vocabulary.write(f"{word}\n")
vocabulary_with_counts.write(f"{word} {count}\n")

Expand Down

0 comments on commit 5bfefda

Please sign in to comment.