Skip to content

Commit

Permalink
improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
christophmluscher committed Nov 14, 2024
1 parent 11e3ac4 commit c415d28
Showing 1 changed file with 21 additions and 5 deletions.
26 changes: 21 additions & 5 deletions text/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,34 @@


class SentenceLengthHistogramJob(Job):
def __init__(self, corpus_path: tk.Path, max_chars: Optional[int] = None, max_words: Optional[int] = None):
def __init__(
self,
input_text: tk.Path,
max_chars: Optional[int] = None,
max_words: Optional[int] = None,
full_histogram: bool = True,
):
"""
Job computes a histogram of the sentence lengths. `max_chars`/`max_words` can be set to count the number of
removed lines.
:param corpus_path: text file path.
:param input_text: text file path.
:param max_chars: maximum number of characters per line.
Any line with more characters than this amount won't be added to the final variables.
:param max_words: maximum number of words per line.
Any line with more words than this amount won't be added to the final variables.
:param full_histogram: the removed lines will be shown in the histogram.
"""
self.corpus_path = corpus_path
self.input_text = input_text
self.max_chars = max_chars
self.max_words = max_words
self.full_histogram = full_histogram

self.out_num_sentences_full_corpus = self.output_var("num_sentences_full_corpus.txt")
self.out_num_sentences_removed = self.output_var("num_sentences_removed.txt")
self.out_word_plot = self.output_path("sentence_length_word_histogram.png")
self.out_char_plot = self.output_path("sentence_length_char_histogram.png")
self.out_sentences = self.output_path("sentences.txt.gz")
self.out_removed_sentences = self.output_path("removed_sentences.txt.gz")

self.rqmt = {"cpu": 1, "mem": 4, "time": 6}

Expand All @@ -37,7 +47,9 @@ def run(self):
counter_chars = Counter()
num_sentences_full_corpus = 0
num_sentences_removed = 0
with uopen(self.corpus_path.get_path(), "rt") as f_in, uopen(self.out_sentences.get_path(), "wt") as f_out:
with uopen(self.input_text.get_path(), "rt") as f_in, uopen(
self.out_removed_sentences.get_path(), "wt"
) as f_out:
for line in f_in:
num_sentences_full_corpus += 1
line = line.strip()
Expand All @@ -57,6 +69,10 @@ def run(self):
self.out_num_sentences_full_corpus.set(num_sentences_full_corpus)
self.out_num_sentences_removed.set(num_sentences_removed)

if not self.full_histogram:
counter_words = Counter({k: v for k, v in counter_words.items() if k <= self.max_words})
counter_chars = Counter({k: v for k, v in counter_chars.items() if k <= self.max_chars})

self._plot(counter_words, self.out_word_plot.get_path())
self._plot(counter_chars, self.out_char_plot.get_path())

Expand Down

0 comments on commit c415d28

Please sign in to comment.