Skip to content

Commit

Permalink
fix: Move helper function out into root
Browse files: browse the repository at this point in the history
  • Loading branch information
saattrupdan committed Oct 25, 2023
1 parent 2c76a26 commit 27750d1
Showing 1 changed file with 34 additions and 9 deletions.
43 changes: 34 additions & 9 deletions src/coral_models/data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Functions related to the data loading and processing"""

from functools import partial
import logging
import os
import re
Expand Down Expand Up @@ -277,15 +278,13 @@ def clean_dataset(
f"[^{re.escape(cfg.characters_to_keep)}]"
)

def clean_examples(example: dict) -> dict:
example["text"] = clean_transcription(
doc=example["text"],
mapped = dataset.map(
partial(
clean_example,
non_standard_characters_regex=non_standard_characters_regex,
conversion_dict=conversion_dict,
)
return example

mapped = dataset.map(clean_examples)
)

# After calling `map` the DatasetInfo is lost, so we need to add it back in
for split in dataset.keys():
Expand All @@ -294,6 +293,32 @@ def clean_examples(example: dict) -> dict:
return mapped


def clean_example(
    example: dict,
    non_standard_characters_regex: re.Pattern[str],
    conversion_dict: dict[str, str],
) -> dict:
    """Clean the transcription stored in a single example.

    The value under the "text" key is passed through `clean_transcription` and
    the result is written back to the same key of the given example.

    Args:
        example:
            The example to be cleaned. Its "text" entry is overwritten.
        non_standard_characters_regex:
            A compiled regex expression that matches all non-standard characters.
        conversion_dict:
            A dictionary of characters to be converted.

    Returns:
        The example that was passed in, with its "text" entry cleaned.
    """
    cleaned_text = clean_transcription(
        doc=example["text"],
        non_standard_characters_regex=non_standard_characters_regex,
        conversion_dict=conversion_dict,
    )
    example["text"] = cleaned_text
    return example


def clean_transcription(
doc: str,
non_standard_characters_regex: re.Pattern[str],
Expand All @@ -302,11 +327,11 @@ def clean_transcription(
"""Cleans the transcription of a document.
Args:
doc (str):
doc:
A document to be cleaned.
non_standard_characters_regex (compiled regex expression):
non_standard_characters_regex:
A compiled regex expression that matches all non-standard characters.
conversion_dict (dict[str, str]):
conversion_dict:
A dictionary of characters to be converted.
Returns:
Expand Down

0 comments on commit 27750d1

Please sign in to comment.