Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix typing in corpus/filter.py #450

Merged
merged 3 commits into from
Sep 6, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 37 additions & 26 deletions corpus/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,24 @@
import numpy as np
import re
import xml.etree.cElementTree as ET
from typing import Dict

from i6_core.util import MultiOutputPath
from typing import Dict, List, Optional, Union

from i6_core import rasr
from i6_core.lib import corpus
from i6_core.util import chunks, uopen
from i6_core.util import chunks, uopen, MultiOutputPath

from sisyphus import *

Path = setup_path(__package__)


class FilterSegmentsByListJob(Job):
def __init__(self, segment_files, filter_list, invert_match=False):
def __init__(self, segment_files: Dict[int, Path], filter_list: Union[List[str], Path], invert_match: bool = False):
"""
Filters segment list file using a given list of segments, which is either used as black or as white list
:param dict[int,Path] segment_files: original segment list files to be filtered
:param Union[list, Path] filter_list: list used for filtering or a path to a text file containing the entries of
that list one per line
:param bool invert_match: black list (if False) or white list (if True) usage
:param segment_files: original segment list files to be filtered
:param filter_list: list used for filtering or a path to a text file with the entries of that list one per line
:param invert_match: use the filter list as a black list (if False) or as a white list (if True)
"""
assert isinstance(filter_list, tk.Path) or isinstance(filter_list, list)
self.segment_files = segment_files
Expand Down Expand Up @@ -111,14 +109,20 @@ def run(self):


class FilterSegmentsByAlignmentConfidenceJob(Job):
def __init__(self, alignment_logs, percentile, crp=None, plot=True, absolute_threshold=None):
def __init__(
self,
alignment_logs: Dict[int, Path],
percentile: float,
crp: Optional[rasr.CommonRasrParameters] = None,
plot: bool = True,
absolute_threshold: Optional[float] = None,
):
"""
:param dict[int,Path] alignment_logs: alignment_job.out_log_file; task_id -> log_file
:param float percentile: percent of alignment segments to keep. should be in (0,100]. for :func:`np.percentile`
:param float absolute_threshold: alignments with score above this number are discarded
:param Optional[rasr.crp.CommonRasrParameters] crp: used to set the number of output segments.
if none, number of alignment log files is used instead.
:param bool plot: plot the distribution of alignment scores
:param alignment_logs: alignment_job.out_log_file; task_id -> log_file
:param percentile: percentage of alignment segments to keep; should be in (0,100]. Used for :func:`np.percentile`
:param crp: used to set the number of output segments. If None, the number of alignment log files is used instead.
:param plot: plot the distribution of alignment scores
:param absolute_threshold: alignments with score above this number are discarded
"""
self.alignment_logs = alignment_logs # alignment_job.log_file
self.percentile = percentile
Expand Down Expand Up @@ -191,13 +195,20 @@ def run(self):
class FilterCorpusBySegmentsJob(Job):
__sis_hash_exclude__ = {"delete_empty_recordings": False}

def __init__(self, bliss_corpus, segment_file, compressed=False, invert_match=False, delete_empty_recordings=False):
def __init__(
self,
bliss_corpus: Path,
segment_file: Union[List[Path], Path],
compressed: bool = False,
invert_match: bool = False,
delete_empty_recordings: bool = False,
):
"""
:param Path bliss_corpus:
:param list[Path]|Path segment_file: a single segment file or a list of segment files
:param bool compressed:
:param bool invert_match:
:param bool delete_empty_recordings: if true, empty recordings will be removed
:param bliss_corpus:
:param segment_file: a single segment file or a list of segment files
:param compressed:
:param invert_match:
:param delete_empty_recordings: if true, empty recordings will be removed
"""
self.bliss_corpus = bliss_corpus
self.segment_file_list = [segment_file] if isinstance(segment_file, tk.Path) else segment_file
Expand Down Expand Up @@ -314,11 +325,11 @@ def unknown_filter(corpus: corpus.Corpus, recording: corpus.Recording, segment:


class FilterCorpusBySegmentDurationJob(Job):
def __init__(self, bliss_corpus, min_duration=0.1, max_duration=120.0):
def __init__(self, bliss_corpus: Path, min_duration: float = 0.1, max_duration: float = 120.0):
"""
:param Path bliss_corpus: path of the corpus file
:param float min_duration: minimum duration for a segment to keep (in seconds)
:param float max_duration: maximum duration for a segment to keep (in seconds)
:param bliss_corpus: path of the corpus file
:param min_duration: minimum duration for a segment to keep (in seconds)
:param max_duration: maximum duration for a segment to keep (in seconds)
"""
self.bliss_corpus = bliss_corpus
self.min_duration = min_duration
Expand Down
Loading