diff --git a/lexicon/conversion.py b/lexicon/conversion.py index de42e34a..f9502cbd 100644 --- a/lexicon/conversion.py +++ b/lexicon/conversion.py @@ -99,9 +99,9 @@ def run(self): class LexiconUniqueOrthJob(Job): """Merge lemmata with the same orthography.""" - __sis_hash_exclude__ = {"merge_multi_orths_lemmata": False} + __sis_hash_exclude__ = {"merge_multi_orths_lemmata": False, "deduplicate_special_lemmata": False} - def __init__(self, bliss_lexicon, merge_multi_orths_lemmata=False): + def __init__(self, bliss_lexicon, merge_multi_orths_lemmata=False, deduplicate_special_lemmata=False): """ :param tk.Path bliss_lexicon: lexicon file to be handeled :param bool merge_multi_orths_lemmata: if True, also lemmata containing @@ -119,11 +119,14 @@ def __init__(self, bliss_lexicon, merge_multi_orths_lemmata=False): ** having a synt <=> synt is not None this could lead to INFORMATION LOSS if there are several different synt token sequences in the to-be-merged lemmata + :param bool deduplicate_special_lemmata: if True, special lemmata will also be considered + in the unique process. """ self.set_vis_name("Make Lexicon Orths Unique") self.bliss_lexicon = bliss_lexicon self.merge_multi_orths_lemmata = merge_multi_orths_lemmata + self.deduplicate_special_lemmata = deduplicate_special_lemmata self.out_bliss_lexicon = self.output_path(os.path.basename(tk.uncached_path(bliss_lexicon)), cached=True) @@ -137,7 +140,7 @@ def run(self): orth2lemmata = collections.defaultdict(list) for lemma in lex.lemmata: - if lemma.special: + if lemma.special and not self.deduplicate_special_lemmata: continue num_orths = len(lemma.orth) if num_orths < 1: diff --git a/lexicon/modification.py b/lexicon/modification.py index d30c0562..4236b299 100644 --- a/lexicon/modification.py +++ b/lexicon/modification.py @@ -130,16 +130,22 @@ class MergeLexiconJob(Job): will create a new lexicon that might be incompatible to previously generated alignments. 
""" - def __init__(self, bliss_lexica, sort_phonemes=False, sort_lemmata=False, compressed=True): + __sis_hash_exclude__ = {"deduplicate_lemmata": False} + + def __init__( + self, bliss_lexica, sort_phonemes=False, sort_lemmata=False, compressed=True, deduplicate_lemmata=False + ): """ :param list[Path] bliss_lexica: list of bliss lexicon files (plain or gz) :param bool sort_phonemes: sort phoneme inventory alphabetically :param bool sort_lemmata: sort lemmata alphabetically based on first orth entry :param bool compressed: compress final lexicon + :param bool deduplicate_lemmata: whether to deduplicate lemmatas, only applied when sort_lemmata=True """ self.lexica = bliss_lexica self.sort_phonemes = sort_phonemes self.sort_lemmata = sort_lemmata + self.deduplicate_lemmata = deduplicate_lemmata self.out_bliss_lexicon = self.output_path("lexicon.xml.gz" if compressed else "lexicon.xml") @@ -178,6 +184,10 @@ def run(self): for lemma in lex.lemmata: # sort by first orth entry orth_key = lemma.orth[0] if lemma.orth else "" + if self.deduplicate_lemmata: + # don't add the lemma when there's already an equal lemma + if len(lemma_dict[orth_key]) > 0 and lemma == lemma_dict[orth_key][0]: + continue lemma_dict[orth_key].append(lemma) merged_lex.lemmata = list(itertools.chain(*[lemma_dict[key] for key in sorted(lemma_dict.keys())])) else: diff --git a/lib/lexicon.py b/lib/lexicon.py index c0793cea..bd18f83d 100644 --- a/lib/lexicon.py +++ b/lib/lexicon.py @@ -3,10 +3,13 @@ For format details visit: `https://www-i6.informatik.rwth-aachen.de/rwth-asr/manual/index.php/Lexicon`_ """ +from __future__ import annotations + __all__ = ["Lemma", "Lexicon"] from collections import OrderedDict -from typing import Optional, List +import itertools +from typing import Optional, List, Set import xml.etree.ElementTree as ET from i6_core.util import uopen @@ -104,6 +107,42 @@ def from_element(cls, e): synt = None if not synt else synt[0] return Lemma(orth, phon, synt, eval, special) + def 
def _equals(self, other: Lemma, *, same_order: bool = True) -> bool:
    """
    Check for lemma equality, field by field.

    :param other: other lemma to compare ``self`` to
    :param same_order: if True, the element lists (orth, phon, synt, eval) must match
        in order; if False they are compared as sets, i.e. ordering and duplicates
        are ignored
    :return: whether ``self`` and ``other`` are considered equal
    """
    if same_order:
        return (
            self.orth == other.orth
            and self.phon == other.phon
            and self.special == other.special
            and self.synt == other.synt
            and self.eval == other.eval
        )

    # A synt of None means "no synt element", which must stay distinct from an
    # empty token sequence, so only build sets when both sides actually have one.
    if self.synt is not None and other.synt is not None:
        equal_synt = set(self.synt) == set(other.synt)
    else:
        equal_synt = self.synt == other.synt

    # NOTE(review): flattening via chain(*self.eval) discards the grouping of
    # tokens into separate eval entries, so two structurally different eval
    # lists can compare equal here -- confirm this information loss is intended.
    return (
        set(self.orth) == set(other.orth)
        and set(self.phon) == set(other.phon)
        and self.special == other.special
        and equal_synt
        and set(itertools.chain(*self.eval)) == set(itertools.chain(*other.eval))
    )

def __eq__(self, other) -> bool:
    """Order-insensitive equality; see :meth:`_equals` with ``same_order=False``."""
    # Return NotImplemented for non-Lemma operands so Python can try the
    # reflected comparison instead of raising AttributeError (e.g. on
    # ``lemma == None`` or comparison against arbitrary objects).
    if not isinstance(other, Lemma):
        return NotImplemented
    # NOTE(review): defining __eq__ implicitly sets __hash__ = None, making
    # Lemma unhashable -- confirm no caller puts lemmata into sets/dict keys.
    return self._equals(other, same_order=False)

def __ne__(self, other) -> bool:
    """Negation of :meth:`__eq__`, propagating ``NotImplemented`` correctly."""
    result = self.__eq__(other)
    if result is NotImplemented:
        # ``not NotImplemented`` would wrongly evaluate to False; let Python
        # fall back to the other operand's comparison instead.
        return NotImplemented
    return not result