rwth-i6 · Icemole · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023 · Oct 27, 2023
diff --git a/lexicon/conversion.py b/lexicon/conversion.py
@@ -99,9 +99,9 @@ def run(self):
 class LexiconUniqueOrthJob(Job):
     """Merge lemmata with the same orthography."""
 
-    __sis_hash_exclude__ = {"merge_multi_orths_lemmata": False}
+    __sis_hash_exclude__ = {"merge_multi_orths_lemmata": False, "deduplicate_special_lemmata": False}
 
-    def __init__(self, bliss_lexicon, merge_multi_orths_lemmata=False):
+    def __init__(self, bliss_lexicon, merge_multi_orths_lemmata=False, deduplicate_special_lemmata=False):
         """
         :param tk.Path bliss_lexicon: lexicon file to be handeled
         :param bool merge_multi_orths_lemmata: if True, also lemmata containing
@@ -119,11 +119,14 @@ def __init__(self, bliss_lexicon, merge_multi_orths_lemmata=False):
                     ** having a synt <=> synt is not None
                 this could lead to INFORMATION LOSS if there are several
                 different synt token sequences in the to-be-merged lemmata
+        :param bool deduplicate_special_lemmata: if True, special lemmata are no longer skipped
+            and take part in the orth-based uniqueness merge like regular lemmata.
         """
         self.set_vis_name("Make Lexicon Orths Unique")
 
         self.bliss_lexicon = bliss_lexicon
         self.merge_multi_orths_lemmata = merge_multi_orths_lemmata
+        self.deduplicate_special_lemmata = deduplicate_special_lemmata
 
         self.out_bliss_lexicon = self.output_path(os.path.basename(tk.uncached_path(bliss_lexicon)), cached=True)
 
@@ -137,7 +140,7 @@ def run(self):
         orth2lemmata = collections.defaultdict(list)
 
         for lemma in lex.lemmata:
-            if lemma.special:
+            if lemma.special and not self.deduplicate_special_lemmata:
                 continue
             num_orths = len(lemma.orth)
             if num_orths < 1:

diff --git a/lexicon/modification.py b/lexicon/modification.py
@@ -130,16 +130,22 @@ class MergeLexiconJob(Job):
     will create a new lexicon that might be incompatible to previously generated alignments.
     """
 
-    def __init__(self, bliss_lexica, sort_phonemes=False, sort_lemmata=False, compressed=True):
+    __sis_hash_exclude__ = {"deduplicate_lemmata": False}
+
+    def __init__(
+        self, bliss_lexica, sort_phonemes=False, sort_lemmata=False, compressed=True, deduplicate_lemmata=False
+    ):
         """
         :param list[Path] bliss_lexica: list of bliss lexicon files (plain or gz)
         :param bool sort_phonemes: sort phoneme inventory alphabetically
         :param bool sort_lemmata: sort lemmata alphabetically based on first orth entry
         :param bool compressed: compress final lexicon
+        :param bool deduplicate_lemmata: whether to deduplicate lemmata, only applied when sort_lemmata=True
         """
         self.lexica = bliss_lexica
         self.sort_phonemes = sort_phonemes
         self.sort_lemmata = sort_lemmata
+        self.deduplicate_lemmata = deduplicate_lemmata
 
         self.out_bliss_lexicon = self.output_path("lexicon.xml.gz" if compressed else "lexicon.xml")
 
@@ -178,7 +184,10 @@ def run(self):
                 for lemma in lex.lemmata:
                     # sort by first orth entry
                     orth_key = lemma.orth[0] if lemma.orth else ""
+                    if self.deduplicate_lemmata and lemma in lemma_dict[orth_key]:
+                        # skip lemmata equal to ANY lemma already stored under this orth, not only the first
+                        continue
                     lemma_dict[orth_key].append(lemma)
             merged_lex.lemmata = list(itertools.chain(*[lemma_dict[key] for key in sorted(lemma_dict.keys())]))
         else:
             for lex in lexica:

diff --git a/lib/lexicon.py b/lib/lexicon.py
@@ -3,10 +3,13 @@
 
 For format details visit: `https://www-i6.informatik.rwth-aachen.de/rwth-asr/manual/index.php/Lexicon`_
 """
+from __future__ import annotations
+
 __all__ = ["Lemma", "Lexicon"]
 
 from collections import OrderedDict
-from typing import Optional, List
+import itertools
+from typing import Optional, List, Set
 import xml.etree.ElementTree as ET
 
 from i6_core.util import uopen
@@ -104,6 +107,42 @@ def from_element(cls, e):
         synt = None if not synt else synt[0]
         return Lemma(orth, phon, synt, eval, special)
 
+    def _equals(self, other: Lemma, same_order: bool = True) -> bool:
+        """
+        Compare the orth, phon, special, synt and eval fields of two lemmata.
+
+        :param other: Lemma that this lemma is compared against.
+        :param same_order: If True, all fields have to match exactly, element order included.
+            If False, orth, phon, synt (when set on both sides) and the flattened eval
+            are compared as sets, so neither order nor repetition of elements matters.
+        :return: True if all compared fields match, False otherwise.
+        """
+        if same_order:
+            field_names = ("orth", "phon", "special", "synt", "eval")
+            return all(getattr(self, name) == getattr(other, name) for name in field_names)
+
+        # a missing synt (None) cannot be turned into a set, so that case falls back to plain equality
+        if self.synt is None or other.synt is None:
+            synt_matches = self.synt == other.synt
+        else:
+            synt_matches = set(self.synt) == set(other.synt)
+
+        if not (set(self.orth) == set(other.orth) and set(self.phon) == set(other.phon)):
+            return False
+        if not (self.special == other.special and synt_matches):
+            return False
+
+        # eval is flattened first, so only the set of its inner elements is compared
+        own_eval = set(itertools.chain.from_iterable(self.eval))
+        other_eval = set(itertools.chain.from_iterable(other.eval))
+        return own_eval == other_eval
+
+    def __eq__(self, other: object) -> bool:
+        """Order-insensitive lemma equality; ``!=`` is derived from this by Python's default ``__ne__``."""
+        if not isinstance(other, Lemma):
+            return NotImplemented  # defer to the other operand instead of raising AttributeError
+        return self._equals(other, same_order=False)
+
 
 class Lexicon:
     """