Add match grouping by spans and set this as default output type

Closes #122
direct-phonology · Jul 13, 2024 · 63cb42a · 63cb42a
1 parent 240f4b0
commit 63cb42a
Show file tree

Hide file tree

Showing 5 changed files with 292 additions and 111 deletions.
diff --git a/dphon/cli.py b/dphon/cli.py
@@ -1,12 +1,12 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """dphon - a tool for old chinese phonetic analysis
- 
+
 Usage:
     dphon -h | --help
     dphon --version
-    dphon [-v | -vv] [options] <path>... 
- 
+    dphon [-v | -vv] [options] <path>...
+
 Global Options:
     -h, --help
         Show this help text and exit.
@@ -38,7 +38,7 @@
 
     -l <NUM>, --len-limit <NUM>     [default: 50]
         Compare at most NUM tokens when obtaining the similarity score. A
-        higher number will slow down execution time but return more matches. 
+        higher number will slow down execution time but return more matches.
 
     -c <NUM>, --context <NUM>       [default: 4]
         Add NUM tokens of context to each side of matches. Context displays with
@@ -56,14 +56,14 @@
         less than the value for "--ngram-order".
 
     --max <NUM>                     [default: 64]
-        Limit to matches with total number of tokens <= NUM. Must be equal to 
+        Limit to matches with total number of tokens <= NUM. Must be equal to
         or greater than the value for "--ngram-order".
 
 Examples:
     dphon texts/*.txt --min 8 > matches.txt
     dphon file1.txt file2.txt --ngram-order 8 --threshold 0.8
     dphon docs.jsonl --input-format jsonl --output-format jsonl > matches.jsonl
- 
+
 Help:
     For more information on using this tool, visit the Github repository:
     https://github.com/direct-phonology/dphon
@@ -155,8 +155,11 @@ def run() -> None:
     else:
         # use system pager by default; colorize if LESS=R
         with console.pager(styles=os.getenv("LESS", "") == "R"):
-            for match in results:
-                console.print(match)
+            for doc in graph.docs:
+                for group in doc._.groups:
+                    console.print(group, "\n")
+            # for match in results:
+            #     console.print(match)
 
 
 def setup(args: Dict) -> Language:
@@ -168,6 +171,8 @@ def setup(args: Dict) -> Language:
     # add Doc metadata
     if not Doc.has_extension("id"):
         Doc.set_extension("id", default="")
+    if not Doc.has_extension("groups"):
+        Doc.set_extension("groups", default=[])
 
     # setup spaCy model
     nlp = spacy.blank("zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
@@ -192,7 +197,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
     start = time.perf_counter()
     for doc, context in nlp.pipe(load_texts(args["<path>"]), as_tuples=True):
         doc._.id = context["id"]
-        graph.add_doc(context["id"], doc)
+        graph.add_doc(doc)
         logging.debug(f'indexed doc "{doc._.id}"')
     stop = time.perf_counter() - start
     logging.info(f"indexed {graph.number_of_docs()} docs in {stop:.1f}s")
@@ -243,6 +248,9 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
     # align all matches
     graph.align(SmithWatermanPhoneticAligner(gap_char="　"))
 
+    # group all matches
+    graph.group()
+
     # limit via min and max lengths if requested
     if args["--min"]:
         graph.filter(lambda m: len(m) >= int(args["--min"]))

diff --git a/dphon/console.py b/dphon/console.py
@@ -1,12 +1,12 @@
-from typing import Tuple, List
+from typing import List, Tuple
 
 from rich.console import Console
 from rich.highlighter import RegexHighlighter
 from rich.theme import Theme
+from spacy.tokens import Span
 
-from .match import Match
 from .g2p import GraphemesToPhonemes
-
+from .match import Match
 
 # Default color scheme for highlighting matches
 DEFAULT_THEME = Theme(
@@ -45,91 +45,99 @@ def format_match(self, match: Match) -> Tuple[str, str]:
         Adds markup for highlighting insertions, mismatches, etc. If context is
         set, also adds highlighted context to either end of the match.
         """
+        return (
+            self.format_span(match.utxt, match.vtxt, match.au, match.av),
+            self.format_span(match.vtxt, match.utxt, match.av, match.au),
+        )
 
-        su, sv = self._mark(match)
+    def transcribe_match(self, match: Match) -> Tuple[str, str]:
+        """Transcribe both sides of a Match phonemically, as a (u, v) pair."""
+        return self.transcribe_span(match.utxt), self.transcribe_span(match.vtxt)
+
+    def format_span(
+        self,
+        span: Span,
+        other: Span = None,
+        alignment: str = None,
+        other_alignment: str = None,
+    ) -> str:
+        """Return a Span as a Rich format string, with optional context.
+
+        Adds markup for highlighting insertions, mismatches, etc. if a second
+        reference Span and both alignments are provided. If context is set,
+        also adds highlighted context to either end of the Span.
+        """
+        highlighted_span = self._mark_span(span, alignment, other, other_alignment)
         if self.context > 0:
-            cul, cur, cvl, cvr = self._add_context(match)
-            su = cul + su + cur
-            sv = cvl + sv + cvr
-        return su, sv
+            context_left, context_right = self._add_span_context(span)
+            return context_left + highlighted_span + context_right
+        return highlighted_span  # no context requested: marked-up Span only
 
-    def _mark(self, match: Match) -> Tuple[str, str]:
-        """Mark up the match for colorization with a theme.
+    def transcribe_span(self, span: Span) -> str:
+        """Render a Span's syllables as a "*"-prefixed, space-separated string."""
+        return f"*{' '.join(span._.syllables)}"
 
-        - Adds markup for insertions (tokens in one sequence but not another).
+    def _mark_span(
+        self, span: Span, alignment: str, other: Span, other_alignment: str
+    ) -> str:
+        """Mark up a Span for colorization with a theme, in relation to another Span.
+
+        - Adds markup for insertions (tokens in one Span but not another).
         - Adds markup for mismatches (differing tokens in the same position).
         - Adds markup for graphic variants (mismatches with same phonemes).
         """
-
-        # if no alignment, just convert to strings because we can't highlight
-        if not match.au or not match.av:
-            return match.utxt.text, match.vtxt.text
+        # if no alignment, just return the text because we can't highlight
+        if not alignment or not other or not other_alignment:
+            return span.text
 
         # o(N) implementation: step through each sequence adding markup
-        # TODO convert to a DFA so there's less markup repetition
-        su: List[str] = []
-        sv: List[str] = []
-        u_ptr = 0
-        v_ptr = 0
-        for i in range(len(match)):
-
+        # TODO convert to a DFA so there's less markup repetition?
+        marked_span: List[str] = []
+        span_ptr = other_ptr = 0  # token cursors; a gap column consumes no token
+        for char, other_char in zip(alignment, other_alignment):
             # gap in u: insertion in v (if not punctuation)
-            if match.au[i] == self.gap_char and match.av[i].isalnum():
-                su.append(match.au[i])
-                sv.append(f"[insertion]{match.av[i]}[/insertion]")
-                v_ptr += 1
+            if char == self.gap_char:  # only `other` consumed a token here
+                marked_span.append(char)
+                other_ptr += 1
                 continue
 
             # gap in v: insertion in u (if not punctuation)
-            if match.av[i] == self.gap_char and match.au[i].isalnum():
-                su.append(f"[insertion]{match.au[i]}[/insertion]")
-                sv.append(match.av[i])
-                u_ptr += 1
+            if other_char == self.gap_char:  # only `span` consumed a token here
+                insertion = f"[insertion]{char}[/insertion]"
+                marked_span.append(insertion if char.isalnum() else char)
+                span_ptr += 1
                 continue
 
             # variants (both u and v)
-            if self.g2p.are_graphic_variants(match.utxt[u_ptr], match.vtxt[v_ptr]):
-                su.append(f"[variant]{match.au[i]}[/variant]")
-                sv.append(f"[variant]{match.av[i]}[/variant]")
-                u_ptr += 1
-                v_ptr += 1
+            if self.g2p.are_graphic_variants(span[span_ptr], other[other_ptr]):
+                marked_span.append(f"[variant]{char}[/variant]")
+                span_ptr += 1
+                other_ptr += 1
                 continue
 
             # mismatch (both u and v) - only highlight if alphanumeric
-            if match.au[i] != match.av[i]:
-                if match.au[i].isalnum() and match.av[i].isalnum():
-                    su.append(f"[mismatch]{match.au[i]}[/mismatch]")
-                    sv.append(f"[mismatch]{match.av[i]}[/mismatch]")
-                    u_ptr += 1
-                    v_ptr += 1
+            if char != other_char:
+                if char.isalnum() and other_char.isalnum():
+                    marked_span.append(f"[mismatch]{char}[/mismatch]")
+                    span_ptr += 1
+                    other_ptr += 1
                     continue
 
             # equality; nothing to highlight
-            su.append(match.au[i])
-            sv.append(match.av[i])
-            u_ptr += 1
-            v_ptr += 1
+            marked_span.append(char)
+            span_ptr += 1
+            other_ptr += 1
 
-        return "".join(su), "".join(sv)
+        return "".join(marked_span)
 
-    def _add_context(self, match: Match) -> Tuple[str, str, str, str]:
-        """Add context to either side of the match sequences.
+    def _add_span_context(self, span: Span) -> Tuple[str, str]:
+        """Add context to either side of the Span.
 
         Context coloration can be changed by the default theme; a dim appearance
         is used in terminals.
         """
-
-        utxt, vtxt = match.utxt, match.vtxt
-        u, v = utxt.doc, vtxt.doc
-        cul = f"[context]{u[utxt.start-self.context:utxt.start]}[/context]"
-        cur = f"[context]{u[utxt.end:utxt.end+self.context]}[/context]"
-        cvl = f"[context]{v[vtxt.start-self.context:vtxt.start]}[/context]"
-        cvr = f"[context]{v[vtxt.end:vtxt.end+self.context]}[/context]"
-        return (cul, cur, cvl, cvr)
-
-    def transcription(self, match: Match) -> Tuple[str, str]:
-        """Get the phonemic transcription for the match for display."""
-        return (
-            "*" + " ".join(match.utxt._.syllables),
-            "*" + " ".join(match.vtxt._.syllables),
+        context_left = (  # clamp at 0: a negative slice start would wrap around
+            f"[context]{span.doc[max(span.start-self.context, 0):span.start]}[/context]"
         )
+        context_right = f"[context]{span.doc[span.end:span.end+self.context]}[/context]"
+        return context_left, context_right
diff --git a/dphon/match.py b/dphon/match.py
@@ -6,9 +6,9 @@
 from typing import Dict, List, NamedTuple
 
 import Levenshtein as Lev
-from rich.padding import Padding
 from rich.console import Console, ConsoleOptions, RenderResult
-from spacy.tokens import Span
+from rich.padding import Padding
+from spacy.tokens import Doc, Span
 
 
 class Match(NamedTuple):