From ebeb2491fd75f3e296b163b1979394d24867bde1 Mon Sep 17 00:00:00 2001
From: Nick Budak
Date: Sat, 13 Jul 2024 12:14:03 -0700
Subject: [PATCH] Add match grouping by spans and set this as default output type

Closes #122
---
 dphon/cli.py             |  24 +++++---
 dphon/console.py         | 136 +++++++++++++++++++++------------------
 dphon/match.py           |   4 +-
 dphon/reuse.py           | 135 ++++++++++++++++++++++++++++++++++----
 tests/unit/test_reuse.py | 110 ++++++++++++++++++++++++--------
 5 files changed, 298 insertions(+), 111 deletions(-)

diff --git a/dphon/cli.py b/dphon/cli.py
index 50d89ee..0883b8a 100644
--- a/dphon/cli.py
+++ b/dphon/cli.py
@@ -1,12 +1,12 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """dphon - a tool for old chinese phonetic analysis
-
+
 Usage:
     dphon -h | --help
     dphon --version
-    dphon [-v | -vv] [options] <path>...
-
+    dphon [-v | -vv] [options] <path>...
+
 Global Options:
     -h, --help
         Show this help text and exit.
@@ -38,7 +38,7 @@
 
     -l <NUM>, --len-limit <NUM> [default: 50]
         Compare at most NUM tokens when obtaining the similarity score. A
-        higher number will slow down execution time but return more matches.
+        higher number will slow down execution time but return more matches.
 
     -c <NUM>, --context <NUM> [default: 4]
         Add NUM tokens of context to each side of matches. Context displays with
@@ -56,14 +56,14 @@
         less than the value for "--ngram-order".
 
     --max <NUM> [default: 64]
-        Limit to matches with total number of tokens <= NUM. Must be equal to
+        Limit to matches with total number of tokens <= NUM. Must be equal to
        or greater than the value for "--ngram-order".
 
 Examples:
     dphon texts/*.txt --min 8 > matches.txt
     dphon file1.txt file2.txt --ngram-order 8 --threshold 0.8
     dphon docs.jsonl --input-format jsonl --output-format jsonl > matches.jsonl
-
+
 Help:
     For more information on using this tool, visit the Github repository:
     https://github.com/direct-phonology/dphon
@@ -155,8 +155,9 @@ def run() -> None:
     else:
         # use system pager by default; colorize if LESS=R
         with console.pager(styles=os.getenv("LESS", "") == "R"):
-            for match in results:
-                console.print(match)
+            for doc in graph.docs:
+                for group in doc._.groups:
+                    console.print(group, "\n")
 
 
 def setup(args: Dict) -> Language:
@@ -168,6 +169,8 @@ def setup(args: Dict) -> Language:
     # add Doc metadata
     if not Doc.has_extension("id"):
         Doc.set_extension("id", default="")
+    if not Doc.has_extension("groups"):
+        Doc.set_extension("groups", default=[])
 
     # setup spaCy model
     nlp = spacy.blank("zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
@@ -192,7 +195,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
     start = time.perf_counter()
     for doc, context in nlp.pipe(load_texts(args["<path>"]), as_tuples=True):
         doc._.id = context["id"]
-        graph.add_doc(context["id"], doc)
+        graph.add_doc(doc)
         logging.debug(f'indexed doc "{doc._.id}"')
     stop = time.perf_counter() - start
     logging.info(f"indexed {graph.number_of_docs()} docs in {stop:.1f}s")
@@ -243,6 +246,9 @@
     # align all matches
     graph.align(SmithWatermanPhoneticAligner(gap_char=" "))
 
+    # group all matches
+    graph.group()
+
     # limit via min and max lengths if requested
     if args["--min"]:
         graph.filter(lambda m: len(m) >= int(args["--min"]))
diff --git a/dphon/console.py b/dphon/console.py
index 269c7cf..1605819 100644
--- a/dphon/console.py
+++ b/dphon/console.py
@@ -1,12 +1,12 @@
-from typing import Tuple, List
+from typing import List, Optional, Tuple
 
 from rich.console import Console
 from rich.highlighter import RegexHighlighter
 from rich.theme import Theme
+from spacy.tokens import Span
 
-from .match import Match
 from .g2p import GraphemesToPhonemes
-
+from .match import Match
 
 # Default color scheme for highlighting matches
 DEFAULT_THEME = Theme(
@@ -45,91 +45,103 @@ def format_match(self, match: Match) -> Tuple[str, str]:
         Adds markup for highlighting insertions, mismatches, etc. If context
         is set, also adds highlighted context to either end of the match.
         """
+        return (
+            self.format_span(match.utxt, match.vtxt, match.au, match.av),
+            self.format_span(match.vtxt, match.utxt, match.av, match.au),
+        )
 
-        su, sv = self._mark(match)
+    def transcribe_match(self, match: Match) -> Tuple[str, str]:
+        """Render a phonemic transcription for a Match."""
+        return (self.transcribe_span(match.utxt), self.transcribe_span(match.vtxt))
+
+    def format_span(
+        self,
+        span: Span,
+        other: Optional[Span] = None,
+        alignment: Optional[str] = None,
+        other_alignment: Optional[str] = None,
+    ) -> str:
+        """Return a Span as a Rich format string, with optional context.
+
+        Adds markup for highlighting insertions, mismatches, etc. if a second
+        reference Span is provided. If context is set, also adds highlighted
+        context to either end of the match.
+        """
+        formatted_span = self._mark_span(span, alignment, other, other_alignment)
         if self.context > 0:
-            cul, cur, cvl, cvr = self._add_context(match)
-            su = cul + su + cur
-            sv = cvl + sv + cvr
-        return su, sv
+            context_left, context_right = self._add_span_context(span)
+            formatted_span = context_left + formatted_span + context_right
+        return formatted_span
 
-    def _mark(self, match: Match) -> Tuple[str, str]:
-        """Mark up the match for colorization with a theme.
+    def transcribe_span(self, span: Span) -> str:
+        """Render a phonemic transcription for a Span."""
+        return "*" + " ".join(span._.syllables)
 
-        - Adds markup for insertions (tokens in one sequence but not another).
+    def _mark_span(
+        self,
+        span: Span,
+        alignment: Optional[str],
+        other: Optional[Span],
+        other_alignment: Optional[str],
+    ) -> str:
+        """Mark up a Span for colorization with a theme, in relation to another Span.
+
+        - Adds markup for insertions (tokens in one Span but not another).
         - Adds markup for mismatches (differing tokens in the same position).
         - Adds markup for graphic variants (mismatches with same phonemes).
         """
-
-        # if no alignment, just convert to strings because we can't highlight
-        if not match.au or not match.av:
-            return match.utxt.text, match.vtxt.text
+        # if no alignment, just return the text because we can't highlight
+        if not alignment or not other or not other_alignment:
+            return span.text
 
         # o(N) implementation: step through each sequence adding markup
-        # TODO convert to a DFA so there's less markup repetition
-        su: List[str] = []
-        sv: List[str] = []
-        u_ptr = 0
-        v_ptr = 0
-        for i in range(len(match)):
-
+        # TODO convert to a DFA so there's less markup repetition?
+        marked_span: List[str] = []
+        span_ptr = 0
+        other_ptr = 0
+        for i in range(len(alignment)):
             # gap in u: insertion in v (if not punctuation)
-            if match.au[i] == self.gap_char and match.av[i].isalnum():
-                su.append(match.au[i])
-                sv.append(f"[insertion]{match.av[i]}[/insertion]")
-                v_ptr += 1
+            if alignment[i] == self.gap_char and other_alignment[i].isalnum():
+                marked_span.append(alignment[i])
+                other_ptr += 1
                 continue
 
             # gap in v: insertion in u (if not punctuation)
-            if match.av[i] == self.gap_char and match.au[i].isalnum():
-                su.append(f"[insertion]{match.au[i]}[/insertion]")
-                sv.append(match.av[i])
-                u_ptr += 1
+            if other_alignment[i] == self.gap_char and alignment[i].isalnum():
+                marked_span.append(f"[insertion]{alignment[i]}[/insertion]")
+                span_ptr += 1
                 continue
 
             # variants (both u and v)
-            if self.g2p.are_graphic_variants(match.utxt[u_ptr], match.vtxt[v_ptr]):
-                su.append(f"[variant]{match.au[i]}[/variant]")
-                sv.append(f"[variant]{match.av[i]}[/variant]")
-                u_ptr += 1
-                v_ptr += 1
+            if self.g2p.are_graphic_variants(span[span_ptr], other[other_ptr]):
+                marked_span.append(f"[variant]{alignment[i]}[/variant]")
+                span_ptr += 1
+                other_ptr += 1
                 continue
 
             # mismatch (both u and v) - only highlight if alphanumeric
-            if match.au[i] != match.av[i]:
-                if match.au[i].isalnum() and match.av[i].isalnum():
-                    su.append(f"[mismatch]{match.au[i]}[/mismatch]")
-                    sv.append(f"[mismatch]{match.av[i]}[/mismatch]")
-                    u_ptr += 1
-                    v_ptr += 1
+            if alignment[i] != other_alignment[i]:
+                if alignment[i].isalnum() and other_alignment[i].isalnum():
+                    marked_span.append(f"[mismatch]{alignment[i]}[/mismatch]")
+                    span_ptr += 1
+                    other_ptr += 1
                     continue
 
             # equality; nothing to highlight
-            su.append(match.au[i])
-            sv.append(match.av[i])
-            u_ptr += 1
-            v_ptr += 1
+            marked_span.append(alignment[i])
+            span_ptr += 1
+            other_ptr += 1
 
-        return "".join(su), "".join(sv)
+        return "".join(marked_span)
 
-    def _add_context(self, match: Match) -> Tuple[str, str, str, str]:
-        """Add context to either side of the match sequences.
+    def _add_span_context(self, span: Span) -> Tuple[str, str]:
+        """Add context to either side of the Span.
 
         Context coloration can be changed by the default theme; a dim
         appearance is used in terminals.
""" - - utxt, vtxt = match.utxt, match.vtxt - u, v = utxt.doc, vtxt.doc - cul = f"[context]{u[utxt.start-self.context:utxt.start]}[/context]" - cur = f"[context]{u[utxt.end:utxt.end+self.context]}[/context]" - cvl = f"[context]{v[vtxt.start-self.context:vtxt.start]}[/context]" - cvr = f"[context]{v[vtxt.end:vtxt.end+self.context]}[/context]" - return (cul, cur, cvl, cvr) - - def transcription(self, match: Match) -> Tuple[str, str]: - """Get the phonemic transcription for the match for display.""" - return ( - "*" + " ".join(match.utxt._.syllables), - "*" + " ".join(match.vtxt._.syllables), + context_left = ( + f"[context]{span.doc[span.start-self.context:span.start]}[/context]" ) + context_right = f"[context]{span.doc[span.end:span.end+self.context]}[/context]" + return context_left, context_right diff --git a/dphon/match.py b/dphon/match.py index 2475cc5..d66fe7d 100644 --- a/dphon/match.py +++ b/dphon/match.py @@ -6,9 +6,9 @@ from typing import Dict, List, NamedTuple import Levenshtein as Lev -from rich.padding import Padding from rich.console import Console, ConsoleOptions, RenderResult -from spacy.tokens import Span +from rich.padding import Padding +from spacy.tokens import Doc, Span class Match(NamedTuple): diff --git a/dphon/reuse.py b/dphon/reuse.py index eed1be8..fbc8f21 100644 --- a/dphon/reuse.py +++ b/dphon/reuse.py @@ -2,21 +2,94 @@ # -*- coding: utf-8 -*- """Classes for analyzing text reuse.""" -from itertools import combinations -from typing import Callable, Iterable, Iterator, Tuple +from functools import cached_property +from itertools import combinations, groupby +from typing import Callable, Iterable, Iterator from networkx import MultiGraph, create_empty_copy -from rich.progress import Progress, BarColumn, SpinnerColumn -from spacy.tokens import Doc +from rich.console import Console, ConsoleOptions, RenderResult +from rich.progress import BarColumn, Progress, SpinnerColumn +from spacy.tokens import Doc, Span from .align import Aligner +from .console import err_console from .extend import Extender, extend_matches from .match import Match -from .console import err_console -class MatchGraph: +class MatchGroup: + """A group of matches with common bounds in a single document.""" + + def __init__( + self, doc: Doc, start: int, end: int, matches: Iterable[Match] + ) -> None: + self.doc = doc + self.start = start + self.end = end + self.matches = list(matches) + + def __len__(self) -> int: + return len(self.matches) + + def __rich_console__( + self, console: Console, options: ConsoleOptions + ) -> RenderResult: + """Format the group for display in console.""" + render_results = [] + + # render the "anchor" span first (i.e., the span that all matches share) + render_results += [ + f"[bold]{self.doc._.id}[/bold] ({self.start}–{self.end-1}):", + console.highlighter.format_span(self.anchor_span), + console.highlighter.transcribe_span(self.anchor_span), + ] + + # render the non-anchor spans from each match in the group + for i, match in enumerate(self.matches): + span = self.non_anchor_span(match) + alignment = self.non_anchor_alignment(match) + anchor_alignment = self.anchor_alignment(match) + render_results += [ + f"{i + 1}. 
{span.doc._.id} ({span.start}–{span.end-1}):", + console.highlighter.format_span( + span, self.anchor_span, alignment, anchor_alignment + ), + console.highlighter.transcribe_span(span), + ] + + return render_results + + @cached_property + def anchor_span(self) -> Span: + """Get the anchor span for the group.""" + return self.doc[self.start : self.end] + + def anchor_alignment(self, match: Match) -> str: + """Get the anchor alignment for a given match.""" + if match.u == self.doc._.id: + return match.au + if match.v == self.doc._.id: + return match.av + raise ValueError("Match does not belong to document.", match, self.doc) + + def non_anchor_span(self, match: Match) -> Span: + """Get the non-anchor span for a given match.""" + if match.u == self.doc._.id: + return match.vtxt + if match.v == self.doc._.id: + return match.utxt + raise ValueError("Match does not belong to document.", match, self.doc) + + def non_anchor_alignment(self, match: Match) -> str: + """Get the non-anchor alignment for a given match.""" + if match.u == self.doc._.id: + return match.av + if match.v == self.doc._.id: + return match.au + raise ValueError("Match does not belong to document.", match, self.doc) + +class MatchGraph: _G: MultiGraph def __init__(self) -> None: @@ -50,13 +123,17 @@ def number_of_docs(self) -> int: """Total number of documents in the graph.""" return self._G.number_of_nodes() - def add_doc(self, label: str, doc: Doc) -> None: + def add_doc(self, doc: Doc, label: str = None) -> None: """Add a single document to the graph.""" - self._G.add_node(label, doc=doc) + doc_id = label or doc._.id + if not doc_id: + raise ValueError("Document must have an identifier.", doc) + doc._.id = doc_id + self._G.add_node(doc_id, doc=doc) - def add_docs(self, docs: Iterable[Tuple[str, Doc]]) -> None: + def add_docs(self, docs: Iterable[Doc]) -> None: """Add a collection of documents to the graph.""" - self._G.add_nodes_from(((label, {"doc": doc}) for label, doc in docs)) + [self.add_doc(doc) for doc in docs] def add_match(self, match: Match) -> None: """Add a single match to the graph.""" @@ -64,7 +141,7 @@ def add_match(self, match: Match) -> None: def add_matches(self, matches: Iterable[Match]) -> None: """Add a collection of matches to the graph.""" - self._G.add_edges_from([(m.u, m.v, m._asdict()) for m in matches]) + [self.add_match(match) for match in matches] def extend(self, extender: Extender) -> None: """Extend all matches in the graph using a provided strategy.""" @@ -108,9 +185,41 @@ def align(self, align: Aligner) -> None: self._G = G self.progress.remove_task(task) + def group(self) -> None: + """Group all matches in the graph by their shared spans.""" + # track progress + task = self.progress.add_task( + "grouping", u="", v="", total=self.number_of_matches() + ) + + # iterate through each document and group all matches that target it + with self.progress: + for doc in self.docs: + self.progress.update(task, u=doc) + edges = self._G.edges(doc._.id, data=True) + matches = [Match(**data) for _u, _v, data in edges] + for span, group in groupby( + sorted(matches, key=_bounds_in(doc)), key=_bounds_in(doc) + ): + doc._.groups.append(MatchGroup(doc, span[0], span[1], group)) + self.progress.update(task, advance=len(edges)) + self.progress.remove_task(task) + def filter(self, predicate: Callable[[Match], bool]) -> None: """Filter all matches in the graph using a provided predicate.""" G = create_empty_copy(self._G) filtered = filter(predicate, self.matches) G.add_edges_from([(m.u, m.v, m._asdict()) for m in 
         self._G = G
+
+
+# helper for getting bounds of a match in a given document
+def _bounds_in(doc):
+    def _bounds(match):
+        if match.utxt.doc == doc:
+            return match.utxt.start, match.utxt.end
+        if match.vtxt.doc == doc:
+            return match.vtxt.start, match.vtxt.end
+        raise ValueError("Match does not belong to document.", match, doc)
+
+    return _bounds
diff --git a/tests/unit/test_reuse.py b/tests/unit/test_reuse.py
index ad29cfd..94f421b 100644
--- a/tests/unit/test_reuse.py
+++ b/tests/unit/test_reuse.py
@@ -4,10 +4,11 @@
 from unittest import TestCase, skip
 
 import spacy
-from dphon.reuse import MatchGraph
 from spacy.tokens import Doc
+
 from dphon.extend import LevenshteinExtender
 from dphon.match import Match
+from dphon.reuse import MatchGraph
 
 # disconnect logging for testing
 logging.captureWarnings(True)
@@ -22,38 +23,93 @@ class TestMatchGraph(TestCase):
     def setUp(self) -> None:
         """create a spaCy pipeline and match graph for testing"""
         self.nlp = spacy.blank(
-            "zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
-        self.G = MatchGraph()
+            "zh", meta={"tokenizer": {"config": {"use_jieba": False}}}
+        )
         if not Doc.has_extension("id"):
             Doc.set_extension("id", default="")
-        # doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾必謂之學矣")
-        # doc2 = self.nlp.make_doc("與朋友交言而有信雖曰已學吾必謂之未也")
-        # doc3 = self.nlp.make_doc("與朋友交言而有信雖未讀書吾亦謂之學矣")
+        if not Doc.has_extension("groups"):
+            Doc.set_extension("groups", default=[])
 
     def test_extend(self) -> None:
         """extend should reduce graph to maximal matches only"""
         doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾")
         doc2 = self.nlp.make_doc("與朋友交言而有信雖曰已學吾")
         doc3 = self.nlp.make_doc("與朋友交言而有信雖未讀書吾")
-        self.G.add_docs([("論語·學而", doc1),
-                         ("藝文類聚·錢", doc2),
-                         ("顏氏家訓·勉學", doc3)])
-        self.G.add_matches([
-            Match("論語·學而", "藝文類聚·錢", doc1[0:4], doc2[0:4]),  # 與朋友交
-            Match("論語·學而", "藝文類聚·錢", doc1[4:8], doc2[4:8]),  # 言而有信
-            Match("論語·學而", "顏氏家訓·勉學", doc1[0:4], doc3[0:4]),  # 與朋友交
-            Match("論語·學而", "顏氏家訓·勉學", doc1[4:8], doc3[4:8]),  # 言而有信
-            Match("藝文類聚·錢", "顏氏家訓·勉學", doc2[0:4], doc3[0:4]),  # 與朋友交
-            Match("藝文類聚·錢", "顏氏家訓·勉學", doc2[4:8], doc3[4:8]),  # 言而有信
-        ])
+        doc1._.id = "論語·學而"
+        doc2._.id = "藝文類聚·錢"
+        doc3._.id = "顏氏家訓·勉學"
+        G = MatchGraph()
+        G.add_docs([doc1, doc2, doc3])
+        G.add_matches(
+            [
+                Match("論語·學而", "藝文類聚·錢", doc1[0:4], doc2[0:4]),  # 與朋友交
+                Match("論語·學而", "藝文類聚·錢", doc1[4:8], doc2[4:8]),  # 言而有信
+                Match("論語·學而", "顏氏家訓·勉學", doc1[0:4], doc3[0:4]),  # 與朋友交
+                Match("論語·學而", "顏氏家訓·勉學", doc1[4:8], doc3[4:8]),  # 言而有信
+                Match("藝文類聚·錢", "顏氏家訓·勉學", doc2[0:4], doc3[0:4]),  # 與朋友交
+                Match("藝文類聚·錢", "顏氏家訓·勉學", doc2[4:8], doc3[4:8]),  # 言而有信
+            ]
+        )
         extender = LevenshteinExtender(threshold=0.8, len_limit=50)
-        self.G.extend(extender)
-        matches = [(m.u, m.v, m.utxt.text, m.vtxt.text)
-                   for m in self.G.matches]
-        self.assertEqual(len(matches), 3)
-        self.assertEqual(matches[0], ("論語·學而", "藝文類聚·錢",
-                                      "與朋友交言而有信雖曰未學吾", "與朋友交言而有信雖曰已學吾"))
-        self.assertEqual(matches[1], ("論語·學而", "顏氏家訓·勉學",
-                                      "與朋友交言而有信雖曰未學吾", "與朋友交言而有信雖未讀書吾"))
-        self.assertEqual(matches[2], ("藝文類聚·錢", "顏氏家訓·勉學",
-                                      "與朋友交言而有信雖", "與朋友交言而有信雖"))
+        G.extend(extender)
+        matches = [(m.u, m.v, m.utxt.text, m.vtxt.text) for m in G.matches]
+        self.assertEqual(len(matches), 3, "should have 3 matches")
+        self.assertEqual(
+            matches[0],
+            (
+                "論語·學而",
+                "藝文類聚·錢",
+                "與朋友交言而有信雖曰未學吾",
+                "與朋友交言而有信雖曰已學吾",
+            ),
+        )
+        self.assertEqual(
+            matches[1],
+            (
+                "論語·學而",
+                "顏氏家訓·勉學",
+                "與朋友交言而有信雖曰未學吾",
+                "與朋友交言而有信雖未讀書吾",
+            ),
+        )
+        self.assertEqual(
+            matches[2],
+            (
+                "藝文類聚·錢",
+                "顏氏家訓·勉學",
+                "與朋友交言而有信雖",
+                "與朋友交言而有信雖",
+            ),
+        )
+
+    def test_group(self) -> None:
+        """grouping should group matches by shared spans"""
+        doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾")
+        doc2 = self.nlp.make_doc("與朋友交言而有信雖曰已學吾")
+        doc3 = self.nlp.make_doc("與朋友交言而有信雖未讀書吾")
+        doc1._.id = "論語·學而"
+        doc2._.id = "藝文類聚·錢"
+        doc3._.id = "顏氏家訓·勉學"
+        G = MatchGraph()
+        G.add_docs([doc1, doc2, doc3])
+        G.add_matches(
+            [
+                Match(
+                    "論語·學而", "藝文類聚·錢", doc1[0:8], doc2[0:8]
+                ),  # 與朋友交言而有信
+                Match(
+                    "論語·學而", "顏氏家訓·勉學", doc1[0:8], doc3[0:8]
+                ),  # 與朋友交言而有信
+                Match(
+                    "藝文類聚·錢", "顏氏家訓·勉學", doc2[0:8], doc3[0:8]
+                ),  # 與朋友交言而有信
+            ]
+        )
+        G.group()
+        self.assertEqual(len(doc1._.groups), 1)
+        self.assertEqual(len(doc2._.groups), 1)
+        self.assertEqual(len(doc3._.groups), 1)
+        group = doc1._.groups[0]
+        self.assertEqual(group.start, 0)
+        self.assertEqual(group.end, 8)
+        self.assertEqual(len(group), 2)
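
Usage note (editorial, not part of the patch): the grouping flow added above can be exercised end to end roughly as follows. This is a minimal sketch, assuming dphon with this patch applied and the blank spaCy "zh" pipeline used in the tests; the doc ids and texts are illustrative, and rich rendering of groups is skipped in favor of a plain print().

import spacy
from spacy.tokens import Doc

from dphon.match import Match
from dphon.reuse import MatchGraph

# register the Doc extensions that cli.setup() would normally add
nlp = spacy.blank("zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
for name, default in (("id", ""), ("groups", [])):
    if not Doc.has_extension(name):
        Doc.set_extension(name, default=default)

doc1 = nlp.make_doc("與朋友交言而有信雖曰未學吾")
doc2 = nlp.make_doc("與朋友交言而有信雖曰已學吾")
doc1._.id, doc2._.id = "論語·學而", "藝文類聚·錢"

graph = MatchGraph()
graph.add_docs([doc1, doc2])  # doc ids are now read from doc._.id
graph.add_matches([Match("論語·學而", "藝文類聚·錢", doc1[0:8], doc2[0:8])])
graph.group()  # one MatchGroup per set of matches sharing bounds in a doc

# each doc now carries its groups in doc._.groups
for doc in graph.docs:
    for group in doc._.groups:
        print(doc._.id, group.start, group.end, len(group))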
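Design note: the core of MatchGraph.group() is just sort-then-groupby over a per-document bounds key (the _bounds_in closure), so matches whose spans share identical (start, end) in the anchor document collapse into one MatchGroup. A toy sketch of that primitive with stand-in tuples (hypothetical data, not from the patch):

from itertools import groupby

# (doc id, start, end) tuples stand in for Match objects
matches = [("論語·學而", 0, 8), ("論語·學而", 0, 8), ("論語·學而", 9, 12)]

def bounds(m):
    return (m[1], m[2])

for span_bounds, bucket in groupby(sorted(matches, key=bounds), key=bounds):
    print(span_bounds, len(list(bucket)))
# (0, 8) 2   -> one group holding two matches
# (9, 12) 1  -> one group holding one match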