From a59afb2d0503295b4bff37cf2d65604acd547f8b Mon Sep 17 00:00:00 2001
From: Nick Budak
Date: Sat, 13 Jul 2024 13:06:56 -0700
Subject: [PATCH] Ensure matches are filtered before grouping

This fixes #365 by calling group() only after the --min/--max length
filters have run, so that groups are never built from matches the
filters would have removed. It also converts number_of_docs and
number_of_matches into properties, drops an unused import, trims the
--context help text, and adds a regression test for MatchGraph.filter.
---
 dphon/cli.py             | 13 ++++++-------
 dphon/match.py           |  2 +-
 dphon/reuse.py           |  8 +++++---
 tests/unit/test_reuse.py | 21 +++++++++++++++++++++
 4 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/dphon/cli.py b/dphon/cli.py
index 0883b8a..97ab09f 100644
--- a/dphon/cli.py
+++ b/dphon/cli.py
@@ -42,8 +42,7 @@
 
 -c <NUM>, --context <NUM>    [default: 4]
     Add NUM tokens of context to each side of matches. Context displays with
-    a dimmed appearance if color is supported in the terminal. Has no effect
-    if the output format is not plaintext.
+    a dimmed appearance if color is supported in the terminal.
 
 Filtering Options:
 -a, --all
@@ -200,7 +199,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
             graph.add_doc(doc)
             logging.debug(f'indexed doc "{doc._.id}"')
     stop = time.perf_counter() - start
-    logging.info(f"indexed {graph.number_of_docs()} docs in {stop:.1f}s")
+    logging.info(f"indexed {graph.number_of_docs} docs in {stop:.1f}s")
 
     # prune all ngrams from index that only occur once
     groups = list(nlp.get_pipe("index").filter(lambda g: len(g[1]) > 1))
@@ -231,7 +230,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
             )
             progress.advance(task)
     stop = time.perf_counter() - start
-    logging.info(f"seeded {graph.number_of_matches()} matches in {stop:.1f}s")
+    logging.info(f"seeded {graph.number_of_matches} matches in {stop:.1f}s")
 
     # limit to seeds with graphic variants if requested
     if not args["--all"]:
@@ -248,15 +247,15 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
     # align all matches
     graph.align(SmithWatermanPhoneticAligner(gap_char=" "))
 
-    # group all matches
-    graph.group()
-
     # limit via min and max lengths if requested
     if args["--min"]:
         graph.filter(lambda m: len(m) >= int(args["--min"]))
     if args["--max"]:
         graph.filter(lambda m: len(m) <= int(args["--max"]))
 
+    # group all matches
+    graph.group()
+
     # return completed reuse graph
     return graph
 
diff --git a/dphon/match.py b/dphon/match.py
index d66fe7d..28ffdc7 100644
--- a/dphon/match.py
+++ b/dphon/match.py
@@ -8,7 +8,7 @@
 import Levenshtein as Lev
 from rich.console import Console, ConsoleOptions, RenderResult
 from rich.padding import Padding
-from spacy.tokens import Doc, Span
+from spacy.tokens import Span
 
 
 class Match(NamedTuple):
diff --git a/dphon/reuse.py b/dphon/reuse.py
index fbc8f21..747ca1c 100644
--- a/dphon/reuse.py
+++ b/dphon/reuse.py
@@ -115,10 +115,12 @@ def docs(self) -> Iterator[Doc]:
         """Iterator over all docs in the graph."""
         return (doc for _label, doc in self._G.nodes(data="doc"))
 
+    @property
     def number_of_matches(self) -> int:
         """Total number of matches in the graph."""
         return self._G.number_of_edges()
 
+    @property
     def number_of_docs(self) -> int:
         """Total number of documents in the graph."""
         return self._G.number_of_nodes()
@@ -147,7 +149,7 @@ def extend(self, extender: Extender) -> None:
         """Extend all matches in the graph using a provided strategy."""
         # track progress
         task = self.progress.add_task(
-            "extending", u="", v="", total=self.number_of_matches()
+            "extending", u="", v="", total=self.number_of_matches
         )
 
         # create a new graph without matches and add each extended match to it
@@ -168,7 +170,7 @@ def align(self, align: Aligner) -> None:
         """Align all matches in the graph using a provided strategy."""
         # track progress
         task = self.progress.add_task(
-            "aligning", u="", v="", total=self.number_of_matches()
+            "aligning", u="", v="", total=self.number_of_matches
         )
 
         # create a new graph without matches and add each aligned match to it
@@ -189,7 +191,7 @@ def group(self) -> None:
         """Group all matches in the graph by their shared spans."""
         # track progress
         task = self.progress.add_task(
-            "grouping", u="", v="", total=self.number_of_matches()
+            "grouping", u="", v="", total=self.number_of_matches
         )
 
         # iterate through each document and group all matches that target it
diff --git a/tests/unit/test_reuse.py b/tests/unit/test_reuse.py
index 94f421b..fd98bad 100644
--- a/tests/unit/test_reuse.py
+++ b/tests/unit/test_reuse.py
@@ -82,6 +82,27 @@ def test_extend(self) -> None:
             ),
         )
 
+    def test_filter(self) -> None:
+        """filter should remove matches that don't meet a predicate"""
+        doc1 = self.nlp.make_doc("abcdefg123")
+        doc2 = self.nlp.make_doc("abcdefg456")
+        doc3 = self.nlp.make_doc("456nothing")
+        doc1._.id = "1"
+        doc2._.id = "2"
+        doc3._.id = "3"
+        G = MatchGraph()
+        G.add_docs([doc1, doc2, doc3])
+        G.add_matches(
+            [
+                Match("1", "2", doc1[0:7], doc2[0:7]),  # abcdefg
+                Match("2", "3", doc2[7:10], doc3[0:3]),  # 456
+            ]
+        )
+        G.filter(lambda m: len(m) > 3)
+        self.assertEqual(G.number_of_matches, 1, "should have 1 match with length > 3")
+        match_texts = [m.utxt.text for m in G.matches]
+        self.assertEqual(match_texts[0], "abcdefg")
+
     def test_group(self) -> None:
         """grouping should group matches by shared spans"""
         doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾")
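
A note on why the reordering matters: group() takes a snapshot of the
match set at the moment it runs, so any filtering done afterwards leaves
stale entries in the groups. The sketch below is a toy with hypothetical
names, not dphon's actual API (MatchGraph groups by shared spans); it
only illustrates the ordering bug this patch fixes:

    # Toy illustration of the ordering bug (hypothetical names, not dphon's API).

    def group(matches):
        # Stand-in for MatchGraph.group(): records a snapshot of current matches.
        return [list(matches)]

    matches = ["abcdefg", "456"]

    # Buggy order (before this patch): group first, filter second.
    groups = group(matches)
    matches = [m for m in matches if len(m) > 3]  # --min style filter
    assert "456" in groups[0]  # stale: the filtered-out match is still grouped

    # Fixed order (this patch): filter first, group second.
    matches = ["abcdefg", "456"]
    matches = [m for m in matches if len(m) > 3]
    groups = group(matches)
    assert groups == [["abcdefg"]]  # groups reflect only surviving matches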