From a59afb2d0503295b4bff37cf2d65604acd547f8b Mon Sep 17 00:00:00 2001
From: Nick Budak
Date: Sat, 13 Jul 2024 13:06:56 -0700
Subject: [PATCH] Ensure matches are filtered before grouping

This fixes #365 by calling group() only after the --min/--max length
filters have run, so that groups are never built from matches the
filters would have removed. It also converts number_of_docs and
number_of_matches into properties, drops an unused import, trims the
--context help text, and adds a regression test for MatchGraph.filter.
---
 dphon/cli.py             | 13 ++++++-------
 dphon/match.py           |  2 +-
 dphon/reuse.py           |  8 +++++---
 tests/unit/test_reuse.py | 21 +++++++++++++++++++++
 4 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/dphon/cli.py b/dphon/cli.py
index 0883b8a..97ab09f 100644
--- a/dphon/cli.py
+++ b/dphon/cli.py
@@ -42,8 +42,7 @@
 
 -c <NUM>, --context <NUM>    [default: 4]
     Add NUM tokens of context to each side of matches. Context displays with
-    a dimmed appearance if color is supported in the terminal. Has no effect
-    if the output format is not plaintext.
+    a dimmed appearance if color is supported in the terminal.
 
 Filtering Options:
 -a, --all
@@ -200,7 +199,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
             graph.add_doc(doc)
             logging.debug(f'indexed doc "{doc._.id}"')
     stop = time.perf_counter() - start
-    logging.info(f"indexed {graph.number_of_docs()} docs in {stop:.1f}s")
+    logging.info(f"indexed {graph.number_of_docs} docs in {stop:.1f}s")
 
     # prune all ngrams from index that only occur once
     groups = list(nlp.get_pipe("index").filter(lambda g: len(g[1]) > 1))
@@ -231,7 +230,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
             )
             progress.advance(task)
     stop = time.perf_counter() - start
-    logging.info(f"seeded {graph.number_of_matches()} matches in {stop:.1f}s")
+    logging.info(f"seeded {graph.number_of_matches} matches in {stop:.1f}s")
 
     # limit to seeds with graphic variants if requested
     if not args["--all"]:
@@ -248,15 +247,15 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
     # align all matches
     graph.align(SmithWatermanPhoneticAligner(gap_char=" "))
 
-    # group all matches
-    graph.group()
-
     # limit via min and max lengths if requested
     if args["--min"]:
         graph.filter(lambda m: len(m) >= int(args["--min"]))
     if args["--max"]:
         graph.filter(lambda m: len(m) <= int(args["--max"]))
 
+    # group all matches
+    graph.group()
+
     # return completed reuse graph
     return graph
 
diff --git a/dphon/match.py b/dphon/match.py
index d66fe7d..28ffdc7 100644
--- a/dphon/match.py
+++ b/dphon/match.py
@@ -8,7 +8,7 @@
 import Levenshtein as Lev
 from rich.console import Console, ConsoleOptions, RenderResult
 from rich.padding import Padding
-from spacy.tokens import Doc, Span
+from spacy.tokens import Span
 
 
 class Match(NamedTuple):
diff --git a/dphon/reuse.py b/dphon/reuse.py
index fbc8f21..747ca1c 100644
--- a/dphon/reuse.py
+++ b/dphon/reuse.py
@@ -115,10 +115,12 @@ def docs(self) -> Iterator[Doc]:
         """Iterator over all docs in the graph."""
         return (doc for _label, doc in self._G.nodes(data="doc"))
 
+    @property
     def number_of_matches(self) -> int:
         """Total number of matches in the graph."""
         return self._G.number_of_edges()
 
+    @property
     def number_of_docs(self) -> int:
         """Total number of documents in the graph."""
         return self._G.number_of_nodes()
@@ -147,7 +149,7 @@ def extend(self, extender: Extender) -> None:
         """Extend all matches in the graph using a provided strategy."""
         # track progress
         task = self.progress.add_task(
-            "extending", u="", v="", total=self.number_of_matches()
+            "extending", u="", v="", total=self.number_of_matches
         )
 
         # create a new graph without matches and add each extended match to it
@@ -168,7 +170,7 @@ def align(self, align: Aligner) -> None:
         """Align all matches in the graph using a provided strategy."""
         # track progress
         task = self.progress.add_task(
-            "aligning", u="", v="", total=self.number_of_matches()
+            "aligning", u="", v="", total=self.number_of_matches
         )
 
         # create a new graph without matches and add each aligned match to it
@@ -189,7 +191,7 @@ def group(self) -> None:
         """Group all matches in the graph by their shared spans."""
         # track progress
         task = self.progress.add_task(
-            "grouping", u="", v="", total=self.number_of_matches()
+            "grouping", u="", v="", total=self.number_of_matches
         )
 
         # iterate through each document and group all matches that target it
diff --git a/tests/unit/test_reuse.py b/tests/unit/test_reuse.py
index 94f421b..fd98bad 100644
--- a/tests/unit/test_reuse.py
+++ b/tests/unit/test_reuse.py
@@ -82,6 +82,27 @@ def test_extend(self) -> None:
             ),
         )
 
+    def test_filter(self) -> None:
+        """filter should remove matches that don't meet a predicate"""
+        doc1 = self.nlp.make_doc("abcdefg123")
+        doc2 = self.nlp.make_doc("abcdefg456")
+        doc3 = self.nlp.make_doc("456nothing")
+        doc1._.id = "1"
+        doc2._.id = "2"
+        doc3._.id = "3"
+        G = MatchGraph()
+        G.add_docs([doc1, doc2, doc3])
+        G.add_matches(
+            [
+                Match("1", "2", doc1[0:7], doc2[0:7]),  # abcdefg
+                Match("2", "3", doc2[7:10], doc3[0:3]),  # 456
+            ]
+        )
+        G.filter(lambda m: len(m) > 3)
+        self.assertEqual(G.number_of_matches, 1, "should have 1 match with length > 3")
+        match_texts = [m.utxt.text for m in G.matches]
+        self.assertEqual(match_texts[0], "abcdefg")
+
     def test_group(self) -> None:
         """grouping should group matches by shared spans"""
         doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾")
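
A note on why the reordering matters: group() takes a snapshot of the
match set at the moment it runs, so any filtering done afterwards leaves
stale entries in the groups. The sketch below is a toy with hypothetical
names, not dphon's actual API (MatchGraph groups by shared spans); it
only illustrates the ordering bug this patch fixes:

    # Toy illustration of the ordering bug (hypothetical names, not dphon's API).

    def group(matches):
        # Stand-in for MatchGraph.group(): records a snapshot of current matches.
        return [list(matches)]

    matches = ["abcdefg", "456"]

    # Buggy order (before this patch): group first, filter second.
    groups = group(matches)
    matches = [m for m in matches if len(m) > 3]  # --min style filter
    assert "456" in groups[0]  # stale: the filtered-out match is still grouped

    # Fixed order (this patch): filter first, group second.
    matches = ["abcdefg", "456"]
    matches = [m for m in matches if len(m) > 3]
    groups = group(matches)
    assert groups == [["abcdefg"]]  # groups reflect only surviving matches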