Skip to content

Commit

Permalink
Ensure matches are filtered before grouping
Browse files Browse the repository at this point in the history
Fixes #365 by filtering matches before grouping them; also adds a regression test and some formatting cleanup.
  • Loading branch information
thatbudakguy committed Jul 13, 2024
1 parent ebeb249 commit a59afb2
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 11 deletions.
13 changes: 6 additions & 7 deletions dphon/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@
-c <NUM>, --context <NUM> [default: 4]
Add NUM tokens of context to each side of matches. Context displays with
a dimmed appearance if color is supported in the terminal. Has no effect
if the output format is not plaintext.
a dimmed appearance if color is supported in the terminal.
Filtering Options:
-a, --all
Expand Down Expand Up @@ -200,7 +199,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
graph.add_doc(doc)
logging.debug(f'indexed doc "{doc._.id}"')
stop = time.perf_counter() - start
logging.info(f"indexed {graph.number_of_docs()} docs in {stop:.1f}s")
logging.info(f"indexed {graph.number_of_docs} docs in {stop:.1f}s")

# prune all ngrams from index that only occur once
groups = list(nlp.get_pipe("index").filter(lambda g: len(g[1]) > 1))
Expand Down Expand Up @@ -231,7 +230,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
)
progress.advance(task)
stop = time.perf_counter() - start
logging.info(f"seeded {graph.number_of_matches()} matches in {stop:.1f}s")
logging.info(f"seeded {graph.number_of_matches} matches in {stop:.1f}s")

# limit to seeds with graphic variants if requested
if not args["--all"]:
Expand All @@ -248,15 +247,15 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
# align all matches
graph.align(SmithWatermanPhoneticAligner(gap_char=" "))

# group all matches
graph.group()

# limit via min and max lengths if requested
if args["--min"]:
graph.filter(lambda m: len(m) >= int(args["--min"]))
if args["--max"]:
graph.filter(lambda m: len(m) <= int(args["--max"]))

# group all matches
graph.group()

# return completed reuse graph
return graph

Expand Down
2 changes: 1 addition & 1 deletion dphon/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import Levenshtein as Lev
from rich.console import Console, ConsoleOptions, RenderResult
from rich.padding import Padding
from spacy.tokens import Doc, Span
from spacy.tokens import Span


class Match(NamedTuple):
Expand Down
8 changes: 5 additions & 3 deletions dphon/reuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,12 @@ def docs(self) -> Iterator[Doc]:
"""Iterator over all docs in the graph."""
return (doc for _label, doc in self._G.nodes(data="doc"))

@property
def number_of_matches(self) -> int:
    """Count of match edges currently stored in the reuse graph."""
    # each match is modeled as one edge in the underlying networkx graph
    edge_total = self._G.number_of_edges()
    return edge_total

@property
def number_of_docs(self) -> int:
    """Count of document nodes currently stored in the reuse graph."""
    # each indexed doc is modeled as one node in the underlying networkx graph
    node_total = self._G.number_of_nodes()
    return node_total
Expand Down Expand Up @@ -147,7 +149,7 @@ def extend(self, extender: Extender) -> None:
"""Extend all matches in the graph using a provided strategy."""
# track progress
task = self.progress.add_task(
"extending", u="", v="", total=self.number_of_matches()
"extending", u="", v="", total=self.number_of_matches
)

# create a new graph without matches and add each extended match to it
Expand All @@ -168,7 +170,7 @@ def align(self, align: Aligner) -> None:
"""Align all matches in the graph using a provided strategy."""
# track progress
task = self.progress.add_task(
"aligning", u="", v="", total=self.number_of_matches()
"aligning", u="", v="", total=self.number_of_matches
)

# create a new graph without matches and add each aligned match to it
Expand All @@ -189,7 +191,7 @@ def group(self) -> None:
"""Group all matches in the graph by their shared spans."""
# track progress
task = self.progress.add_task(
"grouping", u="", v="", total=self.number_of_matches()
"grouping", u="", v="", total=self.number_of_matches
)

# iterate through each document and group all matches that target it
Expand Down
21 changes: 21 additions & 0 deletions tests/unit/test_reuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,27 @@ def test_extend(self) -> None:
),
)

def test_filter(self) -> None:
    """filter should remove matches that don't meet a predicate"""
    # three docs: the first two share "abcdefg", the last two share "456"
    texts = ("abcdefg123", "abcdefg456", "456nothing")
    doc1, doc2, doc3 = (self.nlp.make_doc(text) for text in texts)
    for label, doc in zip(("1", "2", "3"), (doc1, doc2, doc3)):
        doc._.id = label
    graph = MatchGraph()
    graph.add_docs([doc1, doc2, doc3])
    long_match = Match("1", "2", doc1[0:7], doc2[0:7])  # abcdefg
    short_match = Match("2", "3", doc2[7:10], doc3[3:6])  # 456
    graph.add_matches([long_match, short_match])
    # keep only matches longer than 3 tokens; the "456" match should be dropped
    graph.filter(lambda m: len(m) > 3)
    self.assertEqual(graph.number_of_matches, 1, "should have 1 match with length > 3")
    surviving_texts = [m.utxt.text for m in graph.matches]
    self.assertEqual(surviving_texts[0], "abcdefg")

def test_group(self) -> None:
"""grouping should group matches by shared spans"""
doc1 = self.nlp.make_doc("與朋友交言而有信雖曰未學吾")
Expand Down

0 comments on commit a59afb2

Please sign in to comment.