Skip to content

Commit

Permalink
Add match grouping by spans and set this as default output type
Browse files Browse the repository at this point in the history
Closes #122
  • Loading branch information
thatbudakguy committed Jul 13, 2024
1 parent 240f4b0 commit 38ce566
Show file tree
Hide file tree
Showing 5 changed files with 290 additions and 105 deletions.
18 changes: 15 additions & 3 deletions dphon/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
from pathlib import Path
from typing import Dict

import debugpy
import jsonlines
import pkg_resources
import spacy
Expand All @@ -97,6 +98,9 @@
from .match import Match
from .reuse import MatchGraph

# debugpy.listen(5678)
# debugpy.wait_for_client()

# Available log levels: default is WARN, -v is INFO, -vv is DEBUG
LOG_LEVELS = {
0: "WARN",
Expand Down Expand Up @@ -155,8 +159,11 @@ def run() -> None:
else:
# use system pager by default; colorize if LESS=R
with console.pager(styles=os.getenv("LESS", "") == "R"):
for match in results:
console.print(match)
for doc in graph.docs:
for group in doc._.groups:
console.print(group, "\n")
# for match in results:
# console.print(match)


def setup(args: Dict) -> Language:
Expand All @@ -168,6 +175,8 @@ def setup(args: Dict) -> Language:
# add Doc metadata
if not Doc.has_extension("id"):
Doc.set_extension("id", default="")
if not Doc.has_extension("groups"):
Doc.set_extension("groups", default=[])

# setup spaCy model
nlp = spacy.blank("zh", meta={"tokenizer": {"config": {"use_jieba": False}}})
Expand All @@ -192,7 +201,7 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
start = time.perf_counter()
for doc, context in nlp.pipe(load_texts(args["<path>"]), as_tuples=True):
doc._.id = context["id"]
graph.add_doc(context["id"], doc)
graph.add_doc(doc)
logging.debug(f'indexed doc "{doc._.id}"')
stop = time.perf_counter() - start
logging.info(f"indexed {graph.number_of_docs()} docs in {stop:.1f}s")
Expand Down Expand Up @@ -243,6 +252,9 @@ def process(nlp: Language, args: Dict) -> MatchGraph:
# align all matches
graph.align(SmithWatermanPhoneticAligner(gap_char=" "))

# group all matches
graph.group()

# limit via min and max lengths if requested
if args["--min"]:
graph.filter(lambda m: len(m) >= int(args["--min"]))
Expand Down
132 changes: 70 additions & 62 deletions dphon/console.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from typing import Tuple, List
from typing import List, Tuple

from rich.console import Console
from rich.highlighter import RegexHighlighter
from rich.theme import Theme
from spacy.tokens import Span

from .match import Match
from .g2p import GraphemesToPhonemes

from .match import Match

# Default color scheme for highlighting matches
DEFAULT_THEME = Theme(
Expand Down Expand Up @@ -45,91 +45,99 @@ def format_match(self, match: Match) -> Tuple[str, str]:
Adds markup for highlighting insertions, mismatches, etc. If context is
set, also adds highlighted context to either end of the match.
"""
return (
self.format_span(match.utxt, match.vtxt, match.au, match.av),
self.format_span(match.vtxt, match.utxt, match.av, match.au),
)

su, sv = self._mark(match)
def transcribe_match(self, match: Match) -> Tuple[str, str]:
"""Render a phonemic transcription for a Match."""
return (self.transcribe_span(match.utxt), self.transcribe_span(match.vtxt))

def format_span(
self,
span: Span,
other: Span = None,
alignment: str = None,
other_alignment: str = None,
) -> str:
"""Return a Span as a Rich format string, with optional context.
Adds markup for highlighting insertions, mismatches, etc. if a second
reference Span is provided. If context is set, also adds highlighted
context to either end of the match.
"""
highlighted_span = self._mark_span(span, alignment, other, other_alignment)
if self.context > 0:
cul, cur, cvl, cvr = self._add_context(match)
su = cul + su + cur
sv = cvl + sv + cvr
return su, sv
context_left, context_right = self._add_span_context(span)
formatted_span = context_left + highlighted_span + context_right
return formatted_span

def _mark(self, match: Match) -> Tuple[str, str]:
"""Mark up the match for colorization with a theme.
def transcribe_span(self, span: Span) -> str:
"""Render a phonemic transcription for a Span."""
return "*" + " ".join(span._.syllables)

- Adds markup for insertions (tokens in one sequence but not another).
def _mark_span(
self, span: Span, alignment: str, other: Span, other_alignment: str
) -> str:
"""Mark up a Span for colorization with a theme, in relation to another Span.
- Adds markup for insertions (tokens in one Span but not another).
- Adds markup for mismatches (differing tokens in the same position).
- Adds markup for graphic variants (mismatches with same phonemes).
"""

# if no alignment, just convert to strings because we can't highlight
if not match.au or not match.av:
return match.utxt.text, match.vtxt.text
# if no alignment, just return the text because we can't highlight
if not alignment or not other or not other_alignment:
return span.text

# O(N) implementation: step through each sequence adding markup
# TODO convert to a DFA so there's less markup repetition
su: List[str] = []
sv: List[str] = []
u_ptr = 0
v_ptr = 0
for i in range(len(match)):

# TODO convert to a DFA so there's less markup repetition?
marked_span: List[str] = []
span_ptr = 0
other_ptr = 0
for i in range(len(span)):
# gap in this span: insertion in the other span (if not punctuation)
if match.au[i] == self.gap_char and match.av[i].isalnum():
su.append(match.au[i])
sv.append(f"[insertion]{match.av[i]}[/insertion]")
v_ptr += 1
if alignment[i] == self.gap_char and other_alignment[i].isalnum():
marked_span.append(alignment[i])
other_ptr += 1
continue

# gap in the other span: insertion in this span (if not punctuation)
if match.av[i] == self.gap_char and match.au[i].isalnum():
su.append(f"[insertion]{match.au[i]}[/insertion]")
sv.append(match.av[i])
u_ptr += 1
if other_alignment[i] == self.gap_char and alignment[i].isalnum():
marked_span.append(f"[insertion]{alignment[i]}[/insertion]")
span_ptr += 1
continue

# graphic variants (same position in both spans)
if self.g2p.are_graphic_variants(match.utxt[u_ptr], match.vtxt[v_ptr]):
su.append(f"[variant]{match.au[i]}[/variant]")
sv.append(f"[variant]{match.av[i]}[/variant]")
u_ptr += 1
v_ptr += 1
if self.g2p.are_graphic_variants(span[span_ptr], other[other_ptr]):
marked_span.append(f"[variant]{alignment[i]}[/variant]")
span_ptr += 1
other_ptr += 1
continue

# mismatch (same position in both spans) - only highlight if alphanumeric
if match.au[i] != match.av[i]:
if match.au[i].isalnum() and match.av[i].isalnum():
su.append(f"[mismatch]{match.au[i]}[/mismatch]")
sv.append(f"[mismatch]{match.av[i]}[/mismatch]")
u_ptr += 1
v_ptr += 1
if alignment[i] != other_alignment[i]:
if alignment[i].isalnum() and other_alignment[i].isalnum():
marked_span.append(f"[mismatch]{alignment[i]}[/mismatch]")
span_ptr += 1
other_ptr += 1
continue

# equality; nothing to highlight
su.append(match.au[i])
sv.append(match.av[i])
u_ptr += 1
v_ptr += 1
marked_span.append(alignment[i])
span_ptr += 1
other_ptr += 1

return "".join(su), "".join(sv)
return "".join(marked_span)

def _add_context(self, match: Match) -> Tuple[str, str, str, str]:
"""Add context to either side of the match sequences.
def _add_span_context(self, span: Span) -> Tuple[str, str]:
"""Add context to either side of the Span.
Context coloration can be changed by the default theme; a dim appearance
is used in terminals.
"""

utxt, vtxt = match.utxt, match.vtxt
u, v = utxt.doc, vtxt.doc
cul = f"[context]{u[utxt.start-self.context:utxt.start]}[/context]"
cur = f"[context]{u[utxt.end:utxt.end+self.context]}[/context]"
cvl = f"[context]{v[vtxt.start-self.context:vtxt.start]}[/context]"
cvr = f"[context]{v[vtxt.end:vtxt.end+self.context]}[/context]"
return (cul, cur, cvl, cvr)

def transcription(self, match: Match) -> Tuple[str, str]:
"""Get the phonemic transcription for the match for display."""
return (
"*" + " ".join(match.utxt._.syllables),
"*" + " ".join(match.vtxt._.syllables),
context_left = (
f"[context]{span.doc[span.start-self.context:span.start]}[/context]"
)
context_right = f"[context]{span.doc[span.end:span.end+self.context]}[/context]"
return context_left, context_right
4 changes: 2 additions & 2 deletions dphon/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
from typing import Dict, List, NamedTuple

import Levenshtein as Lev
from rich.padding import Padding
from rich.console import Console, ConsoleOptions, RenderResult
from spacy.tokens import Span
from rich.padding import Padding
from spacy.tokens import Doc, Span


class Match(NamedTuple):
Expand Down
Loading

0 comments on commit 38ce566

Please sign in to comment.