Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve CSV output #363

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dphon/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def run() -> None:
for match in results:
writer.write(match.as_dict())
elif args["--output-format"] == "csv":
fieldnames = Match("", "", "", "").as_dict().keys()
fieldnames = results[0].as_dict().keys()
writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
writer.writeheader()
for match in results:
Expand Down
7 changes: 0 additions & 7 deletions dphon/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,3 @@ def _add_context(self, match: Match) -> Tuple[str, str, str, str]:
cvl = f"[context]{v[vtxt.start-self.context:vtxt.start]}[/context]"
cvr = f"[context]{v[vtxt.end:vtxt.end+self.context]}[/context]"
return (cul, cur, cvl, cvr)

def transcription(self, match: Match) -> Tuple[str, str]:
    """Build the display transcription for both sides of a match.

    Each side is the space-joined syllables of the corresponding text
    (`utxt` first, then `vtxt`), prefixed with an asterisk.
    """
    u_line = "*" + " ".join(match.utxt._.syllables)
    v_line = "*" + " ".join(match.vtxt._.syllables)
    return (u_line, v_line)
53 changes: 47 additions & 6 deletions dphon/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""The Match class for encoding text reuse relationships."""

import math
from typing import Dict, List, NamedTuple
from typing import Dict, List, NamedTuple, Tuple

import Levenshtein as Lev
from rich.padding import Padding
Expand Down Expand Up @@ -32,7 +32,7 @@ def __rich_console__(
"""Format the match for display in console."""
# get colorized match text and transcription
su, sv = console.highlighter.format_match(self) # type: ignore
pu, pv = console.highlighter.transcription(self) # type: ignore
pu, pv = self.transcription

# add left-padding to align with match numbers, and bottom-padding
# so that there's a space between matches in output
Expand All @@ -49,27 +49,68 @@ def __rich_console__(
pv,
)

@property
def u_transcription(self) -> str:
    """Transcription of the `u` side of the match.

    The syllables of `utxt` are joined with single spaces and the result
    is prefixed with an asterisk.
    """
    spaced = " ".join(self.utxt._.syllables)
    return f"*{spaced}"

@property
def v_transcription(self) -> str:
    """Transcription of the `v` side of the match.

    The syllables of `vtxt` are joined with single spaces and the result
    is prefixed with an asterisk.
    """
    spaced = " ".join(self.vtxt._.syllables)
    return f"*{spaced}"

@property
def weighted_score(self) -> float:
    """Ratio of phonemic similarity to graphic similarity.

    Returns:
        `phonetic_similarity()` divided by `graphic_similarity()`, or
        `math.inf` when the division raises `ZeroDivisionError`.
    """
    try:
        # Go through the named helpers so that this ratio and the values
        # reported individually elsewhere come from the same computation.
        return self.phonetic_similarity() / self.graphic_similarity()
    except ZeroDivisionError:
        return math.inf

@property
def transcription(self) -> Tuple[str, str]:
    """Phonemic transcription of both sides of the match, as `(u, v)`."""
    u_side = self.u_transcription
    v_side = self.v_transcription
    return u_side, v_side

def graphic_similarity(self) -> float:
    """Levenshtein sequence ratio between the aligned sequences `au` and `av`."""
    aligned_u, aligned_v = self.au, self.av
    return Lev.seqratio(aligned_u, aligned_v)

def phonetic_similarity(self) -> float:
    """Similarity score of the phonetic content of the sequences.

    This is the match's stored `weight`, returned unchanged.
    """
    stored_weight = self.weight
    return stored_weight

def context(self, chars: int) -> Tuple[str, str, str, str]:
    """Return up to `chars` units of context on either side of the match.

    The result is a 4-tuple in this order:
    - context to the left of u
    - context to the right of u
    - context to the left of v
    - context to the right of v

    Windows are clamped to the bounds of each side's `doc`, so a match at
    the very start or end of a document yields a shorter (possibly empty)
    context on that side.
    """

    def _window(span):
        # Slice the span's parent doc on both sides of the span itself.
        doc = span.doc
        begin, finish = span.start, span.end
        before = doc[max(begin - chars, 0) : begin]
        after = doc[finish : min(finish + chars, len(doc))]
        return before, after

    u_before, u_after = _window(self.utxt)
    v_before, v_after = _window(self.vtxt)
    return (u_before, u_after, v_before, v_after)

def as_dict(self) -> Dict[str, object]:
    """Dict form of the match for structured output formats.

    Uses prettier field names than the underlying attributes so the
    result can be serialized directly. Values are not all strings: the
    `*_start`/`*_end` offsets and the `phonetic_similarity` and
    `graphic_similarity` values are passed through as returned by the
    match, while `score` and `weighted_score` are stringified.
    """
    return {
        "u_id": self.u,
        "v_id": self.v,
        "u_text": self.utxt.text,
        "v_text": self.vtxt.text,
        # aligned sequences are joined into a single string per side
        "u_text_aligned": "".join(self.au),
        "v_text_aligned": "".join(self.av),
        "u_transcription": self.u_transcription,
        "v_transcription": self.v_transcription,
        "u_start": self.utxt.start,
        "u_end": self.utxt.end,
        "v_start": self.vtxt.start,
        "v_end": self.vtxt.end,
        "score": str(self.weight),
        "weighted_score": str(self.weighted_score),
        "phonetic_similarity": self.phonetic_similarity(),
        "graphic_similarity": self.graphic_similarity(),
    }
Loading