Support non-regex based tokens for spellcheck_line
The `Spellchecker` only needs the `group()` and `start()` methods from
`re.Match`. With a bit of generics and typing protocols, we can make the
`Spellchecker` work with any token type that provides those methods.

The `codespell` command line tool still assumes `re.Match`, but it can
get that via its own line tokenizer, so it all works out for everyone.
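In practice this means `spellcheck_line` can consume tokens that were never produced by a regex at all: because `Token` is a `typing.Protocol`, any object with compatible `group()` and `start()` methods is accepted structurally, with no inheritance from `re.Match`. A minimal sketch of the idea (the `SimpleToken` class and `whitespace_tokenizer` below are hypothetical illustrations, not part of this commit):

    from dataclasses import dataclass
    from typing import Iterable


    @dataclass
    class SimpleToken:
        """A hand-rolled token; satisfies the Token protocol structurally."""

        text: str
        offset: int

        def group(self) -> str:
            return self.text

        def start(self) -> int:
            return self.offset


    def whitespace_tokenizer(line: str) -> Iterable[SimpleToken]:
        """Split on whitespace without any regex, tracking each word's offset."""
        offset = 0
        for word in line.split():
            offset = line.index(word, offset)
            yield SimpleToken(word, offset)
            offset += len(word)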
nthykier committed May 25, 2024
1 parent 8bd3517 commit 7273c77
Showing 2 changed files with 145 additions and 16 deletions.
6 changes: 3 additions & 3 deletions codespell_lib/_codespell.py
@@ -716,7 +716,7 @@ def is_text_file(filename: str) -> bool:

def ask_for_word_fix(
    line: str,
    issue: DetectedMisspelling,
    issue: "DetectedMisspelling[re.Match[str]]",
    interactivity: int,
    colors: TermColors,
) -> Tuple[bool, Sequence[str]]:
@@ -725,7 +725,7 @@ def ask_for_word_fix(
    if interactivity <= 0:
        return misspelling.fix, fix_case(wrongword, misspelling.candidates)

    match = issue.re_match
    match = issue.token

    line_ui = (
        f"{line[:match.start()]}"
@@ -841,7 +841,7 @@ def line_tokenizer_factory(
    uri_regex: Pattern[str],
    word_regex: Pattern[str],
    ignore_word_regex: Optional[Pattern[str]],
) -> LineTokenizer:
) -> "LineTokenizer[re.Match[str]]":
    def line_tokenizer(line: str) -> Iterable[Match[str]]:
        # If all URI spelling errors will be ignored, erase any URI before
        # extracting words. Otherwise, apply ignores after extracting words.
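On the command line tool's side nothing needs to change structurally: `re.Match[str]` already provides `group()` and `start()`, so a plain `finditer`-based tokenizer satisfies `LineTokenizer[re.Match[str]]` without any adapter. A sketch of that shape (the word pattern below is illustrative, not the regex `line_tokenizer_factory` actually builds):

    import re
    from typing import Iterable, Match

    word_regex = re.compile(r"[a-zA-Z']+")  # placeholder pattern for illustration


    def regex_line_tokenizer(line: str) -> Iterable[Match[str]]:
        # Each re.Match already has group() and start(), so it satisfies
        # the Token protocol structurally -- no adapter needed.
        return word_regex.finditer(line)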
155 changes: 142 additions & 13 deletions codespell_lib/spellchecker.py
@@ -16,22 +16,135 @@
Copyright (C) 2011 ProFUSION embedded systems
"""

import os
import re
from typing import (
    Callable,
    Container,
    Dict,
    Generic,
    Iterable,
    Match,
    Optional,
    Protocol,
    Sequence,
    TypeVar,
)

# Pass all misspellings through this translation table to generate
# alternative misspellings and fixes.
alt_chars = (("'", "’"),)  # noqa: RUF001

T_co = TypeVar("T_co", bound="Token", covariant=True)

LineTokenizer = Callable[[str], Iterable[Match[str]]]

supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
supported_languages = supported_languages_en

# Users might want to link this file into /usr/local/bin, so we resolve the
# symbolic link path to the real path if necessary.
_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
_builtin_dictionaries = (
    # name, desc, name, err in aspell, correction in aspell, \
    # err dictionary array, rep dictionary array
    # The arrays must contain the names of aspell dictionaries
    # The aspell tests here aren't in an ideal state, but the None values are
    # realistic for obscure words
    ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
    (
        "rare",
        "for rare (but valid) words that are likely to be errors",
        "_rare",
        None,
        None,
        None,
        None,
    ),
    (
        "informal",
        "for making informal words more formal",
        "_informal",
        True,
        True,
        supported_languages_en,
        supported_languages_en,
    ),
    (
        "usage",
        "for replacing phrasing with recommended terms",
        "_usage",
        None,
        None,
        None,
        None,
    ),
    (
        "code",
        "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)",  # noqa: E501
        "_code",
        None,
        None,
        None,
        None,
    ),
    (
        "names",
        "for valid proper names that might be typos",
        "_names",
        None,
        None,
        None,
        None,
    ),
    (
        "en-GB_to_en-US",
        "for corrections from en-GB to en-US",
        "_en-GB_to_en-US",
        True,
        True,
        ("en_GB",),
        ("en_US",),
    ),
)
_builtin_default = "clear,rare"

_builtin_default_as_tuple = tuple(_builtin_default.split(","))


class UnknownBuiltinDictionaryError(ValueError):
    def __init__(self, name: str) -> None:
        super().__init__(f"Unknown built-in dictionary: {name}")


class BuiltinDictionariesAlreadyLoadedError(TypeError):
    def __init__(self) -> None:
        super().__init__(
            "load_builtin_dictionaries must not be called more than once",
        )

class LineTokenizer(Protocol[T_co]):
    """Callable that splits a line into multiple tokens to be spellchecked

    Generally, a regex will do for simple cases. A probably too simple one is:

    >>> tokenizer = re.compile(r"[^ ]+").finditer

    For more complex cases, either use more complex regexes or custom
    tokenization code.
    """

    def __call__(self, line: str) -> Iterable[T_co]: ...


class Token(Protocol):
    """Describes a token

    This is a protocol to support `re.Match[str]` (which codespell uses) and any
    other tokenization method that our API consumers might be using.
    """

    def group(self) -> str: ...

    def start(self) -> int: ...


class Misspelling:
@@ -41,13 +154,18 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
        self.reason = reason


class DetectedMisspelling:

    def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None:
class DetectedMisspelling(Generic[T_co]):
    def __init__(
        self,
        word: str,
        lword: str,
        misspelling: Misspelling,
        token: T_co,
    ) -> None:
        self.word = word
        self.lword = lword
        self.misspelling = misspelling
        self.re_match = match
        self.token = token


class Spellchecker:
@@ -58,14 +176,25 @@ def __init__(self) -> None:
    def spellcheck_line(
        self,
        line: str,
        tokenizer: Callable[[str], Iterable[re.Match[str]]],
        tokenizer: LineTokenizer[T_co],
        *,
        extra_words_to_ignore: Container[str] = frozenset()
    ) -> Iterable[DetectedMisspelling]:
    ) -> Iterable[DetectedMisspelling[T_co]]:
        """Tokenize and spellcheck a line

        Split the line into tokens using the provided tokenizer. See the class
        docstring for an example.

        :param line: The line to spellcheck.
        :param tokenizer: A callable that will tokenize the line
        :param extra_words_to_ignore: Extra words to ignore for this particular line
            (such as content from a `codespell:ignore` comment)
        """
        misspellings = self._misspellings
        ignore_words_cased = self.ignore_words_cased

        for match in tokenizer(line):
            word = match.group()
        for token in tokenizer(line):
            word = token.group()
            if word in ignore_words_cased:
                continue
            lword = word.lower()
@@ -74,7 +203,7 @@ def spellcheck_line(
            # Sometimes we find a 'misspelling' which is actually a valid word
            # preceded by a string escape sequence. Ignore such cases as
            # they're usually false alarms; see issue #17 among others.
            char_before_idx = match.start() - 1
            char_before_idx = token.start() - 1
            if (
                char_before_idx >= 0
                and line[char_before_idx] == "\\"
@@ -83,7 +212,7 @@ def spellcheck_line(
                and lword[1:] not in misspellings
            ):
                continue
            yield DetectedMisspelling(word, lword, misspelling, match)
            yield DetectedMisspelling(word, lword, misspelling, token)

    def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
        """Check a given word against the loaded dictionaries
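The new `extra_words_to_ignore` parameter is the hook for per-line ignores such as `codespell:ignore` comments. A usage sketch (the marker-parsing regex and the pre-configured `checker` argument are assumptions for illustration; `Spellchecker` construction and dictionary loading are not shown in this diff):

    import re
    from typing import FrozenSet

    _IGNORE_MARKER = re.compile(r"codespell:ignore\b(.*)")


    def per_line_ignores(line: str) -> FrozenSet[str]:
        # Collect the words listed after a codespell:ignore marker, if any.
        found = _IGNORE_MARKER.search(line)
        if found is None:
            return frozenset()
        return frozenset(found.group(1).split())


    def check_line(checker: "Spellchecker", line: str) -> None:
        # `checker` is assumed to be a Spellchecker with dictionaries loaded.
        tokenizer = re.compile(r"[^ ]+").finditer  # the docstring's "too simple" example
        for issue in checker.spellcheck_line(
            line, tokenizer, extra_words_to_ignore=per_line_ignores(line)
        ):
            print(issue.word, "->", ", ".join(issue.misspelling.candidates))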
