From 7273c77edd389154d5e6a848633e506b411d76ae Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Fri, 17 May 2024 14:21:17 +0000
Subject: [PATCH] Support non-regex based tokens for `spellcheck_line`

The `Spellchecker` only needs the `group()` and `start()` methods from
the `re.Match`. With a bit of generics and typing protocols, we can make
the `Spellchecker` work with any token type that provides those two
methods.

The `codespell` command line tool still assumes `re.Match`, but it can
get that via its own line tokenizer, so it all works out for everyone.
---
 codespell_lib/_codespell.py   |   6 +-
 codespell_lib/spellchecker.py | 155 +++++++++++++++++++++++++++++++---
 2 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 98efc12033..8cbb1155b8 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -716,7 +716,7 @@ def is_text_file(filename: str) -> bool:
 
 def ask_for_word_fix(
     line: str,
-    issue: DetectedMisspelling,
+    issue: "DetectedMisspelling[re.Match[str]]",
     interactivity: int,
     colors: TermColors,
 ) -> Tuple[bool, Sequence[str]]:
@@ -725,7 +725,7 @@ def ask_for_word_fix(
     if interactivity <= 0:
         return misspelling.fix, fix_case(wrongword, misspelling.candidates)
 
-    match = issue.re_match
+    match = issue.token
     line_ui = (
         f"{line[:match.start()]}"
@@ -841,7 +841,7 @@ def line_tokenizer_factory(
     uri_regex: Pattern[str],
     word_regex: Pattern[str],
     ignore_word_regex: Optional[Pattern[str]],
-) -> LineTokenizer:
+) -> "LineTokenizer[re.Match[str]]":
     def line_tokenizer(line: str) -> Iterable[Match[str]]:
         # If all URI spelling errors will be ignored, erase any URI before
         # extracting words. Otherwise, apply ignores after extracting words.
diff --git a/codespell_lib/spellchecker.py b/codespell_lib/spellchecker.py
index 0d87eef366..9d1c5398d6 100644
--- a/codespell_lib/spellchecker.py
+++ b/codespell_lib/spellchecker.py
@@ -16,22 +16,135 @@
 Copyright (C) 2011 ProFUSION embedded systems
 """
 
+import os
+import re
 from typing import (
-    Callable,
     Container,
     Dict,
+    Generic,
     Iterable,
-    Match,
     Optional,
+    Protocol,
     Sequence,
+    TypeVar,
 )
 
 # Pass all misspellings through this translation table to generate
 # alternative misspellings and fixes.
 alt_chars = (("'", "’"),)  # noqa: RUF001
+T_co = TypeVar("T_co", bound="Token", covariant=True)
 
-LineTokenizer = Callable[[str], Iterable[Match[str]]]
+
+supported_languages_en = ("en", "en_GB", "en_US", "en_CA", "en_AU")
+supported_languages = supported_languages_en
+
+# Users might want to link this file into /usr/local/bin, so we resolve the
+# symbolic link path to the real path if necessary.
+_data_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
+_builtin_dictionaries = (
+    # name, desc, name, err in aspell, correction in aspell, \
+    # err dictionary array, rep dictionary array
+    # The arrays must contain the names of aspell dictionaries
+    # The aspell tests here aren't the ideal state, but the None's are
+    # realistic for obscure words
+    ("clear", "for unambiguous errors", "", False, None, supported_languages_en, None),
+    (
+        "rare",
+        "for rare (but valid) words that are likely to be errors",
+        "_rare",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "informal",
+        "for making informal words more formal",
+        "_informal",
+        True,
+        True,
+        supported_languages_en,
+        supported_languages_en,
+    ),
+    (
+        "usage",
+        "for replacing phrasing with recommended terms",
+        "_usage",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "code",
+        "for words from code and/or mathematics that are likely to be typos in other contexts (such as uint)",  # noqa: E501
+        "_code",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "names",
+        "for valid proper names that might be typos",
+        "_names",
+        None,
+        None,
+        None,
+        None,
+    ),
+    (
+        "en-GB_to_en-US",
+        "for corrections from en-GB to en-US",
+        "_en-GB_to_en-US",
+        True,
+        True,
+        ("en_GB",),
+        ("en_US",),
+    ),
+)
+_builtin_default = "clear,rare"
+
+_builtin_default_as_tuple = tuple(_builtin_default.split(","))
+
+
+class UnknownBuiltinDictionaryError(ValueError):
+    def __init__(self, name: str) -> None:
+        super().__init__(f"Unknown built-in dictionary: {name}")
+
+
+class BuiltinDictionariesAlreadyLoadedError(TypeError):
+    def __init__(self) -> None:
+        super().__init__(
+            "load_builtin_dictionaries must not be called more than once",
+        )
+
+
+class LineTokenizer(Protocol[T_co]):
+    """Callable that splits a line into multiple tokens to be spellchecked
+
+    Generally, a regex will do for simple cases. A (probably too) simple one is:
+
+    >>> tokenizer = re.compile(r"[^ ]+").finditer
+
+    For more complex cases, either use more complex regexes or custom tokenization
+    code.
+    """
+
+    def __call__(self, line: str) -> Iterable[T_co]: ...
+
+
+class Token(Protocol):
+    """Describes a token
+
+    This is a protocol to support `re.Match[str]` (which codespell uses) and any
+    other token type that our API consumers might be using.
+    """
+
+    def group(self) -> str: ...
+
+    def start(self) -> int: ...
 
 
 class Misspelling:
@@ -41,13 +154,18 @@ def __init__(self, candidates: Sequence[str], fix: bool, reason: str) -> None:
         self.reason = reason
 
 
-class DetectedMisspelling:
-
-    def __init__(self, word: str, lword: str, misspelling: Misspelling, match: Match[str]) -> None:
+class DetectedMisspelling(Generic[T_co]):
+    def __init__(
+        self,
+        word: str,
+        lword: str,
+        misspelling: Misspelling,
+        token: T_co,
+    ) -> None:
         self.word = word
         self.lword = lword
         self.misspelling = misspelling
-        self.re_match = match
+        self.token = token
 
 
 class Spellchecker:
@@ -58,14 +176,25 @@ def __init__(self) -> None:
     def spellcheck_line(
         self,
         line: str,
-        tokenizer: Callable[[str], Iterable[re.Match[str]]],
+        tokenizer: LineTokenizer[T_co],
         *,
         extra_words_to_ignore: Container[str] = frozenset()
-    ) -> Iterable[DetectedMisspelling]:
+    ) -> Iterable[DetectedMisspelling[T_co]]:
+        """Tokenize and spellcheck a line
+
+        Split the line into tokens using the provided tokenizer. See the
+        `LineTokenizer` docstring for an example.
+
+        :param line: The line to spellcheck.
+        :param tokenizer: A callable that will tokenize the line
+        :param extra_words_to_ignore: Extra words to ignore for this particular line
+            (such as content from a `codespell:ignore` comment)
+        """
         misspellings = self._misspellings
         ignore_words_cased = self.ignore_words_cased
-        for match in tokenizer(line):
-            word = match.group()
+
+        for token in tokenizer(line):
+            word = token.group()
             if word in ignore_words_cased:
                 continue
             lword = word.lower()
@@ -74,7 +203,7 @@
             # Sometimes we find a 'misspelling' which is actually a valid word
             # preceded by a string escape sequence. Ignore such cases as
             # they're usually false alarms; see issue #17 among others.
-            char_before_idx = match.start() - 1
+            char_before_idx = token.start() - 1
             if (
                 char_before_idx >= 0
                 and line[char_before_idx] == "\\"
@@ -83,7 +212,7 @@
                 and lword[1:] not in misspellings
             ):
                 continue
-            yield DetectedMisspelling(word, lword, misspelling, match)
+            yield DetectedMisspelling(word, lword, misspelling, token)
 
     def check_lower_cased_word(self, word: str) -> Optional[Misspelling]:
         """Check a given word against the loaded dictionaries
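
A rough usage sketch of what this patch enables (not itself part of the patch): any token type that provides `group()` and `start()` satisfies the `Token` protocol, so `spellcheck_line` can be driven by a non-regex tokenizer. The `WhitespaceToken`, `whitespace_tokenizer`, and `report_issues` names below are invented for illustration, and an already-loaded `Spellchecker` instance is assumed rather than constructed.

# Illustrative sketch only; not part of the patch. Assumes `checker` is a
# Spellchecker whose dictionaries have already been loaded elsewhere.
from dataclasses import dataclass
from typing import Iterable

from codespell_lib.spellchecker import Spellchecker


@dataclass
class WhitespaceToken:
    """Minimal token satisfying the `Token` protocol (group() and start())."""

    text: str
    offset: int

    def group(self) -> str:
        return self.text

    def start(self) -> int:
        return self.offset


def whitespace_tokenizer(line: str) -> Iterable[WhitespaceToken]:
    """A `LineTokenizer[WhitespaceToken]` that splits on runs of whitespace."""
    offset = 0
    for word in line.split():
        # Track the character offset of each word so start() is meaningful.
        offset = line.index(word, offset)
        yield WhitespaceToken(word, offset)
        offset += len(word)


def report_issues(checker: Spellchecker, line: str) -> None:
    # spellcheck_line only calls group()/start() on each token, so anything
    # from re.Match[str] to a hand-rolled dataclass works here.
    for issue in checker.spellcheck_line(line, whitespace_tokenizer):
        print(issue.word, "->", issue.misspelling.candidates)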