# Patch for codespell_lib/_codespell.py: catch alternative-character variants
# of dictionary misspellings.
#
# Hunk 1 adds a module-level `alt_chars` tuple of (character, replacement)
# pairs; the only pair present maps "'" to "’".
#
# Hunk 2 does two things:
#   * Moves the per-entry parsing that used to be inline in build_dict()
#     into a new helper, add_misspelling(key, data, misspellings). The helper
#     strips `data`; if it contains a comma, it splits off the text after the
#     last comma as `reason` and sets fix=False, otherwise it sets fix=True
#     with an empty reason; it then stores Misspelling(data, fix, reason)
#     under `key`.
#   * Makes build_dict() build one str.maketrans() table per `alt_chars` pair
#     and, for every dictionary line whose key contains the source character,
#     add a second entry with both key and data translated. The
#     `ignore_words` check is applied to the original key and to the
#     translated key independently, so the early `continue` is replaced by
#     `if ... not in ignore_words` guards.
diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 800e12c9947..b24c482bbde 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -36,6 +36,9 @@
     "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
     "\\b[\\w.%+-]+@[\\w.-]+\\b)"
 )
+# Pass all misspellings through this translation table to catch
+# alternative misspellings.
+alt_chars = (("'", "’"),)
 encodings = ("utf-8", "iso-8859-1")
 USAGE = """
 \t%prog [OPTIONS] [file1 file2 ... fileN]
@@ -622,31 +625,45 @@ def build_ignore_words(filename: str, ignore_words: Set[str]) -> None:
             ignore_words.add(line.strip())
 
 
+def add_misspelling(
+    key: str,
+    data: str,
+    misspellings: Dict[str, Misspelling],
+) -> None:
+    data = data.strip()
+
+    if "," in data:
+        fix = False
+        data, reason = data.rsplit(",", 1)
+        reason = reason.lstrip()
+    else:
+        fix = True
+        reason = ""
+
+    misspellings[key] = Misspelling(data, fix, reason)
+
+
 def build_dict(
     filename: str,
     misspellings: Dict[str, Misspelling],
     ignore_words: Set[str],
 ) -> None:
     with open(filename, encoding="utf-8") as f:
+        translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
         for line in f:
             [key, data] = line.split("->")
             # TODO for now, convert both to lower. Someday we can maybe add
             # support for fixing caps.
             key = key.lower()
             data = data.lower()
-            if key in ignore_words:
-                continue
-            data = data.strip()
-
-            if "," in data:
-                fix = False
-                data, reason = data.rsplit(",", 1)
-                reason = reason.lstrip()
-            else:
-                fix = True
-                reason = ""
-
-            misspellings[key] = Misspelling(data, fix, reason)
+            if key not in ignore_words:
+                add_misspelling(key, data, misspellings)
+            for x, table in translate_tables:
+                if x in key:
+                    alt_key = key.translate(table)
+                    alt_data = data.translate(table)
+                    if alt_key not in ignore_words:
+                        add_misspelling(alt_key, alt_data, misspellings)
 
 
 def is_hidden(filename: str, check_hidden: bool) -> bool: