From 333c2a0c6bdccd4930c028bbf15b4f17423731c3 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Tue, 1 Aug 2023 08:40:23 +0200 Subject: [PATCH 1/2] =?UTF-8?q?Demonstrate=20issue=20with=20typographic=20?= =?UTF-8?q?apostrophe=20U+2019=20(=E2=80=99)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We miss misspellings using the typographic apostrophe or acute accent U+2019 (’) because the typos in our dictionaries use the typewriter apostrophe U+0027 ('). --- codespell_lib/tests/test_basic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 473b72fef4..c2a1e80c5b 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -168,6 +168,12 @@ def test_default_word_parsing( f.write("`abandonned`\n") assert cs.main(fname) == 1, "bad" + fname = tmp_path / "apostrophe" + fname.write_text("woudn't\n", encoding="utf-8") # U+0027 (') + assert cs.main(fname) == 1, "misspelling containing typewriter apostrophe U+0027" + fname.write_text("woudn’t\n", encoding="utf-8") # U+2019 (’) + assert cs.main(fname) == 1, "misspelling containing typographic apostrophe U+2019" + def test_bad_glob( tmp_path: Path, From 2010c78a75c9e4193e0518d2e7629887db19865f Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Fri, 28 Jul 2023 12:05:19 +0200 Subject: [PATCH 2/2] Generate alternative typos with a translation table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This way we can catch misspellings with alternative characters, typically typographic apostrophe or acute accent U+2019 (’) instead of typewriter apostrophe U+0027 ('). In this case, the alternative character is a valid character and will be used both in the misspelling and the fix(es). 
The above is different from detecting Unicode phishing, where some characters like `A` are intentionally, or not, replaced by lookalikes such as `A`, `Α`, `А`, `ᗅ`, `ᴀ`, `A`. In that case, the alternative character is invalid and should be replaced by its valid counterpart in the fix. We do not address that case here. --- codespell_lib/_codespell.py | 44 ++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 2e23b9acd6..1fe8c6306c 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -36,6 +36,9 @@ "(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|" "\\b[\\w.%+-]+@[\\w.-]+\\b)" ) +# Pass all misspellings through this translation table to generate +# alternative misspellings and fixes. +alt_chars = (("'", "’"),) encodings = ("utf-8", "iso-8859-1") USAGE = """ \t%prog [OPTIONS] [file1 file2 ... fileN] @@ -622,31 +625,46 @@ def build_ignore_words(filename: str, ignore_words: Set[str]) -> None: ignore_words.add(line.strip()) +def add_misspelling( + key: str, + data: str, + misspellings: Dict[str, Misspelling], +) -> None: + data = data.strip() + + if "," in data: + fix = False + data, reason = data.rsplit(",", 1) + reason = reason.lstrip() + else: + fix = True + reason = "" + + misspellings[key] = Misspelling(data, fix, reason) + + def build_dict( filename: str, misspellings: Dict[str, Misspelling], ignore_words: Set[str], ) -> None: with open(filename, encoding="utf-8") as f: + translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars] for line in f: [key, data] = line.split("->") # TODO for now, convert both to lower. Someday we can maybe add # support for fixing caps. 
key = key.lower() data = data.lower() - if key in ignore_words: - continue - data = data.strip() - - if "," in data: - fix = False - data, reason = data.rsplit(",", 1) - reason = reason.lstrip() - else: - fix = True - reason = "" - - misspellings[key] = Misspelling(data, fix, reason) + if key not in ignore_words: + add_misspelling(key, data, misspellings) + # generate alternative misspellings/fixes + for x, table in translate_tables: + if x in key: + alt_key = key.translate(table) + alt_data = data.translate(table) + if alt_key not in ignore_words: + add_misspelling(alt_key, alt_data, misspellings) def is_hidden(filename: str, check_hidden: bool) -> bool: