Skip to content

Commit

Permalink
Generate alternative typos with a translation table
Browse files Browse the repository at this point in the history
This way we can catch misspellings with alternative characters,
typically typographic apostrophe (right single quotation mark) U+2019 (’)
instead of typewriter apostrophe U+0027 ('). In this case,
the alternative character is a valid character and will be
used both in the misspelling and the fix(es).

The above is different from detecting Unicode phishing, where
some characters like `A` are intentionally, or not, replaced
by lookalikes such as `A`, `Α`,  `А`,  `ᗅ`, `ᴀ`,  `A`.
In that case, the alternative character is invalid and should
be replaced by its valid counterpart in the fix. We do not
address that case here.
  • Loading branch information
DimitriPapadopoulos committed Aug 2, 2023
1 parent 5f29c62 commit 555a752
Showing 1 changed file with 30 additions and 13 deletions.
43 changes: 30 additions & 13 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
"(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
"\\b[\\w.%+-]+@[\\w.-]+\\b)"
)
# Translation pairs used to derive alternative spellings of dictionary
# entries.  Each pair is (character, alternative character); build_dict()
# turns every pair into a str.maketrans() table and applies it to both the
# misspelling and its fix(es).
alt_chars = (
    ("'", "’"),
)
encodings = (
    "utf-8",
    "iso-8859-1",
)
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
Expand Down Expand Up @@ -622,31 +625,45 @@ def build_ignore_words(filename: str, ignore_words: Set[str]) -> None:
ignore_words.add(line.strip())


def add_misspelling(
    key: str,
    data: str,
    misspellings: Dict[str, Misspelling],
) -> None:
    """Store one dictionary entry in *misspellings* under *key*.

    *data* is the right-hand side of a dictionary line.  After surrounding
    whitespace is removed it is split on its last comma:

    * no comma: the whole text is stored with the fix flag set and an
      empty reason;
    * a comma: the text before the last comma is stored with the fix flag
      cleared, and the text after it (leading whitespace removed) becomes
      the reason.

    An existing entry for *key* is overwritten.
    """
    entry = data.strip()
    # rpartition yields an empty separator when there is no comma at all.
    candidates, comma, trailer = entry.rpartition(",")
    if comma:
        misspellings[key] = Misspelling(candidates, False, trailer.lstrip())
    else:
        misspellings[key] = Misspelling(entry, True, "")


def build_dict(
    filename: str,
    misspellings: Dict[str, Misspelling],
    ignore_words: Set[str],
) -> None:
    """Load a misspelling dictionary file into *misspellings*.

    Each line of *filename* (read as UTF-8) must have the form
    ``key->data``.  Both sides are lower-cased, and the entry is recorded
    through add_misspelling() unless ``key`` is in *ignore_words*.

    For every pair in ``alt_chars`` whose first character occurs in the
    key, an alternative entry is generated by translating both the key and
    the data, and is recorded unless the alternative key is itself in
    *ignore_words*.  The alternative is considered even when the original
    key is ignored.

    Raises:
        ValueError: if a line does not split into exactly two parts
            around ``"->"``.
    """
    # The tables do not depend on the file contents; build them once.
    translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars]
    with open(filename, encoding="utf-8") as f:
        for line in f:
            [key, data] = line.split("->")
            # TODO for now, convert both to lower. Someday we can maybe add
            # support for fixing caps.
            key = key.lower()
            data = data.lower()
            if key not in ignore_words:
                add_misspelling(key, data, misspellings)
            # Generate alternative misspellings/fixes, e.g. with a
            # typographic apostrophe instead of a typewriter apostrophe.
            for x, table in translate_tables:
                if x in key:
                    alt_key = key.translate(table)
                    alt_data = data.translate(table)
                    if alt_key not in ignore_words:
                        add_misspelling(alt_key, alt_data, misspellings)


def is_hidden(filename: str, check_hidden: bool) -> bool:
Expand Down

0 comments on commit 555a752

Please sign in to comment.