Skip to content

Commit

Permalink
Generate alternative typos with a translation table
Browse files Browse the repository at this point in the history
This way we can catch misspellings with alternative characters,
typically an acute accent (´) instead of an apostrophe (').
In this case, the alternative character is a valid character
and will be used both in the misspelling and the fix(es).

The above is different from detecting Unicode phishing, where
some characters like `A` are intentionally, or not, replaced
by lookalikes such as `A`, `Α`,  `А`,  `ᗅ`, `ᴀ`,  `A`.
In that case, the alternative character is invalid and should
be replaced by its valid counterpart in the fix. We do not
address that here.
  • Loading branch information
DimitriPapadopoulos committed Jul 29, 2023
1 parent 2eeb8b9 commit 6cb0d5e
Showing 1 changed file with 34 additions and 18 deletions.
52 changes: 34 additions & 18 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
"(\\b(?:https?|[ts]?ftp|file|git|smb)://[^\\s]+(?=$|\\s)|"
"\\b[\\w.%+-]+@[\\w.-]+\\b)"
)
# Pairs of (character, alternative character). Misspellings and their
# fix(es) are also passed through the matching translation table so that
# entries written with the alternative character are caught too.
alt_chars = (("'", "’"),)
# Materialize the tables in a tuple: build_dict() iterates over them
# repeatedly, and a generator expression would be exhausted after the
# first pass, silently disabling the alternative entries afterwards.
_alt_char_trans_tables = tuple(str.maketrans(x, y) for x, y in alt_chars)
encodings = ("utf-8", "iso-8859-1")
USAGE = """
\t%prog [OPTIONS] [file1 file2 ... fileN]
Expand Down Expand Up @@ -622,6 +626,29 @@ def build_ignore_words(filename: str, ignore_words: Set[str]) -> None:
ignore_words.add(line.strip())


def add_misspelling(
    key: str,
    data: str,
    misspellings: Dict[str, Misspelling],
) -> None:
    """Parse one dictionary entry and store it in *misspellings* under *key*.

    *data* is stripped of surrounding whitespace and split at its last
    comma, if any:

    * no comma: the whole text is stored, with the second ``Misspelling``
      argument set to ``True`` and an empty reason;
    * a comma: the text before the last comma is stored, with the second
      argument set to ``False`` and the stripped text after that comma as
      the reason (empty when the comma is the final character).

    Any existing value for *key* is overwritten.
    """
    entry = data.strip()
    # rpartition() splits on the LAST comma; the separator slot is empty
    # when the entry contains no comma at all.
    candidates, comma, reason = entry.rpartition(",")

    if not comma:
        misspellings[key] = Misspelling(entry, True, "")
        return

    misspellings[key] = Misspelling(candidates, False, reason.strip())


def build_dict(
filename: str,
misspellings: Dict[str, Misspelling],
Expand All @@ -634,24 +661,13 @@ def build_dict(
# support for fixing caps.
key = key.lower()
data = data.lower()
if key in ignore_words:
continue
data = data.strip()
fix = data.rfind(",")

if fix < 0:
fix = True
reason = ""
elif fix == (len(data) - 1):
data = data[:fix]
reason = ""
fix = False
else:
reason = data[fix + 1 :].strip()
data = data[:fix]
fix = False

misspellings[key] = Misspelling(data, fix, reason)
if key not in ignore_words:
add_misspelling(key, data, misspellings)
for table in _alt_char_trans_tables:
alt_key = key.translate(table)
alt_data = data.translate(table)
if alt_key not in ignore_words:
add_misspelling(alt_key, alt_data, misspellings)


def is_hidden(filename: str, check_hidden: bool) -> bool:
Expand Down

0 comments on commit 6cb0d5e

Please sign in to comment.