From ad861d6375490925405f49727bd0882595064f97 Mon Sep 17 00:00:00 2001
From: Eric Mertens <emertens@gmail.com>
Date: Thu, 9 May 2024 19:15:40 -0700
Subject: [PATCH] Extract ftfy's badness computation

---
 badness.py       | 374 +++++++++++++++++++++++++++++++++++++++++++++++
 plugin.py        |   7 +-
 requirements.txt |   3 +-
 3 files changed, 379 insertions(+), 5 deletions(-)
 create mode 100644 badness.py

diff --git a/badness.py b/badness.py
new file mode 100644
index 0000000..8328c4e
--- /dev/null
+++ b/badness.py
@@ -0,0 +1,374 @@
+# This module was extracted from python-ftfy under the terms of the following
+# license in order to reduce external project dependencies.
+
+# Copyright (C) 2013-2018 Robyn Speer (rspeer@luminoso.com)
+# MIT License
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+# of the Software, and to permit persons to whom the Software is furnished to do
+# so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import re
+
+# There are only 403 characters that occur in known UTF-8 mojibake, and we can
+# characterize them:
+
+MOJIBAKE_CATEGORIES = {  # category name -> regex character-class content ("a-b" is a range)
+    # Characters that appear in many different contexts. Sequences that contain
+    # them are not inherently mojibake
+    "common": (
+        "\N{NO-BREAK SPACE}"
+        "\N{SOFT HYPHEN}"
+        "\N{MIDDLE DOT}"
+        "\N{ACUTE ACCENT}"
+        "\N{EN DASH}"
+        "\N{EM DASH}"
+        "\N{HORIZONTAL BAR}"
+        "\N{HORIZONTAL ELLIPSIS}"
+        "\N{RIGHT SINGLE QUOTATION MARK}"
+    ),
+    # The C1 control character range, which no longer has any use outside of mojibake
+    "c1": "\x80-\x9f",
+    # Characters that are nearly 100% used in mojibake
+    "bad": (
+        "\N{BROKEN BAR}"
+        "\N{CURRENCY SIGN}"
+        "\N{DIAERESIS}"
+        "\N{NOT SIGN}"
+        "\N{MACRON}"
+        "\N{PILCROW SIGN}"
+        "\N{SECTION SIGN}"
+        "\N{CEDILLA}"
+        "\N{LATIN SMALL LETTER F WITH HOOK}"
+        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"  # it's not a modifier
+        "\N{CARON}"
+        "\N{BREVE}"
+        "\N{OGONEK}"
+        "\N{SMALL TILDE}"
+        "\N{DAGGER}"
+        "\N{DOUBLE DAGGER}"
+        "\N{PER MILLE SIGN}"
+        "\N{REVERSED NOT SIGN}"
+        "\N{LOZENGE}"
+        "\ufffd"
+        # Theoretically these would appear in 'numeric' contexts, but when they
+        # co-occur with other mojibake characters, it's not really ambiguous
+        "\N{FEMININE ORDINAL INDICATOR}"
+        "\N{MASCULINE ORDINAL INDICATOR}"
+    ),
+    "currency": (
+        "\N{CENT SIGN}"
+        "\N{POUND SIGN}"
+        "\N{YEN SIGN}"
+        "\N{PESETA SIGN}"
+        "\N{EURO SIGN}"
+    ),
+    "start_punctuation": (
+        "\N{INVERTED EXCLAMATION MARK}"
+        "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}"
+        "\N{INVERTED QUESTION MARK}"
+        "\N{COPYRIGHT SIGN}"
+        "\N{GREEK TONOS}"
+        "\N{GREEK DIALYTIKA TONOS}"
+        "\N{LEFT SINGLE QUOTATION MARK}"
+        "\N{SINGLE LOW-9 QUOTATION MARK}"
+        "\N{LEFT DOUBLE QUOTATION MARK}"
+        "\N{DOUBLE LOW-9 QUOTATION MARK}"
+        "\N{BULLET}"
+        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"
+        "\uf8ff"  # OS-specific symbol, usually the Apple logo
+    ),
+    "end_punctuation": (
+        "\N{REGISTERED SIGN}"
+        "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
+        "\N{DOUBLE ACUTE ACCENT}"
+        "\N{RIGHT DOUBLE QUOTATION MARK}"
+        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"
+        "\N{TRADE MARK SIGN}"
+    ),
+    "numeric": (  # superscripts, fractions, math operators and similar symbols
+        "\N{SUPERSCRIPT TWO}"
+        "\N{SUPERSCRIPT THREE}"
+        "\N{SUPERSCRIPT ONE}"
+        "\N{PLUS-MINUS SIGN}"
+        "\N{VULGAR FRACTION ONE QUARTER}"
+        "\N{VULGAR FRACTION ONE HALF}"
+        "\N{VULGAR FRACTION THREE QUARTERS}"
+        "\N{MULTIPLICATION SIGN}"
+        "\N{MICRO SIGN}"
+        "\N{DIVISION SIGN}"
+        "\N{FRACTION SLASH}"
+        "\N{PARTIAL DIFFERENTIAL}"
+        "\N{INCREMENT}"
+        "\N{N-ARY PRODUCT}"
+        "\N{N-ARY SUMMATION}"
+        "\N{SQUARE ROOT}"
+        "\N{INFINITY}"
+        "\N{INTERSECTION}"
+        "\N{INTEGRAL}"
+        "\N{ALMOST EQUAL TO}"
+        "\N{NOT EQUAL TO}"
+        "\N{IDENTICAL TO}"
+        "\N{LESS-THAN OR EQUAL TO}"
+        "\N{GREATER-THAN OR EQUAL TO}"
+        "\N{NUMERO SIGN}"
+    ),
+    # Letters that might be used to make emoticon faces (kaomoji), and
+    # therefore might need to appear in more improbable-looking contexts.
+    #
+    # These are concatenated character ranges for use in a regex. I know
+    # they look like faces themselves. I think expressing the ranges like
+    # this helps to illustrate why we need to be careful with these
+    # characters.
+    "kaomoji": (
+        "Ò-Ö"
+        "Ù-Ü"
+        "ò-ö"
+        "ø-ü"
+        "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}"
+        "\N{DEGREE SIGN}"
+    ),
+    "upper_accented": (
+        # LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE
+        "\xc0-\xd1"
+        # skip capital O's and U's that could be used in kaomoji, but
+        # include Ø because it's very common in Arabic mojibake:
+        "\N{LATIN CAPITAL LETTER O WITH STROKE}"
+        "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}"
+        "\N{LATIN CAPITAL LETTER Y WITH ACUTE}"
+        "\N{LATIN CAPITAL LETTER A WITH BREVE}"
+        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"
+        "\N{LATIN CAPITAL LETTER C WITH ACUTE}"
+        "\N{LATIN CAPITAL LETTER C WITH CARON}"
+        "\N{LATIN CAPITAL LETTER D WITH CARON}"
+        "\N{LATIN CAPITAL LETTER D WITH STROKE}"
+        "\N{LATIN CAPITAL LETTER E WITH OGONEK}"
+        "\N{LATIN CAPITAL LETTER E WITH CARON}"
+        "\N{LATIN CAPITAL LETTER G WITH BREVE}"
+        "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"
+        "\N{LATIN CAPITAL LETTER L WITH ACUTE}"
+        "\N{LATIN CAPITAL LETTER L WITH CARON}"
+        "\N{LATIN CAPITAL LETTER L WITH STROKE}"
+        "\N{LATIN CAPITAL LETTER N WITH ACUTE}"
+        "\N{LATIN CAPITAL LETTER N WITH CARON}"
+        "\N{LATIN CAPITAL LIGATURE OE}"
+        "\N{LATIN CAPITAL LETTER R WITH CARON}"
+        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"
+        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"
+        "\N{LATIN CAPITAL LETTER S WITH CARON}"
+        "\N{LATIN CAPITAL LETTER T WITH CEDILLA}"
+        "\N{LATIN CAPITAL LETTER T WITH CARON}"
+        "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}"
+        "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}"
+        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"
+        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"
+        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"
+        "\N{LATIN CAPITAL LETTER Z WITH CARON}"
+        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"
+    ),
+    "lower_accented": (
+        "\N{LATIN SMALL LETTER SHARP S}"
+        # LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE
+        "\xe0-\xf1"
+        # skip o's and u's that could be used in kaomoji
+        "\N{LATIN SMALL LETTER A WITH BREVE}"
+        "\N{LATIN SMALL LETTER A WITH OGONEK}"
+        "\N{LATIN SMALL LETTER C WITH ACUTE}"
+        "\N{LATIN SMALL LETTER C WITH CARON}"
+        "\N{LATIN SMALL LETTER D WITH CARON}"
+        "\N{LATIN SMALL LETTER D WITH STROKE}"
+        "\N{LATIN SMALL LETTER E WITH OGONEK}"
+        "\N{LATIN SMALL LETTER E WITH CARON}"
+        "\N{LATIN SMALL LETTER G WITH BREVE}"
+        "\N{LATIN SMALL LETTER L WITH ACUTE}"
+        "\N{LATIN SMALL LETTER L WITH CARON}"
+        "\N{LATIN SMALL LETTER L WITH STROKE}"
+        "\N{LATIN SMALL LIGATURE OE}"
+        "\N{LATIN SMALL LETTER R WITH ACUTE}"
+        "\N{LATIN SMALL LETTER S WITH ACUTE}"
+        "\N{LATIN SMALL LETTER S WITH CEDILLA}"
+        "\N{LATIN SMALL LETTER S WITH CARON}"
+        "\N{LATIN SMALL LETTER T WITH CARON}"
+        "\N{LATIN SMALL LETTER U WITH DIAERESIS}"
+        "\N{LATIN SMALL LETTER Z WITH ACUTE}"
+        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"
+        "\N{LATIN SMALL LETTER Z WITH CARON}"
+        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"
+        "\N{LATIN SMALL LIGATURE FI}"
+        "\N{LATIN SMALL LIGATURE FL}"
+    ),
+    "upper_common": (  # capital thorn plus Greek and Cyrillic capital letters
+        "\N{LATIN CAPITAL LETTER THORN}"
+        "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}"
+        # not included under 'accented' because these can commonly
+        # occur at ends of words, in positions where they'd be detected
+        # as mojibake
+        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"
+        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"
+        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"
+        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"
+        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"
+        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"
+        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"
+        "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}"
+        "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}"
+        "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}"
+    ),
+    "lower_common": (  # Greek and Cyrillic small letters
+        # lowercase thorn does not appear in mojibake
+        "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}"
+        "\N{GREEK SMALL LETTER ALPHA WITH TONOS}"
+        "\N{GREEK SMALL LETTER EPSILON WITH TONOS}"
+        "\N{GREEK SMALL LETTER ETA WITH TONOS}"
+        "\N{GREEK SMALL LETTER IOTA WITH TONOS}"
+        "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}"
+        "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}"
+    ),
+    "box": (  # box-drawing and block-element characters
+        # omit the single horizontal line, might be used in kaomoji
+        "│┌┐┘├┤┬┼"
+        "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}"
+        "▀▄█▌▐░▒▓"
+    ),
+}
+
+# We can now build a regular expression that detects unlikely juxtapositions
+# of characters, mostly based on their categories.
+#
+# Another regular expression, which detects sequences that look more specifically
+# like UTF-8 mojibake, lives in ftfy's chardata.py; it is not part of this module.
+#
+# This is a verbose regular expression, with whitespace added for somewhat more
+# readability. Remember that the only spaces that count as literal spaces in this
+# expression are ones inside character classes (square brackets).
+
+BADNESS_RE = re.compile(
+    r"""
+    [{c1}]
+    |
+    [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}]
+    |
+    [a-zA-Z] [{lower_common}{upper_common}] [{bad}]
+    |
+    [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}]
+    |
+    [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}]
+    |
+    [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}]
+    |
+    # leave out [upper_accented][currency] without further info, because it's used in some
+    # fancy leetspeak-esque writing
+    [{lower_accented}{box}{end_punctuation}] [{currency}]
+    |
+    \s [{upper_accented}] [{currency}]
+    |
+    [{upper_accented}{box}] [{numeric}]
+    |
+    [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}]
+    |
+    [{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}]
+    |
+    [{currency}{numeric}{box}] [{start_punctuation}]
+    |
+    [a-z] [{upper_accented}] [{start_punctuation}{currency}]
+    |
+    [{box}] [{kaomoji}]
+    |
+    [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}]
+    |
+    [{box}] [{end_punctuation}]
+    |
+    [{lower_accented}{upper_accented}] [{end_punctuation}] \w
+    |
+
+    # The ligature œ when not followed by an unaccented Latin letter
+    [Œœ][^A-Za-z]
+    |
+
+    # Common Windows-1252 2-character mojibake that isn't covered by the cases above
+    [ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´]
+    |
+    × [²³]
+    |
+    # Windows-1252 mojibake of Arabic words needs to include the 'common' characters.
+    # To compensate, we require four characters to be matched.
+      [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
+      [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
+    |
+
+    # Windows-1252 mojibake that starts 3-character sequences for some South Asian
+    # alphabets
+    à[²µ¹¼½¾]
+    |
+
+    # MacRoman mojibake that isn't covered by the cases above
+    √[±∂†≠®™´≤≥¥µø]
+    |
+    ≈[°¢]
+    |
+    ‚Ä[ìîïòôúùû†°¢π]
+    |
+    ‚[âó][àä°ê]
+    |
+
+    # Windows-1251 mojibake of characters in the U+2000 range
+    вЂ
+    |
+
+    # Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet.
+    # Because the 2-character sequences involved here may be common, we require
+    # seeing a 3-character sequence.
+    [ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС]
+    |
+    # A distinctive five-character sequence of Cyrillic letters, which can be
+    # Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters.
+    # Require a Latin letter nearby.
+    ГўВЂВ.[A-Za-z ]
+    |
+
+    # Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself
+    Ã[\xa0¡]
+    |
+    [a-z]\s?[ÃÂ][ ]
+    |
+    ^[ÃÂ][ ]
+    |
+
+    # Cases where Â precedes a character as an encoding of exactly the same
+    # character, and the character is common enough
+    [a-z.,?!{end_punctuation}] Â [ {start_punctuation}{end_punctuation}]
+    |
+
+    # Windows-1253 mojibake of characters in the U+2000 range
+    β€[™\xa0Ά\xad®°]
+    |
+
+    # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet
+    [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ]
+""".format(
+        **MOJIBAKE_CATEGORIES  # substitutes each {category} placeholder with that category's characters
+    ),
+    re.VERBOSE,  # whitespace and #-comments outside character classes are not part of the pattern
+)
+
+def badness(text):
+    """
+    Get the 'badness' of a sequence of text, counting the number of unlikely
+    character sequences. A badness greater than 0 indicates that some of it
+    seems to be mojibake.
+    """
+    return len(BADNESS_RE.findall(text))
diff --git a/plugin.py b/plugin.py
index 467fe50..106745b 100644
--- a/plugin.py
+++ b/plugin.py
@@ -58,8 +58,9 @@
 import supybot.callbacks as callbacks
 import supybot.schedule as schedule
 import supybot.registry as registry
-from ftfy.badness import badness
-#from ftfy.badness import text_cost
+
+from . import badness
+
 try:
     from supybot.i18n import PluginInternationalization
     _ = PluginInternationalization('Sigyn')
@@ -3241,7 +3242,7 @@ def isChannelLowHilight (self,irc,msg,channel,mask,text):
     def isChannelUnicode (self,irc,msg,channel,mask,text):
         limit = self.registryValue('badunicodeLimit',channel=channel)
         if limit > 0:
-            score = badness(u'%s' % text)
+            score = badness.badness(u'%s' % text)
             count = self.registryValue('badunicodeScore',channel=channel)
             if count < score:
                 return self.isBadOnChannel(irc,channel,'badunicode',mask)
diff --git a/requirements.txt b/requirements.txt
index 74b491b..22f67aa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 requests==2.27.1
 dnspython==2.2.1
-ftfy==6.1.1
-limnoria==2022.4.27 
\ No newline at end of file
+limnoria==2022.4.27