From ad861d6375490925405f49727bd0882595064f97 Mon Sep 17 00:00:00 2001 From: Eric Mertens Date: Thu, 9 May 2024 19:15:40 -0700 Subject: [PATCH] Extract ftfy's badness computation --- badness.py | 374 +++++++++++++++++++++++++++++++++++++++++++++++ plugin.py | 7 +- requirements.txt | 3 +- 3 files changed, 379 insertions(+), 5 deletions(-) create mode 100644 badness.py diff --git a/badness.py b/badness.py new file mode 100644 index 0000000..8328c4e --- /dev/null +++ b/badness.py @@ -0,0 +1,374 @@ +# This module was extracted from python-ftfy under the terms of the following license +# in order to reduce external project dependencies. + +# Copyright (C) 2013-2018 Robyn Speer (rspeer@luminoso.com) +# MIT License + +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# of the Software, and to permit persons to whom the Software is furnished to do +# so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import re + +# There are only 403 characters that occur in known UTF-8 mojibake, and we can +# characterize them: + +MOJIBAKE_CATEGORIES = { + # Characters that appear in many different contexts. 
Sequences that contain + # them are not inherently mojibake. + "common": ( + "\N{NO-BREAK SPACE}" + "\N{SOFT HYPHEN}" + "\N{MIDDLE DOT}" + "\N{ACUTE ACCENT}" + "\N{EN DASH}" + "\N{EM DASH}" + "\N{HORIZONTAL BAR}" + "\N{HORIZONTAL ELLIPSIS}" + "\N{RIGHT SINGLE QUOTATION MARK}" + ), + # the C1 control character range, which has no use outside of mojibake anymore + "c1": "\x80-\x9f", + # Characters that are nearly 100% used in mojibake + "bad": ( + "\N{BROKEN BAR}" + "\N{CURRENCY SIGN}" + "\N{DIAERESIS}" + "\N{NOT SIGN}" + "\N{MACRON}" + "\N{PILCROW SIGN}" + "\N{SECTION SIGN}" + "\N{CEDILLA}" + "\N{LATIN SMALL LETTER F WITH HOOK}" + "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # it's not a modifier + "\N{CARON}" + "\N{BREVE}" + "\N{OGONEK}" + "\N{SMALL TILDE}" + "\N{DAGGER}" + "\N{DOUBLE DAGGER}" + "\N{PER MILLE SIGN}" + "\N{REVERSED NOT SIGN}" + "\N{LOZENGE}" + "\ufffd" + # Theoretically these would appear in 'numeric' contexts, but when they + # co-occur with other mojibake characters, it's not really ambiguous + "\N{FEMININE ORDINAL INDICATOR}" + "\N{MASCULINE ORDINAL INDICATOR}" + ), + "currency": ( + "\N{CENT SIGN}" + "\N{POUND SIGN}" + "\N{YEN SIGN}" + "\N{PESETA SIGN}" + "\N{EURO SIGN}" + ), + "start_punctuation": ( + "\N{INVERTED EXCLAMATION MARK}" + "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}" + "\N{INVERTED QUESTION MARK}" + "\N{COPYRIGHT SIGN}" + "\N{GREEK TONOS}" + "\N{GREEK DIALYTIKA TONOS}" + "\N{LEFT SINGLE QUOTATION MARK}" + "\N{SINGLE LOW-9 QUOTATION MARK}" + "\N{LEFT DOUBLE QUOTATION MARK}" + "\N{DOUBLE LOW-9 QUOTATION MARK}" + "\N{BULLET}" + "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" + "\uf8ff" # OS-specific symbol, usually the Apple logo + ), + "end_punctuation": ( + "\N{REGISTERED SIGN}" + "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" + "\N{DOUBLE ACUTE ACCENT}" + "\N{RIGHT DOUBLE QUOTATION MARK}" + "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" + "\N{TRADE MARK SIGN}" + ), + "numeric": ( + "\N{SUPERSCRIPT TWO}" + "\N{SUPERSCRIPT THREE}" 
+ "\N{SUPERSCRIPT ONE}" + "\N{PLUS-MINUS SIGN}" + "\N{VULGAR FRACTION ONE QUARTER}" + "\N{VULGAR FRACTION ONE HALF}" + "\N{VULGAR FRACTION THREE QUARTERS}" + "\N{MULTIPLICATION SIGN}" + "\N{MICRO SIGN}" + "\N{DIVISION SIGN}" + "\N{FRACTION SLASH}" + "\N{PARTIAL DIFFERENTIAL}" + "\N{INCREMENT}" + "\N{N-ARY PRODUCT}" + "\N{N-ARY SUMMATION}" + "\N{SQUARE ROOT}" + "\N{INFINITY}" + "\N{INTERSECTION}" + "\N{INTEGRAL}" + "\N{ALMOST EQUAL TO}" + "\N{NOT EQUAL TO}" + "\N{IDENTICAL TO}" + "\N{LESS-THAN OR EQUAL TO}" + "\N{GREATER-THAN OR EQUAL TO}" + "\N{NUMERO SIGN}" + ), + # Letters that might be used to make emoticon faces (kaomoji), and + # therefore might need to appear in more improbable-looking contexts. + # + # These are concatenated character ranges for use in a regex. I know + # they look like faces themselves. I think expressing the ranges like + # this helps to illustrate why we need to be careful with these + # characters. + "kaomoji": ( + "Ò-Ö" + "Ù-Ü" + "ò-ö" + "ø-ü" + "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" + "\N{DEGREE SIGN}" + ), + "upper_accented": ( + # LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE + "\xc0-\xd1" + # skip capital O's and U's that could be used in kaomoji, but + # include Ø because it's very common in Arabic mojibake: + "\N{LATIN CAPITAL LETTER O WITH STROKE}" + "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" + "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" + "\N{LATIN CAPITAL LETTER A WITH BREVE}" + "\N{LATIN CAPITAL LETTER A WITH OGONEK}" + "\N{LATIN CAPITAL LETTER C WITH ACUTE}" + "\N{LATIN CAPITAL LETTER C WITH CARON}" + "\N{LATIN CAPITAL LETTER D WITH CARON}" + "\N{LATIN CAPITAL LETTER D WITH STROKE}" + "\N{LATIN CAPITAL LETTER E WITH OGONEK}" + "\N{LATIN CAPITAL LETTER E WITH CARON}" + "\N{LATIN CAPITAL LETTER G WITH BREVE}" + "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" + "\N{LATIN CAPITAL LETTER L WITH ACUTE}" + "\N{LATIN CAPITAL LETTER L WITH CARON}" + "\N{LATIN CAPITAL LETTER L WITH STROKE}" + "\N{LATIN CAPITAL 
LETTER N WITH ACUTE}" + "\N{LATIN CAPITAL LETTER N WITH CARON}" + "\N{LATIN CAPITAL LIGATURE OE}" + "\N{LATIN CAPITAL LETTER R WITH CARON}" + "\N{LATIN CAPITAL LETTER S WITH ACUTE}" + "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" + "\N{LATIN CAPITAL LETTER S WITH CARON}" + "\N{LATIN CAPITAL LETTER T WITH CEDILLA}" + "\N{LATIN CAPITAL LETTER T WITH CARON}" + "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}" + "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}" + "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" + "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" + "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" + "\N{LATIN CAPITAL LETTER Z WITH CARON}" + "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" + ), + "lower_accented": ( + "\N{LATIN SMALL LETTER SHARP S}" + # LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE + "\xe0-\xf1" + # skip o's and u's that could be used in kaomoji + "\N{LATIN SMALL LETTER A WITH BREVE}" + "\N{LATIN SMALL LETTER A WITH OGONEK}" + "\N{LATIN SMALL LETTER C WITH ACUTE}" + "\N{LATIN SMALL LETTER C WITH CARON}" + "\N{LATIN SMALL LETTER D WITH CARON}" + "\N{LATIN SMALL LETTER D WITH STROKE}" + "\N{LATIN SMALL LETTER E WITH OGONEK}" + "\N{LATIN SMALL LETTER E WITH CARON}" + "\N{LATIN SMALL LETTER G WITH BREVE}" + "\N{LATIN SMALL LETTER L WITH ACUTE}" + "\N{LATIN SMALL LETTER L WITH CARON}" + "\N{LATIN SMALL LETTER L WITH STROKE}" + "\N{LATIN SMALL LIGATURE OE}" + "\N{LATIN SMALL LETTER R WITH ACUTE}" + "\N{LATIN SMALL LETTER S WITH ACUTE}" + "\N{LATIN SMALL LETTER S WITH CEDILLA}" + "\N{LATIN SMALL LETTER S WITH CARON}" + "\N{LATIN SMALL LETTER T WITH CARON}" + "\N{LATIN SMALL LETTER U WITH DIAERESIS}" + "\N{LATIN SMALL LETTER Z WITH ACUTE}" + "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" + "\N{LATIN SMALL LETTER Z WITH CARON}" + "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" + "\N{LATIN SMALL LIGATURE FI}" + "\N{LATIN SMALL LIGATURE FL}" + ), + "upper_common": ( + "\N{LATIN CAPITAL LETTER THORN}" + "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}" + # 
not included under 'accented' because these can commonly + # occur at ends of words, in positions where they'd be detected + # as mojibake + "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" + "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" + "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" + "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" + "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" + "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" + "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" + "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}" + "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}" + "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}" + ), + "lower_common": ( + # lowercase thorn does not appear in mojibake + "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}" + "\N{GREEK SMALL LETTER ALPHA WITH TONOS}" + "\N{GREEK SMALL LETTER EPSILON WITH TONOS}" + "\N{GREEK SMALL LETTER ETA WITH TONOS}" + "\N{GREEK SMALL LETTER IOTA WITH TONOS}" + "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}" + "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}" + ), + "box": ( + # omit the single horizontal line, might be used in kaomoji + "│┌┐┘├┤┬┼" + "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}" + "▀▄█▌▐░▒▓" + ), +} + +# We can now build a regular expression that detects unlikely juxtapositions +# of characters, mostly based on their categories. +# +# Another regular expression, which detects sequences that look more specifically +# like UTF-8 mojibake, lives in ftfy's chardata.py; this patch does not add it. +# +# This is a verbose regular expression, with whitespace added for somewhat more +# readability. Remember that the only spaces that count as literal spaces in this +# expression are ones inside character classes (square brackets). 
+ +BADNESS_RE = re.compile( + r""" + [{c1}] + | + [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] [{bad}] + | + [a-zA-Z] [{lower_common}{upper_common}] [{bad}] + | + [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}] + | + [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}] + | + [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}] + | + # leave out [upper_accented][currency] without further info, because it's used in some + # fancy leetspeak-esque writing + [{lower_accented}{box}{end_punctuation}] [{currency}] + | + \s [{upper_accented}] [{currency}] + | + [{upper_accented}{box}] [{numeric}] + | + [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}] + | + [{lower_accented}{upper_accented}{currency}{numeric}{box}] [{end_punctuation}] [{start_punctuation}] + | + [{currency}{numeric}{box}] [{start_punctuation}] + | + [a-z] [{upper_accented}] [{start_punctuation}{currency}] + | + [{box}] [{kaomoji}] + | + [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}] [{box}] + | + [{box}] [{end_punctuation}] + | + [{lower_accented}{upper_accented}] [{end_punctuation}] \w + | + + # The ligature œ when not followed by an unaccented Latin letter + [Œœ][^A-Za-z] + | + + # Common Windows-1252 2-character mojibake that isn't covered by the cases above + [ÂÃÎÐ][€Šš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´] + | + × [²³] + | + # Windows-1252 mojibake of Arabic words needs to include the 'common' characters. + # To compensate, we require four characters to be matched. 
+ [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] + [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»] + | + + # Windows-1252 mojibake that starts 3-character sequences for some South Asian + # alphabets + à[²µ¹¼½¾] + | + + # MacRoman mojibake that isn't covered by the cases above + √[±∂†≠®™´≤≥¥µø] + | + ≈[°¢] + | + ‚Ä[ìîïòôúùû†°¢π] + | + ‚[âó][àä°ê] + | + + # Windows-1251 mojibake of characters in the U+2000 range + вЂ + | + + # Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet. + # Because the 2-character sequences involved here may be common, we require + # seeing a 3-character sequence. + [ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС] + | + # A distinctive five-character sequence of Cyrillic letters, which can be + # Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters. + # Require a Latin letter nearby. + ГўВЂВ.[A-Za-z ] + | + + # Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself + Ã[\xa0¡] + | + [a-z]\s?[ÃÂ][ ] + | + ^[ÃÂ][ ] + | + + # Cases where Â precedes a character as an encoding of exactly the same + # character, and the character is common enough + [a-z.,?!{end_punctuation}] Â [ {start_punctuation}{end_punctuation}] + | + + # Windows-1253 mojibake of characters in the U+2000 range + β€[™\xa0Ά\xad®°] + | + + # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet + [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ] +""".format( + **MOJIBAKE_CATEGORIES + ), + re.VERBOSE, +) + +def badness(text): + """ + Get the 'badness' of a sequence of text, counting the number of unlikely + character sequences. A badness greater than 0 indicates that some of it + seems to be mojibake. 
+ """ + return len(BADNESS_RE.findall(text)) diff --git a/plugin.py b/plugin.py index 467fe50..106745b 100644 --- a/plugin.py +++ b/plugin.py @@ -58,8 +58,9 @@ import supybot.callbacks as callbacks import supybot.schedule as schedule import supybot.registry as registry -from ftfy.badness import badness -#from ftfy.badness import text_cost + +from . import badness + try: from supybot.i18n import PluginInternationalization _ = PluginInternationalization('Sigyn') @@ -3241,7 +3242,7 @@ def isChannelLowHilight (self,irc,msg,channel,mask,text): def isChannelUnicode (self,irc,msg,channel,mask,text): limit = self.registryValue('badunicodeLimit',channel=channel) if limit > 0: - score = badness(u'%s' % text) + score = badness.badness(u'%s' % text) count = self.registryValue('badunicodeScore',channel=channel) if count < score: return self.isBadOnChannel(irc,channel,'badunicode',mask) diff --git a/requirements.txt b/requirements.txt index 74b491b..22f67aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ requests==2.27.1 dnspython==2.2.1 -ftfy==6.1.1 -limnoria==2022.4.27 \ No newline at end of file +limnoria==2022.4.27