-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunicode.py
133 lines (122 loc) · 3.85 KB
/
unicode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import sys, re, unicodedata, html
import icu # pip install PyICU
def graphemes(s):
# ICU returns UTF-16 offsets, but we're working at the code point-level.
itor = icu.BreakIterator.createCharacterInstance(icu.Locale())
itor.setText(s)
p, q, p16 = 0, 0, 0
for q16 in itor:
while p16 < q16:
if ord(s[q]) > 0xffff:
p16 += 1
assert p16 < q16
p16 += 1
q += 1
yield s[p:q]
p = q
vowels = {
"a", "e", "i", "o", "u",
"\N{latin small letter schwa}",
"ṃ", "ṁ", "m\N{combining candrabindu}",
"ṛ", "ṝ", "ḷ", "ḹ",
"r\N{combining ring below}", "r\N{combining ring below}\N{combining macron}",
"l\N{combining ring below}", "l\N{combining ring below}\N{combining macron}",
"ḥ"
}
def hyphenate(s):
return s
# TODO reenable only for mobile+tablet or find a way to exclude soft hyphenate
# from clipboard copy, see https://stackoverflow.com/questions/28837944/simplest-way-to-filter-text-copied-from-a-web-page-using-javascript
# or use css for including the hyphen
# TODO try
"""
.hyphen::before {
content: '\00AD';
}
and then
hel<span class=".hyphen"/>lo
# NO, the css solution does not work
# TODO: or just don't justify the text when it looks like it won't fit correctly
# (not enough spaces and hyphens), or maybe don't use justification at all.
"""
# def hyphenate(s):
# buf = ""
# brk = False
# for t in re.split(r"(\s+|-)", s):
# if t.isspace() or t == "-" or "\N{soft hyphen}" in t or len(t) <= 6:
# buf += t
# continue
# for g in graphemes(t):
# gn = unicodedata.normalize("NFD", g).casefold()
# if gn[0] in vowels:
# brk = True
# buf += g
# else:
# if brk:
# buf += "\N{soft hyphen}"
# brk = False
# buf += g
# buf = buf.rstrip("\N{soft hyphen}")
# return buf
def char_name(c):
try:
return unicodedata.name(c)
except ValueError:
return "U+%04X" % ord(c)
SCRIPTS = ("TAMIL", "TELUGU", "KHMER", "DEVANAGARI", "GRANTHA", "KANNADA", "BALINESE")
def script(g):
# Not really correct, but enough for this purpose
names = [char_name(c) for c in g]
for s in SCRIPTS:
if all(name.startswith(s + " ") for name in names):
return s
valid_combinations = set("""
\N{LATIN SMALL LETTER R}\N{COMBINING RING BELOW}
\N{LATIN CAPITAL LETTER L}\N{COMBINING RING BELOW}
\N{LATIN CAPITAL LETTER R}\N{COMBINING RING BELOW}
\N{LATIN SMALL LETTER M}\N{COMBINING TILDE}
\N{LATIN SMALL LETTER SCHWA}\N{COMBINING MACRON}
\N{LATIN SMALL LETTER L}\N{COMBINING RING BELOW}
\N{LATIN SMALL LETTER M}\N{COMBINING CANDRABINDU}
\N{LATIN SMALL LETTER R}\N{COMBINING RING BELOW}\N{COMBINING MACRON}
\N{LATIN CAPITAL LETTER L}\N{COMBINING RING BELOW}\N{COMBINING MACRON}
\N{LATIN CAPITAL LETTER R}\N{COMBINING RING BELOW}\N{COMBINING MACRON}
\N{LATIN SMALL LETTER L}\N{COMBINING RING BELOW}\N{COMBINING MACRON}
\N{LATIN SMALL LETTER N}\N{COMBINING BREVE}
""".strip().splitlines())
def is_valid(g):
if len(g) == 1:
# Andrea always uses this for some reason
if g[0] == "\N{CYRILLIC SMALL LETTER SCHWA}":
return True
return not char_name(g[0]).startswith("CYRILLIC ")
return script(g) or g in valid_combinations
def validate(s):
problems = []
# We don't use str.splitlines() because it counts U+2028 and U+2029 as
# line separators, while github, oxygen and normal text editors don't.
s = unicodedata.normalize("NFC", s)
lines = re.split(r"\r|\n|\r\n", s)
for line_no, line in enumerate(lines, 1):
gs = list(graphemes(line))
for i, g in enumerate(gs):
if is_valid(g):
continue
hl = html.escape("".join(gs[:i]))
hl += "<mark>%s</mark>" % html.escape(g)
hl += html.escape("".join(gs[i + 1:]))
names = [char_name(c) for c in g]
problem = {
"line_no": line_no,
"highlighted_line": hl,
"grapheme": names,
}
problems.append(problem)
return problems
if __name__ == "__main__":
for line in sys.stdin:
p = 0
for g in graphemes(line):
names = [char_name(c) for c in g]
print(p, names)
p += len(g)