-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsort.py
executable file
·107 lines (81 loc) · 2.55 KB
/
sort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python3
"""
# sort.py
Sort (in-place) certain sections of `phrases-traditional.txt` and `phrases-simplified.txt`.
Licensed under "MIT No Attribution" (MIT-0), see <https://spdx.org/licenses/MIT-0>.
"""
import re
# Inclusive bounds of the main CJK Unified Ideographs block (U+4E00 to U+9FFF).
CJK_UNIFIED_IDEOGRAPHS_START = 0x4E00
CJK_UNIFIED_IDEOGRAPHS_END = 0x9FFF

# Files whose tagged sections are sorted in place by this script.
PHRASE_FILE_NAMES = [
    'phrases-traditional.txt',
    'phrases-simplified.txt',
]
def character_sorting_key(character):
    """
    Build the sort key for a single Chinese character.

    The key is the pair `(block_rank, code_point)`. `block_rank` is 0 for
    characters inside the main CJK Unified Ideographs block (U+4E00 to U+9FFF)
    and 1 for everything else, so main-block characters sort first; within a
    rank, characters are ordered by code point.
    """
    code_point = ord(character)
    in_main_block = CJK_UNIFIED_IDEOGRAPHS_START <= code_point <= CJK_UNIFIED_IDEOGRAPHS_END
    return (0 if in_main_block else 1), code_point
def phrase_sorting_key(phrase):
    """
    Build the sort key for a phrase.

    The key is the tuple of `character_sorting_key` results for each character
    of the phrase, in order, so phrases compare character by character.
    """
    return tuple(map(character_sorting_key, phrase))
def sort_variant_sections(text):
"""
Sort the variant sections of the supplied text.
"""
return re.sub(
r'(?<=^# <variants>\n)[\s\S]+?(?=\n# </variants>)',
lambda match: '\n'.join(
sorted(
set(sort_variants(line) for line in match.group().splitlines()), # intra-line sorting
key=phrase_sorting_key, # inter-line sorting
)
),
text,
flags=re.MULTILINE,
)
def sort_variants(line):
    """
    Sort the variants within the supplied line.

    Only a line that begins with `# ` is affected: the run of non-whitespace
    characters right after that prefix is replaced by its distinct characters,
    ordered with `character_sorting_key`. Any other line is returned unchanged.
    """

    def reorder(found):
        distinct_characters = set(found.group())
        return ''.join(sorted(distinct_characters, key=character_sorting_key))

    return re.sub(r'(?<=^# )\S+', reorder, line)
def sort_phrase_sections(text):
"""
Sort the phrase sections of the supplied text.
"""
return re.sub(
r'(?<=^# <phrases>\n)[\s\S]+?(?=\n# </phrases>)',
lambda match: '\n'.join(
sorted(
set(match.group().splitlines()),
key=phrase_sorting_key,
)
),
text,
flags=re.MULTILINE,
)
def main():
    """
    Sort the variant and phrase sections of every file in `PHRASE_FILE_NAMES`.

    Each file is read as UTF-8, passed through `sort_variant_sections` and then
    `sort_phrase_sections`, and written back only if the result differs from
    what was read.
    """
    for file_name in PHRASE_FILE_NAMES:
        with open(file_name, 'r', encoding='utf-8') as source:
            original = source.read()

        resorted = sort_phrase_sections(sort_variant_sections(original))

        # Leave files that are already sorted untouched.
        if resorted != original:
            with open(file_name, 'w', encoding='utf-8') as target:
                target.write(resorted)


if __name__ == '__main__':
    main()