-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkanjidic2.py
49 lines (42 loc) · 1.71 KB
/
kanjidic2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from xml.sax import parse as parse_sax
from xml.sax.handler import ContentHandler
def parse(path):
    """Parse the XML document at *path* and return the two lookup tables.

    *path* is handed unchanged to ``xml.sax.parse`` together with a fresh
    ``KanjiDic2Handler``.

    Returns a ``(kanji_to_info, meaning_to_kanji)`` tuple taken from the
    handler once parsing has finished.
    """
    collector = KanjiDic2Handler()
    parse_sax(path, collector)
    tables = (collector.kanji_to_info, collector.meaning_to_kanji)
    return tables
class KanjiDic2Handler(ContentHandler):
    """SAX handler that collects per-kanji data from ``<character>`` elements.

    After parsing, two dictionaries are available:

    * ``kanji_to_info`` maps the text of each ``<literal>`` to a dict with
      the keys ``'meanings'``, ``'kunyomi'`` and ``'onyomi'`` (lists of
      strings, in document order).
    * ``meaning_to_kanji`` maps each lower-cased meaning to the list of
      literals that carry it.

    Only ``<meaning>`` elements *without* an ``m_lang`` attribute are kept
    (presumably those are the English glosses -- confirm against the data
    format).  Only ``<reading>`` elements whose ``r_type`` is ``ja_kun`` or
    ``ja_on`` are kept.
    """

    # Elements whose text content is recorded.
    _TEXT_ELEMENTS = frozenset(('literal', 'meaning', 'reading'))

    def __init__(self) -> None:
        super().__init__()
        self.kanji_to_info = {}
        self.meaning_to_kanji = {}
        self.kanji = None
        self.kanji_info = None
        self.element_name = None
        self.element_attr = None
        # Text chunks received so far for the element currently open.
        self._text_parts = []

    def startElement(self, name, attrs):
        """Remember the element being opened; start a new record on ``<character>``."""
        # Commit any text seen so far for the enclosing element before the
        # current-element bookkeeping is overwritten.
        self._flush_text()
        self.element_name = name
        self.element_attr = attrs
        if name == 'character':
            self.kanji = None
            self.kanji_info = {'meanings': [], 'kunyomi': [], 'onyomi': []}

    def endElement(self, name):
        """Commit buffered text; on ``</character>`` store the finished record."""
        self._flush_text()
        # Clearing the name makes characters() ignore text (e.g. whitespace)
        # that appears between elements.
        self.element_name = None
        self.element_attr = None
        if name == 'character':
            self.kanji_to_info[self.kanji] = self.kanji_info
            for meaning in self.kanji_info['meanings']:
                self.meaning_to_kanji.setdefault(meaning.lower(), []).append(self.kanji)

    def characters(self, content):
        """Buffer a chunk of text for the current element.

        A SAX parser may deliver the text of a single element in several
        calls, so chunks are only accumulated here and are joined when the
        element is closed.
        """
        if self.element_name in self._TEXT_ELEMENTS:
            self._text_parts.append(content)

    def _flush_text(self):
        """Join the buffered chunks and record them for the current element."""
        if not self._text_parts:
            return
        text = ''.join(self._text_parts)
        self._text_parts = []
        name = self.element_name
        attrs = self.element_attr
        if name == 'literal':
            self.kanji = text
        elif name == 'meaning':
            if 'm_lang' not in attrs:
                self.kanji_info['meanings'].append(text)
        elif name == 'reading':
            # .get() so that a reading without r_type is skipped, not fatal.
            r_type = attrs.get('r_type')
            if r_type == 'ja_kun':
                self.kanji_info['kunyomi'].append(text)
            elif r_type == 'ja_on':
                self.kanji_info['onyomi'].append(text)