From 9c3f4614436d0566951d9062cf83f0cddaa0f18e Mon Sep 17 00:00:00 2001
From: Felix Yan
Date: Sat, 23 May 2020 15:38:21 +0800
Subject: [PATCH] Initial version

---
 Makefile   | 22 ++++++++++++++++++++++
 README     | 14 ++++++++++++++
 convert.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)
 create mode 100644 Makefile
 create mode 100644 README
 create mode 100755 convert.py

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4a59191
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,22 @@
+FILENAME=zhwiki-20200501-all-titles-in-ns0
+
+all: build
+
+build: zhwiki.dict
+
+download: $(FILENAME).gz
+
+$(FILENAME).gz:
+	wget https://dumps.wikimedia.org/zhwiki/20200501/$(FILENAME).gz
+
+$(FILENAME): $(FILENAME).gz
+	gzip -k -d $(FILENAME).gz
+
+zhwiki.raw: $(FILENAME)
+	./convert.py $(FILENAME) > zhwiki.raw
+
+zhwiki.dict: zhwiki.raw
+	libime_pinyindict zhwiki.raw zhwiki.dict
+
+install: zhwiki.dict
+	install -Dm644 zhwiki.dict -t $(DESTDIR)/usr/share/fcitx5/pinyin/dictionaries/
diff --git a/README b/README
new file mode 100644
index 0000000..87a536e
--- /dev/null
+++ b/README
@@ -0,0 +1,14 @@
+zhwiki dictionary for fcitx5-pinyin
+
+
+Build time requirements:
+
+Python modules:
+opencc
+pypinyin
+
+
+Installation:
+
+make
+sudo make install
diff --git a/convert.py b/convert.py
new file mode 100755
index 0000000..e968d19
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,43 @@
+#!/bin/python
+"""Convert a list of page titles into raw input for libime_pinyindict.
+
+Usage: convert.py TITLES_FILE > zhwiki.raw
+
+Every title made up solely of Chinese characters is printed to stdout as three
+tab-separated fields: the title converted with OpenCC's t2s.json configuration,
+its pinyin syllables joined with apostrophes, and a literal 0.  Progress and
+conversion failures are reported on stderr.
+"""
+import sys
+import re
+import opencc
+from pypinyin import lazy_pinyin
+converter = opencc.OpenCC('t2s.json')
+
+FILE = sys.argv[1]
+
+# Accept only titles made up entirely of characters in U+4E00..U+9FA5.
+HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
+count = 0
+# Decode explicitly as UTF-8 so the result does not depend on the locale.
+with open(FILE, encoding='utf-8') as f:
+    for line in f:
+        line = line.rstrip("\n")
+        if not HANZI_RE.match(line):
+            continue
+
+        syllables = lazy_pinyin(line)
+        pinyin = "'".join(syllables)
+        # By default pypinyin leaves characters it cannot convert untouched,
+        # so a syllable that is still Chinese text means conversion failed.
+        if any(HANZI_RE.match(s) for s in syllables):
+            print("Failed to convert, ignoring:", pinyin, file=sys.stderr)
+            continue
+
+        print("\t".join((converter.convert(line), pinyin, "0")))
+        count += 1
+        if count % 1000 == 0:
+            print(str(count) + " converted", file=sys.stderr)
+
+if count % 1000 != 0:
+    print(str(count) + " converted", file=sys.stderr)