From 59ff25798917cb5fb88088e50b3913e8fabeee17 Mon Sep 17 00:00:00 2001
From: Felix Yan
Date: Tue, 26 May 2020 17:07:40 +0800
Subject: [PATCH] Add web slang generation

---
Review notes: the `clean` recipe no longer relies on bash brace expansion
(make runs recipes with /bin/sh); the script closes the HTTP response,
skips empty words and table markup rows, and prints in sorted order.
The post-image blob ids on the `index` lines below are carried over from
the earlier revision of this patch.

 Makefile            | 13 ++++++-
 zhwiki-web-slang.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100755 zhwiki-web-slang.py

diff --git a/Makefile b/Makefile
index 4a59191..d062e73 100644
--- a/Makefile
+++ b/Makefile
@@ -9,14 +9,23 @@ download: $(FILENAME).gz
 $(FILENAME).gz:
 	wget https://dumps.wikimedia.org/zhwiki/20200501/$(FILENAME).gz
 
+web-slang.source:
+	./zhwiki-web-slang.py > web-slang.source
+
 $(FILENAME): $(FILENAME).gz
 	gzip -k -d $(FILENAME).gz
 
-zhwiki.raw: $(FILENAME)
-	./convert.py $(FILENAME) > zhwiki.raw
+zhwiki.source: $(FILENAME) web-slang.source
+	cat $(FILENAME) web-slang.source > zhwiki.source
+
+zhwiki.raw: zhwiki.source
+	./convert.py zhwiki.source > zhwiki.raw
 
 zhwiki.dict: zhwiki.raw
 	libime_pinyindict zhwiki.raw zhwiki.dict
 
 install: zhwiki.dict
 	install -Dm644 zhwiki.dict -t $(DESTDIR)/usr/share/fcitx5/pinyin/dictionaries/
+
+clean:
+	rm -f $(FILENAME) zhwiki.source zhwiki.raw zhwiki.dict web-slang.source
diff --git a/zhwiki-web-slang.py b/zhwiki-web-slang.py
new file mode 100755
index 0000000..a6b8491
--- /dev/null
+++ b/zhwiki-web-slang.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Print words from the zhwiki web slang list page, one per line.
+
+The page wikitext is fetched through the MediaWiki ``action=parse`` API and
+candidate words are taken from its list items and table rows.
+"""
+
+import json
+import urllib.parse
+import urllib.request
+
+_ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
+_PAGE = "中国大陆网络用语列表"
+
+# Collected words; a set so that repeated entries are emitted only once.
+words = set()
+
+
+def fetch_wikitext():
+    """Download the list page and return its wikitext."""
+    url = _ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE)
+    # Close the HTTP response deterministically instead of leaking it.
+    with urllib.request.urlopen(url) as response:
+        return json.loads(response.read())["parse"]["wikitext"]
+
+
+def add_word(word):
+    """Clean up a single candidate and add it to ``words``.
+
+    Candidates starting with "形容" are skipped, as are candidates that are
+    empty once the markup characters have been removed.
+    """
+    if word.startswith("形容"):
+        return
+    for garbage in ("、", "[", "]", "…"):
+        word = word.replace(garbage, "")
+    word = word.strip()
+    if word:
+        words.add(word)
+
+
+def add_words(word):
+    """Split ``word`` on the first listed separator it contains.
+
+    Each part is resolved recursively; text without any separator is passed
+    to ``add_word``.
+    """
+    for word_separator in ("、", "/", "|", ",", "。"):
+        if word_separator in word:
+            for w in word.split(word_separator):
+                # recursively resolve
+                add_words(w.strip())
+            break
+    else:
+        add_word(word)
+
+
+def process(wikitext):
+    """Collect the words found in the list items and table rows."""
+    for line in wikitext.split("\n"):
+        if line.startswith("*"):
+            # Lists
+            # NOTE(review): both entries below are the same character; one
+            # was presumably meant to be the full-width colon - confirm.
+            for table_separator in (":", ":"):
+                if table_separator in line:
+                    word = line.split(table_separator)[0].strip("*").strip()
+                    add_words(word)
+                    break
+        elif line.startswith(("|-", "|}", "|+")):
+            # Row separator, table end and caption markup hold no words.
+            continue
+        elif line.startswith("|"):
+            # Tables
+            word = line.split("|")[1]
+            add_words(word)
+
+
+def main():
+    """Fetch the page and print the collected words in sorted order."""
+    process(fetch_wikitext())
+    # Sorted so that the generated file is identical between runs.
+    for word in sorted(words):
+        print(word)
+
+
+if __name__ == "__main__":
+    main()