Skip to content

Commit

Permalink
0109
Browse files Browse the repository at this point in the history
  • Loading branch information
osfans committed Jan 9, 2025
1 parent 3a5cfbb commit 0150b4a
Show file tree
Hide file tree
Showing 97 changed files with 27,306 additions and 3,920,901 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
pushd tools
python3 -m pip install -r requirements.txt
touch tables/*.py
python3 make.py
python3 make.py -c
popd
- name: Grant execute permission for gradlew
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/push-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ jobs:
pushd tools
python3 -m pip install -r requirements.txt
touch tables/*.py
python3 make.py
python3 make.py -c
popd
3 changes: 2 additions & 1 deletion tools/make.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
import argparse

parser = argparse.ArgumentParser(description='Create mcpdict database')
parser.add_argument('-c', action='store_true', help='check 同音字頻', required=False)
parser.add_argument('-省', help='province to include', required=False)
args, argv = parser.parse_known_args()
start = time()

dicts = defaultdict(dict)
langs = getLangs(dicts, argv, =args.)
langs = getLangs(dicts, argv, args)
keys = [f"{lang.簡稱}" for lang in langs]
fields = [f"`{i}`" for i in keys]
CREATE = 'CREATE VIRTUAL TABLE mcpdict USING fts3 (%s)' % (",".join(fields))
Expand Down
66 changes: 43 additions & 23 deletions tools/tables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,47 @@ def getLangsByArgv(infos, argv):
break
return l

def getLangs(dicts, 參數, =None):
def 獲取同音字頻(get=False):
if not get: return
同音字頻 = defaultdict(int)
詳情 = tables._詳情.加載()
for mod,d in 詳情.items():
try:
if d["文件格式"]:
= import_module(f'tables._{d["文件格式"]}').()
.setmod(mod)
else:
= import_module(f"tables.{mod}").()
if not .文件名: .文件名 = d["文件名"]
except:
continue
if "繁" not in d["繁簡"]: .simplified = 2
if d["地圖集二分區"] == None: d["地圖集二分區"] = ""
if "聯表列名" in d:
a = d["聯表列名"].upper()
.音列 = sum([26**(len(a)-1-i)*(ord(j)-ord('A')+1) for i,j in enumerate(a)]) - 1
if d["聲調"]:
調典 = dict()
調組 = json.loads(d["聲調"])
for 調 in 調組:
調值 = 調組[調][0]
if 調值 in 調典 and "入" in 調組[調][3]:
調值 += "0"
調典[調值] = 調
.調典 = 調典
.info = d
.加載()
if .音節數 > 0:
for 字組 in .聲韻典.values():
if len(字組) < 2: continue
for in combinations(字組, 2):
雙字 = "".join(sorted())
同音字頻[雙字] += 1
return 同音字頻

def getLangs(dicts, 參數, args):
= args.
同音字頻 = 獲取同音字頻(args.c)
詳情 = tables._詳情.加載()
語組 = []
= 0
Expand All @@ -158,14 +198,6 @@ def getLangs(dicts, 參數, 省=None):
推薦人 = defaultdict(int)
維護人 = defaultdict(int)
keys = None
同音字頻 = defaultdict(int)
同音字頻表 = os.path.exists("同音字頻.tsv")
if 同音字頻表:
t = open("同音字頻.tsv", "r", encoding="U8")
for line in t:
, = line.strip().split("\t")
同音字頻[] = int()
t.close()
t = open("warnings.txt", "w", encoding="U16")
for mod in mods:
if mod in 詳情:
Expand Down Expand Up @@ -200,7 +232,7 @@ def getLangs(dicts, 參數, 省=None):
調典[調值] = 調
.調典 = 調典
.info = d
.加載(dicts)
.加載(dicts, 更新=args.c)
if d["文件名"] != "mcpdict.db":
if .字數 == 0: continue
if .字數 < 900:
Expand All @@ -226,7 +258,7 @@ def getLangs(dicts, 參數, 省=None):
if :
維護人[] += 1
+= 1
if 同音字頻表:
if 同音字頻:
if .檢查同音字() and .字數 < 10000:
for , 字組 in .聲韻典.items():
if len(字組) < 2: continue
Expand All @@ -239,12 +271,6 @@ def getLangs(dicts, 參數, 省=None):
字頻 += 同音字頻["".join(sorted((字甲, 字乙)))]
if 字頻 < 1.8 * n:
..append(f"{字甲}可能不讀[{}]{''.join(字組乙)[:4]}")
elif .音節數 > 0:
for 字組 in .聲韻典.values():
if len(字組) < 2: continue
for in combinations(字組, 2):
雙字 = "".join(sorted())
同音字頻[雙字] += 1
.info["解析日志"] = None
.info["同音字表"] = None
if .:
Expand Down Expand Up @@ -295,12 +321,6 @@ def getLangs(dicts, 參數, 省=None):
if not keys: keys = .info.keys()
語組.append()
t.close()
if not 同音字頻表:
t = open("同音字頻.tsv", "w", encoding="U8")
for i, j in 同音字頻.items():
if j > 1:
t.write(f"{i}\t{j}\n")
t.close()
= 語組[0]
for in keys:
if not in .info: .info[] = None
Expand Down
9 changes: 6 additions & 3 deletions tools/tables/_縣志.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@ def 統(自, 行):
= .lstrip("ø")
elif in ("江夏湖泗"):
= .replace("ø[", "0[")
elif in ("遂川","大庸南","大庸北", "婺川", "蒙山程村","欽州東場", "陽朔鳳樓"):
elif in ("遂川","大庸南","大庸北", "婺川", "蒙山程村","欽州東場", "陽朔鳳樓","桑植芙蓉橋"):
= .行轉調類()
elif in ("道眞"):
= .行轉調類().lstrip("ø")
elif in ("奉化",):
= .行轉調類(, r"(\d+)(?![:\d])")
elif in ("巢湖",):
= .normS()
if .startswith(" ") and not .startswith(" #"): = "Ø" +
elif in ("崇仁"):
= .normS()
elif in ("羅山","贛縣安平"):
Expand Down Expand Up @@ -237,7 +238,9 @@ def 析韻(自, 行):
elif "[" in or "]" in : return
=
if : = .split("\t")[0].strip().strip("[]")
if 有字(): = ""
if 有字():
if .: ..append(f"[{}]前不應斷行,或不是合法韻母")
return .
. =
if : .韻組.append()
return
Expand Down Expand Up @@ -303,7 +306,7 @@ def 更新(自):
異讀 = ""
= [1:-1]
if .count("{") != .count("}"):
..append(f"大括號未成對:{}")
..append(f"括號未成對:{}")
= .replace("{", "").replace("}", "")
音義 = + 異讀 + "\t" + 序號 + 音義 +
if 音義 not in []:
Expand Down
32 changes: 17 additions & 15 deletions tools/tables/_表.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,10 @@ def docx2tsv(doc):
lines = []
for each in Document(doc).paragraphs:
= "".join(map(run2text, each.runs)).replace("}{", "")
lines.append( + "\n")
lines.append()
= "\n".join(lines).replace("}\n{", "")
t = open(tsv, "w", encoding="U8", newline="\n")
t.writelines(lines)
t.write()
t.close()

def ybKey(x):
Expand Down Expand Up @@ -297,16 +298,16 @@ def 正音(自, 音, 檢查=False):
= ._正音()
if not 檢查: return
if "\t" in :
..append(f"{} 音節有TAB空檔")
..append(f"[{}]音節含TAB字符")
= .replace("\t", "")
if 爲字([0]):
..append(f"{} 音節錯誤")
if re.match(r".+\d{3,}", ):
..append(f"{} 調類錯誤")
if not re.match(r".+\d{0,2}[a-z\-=]?", ):
..append(f"[{}]音節錯誤")
elif 有字():
..append(f"[{}]音節包含漢字")
if not in .音集:
.音集.add()
else:
..append(f"{} 音節重複")
..append(f"[{}]音節重複")
return

def 檢查同音字():
Expand Down Expand Up @@ -354,7 +355,7 @@ def 寫(自, d):
= s2t(, .simplified)
if not 爲字():
if .爲方言():
..append(f"【{}[{','.join([i.strip() for i in pys])}]不是漢字")
..append(f"【{}({','.join([i.strip() for i in pys])})不是漢字")
continue
if .註序:
pys = sorted(pys,key=ybKey)
Expand All @@ -366,7 +367,7 @@ def 寫(自, d):
, = py, ""
= .正音()
if == "□" and not :
..append(f"□[{}]没有注释")
..append(f"【□】({})無註釋")
= f"{}\t{}"
print(f"{}\t{}", file=t)
t.close()
Expand Down Expand Up @@ -396,9 +397,9 @@ def 音節數(自):
def 聲韻數():
return len(.聲韻典)

def ():
def (, 更新=False):
.音表.clear()
if .過時(): .更新()
if .過時() or 更新 and .spath: .更新()
.音典.clear()
.聲韻典.clear()
.d.clear()
Expand Down Expand Up @@ -428,7 +429,7 @@ def 讀(自):
if "-" not in :
.音典[].add()
繁註 = s2t(.replace(" ", ""))
if "訓" not in 繁註 and "(又)" not in 繁註 and "口語" not in 繁註 and "合音" not in 繁註 and "語流" not in 繁註 and "音變" not in 繁註 and "連讀" not in 繁註 and "存疑" not in 繁註 and "地方字" not in 繁註 and "地名" not in 繁註 and "俗" not in 繁註 and != "□":
if "訓" not in 繁註 and "" not in 繁註 and "口語" not in 繁註 and "合音" not in 繁註 and "語流" not in 繁註 and "音變" not in 繁註 and "連讀" not in 繁註 and "存疑" not in 繁註 and "地方字" not in 繁註 and "地名" not in 繁註 and "俗" not in 繁註 and != "□":
聲韻 = .分音()[0]
.聲韻典[聲韻].add()
if :
Expand All @@ -444,9 +445,10 @@ def 讀(自):
if py not in .d[]:
.d[].append(py)

def 加載(, dicts):
.()
def 加載(, dicts=None, 更新=False):
.(更新)
if not .d: return
if dicts is None: return
for , 音集 in .d.items():
if not in dicts:
dicts[] = {"漢字": }
Expand Down
2 changes: 1 addition & 1 deletion tools/tables/_詳情.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def normSource(books):
return books.value
return None

def 加載():
def 加載(=None):
if not and not 過時():
return json.load(open(tpath,encoding="U8"))
d = dict()
Expand Down
1 change: 1 addition & 0 deletions tools/tables/_音典.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def 析(自, 列):
if 異讀 == "文": +="="
elif 異讀 == "白": +="-"
elif in ("台山斗山墟",):
if (len() < 13): return
, 音標, = [0], [12], [13]
elif in ("新會天湖",):
, , , 調值, = [0], [11], [12], [13], [14]
Expand Down
6 changes: 3 additions & 3 deletions tools/tables/data/三江黄牌.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ t [7b]搭答
[8b]达沓{一~纸}
l [8b]腊蜡
k [7b]甲胛{肩~}
ŋ [7b]夹{~菜}
ŋ [7b]夹~菜
[8b]和{~面}合
d̥z̥ [8b]杂
ʔ [7b]压
Expand Down Expand Up @@ -182,7 +182,7 @@ tʰ [1]梯[3]体
[2]题提蹄啼[6]弟{此词也表示“男”义}地{~上}
n [2]泥
l [2]犁
k [1]鸡肌[5]□{屎}
k [1]鸡肌[5]□[屎]
ts [5]济剂{一~药}
tsʰ [5]砌
d̥z̥ [2]齐脐
Expand Down Expand Up @@ -475,7 +475,7 @@ d̥ʑ̥ [2]茶察查{调~}
ɕ [1]沙纱
Ø [4]也{~是}野[6]夜
#iɑi
ɕ [1]斋[6]寨
[1]斋[6]寨
tɕʰ [1]差{出~}
ɕ [1]筛{~子}
#iɑu
Expand Down
Loading

0 comments on commit 0150b4a

Please sign in to comment.