0105

osfans · Jan 5, 2025 · 86dfad3
1 parent 2596058
commit 86dfad3
Show file tree

Hide file tree

Showing 102 changed files with 890,104 additions and 8,322 deletions.
diff --git a/tools/tables/__init__.py b/tools/tables/__init__.py
@@ -6,6 +6,7 @@
 import tables._詳情
 from pypinyin import pinyin, Style
 from collections import defaultdict
+from itertools import combinations
 from opencc import OpenCC
 
 SOURCE = "data"
@@ -154,6 +155,14 @@ def getLangs(dicts, 參數, 省=None):
 	推薦人 = defaultdict(int)
 	維護人 = defaultdict(int)
 	keys = None
+	同音字頻 = defaultdict(int)
+	同音字頻表 = os.path.exists("同音字頻.tsv")
+	if 同音字頻表:
+		t = open("同音字頻.tsv", "r", encoding="U8")
+		for line in t:
+			字, 頻 = line.strip().split("\t")
+			同音字頻[字] = int(頻)
+		t.close()
 	t = open("warnings.txt", "w", encoding="U16")
 	for mod in mods:
 		if mod in 詳情:
@@ -214,12 +223,33 @@ def getLangs(dicts, 參數, 省=None):
 				if 人:
 					維護人[人] += 1
 			數 += 1
+			if 語.檢查同音字():
+				if 同音字頻表:
+					for 音, 字組 in 語.音典.items():
+						if "□" in 字組: 字組.remove("□")
+						for 字甲 in 字組:
+							字頻 = 0
+							字組乙 = set(字組)
+							字組乙.remove(字甲)
+							n = len(字組乙)
+							if n < 2: continue
+							for 字乙 in 字組乙:
+								字頻 += 同音字頻["".join(sorted((字甲, 字乙)))]
+							if 字頻 < n:
+								語.誤.append(f"{字甲}可能不讀{音}")
+				else:
+					for 字組 in 語.音典.values():
+						if "□" in 字組: 字組.remove("□")
+						if len(字組) < 2: continue
+						for 項 in combinations(字組, 2):
+							雙字 = "".join(sorted(項))
+							同音字頻[雙字] += 1
 			if 語.誤:
 				all_editors = ",".join(editor)
 				語.全稱 = 語.info["語言"]
 				print(f"{語.全稱}（{語}）-{語.文件名}-{all_editors}", file=t)
-				for 調 in 語.誤:
-					print(f"\t{調}", file=t)
+				for 誤 in 語.誤:
+					print(f"\t{誤}", file=t)
 		else:
 			語 = import_module(f"tables.{mod}").表()
 			d = dict()
@@ -231,10 +261,10 @@ def getLangs(dicts, 參數, 省=None):
 			語.加載(dicts)
 		語.info["字數"] = 語.字數
 		語.info["□數"] = 語.框數 if 語.框數 else None
-		聲韻調數 = 語.聲韻調數
+		音節數 = 語.音節數
 		聲韻數 = 語.聲韻數
-		語.info["音節數"] = 聲韻調數 if 聲韻調數 else None
-		語.info["不帶調音節數"] = 聲韻數 if 聲韻數 and 聲韻數 != 聲韻調數 else None
+		語.info["音節數"] = 音節數 if 音節數 else None
+		語.info["不帶調音節數"] = 聲韻數 if 聲韻數 and 聲韻數 != 音節數 else None
 		語.info["網站"] = 語.網站
 		語.info["網址"] = 語.網址
 		lang_t = 語.info["語言"]
@@ -249,6 +279,12 @@ def getLangs(dicts, 參數, 省=None):
 		if not keys: keys = 語.info.keys()
 		語組.append(語)
 	t.close()
+	if not 同音字頻表:
+		t = open("同音字頻.tsv", "w", encoding="U8")
+		for i, j in 同音字頻.items():
+			if j > 1:
+				t.write(f"{i}\t{j}\n")
+		t.close()
 	字 = 語組[0]
 	for 項 in keys:
 		if 項 not in 字.info: 字.info[項] = None

diff --git a/tools/tables/_表.py b/tools/tables/_表.py
@@ -318,8 +318,11 @@ def 正音(自, 音, 檢查=False):
 			自.誤.append(f"{音} 音節重複")
 		return 音
 
+	def 檢查同音字(自):
+		return 自.分區 and 自.簡稱 not in ("普通話",) and not 自.分區.startswith("歷史音") and not 自.分區.startswith("域外方音")
+
 	def 爲方言(自):
-		return 自.名 in ("老國音","党項") or (自.爲語() and not 自.分區.startswith("歷史音"))
+		return 自.簡稱 in ("老國音","党項") or (自.爲語() and not 自.分區.startswith("歷史音"))
 
 	def 分註(自, 註):
 		if not 註: return ""
@@ -394,15 +397,14 @@ def 框數(自):
 		return 1 if 數 > 0 else 0
 
 	@property
-	def 聲韻調數(自):
+	def 音節數(自):
 		return len(自.音典)
 
 	@property
 	def 聲韻數(自):
-		return len(set(map(lambda x:x.split("/")[0].rstrip("1234567890"), 自.音典.keys())))
+		return len(set(map(lambda x:x.rstrip("1234567890"), 自.音典.keys())))
 
 	def 讀(自):
-		start = time()
 		if 自.過時(): 自.更新()
 		自.音典.clear()
 		自.d.clear()
@@ -413,37 +415,35 @@ def 讀(自):
 			if "\t" not in 行: continue
 			字, py = 行.split("\t", 1)
 			if 自.爲語():
-				js = ""
-				if "\t" in py: py, js = py.split("\t", 1)
-				if js and 自.爲語():
-					js = 自.分註(js)
+				註 = ""
+				if "\t" in py: py, 註 = py.split("\t", 1)
+				if 註 and 自.爲語():
+					註 = 自.分註(註)
 				try:
 					yd = getYD(py)
 				except:
-					print("\t\t\t", 自.簡稱, py, js)
+					print("\t\t\t", 自.簡稱, py, 註)
 					exit(1)
 				if yd and py.count("*") <= 1:
-					js = f"({yd}){js}"
+					註 = f"({yd}){註}"
 					py = py[:-1]
-				if re.match(r"^\([^()]*?\)$", js):
-					js = js[1:-1]
-				syd = re.sub(r"\(.*?\)","",py).strip(" _`*")
-				if "-" not in syd:
-					自.音典[syd].add(字)
-				if js:
-					py += "{%s}" % js
+				if re.match(r"^\([^()]*?\)$", 註):
+					註 = 註[1:-1]
+				音 = re.sub(r"\(.*?\)","",py).strip(" _`*")
+				if "-" not in 音:
+					自.音典[音.split("/", 1)[0]].add(字)
+				if 註:
+					py += "{%s}" % 註
 			else:
 				if 自.字書:
 					sep = "▲" if 自.名 == "匯纂" else "\t"
-					py2, js = py.split(sep, 1)
-					py = ("\n\n" if 自.d[字] else "") + py2 + sep + 自.分註(js)
+					py2, 註 = py.split(sep, 1)
+					py = ("\n\n" if 自.d[字] else "") + py2 + sep + 自.分註(註)
 				elif 自.簡稱 in ("部件檢索","字形描述"):
 					py = 自.正部件(py)
 				py = py.replace("\t", "\n")
 			if py not in 自.d[字]:
 				自.d[字].append(py)
-		# passed = time() - start
-		# logging.info(f"({自.count:5d}({自.框數})-{自.聲韻調數:4d}-{自.聲韻數:4d}) {passed:6.3f} {自}")
 
 	def 加載(自, dicts):
 		自.讀()

diff --git a/tools/tables/_跳跳老鼠.py b/tools/tables/_跳跳老鼠.py
@@ -111,6 +111,13 @@ def 析(自, 列):
 		elif 名 in ("江華河路口", "江華粟米塘", "全州黃沙河", "安仁新洲", "1935長沙", "長沙黃花", "瀏陽鎭頭"):
 			聲韻, 調, 組 = 列[:3]
 			組 = 自.normS(組)
+		elif 名 in ("東海",):
+			果 = re.findall(r"^(.+?)\[(\d+)\][ ]*?(.+)$", "".join(列))
+			if not 果: return
+			聲韻, 調值, 組 = 果[0]
+			調 = 自.僅轉調類(調值)
+			組 = 自.normG(組, "〚\\1〛")
+			組 = 組.replace("*", "")
 		elif 名 in ("孝昌小河",):
 			果 = re.findall(r"^(.+?)(\d+) ?(.+)$", 列[0])
 			if not 果: return