process_bccwj.py
"""
Quick script to process the data from the Balanced Corpus of Contemporary Written Japanese (BCCWJ), which is linked under the README.
Our dataset is the BCCWJ "Short Unit Word list data" (`BCCWJ_frequencylist_suw_ver1_0.tsv`).
It contains more properties of the words than are necessary for our purposes, such as frequency data,
and also contains tokens that are not useful for us, like symbols and proper nouns.
So this script filters out just the words we need for our project - words constituting a representative part of the Japanese vocabulary
(for training an autoencoder on Japanese phonology) - and also only records the fields we need for each word,
namely its katakana transcription, its lemma, its part of speech, and its sublexical origin (eg Yamato, Sinitic).
This website has some information on labels but it seems less relevant than the manual + Morita (2018)'s comments https://ccd.ninjal.ac.jp/bccwj/en/tsv.html
"""
import csv

from data_processing.transcriber import Transcriber
COUNT_ETYMOLOGIES = False # TODO eventually I will make this a flag for the script
PRECISE_TRANSCRIPTION = True # TODO pass this to the transcriber to make it more/less precise
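
# A minimal sketch of how these TODOs could become command-line flags (not wired up yet;
# the argparse interface below is an assumption about the eventual design, not part of this script):
#   import argparse
#   parser = argparse.ArgumentParser(description='Pare down the BCCWJ SUW frequency list')
#   parser.add_argument('--count-etymologies', action='store_true')
#   parser.add_argument('--precise-transcription', action='store_true')
#   args = parser.parse_args()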
# path to the short BCCWJ word frequency list
PATH_TO_UNPROCESSED_TSV = "data/BCCWJ/BCCWJ_frequencylist_suw_ver1_0.tsv"
# path to the new, processed wordlist I am building
PATH_TO_OUTPUT_CSV = "data/BCCWJ/pared_BCCWJ.csv"
# we are interested in these particular properties:
# lForm (index 1): a katakana transcription of the given word
# lemma (index 2): the Japanese written form of the word, in kanji/hiragana/katakana
# pos (index 3): the part of speech of the word
# wType (index 5): the sublexicon of origin for a given word, namely native Yamato (和), Sino-Japanese (漢), or foreign (外).
#   There are also mixed-etymology words (混), proper names (固), and symbols (記号),
#   the last of which covers anything not written in a Japanese script (e.g. acronyms like 'DVD')
# core_frequency (index 78): following Morita (2018), we only consider words with positive core_frequency,
#   since only those words were manually checked by humans for validity
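# For illustration, a row for 桜 'cherry blossom' might carry lForm='サクラ', lemma='桜',
# pos='名詞-普通名詞-一般', wType='和' (a hypothetical example; the exact pos string is an
# assumption about the corpus's tagging conventions, not taken from the data)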
YAMATO = '和'
SINO = '漢'
FOREIGN = '外'
MIXED = '混'
PROPER_NAME = '固'
SYMBOLS = '記号'
# the relevant parts of speech for certain kinds of words one may want to exclude
PARTICLE = '助詞'
INTERJECTION = '感動詞'  # 感動詞 is literally 'interjection', which covers exclamations and fillers
def main():
    with open(PATH_TO_UNPROCESSED_TSV) as f:
        reader = csv.reader(f, delimiter='\t')
        header = next(reader)
        # `prop_to_index` maps the name of a given property to its index in a data row, for ease of use.
        # The meaning of the properties is expounded upon in the BCCWJ manual, available in the same place as the actual data.
        # I can't actually read Japanese, but Morita's explanation of the corpus is sufficient for my purposes,
        # and Google Translate + knowledge of Chinese characters has served as a good aid in filling in the gaps
        prop_to_index = {name: i for i, name in enumerate(header)}
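        # e.g. prop_to_index['lForm'] == 1 and prop_to_index['lemma'] == 2, per the indices documented above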
        if COUNT_ETYMOLOGIES:
            yamato_count = 0
            sino_count = 0
            foreign_count = 0
            for entry in reader:
                core_freq = entry[prop_to_index['core_frequency']]
                if core_freq != '' and int(core_freq) > 0:
                    origin_lang = entry[prop_to_index['wType']]
                    if origin_lang == YAMATO:
                        yamato_count += 1
                    elif origin_lang == SINO:
                        sino_count += 1
                    elif origin_lang == FOREIGN:
                        foreign_count += 1
            print(f'{yamato_count} Yamato words')
            print(f'{sino_count} Sino-Japanese words')
            print(f'{foreign_count} foreign words')
            # after applying the core_frequency > 0 requirement we reproduce Takashi's counts (at least for SJ and foreign, which he discloses):
            # 9893 Yamato, 13373 Sino-Japanese, 4421 foreign
            # the counting loop above exhausts the reader, so rewind the file and re-skip the
            # header so the writing loop below still sees every row
            f.seek(0)
            next(reader)
        with open(PATH_TO_OUTPUT_CSV, 'w', newline='') as f2:  # newline='' per the csv module docs
            t = Transcriber()
            # to count the etymological distribution of the recorded words
            yamato_count = 0
            sino_count = 0
            foreign_count = 0
            total_count = 0
            writer = csv.writer(f2)
            writer.writerow(['word', 'kana', 'ipa', 'origin'])
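            # Transcriber.katakana_to_ipa converts a katakana string to an IPA transcription,
            # e.g. t.katakana_to_ipa('サクラ') should yield something like 'sakɯɾa'
            # (an illustrative value; the exact output depends on the Transcriber implementation
            # and the PRECISE_TRANSCRIPTION setting)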
            for entry in reader:
                core_freq = entry[prop_to_index['core_frequency']]
                pos = entry[prop_to_index['pos']]
                kana = entry[prop_to_index['lForm']]
                word = entry[prop_to_index['lemma']]
                origin_lang = entry[prop_to_index['wType']]
                # skip unchecked rows and the excluded parts of speech
                if core_freq == '' or int(core_freq) <= 0:
                    continue
                if PARTICLE in pos or INTERJECTION in pos:
                    continue
                try:
                    ipa = t.katakana_to_ipa(kana)
                except Exception:
                    # the conversion can fail if an unrecognized kana is used
                    # print(f'Failed to convert to IPA on word {word}')
                    continue
                if ipa != "":
                    writer.writerow([word, kana, ipa, origin_lang])
                    total_count += 1
                    if origin_lang == YAMATO:
                        yamato_count += 1
                    elif origin_lang == SINO:
                        sino_count += 1
                    elif origin_lang == FOREIGN:
                        foreign_count += 1
            print(f'Recorded {total_count} words:')
            print(f'- {yamato_count} Yamato words')
            print(f'- {sino_count} Sino-Japanese words')
            print(f'- {foreign_count} foreign words')
            # when this script was last run, it produced 36396 words: 9667 Yamato, 13370 Sino-Japanese, 4421 foreign.
            # note that these counts differ from Takashi's. His method includes some words
            # that fail to be converted to IPA; manual inspection reveals that these words appear
            # to not have rendered correctly and show up just as boxes: ■
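
# Usage note: run this from the repository root so that `data_processing` is importable
# (an assumption based on the import at the top of this file), e.g.:
#   python process_bccwj.py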
if __name__ == '__main__':
    main()