-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathpluvier.py
185 lines (168 loc) · 4.48 KB
/
pluvier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import sys
class Word:
    """One entry of the lexicon.

    Each constructor argument is stored unchanged under an attribute of the
    same name, except ``syll``, which is split on "-" into a list of strings.

    NOTE(review): the meaning of the individual fields is only implied by
    their names here -- confirm against the lexicon's column documentation.
    """

    def __init__(
        self,
        word: str,
        phonetics: str,
        lemme: str,
        cgram: str,
        genre: str,
        number: str,
        info_verb: str,
        syll: str,
    ) -> None:
        """Store the fields of one entry.

        Args:
            word: kept as ``self.word``.
            phonetics: kept as ``self.phonetics``.
            lemme: kept as ``self.lemme``.
            cgram: kept as ``self.cgram``.
            genre: kept as ``self.genre``.
            number: kept as ``self.number``.
            info_verb: kept as ``self.info_verb``.
            syll: hyphen-separated string; stored as the list of its parts.
        """
        self.word = word
        self.phonetics = phonetics
        self.lemme = lemme
        self.cgram = cgram
        self.genre = genre
        self.number = number
        self.info_verb = info_verb
        # "a-b-c" -> ["a", "b", "c"]; a string without "-" gives a 1-item list.
        self.syll = syll.split("-")

    def __repr__(self) -> str:
        """Return a short, unambiguous description for debugging output."""
        return "{}(word={!r}, phonetics={!r}, syll={!r})".format(
            type(self).__name__, self.word, self.phonetics, self.syll
        )
# The lexicon path is the first command-line argument. Without this guard a
# missing argument shows up as a bare IndexError with no hint about usage.
if len(sys.argv) < 2:
    sys.exit("usage: {} LEXICON_FILE".format(sys.argv[0]))
source = sys.argv[1]
# Raw lines of the lexicon; replaced once the file has been read.
corpus = []
# Vowel phoneme code -> chord. The trailing comment on each entry is a French
# example word containing that vowel.
# NOTE(review): keys look like lexicon phoneme codes and values like steno key
# chords -- confirm against the theory this table implements.
VOWELS = {
    "a": "A",      # chat
    "e": "E",      # clé
    "2": "AO",     # eux
    "9": "AO",     # seul
    "E": "AEU",    # père
    "i": "EU",     # lit
    "o": "OE",     # haut
    "O": "O",      # mort
    "y": "U",      # cru
    "u": "OU",     # mou
    "5": "EUFR",   # vin
    # TODO: *N is used for "on" (§) endings. It is a vowel, but only at the
    # end of a word, so it has no entry here yet.
}
# Two-phoneme vowel sequence -> chord. The trailing comment on each entry is a
# French example word (underscores mark the relevant part).
DIPHTONGS = {
    "8i": "AU",     # pluie
    "j2": "AOEU",   # vieux
    "je": "AE",     # pied
    "ja": "RA",     # cria
    # TODO: "jo" and "jO" used to be listed twice each: first as "RO" and
    # "ROE" (the latter marked unsure), then as "AO". A dict literal keeps only
    # the last value for a repeated key, so "AO" is what was actually in effect
    # and is the only value kept here. The conflict is still open: "-R can be
    # read as i", but the diphthong reading is presumably more important.
    "jo": "AO",     # bio
    "jO": "AO",     # fjord
    "j§": "AO",     # av_ion_
    "wa": "OEU",    # froid
    "w5": "OEUPB",  # loin
    "wi": "AOU",    # oui
    "j5": "AEPB",   # chien
    "ey": "EU",     # r_éu_nion
    "ya": "WA",     # suave
    # TODO: maybe not a diphthong but a word-ending / consonant rule.
    "ij": "-LZ",    # bille
}
# Single consonant phoneme code -> chord for the "LHS" position. The trailing
# comment on each entry is a French example word.
# NOTE(review): "LHS" presumably means the left-hand (initial) side of a
# stroke -- confirm.
CONSONANTS_LHS = {
    "b": "PW",     # bonjour
    "d": "TK",     # dans
    "f": "TP",     # faire, phare
    "g": "TKPW",   # gare
    "k": "K",      # quoi, car
    "l": "HR",     # la
    "m": "PH",     # mais
    "n": "TPH",    # ne
    "p": "P",      # père
    "R": "R",      # rouge
    "s": "S",      # super
    "S": "SH",     # chien
    "t": "T",      # toi
    "v": "W",      # vous
    "8": "W",      # huit
    "j": "KWR",    # hier
    "z": "SWR",    # zone
    "Z": "SKWR",   # je
}
# Two-consonant cluster -> chord for the "LHS" position.
CONSONANT_PAIRS_LHS = {
    "gz": "KP",
    "ps": "S",
    "pn": "TPH",
    "kw": "KW",
    # TODO: I'd rather deviate and use KH here if possible.
    "tS": "SK",
}
# Longer sound sequence -> chord for the "LHS" position.
SOUNDS_LHS = {
    "de": "STK",
    "def": "STKW",
}
# Single consonant phoneme code -> chord for the "RHS" position.
#
# This table used to contain repeated keys ("l", "n", and "v" three times).
# A dict literal keeps only the last value given for a key, so the earlier
# values were silently dead. Each key now appears once, with the value that
# was actually in effect, in the original key order.
CONSONANTS_RHS = {
    "v": "F",
    # TODO: when is "l" FL and when is it L? The plain "L" entry was shadowed
    # by "FL", so "FL" is the value in effect.
    "l": "FL",
    "b": "B",
    # TODO: "n" was also listed as "B", shadowed by "PB". "B" might apply only
    # in certain pre-defined cases like AIB = "aine".
    "n": "PB",
    "g": "G",
    "d": "D",
    "z": "Z",
    "k": "BG",
    "m": "PL",
    "Z": "G",
    "S": "FP",
    "N": "PG",
}
# Two-consonant cluster -> chord for the "RHS" position.
#
# "st", "bZ" and "kR" used to be repeated with identical values; the copies
# had no effect on the resulting dict and have been removed.
CONSONANT_PAIRS_RHS = {
    "tR": "TS",
    "st": "*S",
    "ks": "BGS",
    "bZ": "PBLG",
    "dZ": "PBLG",
    "sm": "FLP",
    "kR": "RBG",
    "vR": "FR",
    "fR": "FR",
    # NOTE(review): every other key writes this phoneme as uppercase "R";
    # confirm whether the lowercase "r" here is intended.
    "rS": "FRPB",
}
# Longer sound sequence -> chord for the "RHS" position. Trailing comments
# are French example words (underscores mark the relevant part).
#
# "isjOn" used to be listed three times with the same value; the two extra
# copies had no effect on the resulting dict and have been removed.
SOUNDS_RHS = {
    "En": "AIB",
    "wan": "OIB",
    "jEn": "AEB",
    "win": "AOUB",
    "zj§": "GZ",
    "sj§": "GZ",
    "z§": "GZ",
    # TODO: this might conflict with "zj§" above. The rule says
    # "either `-GS/*B` or `-GZ`".
    "sjOn": "GZ",
    "zjOn": "GZ",
    "@b": "AFRB",        # jambe
    "5b": "EUFRB",       # limbe
    "§b": "OFRB",        # tombe
    "@bl": "AFRBL",      # tremble
    "1bl": "EUFRBL",     # humble
    "§bl": "OFRBL",      # comble
    "@bR": "AFRBS",      # ambre
    "5bR": "EUFRBS",     # timbre
    "§bR": "OFRBS",      # ombre
    "@pR": "AFRPS",
    "5pR": "EUFRPS",
    "§pR": "OFRPS",
    "@sj§": "APBGS",     # p_ension_
    "5sj§": "EUPBGS",    # p_incions_
    "§sj§": "OPBGS",     # pron_oncions_
    "@ksj§": "APBGS",    # san_ction_
    "5ksj§": "EUPBGS",   # dist_inction_
    "§ksj§": "OPBGS",    # j_onction_
    "ksj§": "*BGS",      # a_ction_
    "isjOn": "EUGZ",
    "tR": "TS",
    "tER": "TS",
    "tyR": "TS",
    # "Et": "*T",  # TODO: "Et" is covered by `AIT`, I don't get this. Maybe orthographic for "ette".
}
# Build the list of Word objects from the tab-separated lexicon file.
#
# The encoding is given explicitly because the phoneme tables in this file
# match on non-ASCII codes such as "§"; relying on the platform default would
# decode them differently from one machine to the next.
# NOTE(review): assumes the lexicon file is UTF-8 -- confirm.
with open(source, encoding="utf-8") as f:
    corpus = f.readlines()

words = []
for line in corpus:
    # Drop the line terminator so it cannot end up inside the last field.
    record = line.rstrip("\r\n")
    if not record:
        # A blank line has no columns; indexing it would raise IndexError.
        continue
    entry = record.split("\t")
    word = Word(
        word=entry[0],
        phonetics=entry[1],
        lemme=entry[2],
        cgram=entry[3],
        genre=entry[4],
        number=entry[5],
        info_verb=entry[10],
        syll=entry[22],
    )
    words.append(word)
# Print the spelling of every entry whose phonetic transcription contains
# the sequence "@pR", one per line, in lexicon order.
for hit in (candidate for candidate in words if "@pR" in candidate.phonetics):
    print(hit.word)