-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathtmxt.py
110 lines (94 loc) · 3.31 KB
/
tmxt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""Transform TMX in a tab-separated text file according to the code list
specified.
Usage:
tmxt.py --codelist=<langcodes> [INPUT_FILE [OUTPUT_FILE]]
Options:
--codelist=<langcodes>   Comma-separated list of langcodes (e.g. "en,es").
                         TU properties can also be specified.
I/O Defaults:
INPUT_FILE Defaults to stdin.
OUTPUT_FILE Defaults to stdout.
"""
from docopt import docopt
import re
import sys
import xml.parsers.expat
def process_tmx(input, output, codelist):
    """Stream a TMX document and write one tab-separated line per ``<tu>``.

    The document is parsed incrementally with expat. For every translation
    unit the text of each ``<seg>`` is stored under the language of its
    enclosing ``<tuv>``, the text of each ``<prop>`` found outside a segment
    is stored under the prop's ``type``, and on ``</tu>`` the values for the
    keys in ``codelist`` are written in order, separated by tabs.

    Args:
        input: Binary file object with the TMX document (passed to the expat
            parser's ``ParseFile``).
        output: Text file object that receives one line per ``<tu>``.
        codelist: Column keys in output order. A key is matched against the
            ``xml:lang``/``lang`` attribute of a ``<tuv>`` or the ``type``
            attribute of a ``<prop>``. Keys with no value yield an empty
            column.

    Raises:
        xml.parsers.expat.ExpatError: If the input is not well-formed XML.
        KeyError: If a ``<prop>`` outside a segment has no ``type`` attribute.
    """
    # Elements whose text content is dropped when they occur inside <seg>.
    inline_tags = ("ph", "ept", "bpt", "prop")
    # Newlines and runs of spaces collapse to a single space. The pattern
    # must be one-or-more: a zero-or-more pattern also matches the empty
    # string and would insert a space between every pair of characters.
    blank_run = re.compile(r'[ \n]+')

    curlang = ""      # language of the <tuv> currently open
    curtuv = []       # text chunks of the <seg> currently open
    intuv = False     # True while inside <seg>
    inph = False      # True while inside an inline tag within <seg>
    numlangs = 0      # number of <tuv> elements seen in the current <tu>
    tu = {}           # column key -> segment text (str) or prop values (list)
    intprop = False   # True while inside a <prop> outside any segment
    curtype = ""      # ``type`` attribute of the <prop> currently open
    curprop = []      # text chunks of the <prop> currently open

    def normalize(chunks):
        """Join text chunks and collapse newlines/space runs to one space."""
        return blank_run.sub(' ', "".join(chunks)).strip()

    def se(name, attrs):
        """Start-element handler."""
        nonlocal intuv, intprop, curtuv, curprop, curtype, tu, curlang, numlangs, inph
        if intuv:
            # Inside a segment every start tag is swallowed; only the inline
            # tags additionally suppress the character data they contain.
            if name in inline_tags:
                inph = True
        elif name == "tu":
            tu = {code: '' for code in codelist}
        elif name == "tuv":
            if "xml:lang" in attrs:
                curlang = attrs["xml:lang"]
            elif "lang" in attrs:
                curlang = attrs["lang"]
            numlangs += 1
        elif name == "seg":
            curtuv = []
            intuv = True
            inph = False
        elif name == "prop":
            intprop = True
            curtype = attrs['type']
            curprop = []

    def ee(name):
        """End-element handler."""
        nonlocal intuv, intprop, curtype, curlang, numlangs, inph
        if intuv and name != "seg":
            # Mirror of the start handler: tags nested in a segment only
            # toggle the inline flag and never touch the unit's columns.
            if name in inline_tags:
                inph = False
            return
        if name == "tu":
            cells = []
            for code in codelist:
                value = tu[code]
                if isinstance(value, list):
                    # Prop values: keep at most one per <tuv> of this unit.
                    value = '\t'.join(value[:numlangs])
                cells.append(value)
            output.write('\t'.join(cells) + "\n")
            numlangs = 0
        elif name == "seg":
            intuv = False
            tu[curlang] = normalize(curtuv)
            curlang = ""
        elif name == "prop":
            # Stop collecting; otherwise all later character data of the
            # document would pile up in ``curprop`` until the next <prop>.
            intprop = False
            text = normalize(curprop)
            if isinstance(tu.get(curtype), list):
                tu[curtype].append(text)
            else:
                tu[curtype] = [text]
            curtype = ""

    def cd(data):
        """Character-data handler; tabs become spaces to protect the columns."""
        if intuv and not inph:
            curtuv.append(data.replace("\t", " "))
        if intprop:
            curprop.append(data.replace("\t", " "))

    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = se
    parser.EndElementHandler = ee
    parser.CharacterDataHandler = cd
    parser.ParseFile(input)
def main():
    """Command-line entry point: parse the arguments and convert the TMX input.

    ``INPUT_FILE`` is opened in binary mode (stdin's binary buffer when
    omitted) and the result is written to ``OUTPUT_FILE`` in text mode (stdout
    when omitted). Both streams are closed on exit, even if the conversion
    raises.
    """
    arguments = docopt(__doc__, version='tmxt 1.1')
    codelist = arguments["--codelist"].split(",")

    in_path = arguments["INPUT_FILE"]
    out_path = arguments["OUTPUT_FILE"]

    source = open(in_path, "rb") if in_path else sys.stdin.buffer
    try:
        sink = open(out_path, "w") if out_path else sys.stdout
        try:
            # NOTE(review): a code list with a single entry is skipped without
            # any message, leaving the output empty -- confirm this is intended.
            if len(codelist) > 1:
                process_tmx(source, sink, codelist)
        finally:
            sink.close()
    finally:
        source.close()
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()