-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiktionarytodict.py
executable file
·227 lines (208 loc) · 9.88 KB
/
wiktionarytodict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#wiktionarytodict - creates dictd format dictionaries from Wiktionary data
#Copyright (C) 2012 Tim Edwards
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
import sys, codecs, re, pycountry
class WiktionaryDumpHandler(ContentHandler):
    """SAX handler that collects English translations from a Wiktionary dump.

    For every page whose text contains an ``==English==`` marker, the lines
    between ``{{trans-top`` and ``{{trans-bottom`` are scanned for entries of
    the configured language.  Results are accumulated in two dictionaries:

    * ``translationsfromeng``: ``"word (meaning)"`` -> comma-separated
      translations.
    * ``translationstoeng``: translation -> comma-separated
      ``"word (meaning)"`` strings.

    ``outputJargonFormat`` writes both maps as Jargon File formatted text
    (see the -j option in ``man dictfmt``).
    """

    # Compiled once here instead of once per processed line.
    _TRANS_TOP = re.compile(r"\{\{trans-top")
    _TRANS_BOTTOM = re.compile(r"\{\{trans-bottom")
    # e.g. {{qualifier|man}}
    _QUALIFIER = re.compile(r"\{\{qualifier\|[\w ]*\}\}")
    # Captures everything after the last colon of a line.
    _AFTER_LAST_COLON = re.compile(r"^.*\:(.*)$")
    # Filled with (source language, target language, script name).
    _HEADER = ("This dictionary translates %s to %s. It was created by the script %s and is based on data from the Wiktionary dumps available from http://dumps.wikimedia.org/enwiktionary/latest/enwiktionary-latest-pages-articles.xml.bz2\nAll content in this dictionary is under the same license as Wiktionary content.\n\n")

    def __init__(self, language, langcode, outputdir):
        """Create a handler.

        language  -- language name as it appears in translation lines,
                     e.g. 'German'
        langcode  -- ISO 639-3 language code, e.g. 'deu' for German; used in
                     the output file names
        outputdir -- directory the two dictionary files are written to
        """
        # Explicit base call (not super()) also works with Python 2's
        # old-style ContentHandler class.
        ContentHandler.__init__(self)
        self.isTitleElement = 0
        self.isTextElement = 0
        self.word = u''
        self.isEnglishWord = 0
        self.translationsfromeng = {}
        self.translationstoeng = {}
        # the language to create a dictionary to
        self.language = language
        self.langcode = langcode
        self.textchardata = ''
        self.outputdir = outputdir

    def processTextElement(self, content):
        """Parse the translations of one page if it has an English section.

        content -- complete text of a <text> element.
        """
        # A plain substring test is equivalent to checking every line, and it
        # guarantees the page is parsed exactly once even when several lines
        # contain the marker (e.g. '===English==='), which used to duplicate
        # every translation.
        if '==English==' in content:
            self.isEnglishWord = 1
            self.parseTranslations(content)

    def parseTranslations(self, content):
        """Feed every line inside a trans-top/trans-bottom pair to
        parseTranslationline together with the meaning named on trans-top."""
        inTransSection = False
        currentMeaning = ''
        for aline in content.splitlines():
            if self._TRANS_TOP.match(aline) is not None:
                inTransSection = True
                splitresult = aline.split('|')
                if len(splitresult) < 2:
                    # no refinement on the meaning (just a '{{trans-top}}')
                    currentMeaning = ''
                else:
                    currentMeaning = splitresult[1].strip('{}')
            elif self._TRANS_BOTTOM.match(aline) is not None:
                inTransSection = False
                currentMeaning = ''
            elif inTransSection:
                self.parseTranslationline(aline, currentMeaning)

    def parseTranslationline(self, aline, currentMeaning):
        """Extract the translations from one line of a translation section.

        Lines that are not for the configured language, or that carry no
        colon-separated payload, are ignored.
        """
        if self.pickLanguageLines(aline) != 1:
            return
        # word with meaning, e.g. "trade (practice)"
        wordwithmeaning = self.word + ' (%s)' % (currentMeaning)
        # TODO: deal with these qualifiers properly
        # For now qualifiers such as {{qualifier|man}} are simply dropped.
        aline = self._QUALIFIER.sub('', aline)
        # Keep only the part after the last colon, e.g. '{{t+|de|Stuhl|m}}'
        # from '* German: {{t+|de|Stuhl|m}}'.
        found = self._AFTER_LAST_COLON.match(aline)
        if found is None:
            # A line selected through the 'Language]]' form need not contain
            # a colon at all; previously this raised IndexError.
            return
        rawtranslation = found.group(1).lstrip()
        if rawtranslation.startswith('{{'):
            # lines formatted like this: {{t+|de|frei}}
            for atranslation in rawtranslation.split('{{'):
                bits = atranslation.strip('} ,').split('|')
                if len(bits) > 2:
                    translation = bits[2]  # the actual word
                    if len(bits) > 3:
                        # noun gender if specified
                        translation += ' (%s)' % bits[3]
                    self.addtoTranslationsMap(wordwithmeaning, translation)
                # TODO: log unparsable fragments to stderr
        elif rawtranslation.startswith('[['):
            # lines formatted like this:
            # [[gratis#German|gratis]], [[kostenlos]], [[frei]]
            for atranslation in rawtranslation.split(','):
                bits = atranslation.strip('][ ,').split('|')
                # from "[[gratis#German|gratis]]" pick out "gratis";
                # split() always yields at least one element.
                translation = bits[1] if len(bits) > 1 else bits[0]
                self.addtoTranslationsMap(wordwithmeaning, translation)
        else:
            # free text with embedded links, e.g. 'Der [[gross|Grosse]] [[Teich]]'
            pieces = []
            for aword in rawtranslation.split('[['):
                bits = aword.split('|')
                pieces.append(bits[1] if len(bits) > 1 else bits[0])
            # Drop the closing brackets that splitting on '[[' leaves behind.
            translation = ''.join(pieces).replace(']]', '').strip()
            self.addtoTranslationsMap(wordwithmeaning, translation)

    def addtoTranslationsMap(self, wordwithmeaning, translation):
        """Record one translation in both direction maps.

        Several translations for the same key are joined with ', '.
        Empty translations are ignored so they cannot create empty headwords.
        """
        if not translation:
            return
        if wordwithmeaning in self.translationsfromeng:
            # add multiple translations together,
            # eg: free (make free) : freisetzen, befreien
            self.translationsfromeng[wordwithmeaning] += ', %s' % translation
        else:
            self.translationsfromeng[wordwithmeaning] = translation
        if translation in self.translationstoeng:
            self.translationstoeng[translation] += ', %s' % wordwithmeaning
        else:
            self.translationstoeng[translation] = wordwithmeaning

    def _languageLabel(self):
        """Return the compiled pattern matching a label of self.language.

        The pattern accepts 'Language:' and 'Language]]' but rejects labels
        where the name is only the tail of a longer one ('Low German:',
        'Serbo-Croatian:').  It is cached until self.language changes.
        """
        cached = getattr(self, '_labelcache', None)
        if cached is None or cached[0] != self.language:
            pattern = re.compile(
                r'(?<![\w\-])(?<![\w\-] )'
                + re.escape(self.language)
                + r'(?::|\]\])',
                re.UNICODE)
            cached = (self.language, pattern)
            self._labelcache = cached
        return cached[1]

    def pickLanguageLines(self, aline):
        """Return 1 if aline is really for the current language, else 0."""
        if self.language == 'Norwegian':
            # (TODO fix) special case for Norwegian due to its 2 spelling
            # systems.  By default this uses Bokm\xe5l; replace it with
            # 'Nynorsk' in the literal below to use that instead.
            if u'*: Bokm\xe5l' in aline:
                return 1
        if self._languageLabel().search(aline) is None:
            return 0
        # filter out translation requests
        # (http://en.wiktionary.org/wiki/Template:trreq/doc)
        if '{{trreq' in aline:
            return 0
        return 1

    @staticmethod
    def _splitHeadword(key, openers):
        """Split key at the first opener character it contains.

        Returns (headword, explanation): the explanation is re-wrapped as
        '(...) ' with its final character removed, or '' when key contains
        none of the openers.
        """
        for opener in openers:
            if opener in key:
                headword, explanation = key.split(opener, 1)
                # [:-1] removes the last char (the closing bracket)
                return headword.rstrip(), '(%s) ' % explanation[:-1]
        return key, ''

    def _writeJargonFile(self, path, header, translations, openers):
        """Write one dictionary as ':headword:definition' lines in UTF-8."""
        # codecs.open encodes on write under Python 2 and 3 alike, and the
        # with-block closes the file even if a write fails.
        with codecs.open(path, 'w', encoding='utf-8') as out:
            out.write(header)
            for akey, value in translations.items():
                headword, explanation = self._splitHeadword(akey, openers)
                out.write(':%s:%s%s\n' % (headword, explanation, value))

    def outputJargonFormat(self):
        """Write OUTPUTDIR/eng-CODE.txt and OUTPUTDIR/CODE-eng.txt.

        Output is in Jargon File format (see the -j option in man dictfmt).
        """
        self._writeJargonFile(
            '%s/eng-%s.txt' % (self.outputdir, self.langcode),
            self._HEADER % ('English', self.language, sys.argv[0]),
            self.translationsfromeng,
            ('(',))
        self._writeJargonFile(
            '%s/%s-eng.txt' % (self.outputdir, self.langcode),
            self._HEADER % (self.language, 'English', sys.argv[0]),
            self.translationstoeng,
            ('(', '{'))

    def startElement(self, name, attrs):
        """Note when a <title> or <text> element begins."""
        if name == 'title':
            self.isTitleElement = 1
            # start collecting a fresh title
            self.word = u''
        if name == 'text':
            self.isTextElement = 1

    def characters(self, content):
        """Accumulate character data of the current <title>/<text> element.

        A SAX parser may deliver one text node in several chunks, so both
        the title and the text are appended to rather than overwritten.
        """
        if self.isTitleElement == 1:
            self.word += content
        if self.isTextElement == 1:
            self.textchardata += content

    def ignorableWhitespace(self, whitespace):
        """Forget the current headword."""
        self.word = u''

    def endElement(self, name):
        """Finish the current <title> or <text> element.

        At the end of a <text> element the collected text is processed and
        the per-page state is reset.
        """
        if self.isTitleElement == 1:
            self.isTitleElement = 0
            # the page title should be the headword, but filter out
            # Wiktionary-internal pages
            if 'Wiktionary:' in self.word:
                self.word = u''
        if self.isTextElement == 1:
            self.processTextElement(self.textchardata)
            self.textchardata = ''
            self.isTextElement = 0
            self.isEnglishWord = 0
            self.word = ''
def usage():
    """Print the command-line synopsis to standard output.

    The conversion mode takes four arguments: the dump file, the language
    name, its ISO 639-3 code and the output directory.
    """
    # Single-argument print(...) parses identically on Python 2 and 3.
    print("usage: %s FILE LANGUAGE LANGCODE OUTPUTDIR" % sys.argv[0])
    print("%s --showlangcodes" % sys.argv[0])
# Command-line entry point.  Guarded so that importing this module (e.g. to
# reuse WiktionaryDumpHandler) does not trigger argument handling.
if __name__ == '__main__':
    if len(sys.argv) > 4:
        # FILE LANGUAGE LANGCODE OUTPUTDIR: convert a dump into two dictionaries.
        parser = make_parser()
        curHandler = WiktionaryDumpHandler(sys.argv[2], sys.argv[3], sys.argv[4])
        parser.setContentHandler(curHandler)
        # Binary mode lets the XML parser handle the encoding itself; the
        # with-block closes the dump even if parsing fails.
        with open(sys.argv[1], 'rb') as dumpfile:
            parser.parse(dumpfile)
        curHandler.outputJargonFormat()
    elif len(sys.argv) == 2 and sys.argv[1] == '--showlangcodes':
        # List every language known to pycountry as NAME:CODE, ordered by name.
        for lang in sorted(pycountry.languages, key=lambda entry: entry.name):
            codeline = "%s:%s" % (lang.name, pycountry.languages.get(name=lang.name).terminology)
            # Python 2's print needs encoded bytes for non-ASCII names;
            # Python 3 prints text directly.
            if sys.version_info[0] < 3:
                codeline = codeline.encode("utf-8")
            print(codeline)
    else:
        usage()