-
Notifications
You must be signed in to change notification settings - Fork 11
/
wikiParser.py
334 lines (266 loc) · 14.7 KB
/
wikiParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
# -*- coding: utf-8 -*-
'''
##############
WikiParser
##############
* This module contains all the utility methods/functions that are used to process and parse the wikipedia
articles.
* This module also contains methods that process the information from wikipedia (using JSONpedia) and return
the appropriate JSON output to the mapper functions so that they could be easily processed to extract triples.
* Also contains methods that form the URIs from the string elements by making Wikidata queries.
'''
import utilities
import time
import json
import sys
import subprocess
#set default encoding
reload(sys)
sys.setdefaultencoding('utf8')
last_sec_title = "" # last section title parsed
header_title = "" # last header (main section) title parsed
last_sec_lev = 0 # last section level parsed
def main_parser(language, resource):
    ''' **Main method**: obtains a **JSON** *representation* of a resource and collects the
    relevant data in a dictionary.

    Asks JSONpedia for the JSON representing the resource and walks the result looking for
    lists inside sections. Returns the final dictionary containing all lists and their section
    names from the given resource in the given language.

    :param language: ``Language`` of the Wikipedia page, needed by JSONpedia to identify the resource.
    :param resource: ``Resource name``, needed by JSONpedia.
    :return: a ``dictionary`` with section names as keys and featured lists as values, without empty fields.
    '''
    global header_title  # shared with parse_section, used to concatenate (sub)section titles
    collected = {}
    sections = jsonpedia_convert(language, resource)
    # An empty result may mean the page lives under another name: retry via redirects.
    if sections == []:
        redirected = find_page_redirects(resource, language)
        sections = jsonpedia_convert(language, redirected)
    # Keep only the entries JSONpedia tagged as sections and merge their parsed lists.
    for sec in sections:
        if '@type' in sec and sec['@type'] == 'section':
            collected.update(parse_section(sec))
    # Strip meaningless keys before handing the dictionary back.
    return utilities.clean_dictionary(language, collected)
def parse_section(section):
    ''' Parses each section of the Wikipedia page searching for lists and calling ``parse_list()`` in turn.

    Returns a dictionary with section names as keys and their list contents as values.

    :param section: current section to parse in json format.
    :return: a ``dictionary`` representing the section.
    '''
    # These module globals carry title/level state between successive calls so that
    # nested subsection titles can be concatenated with their parent section's title.
    global last_sec_lev
    global last_sec_title
    global header_title
    section_lists = {}  # initializing output dictionary
    if ('content' in section and section['content'] != ""):  # parse only if there is available content
        # checks current level to know whether to concatenate the title or not
        if section['level'] == 0:  # this is a 'header title' (top-level section)
            title = section['title']
            header_title = title
        elif section['level'] > last_sec_lev:
            # deeper than the previously parsed section: concatenate with the previous
            # title and update 'header' for possible further depth
            title = last_sec_title + " - " + section['title']
            header_title = last_sec_title
        else:
            # same depth or shallower: just concatenate its title with current 'header'
            title = header_title + " - " + section['title']
        last_sec_title = title
        last_sec_lev = section['level']
        # don't consider keys since they are non-relevant (e.g. @an0, @an1, ...)
        content = section['content'].values()
        sect_list = []  # will contain the list extracted from current section
        """Extract section content - values inside dictionary inside 'content' key """
        for val in content:
            if ('@type' in val):
                if (val['@type'] == 'list'):  # look for lists inside current section
                    level = 1  # level is used to keep trace of list nesting depth
                    nest_list = []  # will contain a nested list if there is one
                    for cont in val['content']:  # pass list elements to be parsed
                        if ('level' in cont and cont['level'] > level):  # current list element is nested
                            # NOTE(review): nest_list is appended to sect_list and reset on
                            # EVERY nested element, so each nested element ends up in its own
                            # one-element list rather than grouped together — looks suspicious,
                            # preserved as-is; confirm intended output shape against callers.
                            nest_cont = parse_list(cont)  # parse nested element and store it in nest_cont
                            nest_list.append(nest_cont)
                            sect_list.append(nest_list)
                            nest_list = []
                        else:
                            sect_list.append(parse_list(cont))
        '''adds a new field in the dictionary representing list in the given section'''
        section_lists[title] = sect_list
    return section_lists
def parse_list(list_elem):
    '''Parses a list element extracting relevant info to be put in a string.

    It also marks `references (links)` with double curly brackets ``{{...}}`` in order to be
    recognizable for mapping.

    :param list_elem: current list item in json format.
    :return: a string containing useful info from the list element.
    '''
    list_content = ""  # initializing output
    if 'content' in list_elem and list_elem['content'] is not None:
        for cont in list_elem['content']:
            # `cont` may be a dict (structured element) or a plain string; the `in`
            # membership tests below work for both (substring test on strings).
            if '@type' in cont and cont['@type'] != 'list_element':
                cont_type = cont['@type']
                if cont_type == 'template' or cont_type == 'link':  # take only the content field
                    tl_cont = cont['content']
                    if type(tl_cont) == list:
                        # BUGFIX: the original iterated `tl_cont.values()`, which raises
                        # AttributeError on a list; iterate the list elements directly.
                        # Each element's first item carries the significant text
                        # (assumes elements are indexable — confirm against wrapper output).
                        for tl_val in tl_cont:
                            list_content += tl_val[0] + " "
                    elif type(tl_cont) == dict:
                        if '@an0' in tl_cont:  # recurring structure with an anonymous field '@an0'
                            tl_val = tl_cont['@an0']  # template content lies inside first anonymous value
                            if type(tl_val) == list:
                                for tlv in tl_val:
                                    if type(tlv) == dict:
                                        if 'label' in tlv:
                                            list_content += tlv['label']  # for references
                                    else:
                                        list_content += tlv + " "  # for actual values
                elif cont_type == 'reference':
                    # double curly brackets make references easy to discriminate during mapping
                    list_content += " {{" + cont['label'] + "}} "
                elif 'label' in cont:  # if there is a label key, take only its value
                    cont = cont['label']
                    list_content = list_content + " " + cont + " "  # avoid lack of spaces between words
            elif 'attributes' not in cont:  # take everything else but ignore bottom page references
                list_content += cont
    return list_content
'''
######################
### IMPORTANT NOTE ###
######################
* The jsonpedia_convert() and find_page_redirects() functions below are the older versions which make the
calls to the JSONpedia Live service in order to obtain the resource's JSON representations. Since it's a
web-service, high traffic and consistent/frequent requests can overload the server and make the
list-extractor unusable. Hence these are not used anymore.
* The newer versions of these functions use the JSONpedia library instead of the web-service, which makes
the list-extractor more robust and stable and is not dependent on the JSONpedia Live Service's server anymore.
* To use the older live web-request version (why though :P), uncomment the following two functions, and comment
the existing newer functions, in order to prevent name clashes.
'''
#####################
### Older Version ###
#####################
### Uncomment the lines below to use the web-request version.
# def jsonpedia_convert(language, resource):
# ''' Calls JSONpedia online service to get a JSON representation of the Wikipedia page divided in sections
# :param language: language of the resource we want to parse (e.g. it, en, fr...)
# :param resource: name of the resource
# :return: a JSON with significant info about the resource
# '''
# res = language + "%3A" + resource
# # JSONpedia call to obtain sections - in this way I get both section titles and their lists
# jsonpediaURL_sect = "http://jsonpedia.org/annotate/resource/json/" + res + "?filter=@type:section&procs=Structure"
# try:
# sections = utilities.json_req(jsonpediaURL_sect)
# except (IOError):
# print('Network Error - please check your connection and try again')
# raise
# except (ValueError):
# raise
# else:
# if 'success' in sections and sections['success'] == "false":
# if sections['message'] == 'Invalid page metadata.':
# print("JSONpedia error: Invalid wiki page."),
# raise
# elif 'Expected DocumentElement found' in sections['message']:
# print(("JSONpedia error: something went wrong (DocumentElement expected).")),
# raise
# else:
# print("JSONpedia error! - the web service may be currently overloaded, retrying... "
# "Error: " + sections['message'])
# time.sleep(1) # wait one second before retrying
# return jsonpedia_convert(language, resource) #try again JSONpedia call
# else:
# result = sections['result'] #JSON index with actual content
# return result
# def find_page_redirects(res, lang):
# '''Calls JSONpedia to find out whether the resource name provided redirects to another Wikipedia page
# Returns the actual page if found, thus preventing from losing pages due to non-existing names.
# :param lang: Wikipedia language of the resource
# :param res: initial resource name which may trigger a redirection
# :return: the redirection page if found
# '''
# redirect = []
# jsonpedia_req = "http://jsonpedia.org/annotate/resource/json/" + lang + ":" + res + "?&procs=Structure"
# result = utilities.json_req(jsonpedia_req)
# if 'wikitext-dom' in result:
# dom = result['wikitext-dom'][0]
# if 'structure' in dom:
# new_res = dom['structure'][1]['label']
# redirect = new_res.replace(" ", "_").encode('utf-8')
# return redirect
#####################
### Newer Version ###
#####################
### Comment the lines below to use the web-request version.
def jsonpedia_convert(language, resource):
    ''' Uses the ``JSONpedia wrapper`` library to get a JSON representation of the
    Wikipedia page divided in sections.

    :param language: language of the resource we want to parse `(e.g. it, en, fr...)`.
    :param resource: name of the resource.
    :return: a JSON with significant info about the resource.
    :raises ValueError: if the page is invalid, malformed, or the wrapper output is not JSON.
    '''
    try:
        # spawn a new process that makes a call to the json wrapper, which creates the
        # required json for the given resource on stdout
        proc = subprocess.Popen(['java', '-jar', 'jsonpedia_wrapper.jar', '-l', language,
                                 '-r', resource, '-p', 'Structure', '-f', 'section'],
                                stdout=subprocess.PIPE)
        # BUGFIX: communicate() reads all output AND waits for the process to exit,
        # avoiding the stdout.read()/kill() sequence of the previous version which
        # could leave a zombie or race with process termination.
        pipe_output = proc.communicate()[0]
        sections = json.loads(pipe_output)  # load the string as a python dict
    # handle different errors
    except IOError:
        print('Network Error - please check your connection and try again')
        raise
    except ValueError:
        # json.loads failed: the wrapper produced malformed output
        raise
    except OSError:
        print('Error spawning process!')
        raise
    # JSONpedia call was successful; inspect payload-level errors
    if 'success' in sections and sections['success'] == "false":
        # BUGFIX: the original used a bare `raise` outside any except block, which
        # itself raises (TypeError/RuntimeError); raise an explicit exception instead.
        if sections['message'] == 'Invalid page metadata.':
            raise ValueError("JSONpedia error: Invalid wiki page.")
        elif 'Expected DocumentElement found' in sections['message']:
            raise ValueError("JSONpedia error: something went wrong (DocumentElement expected).")
        else:
            print("JSONpedia error! - the web service may be currently overloaded, retrying... "
                  "Error: " + sections['message'])
            time.sleep(1)  # wait one second before retrying
            return jsonpedia_convert(language, resource)  # try the JSONpedia call again
    return sections['result']  # JSON index with actual content
def find_page_redirects(res, lang):
    '''Calls the ``JSONpedia wrapper`` to find out whether the resource name provided redirects
    to another Wikipedia page. Returns the actual page if found, thus preventing loss of pages
    due to non-existing names.

    :param lang: Wikipedia language of the resource.
    :param res: initial resource name which may trigger a redirection.
    :return: the redirection page name if found, otherwise an empty list.
    '''
    try:
        # spawn a new process that makes a call to the json wrapper, which creates the
        # required json for the given resource on stdout
        # BUGFIX: the original referenced undefined names `language`/`resource`
        # (NameError); the parameters are `lang` and `res`.
        proc = subprocess.Popen(['java', '-jar', 'jsonpedia_wrapper.jar', '-l', lang,
                                 '-r', res, '-p', 'Structure'], stdout=subprocess.PIPE)
        # read all output and wait for the process to finish
        pipe_output = proc.communicate()[0]
        # BUGFIX: json.load() expects a file object; json.loads() parses the string.
        result = json.loads(pipe_output)
    # handle different exceptions
    except IOError:
        print('Network Error - please check your connection and try again')
        raise
    except ValueError:
        # json parsing failed: the wrapper produced malformed output
        raise
    except OSError:
        print('Error spawning process!')
        raise
    redirect = []
    # find if any redirects are present; if yes, return the redirect target
    if 'wikitext-dom' in result:
        dom = result['wikitext-dom'][0]
        if 'structure' in dom:
            new_res = dom['structure'][1]['label']
            # normalize to a Wikipedia resource name (spaces become underscores)
            redirect = new_res.replace(" ", "_").encode('utf-8')
    return redirect