scrapping.py
# IMPORT LIBRARIES
from SPARQLWrapper import SPARQLWrapper, JSON
import wikipedia
import wptools
import re
from nltk.tokenize import sent_tokenize

# VARIABLES
k = 10  # number of articles to extract per category
n = 3   # articles with fewer than n sentences are ignored
Categories = ['Written_communication', 'Airports', 'Artists', 'Astronauts', 'Astronomical_objects',
              'Building', 'City', 'Comics_characters', 'Companies', 'Foods', 'Monuments_and_memorials',
              'Politicians', 'Sports_teams', 'Sportspeople', 'Transport', 'Universities_and_colleges']
def SPARQLQuery(category: str, k: int, n: int, broader=0):
    """Query DBpedia for articles in a category and scrape their Wikipedia content."""
    current_number_of_articles = 0
    titles, texts, infoboxes, wikidatas, descriptions = [], [], [], [], []
    wikipedia.set_lang('en')
    # Build the SPARQL query: articles whose dcterms:subject is the category itself,
    # or a subcategory reachable through at most `broader` skos:broader hops
    prefix1 = "PREFIX dcterms:<http://purl.org/dc/terms/> "
    prefix2 = "PREFIX dbc:<http://dbpedia.org/resource/Category:> "
    select = 'SELECT ?article WHERE {{?article dcterms:subject/skos:broader{{,{}}} dbc:{} . }}'.format(broader,
                                                                                                       category)
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(prefix1 + prefix2 + select + ' LIMIT {}'.format('1000'))
    sparql.setReturnFormat(JSON)
    ret = sparql.queryAndConvert()
    print(ret)
    for r in ret['results']['bindings']:
        try:
            # The article title is the last segment of the DBpedia URI, with underscores replaced by spaces
            title = re.sub(r'_', r' ', r['article']['value'].split('/')[-1])
            page = wptools.page(title, silent=True)
            page.get_query()
            content = re.sub(r'\s+', r' ', page.data['extract'])
            if len(sent_tokenize(content)) >= n:
                page.get_wikidata()
                if page.data['description']:
                    if page.data['wikidata']:
                        page.get_labels()
                        titles.append(title)
                        texts.append(content)
                        wikidatas.append(page.data['wikidata'])
                        descriptions.append(page.data['description'])
                        page.get_parse()
                        infoboxes.append(page.data['infobox'])
                        current_number_of_articles += 1
                        if current_number_of_articles == k:
                            # Sanity check: all result lists stay aligned
                            assert len(infoboxes) == len(titles)
                            assert len(infoboxes) == len(texts)
                            assert len(infoboxes) == len(wikidatas)
                            assert len(infoboxes) == len(descriptions)
                            assert len(infoboxes) == k
                            return titles, texts, infoboxes, wikidatas, descriptions
        except Exception as e:
            print(e)
            continue
    print('number of articles = {}'.format(current_number_of_articles))
    return titles, texts, infoboxes, wikidatas, descriptions
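
# For illustration only (an assumed example, not output produced by the script): with the
# default broader=0 and a hypothetical category='Airports', the query string built above
# and sent to the endpoint is roughly
#   PREFIX dcterms:<http://purl.org/dc/terms/>
#   PREFIX dbc:<http://dbpedia.org/resource/Category:>
#   SELECT ?article WHERE {?article dcterms:subject/skos:broader{,0} dbc:Airports . } LIMIT 1000
# i.e. articles whose subject is dbc:Airports itself; a larger `broader` also accepts articles
# filed in subcategories up to that many skos:broader hops below the requested category.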
def save_results_csv(categories: list, data: dict, filename='scrapping.csv'):
    """Write the scraped data to a tab-separated file, one row per article."""
    with open(filename, 'w') as f:
        f.write('Category\tTitle\tText\tInfobox\tWikidata\tDescription\t\n')  # header row
        for c in categories:
            for title, text, infobox, wikidata, description in zip(data[c]['title'], data[c]['text'],
                                                                   data[c]['infobox'], data[c]['wikidata'],
                                                                   data[c]['description']):
                f.write('{}\t{}\t{}\t{}\t{}\t{}\t\n'.format(c, title, text, infobox, wikidata, description))
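
# Example usage: a minimal sketch, assuming the per-category dict layout that
# save_results_csv expects (the original script does not show how the two
# functions are wired together, so this driver block is an assumption).
if __name__ == '__main__':
    data = {}
    for category in Categories:
        titles, texts, infoboxes, wikidatas, descriptions = SPARQLQuery(category, k, n)
        data[category] = {'title': titles, 'text': texts, 'infobox': infoboxes,
                          'wikidata': wikidatas, 'description': descriptions}
    save_results_csv(Categories, data)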