-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetData.py
50 lines (39 loc) · 1.43 KB
/
getData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import sys
import json
from SPARQLWrapper import SPARQLWrapper, JSON
# SPARQL endpoint that all queries in this script are sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# Load the input mappings first, closing each file as soon as it has been
# parsed.  The main loop indexes both objects by key and passes the looked-up
# values to process_query() as the language / category IDs.
with open('JSONs/languages.json', 'r', encoding='utf-8') as json_languages:
    languages = json.load(json_languages)
with open('JSONs/categories.json', 'r', encoding='utf-8') as json_categories:
    categories = json.load(json_categories)

# Open (and truncate) the output file only after the inputs were read
# successfully, so a missing or malformed input file does not wipe the
# results of a previous run.  The handle stays open: the main loop at the
# bottom of the script writes to it and closes it.
file = open('JSONs/allWords.json','w+')

# Query template.  process_query() replaces the $LANGUAGE and $CATEGORY
# placeholders with concrete IDs; the query selects each lexeme of that
# language and lexical category together with its lemma.
query_base = """SELECT ?lexeme ?lemma WHERE {
?lexeme dct:language wd:$LANGUAGE.
?lexeme wikibase:lexicalCategory wd:$CATEGORY.
?lexeme wikibase:lemma ?lemma.
}"""
def get_results(endpoint_url, query, user_agent=None):
    """Run a SPARQL query and return the ``lemma`` value of every result row.

    Args:
        endpoint_url: URL of the SPARQL endpoint to send the query to.
        query: SPARQL query text.  Every result binding is expected to
            contain a ``lemma`` variable.
        user_agent: Optional User-Agent string for the request.  When
            omitted, the generic ``"WDQS-example Python/<major>.<minor>"``
            string is used.

    Returns:
        A list holding ``binding['lemma']['value']`` for each binding in the
        response, in the order the endpoint returned them.

    Raises:
        KeyError: If the converted response has no ``results``/``bindings``
            entry, or a binding has no ``lemma``/``value`` entry.
        Any exception raised by SPARQLWrapper while querying or converting
        the response propagates unchanged.
    """
    if user_agent is None:
        # TODO adjust user agent; see https://w.wiki/CX6
        user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()['results']['bindings']
    # Keep only the lemma text of each row; the lexeme URI is dropped.
    return [binding['lemma']['value'] for binding in bindings]
def process_query(languageID, categoryID):
    """Fill the module-level ``query_base`` template for one language/category.

    Args:
        languageID: String substituted for every ``$LANGUAGE`` placeholder.
        categoryID: String substituted for every ``$CATEGORY`` placeholder.

    Returns:
        The query text with both placeholders replaced.  The finished query
        is also printed to standard output.
    """
    filled = query_base
    # Substitute in a fixed order: language first, then category.
    for placeholder, value in (('$LANGUAGE', languageID), ('$CATEGORY', categoryID)):
        filled = filled.replace(placeholder, value)
    print(filled)
    return filled
# Query every language/category combination and collect the lemmas as
# results[language key][category key] -> list of lemma strings.
results = {}
# `with` closes the output file even when one of the queries raises, so the
# handle is not leaked on failure.
with file:
    for lang, langID in languages.items():
        results[lang] = {}
        for cat, catID in categories.items():
            results[lang][cat] = get_results(endpoint_url, process_query(langID, catID))
    # Written once, after all queries succeeded.
    json.dump(results, file, indent=4)
# sys.exit() instead of the interactive-only exit() helper injected by `site`.
sys.exit()