-
Notifications
You must be signed in to change notification settings - Fork 1
/
codeswitch_update.py
74 lines (61 loc) · 2.69 KB
/
codeswitch_update.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import arrow
import re
import redis
import requests
from bz2 import BZ2File as bzopen
from site_credentials import redis_server, redis_port, redis_key
# Redis client built from the connection settings in site_credentials.
REDIS = redis.Redis(
    host=redis_server,
    port=redis_port,
    password=redis_key,
)

# Current UTC date as YYYYMMDD, evaluated once when the module is loaded.
today = arrow.utcnow().format('YYYYMMDD')
def _store_batch(wd_prop, pairs):
    """Write one batch of (item, value) pairs to the dated staging hashes.

    Two hashes are updated for the property: ``<prop>_to_wikidata_<date>``
    (value -> item) and ``wikidata_to_<prop>_<date>`` (item -> value).

    Args:
        wd_prop: Wikidata property ID, e.g. ``'P356'``.
        pairs: list of ``(wd_item, wd_value)`` tuples.

    Returns:
        True if something was written, False if ``pairs`` was empty.
    """
    if not pairs:
        # redis-py raises DataError when given an empty mapping, so an empty
        # batch has to be skipped rather than sent.
        return False

    wikidata_to_x = dict(pairs)
    x_to_wikidata = {value: item for item, value in pairs}

    # NOTE(review): hmset is deprecated in newer redis-py releases in favour
    # of hset(name, mapping=...); kept here so older installed versions of
    # the client keep working.
    REDIS.hmset('{0}_to_wikidata_{1}'.format(wd_prop, today), x_to_wikidata)
    REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today), wikidata_to_x)
    return True


def main():
    """Rebuild the Wikidata <-> external-ID lookup hashes in Redis.

    Downloads the latest "truthy" N-Triples dump, scans it for direct
    statements using the properties in the manifest, loads the item/value
    pairs into date-suffixed staging hashes in batches, and finally renames
    each staging hash over the live key.

    Properties for which no statement was found are left untouched (their
    existing live hashes are kept) instead of aborting the run.

    Raises:
        requests.HTTPError: if the dump download returns an error status.
    """
    # Example of a matching line:
    # <http://www.wikidata.org/entity/Q47133351> <http://www.wikidata.org/prop/direct/P356> "10.1002/EJP.1050" .
    # Compiled once: the pattern is applied to every line of the dump.
    triple_re = re.compile(
        r'^<http://www\.wikidata\.org/entity/(Q\d+)> '
        r'<http://www\.wikidata\.org/prop/direct/(P\d+)> '
        r'"(.*?)" \.$')

    manifest = ['P356', 'P698', 'P932', 'P2880']
    dump_location = 'https://dumps.wikimedia.org/wikidatawiki/entities/latest-truthy.nt.bz2'
    dump_path = '/tmp/latest-truthy.nt.bz2'
    batch_size = 10000

    to_add = {x: [] for x in manifest}
    # Properties that actually have staging hashes to promote at the end.
    staged = set()

    print("Downloading latest dump")
    # (connect, read) timeouts so a stalled connection fails instead of
    # hanging the job indefinitely.
    with requests.get(dump_location, stream=True, timeout=(30, 300)) as r:
        # Check the status before announcing/creating the local file.
        r.raise_for_status()
        print("Saving dump")
        with open(dump_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    print("Opening dump")
    with bzopen(dump_path, 'r') as f:
        for line in f:
            match = triple_re.match(line.decode('utf-8'))
            if match is None:
                continue

            wd_item, wd_prop, wd_value = match.groups()
            batch = to_add.get(wd_prop)
            if batch is None:
                # Property is not in the manifest.
                continue

            # The value is stored exactly as written in the dump.
            batch.append((wd_item, wd_value))
            if len(batch) >= batch_size:
                print('\nSaving to Redis')
                if _store_batch(wd_prop, batch):
                    staged.add(wd_prop)
                to_add[wd_prop] = []

    # If there are leftovers
    for wd_prop, tuplelist in to_add.items():
        if _store_batch(wd_prop, tuplelist):
            staged.add(wd_prop)

    # Finalize: swap each staging hash into place under the live key name.
    for wd_prop in manifest:
        if wd_prop not in staged:
            # RENAME on a missing key is an error; nothing was staged, so the
            # existing live hashes (if any) are kept as they are.
            print('No statements found for {0}; keeping existing data'.format(wd_prop))
            continue
        REDIS.rename('{0}_to_wikidata_{1}'.format(wd_prop, today),
                     '{0}_to_wikidata'.format(wd_prop))
        REDIS.rename('wikidata_to_{0}_{1}'.format(wd_prop, today),
                     'wikidata_to_{0}'.format(wd_prop))
# Run the update only when executed as a script, not when imported.
if __name__ == '__main__':
    main()