-
Notifications
You must be signed in to change notification settings - Fork 8
/
commons_date_find.py
188 lines (172 loc) · 5.07 KB
/
commons_date_find.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Match Commons date categories with Wikidata items
# Mike Peel 11-Oct-2021 v1 - start
import pywikibot
import numpy as np
import time
import string
from pywikibot import pagegenerators
from pywikibot.data import api
import urllib
import random
# Settings
# Commons category whose direct (non-recursive) subcategories the main loop visits.
targetcat = 'Category:Days by day'
# The main loop stops once nummodified reaches this value.
maxnum = 50000
# Running total of the values returned by do_date_find() (1 for each sitelink added).
nummodified = 0
# When truthy, do_date_find() prompts "Save? " before each edit; answering 'n' skips that edit.
debug = 0
# Resume switch for the main loop: with trip = 0, subcategories are skipped until one whose
# title contains 'Category:2000' is reached; with trip = 1, processing starts immediately.
trip = 1
# Sites
# Wikidata site object; used to construct ItemPage objects for search results.
wikidata_site = pywikibot.Site("wikidata", "wikidata")
# Repository handle that search_entities() sends its API requests to.
repo = wikidata_site.data_repository() # this is a DataSite object
# Wikimedia Commons site object; used to open the target category.
commons = pywikibot.Site('commons', 'commons')
# Functions
def search_entities(site, itemtitle):
    """Run a ``wbsearchentities`` API query and return the submitted result.

    The query asks ``site`` for entities of type ``item`` matching
    ``itemtitle``, with the search language set to English and JSON output.
    """
    query = dict(
        action='wbsearchentities',
        format='json',
        language='en',
        type='item',
        search=itemtitle,
    )
    return api.Request(site=site, parameters=query).submit()
# English month names, indexed by (month number - 1), used to build search strings.
_MONTH_NAMES = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)

# A search hit is only accepted when one of its P31 claims points at this item.
_CALENDAR_DAY_QID = 'Q47150325'


def _sitelink_title(sitelink):
    """Return the title of a sitelink value as a plain string.

    Accepts either a plain string or an object exposing a ``title``
    attribute (presumably pywikibot's SiteLink - verify against the
    installed pywikibot version).
    """
    if isinstance(sitelink, str):
        return sitelink
    return sitelink.title


def _date_search_strings(title):
    """Build the two search strings for a 'Category:YYYY-MM-DD' title.

    Returns ``('Month D, YYYY', 'D Month YYYY')``, or ``None`` when the title
    does not split into exactly three '-'-separated parts with a numeric
    month in 1-12 and a numeric day in 1-31.
    """
    parts = title.replace('Category:', '').split('-')
    print(parts)
    if len(parts) != 3:
        print('Something odd happened, skipping')
        print(parts)
        return None
    year, month, day = parts
    if not (month.isdecimal() and day.isdecimal()):
        print('Something odd happened, skipping')
        return None
    month_number = int(month)
    day_number = int(day)  # int() also drops any leading zeros
    # Reject month 0 explicitly: indexing with -1 would silently pick December.
    if not (1 <= month_number <= 12 and 1 <= day_number <= 31):
        print('Something odd happened, skipping')
        return None
    month_name = _MONTH_NAMES[month_number - 1]
    return (
        month_name + ' ' + str(day_number) + ', ' + year,
        str(day_number) + ' ' + month_name + ' ' + year,
    )


def _is_calendar_day(item_dict):
    """Return True when one of the item's P31 claims targets the calendar-day item."""
    try:
        instance_of = item_dict['claims']['P31']
    except KeyError:
        print('No P31, skipping')
        return False
    for claim in instance_of:
        target = claim.getTarget()
        # A claim without a concrete value has no target to compare against.
        if target is not None and target.title() == _CALENDAR_DAY_QID:
            return True
    print('Not a calendar day, skipping')
    return False


def _link_first_calendar_day(page, datestr):
    """Search Wikidata for ``datestr`` and link ``page`` to the first suitable hit.

    A hit is suitable when it has no 'commonswiki' sitelink yet and is a
    calendar day according to its P31 claims. Returns True when an edit was
    made, False otherwise.
    """
    for hit in search_entities(repo, datestr).get('search', []):
        targetpage = pywikibot.ItemPage(wikidata_site, hit['id'])
        item_dict = targetpage.get()
        print('http://www.wikidata.org/wiki/' + hit['id'])
        # Never overwrite a Commons sitelink that is already present.
        try:
            existing = item_dict['sitelinks']['commonswiki']
        except KeyError:
            existing = None
        if existing is not None:
            print('http://commons.wikimedia.org/wiki/'
                  + _sitelink_title(existing).replace(' ', '_'))
            print('Has sitelink')
            continue
        if not _is_calendar_day(item_dict):
            continue
        print(item_dict['labels'].get('en', ''))
        print('http://commons.wikimedia.org/wiki/' + page.title().replace(' ', '_'))
        text = 'y'
        if debug:
            text = input("Save? ")
        if text != 'n':
            data = {'sitelinks': [{'site': 'commonswiki', 'title': page.title()}]}
            targetpage.editEntity(data, summary=u'Add commons sitelink')
            return True
    return False


def do_date_find(page):
    """Try to attach a Commons date category to its Wikidata calendar-day item.

    ``page`` is expected to be titled 'Category:YYYY-MM-DD'. If it is already
    connected to a Wikidata item nothing is done. Otherwise Wikidata is
    searched for 'Month D, YYYY' and then 'D Month YYYY', and the category is
    added as the 'commonswiki' sitelink of the first hit that is a calendar
    day and has no Commons sitelink yet.

    Returns 1 when a sitelink was added, 0 in every other case (already
    linked, unparseable title, no suitable hit, or edit declined in debug
    mode).
    """
    print(page.title())
    # See if we have a Wikidata item already
    try:
        wd_item = pywikibot.ItemPage.fromPage(page)
        wd_item.get()
        qid = wd_item.title()
    except Exception:
        # Best effort, as before: any lookup failure is treated as "no item
        # yet". Unlike a bare except, this lets KeyboardInterrupt stop the run.
        qid = None
    if qid is not None:
        print("Has a sitelink already - https://www.wikidata.org/wiki/" + qid)
        return 0

    # If we're here, we don't - search for a match.
    print('Searching for a match...')
    search_strings = _date_search_strings(page.title())
    if search_strings is None:
        return 0
    print(search_strings[0])
    for datestr in search_strings:
        if _link_first_calendar_day(page, datestr):
            return 1
    # If we're here, it hasn't worked, return 0
    return 0
# Walk the direct subcategories of the target category, handling each in turn
cat = pywikibot.Category(commons, targetcat)
for result in pagegenerators.SubCategoriesPageGenerator(cat, recurse=False):
    # While trip is 0, skip ahead until a title containing 'Category:2000' shows up
    if trip == 0:
        if 'Category:2000' in result.title():
            trip = 1
        else:
            continue
    nummodified += do_date_find(result)
    if nummodified < maxnum:
        continue
    print('Reached the maximum of ' + str(maxnum) + ' entries modified, quitting!')
    break
# EOF