-
Notifications
You must be signed in to change notification settings - Fork 8
/
commons_gallery_check.py
138 lines (125 loc) · 3.26 KB
/
commons_gallery_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Look at random galleries to see if they are worth keeping or not
# Mike Peel 01-Mar-2018 v1 - start
import pywikibot
from pywikibot import pagegenerators
import json
def countfiles(text):
    """Count the file references in a page's wikitext and list their names.

    The text is lower-cased first, so matching is case-insensitive and the
    returned names are lower-case.  Non-blank lines inside a
    ``<gallery>`` ... ``</gallery>`` block that carry no ``file:`` or
    ``image:`` prefix are treated as bare filenames and get ``file:``
    prepended before counting.  Link-style references (``:file:`` and
    ``:image:``) are stripped so they are not counted as embedded files.

    Parameters:
        text: the wikitext of a page, as a string.

    Returns:
        A ``(count, filelist)`` tuple.  ``count`` is the number of ``file:``
        plus ``image:`` occurrences.  ``filelist`` holds, for every
        occurrence, the text that follows the prefix up to the first ``.``
        (the name without extension); all ``file:`` matches come first,
        followed by all ``image:`` matches.
    """
    text = text.lower()

    # Check for unmarked files
    lines = text.splitlines()
    in_gallery = False
    for i, line in enumerate(lines):
        # The closing tag is tested first and the opening tag last so that
        # neither tag line is itself mistaken for a filename.
        if '</gallery' in line:
            in_gallery = False
        # Blank lines inside a gallery are not files, so leave them alone.
        if in_gallery and line.strip():
            if 'file:' not in line and 'image:' not in line:
                lines[i] = 'file:' + line
        if '<gallery' in line:
            in_gallery = True
    text = "\n".join(lines)

    # [[:File:...]] / [[:Image:...]] are links to a file, not uses of it.
    text = text.replace(':file:', '').replace(':image:', '')

    # Count the number of files
    count = text.count('file:') + text.count('image:')

    # Get the filenames: every piece after a prefix starts with a filename,
    # including the final piece.
    filelist = []
    for prefix in ('file:', 'image:'):
        for piece in text.split(prefix)[1:]:
            filelist.append(piece.split('.')[0])
    return count, filelist
# Sample random gallery pages (namespace 0) on Wikimedia Commons, bucket them
# by how many files they contain, and print the buckets as wikitext lists.
commons = pywikibot.Site('commons', 'commons')
repo = commons.data_repository()  # this is a DataSite object

maxnum = 100  # number of distinct pages to examine before stopping
numchecked = 0
seen = set()  # titles already examined; only used for membership tests
good = []
bad_0 = []
bad_1 = []
bad_2 = []
bad_3 = []
bad_4 = []
bad_5 = []
# bad_buckets[n] collects the log lines for pages with exactly n files;
# anything with more files than there are buckets goes into `good`.
bad_buckets = [bad_0, bad_1, bad_2, bad_3, bad_4, bad_5]

while numchecked < maxnum:
    targets = pagegenerators.RandomPageGenerator(total=120, site=commons, namespaces='0')
    for target in targets:
        title = target.title()
        print(title)
        if title not in seen:
            numchecked += 1
            seen.add(title)
            text = target.get()
            count, filelist = countfiles(text)

            # Look back at the history to see when the last file was added.
            # The loop relies on revisions() yielding newest first.
            # `lastadded` ends up as the timestamp of the oldest revision in
            # the unbroken run (starting from the newest) whose files match
            # the current page.  It is reset for every page so that a value
            # from a previous page can never leak through.
            lastadded = None
            for revision in target.revisions():
                timestamp = revision['timestamp']
                if lastadded is None:
                    # Newest revision: use it as the starting point even if
                    # it somehow does not match the fetched text.
                    lastadded = timestamp
                revision_page = target.getOldVersion(revision.revid)
                count2, filelist2 = countfiles(revision_page)
                if count2 != count or filelist2 != filelist:
                    break
                lastadded = timestamp

            # Pages whose file set has not changed since before 2016 are
            # highlighted in bold; the year is the first four characters of
            # the timestamp's string form.
            if int(str(lastadded)[0:4]) < 2016:
                log = "* '''[[" + title + "]]''' - " + str(lastadded)
            else:
                log = "* [[" + title + "]] - " + str(lastadded)

            if count < len(bad_buckets):
                bad_buckets[count].append(log)
            else:
                good.append(log)

        # Checked after every page so that exactly `maxnum` distinct pages
        # are examined even though each batch requests more than that.
        if numchecked >= maxnum:
            print('Reached the maximum of ' + str(maxnum) + ' entries checked, quitting!')
            break

# Report: one wikitext section per bucket, in increasing order of file count.
report_sections = [
    ('== Bad (0 files) ==', bad_0),
    ('== Bad (1 file) ==', bad_1),
    ('== Bad (2 files) ==', bad_2),
    ('== Bad (3 files) ==', bad_3),
    ('== Bad (4 files) ==', bad_4),
    ('== Bad (5 files) ==', bad_5),
    ('== Good ==', good),
]
for heading, entries in report_sections:
    print(heading)
    for line in entries:
        print(line)
# EOF