-
Notifications
You must be signed in to change notification settings - Fork 8
/
check_dewp.py
57 lines (51 loc) · 1.43 KB
/
check_dewp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Remove bad dewp sitelinks
# Mike Peel 24-Aug-2020 v1 - start
from __future__ import unicode_literals

import csv
import string
import time
import urllib
import urllib.error
import urllib.parse
import urllib.request

import numpy as np
import pywikibot
from pywikibot import pagegenerators

from pibot_functions import *
# --- Configuration ---------------------------------------------------------
# `debug` is kept for compatibility; nothing in this script reads it.
debug = 1
# CSV export whose first column holds the Wikidata item IDs to check.
infile = 'quarry-47624-wikidata-sitelinks-run491913.csv'
# Language code of the Wikipedia whose sitelinks are verified (-> "dewiki").
lang = 'de'


def load_targets(path):
    """Return the unique, non-empty first-column values of the CSV at *path*.

    Values are stripped of surrounding whitespace before de-duplication and
    are returned in file order so that repeated runs process items in the
    same sequence. Blank lines and rows with an empty first cell are skipped.
    """
    with open(path, mode='r', newline='', encoding='utf-8') as handle:
        cells = (row[0].strip() for row in csv.reader(handle) if row)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(cell for cell in cells if cell))


def build_article_url(language, title):
    """Return the percent-encoded Wikipedia URL for *title* on *language* wiki.

    Spaces become underscores; ':' and '/' are left unescaped so the scheme,
    host and namespace/subpage separators survive quoting.
    """
    url = 'https://' + language + '.wikipedia.org/wiki/' + title.replace(' ', '_')
    return urllib.parse.quote(url.encode('utf8'), ':/')


def article_is_missing(url):
    """Return True only if fetching *url* yields an HTTP 404 response.

    Any other outcome (success, another HTTP status, or a connection-level
    failure) returns False so that a sitelink is never removed on the basis
    of a transient network problem.
    """
    try:
        # The context manager closes the response; only the status matters.
        with urllib.request.urlopen(url):
            return False
    except urllib.error.HTTPError as e:
        # HTTPError must be caught before URLError (it is a subclass) and is
        # the only one of the two that carries a status code.
        print(e.code)
        return e.code == 404
    except urllib.error.URLError as e:
        # No HTTP status available (DNS failure, refused connection, ...).
        print(e.reason)
        return False


def main():
    """Remove sitelinks to `lang` Wikipedia pages that respond with HTTP 404."""
    wikidata_site = pywikibot.Site("wikidata", "wikidata")
    repo = wikidata_site.data_repository()  # this is a DataSite object
    wiki = lang + 'wiki'

    for target in load_targets(infile):
        print(target)
        try:
            # Constructing the ItemPage is inside the try so that a malformed
            # ID is skipped like a missing item instead of aborting the run.
            page = pywikibot.ItemPage(repo, target)
            item_dict = page.get()
            qid = page.title()
        except Exception:
            # Deliberately broad best-effort skip, but no longer a bare
            # except, so Ctrl-C / SystemExit still stop the script.
            print('Huh - no page found')
            continue
        print("\nhttps://www.wikidata.org/wiki/" + qid)

        try:
            sitelink = get_sitelink_title(item_dict['sitelinks'][wiki])
            print(sitelink)
        except Exception:
            print(lang + ' sitelink not found!')
            continue

        url = build_article_url(lang, sitelink)
        print(url)
        if article_is_missing(url):
            print('Removing link')
            page.removeSitelink(site=wiki, summary='Removing broken sitelink to ' + wiki)


if __name__ == '__main__':
    main()
# EOF