linkfix.py
# -*- coding: utf-8 -*-
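"""Fix redirected links in the Sphinx documentation sources.

Parses the output of ``make linkcheck`` (build/linkcheck/output.txt), rewrites
permanently redirected urls in the rst files under source/, and prints a
summary of permanent redirects, other redirects, and broken links.
"""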
from __future__ import print_function
import os
import re
import sys
from datetime import datetime, timedelta
from sphinx.util.console import purple, darkgreen, darkred, white

LINE_RE = re.compile(r'^([^:]+):(\d+): \[([^\]]+)\] (\S+)(.*)$')
REASON_RE = re.compile(r'\s*(.+) (?:for|with) url:.+')
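# Illustrative output.txt lines the two patterns above are written against
# (the filenames and urls here are made up for the example):
#   faq.rst:12: [redirected permanently] http://example.com/old to https://example.com/new
#   faq.rst:34: [broken] http://example.com/gone: 404 Client Error: Not Found for url: http://example.com/gone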

DOMAIN_ROLE_MAP = {
'https://github.com/': 'github',
'https://bitbucket.org/': 'bitbucket'
}


def error_exit(msg):
print('linkfix: error: {}\n'.format(msg), file=sys.stderr)
sys.exit(1)


def url_path(domain, url):
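    """Return the path of ``url`` below ``domain``, without any trailing slash.

    Illustrative example (made-up values):
        url_path('https://github.com/', 'https://github.com/user/repo/') -> 'user/repo'
    """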
result = url.split(domain, 1)[-1]
result = result[:-1] if result.endswith('/') else result
return result


def replace_roles(domain, text, src, dst):
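    """Rewrite :github:/:bitbucket: roles pointing at ``src`` to point at ``dst``.

    Handles the two role forms matched below; illustrative example with
    made-up repository names:
        :github:`some label <user/old-repo>` -> :github:`some label <user/new-repo>`
        :github:`user/old-repo`              -> :github:`user/new-repo`
    """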
changed = 0
role = DOMAIN_ROLE_MAP[domain]
src_part = url_path(domain, src)
dst_part = url_path(domain, dst)
    # escape the literal path so characters like '.' are matched literally
    pttn = r':{}:`([^<`]+)<{}/?>`'.format(role, re.escape(src_part))
repl = r':{}:`\g<1><{}>`'.format(role, dst_part)
text, n = re.subn(pttn, repl, text)
changed += n
    pttn = r':{}:`{}/?`'.format(role, re.escape(src_part))
repl = r':{}:`{}`'.format(role, dst_part)
text, n = re.subn(pttn, repl, text)
changed += n
return changed, text


def update_redirects(redirects):
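    """Apply permanent redirects to the rst sources under source/.

    ``redirects`` maps rst filenames to {src_url: dst_url} dicts, for example
    (illustrative values only):
        {'faq.rst': {'http://old.example.com/x': 'https://new.example.com/x'}}
    Returns the total number of replacements made.
    """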
replacements_made = 0
# for all files that redirects were found in...
    for filename, file_redirects in redirects.items():
# read the entire text from the rst file
rst_filename = os.path.join('source', filename)
with open(rst_filename, 'r') as fp:
text = fp.read()
# for all redirects found in the rst file...
esc_redirects = {}
        for src, dst in file_redirects.items():
            # if the literal src url appears in the file, add it to the redirects map
if src in text:
esc_redirects[re.escape(src)] = dst
continue
# check if src is actually written as a sphinx role in the file
for domain in DOMAIN_ROLE_MAP:
if src.startswith(domain) and dst.startswith(domain):
n, text = replace_roles(domain, text, src, dst)
replacements_made += n
break
else:
# if it wasn't a sphinx role then print an error message
error_exit('{!r} not found in {!s}'.format(src, filename))
        # create a giant regex that matches all (non-role) urls to change;
        # longest urls first so a url that is a prefix of another redirect
        # cannot shadow the longer match
        if esc_redirects:
            pttn = re.compile('|'.join(sorted(esc_redirects, key=len, reverse=True)))
            # search and replace all non-role redirects in the file
            text, n = pttn.subn(lambda m: esc_redirects[re.escape(m.group(0))], text)
            replacements_made += n
# write the resulting text back into the file
with open(rst_filename, 'w') as fp:
            fp.write(text)
return replacements_made


def main():
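    """Parse the linkcheck output, rewrite permanent redirects in the
    sources, and print a summary of redirected and broken links."""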
# check that the linkcheck file exists
linkcheck_file = os.path.join('build', 'linkcheck', 'output.txt')
if not os.path.exists(linkcheck_file):
error_exit('no linkcheck output file; run make linkcheck')
# check that it hasn't been more than a day since the last linkcheck
last_linkcheck = datetime.fromtimestamp(os.path.getmtime(linkcheck_file))
if datetime.now() - last_linkcheck > timedelta(days=1):
error_exit('linkcheck output outdated; run make linkcheck')
# parse each line of the linkcheck output.txt file
with open(linkcheck_file) as fp:
lines = fp.readlines()
local = {}
broken = {}
perm_redirects = {}
temp_redirects = {}
for line in lines:
m = LINE_RE.match(line)
        if m is None:
            error_exit('could not parse: {!r}'.format(line))
filename, lineno, status, url, more = m.groups()
# ignore links with certain status messages
if '429' in more and 'Too Many Requests' in more.title():
continue
# gather data for broken urls
elif status == 'broken':
url = url.rstrip(':')
m = REASON_RE.match(more)
more = m.group(1) if m else more.strip()
broken.setdefault(filename, {})[url] = more
# gather local links
elif status == 'local':
local.setdefault(filename, set()).add(url)
# gather data for permanent redirects
elif status == 'redirected permanently':
dst = more.split(' to ', 1)[-1].strip()
perm_redirects.setdefault(filename, {})[url] = dst
        # gather data for other, presumably temporary, redirects
elif status.startswith('redirected'):
dst = more.split(' to ', 1)[-1].strip()
temp_redirects.setdefault(filename, {})[url] = dst
n = update_redirects(perm_redirects)
print()
    urls = {x for d in perm_redirects.values() for x in d.items()}
print(white('Found {} links returning 301s [{} replacements made]'.format(len(urls), n)))
for src, dst in sorted(urls):
print(src + darkgreen(' -> ' + dst))
print()
    urls = {x for d in temp_redirects.values() for x in d.items()}
print(white('Found {} links returning other 300 codes [no replacements made]'.format(len(urls))))
for src, dst in sorted(urls):
print(src + purple(' -> ' + dst))
print()
    urls = {x for d in broken.values() for x in d.items()}
print(white('Found {} links returning error codes (excluding 429)'.format(len(urls))))
for url, reason in sorted(urls):
print(url + darkred(' - ' + reason))


if __name__ == '__main__':
main()