-
Notifications
You must be signed in to change notification settings - Fork 1
/
check_links.py
executable file
·124 lines (99 loc) · 3.67 KB
/
check_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
"""Ensure URLs in every .html file are reachable"""
import multiprocessing as mp
import os
import re
import requests
import sys
def lint_links(link):
    """Verify a single collected link.

    This is the worker entry point handed to the process pool: it takes one
    packed link record and forwards its fields to verify_url().

    Keyword arguments:
    link -- a (filename, line number, URL) tuple describing where the URL
            was found and what it points to

    Returns:
    Whatever verify_url() returns for that link
    """
    source_file, line_number, target = link
    outcome = verify_url(source_file, line_number, target)
    return outcome
def verify_url(filename, line_number, url, timeout=10.0):
    """Verifies URL is reachable.

    URLs starting with "http://" or "https://" are probed with an HTTP HEAD
    request and must answer with a 200 status code. Any other URL is treated
    as a path relative to the directory of `filename`: the path must exist
    and, if the URL carries a "#id" suffix, the destination file must contain
    a matching `id="..."` attribute. A diagnostic is printed for every
    failure.

    Keyword arguments:
    filename -- name of file containing URL
    line_number -- line number of URL
    url -- the URL to verify
    timeout -- seconds to wait for a remote server before giving up
               (default 10.0)

    Returns:
    True if verification succeeded or False otherwise
    """
    if url.startswith(("http://", "https://")):
        host = re.match(r"https?://([^/]+)", url)
        if host is None:
            # e.g. "http://" with nothing after the scheme
            print(f"[{filename}:{line_number}]\n {url}\n Malformed URL.")
            return False

        # Ignore .ms links since they only return success on Windows
        if host.group(1).endswith(".ms"):
            return True

        try:
            # A timeout is required: without one a silent server blocks the
            # worker forever. HEAD requests do not follow redirects by
            # default, so 3xx answers are reported as failures below.
            r = requests.head(
                url,
                headers={"User-Agent": "Python Requests"},
                timeout=timeout,
            )
        except requests.RequestException as ex:
            # RequestException is the base class for connection errors,
            # timeouts, redirect loops and invalid URLs alike.
            print(f"[{filename}:{line_number}]\n {url}\n {str(ex)}")
            return False

        if r.status_code != 200:
            print(f"[{filename}:{line_number}]\n {url}\n {r.status_code}")
            return False
        return True

    # Split off the trailing page ID (everything after the first "#")
    relative_dest, _, url_id = url.partition("#")

    # If URL is empty, it refers to the current file
    if relative_dest == "":
        relative_dest = os.path.basename(filename)

    absolute_dest = os.path.join(os.path.dirname(filename), relative_dest)
    if not os.path.exists(absolute_dest):
        print(
            f"[{filename}:{line_number}]\n file://{relative_dest}\n No such file or directory."
        )
        return False

    # If filename wasn't provided in link, assume index.html
    if os.path.isdir(absolute_dest):
        absolute_dest = os.path.join(absolute_dest, "index.html")

    if not url_id:
        return True

    # The URL has an ID appended, so verify the ID exists in the destination
    # file. The file may be unreadable (e.g. a directory without index.html);
    # report that as a broken link instead of crashing the worker.
    try:
        with open(absolute_dest, encoding="utf-8", errors="replace") as file:
            content = file.read()
    except OSError as ex:
        print(f"[{filename}:{line_number}]\n file://{absolute_dest}\n {ex.strerror}.")
        return False

    id_rgx = re.compile(r'\s+ id= \s* "(?P<id>[^"]+)"', re.X)
    if any(match.group("id") == url_id for match in id_rgx.finditer(content)):
        return True

    print(
        f"[{filename}:{line_number}]\n file://{absolute_dest}\n tag '{url_id}' does not exist in {absolute_dest}"
    )
    return False
# Matches <a href="..."> and captures the quoted target as "url"
url_rgx = re.compile(r'a \s+ href= \s* "(?P<url>[^"]+)"', re.X)


def find_html_files():
    """Return every .html file below the current directory.

    Paths are relative to the current directory (no leading "./").
    Directories whose path contains "reveal.js" are skipped.
    """
    return [
        os.path.normpath(os.path.join(dp, f))
        for dp, dn, fn in os.walk(".")
        for f in fn
        if f.endswith(".html") and "reveal.js" not in dp
    ]


def collect_links(filename):
    """Return the links found in one file.

    Keyword arguments:
    filename -- path of the HTML file to scan

    Returns:
    A list of link tuples, each containing:
      filename -- the file the link was found in
      line number -- 1-based line the link starts on
      url -- the href target
    """
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        contents = f.read()

    # Text-mode reads translate every line ending to "\n", so counting "\n"
    # before the match gives the line number on every platform (comparing
    # against os.linesep never matches on Windows, where it is "\r\n").
    return [
        (filename, contents.count("\n", 0, match.start()) + 1, match.group("url"))
        for match in url_rgx.finditer(contents)
    ]


def main():
    """Check every link in every .html file.

    Returns:
    0 if all links verified successfully, 1 otherwise
    """
    links = []
    for filename in find_html_files():
        links.extend(collect_links(filename))

    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(lint_links, links)

    return 0 if all(results) else 1


# The guard is required for multiprocessing: with the spawn/forkserver start
# methods each worker re-imports this module, and unguarded top-level code
# would run the whole check again inside every worker.
if __name__ == "__main__":
    sys.exit(main())