-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbroken_ext_links.py
executable file
·52 lines (43 loc) · 1.65 KB
/
broken_ext_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/python3
"""
Module for checking broken links (404)
Uses https://docs.csc.fi/ as the default URL for checking.
A full run takes around 10 minutes for https://docs.csc.fi/
To test another URL, pass it as the first command-line argument.
Don't forget to include http:// or https:// at the beginning.
eg: ./broken_ext_links.py https://www.google.com
"""
import subprocess
import sys
import re
def check_broken_ext_links(url):
    """
    Function for checking broken links (404).

    Runs the external ``linkchecker`` command against ``url`` (external links
    included), reads the text report from ``linkchecker-out.txt`` in the
    current working directory and prints every entry whose result is
    "404 Not Found", followed by the number of such entries.

    Args:
        url: Address of the site to check; passed to linkchecker unchanged.

    Raises:
        FileNotFoundError: If the ``linkchecker`` executable or the
            ``linkchecker-out.txt`` report cannot be found.
    """
    # "-F text" requests a text report file and "-q" silences console output.
    # check=False: a non-zero exit status is not treated as an error here
    # (linkchecker is documented to exit non-zero when it finds invalid links).
    subprocess.run(["linkchecker", "--check-extern", "-F", "text", "-q", url], check=False)
    with open("linkchecker-out.txt", "r", encoding="utf-8") as f_in:
        report = f_in.read()

    result = "Result Error: 404 Not Found"
    # The report aligns values by padding the field labels with spaces, so the
    # amount of whitespace after a label is matched loosely instead of exactly.
    not_found = re.compile(r"Result[ \t]+Error: 404 Not Found")
    # Fields are looked up by label rather than by line position because
    # optional lines (e.g. "Name") are not present in every entry.
    field = re.compile(r"^(URL|Name|Parent URL)[ \t]+(.*)$", re.MULTILINE)

    counter = 0
    print("BROKEN LINKS REPORT: \n")
    # Entries in the report are separated by blank lines.
    for entry in report.split("\n\n"):
        if not not_found.search(entry):
            continue
        fields = {label: value.strip() for label, value in field.findall(entry)}
        link_url = fields.get("URL", "")
        name = fields.get("Name", "")
        # Drop the trailing ", line N, col M" position info when it is present.
        parent_url = fields.get("Parent URL", "").split(", line ", 1)[0]
        counter += 1
        print(f"URL: {link_url}\nName: {name}\nParent URL: {parent_url}\n{result}\n")
    print(f"Processed: {counter}")
if __name__ == "__main__":
    # Script entry point: check the URL given as the single command-line
    # argument, or fall back to the default site when the argument count
    # is anything other than exactly one.
    if len(sys.argv) == 2:
        URL = sys.argv[1]
        # Only addresses that start with an http:// or https:// scheme are accepted.
        if re.match("http(s?)://.+", URL) is None:
            print("Incorrect format of the URL, don't forget to add http or https at the beginning")
        else:
            print(f"Testing {URL}")
            check_broken_ext_links(URL)
    else:
        URL = "https://docs.csc.fi/"
        print(f"No URL specified, testing: {URL}")
        check_broken_ext_links(URL)