forked from JetBrains/kotlin-web-site
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdist.py
78 lines (51 loc) · 2.06 KB
/
dist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from bs4 import BeautifulSoup
from os import path, walk
# Location of the built site: "<directory of this file>/../dist".
# The value is not normalized (the "../" segment is kept verbatim). Functions
# in this module append URLs to it by plain string concatenation and slice
# walked paths by its length, so its exact spelling matters.
dist_path = path.join(path.dirname(__file__), "../", "dist")
def get_dist_page_content(url):
    """Return the text of the built file that backs *url*.

    *url* is appended to ``dist_path`` by plain string concatenation, so it is
    expected to start with ``/``. A URL ending in ``/`` is resolved to the
    ``index.html`` inside that directory.

    Raises:
        Exception: if no regular file exists at the resolved location. This
            includes a URL that names a directory without a trailing slash.
    """
    path_file = dist_path + url

    if url.endswith('/'):
        path_file += 'index.html'

    # isfile (not exists): a directory also "exists", but open() on it fails
    # with an OS-specific error instead of the failure reported below.
    if not path.isfile(path_file):
        raise Exception('Bad response during indexing')

    with open(path_file, 'r', encoding="UTF-8") as file:
        return file.read()
def get_dist_page_xml(url):
    """Load the built page for *url* and parse it with BeautifulSoup.

    The content comes from ``get_dist_page_content(url)``, so any exception
    raised there propagates unchanged. Parsing uses the ``html.parser``
    backend.
    """
    return BeautifulSoup(get_dist_page_content(url), "html.parser")
def get_dist_page_type(url):
    """Classify a dist URL by its suffix, its prefix and the page markup.

    Returns:
        ``'File_Text'`` for URLs ending in ``package-list`` or ``index.yml``,
        ``'File_Pdf'`` for URLs ending in ``pdf``, one of the ``'Page*'``
        labels or ``'Redirect'`` for URLs ending in ``/`` or ``.html``, and
        ``None`` for everything else.

    Only page URLs are loaded and parsed via ``get_dist_page_xml``; errors
    raised while loading them propagate to the caller.
    """
    # The three suffix families are mutually exclusive, so non-page URLs can
    # be settled up front without touching the file system.
    if not url.endswith(('/', '.html')):
        if url.endswith(('package-list', 'index.yml')):
            return 'File_Text'
        return 'File_Pdf' if url.endswith('pdf') else None

    label = 'Page'

    # NOTE(review): these prefixes have no leading slash, unlike "/api/latest/"
    # and "/spec/" below - confirm which URL shape callers actually pass.
    section_labels = (
        ('community', 'Page_Community'),
        ('docs/reference', 'Page_Reference'),
        ('docs/tutorials', 'Page_Tutorial'),
    )
    for prefix, section_label in section_labels:
        if url.startswith(prefix):
            label = section_label

    if url.endswith('404.html'):
        label = 'Page_NotFound'

    soup = get_dist_page_xml(url)

    # Each later rule overrides the earlier ones, so the order is significant.
    if url.startswith("/api/latest/"):
        if "jvm/stdlib" in url:
            label = "Page_API_stdlib"
        else:
            label = "Page_API_test"

    if url.startswith("/spec/"):
        label = "Page_Spec"

    if soup.select_one("body[data-article-props]"):
        label = 'Page_Documentation'

    if soup.find("meta", {"http-equiv": "refresh"}):
        label = 'Redirect'

    return label
def get_dist_pages():
    """List every file under ``dist_path`` as a ``(url, page_type)`` tuple.

    The URL is the file's location relative to ``dist_path`` with a leading
    ``/``. A file named exactly ``index.html`` is reported by its directory
    URL (``/docs/index.html`` becomes ``/docs/``). The type comes from
    ``get_dist_page_type(url)``.

    Returns:
        A list of tuples in ``os.walk`` order; empty if ``dist_path`` is not
        a directory.
    """
    pages = []

    if not path.isdir(dist_path):
        return pages

    for root, _dirnames, filenames in walk(dist_path):
        # Part of the walked directory below dist_path; "/" for the root itself.
        prefix_path = root[len(dist_path):] or "/"

        for filename in filenames:
            # NOTE(review): path.join uses the OS separator, so on Windows
            # these URLs would contain backslashes.
            url = path.join(prefix_path, filename)

            # Compare the file name itself: a suffix test on the URL would
            # also truncate names such as "myindex.html".
            if filename == 'index.html':
                url = url[:-len(filename)]

            pages.append((url, get_dist_page_type(url)))

    return pages