-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpage_gatherer.py
35 lines (24 loc) · 884 Bytes
/
page_gatherer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from pyquery import PyQuery as pq
from utils import print_dot
# Works for Tokyo Ghoul and Naruto.
def v1(base_url: str, category: str) -> set:
    """Collect the URLs of every entry listed on a category's pages.

    Starts at ``{base_url}/wiki/Category:{category}`` and keeps following the
    ``a.category-page__pagination-next`` link until a page has none.

    Args:
        base_url: Site root that is prefixed verbatim to every member link's
            ``href`` and to ``/wiki/Category:...``; a trailing slash would
            therefore be duplicated.
        category: Category name inserted verbatim into the first page's URL.

    Returns:
        A set of ``base_url + href`` strings, one per anchor found inside
        ``div.category-page__members``. Anchors whose ``href`` is missing,
        empty, or contains ``"Discussion"`` are skipped.
    """
    chapters_urls = set()
    glossary_url = f"{base_url}/wiki/Category:{category}"
    count = 1
    print("Gathering glossary.")
    # The "next page" lookup yields a falsy value on the last page, which
    # ends the loop; no separate "done" flag is needed.
    while glossary_url:
        print_dot(count, 50)
        q = pq(url=glossary_url)
        for link in q("div.category-page__members a"):
            href = link.get("href")
            # An anchor without an href returns None here; testing
            # `"Discussion" in None` would raise TypeError, so skip it.
            if not href or "Discussion" in href:
                continue
            chapters_urls.add(f"{base_url}{href}")
        glossary_url = q("a.category-page__pagination-next").attr("href")
        count += 1
    print(f"\nDone gathering glossary. Found {len(chapters_urls)} entries.")
    return chapters_urls