-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbot.py
120 lines (86 loc) · 3.31 KB
/
bot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
# coding: utf-8
import datetime as dt
import os
import shutil
from functools import partial
from multiprocessing.dummy import Pool as ThreadPool
import requests
from pypdf import PdfWriter
def get_page_count(issue_id, date_string):
    """Return the number of pages in the given issue for the given date.

    Queries the e-paper API for the page/article details of
    ``{issue_id}_{date_string}`` and counts the JSON entries returned.

    Args:
        issue_id: Issue identifier, e.g. "KANPRABHA_MN".
        date_string: Publication date in YYYYMMDD form.

    Returns:
        int: number of pages reported by the API.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    url = (
        "https://www.enewspapr.com/OutSourcingDataChanged.php"
        f"?operation=getPageArticleDetails&selectedIssueId={issue_id}_{date_string}"
    )
    response = requests.get(url, timeout=30)
    # Bug fix: the original evaluated `response.status_code` as a bare,
    # no-op expression.  Raise explicitly so a bad response surfaces here
    # instead of as a confusing JSON decode error on the next line.
    response.raise_for_status()
    return len(response.json())
def download_pdf(issue_id, date_string, page_no):
    """Download one page of the e-paper into the ``tmp/`` directory.

    ``issue_id`` has the form ``ISSUE_REGION`` and ``date_string`` is
    YYYYMMDD.  On HTTP 200 the PDF body is saved as
    ``tmp/<date_string>_<page>.PDF``; on any other status the response
    text is printed and nothing is written.
    """
    parts = issue_id.split("_")
    issue, region = parts[0], parts[1]
    yyyy, mm, dd = date_string[:4], date_string[4:6], date_string[6:8]
    padded_page = str(page_no).zfill(2)
    page_url = (
        f"https://www.enewspapr.com/News/{issue}/{region}/"
        f"{yyyy}/{mm}/{dd}/{date_string}_{padded_page}.PDF"
    )
    response = requests.get(page_url)
    print(page_url, response.status_code)
    if response.status_code != 200:
        print("Error :", response.text)
        return
    # Save under the same basename as the remote file.
    filename = page_url.rsplit("/", 1)[-1]
    with open("tmp/" + filename, "wb") as f:
        f.write(response.content)
def export_to_single_df(issue_id, date_string):
    """Merge every file in ``tmp/`` (sorted by name) into one PDF.

    Writes the merged document to ``output/{issue_id}_{date_string}.pdf``
    and prints the path.  If ``tmp/`` contains no files at all, prints
    "E-paper not found" and writes nothing.
    """
    merger = PdfWriter()
    page_files = sorted(os.listdir("tmp"))
    for page_file in page_files:
        merger.append("tmp/" + page_file)
    if not page_files:
        merger.close()
        print("E-paper not found")
        return
    out_filename = f"output/{issue_id}_{date_string}.pdf"
    merger.write(out_filename)
    merger.close()
    print("E-paper saved :", out_filename)
if __name__ == "__main__":
    # Work in India Standard Time (UTC+5:30) so "today" matches the
    # paper's publication date.
    current_time = dt.datetime.now(dt.timezone.utc) + dt.timedelta(hours=5, minutes=30)
    print("Current India time :", current_time)
    date_string = dt.datetime.strftime(current_time, "%Y%m%d")
    ISSUE_ID = "KANPRABHA_MN"

    # Bug fix: create the working directories up front.  The original
    # listed "output" before creating it, which raised FileNotFoundError
    # on a fresh checkout (the makedirs calls ran only later, inside the
    # not-yet-downloaded branch).
    os.makedirs("tmp", exist_ok=True)
    os.makedirs("output", exist_ok=True)

    # Skip the download when today's paper already exists in "output".
    # Output files are named "{ISSUE_ID}_{YYYYMMDD}.pdf", so the date is
    # the last underscore-separated token before the extension.
    paper_exists = False
    for file in os.listdir("output"):
        file_date_str = file.split("_")[-1].split(".")[0]
        if date_string == file_date_str:
            print(f"Paper for date {date_string} already exists")
            paper_exists = True
            break

    if not paper_exists:
        print(f"Downloading '{ISSUE_ID}' of date : {date_string}")
        page_count = get_page_count(ISSUE_ID, date_string)
        print("No. of pages :", page_count)
        pages = list(range(1, page_count + 1))
        print("Downloading all pages...")
        # Downloads are I/O-bound, so a small thread pool overlaps the
        # network waits.
        pool = ThreadPool(8)
        func = partial(download_pdf, ISSUE_ID, date_string)
        pool.map(func, pages)
        pool.close()
        pool.join()
        print("Download complete")
        export_to_single_df(ISSUE_ID, date_string)
        print("Cleaning 'tmp' folder")
        shutil.rmtree("tmp")
        print("Complete")

    # Retention: delete merged PDFs older than 7 days.
    for file in os.listdir("output"):
        file_date_str = file.split("_")[-1].split(".")[0]
        file_date = dt.datetime.strptime(file_date_str, "%Y%m%d")
        # current_time is timezone-aware; drop tzinfo so the subtraction
        # against the naive parsed date is valid.
        if (current_time.replace(tzinfo=None) - file_date).days > 7:
            print(f"Deleting File '{file}' older than 7 days")
            os.remove("output/" + file)