scrape.py
import argparse
import json
import logging
from datetime import datetime
from pathlib import Path

import pandas as pd
import psycopg
from psycopg.conninfo import make_conninfo

from apis import main as main_apis
from scraper import main as main_scraper

# get_config and etl are called in the main block but were never imported;
# the module name below is an assumption, adjust it to wherever these
# helpers actually live in this repository.
from db import etl, get_config


def get_sources(content):
    """Parse the sources JSON and split its entries into scraper and API sources."""
    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        logging.error(f"Failed to decode JSON: {e}")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        raise

    # Validate the data structure: every entry must be a dict with the required keys.
    for d in data:
        if not isinstance(d, dict):
            logging.error(f"Invalid data structure: expected a dictionary, got {type(d).__name__}")
            raise TypeError(f"Expected a dictionary, got {type(d).__name__}")
        required_keys = ["name", "id", "url", "type"]
        for key in required_keys:
            if key not in d:
                logging.error(f"Missing required key '{key}' in data: {d}")
                raise KeyError(f"Missing required key '{key}' in source entry: {d}")

    # Split the validated entries by collection method.
    scrapers, apis = [], []
    for d in data:
        if d["type"] == "scraper":
            scrapers.append(d)
        elif d["type"] == "api":
            apis.append(d)
    return scrapers, apis
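
# Example (illustrative): an entry with "type": "scraper" goes to the first list,
# one with "type": "api" to the second; entries with any other type are silently
# dropped by the elif chain above.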


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--country",
        default="fr",
        help="country code selecting the countries/<country>.json sources file",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=False,
        help="run scraping in headless mode",
    )
    parser.add_argument(
        "--push-to-db",
        action="store_true",
        default=False,
        help="push the scraped results to the database",
    )
    args = parser.parse_args()
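
    # Example invocations (illustrative):
    #   python scrape.py --headless
    #   python scrape.py --country fr --push-to-db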

    # Validate the source file
    source_path = Path(f"countries/{args.country}.json")
    try:
        with open(source_path, "r") as file:
            content = file.read()
    except FileNotFoundError:
        print(f"Source file {source_path} does not exist.")
        raise

    # Parse the sources
    scrapers, apis = get_sources(content)

    # Launch the scraper and the API collectors, then merge their results
    df1 = main_scraper(scrapers, headless=args.headless)
    df2 = main_apis(apis)
    df_merged = pd.concat([df1, df2])

    # Write the merged results to a timestamped file so successive runs do not
    # overwrite each other; make sure the output directory exists first.
    dt = datetime.now()
    insert_time = dt.strftime("%Y%m%d_%H%M%S")
    Path("results").mkdir(parents=True, exist_ok=True)
    with open(f"results/events_{insert_time}.json", "w", encoding="UTF-8") as file:
        df_merged.to_json(file, orient="records", force_ascii=False, indent=2)
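
    # The file written above holds a JSON array of event records, one object per
    # row of the merged dataframe (orient="records").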

    # Push the resulting json file to the database
    if args.push_to_db:
        print("Pushing scraped results into db...")
        credentials = get_config()  # assumed helper returning a dict of DB credentials
        host = credentials["host"]
        port = credentials["port"]
        user = credentials["user"]
        psw = credentials["psw"]
        database = credentials["database"]
        with psycopg.connect(
            make_conninfo(dbname=database, user=user, password=psw, host=host, port=port)
        ) as conn:
            etl(conn, df_merged)  # assumed helper loading the dataframe into the DB
        print("Done")