scrape_urls.py
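"""Crawl queued article URLs with Scrapy.

Reads queued (spider, URL) pairs from the database, groups them by spider,
and runs one crawl per spider. Usage is assumed to be
``python scrape_urls.py <config.yml>``, since load_config() parses sys.argv.
"""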
import logging
import sys
from datetime import timedelta

import yaml
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from technews_nlp_aggregator.common import load_config
from technews_nlp_aggregator.persistence import ArticleDatasetRepo, ArticlesSpiderRepo
from technews_nlp_aggregator.scraping.main.scrapy import settings
from technews_nlp_aggregator.scraping.main.scrapy.pipelines import Pipeline
# The wildcard import pulls every *Spider class into globals() so that
# do_crawl() can look spiders up by name.
from technews_nlp_aggregator.scraping.main.scrapy.spiders import *

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def do_crawl(articleDatasetRepo, spidermap, go_back):
    """Crawl each spider's queued URLs, revisiting articles up to go_back days old."""
    crawler_settings = Settings()
    crawler_settings.setmodule(settings)
    process = CrawlerProcess(settings=crawler_settings)

    # The cutoff date is the same for every spider, so compute it once up front.
    max_date = articleDatasetRepo.get_latest_article_date()
    go_back_date = max_date - timedelta(days=go_back)

    for spider_name, urls in spidermap.items():
        # Spider classes follow the "<name>Spider" naming convention and were
        # brought into globals() by the wildcard import above.
        spider_class = globals().get(spider_name + "Spider")
        if spider_class:
            spider_class.start_urls = urls
            process.crawl(spider_class, articleDatasetRepo, go_back_date, urls)
        else:
            logging.error("COULD NOT FIND SPIDER %s", spider_name)
    process.start()  # blocks until all crawls have finished
def create_spider_map(url_queued):
    """Group queued (spider, url) pairs into {spider_name: [url, ...]}."""
    to_process = {}
    for spider, url in url_queued:
        if spider and url:
            to_process.setdefault(spider, []).append(url)
    return to_process
if __name__ == '__main__':
    config = load_config(sys.argv)
    go_back = config["go_back"]

    with open(config["key_file"]) as key_file:
        db_config = yaml.safe_load(key_file)
    db_url = db_config["db_url"]
    logging.info("DB_URL: %s", db_url)

    articleDatasetRepo = ArticleDatasetRepo(db_url)
    articleSpiderRepo = ArticlesSpiderRepo(db_url)

    # Pull the queued (spider, url) pairs, group them by spider, and crawl.
    url_queued = articleSpiderRepo.retrieve_urls_queued()
    result = [(row["UTA_SPIDER"], row["UTA_URL"].strip()) for row in url_queued]
    to_process = create_spider_map(result)
    do_crawl(articleDatasetRepo, to_process, go_back)
    print(Pipeline.successfully_crawled)