-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
75 lines (66 loc) · 2.24 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
import os
from urllib import response
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import string
import re
import pprint as pp
# Base URL for Finviz quote pages; the ticker symbol is appended directly.
finviz_url = "https://finviz.com/quote.ashx?t="
# NOTE(review): never read or written anywhere in this file — candidate for removal.
news_headlines = {}
def get_news_tables(ticker):
    """Fetch the Finviz quote page for *ticker* and return its news table.

    Returns the element with id "news-table" (or None if the page has none).
    """
    # A browser-ish User-Agent is required; Finviz rejects the default one.
    request = Request(finviz_url + ticker, headers={'User-Agent': 'my-app/0.0.1'})
    page = urlopen(request)
    soup = BeautifulSoup(page, "lxml")
    return soup.find(id="news-table")
def get_news_headlines(table):
    """Extract headline rows from a Finviz news table element.

    Parameters:
        table: parsed element (the ``#news-table``) whose ``<tr>`` rows each
            contain an ``<a>`` (headline link), a ``<td>`` (timestamp text)
            and a ``<span>`` (news source).

    Returns:
        list of ``[title, url, timestamp, source]`` lists, where title has
        punctuation stripped and timestamp is either "date time" or "time".

    Fixes vs. original: the accumulator was named ``response``, shadowing the
    ``urllib.response`` module imported at the top of the file; the ``index``
    from ``enumerate`` was never used; ``findAll`` is the deprecated
    BeautifulSoup alias for ``find_all``.
    """
    headlines = []
    for row in table.find_all("tr"):
        headlines.append([
            re.sub(r'[^\w\s]', '', row.a.text),  # title, punctuation removed
            row.a["href"],                       # article URL
            row.td.text.replace("\xa0", u""),    # timestamp, nbsp stripped
            row.span.text[1:],                   # source, leading space dropped
        ])
    return headlines
def fix_dates(data):
    """Propagate dates onto time-only rows.

    Finviz only prints the date on the first headline of each day; later rows
    carry just a time.  Rows whose timestamp lacks a date inherit the most
    recently seen one.  Mutates *data* in place and returns it.
    """
    current_date = ""
    for row in data:
        parts = row[2].split(" ")
        if len(parts) == 2:
            # Full "date time" stamp: remember the date for the rows below.
            current_date = parts[0]
        else:
            # Time-only stamp: prefix the last date we saw.
            row[2] = current_date + " " + parts[0]
    return data
def group_by_date(data):
    """Regroup [title, url, "date time", source] rows into a per-date dict.

    Returns ``{date: [headline_dict, ...]}`` where each headline dict has
    "title", "url", "timestamp" and "source" keys.  Mutates each row's
    timestamp field down to just its time component.
    """
    grouped = {}
    for row in data:
        parts = row[2].split(" ")
        date_key = parts[0]
        row[2] = parts[1]
        entry = {
            "title": row[0],
            "url": row[1],
            "timestamp": row[2],
            "source": row[3],
        }
        grouped.setdefault(date_key, []).append(entry)
    return grouped
def save_to_json(data, ticker):
    """Write one JSON file per date under ``json/<ticker>/<date>.json``.

    Parameters:
        data: ``{date: [headline_dict, ...]}`` mapping from group_by_date.
        ticker: ticker symbol used as the subdirectory name and stored in
            each output package.

    Bug fix: the original put the file write in the ``else`` branch of the
    directory-exists check, so on the first run for each ticker the directory
    was created but NO files were written — all scraped data was silently
    dropped.  ``os.mkdir`` would also fail outright if the parent ``json``
    directory did not exist; ``os.makedirs(..., exist_ok=True)`` covers both.
    """
    out_dir = os.path.join("json", ticker)
    os.makedirs(out_dir, exist_ok=True)
    for date, headlines in data.items():
        package = {"ticker": ticker, "date": date, "headlines": headlines}
        with open(os.path.join(out_dir, date + ".json"), "w") as f:
            json.dump(package, f, indent=4)
# Entry point: scrape and persist Finviz headlines for every listed ticker.
with open("tickers.txt") as ticker_file:
    symbols = ticker_file.read().splitlines()

for symbol in symbols:
    headlines = get_news_headlines(get_news_tables(symbol))
    dated = fix_dates(headlines)
    save_to_json(group_by_date(dated), symbol)