# topstories.py

import os
import redis
import requests
import feedparser
import datetime
import dateutil.parser
import urllib2
from json import loads, dumps
from collections import OrderedDict
from util import send_email
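
# REDIS_URL comes from REDISTOGO_URL, the config var set by Heroku's
# Redis To Go add-on; the default falls back to a local redis for development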
REDIS_URL = os.getenv('REDISTOGO_URL', 'redis://localhost:6379')
r_server = redis.StrictRedis.from_url(REDIS_URL)
news_feed_list_url = 'https://raw.githubusercontent.com/spacehackers/topstories/master/news_feeds.txt'
ADMIN_EMAIL = os.getenv('ADMIN_EMAIL', None)
# if a headline includes these terms, skip it
exclude_terms = [l.lower() for l in [
    'Wallpaper',
    'Rosetta Stone',
    'Johannes'
]]
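# e.g. a (hypothetical) headline like "New Rosetta Wallpaper Collection"
# gets skipped, since its lowercased title contains the term 'wallpaper'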


def send_email_update(new_top_stories):
    # email the admin a digest of newly found stories, at most ten
    if not ADMIN_EMAIL:
        return
    for probe_name in [p for p in new_top_stories][:10]:
        title = new_top_stories[probe_name]['title']
        link = new_top_stories[probe_name]['link']
        # msg = '%s: %s <a href = "%s">%s</a>' % (probe_name, title, link, link)
        msg = '%s: %s' % (title, link)
        if len(new_top_stories) > 10:
            msg += "\n Alert! There are more than 10 top stories in this update"
        subject = "spaceprobes update for %s" % probe_name
        send_email(subject, msg, ADMIN_EMAIL)


def get_search_terms_by_probe():
    # collect search terms from the google doc
    search_terms_csv_url = 'https://docs.google.com/spreadsheet/pub?key=0AtHxCskz4p33dEs5elE0eUxfd3Z4VnlXTVliVWJxRHc&single=true&gid=4&output=csv'
    response = requests.get(search_terms_csv_url)
    assert response.status_code == 200, 'Wrong status code'
    search_terms = {}
    for line in response.content.split("\n")[1:]:  # skip the header row
        probe_name = line.strip(',').split(',')[0].strip()
        search_terms[probe_name.lower()] = [l.strip('"').strip() for l in line.split(',')[1:] if l]  # drop empty strings
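    # note the naive comma split: a quoted field that itself contains a comma
    # would be broken apart; e.g. a (hypothetical) row like
    #   Cassini,Cassini,"Enceladus flyby"
    # parses to {'cassini': ['Cassini', 'Enceladus flyby']}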
    # sort it alphabetically by probe name
    search_terms_sorted = OrderedDict()
    for probe_name in sorted(search_terms):
        search_terms_sorted[probe_name] = search_terms[probe_name]
    return search_terms_sorted


def update_topstories():
    """ fetches topstories from rss feeds and updates redis """
    try:
        topstories = loads(r_server.get('topstories'))
    except TypeError:
        topstories = {}
    try:
        topstories_archive = loads(r_server.get('topstories_archive'))
    except TypeError:
        topstories_archive = {}
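    # (r_server.get returns None for a missing key, and loads(None) raises
    # TypeError, so a fresh redis starts both stores off as empty dicts)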
    search_terms = get_search_terms_by_probe()
    # read the list of feeds from the remote github repo, skipping the first line
    response = urllib2.urlopen(news_feed_list_url)
    news_feeds = response.readlines()[1:]
    # keep track of each new top story we add this run
    new_top_stories = {}
    for line in news_feeds:
        url = line.split('#')[0].strip()  # drop any trailing '#' comment in the feed list
        feed = feedparser.parse(url)
        for post in feed.entries:
            # now search each term against each post.title
            for probe_name, terms in search_terms.items():
                for term in terms:
                    if len(term) < 3:
                        continue  # skip one- and two-letter terms; they match far too broadly
                    if post.title.find(term) > -1 and max([post.title.lower().find(t) for t in exclude_terms]) < 0:
                        # term is found in post title and title contains no exclude_terms
                        # use feedparser's published_parsed to find the published date as time.struct_time
                        # convert it to a string and that's what gets stored in redis
                        # and convert it to a datetime.datetime object for comparing to other post dates
                        try:
                            published_str = datetime.datetime(*post.published_parsed[:6]).isoformat()
                            published = dateutil.parser.parse(published_str)
                        except AttributeError, e:
                            print e
                            print url
                            print post
                            continue
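                        # (feedparser normalizes published_parsed to UTC, so these
                        # timestamps compare consistently across different feeds)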
                        if probe_name in topstories:
                            if post.link.strip() == topstories[probe_name]['link'].strip():
                                continue  # we are already serving this link
                            if published != max([published, dateutil.parser.parse(topstories[probe_name]['published'])]):
                                continue  # the link we already have is the latest, move along
                        # add this story to topstories
                        try:
                            topstories_archive[probe_name] = topstories[probe_name]  # first archive the old story
                        except KeyError:
                            pass  # no old story to archive; this is a newly added probe
                        try:
                            topstories[probe_name] = {'title': post.title, 'link': post.link, 'published': published_str}
                            new_top_stories[probe_name] = topstories[probe_name]
                        except AttributeError:
                            if __name__ != "__main__":
                                print "can't find post.title or post.link, here is the post:"
                                print post
                            continue
    # persist the updated stories back to redis
    r_server.set('topstories', dumps(topstories))
    r_server.set('topstories_archive', dumps(topstories_archive))
    if new_top_stories:
        # send an email digest!
        print "New Stories Added!"
        print new_top_stories
        send_email_update(new_top_stories)
    else:
        print "no new stories added"
    return topstories


if __name__ == "__main__":
    topstories = update_topstories()
    print 'ok'
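
# this script is presumably run on a schedule (e.g. the Heroku Scheduler
# add-on invoking `python topstories.py`); each run polls the feeds once,
# updates redis, and exits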