-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessURLs.py
198 lines (157 loc) · 7.42 KB
/
processURLs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import init
import redis
import identifyMovie as im
import movieMeta as mm
# Built-in defaults for every setting; the config file loaded further down
# may override any of them.

# When True, a movie that cannot be identified automatically makes the script
# ask the user for its IMDB id on the command line.
interactive = False

# Redis key names. Tokens between '#' characters are placeholders that get
# substituted at runtime.
# set (formerly a queue) of the movie URLs that still need to be indexed
key_toIndex = "toIndex"
# set of the unreadable movies (no size/hash could be computed)
key_failed = "failed"
# set of the movies for which no IMDB id was found
key_noIMDB = "noIMDB"
# set of the movies for which no metadata was found
key_noMeta = "noMeta"
# hash holding all the IMDB metadata we are interested in
key_imdb = "imdb:#imdbid#"
# collection of all the URIs matching a given IMDB id
key_uris = "uris:#imdbid#"
# cached mapping between (bytesize, movhash) and IMDB id
key_hashIMDB = "h2i:#size#:#hash#"
# set of the IMDB ids of all movies of a given genre
key_genres = "genre:#genre#"

# Default parameters for the redis connection (kept as strings).
redis_host = "localhost"
redis_port = "6379"
redis_db = "0"
# Load configuration params and start the logger.
# (just use the default config.yaml filename for now, might extend to a parameter later)
conf, logger = init.configure()
if conf is not None:
    # Each setting is optional in the config file: a missing key keeps the
    # hardcoded default assigned above instead of aborting with a KeyError.
    interactive = conf.get('interactive_ident', interactive)
    key_toIndex = conf.get('key_toIndex', key_toIndex)
    key_failed = conf.get('key_failed', key_failed)
    key_noIMDB = conf.get('key_noIMDB', key_noIMDB)
    key_noMeta = conf.get('key_noMeta', key_noMeta)
    key_imdb = conf.get('key_imdb', key_imdb)
    key_uris = conf.get('key_uris', key_uris)
    key_hashIMDB = conf.get('key_hashIMDB', key_hashIMDB)
    key_genres = conf.get('key_genres', key_genres)
    redis_host = conf.get('redis_host', redis_host)
    redis_port = conf.get('redis_port', redis_port)
    redis_db = conf.get('redis_db', redis_db)
else:
    # init.configure() handed back no configuration: keep every default
    logger.error("Could not open config file, reverting to defaults")
def getCachedHash(r, movURL):
    """Return the (size, hash) pair of a movie URL, using redis as a cache.

    The pair is stored in the redis hash named after the URL itself, under
    the fields 'size' and 'hash'.

    Args:
        r: redis client used for the cache lookups and writes.
        movURL: URL of the movie; also used as the redis key.

    Returns:
        A tuple (size, hash). On a cache hit both values are whatever redis
        returns for the stored fields; on a miss they are the values returned
        by im.hashURL.
        NOTE(review): the two paths may yield different types - confirm.

    Raises:
        im.hashException: propagated unchanged when im.hashURL fails.
    """
    if not r.exists(movURL):
        # Nothing stored for this URL yet: compute size/hash and remember them.
        logger.debug('Moviehash was not cached, hashing URL %s' % movURL)
        # a failure here (im.hashException) is left to propagate to the caller
        size, movhash = im.hashURL(movURL)
        logger.debug("Caching URL size and hash")
        r.hset(movURL, 'size', size)
        r.hset(movURL, 'hash', movhash)
        return size, movhash

    # The URL was already processed once: reuse the stored values.
    logger.debug('Moviehash was cached, skipping hashing (yay!)')
    return r.hget(movURL, 'size'), r.hget(movURL, 'hash')
def getCachedIMDBid(r, bytesize, movhash):
    """Return the IMDB id matching a movie's (bytesize, movhash) pair.

    The mapping is looked up in redis first. On a miss the id is requested
    from im.getIMDBid; if that fails and interactive mode is enabled, the
    user is asked to type it. A non-empty id obtained this way is cached.

    Args:
        r: redis client used for the cache lookups and writes.
        bytesize: size of the movie, as returned by getCachedHash.
        movhash: hash of the movie, as returned by getCachedHash.

    Returns:
        The IMDB id (cached, looked up, or manually entered).

    Raises:
        im.IMDBException: when automatic identification fails and either
            interactive mode is off or the user entered nothing.
    """
    # Use the module-level key template (default or config override) rather
    # than indexing conf directly, which breaks when no config was loaded.
    # str() guards against a non-string size/hash coming straight from the
    # hashing step: str.replace only accepts strings.
    key_h2i = key_hashIMDB.replace("#size#", str(bytesize)).replace("#hash#", str(movhash))

    # first check if the IMDBid is cached
    IMDBid = r.get(key_h2i)
    if IMDBid is not None:
        logger.debug('IMDBid was cached, skipping identification (yay!)')
        return IMDBid

    # IMDBid was not cached, try to get it automatically
    logger.debug('IMDBid was not cached, trying to get it automatically')
    try:
        IMDBid = im.getIMDBid(bytesize, movhash)
    except im.IMDBException:
        # if the id cannot be found automatically, ask for it manually
        # (but only if interactive mode is enabled)
        if not interactive:
            raise
        # NOTE(review): on Python 2 input() evaluates what is typed - confirm
        # the target interpreter.
        IMDBid = input("IMDBid not found, please enter it manually: ")
        if not IMDBid:
            # the user just pressed enter
            raise im.IMDBException("IMDBid not found and not manually provided")

    # TODO: check for IMDB validity?
    if IMDBid:
        logger.debug('Caching IMDBid')
        r.set(key_h2i, IMDBid)
    return IMDBid
def getCachedMetadata(r, IMDBid):
    """Return the metadata of a movie, downloading and caching it if needed.

    If the redis hash for this IMDB id already exists its content is
    returned. Otherwise the metadata is fetched through mm.getMovieMeta and
    a selection of its fields is stored in that hash; the movie is also added
    to one set per genre and its poster URLs to the 'posters:<IMDBid>' set.

    Args:
        r: redis client used for the cache lookups and writes.
        IMDBid: IMDB id of the movie (string).

    Returns:
        On a cache hit, the content of the redis hash (r.hgetall); on a miss,
        the object returned by mm.getMovieMeta.

    Raises:
        mm.movieMetaException: propagated unchanged when the download fails.
    """
    IMDBkey = key_imdb.replace("#imdbid#", IMDBid)

    # first check if metadata already exists
    if r.exists(IMDBkey):
        logger.debug('Metadata was cached, skipping download (yay!)')
        return r.hgetall(IMDBkey)

    # if not, get them from IMDB + possibly other APIs (check movieMeta for details)
    logger.debug('Metadata was not cached, downloading...')
    # a failure here (mm.movieMetaException) is left to propagate to the caller
    m = mm.getMovieMeta(IMDBid, downloadPosters=True)

    # getMovieMeta should provide everything we need (IMDB meta + something more)
    # we want to save the following keys in the imdb:IMDBid hash
    imdbKeys = ('title', 'long imdb title', 'slug', 'year', 'rating',
                'runtime_simple', 'plot outline', 'cover url', 'full-size cover url',
                'altPosterURL')
    for key in imdbKeys:
        # `in` replaces has_key(), which plain dicts no longer have on Python 3
        if key in m:
            r.hsetnx(IMDBkey, key, m[key])
        else:
            logger.warning('Movie with IMDB %s does not have key %s!' % (str(IMDBid), key))

    # movie genres
    for genre in m.get('genres', []):
        r.sadd(key_genres.replace("#genre#", genre), IMDBid)

    # poster URLs
    # TODO: fix key names here
    # I like the fact that posters: keys are agnostic of the poster provenance,
    # but you know nothing about quality etc! Probably it is better to save
    # poster URLs together with imdb:* keys, keeping their original semantics
    # related to size/quality
    for posterKey in ('full-size cover url', 'altPosterURL'):
        if posterKey in m:
            r.sadd('posters:' + IMDBid, m[posterKey])

    # hand the metadata back: the caller assigns the result of this function
    return m
if __name__ == '__main__':
    # Driver loop: pop URLs from the toIndex set one at a time, identify each
    # movie, and store its URL and metadata in redis. URLs that fail at any
    # step are moved to the matching failure set.

    # connect to redis
    logger.info("Storing URLs in Redis (%s:%s, db %s)" % (redis_host, redis_port, redis_db))
    r = redis.StrictRedis(host=redis_host, port=redis_port, db=redis_db, decode_responses=True)

    movnum = r.scard(key_toIndex)
    # while there are URLs to index...
    while movnum > 0:
        movURL = r.spop(key_toIndex)
        # some other thread might have popped the last url from the queue
        if not movURL:
            break

        try:
            logger.info("Identifying URL " + movURL)
            s, h = getCachedHash(r, movURL)

            # get the IMDB id for this size/hash pair
            logger.debug("Getting IMDBid for movie (%s,%s)" % (str(s), str(h)))
            IMDBid = getCachedIMDBid(r, s, h)

            # if you got the IMDB id, save it together with the URL
            logger.debug("IMDBid found! Saving URL for IMDBid %s" % str(IMDBid))
            r.sadd(key_uris.replace("#imdbid#", IMDBid), movURL)

            # add metadata
            logger.debug("Getting movie metadata for IMDBid %s" % str(IMDBid))
            getCachedMetadata(r, IMDBid)
            logger.debug("Metadata found for IMDBid %s" % str(IMDBid))

        # In the handlers below, exceptions only carry a `.message` attribute
        # if their class defines one; fall back to the exception itself so the
        # handler cannot fail before the URL is recorded in its failure set.
        except im.hashException as e:
            logger.error("Error hashing URL (%s) => moving to failed" % getattr(e, 'message', e))
            r.sadd(key_failed, movURL)
        except im.IMDBException as e:
            logger.error("Error identifying URL (%s) => moving to unidentified" % getattr(e, 'message', e))
            r.sadd(key_noIMDB, movURL)
        except mm.movieMetaException as e:
            logger.error("Error getting metaadata for URL (%s) => moving to noMeta" % getattr(e, 'message', e))
            r.sadd(key_noMeta, movURL)
        finally:
            # get updated queue length
            movnum = r.scard(key_toIndex)
            logger.info("Queue length: %d" % movnum)

    logger.info("Movie queue empty: stopping.")