-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_series_rating.py
155 lines (122 loc) · 4.94 KB
/
get_series_rating.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import sys
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from threading import Thread
def parse_episodes(doc):
    """Parse episodes from the season page of a series on IMDB.

    Args:
        doc: Parsed season page; a BeautifulSoup document or any object
            offering the same ``find``/``find_all`` interface.

    Yields:
        Tuples ``(num, title, rating, num_votes)`` for every episode whose
        air date could be parsed and lies in the past. ``num`` and
        ``title`` are strings, ``rating`` is a float (0.0 when no rating is
        shown) and ``num_votes`` is an int (0 when no vote count is shown).
    """
    # A page without an episode list simply yields nothing
    container = doc.find("div", class_="eplist")
    if container is None:
        return
    # Find all TVEpisode items within the episode list
    eplist = container.find_all(itemtype="http://schema.org/TVEpisode")
    now = datetime.now()
    for episode in eplist:
        # Get airdate for episode; entries without one are skipped
        airdate_tag = episode.find(class_="airdate")
        if airdate_tag is None:
            continue
        # Abbreviated months carry a trailing dot ("Sep.") while "May" does
        # not; dropping the dots lets a single format cover both spellings
        date_str = airdate_tag.text.strip().replace(".", "")
        try:
            airdate = datetime.strptime(date_str, "%d %b %Y")
        except ValueError:
            # Empty or incomplete air date (e.g. year only): skip episode
            continue
        # Ensure episode has already aired
        if now <= airdate:
            continue
        # Get title and episode number from entry
        title = episode.strong.text
        num = episode.find("meta", itemprop="episodeNumber")["content"]
        # The rating widget may be missing (presumably for episodes that
        # have not been rated); treat that like an empty widget
        rating_widget = episode.find(class_="ipl-rating-star")
        info = rating_widget.find_all("span") if rating_widget is not None else []
        try:
            # Second span holds the rating, third span the vote count
            rating = float(info[1].text) if len(info) >= 2 else 0.0
            if len(info) >= 3:
                num_votes = int(re.sub(r"[^0-9]", "", info[2].text))
            else:
                num_votes = 0
        except ValueError:
            # Unparseable rating or vote count: skip episode
            continue
        yield num, title, rating, num_votes
def get_series_ratings(imdb_id):
    """Return ratings for all episodes of the series with given IMDB ID.

    The episode overview page is fetched first; the series title, the list
    of seasons and the episodes of the preselected season are read from it.
    Every other season is fetched and parsed in its own thread. Because
    this is a generator, no request is made until iteration starts.

    Args:
        imdb_id: IMDB title identifier; inserted verbatim into the URL.

    Yields:
        Tuples ``(episode_num, series_title, title, rating, num_votes)``
        where ``episode_num`` is ``"<season>.<episode>"``. Seasons are
        emitted in ascending numeric order.
    """
    url = "http://www.imdb.com/title/" + imdb_id + "/episodes"
    headers = {"Accept-Language": "en"}
    # Never wait indefinitely on an unresponsive server
    timeout = 30

    # Fetch list of episodes for given IMDB ID
    data = requests.get(url, headers=headers, timeout=timeout)
    # Parse document and find dropdown menu containing list of seasons
    doc = BeautifulSoup(data.text, "html.parser")
    seasons = doc.find("select", id="bySeason").find_all("option")
    # Extract series title from initial page
    series_title = doc.find("h3", itemprop="name").a.text
    # Shared data structure for storing retrieved data grouped by season;
    # each thread only ever touches the entry of its own season
    season_data = {}

    def store_episodes(season_number, season_doc):
        """Parse a season page and record its episodes in season_data."""
        episodes = season_data[season_number] = []
        for episode in parse_episodes(season_doc):
            # Prefix episode number with season number
            episode_num = season_number + "." + episode[0]
            episodes.append((episode_num, series_title) + episode[1:])

    def fetch_season_and_parse(season_number):
        """Fetches and parses the data associated to the given season."""
        season_url = url + "?season=" + season_number
        response = requests.get(season_url, headers=headers, timeout=timeout)
        store_episodes(
            season_number, BeautifulSoup(response.text, "html.parser")
        )

    def season_order(season_number):
        """Sort key placing seasons in numeric rather than string order."""
        try:
            return (0, int(season_number), season_number)
        except ValueError:
            # Non-numeric season labels go last, in alphabetical order
            return (1, 0, season_number)

    threads = []
    for season in seasons:
        if season.has_attr("selected"):
            # This season's episodes are already on the initial page
            store_episodes(season["value"], doc)
        else:
            # Spawn new thread for fetching data for the given season
            thread = Thread(
                target=fetch_season_and_parse,
                args=(season["value"], )
            )
            threads.append(thread)
            thread.start()
    # Wait for threads to complete
    for thread in threads:
        thread.join()
    # Emit seasons by season number; a plain string sort would put
    # season "10" before season "2"
    for key in sorted(season_data, key=season_order):
        # Yield data for each episode separately
        for episode in season_data[key]:
            yield episode
def main():
    """Called if script is called from the command line.

    Expects exactly one command line argument, the IMDB ID of a series,
    and prints one CSV row per episode to standard output. Exits with
    status 1 and a usage message on standard error otherwise.
    """
    # Make sure an IMDB ID is passed through the command line
    if len(sys.argv) != 2:
        # Keep diagnostics off stdout, which carries the CSV data
        print("USAGE:", sys.argv[0], "[imdb_id]", file=sys.stderr)
        sys.exit(1)

    def quoted(value):
        """Wrap a value in double quotes, doubling any embedded quotes."""
        return "\"" + str(value).replace("\"", "\"\"") + "\""

    # Get ratings for all episodes
    ratings = get_series_ratings(sys.argv[1])
    # Print episode ratings as CSV
    print("episode_num,name,title,rating,rating_count")
    for episode_num, name, title, rating, rating_count in ratings:
        # Text fields are quoted and escaped so that titles containing
        # commas or double quotes still form a valid CSV row
        print("%s,%s,%s,%.1f,%d" % (
            quoted(episode_num), quoted(name), quoted(title),
            rating, rating_count
        ))


if __name__ == "__main__":
    main()