-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis_viz
133 lines (81 loc) · 3.09 KB
/
analysis_viz
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Loading data into a dictionary
# Two position-aligned lists: one entry in `durations` for each entry in
# `years` (durations are presumably in minutes -- confirm against the source).
years = [2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
durations = [103, 101, 99, 100, 100, 95, 95, 96, 93, 90]
# Build the dictionary from the lists defined above rather than re-typing the
# literals, so the data has a single source of truth and cannot drift apart.
movie_dict = {'years': years, 'durations': durations}
# Print the dictionary
print(movie_dict)
# Creating a DataFrame from a dictionary
# pandas is bound to its conventional short name, pd
import pandas as pd
# Each key of movie_dict becomes a column of the new DataFrame
durations_df = pd.DataFrame(data=movie_dict)
# Display the resulting table
print(durations_df)
# Visual inspection of our data
# Bind matplotlib's pyplot module to its usual alias and open a new figure
import matplotlib.pyplot as plt
fig = plt.figure()
# Line plot with years on the x-axis and durations on the y-axis
plt.plot(years, durations)
# Give the chart a title
plt.title("Netflix Movie Durations 2011-2020")
# Render the figure
plt.show()
# Loading the rest of the data from a CSV
# Parse the CSV file (path is relative to the working directory) into a DataFrame
netflix_df = pd.read_csv(filepath_or_buffer="datasets/netflix_data.csv")
# Show the leading five rows as a quick sanity check
print(netflix_df.head(n=5))
# Filtering for movies
# Keep only the rows whose 'type' column equals "Movie"
netflix_df_movies_only = netflix_df.loc[netflix_df['type'] == 'Movie']
# Narrow the movie rows down to the columns used in the rest of the analysis
netflix_movies_col_subset = netflix_df_movies_only[
    ['title', 'country', 'genre', 'release_year', 'duration']
]
# Show the leading five rows of the reduced DataFrame
print(netflix_movies_col_subset.head(n=5))
# Creating a scatter plot
# Open a new, larger figure (12 x 8 inches)
fig = plt.figure(figsize=(12, 8))
# Scatter release year (x-axis) against duration (y-axis)
plt.scatter(
    x=netflix_movies_col_subset['release_year'],
    y=netflix_movies_col_subset['duration'],
)
# Give the chart a title
plt.title("Movie Duration by Year of Release")
# Render the figure
plt.show()
# More filtering
# Keep the rows whose duration is strictly below 60
short_movies = netflix_movies_col_subset.loc[netflix_movies_col_subset['duration'] < 60]
# Show the leading 20 rows of short_movies
print(short_movies.head(n=20))
# Marking non-feature films
# Genres to highlight and the marker colour assigned to each one.
genre_colors = {"Children": 'red', "Documentaries": 'blue', "Stand-Up": 'green'}
# Build one colour per row, in row order, by looking up the row's genre.
# Any genre not listed above (including a missing value) falls back to black.
# Iterating the 'genre' column directly avoids DataFrame.iterrows(), which
# constructs a full Series for every row just to read a single field.
colors = [genre_colors.get(genre, 'black') for genre in netflix_movies_col_subset['genre']]
# Inspect the first 10 values in the list; printed explicitly because a bare
# expression only displays its value inside an interactive notebook cell.
print(colors[0:10])
# Plotting with color
# Switch to the 'fivethirtyeight' style sheet and initialize a new 12 x 8 figure
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(12, 8))
# Scatter release year (x-axis) against duration (y-axis), one colour per point
plt.scatter(
    x=netflix_movies_col_subset.release_year,
    y=netflix_movies_col_subset.duration,
    c=colors,
)
# Title and axis labels
plt.title("Movie duration by year of release")
plt.xlabel("Release year")
plt.ylabel("Duration (min)")
# Render the figure
plt.show()
# Conclusion: are movies really getting shorter? Recorded answer is "maybe".
are_movies_getting_shorter = "maybe"