forked from shomerthesec/Movies_success_predictor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA.py
146 lines (101 loc) · 3.39 KB
/
EDA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
# %% imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Load the movie dataset, ignoring the last line of the file.
# `skipfooter` is only implemented by the pure-Python parser; naming the
# engine explicitly avoids the ParserWarning pandas emits when it has to fall
# back from the default C engine.
data = pd.read_csv(
    'S:/Movie_success_predictor/data/movie_success_rate.csv',
    skipfooter=1,
    engine='python')
# %%
# Summary statistics of the dataset as computed by `DataFrame.describe`.
describtion = data.describe()
# %%
# Draw one separate figure per column for the first seven summarised columns,
# each showing that column's distribution.
# NOTE(review): `distplot` is reported as deprecated in newer seaborn
# releases — confirm against the installed version before upgrading.
for column_name in describtion.columns[:7]:
    _, axis = plt.subplots(1, 1)
    sns.distplot(data[column_name], ax=axis)
# %%
# Bare expression: only has a visible effect when the cell is run
# interactively, where the column is echoed for inspection.
data['Revenue (Millions)']  # skewed to the right
# %% mimicking the distribution of Metascore with a normal dist
# Draw 1000 samples from a normal distribution that shares the mean and
# standard deviation of the Metascore column, then plot their distribution.
mu = data.Metascore.mean()
sig = data.Metascore.std()
sns.distplot(np.random.normal(loc=mu, scale=sig, size=1000))
# %% dropping genre as it's already converted into dummy columns
# Work on a copy so the raw `data` frame stays untouched.
df = data.copy()  # .drop('Genre', axis=1)
# %% collecting the names of unique actors
# Every `Actors` entry is split on commas and each name is stripped of
# surrounding whitespace before de-duplication.
actor_names = [
    name.strip()
    for cast in df.Actors
    for name in cast.split(',')
]
# `pd.unique` keeps first-seen order.  It is given an object ndarray rather
# than a plain list because passing a list is deprecated in recent pandas.
all_actors = pd.unique(np.array(actor_names, dtype=object))
# %%
# tmp_df=pd.DataFrame([], columns=all_actors)
# for actor_name in all_actors:
# for idx, n_i in enumerate(df.Actors):
# tmp_df[actor_name] = actor_name in n_i
# %%
# Columns from position 12 up to, but not including, the last column.
# NOTE(review): presumably these are the per-genre indicator columns and the
# last column is the target — confirm against the CSV layout.
categories = data.columns[12:-1]
# %%
# The original statement here was truncated (`print(data[]`), which is a
# syntax error that prevents the whole file from being parsed.  Presumably the
# intent was to inspect each category column, so print its value counts.
for cat in categories:
    print(data[cat].value_counts())
# %% number of each category
# Column-wise sum of the category columns, shown as a bar chart.
data[categories].sum(axis=0).plot.bar()
# %% action movies that succeeded
# Distribution of the `Success` column among rows whose `Action` flag is 1.
sns.countplot(x=data.loc[data['Action'] == 1, 'Success'])
plt.title('Action films and their success')
# %%
# Distribution of the `Success` column among rows whose `Animation` flag is 1.
# The query previously referenced the misspelled name `Aniimation`, which does
# not match the plot title and would fail as an undefined name.
# NOTE(review): confirm the column is spelled `Animation` in the CSV.
sns.countplot(x=data.query('Animation==1').Success)
plt.title('Animation films and their success')
# %%
# Distribution of the `Success` column among rows whose `Drama` flag is 1.
sns.countplot(x=data.loc[data['Drama'] == 1, 'Success'])
plt.title('Drama films and their success')
# %%
# Indicator table: one row per movie in `df`, one column per unique actor,
# initialised to 0 and set to 1 where the actor appears in that movie's cast.
zero_matrix = np.zeros((len(df), len(all_actors)))
actors_df = pd.DataFrame(zero_matrix, columns=all_actors)
for row_pos, cast in enumerate(df.Actors):
    cast_members = [member.strip() for member in cast.split(',')]
    # Translate the actor names into positional column indices.
    col_positions = actors_df.columns.get_indexer(cast_members)
    actors_df.iloc[row_pos, col_positions] = 1
# %%
# dummies is a dataframe for actors names
# %%
# Count plot of `Success`, faceted into one panel per distinct value of the
# `Genre` column and coloured by that same column.
sns.catplot(
    data=data,
    kind='count',
    x='Success',
    col='Genre',
    hue="Genre",
)
# %%
# Relational plot with `Success` on the x axis.
# NOTE(review): `y` receives `categories.T`, i.e. the Index of category
# column *names* rather than a single column of `df`; it is unclear whether
# seaborn can align that with `data` — confirm the intended plot.
sns.relplot(data=df,
x="Success",
y=categories.T)
# %%
# data.groupby('Category')
# can't do a group-by on category as it's split into multiple genres
# %% to return the data in a certain category
# for category in categories:
# # mask= df.loc[:,category] == 1
# tmp= df[ df.loc[:,category] ==1 ]
# %% function to do the above task
def category_df(df: pd.DataFrame, category: str) -> pd.DataFrame:
    """Return the rows of *df* whose *category* column equals 1.

    A boolean mask is used instead of ``DataFrame.query`` so that column
    names which are not valid Python identifiers (for example ``'Sci-Fi'``
    or names containing spaces) are handled correctly rather than being
    parsed as an expression.

    Args:
        df: Frame that contains the column named by *category*.
        category: Name of the column to filter on.

    Returns:
        A new frame holding only the matching rows, with the original index
        labels preserved.

    Raises:
        KeyError: If *category* is not a column of *df*.
    """
    return df[df[category] == 1]
# %% to only get the action movies
# action_df= category_df(df , 'Action' )
# %% to join names of the actors with the movies
# display( action_df.join(actors_df, on= action_df.index ) )
# %% due to the highly sparse actors list, let's work on the directors
# Remove the listed columns from `df`; everything else is kept for the
# director-level analysis.
provided_data = df.drop(
    columns=[
        'Actors',
        'Title',
        'Rank',
        'Year',
        'Rating',
        'Votes',
        'Revenue (Millions)',
        'Metascore',
    ],
)
# %% let's group by director and see what we can find
# Sum every remaining column within each `Director` group.
dir_tmp = provided_data.groupby(by='Director').sum()
# we can see that we have 524 unique directors, with some of them so successful
# %% shows the directors and their success
# Keep only directors whose summed `Success` is non-zero and plot that total
# per director.
succ_tmp = dir_tmp[dir_tmp['Success'] != 0]
sns.barplot(x=succ_tmp.index, y=succ_tmp['Success'])
# %% saving our processed data
# Write both processed frames to CSV files under the relative `data/` folder.
for frame, target in (
    (provided_data, 'data/provided_data.csv'),
    (actors_df, 'data/actors_data.csv'),
):
    frame.to_csv(target)