-
Notifications
You must be signed in to change notification settings - Fork 0
/
eda.py
47 lines (38 loc) · 1.34 KB
/
eda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Render with the non-interactive 'Agg' backend: every figure produced by
# this script is written to a PNG file and never shown in a window.
plt.switch_backend('Agg')

# Preprocessed movie dataset that all of the analysis below works on.
DATA_FILE = 'movies_cleaned.csv'
df = pd.read_csv(DATA_FILE)
# Summary statistics
# Descriptive statistics as reported by DataFrame.describe().
print(df.describe())

# Frequency table for each categorical column of interest, printed in the
# order listed here.
for categorical_column in ('genre', 'release_country'):
    print(df[categorical_column].value_counts())
# Data distribution
# Histograms (30 bins each) drawn by pandas on a single 15x10-inch figure.
df.hist(figsize=(15, 10), bins=30)
hist_fig = plt.gcf()  # current figure, i.e. the one the call above drew on
hist_fig.tight_layout()
hist_fig.savefig('histograms.png')  # written to disk rather than displayed
plt.close(hist_fig)
# Box plots for a fixed set of numerical columns, all on one set of axes.
# NOTE(review): the five columns share a single y-axis; if their magnitudes
# differ a lot the smaller ones will be hard to read - confirm this is
# the intended presentation.
box_columns = ['budget', 'gross', 'runtime', 'score', 'votes']
box_fig, box_ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df[box_columns], ax=box_ax)
box_fig.savefig('boxplots.png')  # written to disk rather than displayed
plt.close(box_fig)
# Bar plot for the categorical 'genre' column: one horizontal bar per genre,
# ordered by the index of value_counts() (most frequent genre first).
bar_fig, bar_ax = plt.subplots(figsize=(15, 10))
genre_order = df['genre'].value_counts().index
sns.countplot(y='genre', data=df, order=genre_order, ax=bar_ax)
bar_fig.savefig('barplots.png')  # written to disk rather than displayed
plt.close(bar_fig)
# Correlation analysis
# Keep every numeric column for the correlation matrix. The 'number'
# selector is used instead of an explicit ['float64', 'int64'] list so that
# numeric columns stored with another width or signedness (e.g. int32,
# float32, unsigned integers) are not silently left out.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlation between the numeric columns (DataFrame.corr()
# defaults), printed as a table.
corr_matrix = numeric_df.corr()
print(corr_matrix)

# Annotated heatmap of the correlation matrix, drawn on an explicit
# figure/axes pair so the save and close calls act on exactly this figure.
heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=heat_ax,
)
heat_fig.savefig('heatmap.png')  # written to disk rather than displayed
plt.close(heat_fig)