-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
151 lines (125 loc) · 5.92 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import streamlit as st
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
# Title and description
st.title("PipeDreams - Enhanced Data Explorer")
st.write("Upload a CSV file to analyze, visualize, and extract deeper insights from your data using ETL and machine learning.")

# Bundled sample dataset, read only when the user has not uploaded a file.
sample_data_path = "data/customers-100000.csv"

# File uploader
uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

# Decide where the CSV comes from and which heading to show above the preview.
if uploaded_file is not None:
    csv_source, heading = uploaded_file, "## Uploaded Data"
else:
    st.write("No file uploaded. Displaying sample data.")
    csv_source, heading = sample_data_path, "## Sample Data"

# Read the CSV. A missing sample file or an empty/malformed/mis-encoded CSV is
# reported in the page and the script run is halted, instead of letting the
# exception propagate as a raw traceback.
try:
    data = pd.read_csv(csv_source)
except FileNotFoundError:
    st.error(f"Sample data file not found: {sample_data_path}. Please upload a CSV file.")
    st.stop()
except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
    st.error(f"Could not read the CSV file: {exc}")
    st.stop()

st.write(heading)
st.write(data.head())  # Display the first few rows of the data
# Basic ETL Transformations
st.write("## ETL Transformations")

# Remove rows with missing values. The explicit copy makes clean_data an
# independent frame, so the column assignments below cannot trigger pandas'
# SettingWithCopyWarning.
clean_data = data.dropna().copy()
st.write("### Cleaned Data (no missing values)")
st.write(clean_data.head())

# Convert Subscription Date to datetime and add 'Years Since Subscription' feature
if 'Subscription Date' in clean_data.columns:
    clean_data['Subscription Date'] = pd.to_datetime(clean_data['Subscription Date'], errors='coerce')
    # errors='coerce' turns unparseable dates into NaT *after* the dropna above;
    # drop those rows too so the derived feature contains no NaN.
    clean_data = clean_data.dropna(subset=['Subscription Date']).copy()
    clean_data['Years Since Subscription'] = (pd.to_datetime("today") - clean_data['Subscription Date']).dt.days / 365
    st.write("### Added 'Years Since Subscription' Feature")
    st.write(clean_data[['Subscription Date', 'Years Since Subscription']].head())

# Encode categorical columns as integers to make them usable in ML.
# The fitted encoders are kept in label_encoders, keyed by column name.
label_encoders = {}
excluded_columns = {'Subscription Date', 'Email', 'Website'}  # Exclude specific non-informative columns
for column in clean_data.select_dtypes(include=['object']).columns:
    if column in excluded_columns:
        continue
    encoder = LabelEncoder()
    # Cast to str first: an object column mixing strings with other types
    # cannot be sorted by LabelEncoder and would raise TypeError.
    clean_data[column] = encoder.fit_transform(clean_data[column].astype(str))
    label_encoders[column] = encoder
st.write("### Categorical Columns Encoded")
st.write(clean_data.head())

# Generate synthetic target column for regression analysis. The values are
# drawn from a seeded normal distribution (negative draws clipped to 0),
# independently of every other column.
np.random.seed(42)
clean_data['Annual Purchase Amount'] = np.random.normal(loc=5000, scale=2000, size=len(clean_data)).clip(min=0)
# Data Visualization
st.write("## Data Visualization")

# Map each chart label to a builder taking the chosen (x, y) column names.
# The histogram builder ignores the y column.
chart_builders = {
    "Scatter Plot": lambda x_col, y_col: px.scatter(clean_data, x=x_col, y=y_col),
    "Bar Chart": lambda x_col, y_col: px.bar(clean_data, x=x_col, y=y_col),
    "Line Chart": lambda x_col, y_col: px.line(clean_data, x=x_col, y=y_col),
    "Histogram": lambda x_col, _y_col: px.histogram(clean_data, x=x_col),
    "Box Plot": lambda x_col, y_col: px.box(clean_data, x=x_col, y=y_col),
}

# Select visualization type
viz_type = st.selectbox("Choose a visualization type", list(chart_builders))

# Only float64/int64 columns are offered as plot axes.
columns = clean_data.select_dtypes(include=['float64', 'int64']).columns.tolist()

if not columns:
    st.write("No numerical data available for visualization.")
else:
    x_axis = st.selectbox("Choose X-axis", columns)
    y_axis = st.selectbox("Choose Y-axis", columns)

    # Build and render the chart the user picked.
    fig = chart_builders[viz_type](x_axis, y_axis)
    st.plotly_chart(fig)
# Advanced Insights
st.write("## Advanced Insights with Machine Learning")

# Clustering Analysis
st.write("### Clustering Analysis")
n_clusters = 3
if len(columns) <= 1:
    st.write("Not enough numerical features for clustering.")
elif len(clean_data) < n_clusters:
    # KMeans raises when there are fewer samples than clusters.
    st.write(f"Need at least {n_clusters} rows for clustering.")
else:
    cluster_features = st.multiselect("Choose features for clustering", columns, default=columns[:2])
    if len(cluster_features) >= 2:
        # Standardize features so each contributes on a comparable scale.
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(clean_data[cluster_features])

        # Run KMeans. random_state and n_init are pinned explicitly so the
        # cluster assignment does not depend on global RNG state or on the
        # scikit-learn version's default number of initializations.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        clusters = kmeans.fit_predict(scaled_data)
        clean_data['Cluster'] = clusters

        # Plot clusters on the first two selected features.
        fig = px.scatter(clean_data, x=cluster_features[0], y=cluster_features[1], color='Cluster', title="Clustering Analysis")
        st.plotly_chart(fig)
    else:
        st.write("Please select at least two features for clustering.")
# Predictive Analysis
st.write("### Predictive Analysis (Linear Regression)")

# Pre-select 'Annual Purchase Amount' as the target when it is available;
# otherwise fall back to the first numeric column.
default_target = 'Annual Purchase Amount'
default_index = columns.index(default_target) if default_target in columns else 0

target = st.selectbox("Choose a target variable for prediction", columns, index=default_index)
features = st.multiselect("Choose features for prediction", [col for col in columns if col != target])

if default_target not in columns:
    st.write("No appropriate target variable found. Consider adding synthetic or numerical data.")

if target and len(features) > 0:
    X = clean_data[features]
    y = clean_data[target]

    if len(X) < 2:
        # train_test_split needs at least one row on each side of the split.
        st.write("Not enough rows to split into training and test sets.")
    else:
        # Split the data: 80% train / 20% test, fixed seed for repeatable splits.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Build the model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict and evaluate
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        st.write(f"Mean Squared Error on test set: {mse:.2f}")

        # Display predictions and a visual comparison
        result_df = pd.DataFrame({"Actual": y_test, "Predicted": predictions})
        st.write("### Prediction Results")
        st.write(result_df.head(10))

        # Scatter plot of Actual vs Predicted
        fig = px.scatter(result_df, x="Actual", y="Predicted", title="Actual vs Predicted")
        st.plotly_chart(fig)
# Closing section: a heading, an intro sentence, and two static bullet points,
# each rendered with its own st.write call in this order.
summary_lines = (
    "## Summary Insights",
    "Based on the clustering and predictive analysis, here are some insights that could help in decision-making:",
    "- **Clustering** helps to identify natural groupings in the data.",
    "- **Predictive Analysis** can forecast trends based on selected features, providing actionable insights for future decisions.",
)
for summary_line in summary_lines:
    st.write(summary_line)