iris with kmeans.py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
# Load the Iris dataset
iris = load_iris()
X = iris.data # Using all four features for clustering
# Compute Sum of Squared Errors (SSE) for different values of K
sse = []
k_values = range(1, 21)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)  # Inertia is the SSE
# Estimate the elbow via the second difference of the SSE curve
sse_diff = np.diff(sse)        # First difference: change in SSE between consecutive K
sse_diff2 = np.diff(sse_diff)  # Second difference: change in slope
# The elbow is where the slope flattens most sharply, i.e. where the second
# difference is largest (SSE is decreasing, so the first differences are negative)
optimal_k = np.argmax(sse_diff2) + 2  # +2 because sse_diff2[i] is centered at k_values[i + 1] = i + 2
# Print the optimal K
print(f"Optimal number of clusters (K) based on the elbow method: {optimal_k}")
# Plot SSE against K
plt.figure(figsize=(8, 6))
plt.plot(k_values, sse, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('Elbow Method for Optimal K')
plt.xticks(range(1, 21))
plt.grid(True)
plt.show()
# The optimal K is typically at the 'elbow' of the curve
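# Optional follow-up (a minimal sketch, not part of the original script):
# refit KMeans with the chosen K and visualize the clusters on the first two
# features (sepal length vs. sepal width). This reuses `optimal_k`, `X`, and
# `iris` from above; the variable names introduced here are illustrative.
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
labels = kmeans_final.fit_predict(X)

plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=30)
plt.scatter(kmeans_final.cluster_centers_[:, 0],
            kmeans_final.cluster_centers_[:, 1],
            c='red', marker='x', s=100, label='Centroids')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title(f'KMeans clustering of Iris (K={optimal_k})')
plt.legend()
plt.grid(True)
plt.show()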