-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans_seq.py
85 lines (69 loc) · 2.48 KB
/
kmeans_seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# ============================================================================
# Name : kmeans_seq.py
# Author : Federico Nocentini & Corso Vignoli
# Version :
# Copyright :
# Description : Python implementation of K-means clustering algorithm
# ============================================================================
import csv
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import time
# Euclidean distance between two points of the plane
def euclidean_distance_2D(x1, x2, y1, y2):
    """Return the Euclidean distance between (x1, x2) and (y1, y2).

    Mind the argument order: the first two arguments are the two coordinates
    of one point, the last two are the two coordinates of the other point.
    """
    delta_first = x1 - y1
    delta_second = x2 - y2
    squared_norm = (delta_first ** 2) + (delta_second ** 2)
    return math.sqrt(squared_norm)
# Function for centroid assignment
def centroid_assg(points, centroids, points_assg):
for i in range(n):
distance = 100000
for j in range(k):
dist = euclidean_distance_2D(points[i, 0], points[i, 1], centroids[j, 0], centroids[j, 1])
if dist < distance:
distance = dist
points_assg[i] = j
# Function for centroid update
def centroid_update(points, points_assgn, num_clusters=None):
    """Recompute each centroid as the mean of the points assigned to it.

    Args:
        points: array indexed as ``points[i, 0]`` / ``points[i, 1]`` holding
            the two coordinates of point ``i``.
        points_assgn: sequence with one cluster index per point. Each value
            is converted with ``int()``, so a float array is accepted.
        num_clusters: number of clusters. When omitted, the module-level
            ``k`` is used, which is what a two-argument call relies on.

    Returns:
        A ``(num_clusters, 2)`` float array. Row ``c`` is the mean of the
        points labelled ``c``; a cluster with no points gets a NaN row.
    """
    if num_clusters is None:
        num_clusters = k
    centroids_sum = np.zeros((num_clusters, 2))
    cluster_size = np.zeros(num_clusters)
    for i in range(len(points)):
        clust_id = int(points_assgn[i])
        cluster_size[clust_id] += 1
        centroids_sum[clust_id, 0] += points[i, 0]
        centroids_sum[clust_id, 1] += points[i, 1]
    # Divide only the non-empty clusters: dividing an empty one would be 0/0,
    # which yields NaN together with a RuntimeWarning. The NaN is set
    # explicitly instead so the outcome is the same but intentional.
    nonempty = cluster_size > 0
    new_centroids = np.zeros((num_clusters, 2))
    # The counts are reshaped to a column so they broadcast over both coordinates.
    new_centroids[nonempty] = centroids_sum[nonempty] / cluster_size[nonempty][:, None]
    new_centroids[~nonempty] = float("nan")
    return new_centroids
num_iter = 20  # number of k-means iterations to run
k = 5  # number of clusters
# Read the data: each CSV row holds the x and y coordinate of one point
x_coords = []
y_coords = []
# newline='' lets the csv module do its own line-ending handling
with open('datasets/2D_data_3.csv', 'r', newline='') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = csv.reader(read_obj)
    # Iterate over each row in the csv using reader object
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line (e.g. a
            # trailing newline); indexing it would raise IndexError.
            continue
        x_coords.append(float(row[0]))
        y_coords.append(float(row[1]))
# Stack the two coordinate lists and transpose: one row per point, columns (x, y)
points = np.vstack((x_coords, y_coords))
points = points.T
# Number of points, taken from the data actually loaded
n = len(points)
# K-means initialisation
# One label slot per point; every point starts in cluster 0
points_assg = np.zeros((n))
# Seed the centroids with k distinct data points. random.randint(0, n) also
# returns n itself, which indexes one past the last point, and it can return
# the same index twice, which gives two identical centroids and therefore an
# empty cluster. Sampling from range(n) stays in bounds and never repeats an
# index. random.sample raises ValueError when there are fewer than k points.
seed_indices = random.sample(range(n), k)
centroids = np.zeros((k, 2))
for i, seed in enumerate(seed_indices):
    centroids[i, 0] = points[seed, 0]
    centroids[i, 1] = points[seed, 1]
# Run the k-means iterations, timing each one and the clustering as a whole.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
for i in range(num_iter):
    start_time_iter = time.perf_counter()
    # Assignment step: label every point with its nearest centroid (in place)
    centroid_assg(points, centroids, points_assg)
    # Update step: replace the centroids with the ones computed from the labels
    centroids = centroid_update(points, points_assg)
    print("Seconds for iter " + str(i) + ': ' + str((time.perf_counter() - start_time_iter)))
print("Seconds for clustering: " + str((time.perf_counter() - start_time)))
# Scatter plot of the points, coloured by their final cluster label
plt.scatter(points[:, 0], points[:, 1], c=points_assg)
plt.show()