forked from TeddovanMierle/ML-Methods
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunsupervised_clustering.py
139 lines (125 loc) · 5.71 KB
/
unsupervised_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from array import array
import numpy as np
from sklearn.cluster import MeanShift
def mean_shift(centers, predict_data=None):
    """Fit mean shift clustering and optionally predict labels for new data.

    Parameters
    ----------
    centers : 2D array like
        Data to perform the clustering on.
    predict_data : 2D array like, optional
        Data to be assigned to the fitted clusters, by default None.
        When None, no prediction is attempted.

    Returns
    -------
    cluster_centers, labels, num_features, predicted
        cluster_centers: centers found by the clustering
        labels: label of each point in ``centers``
        num_features: number of features seen during fit
        predicted: cluster labels predicted for ``predict_data``, or None
        when ``predict_data`` is None

    Raises
    ------
    Exception
        When ``predict_data`` is given but cannot be predicted by the fitted
        model (e.g. a non 2D array is passed in as predict data).
    """
    ms = MeanShift()
    clustering = ms.fit(centers)
    cluster_centers = clustering.cluster_centers_
    labels = clustering.labels_
    num_features = clustering.n_features_in_

    # Only an explicit None means "do not predict". The previous check
    # (`type(...) == type(array) or type(np.array)`) was always truthy, so the
    # default call without predict_data always ended in the exception below.
    if predict_data is None:
        return cluster_centers, labels, num_features, None

    try:
        predicted = clustering.predict(predict_data)
    except Exception as err:
        # Keep the exception type and message callers already rely on, but
        # chain the underlying error so the real cause stays visible.
        raise Exception('Use 2D array for predict_data') from err
    return cluster_centers, labels, num_features, predicted
from sklearn.cluster import DBSCAN
def perform_DBSCAN(data, eps, min_samples):
    """Run DBSCAN on a data set and return the fitted model's attributes.

    Parameters
    ----------
    data : 2D array-like
        Data of interest to cluster with DBSCAN.
    eps : float
        The maximum distance between two samples for one to be considered as
        in the neighborhood of the other. This is not a maximum bound on the
        distances of points within a cluster. It is the most important DBSCAN
        parameter to choose appropriately for the data set and distance
        function.
    min_samples : int
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    Returns
    -------
    labels, num_features, core_sample_indices, components
        labels: cluster label of each point given to fit(); noisy samples
        are given the label -1.
        num_features: number of features seen during fit.
        core_sample_indices: indices of core samples.
        components: copy of each core sample found by training.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    model.fit(data)
    return (
        model.labels_,
        model.n_features_in_,
        model.core_sample_indices_,
        model.components_,
    )
from sklearn.cluster import AgglomerativeClustering
def hierarchical_clustering(data, n_clusters=2, linkage='ward', distance_threshold=None):
    """Fit agglomerative (hierarchical) clustering to an array of data.

    Parameters
    ----------
    data : 2D array
        Data to be fitted.
    n_clusters : int, default=2
        Number of clusters to find.
    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
        Linkage criterion, i.e. which distance to use between sets of
        observations. The algorithm merges the pairs of clusters that
        minimize this criterion.
        'ward' minimizes the variance of the clusters being merged.
        'average' uses the average of the distances of each observation of
        the two sets.
        'complete' or 'maximum' linkage uses the maximum distances between
        all observations of the two sets.
        'single' uses the minimum of the distances between all observations
        of the two sets.
    distance_threshold : float, default=None
        The linkage distance threshold above which clusters will not be
        merged. If not None, n_clusters must be None and compute_full_tree
        must be True.

    Returns
    -------
    num_clusters : int
        The number of clusters found by the algorithm.
    labels : ndarray of shape (n_samples)
        Cluster labels for each point.
    num_leaves : int
        Number of leaves in the hierarchical tree.
    num_connected_components : int
        The estimated number of connected components in the graph.
    num_features : int
        Number of features seen during fit.
    """
    fitted = AgglomerativeClustering(
        linkage=linkage,
        n_clusters=n_clusters,
        distance_threshold=distance_threshold,
    ).fit(data)
    return (
        fitted.n_clusters_,
        fitted.labels_,
        fitted.n_leaves_,
        fitted.n_connected_components_,
        fitted.n_features_in_,
    )
from sklearn.mixture import GaussianMixture
def gaussian_mixture_model(data, num_components, num_random_state=0, predict_data=None):
    """Fit a Gaussian mixture model to data and optionally predict labels.

    Parameters
    ----------
    data : 2D array
        Array of data to be fitted with the Gaussian Mixture Model.
    num_components : int
        Number of underlying Gaussian distributions.
    num_random_state : int
        Random seed for initialization, by default 0.
    predict_data : 2D array, optional
        Array of data to be predicted from the fitted model, by default None.
        When None, no prediction is attempted.

    Returns
    -------
    predicted
        Component labels predicted by the fitted model for ``predict_data``,
        or None when ``predict_data`` is None.
    """
    gmm = GaussianMixture(n_components=num_components, random_state=num_random_state).fit(data)

    # Only an explicit None means "do not predict". The previous check
    # (`type(...) == type(array) or type(np.array)`) was always truthy, so the
    # default call ran predict(None) and raised instead of returning None.
    if predict_data is None:
        return None
    return gmm.predict(predict_data)