diff --git a/river/cluster/__init__.py b/river/cluster/__init__.py index 0fe354fd2f..085a21befc 100644 --- a/river/cluster/__init__.py +++ b/river/cluster/__init__.py @@ -5,9 +5,10 @@ from .clustream import CluStream from .dbstream import DBSTREAM from .denstream import DenStream +from .hcluster import HierarchicalClustering from .k_means import KMeans from .odac import ODAC from .streamkmeans import STREAMKMeans from .textclust import TextClust -__all__ = ["CluStream", "DBSTREAM", "DenStream", "KMeans", "ODAC", "STREAMKMeans", "TextClust"] +__all__ = ["CluStream", "DBSTREAM", "DenStream", "HierarchicalClustering", "KMeans", "ODAC", "STREAMKMeans", "TextClust"] diff --git a/river/cluster/hcluster.py b/river/cluster/hcluster.py new file mode 100644 index 0000000000..c3511c9a6b --- /dev/null +++ b/river/cluster/hcluster.py @@ -0,0 +1,402 @@ +from __future__ import annotations + +import functools + +from river import base, utils +from river.neighbors.base import DistanceFunc, FunctionWrapper + + +# Node of a binary tree for Hierarchical Clustering +class BinaryTreeNode: + def __init__(self, key: int, data: dict = None): + self.data = data + self.key = key + # Children and parent + self.left = None + self.right = None + self.parent = None + + +class HierarchicalClustering(base.Clusterer): + """Hierarchical Clustering. + + HierarchicalClustering is a stream hierarchical clustering algorithm. This algorithm [^1] inserts new nodes + near the nodes it is similar to without breaking clusters of very similar nodes. + + Beginning with the whole tree `T`, it will compare the new node to this respective tree: + * If `T` is just a leaf: merge + * Else, if the nodes of `T` are more similar between them than with the new node: merge + * Else, if the new node is more similar to the left subtree than to the right subtree: + redo from the first point with `T` equal to left subtree + * Else, if the new node is more similar to the right subtree than to the left subtree: + redo from the first point with `T` right subtree + + A window size can also be chosen to use only the most recent points to make sure that the tree is not overloaded. + + Parameters + ---------- + window_size + number of data points to use + dist_func + A distance function to use to compare the nodes. The Minkowski distance with `p=2` is used as default. + + Attributes + ---------- + n + number of nodes + x_clusters + data points used by the algorithm with the key of the node representing them + + References + ---------- + [^1]: Anand Rajagopalan, Aditya Krishna Menon, Qin Cao, Gui Citovsky, Baris Sumengen and Sanjiv Kumar (2019). Online + Hierarchical Clustering Approximations. arXiV:1909.09667. Available at: https://doi.org/10.48550/arXiv.1909.09667 + + Examples + -------- + + The first example is with leaving the window size to 100. In the second one we put it at 2 to see how it works. + + >>> from river import cluster + >>> from river import stream + + >>> X = [[1, 2, 1], [2, 1, 0], [3, 2, 1], [2, 2, 1], [5, 2, 3]] + + >>> hierarchical_clustering = cluster.HierarchicalClustering() + + >>> for x, _ in stream.iter_array(X): + ... hierarchical_clustering = hierarchical_clustering.learn_one(x) + + >>> hierarchical_clustering.x_clusters + {'[1, 2, 1]': 1, + '[2, 1, 0]': 2, + '[3, 2, 1]': 4, + '[2, 2, 1]': 6, + '[5, 2, 3]': 8} + + >>> hierarchical_clustering.n + 9 + + >>> print(hierarchical_clustering) + -> 8 + -> 9 + -> 6 + -> 7 + -> 4 + -> 5 + -> 2 + -> 3 + -> 1 + Printed Hierarchical Clustering Tree. + + >>> hierarchical_clustering.get_all_clusters() + [(1, ['[1, 2, 1]']), + (2, ['[2, 1, 0]']), + (4, ['[3, 2, 1]']), + (6, ['[2, 2, 1]']), + (8, ['[5, 2, 3]']), + (3, [1, 2]), + (5, [3, 7]), + (7, [4, 6]), + (9, [5, 8])] + + >>> hierarchical_clustering.get_clusters_by_point() + {'[1, 2, 1]': [1, 3, 5, 9], + '[2, 1, 0]': [2, 3, 5, 9], + '[3, 2, 1]': [4, 7, 5, 9], + '[2, 2, 1]': [6, 7, 5, 9], + '[5, 2, 3]': [8, 9]} + + >>> hierarchical_clustering.predict_one({0: 4, 1: 3, 2: 1}) + ([10, 11, 9], 8) + + >>> hierarchical_clustering = hierarchical_clustering.learn_one({0: 4, 1: 3, 2: 1}) + + >>> print(hierarchical_clustering) + -> 10 + -> 11 + -> 8 + -> 9 + -> 6 + -> 7 + -> 4 + -> 5 + -> 2 + -> 3 + -> 1 + Printed Hierarchical Clustering Tree. + + >>> hierarchical_clustering = cluster.HierarchicalClustering(window_size=2) + + >>> for x, _ in stream.iter_array(X): + ... hierarchical_clustering = hierarchical_clustering.learn_one(x) + + >>> hierarchical_clustering.x_clusters + {'[2, 2, 1]': 2, '[5, 2, 3]': 1} + + >>> hierarchical_clustering.n + 3 + + >>> print(hierarchical_clustering) + -> 2 + -> 3 + -> 1 + Printed Hierarchical Clustering Tree. + """ + + def __init__( + self, + window_size: int = 100, + dist_func: DistanceFunc | FunctionWrapper | None = None, + ): + # Number of nodes + self.n = 0 + # Max number of leaves + self.window_size = window_size + # Dict : x data (str(array of size m)) -> key of the node + self.x_clusters: dict[str, int] = {} + # Dict : key -> node + self.nodes: dict[int, BinaryTreeNode] = {} + # First node of the tree + self.root = None + # Distance function + if dist_func is None: + dist_func = functools.partial(utils.math.minkowski_distance, p=2) + self.dist_func = dist_func + + def otd_clustering(self, tree, x): + # Online top down clustering (OTD), the first algorithm for online hierarchical clustering. + # The algorithm performs highly efficient online updates and provably approximates Moseley-Wang revenue. + x_string = str(list(x.values())) + if self.n == 1: + # First node in the tree + self.root = self.nodes[1] + elif tree.data is not None: + # If T is a leaf, we merge the two nodes together + self.merge_nodes(tree, self.nodes[self.x_clusters[x_string]]) + elif tree.left is None: + # If there is no node at the left of the intermediate node, we add it there + tree.left = self.nodes[self.x_clusters[x_string]] + self.nodes[self.x_clusters[x_string]].parent = tree + elif tree.right is None: + # If there is no node at the right of the intermediate node, we add it there + tree.right = self.nodes[self.x_clusters[x_string]] + self.nodes[self.x_clusters[x_string]].parent = tree + elif self.intra_subtree_similarity(tree) < self.inter_subtree_similarity( + tree, self.nodes[self.x_clusters[x_string]] + ): + # If the nodes in T are closer between them than with the new node, we merge T and the new node + self.merge_nodes(tree, self.nodes[self.x_clusters[x_string]]) + elif self.inter_subtree_similarity( + tree.left, self.nodes[self.x_clusters[x_string]] + ) > self.inter_subtree_similarity(tree.right, self.nodes[self.x_clusters[x_string]]): + # Continue to search where to merge the new node in the right part of T + self.otd_clustering(tree.right, x) + else: + # Continue to search where to merge the new node in the left part of T + self.otd_clustering(tree.left, x) + + def merge_nodes(self, tree, added_node): + # Merge a new node (added node) to the tree + # We create the node that will be the parent of the tree and the added node + self.n += 1 + new_node = BinaryTreeNode(self.n) + # We add the tree and the added node as its children + new_node.left = tree + new_node.right = added_node + # The parent of the new node is the parent of the tree + new_node.parent = tree.parent + # If the tree is not the root, we set the child of its parent as new node (instead of T) + if tree.parent is not None: + if tree.parent.left.key == tree.key: + tree.parent.left = new_node + else: + tree.parent.right = new_node + # We add the new node as the parent of the tree and the added node + tree.parent = new_node + added_node.parent = new_node + # We add the new node to the dict + self.nodes[self.n] = new_node + # If the tree was the root, the new node become the root + if self.root.key == tree.key: + self.root = self.nodes[self.n] + + def learn_one(self, x): + # We create the node for x and add it to the tree + if len(self.x_clusters.keys()) >= self.window_size: + # Delete the oldest data point and add a node with the same key as the one deleted + oldest_key = self.x_clusters[list(self.x_clusters.keys())[0]] + oldest = self.nodes[oldest_key] + if oldest.parent.left.key == oldest_key: + oldest.parent.left = None + else: + oldest.parent.right = None + del self.nodes[oldest_key] + del self.x_clusters[list(self.x_clusters.keys())[0]] + self.x_clusters[str(list(x.values()))] = oldest_key + self.nodes[oldest_key] = BinaryTreeNode(oldest_key, x) + else: + # Else, add a node + self.n += 1 + self.x_clusters[str(list(x.values()))] = self.n + self.nodes[self.n] = BinaryTreeNode(self.n, x) + # We add it to the tree + self.otd_clustering(self.root, x) + return self + + def predict_otd(self, x, node, clusters): + # get the list of predicted clusters for x + if node is None: + # If there is still no node in the tree + return [1], -1 + if node.data is not None: + # Add itself (n+1) and the key of the node that would merge x and node (n+2) + clusters.extend([self.n + 2, self.n + 1]) + return clusters, node.key + if self.intra_subtree_similarity(node) < self.inter_subtree_similarity( + node, BinaryTreeNode(self.n + 1, x) + ): + # Add itself (n+1) and the key of the node that would merge x and node (n+2) + clusters.extend([self.n + 2, self.n + 1]) + return clusters, node.key + else: + # Else, x would be added in the tree, we add the key of node + clusters.extend([node.key]) + if self.inter_subtree_similarity( + node.left, BinaryTreeNode(self.n + 1, x) + ) > self.inter_subtree_similarity(node.right, BinaryTreeNode(self.n + 1, x)): + # If the right part of the tree is closer to x than the left part, we continue in the right part + return self.predict_otd(x, node.right, clusters) + else: + # If the left part of the tree is closer to x than the right part, we continue in the left part + return self.predict_otd(x, node.left, clusters) + + def predict_one(self, x): + """Predicts the clusters for a set of features `x`. + + Parameters + ---------- + x + A dictionary of features. + Returns + ------- + (list, int) + A list of clusters (from node `x` to root) and the node to which it would have been merged. + + """ + # We predict to which cluster x would be if we added in the tree + r, merged = self.predict_otd(x, self.root, []) + r.reverse() + return r, merged + + @staticmethod + def find_path(root, path, k): + # find the path from root to k + # Adapted from https://www.geeksforgeeks.org/lowest-common-ancestor-binary-tree-set-1/ + + if root is None: + return False + + path.append(root) + + if root.key == k: + return True + + if (root.left is not None and HierarchicalClustering.find_path(root.left, path, k)) or ( + root.right is not None and HierarchicalClustering.find_path(root.right, path, k) + ): + return True + + path.pop() + return False + + def leaves(self, v): + # find all the leaves from node v + + if v is None: + return -1 + if v.data is not None: + return [v] + + leave_list = [] + leave_list.extend(self.leaves(v.left)) + leave_list.extend(self.leaves(v.right)) + return leave_list + + def inter_subtree_similarity(self, tree_a, tree_b): + # compute the mean distance (mean of distances) between two trees + leaves_a = self.leaves(tree_a) + leaves_b = self.leaves(tree_b) + r = 0 + nb = 0 + for i, w_i in enumerate(leaves_a): + for j, w_j in enumerate(leaves_b): + nb += 1 + r += self.dist_func(w_i.data, w_j.data) + return r / nb + + def intra_subtree_similarity(self, tree): + # compute mean of distances between the nodes from a certain tree + leaves = self.leaves(tree) + r = 0 + nb = 0 + if len(leaves) == 1: + return 0 + for i, w_i in enumerate(leaves): + for j, w_j in enumerate(leaves): + if i < j: + nb += 1 + r += self.dist_func(w_i.data, w_j.data) + return r / nb + + def __str__(self): + self.print_tree(self.root) + return "Printed Hierarchical Clustering Tree." + + @staticmethod + def print_tree(node, level=0): + # Print node and its children + # Adapted from https://stackoverflow.com/questions/34012886/print-binary-tree-level-by-level-in-python + if node is not None: + HierarchicalClustering.print_tree(node.right, level + 1) + print(" " * 4 * level + "-> " + str(node.key)) + HierarchicalClustering.print_tree(node.left, level + 1) + + def get_parents(self, node): + # Get all the parents of the node (the clusters it belongs to) + clusters = [node.key] + if node.parent is None: + return clusters + clusters.extend(self.get_parents(node.parent)) + return clusters + + def get_clusters_by_point(self): + """Returns the list of clusters (from the data point node to the root) for all data points. + + Returns + ------- + {x : list} + A dict of all the data points with their clusters. + """ + # Get all the clusters each data point belong to + clusters = {} + for x in self.x_clusters.keys(): + clusters[x] = self.get_parents(self.nodes[self.x_clusters[x]]) + return clusters + + def get_all_clusters(self): + """Returns all the clusters of the tree. + + Returns + ------- + {int : list} + A dict of all the clusters with their children (or the data point for the leaves). + """ + # Get the data of each cluster + clusters = {} + for i in range(1, self.n + 1): + if self.nodes[i].data is not None: + clusters[i] = [str(list(self.nodes[i].data.values()))] + else: + clusters[i] = [self.nodes[i].left.key, self.nodes[i].right.key] + return sorted(clusters.items(), key=lambda x: len(x[1]))