From bad9d62394a9d05201665ef43cf20ebd3ebafccb Mon Sep 17 00:00:00 2001 From: Jelmer Bot Date: Mon, 14 Oct 2024 14:07:46 +0200 Subject: [PATCH] update docs --- doc/api_reference.rst | 6 +- doc/basic_usage.rst | 188 ------------------------- doc/index.rst | 192 +++++++++++++++++++++++++- multi_mst/k_mst/api.py | 16 +-- multi_mst/k_mst_descent/api.py | 8 +- multi_mst/k_mst_descent_recall/api.py | 8 +- multi_mst/kdtree.py | 2 +- multi_mst/noisy_mst/api.py | 8 +- 8 files changed, 211 insertions(+), 217 deletions(-) delete mode 100644 doc/basic_usage.rst diff --git a/doc/api_reference.rst b/doc/api_reference.rst index 9fbcd81..5a03ab3 100644 --- a/doc/api_reference.rst +++ b/doc/api_reference.rst @@ -8,21 +8,21 @@ Noisy MST :members: NoisyMST, noisyMST -$k$-MST +k-MST ------- .. automodule:: multi_mst.k_mst :members: KMST, kMST -$k$-MST Descent +k-MST Descent --------------- .. automodule:: multi_mst.k_mst_descent :members: KMSTDescent, kMSTDescent -Recall logging $k$-MST Descent +Recall logging k-MST Descent ------------------------------ .. automodule:: multi_mst.k_mst_descent_recall diff --git a/doc/basic_usage.rst b/doc/basic_usage.rst deleted file mode 100644 index 2ca6de9..0000000 --- a/doc/basic_usage.rst +++ /dev/null @@ -1,188 +0,0 @@ -============================================== -Manifold Modelling with Minimum Spanning Trees -============================================== - -Dimensionality reduction (DR) algorithms typically assume the data they are -given is uniformly sampled from some underlying manifold. When this is not the -case, and there are observation-gaps along the manifold, these algorithms may -fail to detect a single connected entity. This repository presents two manifold -approximation approaches based on minimum spanning trees (MST) for non-uniform -sampled data. 
- ---------------------------------- -Noisy Minimum Spanning Tree Union ---------------------------------- - -The noisy minimum spanning tree union ($n$-MST) is inspired by Pathfinder -networks that, with a specific parameter selection, yield the union set of all -possible MSTs in a network (see, e.g., [`1`_], [`2`_]). We compute noisy MSTs to -detect alternative connectivity at all distance scales for distances which may -have few identically weighted connections. - -We add Gaussian noise ($\mu=0$) to every candidate edge. The noise parameter $n$ -is specified as a fraction of the points' nearest neighbour distance and -controls the Gaussian's standard deviation. This formulation makes the noise -scale with the data's density to avoid adding more edges in dense regions than -sparse regions, retaining a reasonably uniform manifold approximation graph. - -.. code:: python - - import matplotlib.pyplot as plt - import matplotlib.collections as mc - from sklearn.datasets import make_swiss_roll - from multi_mst.noisy_mst import NoisyMST - - X, t = make_swiss_roll(n_samples=2000, noise=0.5, hole=True) - projector = NoisyMST(num_trees=10, noise_fraction=1.0).fit(X) - - # Draw the network - xs = projector.embedding_[:, 0] - ys = projector.embedding_[:, 1] - coo_matrix = projector.graph_.tocoo() - sources = coo_matrix.row - targets = coo_matrix.col - - plt.figure(figsize=(4, 3)) - plt.scatter(xs, ys, c=t, s=1, edgecolors="none", linewidth=0, cmap="viridis") - lc = mc.LineCollection( - list(zip(zip(xs[sources], ys[sources]), zip(xs[targets], ys[targets]))), - linewidth=0.2, - zorder=-1, - alpha=0.5, - color="k", - ) - ax = plt.gca() - ax.add_collection(lc) - ax.set_aspect("equal") - plt.subplots_adjust(0, 0, 1, 1) - plt.axis("off") - plt.show() - -.. 
figure:: _static/noisy_mst.png - - ---------------------------------- -$k$-Nearest Minimum Spanning Tree ---------------------------------- - -The k-nearest Minimum Spanning Tree ($k$-MST) generalises $k$-nearest neighbour -networks ($k$-NN) for minimum spanning trees. It adds the $k$ shortest edges -between components. Since data points start as distinct components, all $k$-NN -edges are included in the kMST. - -To avoid creating shortcuts in the manifold, a distance threshold $\epsilon$ can -be applied. The parameter is specified as a fraction of the shortest edge -between components and provides an upper distance limit for the $2$-to-$k$ -alternative edges. - -.. code:: python - - import matplotlib.pyplot as plt - import matplotlib.collections as mc - from sklearn.datasets import make_swiss_roll - from multi_mst.k_mst import KMST - - X, t = make_swiss_roll(n_samples=2000, noise=0.5, hole=True) - projector = KMST(num_neighbors=3, epsilon=2.0).fit(X) - - # Draw the network - xs = projector.embedding_[:, 0] - ys = projector.embedding_[:, 1] - coo_matrix = projector.graph_.tocoo() - sources = coo_matrix.row - targets = coo_matrix.col - - plt.figure(figsize=(4, 3)) - plt.scatter(xs, ys, c=t, s=1, edgecolors="none", linewidth=0, cmap="viridis") - lc = mc.LineCollection( - list(zip(zip(xs[sources], ys[sources]), zip(xs[targets], ys[targets]))), - linewidth=0.2, - zorder=-1, - alpha=0.5, - color="k", - ) - ax = plt.gca() - ax.add_collection(lc) - ax.set_aspect("equal") - plt.subplots_adjust(0, 0, 1, 1) - plt.axis("off") - plt.show() - -.. figure:: _static/k_mst.png - - -------------------- -Approximate $k$-MST -------------------- - -Computing $k$-MSTs using KDTrees can be expensive on some datasets. We provide a -version of the algorithm based on Nearest Neighbour Descent for quicker -approximations. We combined Boruvka's algorithm with NNDescent to find -neighbours that are not already connected in the MST being build. - - -.. 
code:: python - - import matplotlib.pyplot as plt - import matplotlib.collections as mc - from sklearn.datasets import make_swiss_roll - from multi_mst.k_mst_descent import KMSTDescent - - X, t = make_swiss_roll(n_samples=2000, noise=0.5, hole=True) - projector = KMSTDescent(num_neighbors=3, epsilon=2.0).fit(X) - - # Draw the network - xs = projector.embedding_[:, 0] - ys = projector.embedding_[:, 1] - coo_matrix = projector.graph_.tocoo() - sources = coo_matrix.row - targets = coo_matrix.col - - plt.figure(figsize=(4, 3)) - plt.scatter(xs, ys, c=t, s=1, edgecolors="none", linewidth=0, cmap="viridis") - lc = mc.LineCollection( - list(zip(zip(xs[sources], ys[sources]), zip(xs[targets], ys[targets]))), - linewidth=0.2, - zorder=-1, - alpha=0.5, - color="k", - ) - ax = plt.gca() - ax.add_collection(lc) - ax.set_aspect("equal") - plt.subplots_adjust(0, 0, 1, 1) - plt.axis("off") - plt.show() - -.. figure:: _static/k_mst_descent.png - - -------------------------- -Installation Instructions -------------------------- - -The `multi_mst` package can be installed from pypi: - -.. code:: bash - - pip install multi_mst - ----------------- -Acknowledgements ----------------- - -Most code---including the numba KDTree, disjoint set and boruvka MST -construction implementation---is adapted from `fast_hdbscan`_. The -NNDescent implementation is adapted from `pynndescent`_. - -------- -License -------- - -`multi_mst` uses the same license as `fast_hdbscan`: BSD (2-clause). See the -LICENSE file for details. - -.. _1: https://onlinelibrary.wiley.com/doi/10.1002/asi.20904 -.. _2: https://ieeexplore.ieee.org/document/8231853 -.. _fast_hdbscan: https://github.com/TutteInstitute/fast_hdbscan -.. _pynndescent: https://github.com/lmcinnes/pynndescent \ No newline at end of file diff --git a/doc/index.rst b/doc/index.rst index 22f27a0..b110268 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,8 +1,190 @@ .. multi_mst documentation master file. -.. 
toctree:: - :maxdepth: 1 - :caption: Contents: +============================================== +Manifold Modelling with Minimum Spanning Trees +============================================== - basic_usage - api_reference +Dimensionality reduction (DR) algorithms typically assume the data they are +given is uniformly sampled from some underlying manifold. When this is not the +case, and there are observation-gaps along the manifold, these algorithms may +fail to detect a single connected entity. This repository presents two manifold +approximation approaches based on minimum spanning trees (MST) for non-uniform +sampled data. + +--------------------------------- +Noisy Minimum Spanning Tree Union +--------------------------------- + +The noisy minimum spanning tree union (n-MST) is inspired by Pathfinder +networks that, with a specific parameter selection, yield the union set of all +possible MSTs in a network (see, e.g., [`1`_], [`2`_]). We compute noisy MSTs to +detect alternative connectivity at all distance scales for distances which may +have few identically weighted connections. + +We add Gaussian noise (mean=0) to every candidate edge. The noise parameter n +is specified as a fraction of the points' nearest neighbour distance and +controls the Gaussian's standard deviation. This formulation makes the noise +scale with the data's density to avoid adding more edges in dense regions than +sparse regions, retaining a reasonably uniform manifold approximation graph. + +.. 
code:: python + + import matplotlib.pyplot as plt + import matplotlib.collections as mc + from sklearn.datasets import make_swiss_roll + from multi_mst.noisy_mst import NoisyMST + + X, t = make_swiss_roll(n_samples=2000, noise=0.5, hole=True) + projector = NoisyMST(num_trees=10, noise_fraction=1.0).fit(X) + + # Draw the network + xs = projector.embedding_[:, 0] + ys = projector.embedding_[:, 1] + coo_matrix = projector.graph_.tocoo() + sources = coo_matrix.row + targets = coo_matrix.col + + plt.figure(figsize=(4, 3)) + plt.scatter(xs, ys, c=t, s=1, edgecolors="none", linewidth=0, cmap="viridis") + lc = mc.LineCollection( + list(zip(zip(xs[sources], ys[sources]), zip(xs[targets], ys[targets]))), + linewidth=0.2, + zorder=-1, + alpha=0.5, + color="k", + ) + ax = plt.gca() + ax.add_collection(lc) + ax.set_aspect("equal") + plt.subplots_adjust(0, 0, 1, 1) + plt.axis("off") + plt.show() + +.. figure:: _static/noisy_mst.png + + +--------------------------------- +k-Nearest Minimum Spanning Tree +--------------------------------- + +The k-nearest Minimum Spanning Tree (k-MST) generalises k-nearest neighbour +networks (k-NN) for minimum spanning trees. It adds the k shortest edges +between components. Since data points start as distinct components, all k-NN +edges are included in the kMST. + +To avoid creating shortcuts in the manifold, a distance threshold epsilon can +be applied. The parameter is specified as a fraction of the shortest edge +between components and provides an upper distance limit for the 2-to-k +alternative edges. + +.. 
code:: python + + import matplotlib.pyplot as plt + import matplotlib.collections as mc + from sklearn.datasets import make_swiss_roll + from multi_mst.k_mst import KMST + + X, t = make_swiss_roll(n_samples=2000, noise=0.5, hole=True) + projector = KMST(num_neighbors=3, epsilon=2.0).fit(X) + + # Draw the network + xs = projector.embedding_[:, 0] + ys = projector.embedding_[:, 1] + coo_matrix = projector.graph_.tocoo() + sources = coo_matrix.row + targets = coo_matrix.col + + plt.figure(figsize=(4, 3)) + plt.scatter(xs, ys, c=t, s=1, edgecolors="none", linewidth=0, cmap="viridis") + lc = mc.LineCollection( + list(zip(zip(xs[sources], ys[sources]), zip(xs[targets], ys[targets]))), + linewidth=0.2, + zorder=-1, + alpha=0.5, + color="k", + ) + ax = plt.gca() + ax.add_collection(lc) + ax.set_aspect("equal") + plt.subplots_adjust(0, 0, 1, 1) + plt.axis("off") + plt.show() + +.. figure:: _static/k_mst.png + + +------------------- +Approximate k-MST +------------------- + +Computing k-MSTs using KDTrees can be expensive on some datasets. We provide a +version of the algorithm based on Nearest Neighbour Descent for quicker +approximations. We combined Boruvka's algorithm with NNDescent to find +neighbours that are not already connected in the MST being built. + + +.. 
code:: python + + import matplotlib.pyplot as plt + import matplotlib.collections as mc + from sklearn.datasets import make_swiss_roll + from multi_mst.k_mst_descent import KMSTDescent + + X, t = make_swiss_roll(n_samples=2000, noise=0.5, hole=True) + projector = KMSTDescent(num_neighbors=3, epsilon=2.0).fit(X) + + # Draw the network + xs = projector.embedding_[:, 0] + ys = projector.embedding_[:, 1] + coo_matrix = projector.graph_.tocoo() + sources = coo_matrix.row + targets = coo_matrix.col + + plt.figure(figsize=(4, 3)) + plt.scatter(xs, ys, c=t, s=1, edgecolors="none", linewidth=0, cmap="viridis") + lc = mc.LineCollection( + list(zip(zip(xs[sources], ys[sources]), zip(xs[targets], ys[targets]))), + linewidth=0.2, + zorder=-1, + alpha=0.5, + color="k", + ) + ax = plt.gca() + ax.add_collection(lc) + ax.set_aspect("equal") + plt.subplots_adjust(0, 0, 1, 1) + plt.axis("off") + plt.show() + +.. figure:: _static/k_mst_descent.png + + +------------------------- +Installation Instructions +------------------------- + +The `multi_mst` package can be installed from pypi: + +.. code:: bash + + pip install multi_mst + +---------------- +Acknowledgements +---------------- + +Most code---including the numba KDTree, disjoint set and boruvka MST +construction implementation---is adapted from `fast_hdbscan`_. The +NNDescent implementation is adapted from `pynndescent`_. + +------- +License +------- + +`multi_mst` uses the same license as `fast_hdbscan`: BSD (2-clause). See the +LICENSE file for details. + +.. _1: https://onlinelibrary.wiley.com/doi/10.1002/asi.20904 +.. _2: https://ieeexplore.ieee.org/document/8231853 +.. _fast_hdbscan: https://github.com/TutteInstitute/fast_hdbscan +.. 
_pynndescent: https://github.com/lmcinnes/pynndescent \ No newline at end of file diff --git a/multi_mst/k_mst/api.py b/multi_mst/k_mst/api.py index 5f45b0e..e793a8b 100644 --- a/multi_mst/k_mst/api.py +++ b/multi_mst/k_mst/api.py @@ -32,9 +32,9 @@ def validate_parameters(data, num_neighbors, min_samples, epsilon): def kMST(data, num_neighbors=3, min_samples=1, epsilon=None, umap_kwargs=None): """ - Computes a $k$-MST of a dataset. Adapts the boruvka algorithm to look for - $k$ candidate edges per point, of which the $k$ best per connected component - are retained (up to $epsilon$ times the shortest distance). + Computes a k-MST of a dataset. Adapts the boruvka algorithm to look for + k candidate edges per point, of which the k best per connected component + are retained (up to epsilon times the shortest distance). The algorithm operates on HDBSCAN's mutual reachability Euclidean distance. The resulting graph is embedded with UMAP as if it contains normal k nearest @@ -89,9 +89,9 @@ def kMST(data, num_neighbors=3, min_samples=1, epsilon=None, umap_kwargs=None): class KMST(BaseEstimator): """ - An SKLEARN-style estimator for computing a $k$-MST of a dataset. Adapts the - boruvka algorithm to look for $k$ candidate edges per point, of which the - $k$ best per connected component are retained (up to $epsilon$ times the + An SKLEARN-style estimator for computing a k-MST of a dataset. Adapts the + boruvka algorithm to look for k candidate edges per point, of which the + k best per connected component are retained (up to epsilon times the shortest distance). The algorithm operates on HDBSCAN's mutual reachability Euclidean distance. @@ -141,7 +141,7 @@ def __init__(self, *, num_neighbors=3, min_samples=1, epsilon=None, umap_kwargs= def fit(self, X, y=None, **fit_params): """ - Computes the $\epsilon k$-MST of the given data. + Computes the k-MST of the given data. 
Parameters ---------- @@ -212,7 +212,7 @@ def fit(self, X, y=None, **fit_params): def fit_transform(self, X, y=None, **fit_params): """ - Computes the $\epsilon k$-MST of the given data. + Computes the k-MST of the given data. Parameters ---------- diff --git a/multi_mst/k_mst_descent/api.py b/multi_mst/k_mst_descent/api.py index 37a25d9..ce13d9a 100644 --- a/multi_mst/k_mst_descent/api.py +++ b/multi_mst/k_mst_descent/api.py @@ -51,7 +51,7 @@ def kMSTDescent( ): """ Computes an approximate k-MST using NN-Descent. Adapts the boruvka algorithm - to look for k candidate edges per point, of which the $k$ best per connected + to look for k candidate edges per point, of which the k best per connected component are retained (up to epsilon times the shortest distance). Adapts NN-Descent to find MST edges, i.e., neighbours that are not already connected in the MST build up so far. @@ -171,7 +171,7 @@ class KMSTDescent(BaseEstimator): """ An SKLEARN-style estimator for computing approximate k-MSTs using NN-Descent. Adapts the boruvka algorithm to look for k candidate edges per - point, of which the $k$ best per connected component are retained (up to + point, of which the k best per connected component are retained (up to epsilon times the shortest distance). Adapts NN-Descent to find MST edges, i.e., neighbours that are not already connected in the MST build up so far. @@ -282,7 +282,7 @@ def __init__( def fit(self, X, y=None, **fit_params): """ - Computes the $k$-MST of the given data. + Computes the k-MST of the given data. Parameters ---------- @@ -353,7 +353,7 @@ def fit(self, X, y=None, **fit_params): def fit_transform(self, X, y=None, **fit_params): """ - Computes the $k$-MST of the given data. + Computes the k-MST of the given data. 
Parameters ---------- diff --git a/multi_mst/k_mst_descent_recall/api.py b/multi_mst/k_mst_descent_recall/api.py index ecd8cc1..af3c60d 100644 --- a/multi_mst/k_mst_descent_recall/api.py +++ b/multi_mst/k_mst_descent_recall/api.py @@ -51,7 +51,7 @@ def kMSTDescentLogRecall( ): """ Computes approximate k-MSTs using NN-Descent. Adapts the boruvka algorithm - to look for k candidate edges per point, of which the $k$ best per connected + to look for k candidate edges per point, of which the k best per connected component are retained (up to epsilon times the shortest distance). Adapts NN-Descent to find MST edges, i.e., neighbours that are not already connected in the MST build up so far. @@ -158,7 +158,7 @@ class KMSTDescentLogRecall(BaseEstimator): """ An SKLEARN-style estimator for computing approximate k-MSTs using NN-Descent. Adapts the boruvka algorithm to look for k candidate edges per - point, of which the $k$ best per connected component are retained (up to + point, of which the k best per connected component are retained (up to epsilon times the shortest distance). Adapts NN-Descent to find MST edges, i.e., neighbours that are not already connected in the MST build up so far. @@ -238,7 +238,7 @@ def __init__( def fit(self, X, y=None, **fit_params): """ - Computes the $k$-MST of the given data. + Computes the k-MST of the given data. Parameters ---------- @@ -309,7 +309,7 @@ def fit(self, X, y=None, **fit_params): def fit_transform(self, X, y=None, **fit_params): """ - Computes the $k$-MST of the given data. + Computes the k-MST of the given data. Parameters ---------- diff --git a/multi_mst/kdtree.py b/multi_mst/kdtree.py index 7bde089..970ae05 100644 --- a/multi_mst/kdtree.py +++ b/multi_mst/kdtree.py @@ -169,7 +169,7 @@ def point_to_node_lower_bound_rdist(upper, lower, pt): ) def tree_query_recursion(tree, node, point, heap_p, heap_i, dist_lower_bound): """ - Traverses a KD-tree recursively to find $k$ nearest points. 
Updates heap + Traverses a KD-tree recursively to find k nearest points. Updates heap with neighbors inplace. """ node_info = tree.node_data[node] diff --git a/multi_mst/noisy_mst/api.py b/multi_mst/noisy_mst/api.py index ae76669..daa85fd 100644 --- a/multi_mst/noisy_mst/api.py +++ b/multi_mst/noisy_mst/api.py @@ -32,7 +32,7 @@ def validate_parameters(data, num_trees, noise_fraction, min_samples): def noisyMST(data, num_trees=3, noise_fraction=0.1, min_samples=1, umap_kwargs=None): """ - Computes a union of $k$ noisy MSTs for the given data. Adapts the boruvka + Computes a union of k noisy MSTs for the given data. Adapts the boruvka algorithm construct multiple noisy miminum spanning trees. The algorithm operates on HDBSCAN's mutual reachability Euclidean distance. @@ -88,7 +88,7 @@ def noisyMST(data, num_trees=3, noise_fraction=0.1, min_samples=1, umap_kwargs=N class NoisyMST(BaseEstimator): """ - An SKLEARN-style estimator for computing a union of $k$ noisy MSTs for the + An SKLEARN-style estimator for computing a union of k noisy MSTs for the given data. Adapts the boruvka algorithm construct multiple noisy miminum spanning trees. @@ -141,7 +141,7 @@ def __init__( def fit(self, X, y=None, **fit_params): """ - Computes the $\epsilon k$-MST of the given data. + Computes the union of noisy MSTs of the given data. Parameters ---------- @@ -212,7 +212,7 @@ def fit(self, X, y=None, **fit_params): def fit_transform(self, X, y=None, **fit_params): """ - Computes the $\epsilon k$-MST of the given data. + Computes the union of noisy MSTs of the given data. Parameters ----------