From 31a1ba07ceb0b9fe3cdeebbe3df594b7e0592901 Mon Sep 17 00:00:00 2001
From: Brian Hie
Date: Sun, 3 May 2020 14:15:37 -0400
Subject: [PATCH] remove custom tsne

---
 scanorama/__init__.py     |   2 +-
 scanorama/scanorama.py    |  10 +-
 scanorama/t_sne_approx.py | 886 --------------------------------------
 setup.py                  |   4 +-
 4 files changed, 8 insertions(+), 894 deletions(-)
 delete mode 100644 scanorama/t_sne_approx.py

diff --git a/scanorama/__init__.py b/scanorama/__init__.py
index e4a76be..c17ffa1 100644
--- a/scanorama/__init__.py
+++ b/scanorama/__init__.py
@@ -1,3 +1,3 @@
 from .scanorama import *
 
-__version__ = '1.5.2'
+__version__ = '1.6'
diff --git a/scanorama/scanorama.py b/scanorama/scanorama.py
index 6c86b79..dad5139 100644
--- a/scanorama/scanorama.py
+++ b/scanorama/scanorama.py
@@ -13,7 +13,6 @@
 import sys
 import warnings
 
-from .t_sne_approx import TSNEApprox
 from .utils import plt, dispersion, reduce_dimensionality
 from .utils import visualize_cluster, visualize_expr, visualize_dropout
 from .utils import handle_zeros_in_scale
@@ -421,14 +420,15 @@ def visualize(assembled, labels, namespace, data_names,
               n_iter=N_ITER, perplexity=PERPLEXITY, verbose=VERBOSE,
               learn_rate=200., early_exag=12., embedding=None,
               shuffle_ds=False, size=1, multicore_tsne=True,
-              image_suffix='.svg', viz_cluster=False, colors=None):
+              image_suffix='.svg', viz_cluster=False, colors=None,
+              random_state=None,):
     # Fit t-SNE.
     if embedding is None:
         try:
             from MulticoreTSNE import MulticoreTSNE
             tsne = MulticoreTSNE(
                 n_iter=n_iter, perplexity=perplexity,
-                verbose=verbose, random_state=69,
+                verbose=verbose, random_state=random_state,
                 learning_rate=learn_rate,
                 early_exaggeration=early_exag,
                 n_jobs=40
@@ -437,9 +437,9 @@ def visualize(assembled, labels, namespace, data_names,
             multicore_tsne = False
 
         if not multicore_tsne:
-            tsne = TSNEApprox(
+            tsne = TSNE(
                 n_iter=n_iter, perplexity=perplexity,
-                verbose=verbose, random_state=69,
+                verbose=verbose, random_state=random_state,
                 learning_rate=learn_rate,
                 early_exaggeration=early_exag
             )
diff --git a/scanorama/t_sne_approx.py b/scanorama/t_sne_approx.py
deleted file mode 100644
index 78e21a7..0000000
--- a/scanorama/t_sne_approx.py
+++ /dev/null
@@ -1,886 +0,0 @@
-# Modified by Brian Hie to use an approximate nearest
-# neighbors search.
-# Original source code available at:
-# https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/manifold/t_sne.py
-
-# Author: Alexander Fabisch  --
-# Author: Christopher Moody
-# Author: Nick Travers
-# License: BSD 3 clause (C) 2014
-
-# This is the exact and Barnes-Hut t-SNE implementation. There are other
-# modifications of the algorithm:
-# * Fast Optimization for t-SNE:
-#   http://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf
-
-from time import time
-import numpy as np
-from scipy import linalg
-import scipy.sparse as sp
-from scipy.spatial.distance import pdist
-from scipy.spatial.distance import squareform
-from scipy.sparse import csr_matrix
-from sklearn.neighbors import NearestNeighbors
-from sklearn.base import BaseEstimator
-from sklearn.utils import check_array
-from sklearn.utils import check_random_state
-from sklearn.decomposition import PCA
-from sklearn.metrics.pairwise import pairwise_distances
-from sklearn.manifold import _utils
-from sklearn.manifold import _barnes_hut_tsne
-from sklearn.externals.six import string_types
-from sklearn.utils import deprecated
-
-from annoy import AnnoyIndex
-
-MACHINE_EPSILON = np.finfo(np.double).eps
-
-
-def _joint_probabilities(distances, desired_perplexity, verbose):
-    """Compute joint probabilities p_ij from distances.
-
-    Parameters
-    ----------
-    distances : array, shape (n_samples * (n_samples-1) / 2,)
-        Distances of samples are stored as condensed matrices, i.e.
-        we omit the diagonal and duplicate entries and store everything
-        in a one-dimensional array.
-
-    desired_perplexity : float
-        Desired perplexity of the joint probability distributions.
-
-    verbose : int
-        Verbosity level.
-
-    Returns
-    -------
-    P : array, shape (n_samples * (n_samples-1) / 2,)
-        Condensed joint probability matrix.
-    """
-    # Compute conditional probabilities such that they approximately match
-    # the desired perplexity
-    distances = distances.astype(np.float32, copy=False)
-    conditional_P = _utils._binary_search_perplexity(
-        distances, None, desired_perplexity, verbose)
-    P = conditional_P + conditional_P.T
-    sum_P = np.maximum(np.sum(P), MACHINE_EPSILON)
-    P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON)
-    return P
-
-
-def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose):
-    """Compute joint probabilities p_ij from distances using just nearest
-    neighbors.
-
-    This method is approximately equal to _joint_probabilities. The latter
-    is O(N^2), but limiting the joint probability to nearest neighbors improves
-    this substantially to O(uN).
-
-    Parameters
-    ----------
-    distances : array, shape (n_samples, k)
-        Distances of samples to their k nearest neighbors.
-
-    neighbors : array, shape (n_samples, k)
-        Indices of the k nearest-neighbors for each sample.
-
-    desired_perplexity : float
-        Desired perplexity of the joint probability distributions.
-
-    verbose : int
-        Verbosity level.
-
-    Returns
-    -------
-    P : csr sparse matrix, shape (n_samples, n_samples)
-        Condensed joint probability matrix with only nearest neighbors.
- """ - t0 = time() - # Compute conditional probabilities such that they approximately match - # the desired perplexity - n_samples, k = neighbors.shape - distances = distances.astype(np.float32, copy=False) - neighbors = neighbors.astype(np.int64, copy=False) - conditional_P = _utils._binary_search_perplexity( - distances, neighbors, desired_perplexity, verbose) - assert np.all(np.isfinite(conditional_P)), \ - "All probabilities should be finite" - - # Symmetrize the joint probability distribution using sparse operations - P = csr_matrix((conditional_P.ravel(), neighbors.ravel(), - range(0, n_samples * k + 1, k)), - shape=(n_samples, n_samples)) - P = P + P.T - - # Normalize the joint probability distribution - sum_P = np.maximum(P.sum(), MACHINE_EPSILON) - P /= sum_P - - assert np.all(np.abs(P.data) <= 1.0) - if verbose >= 2: - duration = time() - t0 - print("[t-SNE] Computed conditional probabilities in {:.3f}s" - .format(duration)) - return P - - -def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components, - skip_num_points=0): - """t-SNE objective function: gradient of the KL divergence - of p_ijs and q_ijs and the absolute error. - - Parameters - ---------- - params : array, shape (n_params,) - Unraveled embedding. - - P : array, shape (n_samples * (n_samples-1) / 2,) - Condensed joint probability matrix. - - degrees_of_freedom : float - Degrees of freedom of the Student's-t distribution. - - n_samples : int - Number of samples. - - n_components : int - Dimension of the embedded space. - - skip_num_points : int (optional, default:0) - This does not compute the gradient for points with indices below - `skip_num_points`. This is useful when computing transforms of new - data where you'd like to keep the old data fixed. - - Returns - ------- - kl_divergence : float - Kullback-Leibler divergence of p_ij and q_ij. - - grad : array, shape (n_params,) - Unraveled gradient of the Kullback-Leibler divergence with respect to - the embedding. - """ - X_embedded = params.reshape(n_samples, n_components) - - # Q is a heavy-tailed distribution: Student's t-distribution - dist = pdist(X_embedded, "sqeuclidean") - dist += 1. - dist /= degrees_of_freedom - dist **= (degrees_of_freedom + 1.0) / -2.0 - Q = np.maximum(dist / (2.0 * np.sum(dist)), MACHINE_EPSILON) - - # Optimization trick below: np.dot(x, y) is faster than - # np.sum(x * y) because it calls BLAS - - # Objective: C (Kullback-Leibler divergence of P and Q) - kl_divergence = 2.0 * np.dot(P, np.log(np.maximum(P, MACHINE_EPSILON) / Q)) - - # Gradient: dC/dY - # pdist always returns double precision distances. Thus we need to take - grad = np.ndarray((n_samples, n_components), dtype=params.dtype) - PQd = squareform((P - Q) * dist) - for i in range(skip_num_points, n_samples): - grad[i] = np.dot(np.ravel(PQd[i], order='K'), - X_embedded[i] - X_embedded) - grad = grad.ravel() - c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom - grad *= c - - return kl_divergence, grad - - -def _kl_divergence_bh(params, P, degrees_of_freedom, n_samples, n_components, - angle=0.5, skip_num_points=0, verbose=False): - """t-SNE objective function: KL divergence of p_ijs and q_ijs. - - Uses Barnes-Hut tree methods to calculate the gradient that - runs in O(NlogN) instead of O(N^2) - - Parameters - ---------- - params : array, shape (n_params,) - Unraveled embedding. - - P : csr sparse matrix, shape (n_samples, n_sample) - Sparse approximate joint probability matrix, computed only for the - k nearest-neighbors and symmetrized. 
-
-    degrees_of_freedom : float
-        Degrees of freedom of the Student's-t distribution.
-
-    n_samples : int
-        Number of samples.
-
-    n_components : int
-        Dimension of the embedded space.
-
-    angle : float (default: 0.5)
-        This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
-        'angle' is the angular size (referred to as theta in [3]) of a distant
-        node as measured from a point. If this size is below 'angle' then it is
-        used as a summary node of all points contained within it.
-        This method is not very sensitive to changes in this parameter
-        in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
-        computation time and angle greater 0.8 has quickly increasing error.
-
-    skip_num_points : int (optional, default:0)
-        This does not compute the gradient for points with indices below
-        `skip_num_points`. This is useful when computing transforms of new
-        data where you'd like to keep the old data fixed.
-
-    verbose : int
-        Verbosity level.
-
-    Returns
-    -------
-    kl_divergence : float
-        Kullback-Leibler divergence of p_ij and q_ij.
-
-    grad : array, shape (n_params,)
-        Unraveled gradient of the Kullback-Leibler divergence with respect to
-        the embedding.
-    """
-    params = params.astype(np.float32, copy=False)
-    X_embedded = params.reshape(n_samples, n_components)
-
-    val_P = P.data.astype(np.float32, copy=False)
-    neighbors = P.indices.astype(np.int64, copy=False)
-    indptr = P.indptr.astype(np.int64, copy=False)
-
-    grad = np.zeros(X_embedded.shape, dtype=np.float32)
-    error = _barnes_hut_tsne.gradient(val_P, X_embedded, neighbors, indptr,
-                                      grad, angle, n_components, verbose,
-                                      dof=degrees_of_freedom)
-    c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom
-    grad = grad.ravel()
-    grad *= c
-
-    return error, grad
-
-
-def _gradient_descent(objective, p0, it, n_iter,
-                      n_iter_check=1, n_iter_without_progress=300,
-                      momentum=0.8, learning_rate=200.0, min_gain=0.01,
-                      min_grad_norm=1e-7, verbose=0, args=None, kwargs=None):
-    """Batch gradient descent with momentum and individual gains.
-
-    Parameters
-    ----------
-    objective : function or callable
-        Should return a tuple of cost and gradient for a given parameter
-        vector. When expensive to compute, the cost can optionally
-        be None and can be computed every n_iter_check steps using
-        the objective_error function.
-
-    p0 : array-like, shape (n_params,)
-        Initial parameter vector.
-
-    it : int
-        Current number of iterations (this function will be called more than
-        once during the optimization).
-
-    n_iter : int
-        Maximum number of gradient descent iterations.
-
-    n_iter_check : int
-        Number of iterations before evaluating the global error. If the error
-        is sufficiently low, we abort the optimization.
-
-    n_iter_without_progress : int, optional (default: 300)
-        Maximum number of iterations without progress before we abort the
-        optimization.
-
-    momentum : float, within (0.0, 1.0), optional (default: 0.8)
-        The momentum generates a weight for previous gradients that decays
-        exponentially.
-
-    learning_rate : float, optional (default: 200.0)
-        The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If
-        the learning rate is too high, the data may look like a 'ball' with any
-        point approximately equidistant from its nearest neighbours. If the
-        learning rate is too low, most points may look compressed in a dense
-        cloud with few outliers.
-
-    min_gain : float, optional (default: 0.01)
-        Minimum individual gain for each parameter.
-
-    min_grad_norm : float, optional (default: 1e-7)
-        If the gradient norm is below this threshold, the optimization will
-        be aborted.
-
-    verbose : int, optional (default: 0)
-        Verbosity level.
-
-    args : sequence
-        Arguments to pass to objective function.
-
-    kwargs : dict
-        Keyword arguments to pass to objective function.
-
-    Returns
-    -------
-    p : array, shape (n_params,)
-        Optimum parameters.
-
-    error : float
-        Optimum.
-
-    i : int
-        Last iteration.
-    """
-    if args is None:
-        args = []
-    if kwargs is None:
-        kwargs = {}
-
-    p = p0.copy().ravel()
-    update = np.zeros_like(p)
-    gains = np.ones_like(p)
-    error = np.finfo(np.float).max
-    best_error = np.finfo(np.float).max
-    best_iter = i = it
-
-    tic = time()
-    for i in range(it, n_iter):
-        error, grad = objective(p, *args, **kwargs)
-        grad_norm = linalg.norm(grad)
-
-        inc = update * grad < 0.0
-        dec = np.invert(inc)
-        gains[inc] += 0.2
-        gains[dec] *= 0.8
-        np.clip(gains, min_gain, np.inf, out=gains)
-        grad *= gains
-        update = momentum * update - learning_rate * grad
-        p += update
-
-        if (i + 1) % n_iter_check == 0:
-            toc = time()
-            duration = toc - tic
-            tic = toc
-
-            if verbose >= 2:
-                print("[t-SNE] Iteration %d: error = %.7f,"
-                      " gradient norm = %.7f"
-                      " (%s iterations in %0.3fs)"
-                      % (i + 1, error, grad_norm, n_iter_check, duration))
-
-            if error < best_error:
-                best_error = error
-                best_iter = i
-            elif i - best_iter > n_iter_without_progress:
-                if verbose >= 2:
-                    print("[t-SNE] Iteration %d: did not make any progress "
-                          "during the last %d episodes. Finished."
-                          % (i + 1, n_iter_without_progress))
-                break
-            if grad_norm <= min_grad_norm:
-                if verbose >= 2:
-                    print("[t-SNE] Iteration %d: gradient norm %f. Finished."
-                          % (i + 1, grad_norm))
-                break
-
-    return p, error, i
-
-
-def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
-    """Expresses to what extent the local structure is retained.
-
-    The trustworthiness is within [0, 1]. It is defined as
-
-    .. math::
-
-        T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
-            \sum_{j \in U^{(k)}_i} (r(i, j) - k)
-
-    where :math:`r(i, j)` is the rank of the embedded datapoint j
-    according to the pairwise distances between the embedded datapoints,
-    :math:`U^{(k)}_i` is the set of points that are in the k nearest
-    neighbors in the embedded space but not in the original space.
-
-    * "Neighborhood Preservation in Nonlinear Projection Methods: An
-      Experimental Study"
-      J. Venna, S. Kaski
-    * "Learning a Parametric Embedding by Preserving Local Structure"
-      L.J.P. van der Maaten
-
-    Parameters
-    ----------
-    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
-        If the metric is 'precomputed' X must be a square distance
-        matrix. Otherwise it contains a sample per row.
-
-    X_embedded : array, shape (n_samples, n_components)
-        Embedding of the training data in low-dimensional space.
-
-    n_neighbors : int, optional (default: 5)
-        Number of neighbors k that will be considered.
-
-    precomputed : bool, optional (default: False)
-        Set this flag if X is a precomputed square distance matrix.
-
-    Returns
-    -------
-    trustworthiness : float
-        Trustworthiness of the low-dimensional embedding.
- """ - if precomputed: - dist_X = X - else: - dist_X = pairwise_distances(X, squared=True) - dist_X_embedded = pairwise_distances(X_embedded, squared=True) - ind_X = np.argsort(dist_X, axis=1) - ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1] - - n_samples = X.shape[0] - t = 0.0 - ranks = np.zeros(n_neighbors) - for i in range(n_samples): - for j in range(n_neighbors): - ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0] - ranks -= n_neighbors - t += np.sum(ranks[ranks > 0]) - t = 1.0 - t * (2.0 / (n_samples * n_neighbors * - (2.0 * n_samples - 3.0 * n_neighbors - 1.0))) - return t - - -class TSNEApprox(BaseEstimator): - """t-distributed Stochastic Neighbor Embedding. - - t-SNE [1] is a tool to visualize high-dimensional data. It converts - similarities between data points to joint probabilities and tries - to minimize the Kullback-Leibler divergence between the joint - probabilities of the low-dimensional embedding and the - high-dimensional data. t-SNE has a cost function that is not convex, - i.e. with different initializations we can get different results. - - It is highly recommended to use another dimensionality reduction - method (e.g. PCA for dense data or TruncatedSVD for sparse data) - to reduce the number of dimensions to a reasonable amount (e.g. 50) - if the number of features is very high. This will suppress some - noise and speed up the computation of pairwise distances between - samples. For more tips see Laurens van der Maaten's FAQ [2]. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - n_components : int, optional (default: 2) - Dimension of the embedded space. - - perplexity : float, optional (default: 30) - The perplexity is related to the number of nearest neighbors that - is used in other manifold learning algorithms. Larger datasets - usually require a larger perplexity. Consider selecting a value - between 5 and 50. The choice is not extremely critical since t-SNE - is quite insensitive to this parameter. - - early_exaggeration : float, optional (default: 12.0) - Controls how tight natural clusters in the original space are in - the embedded space and how much space will be between them. For - larger values, the space between natural clusters will be larger - in the embedded space. Again, the choice of this parameter is not - very critical. If the cost function increases during initial - optimization, the early exaggeration factor or the learning rate - might be too high. - - learning_rate : float, optional (default: 200.0) - The learning rate for t-SNE is usually in the range [10.0, 1000.0]. If - the learning rate is too high, the data may look like a 'ball' with any - point approximately equidistant from its nearest neighbours. If the - learning rate is too low, most points may look compressed in a dense - cloud with few outliers. If the cost function gets stuck in a bad local - minimum increasing the learning rate may help. - - n_iter : int, optional (default: 1000) - Maximum number of iterations for the optimization. Should be at - least 250. - - n_iter_without_progress : int, optional (default: 300) - Maximum number of iterations without progress before we abort the - optimization, used after 250 initial iterations with early - exaggeration. Note that progress is only checked every 50 iterations so - this value is rounded to the next multiple of 50. - - .. versionadded:: 0.17 - parameter *n_iter_without_progress* to control stopping criteria. 
-
-    min_grad_norm : float, optional (default: 1e-7)
-        If the gradient norm is below this threshold, the optimization will
-        be stopped.
-
-    metric : string or callable, optional
-        The metric to use when calculating distance between instances in a
-        feature array. If metric is a string, it must be one of the options
-        allowed by scipy.spatial.distance.pdist for its metric parameter, or
-        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
-        If metric is "precomputed", X is assumed to be a distance matrix.
-        Alternatively, if metric is a callable function, it is called on each
-        pair of instances (rows) and the resulting value recorded. The callable
-        should take two arrays from X as input and return a value indicating
-        the distance between them. The default is "euclidean" which is
-        interpreted as squared euclidean distance.
-
-    init : string or numpy array, optional (default: "random")
-        Initialization of embedding. Possible options are 'random', 'pca',
-        and a numpy array of shape (n_samples, n_components).
-        PCA initialization cannot be used with precomputed distances and is
-        usually more globally stable than random initialization.
-
-    verbose : int, optional (default: 0)
-        Verbosity level.
-
-    random_state : int, RandomState instance or None, optional (default: None)
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`. Note that different initializations might result in
-        different local minima of the cost function.
-
-    method : string (default: 'barnes_hut')
-        By default the gradient calculation algorithm uses Barnes-Hut
-        approximation running in O(NlogN) time. method='exact'
-        will run on the slower, but exact, algorithm in O(N^2) time. The
-        exact algorithm should be used when nearest-neighbor errors need
-        to be better than 3%. However, the exact method cannot scale to
-        millions of examples.
-
-        .. versionadded:: 0.17
-           Approximate optimization *method* via the Barnes-Hut.
-
-    angle : float (default: 0.5)
-        Only used if method='barnes_hut'
-        This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
-        'angle' is the angular size (referred to as theta in [3]) of a distant
-        node as measured from a point. If this size is below 'angle' then it is
-        used as a summary node of all points contained within it.
-        This method is not very sensitive to changes in this parameter
-        in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
-        computation time and angle greater 0.8 has quickly increasing error.
-
-    Attributes
-    ----------
-    embedding_ : array-like, shape (n_samples, n_components)
-        Stores the embedding vectors.
-
-    kl_divergence_ : float
-        Kullback-Leibler divergence after optimization.
-
-    n_iter_ : int
-        Number of iterations run.
-
-    Examples
-    --------
-
-    >>> import numpy as np
-    >>> from sklearn.manifold import TSNE
-    >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
-    >>> X_embedded = TSNE(n_components=2).fit_transform(X)
-    >>> X_embedded.shape
-    (4, 2)
-
-    References
-    ----------
-
-    [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data
-        Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008.
-
-    [2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding
-        http://homepage.tudelft.nl/19j49/t-SNE.html
-
-    [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.
-        Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
-        http://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf
-    """
-    # Control the number of exploration iterations with early_exaggeration on
-    _EXPLORATION_N_ITER = 250
-
-    # Control the number of iterations between progress checks
-    _N_ITER_CHECK = 50
-
-    def __init__(self, n_components=2, perplexity=30.0,
-                 early_exaggeration=12.0, learning_rate=200.0, n_iter=1000,
-                 n_iter_without_progress=300, min_grad_norm=1e-7,
-                 metric="euclidean", init="random", verbose=0,
-                 random_state=None, method='barnes_hut', angle=0.5):
-        self.n_components = n_components
-        self.perplexity = perplexity
-        self.early_exaggeration = early_exaggeration
-        self.learning_rate = learning_rate
-        self.n_iter = n_iter
-        self.n_iter_without_progress = n_iter_without_progress
-        self.min_grad_norm = min_grad_norm
-        self.metric = metric
-        self.init = init
-        self.verbose = verbose
-        self.random_state = random_state
-        self.method = method
-        self.angle = angle
-
-    def _fit(self, X, skip_num_points=0):
-        """Fit the model using X as training data.
-
-        Note that sparse arrays can only be handled by method='exact'.
-        It is recommended that you convert your sparse array to dense
-        (e.g. `X.toarray()`) if it fits in memory, or otherwise using a
-        dimensionality reduction technique (e.g. TruncatedSVD).
-
-        Parameters
-        ----------
-        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
-            If the metric is 'precomputed' X must be a square distance
-            matrix. Otherwise it contains a sample per row. Note that
-            when method='barnes_hut', X cannot be a sparse array and if need be
-            will be converted to a 32 bit float array. Method='exact' allows
-            sparse arrays and 64bit floating point inputs.
-
-        skip_num_points : int (optional, default:0)
-            This does not compute the gradient for points with indices below
-            `skip_num_points`. This is useful when computing transforms of new
-            data where you'd like to keep the old data fixed.
-        """
-        if self.method not in ['barnes_hut', 'exact']:
-            raise ValueError("'method' must be 'barnes_hut' or 'exact'")
-        if self.angle < 0.0 or self.angle > 1.0:
-            raise ValueError("'angle' must be between 0.0 - 1.0")
-        if self.metric == "precomputed":
-            if isinstance(self.init, string_types) and self.init == 'pca':
-                raise ValueError("The parameter init=\"pca\" cannot be "
-                                 "used with metric=\"precomputed\".")
-            if X.shape[0] != X.shape[1]:
-                raise ValueError("X should be a square distance matrix")
-            if np.any(X < 0):
-                raise ValueError("All distances should be positive, the "
-                                 "precomputed distances given as X is not "
-                                 "correct")
-        if self.method == 'barnes_hut' and sp.issparse(X):
-            raise TypeError('A sparse matrix was passed, but dense '
-                            'data is required for method="barnes_hut". Use '
-                            'X.toarray() to convert to a dense numpy array if '
-                            'the array is small enough for it to fit in '
-                            'memory. Otherwise consider dimensionality '
-                            'reduction techniques (e.g. TruncatedSVD)')
-        else:
-            X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
-                            dtype=[np.float32, np.float64])
-        if self.method == 'barnes_hut' and self.n_components > 3:
-            raise ValueError("'n_components' should be inferior to 4 for the "
-                             "barnes_hut algorithm as it relies on "
-                             "quad-tree or oct-tree.")
-        random_state = check_random_state(self.random_state)
-
-        if self.early_exaggeration < 1.0:
-            raise ValueError("early_exaggeration must be at least 1, but is {}"
-                             .format(self.early_exaggeration))
-
-        if self.n_iter < 250:
-            raise ValueError("n_iter should be at least 250")
-
-        n_samples = X.shape[0]
-
-        neighbors_nn = None
-        if self.method == "exact":
-            # Retrieve the distance matrix, either using the precomputed one or
-            # computing it.
-            if self.metric == "precomputed":
-                distances = X
-            else:
-                if self.verbose:
-                    print("[t-SNE] Computing pairwise distances...")
-
-                if self.metric == "euclidean":
-                    distances = pairwise_distances(X, metric=self.metric,
-                                                   squared=True)
-                else:
-                    distances = pairwise_distances(X, metric=self.metric)
-
-                if np.any(distances < 0):
-                    raise ValueError("All distances should be positive, the "
-                                     "metric given is not correct")
-
-            # compute the joint probability distribution for the input space
-            P = _joint_probabilities(distances, self.perplexity, self.verbose)
-            assert np.all(np.isfinite(P)), "All probabilities should be finite"
-            assert np.all(P >= 0), "All probabilities should be non-negative"
-            assert np.all(P <= 1), ("All probabilities should be less "
-                                    "than or equal to one")
-
-        else:
-            # Compute the number of nearest neighbors to find.
-            # LvdM uses 3 * perplexity as the number of neighbors.
-            # In the event that we have very small # of points
-            # set the neighbors to n - 1.
-            k = min(n_samples - 1, int(3. * self.perplexity + 1))
-
-            if self.verbose:
-                print("[t-SNE] Computing {} nearest neighbors...".format(k))
-
-            # Find the nearest neighbors for every point
-            neighbors_method = 'ball_tree'
-            if (self.metric == 'precomputed'):
-                neighbors_method = 'brute'
-            knn = AnnoyIndex(X.shape[1], metric='euclidean')
-            t0 = time()
-            for i in range(n_samples):
-                knn.add_item(i, X[i, :])
-            knn.build(50)
-            duration = time() - t0
-            if self.verbose:
-                print("[t-SNE] Indexed {} samples in {:.3f}s...".format(
-                    n_samples, duration))
-
-            t0 = time()
-            neighbors_nn = np.zeros((n_samples, k), dtype=int)
-            distances_nn = np.zeros((n_samples, k))
-            for i in range(n_samples):
-                (neighbors_nn[i, :], distances_nn[i, :]) = knn.get_nns_by_vector(
-                    X[i, :], k, include_distances=True
-                )
-            duration = time() - t0
-            if self.verbose:
-                print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..."
-                      .format(n_samples, duration))
-
-            # Free the memory used by the Annoy index
-            del knn
-
-            if self.metric == "euclidean":
-                # knn returns the euclidean distance but we need it squared
-                # to be consistent with the 'exact' method. Note that the
-                # method was derived using the euclidean method as in the
-                # input space. Not sure of the implication of using a different
-                # metric.
-                distances_nn **= 2
-
-            # compute the joint probability distribution for the input space
-            P = _joint_probabilities_nn(distances_nn, neighbors_nn,
-                                        self.perplexity, self.verbose)
-
-        if isinstance(self.init, np.ndarray):
-            X_embedded = self.init
-        elif self.init == 'pca':
-            pca = PCA(n_components=self.n_components, svd_solver='randomized',
-                      random_state=random_state)
-            X_embedded = pca.fit_transform(X).astype(np.float32, copy=False)
-        elif self.init == 'random':
-            # The embedding is initialized with iid samples from Gaussians with
-            # standard deviation 1e-4.
-            X_embedded = 1e-4 * random_state.randn(
-                n_samples, self.n_components).astype(np.float32)
-        else:
-            raise ValueError("'init' must be 'pca', 'random', or "
-                             "a numpy array")
-
-        # Degrees of freedom of the Student's t-distribution. The suggestion
-        # degrees_of_freedom = n_components - 1 comes from
-        # "Learning a Parametric Embedding by Preserving Local Structure"
-        # Laurens van der Maaten, 2009.
-        degrees_of_freedom = max(self.n_components - 1.0, 1)
-
-        return self._tsne(P, degrees_of_freedom, n_samples, random_state,
-                          X_embedded=X_embedded,
-                          neighbors=neighbors_nn,
-                          skip_num_points=skip_num_points)
-
-    @property
-    @deprecated("Attribute n_iter_final was deprecated in version 0.19 and "
-                "will be removed in 0.21. Use ``n_iter_`` instead")
-    def n_iter_final(self):
-        return self.n_iter_
-
-    def _tsne(self, P, degrees_of_freedom, n_samples, random_state, X_embedded,
-              neighbors=None, skip_num_points=0):
-        """Runs t-SNE."""
-        # t-SNE minimizes the Kullback-Leibler divergence of the Gaussians P
-        # and the Student's t-distributions Q. The optimization algorithm that
-        # we use is batch gradient descent with two stages:
-        # * initial optimization with early exaggeration and momentum at 0.5
-        # * final optimization with momentum at 0.8
-        params = X_embedded.ravel()
-
-        opt_args = {
-            "it": 0,
-            "n_iter_check": self._N_ITER_CHECK,
-            "min_grad_norm": self.min_grad_norm,
-            "learning_rate": self.learning_rate,
-            "verbose": self.verbose,
-            "kwargs": dict(skip_num_points=skip_num_points),
-            "args": [P, degrees_of_freedom, n_samples, self.n_components],
-            "n_iter_without_progress": self._EXPLORATION_N_ITER,
-            "n_iter": self._EXPLORATION_N_ITER,
-            "momentum": 0.5,
-        }
-        if self.method == 'barnes_hut':
-            obj_func = _kl_divergence_bh
-            opt_args['kwargs']['angle'] = self.angle
-            # Repeat verbose argument for _kl_divergence_bh
-            opt_args['kwargs']['verbose'] = self.verbose
-        else:
-            obj_func = _kl_divergence
-
-        # Learning schedule (part 1): do 250 iterations with lower momentum but
-        # higher learning rate controlled via the early exaggeration parameter
-        P *= self.early_exaggeration
-        params, kl_divergence, it = _gradient_descent(obj_func, params,
-                                                      **opt_args)
-        if self.verbose:
-            print("[t-SNE] KL divergence after %d iterations with early "
-                  "exaggeration: %f" % (it + 1, kl_divergence))
-
-        # Learning schedule (part 2): disable early exaggeration and finish
-        # optimization with a higher momentum at 0.8
-        P /= self.early_exaggeration
-        remaining = self.n_iter - self._EXPLORATION_N_ITER
-        if it < self._EXPLORATION_N_ITER or remaining > 0:
-            opt_args['n_iter'] = self.n_iter
-            opt_args['it'] = it + 1
-            opt_args['momentum'] = 0.8
-            opt_args['n_iter_without_progress'] = self.n_iter_without_progress
-            params, kl_divergence, it = _gradient_descent(obj_func, params,
-                                                          **opt_args)
-
-        # Save the final number of iterations
-        self.n_iter_ = it
-
-        if self.verbose:
-            print("[t-SNE] Error after %d iterations: %f"
-                  % (it + 1, kl_divergence))
-
-        X_embedded = params.reshape(n_samples, self.n_components)
-        self.kl_divergence_ = kl_divergence
-
-        return X_embedded
-
-    def fit_transform(self, X, y=None):
-        """Fit X into an embedded space and return that transformed
-        output.
-
-        Parameters
-        ----------
-        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
-            If the metric is 'precomputed' X must be a square distance
-            matrix. Otherwise it contains a sample per row.
-
-        Returns
-        -------
-        X_new : array, shape (n_samples, n_components)
-            Embedding of the training data in low-dimensional space.
-        """
-        embedding = self._fit(X)
-        self.embedding_ = embedding
-        return self.embedding_
-
-    def fit(self, X, y=None):
-        """Fit X into an embedded space.
-
-        Parameters
-        ----------
-        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
-            If the metric is 'precomputed' X must be a square distance
-            matrix. Otherwise it contains a sample per row. If the method
-            is 'exact', X may be a sparse matrix of type 'csr', 'csc'
-            or 'coo'.
-        """
-        self.fit_transform(X)
-        return self
diff --git a/setup.py b/setup.py
index 9b6158d..cf5d9d2 100644
--- a/setup.py
+++ b/setup.py
@@ -2,10 +2,10 @@
 
 setup(
     name='scanorama',
-    version='1.5.2',
+    version='1.6',
     description='Panoramic stitching of heterogeneous single cell transcriptomic data',
     url='https://github.com/brianhie/scanorama',
-    download_url='https://github.com/brianhie/scanorama/archive/v1.5.2.tar.gz',
+    download_url='https://github.com/brianhie/scanorama/archive/v1.6.tar.gz',
    packages=find_packages(exclude=['bin', 'conf', 'data', 'target']),
     install_requires=[
         'annoy>=1.11.5',
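
For reference, a minimal sketch of how the patched visualize() entry point might be called after this change. The random_state keyword added above replaces the previously hardcoded seed of 69, so passing an integer restores a reproducible t-SNE layout. The toy arrays, dataset names, and output namespace below are illustrative assumptions, not values this commit defines.

    import numpy as np
    from scanorama import visualize  # visualize() is defined in scanorama/scanorama.py

    # Stand-in "assembled" embeddings and per-cell integer labels; real
    # callers would pass the output of Scanorama integration here.
    assembled = [np.random.rand(100, 20), np.random.rand(150, 20)]
    labels = np.concatenate([np.zeros(100, dtype=int),
                             np.ones(150, dtype=int)])

    # random_state is the parameter introduced by this commit: an int gives
    # a reproducible layout, None (the new default) leaves seeding to the
    # backend. Per the hunk above, multicore_tsne=False selects the sklearn
    # TSNE path rather than MulticoreTSNE.
    visualize(assembled, labels, 'example_viz', ['batch1', 'batch2'],
              multicore_tsne=False, random_state=42)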