Merge pull request #24 from Intron7/testing

Testing & Improvements
scverse · May 17, 2023 · 2596bcc · 2596bcc
2 parents 853a619 + a3ca74f
commit 2596bcc
Show file tree

Hide file tree

Showing 31 changed files with 5,745 additions and 164 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,7 @@ __pycache__/
 .Python
 build/
 develop-eggs/
+assets/
 dist/
 downloads/
 eggs/

diff --git a/data/pbmc3k_processed.h5ad b/data/pbmc3k_processed.h5ad
diff --git a/data/pbmc3k_raw.h5ad b/data/pbmc3k_raw.h5ad
diff --git a/rapids_singlecell/__init__.py b/rapids_singlecell/__init__.py
@@ -5,4 +5,4 @@
 from . import pl
 from . import gr
 
-__version__ = "0.6.2"
+__version__ = "0.6.3"
diff --git a/rapids_singlecell/cunnData/__init__.py b/rapids_singlecell/cunnData/__init__.py
@@ -31,6 +31,15 @@ def update_shape(self, shape):
 
     def __setitem__(self, key, item):
         if self.shape == item.shape:
+            if issparse_gpu(item):
+                item = item
+            elif isinstance(item, cp.ndarray):
+                item = item
+            elif not issparse_cpu(item):
+                inter = sparse.csr_matrix(item)
+                item = cp.sparse.csr_matrix(inter, dtype=cp.float32)
+            else:
+                item = cp.sparse.csr_matrix(item, dtype=cp.float32)
             if issparse_gpu(item):
                 if item.nnz > 2**31 - 1:
                     raise ValueError(
@@ -103,6 +112,7 @@ def __init__(
         obsm: Optional[Mapping[str, Any]] = None,
         varm: Optional[Mapping[str, Any]] = None,
     ):
+        # Initialize from adata
         if adata:
             if not issparse_cpu(adata.X):
                 inter = sparse.csr_matrix(adata.X)
@@ -119,19 +129,15 @@ def __init__(
             self.raw = None
             if adata.layers:
                 for key, matrix in adata.layers.items():
-                    if not issparse_cpu(matrix):
-                        inter = sparse.csr_matrix(matrix)
-                        inter = cp.sparse.csr_matrix(inter, dtype=cp.float32)
-                    else:
-                        inter = cp.sparse.csr_matrix(matrix, dtype=cp.float32)
-                    self._layers[key] = inter
+                    self._layers[key] = matrix
             if adata.obsm:
                 for key, matrix in adata.obsm.items():
                     self._obsm[key] = matrix.copy()
             if adata.varm:
                 for key, matrix in adata.varm.items():
                     self._varm[key] = matrix.copy()
 
+        # Initialize from items
         else:
             if issparse_gpu(X):
                 self._X = X
@@ -143,11 +149,11 @@ def __init__(
                 del inter
             else:
                 self._X = cp.sparse.csr_matrix(X, dtype=cp.float32)
-            if obs:
+            if obs is not None:
                 self._obs = obs
             else:
                 self._obs = pd.DataFrame(index=range(self.shape[0]))
-            if var:
+            if var is not None:
                 self._var = var
             else:
                 self._var = pd.DataFrame(index=range(self.shape[1]))
@@ -162,16 +168,7 @@ def __init__(
 
             if layers:
                 for key, matrix in layers.items():
-                    if issparse_gpu(matrix):
-                        inter = matrix
-                    elif isinstance(matrix, cp.ndarray):
-                        inter = matrix
-                    elif not issparse_cpu(X):
-                        inter = sparse.csr_matrix(matrix)
-                        inter = cp.sparse.csr_matrix(inter, dtype=cp.float32)
-                    else:
-                        inter = cp.sparse.csr_matrix(matrix, dtype=cp.float32)
-                    self.layers[key] = inter
+                    self.layers[key] = matrix
             if obsm:
                 for key, matrix in obsm.items():
                     self.obsm[key] = matrix.copy()

diff --git a/rapids_singlecell/cunnData_funcs/_hvg.py b/rapids_singlecell/cunnData_funcs/_hvg.py
@@ -164,10 +164,9 @@ def highly_variable_genes(
             batches = cudata.obs[batch_key].cat.categories
             df = []
             genes = cudata.var.index.to_numpy()
+            X = cudata.layers[layer] if layer is not None else cudata.X
             for batch in batches:
-                inter_matrix = cudata.X[
-                    np.where(cudata.obs[batch_key] == batch)[0],
-                ].tocsc()
+                inter_matrix = X[np.where(cudata.obs[batch_key] == batch)[0],].tocsc()
                 thr_org = cp.diff(inter_matrix.indptr).ravel()
                 thr = cp.where(thr_org >= 1)[0]
                 thr_2 = cp.where(thr_org < 1)[0]
@@ -336,6 +335,12 @@ def _highly_variable_genes_single_batch(
         ].sort()  # interestingly, np.argpartition is slightly slower
         if n_top_genes > X.shape[1]:
             n_top_genes = X.shape[1]
+        if n_top_genes > dispersion_norm.size:
+            warnings.warn(
+                "`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.",
+                UserWarning,
+            )
+            n_top_genes = dispersion_norm.size
         disp_cut_off = dispersion_norm[n_top_genes - 1]
         gene_subset = np.nan_to_num(df["dispersions_norm"].values) >= disp_cut_off
     else:
@@ -448,6 +453,7 @@ def _highly_variable_genes_seurat_v3(
     ranked_norm_gene_vars = ranked_norm_gene_vars.get()
     ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)
     median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)
+
     df["highly_variable_nbatches"] = num_batches_high_var.get()
     df["highly_variable_rank"] = median_ranked
     df["variances_norm"] = cp.mean(norm_gene_vars, axis=0).get()
@@ -598,6 +604,7 @@ def _highly_variable_pearson_residuals(
         if clip < 0:
             raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
 
+        clip = cp.array([clip], dtype=cp.float32)
         theta = cp.array([theta], dtype=cp.float32)
         sums_genes = cp.zeros(X_batch.shape[1], dtype=cp.float32)
         sums_cells = cp.zeros(X_batch.shape[0], dtype=cp.float32)

diff --git a/rapids_singlecell/cunnData_funcs/_normalize.py b/rapids_singlecell/cunnData_funcs/_normalize.py
@@ -323,6 +323,8 @@ def normalize_pearson_residuals(
             "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
             UserWarning,
         )
+    computed_on = layer if layer else "cudata.X"
+    settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on)
     if theta <= 0:
         raise ValueError("Pearson residuals require theta > 0")
     if clip is None:
@@ -331,6 +333,7 @@ def normalize_pearson_residuals(
     if clip < 0:
         raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
     theta = cp.array([1 / theta], dtype=cp.float32)
+    clip = cp.array([clip], dtype=cp.float32)
     sums_cells = cp.zeros(X.shape[0], dtype=cp.float32)
     sums_genes = cp.zeros(X.shape[1], dtype=cp.float32)
 
@@ -420,7 +423,9 @@ def normalize_pearson_residuals(
                 residuals.shape[1],
             ),
         )
+
     if inplace == True:
+        cudata.uns["pearson_residuals_normalization"] = settings_dict
         if layer:
             cudata.layers[layer] = residuals
         else:

diff --git a/rapids_singlecell/cunnData_funcs/_pca.py b/rapids_singlecell/cunnData_funcs/_pca.py
@@ -1,18 +1,20 @@
 from ..cunnData import cunnData
+from anndata import AnnData
 
 from cuml.decomposition import PCA, TruncatedSVD
-from typing import Optional
+from typing import Optional, Union
 
 from cupy.sparse import issparse
 import math
 import numpy as np
 
 
 def pca(
-    cudata: cunnData,
+    cudata: Union[cunnData, AnnData],
     layer: str = None,
-    n_comps: int = 50,
+    n_comps: Optional[int] = None,
     zero_center: bool = True,
+    random_state: Union[int, None] = 0,
     use_highly_variable: Optional[bool] = None,
     chunked: bool = False,
     chunk_size: int = None,
@@ -23,18 +25,22 @@ def pca(
     Parameters
     ----------
         cudata :
-            cunnData object
+            cunnData, AnnData object
 
         layer
             If provided, use `cudata.layers[layer]` for expression values instead of `cudata.X`.
 
         n_comps
-            Number of principal components to compute. Defaults to 50
+            Number of principal components to compute. Defaults to 50, or 1 - minimum
+            dimension size of selected representation
 
         zero_center
             If `True`, compute standard PCA from covariance matrix.
             If `False`, omit zero-centering variables
 
+        random_state
+            Change to use different initial states for the optimization.
+
         use_highly_variable
             Whether to use highly variable genes only, stored in
             `.var['highly_variable']`.
@@ -78,6 +84,13 @@ def pca(
     if use_highly_variable:
         X = X[:, cudata.var["highly_variable"]]
 
+    if n_comps is None:
+        min_dim = min(X.shape[0], X.shape[1])
+        if 50 >= min_dim:
+            n_comps = min_dim - 1
+        else:
+            n_comps = 50
+
     if chunked:
         from cuml.decomposition import IncrementalPCA
 
@@ -97,11 +110,15 @@ def pca(
             X_pca[start_idx:stop_idx] = pca_func.transform(chunk)
     else:
         if zero_center:
-            pca_func = PCA(n_components=n_comps, output_type="numpy")
+            pca_func = PCA(
+                n_components=n_comps, random_state=random_state, output_type="numpy"
+            )
             X_pca = pca_func.fit_transform(X)
 
         elif not zero_center:
-            pca_func = TruncatedSVD(n_components=n_comps, output_type="numpy")
+            pca_func = TruncatedSVD(
+                n_components=n_comps, random_state=random_state, output_type="numpy"
+            )
             X_pca = pca_func.fit_transform(X)
 
     cudata.obsm["X_pca"] = X_pca

diff --git a/rapids_singlecell/cunnData_funcs/_scale.py b/rapids_singlecell/cunnData_funcs/_scale.py
@@ -5,7 +5,7 @@
 
 def scale(
     cudata: cunnData,
-    max_value: int = 10,
+    max_value: Optional[int] = None,
     layer: Optional[str] = None,
     inplace: bool = True,
 ) -> Optional[cp.ndarray]:
@@ -45,7 +45,8 @@ def scale(
     stddev = cp.sqrt(X.var(axis=0))
     X /= stddev
     del stddev
-    X = cp.clip(X, a_max=max_value)
+    if max_value:
+        X = cp.clip(X, a_max=max_value)
     if inplace:
         if layer:
             cudata.layers[layer] = X

diff --git a/rapids_singlecell/cunnData_funcs/_simple.py b/rapids_singlecell/cunnData_funcs/_simple.py
@@ -291,7 +291,7 @@ def calculate_qc_metrics(
             qc_vars = [qc_vars]
         for qc_var in qc_vars:
             sums_cells_sub = cp.zeros(X.shape[0], dtype=cp.float32)
-            mask = cp.array(cudata.var["MT"], dtype=cp.bool_)
+            mask = cp.array(cudata.var[qc_var], dtype=cp.bool_)
             if cp.sparse.issparse(X):
                 if cpx.scipy.sparse.isspmatrix_csr(X):
                     block = (32,)

diff --git a/rapids_singlecell/cunnData_funcs/_utils.py b/rapids_singlecell/cunnData_funcs/_utils.py
@@ -98,7 +98,7 @@ def _check_nonnegative_integers(X):
     if issparse(X):
         data = X.data
     else:
-        data = data
+        data = X
     """Checks values of data to ensure it is count data"""
     # Check no negatives
     if cp.signbit(data).any():

diff --git a/rapids_singlecell/scanpy_gpu/__init__.py b/rapids_singlecell/scanpy_gpu/__init__.py
@@ -3,7 +3,7 @@
 from ._draw_graph import draw_graph
 from ._embedding_density import embedding_density
 from ._harmony_integrate import harmony_integrate
-from ._pca import pca
+from ..pp import pca
 from ._pymde import mde
 from ._rank_gene_groups import rank_genes_groups_logreg
 from ._tsne import tsne