Skip to content

Commit

Permalink
Merge pull request #24 from Intron7/testing
Browse files Browse the repository at this point in the history
Testing & Improvements
  • Loading branch information
Intron7 authored May 17, 2023
2 parents 853a619 + a3ca74f commit 2596bcc
Show file tree
Hide file tree
Showing 31 changed files with 5,745 additions and 164 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ __pycache__/
.Python
build/
develop-eggs/
assets/
dist/
downloads/
eggs/
Expand Down
Binary file added data/pbmc3k_processed.h5ad
Binary file not shown.
Binary file added data/pbmc3k_raw.h5ad
Binary file not shown.
2 changes: 1 addition & 1 deletion rapids_singlecell/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
from . import pl
from . import gr

__version__ = "0.6.2"
__version__ = "0.6.3"
33 changes: 15 additions & 18 deletions rapids_singlecell/cunnData/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ def update_shape(self, shape):

def __setitem__(self, key, item):
if self.shape == item.shape:
if issparse_gpu(item):
item = item
elif isinstance(item, cp.ndarray):
item = item
elif not issparse_cpu(item):
inter = sparse.csr_matrix(item)
item = cp.sparse.csr_matrix(inter, dtype=cp.float32)
else:
item = cp.sparse.csr_matrix(item, dtype=cp.float32)
if issparse_gpu(item):
if item.nnz > 2**31 - 1:
raise ValueError(
Expand Down Expand Up @@ -103,6 +112,7 @@ def __init__(
obsm: Optional[Mapping[str, Any]] = None,
varm: Optional[Mapping[str, Any]] = None,
):
# Initialize from adata
if adata:
if not issparse_cpu(adata.X):
inter = sparse.csr_matrix(adata.X)
Expand All @@ -119,19 +129,15 @@ def __init__(
self.raw = None
if adata.layers:
for key, matrix in adata.layers.items():
if not issparse_cpu(matrix):
inter = sparse.csr_matrix(matrix)
inter = cp.sparse.csr_matrix(inter, dtype=cp.float32)
else:
inter = cp.sparse.csr_matrix(matrix, dtype=cp.float32)
self._layers[key] = inter
self._layers[key] = matrix
if adata.obsm:
for key, matrix in adata.obsm.items():
self._obsm[key] = matrix.copy()
if adata.varm:
for key, matrix in adata.varm.items():
self._varm[key] = matrix.copy()

# Initialize from items
else:
if issparse_gpu(X):
self._X = X
Expand All @@ -143,11 +149,11 @@ def __init__(
del inter
else:
self._X = cp.sparse.csr_matrix(X, dtype=cp.float32)
if obs:
if obs is not None:
self._obs = obs
else:
self._obs = pd.DataFrame(index=range(self.shape[0]))
if var:
if var is not None:
self._var = var
else:
self._var = pd.DataFrame(index=range(self.shape[1]))
Expand All @@ -162,16 +168,7 @@ def __init__(

if layers:
for key, matrix in layers.items():
if issparse_gpu(matrix):
inter = matrix
elif isinstance(matrix, cp.ndarray):
inter = matrix
elif not issparse_cpu(X):
inter = sparse.csr_matrix(matrix)
inter = cp.sparse.csr_matrix(inter, dtype=cp.float32)
else:
inter = cp.sparse.csr_matrix(matrix, dtype=cp.float32)
self.layers[key] = inter
self.layers[key] = matrix
if obsm:
for key, matrix in obsm.items():
self.obsm[key] = matrix.copy()
Expand Down
13 changes: 10 additions & 3 deletions rapids_singlecell/cunnData_funcs/_hvg.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,9 @@ def highly_variable_genes(
batches = cudata.obs[batch_key].cat.categories
df = []
genes = cudata.var.index.to_numpy()
X = cudata.layers[layer] if layer is not None else cudata.X
for batch in batches:
inter_matrix = cudata.X[
np.where(cudata.obs[batch_key] == batch)[0],
].tocsc()
inter_matrix = X[np.where(cudata.obs[batch_key] == batch)[0],].tocsc()
thr_org = cp.diff(inter_matrix.indptr).ravel()
thr = cp.where(thr_org >= 1)[0]
thr_2 = cp.where(thr_org < 1)[0]
Expand Down Expand Up @@ -336,6 +335,12 @@ def _highly_variable_genes_single_batch(
].sort() # interestingly, np.argpartition is slightly slower
if n_top_genes > X.shape[1]:
n_top_genes = X.shape[1]
if n_top_genes > dispersion_norm.size:
warnings.warn(
"`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.",
UserWarning,
)
n_top_genes = dispersion_norm.size
disp_cut_off = dispersion_norm[n_top_genes - 1]
gene_subset = np.nan_to_num(df["dispersions_norm"].values) >= disp_cut_off
else:
Expand Down Expand Up @@ -448,6 +453,7 @@ def _highly_variable_genes_seurat_v3(
ranked_norm_gene_vars = ranked_norm_gene_vars.get()
ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars)
median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan)

df["highly_variable_nbatches"] = num_batches_high_var.get()
df["highly_variable_rank"] = median_ranked
df["variances_norm"] = cp.mean(norm_gene_vars, axis=0).get()
Expand Down Expand Up @@ -598,6 +604,7 @@ def _highly_variable_pearson_residuals(
if clip < 0:
raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

clip = cp.array([clip], dtype=cp.float32)
theta = cp.array([theta], dtype=cp.float32)
sums_genes = cp.zeros(X_batch.shape[1], dtype=cp.float32)
sums_cells = cp.zeros(X_batch.shape[0], dtype=cp.float32)
Expand Down
5 changes: 5 additions & 0 deletions rapids_singlecell/cunnData_funcs/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,8 @@ def normalize_pearson_residuals(
"`flavor='pearson_residuals'` expects raw count data, but non-integers were found.",
UserWarning,
)
computed_on = layer if layer else "cudata.X"
settings_dict = dict(theta=theta, clip=clip, computed_on=computed_on)
if theta <= 0:
raise ValueError("Pearson residuals require theta > 0")
if clip is None:
Expand All @@ -331,6 +333,7 @@ def normalize_pearson_residuals(
if clip < 0:
raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")
theta = cp.array([1 / theta], dtype=cp.float32)
clip = cp.array([clip], dtype=cp.float32)
sums_cells = cp.zeros(X.shape[0], dtype=cp.float32)
sums_genes = cp.zeros(X.shape[1], dtype=cp.float32)

Expand Down Expand Up @@ -420,7 +423,9 @@ def normalize_pearson_residuals(
residuals.shape[1],
),
)

if inplace == True:
cudata.uns["pearson_residuals_normalization"] = settings_dict
if layer:
cudata.layers[layer] = residuals
else:
Expand Down
31 changes: 24 additions & 7 deletions rapids_singlecell/cunnData_funcs/_pca.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
from ..cunnData import cunnData
from anndata import AnnData

from cuml.decomposition import PCA, TruncatedSVD
from typing import Optional
from typing import Optional, Union

from cupy.sparse import issparse
import math
import numpy as np


def pca(
cudata: cunnData,
cudata: Union[cunnData, AnnData],
layer: str = None,
n_comps: int = 50,
n_comps: Optional[int] = None,
zero_center: bool = True,
random_state: Union[int, None] = 0,
use_highly_variable: Optional[bool] = None,
chunked: bool = False,
chunk_size: int = None,
Expand All @@ -23,18 +25,22 @@ def pca(
Parameters
----------
cudata :
cunnData object
cunnData, AnnData object
layer
If provided, use `cudata.layers[layer]` for expression values instead of `cudata.X`.
n_comps
Number of principal components to compute. Defaults to 50
Number of principal components to compute. Defaults to 50, or 1 - minimum
dimension size of selected representation
zero_center
If `True`, compute standard PCA from covariance matrix.
If `False`, omit zero-centering variables
random_state
Change to use different initial states for the optimization.
use_highly_variable
Whether to use highly variable genes only, stored in
`.var['highly_variable']`.
Expand Down Expand Up @@ -78,6 +84,13 @@ def pca(
if use_highly_variable:
X = X[:, cudata.var["highly_variable"]]

if n_comps is None:
min_dim = min(X.shape[0], X.shape[1])
if 50 >= min_dim:
n_comps = min_dim - 1
else:
n_comps = 50

if chunked:
from cuml.decomposition import IncrementalPCA

Expand All @@ -97,11 +110,15 @@ def pca(
X_pca[start_idx:stop_idx] = pca_func.transform(chunk)
else:
if zero_center:
pca_func = PCA(n_components=n_comps, output_type="numpy")
pca_func = PCA(
n_components=n_comps, random_state=random_state, output_type="numpy"
)
X_pca = pca_func.fit_transform(X)

elif not zero_center:
pca_func = TruncatedSVD(n_components=n_comps, output_type="numpy")
pca_func = TruncatedSVD(
n_components=n_comps, random_state=random_state, output_type="numpy"
)
X_pca = pca_func.fit_transform(X)

cudata.obsm["X_pca"] = X_pca
Expand Down
5 changes: 3 additions & 2 deletions rapids_singlecell/cunnData_funcs/_scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

def scale(
cudata: cunnData,
max_value: int = 10,
max_value: Optional[int] = None,
layer: Optional[str] = None,
inplace: bool = True,
) -> Optional[cp.ndarray]:
Expand Down Expand Up @@ -45,7 +45,8 @@ def scale(
stddev = cp.sqrt(X.var(axis=0))
X /= stddev
del stddev
X = cp.clip(X, a_max=max_value)
if max_value:
X = cp.clip(X, a_max=max_value)
if inplace:
if layer:
cudata.layers[layer] = X
Expand Down
2 changes: 1 addition & 1 deletion rapids_singlecell/cunnData_funcs/_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def calculate_qc_metrics(
qc_vars = [qc_vars]
for qc_var in qc_vars:
sums_cells_sub = cp.zeros(X.shape[0], dtype=cp.float32)
mask = cp.array(cudata.var["MT"], dtype=cp.bool_)
mask = cp.array(cudata.var[qc_var], dtype=cp.bool_)
if cp.sparse.issparse(X):
if cpx.scipy.sparse.isspmatrix_csr(X):
block = (32,)
Expand Down
2 changes: 1 addition & 1 deletion rapids_singlecell/cunnData_funcs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _check_nonnegative_integers(X):
if issparse(X):
data = X.data
else:
data = data
data = X
"""Checks values of data to ensure it is count data"""
# Check no negatives
if cp.signbit(data).any():
Expand Down
2 changes: 1 addition & 1 deletion rapids_singlecell/scanpy_gpu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from ._draw_graph import draw_graph
from ._embedding_density import embedding_density
from ._harmony_integrate import harmony_integrate
from ._pca import pca
from ..pp import pca
from ._pymde import mde
from ._rank_gene_groups import rank_genes_groups_logreg
from ._tsne import tsne
Loading

0 comments on commit 2596bcc

Please sign in to comment.