adds Sparse PCA for rapids-singlecell (#36)
* adds ruff & Sparse PCA

* ruff updates

* updates PCA_sparse

* adds test for sparse PCA

* fixes plotting api

* fixes plotting

* version bump
Intron7 authored Aug 4, 2023
1 parent 404fd70 commit b0bc71d
Showing 42 changed files with 582 additions and 293 deletions.
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -9,3 +9,8 @@ repos:
rev: 23.7.0
hooks:
- id: black
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.0.282
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
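The new hook runs ruff with `--fix`, so fixable violations are rewritten in place, and `--exit-non-zero-on-fix` still fails the commit so the rewritten files can be reviewed and re-staged. Many hunks below are exactly such auto-fixes. As a hypothetical before/after (not taken from this diff) for the import-sorting rules (`I`) enabled in pyproject.toml below:

    # Before: stdlib and third-party imports interleaved; ruff flags this as I001
    from scipy import sparse
    import warnings
    import cupy as cp

    # After `ruff --fix`: stdlib first, then third-party, each group alphabetized
    import warnings

    import cupy as cp
    from scipy import sparse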
10 changes: 5 additions & 5 deletions docs/conf.py
@@ -121,11 +121,11 @@
#

html_theme = "scanpydoc"
-html_theme_options = dict(
-    repository_url=repository_url,
-    repository_branch=os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main"),
-    use_repository_button=True,
-)
+html_theme_options = {
+    "repository_url": repository_url,
+    "repository_branch": os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main"),
+    "use_repository_button": True,
+}

html_show_sphinx = False
html_logo = "_static/logo3.svg"
33 changes: 33 additions & 0 deletions pyproject.toml
@@ -54,6 +54,39 @@ line-length = 88
target-version = ['py38']
include = '^rapids_singlecell/.*\.py$'

+[tool.ruff]
+src = ["rapids_singlecell"]
+exclude = ["rapids_singlecell/tests"]
+line-length = 88
+select = [
+    "F",  # Errors detected by Pyflakes
+    "E",  # Error detected by Pycodestyle
+    "W",  # Warning detected by Pycodestyle
+    "I",  # isort
+    "TID",  # flake8-tidy-imports
+    "C4",  # flake8-comprehensions
+    "BLE",  # flake8-blind-except
+    "UP",  # pyupgrade
+    "RUF100",  # Report unused noqa directives
+]
+ignore = [
+    # line too long -> we accept long comment lines; black gets rid of long code lines
+    "E501",
+    # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient
+    "E731",
+    # allow I, O, l as variable names -> I is the identity matrix
+    "E741",
+    # Missing docstring in public package
+    "F403",
+    # First line should be in imperative mood; try rephrasing
+]
+
+
+[tool.ruff.per-file-ignores]
+"docs/*" = ["I"]
+"tests/*" = ["D"]
+"*/__init__.py" = ["F401"]

[tool.flit.sdist]
exclude = [
"rapids_singlecell/tests",
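The ignore list codifies two deliberate habits in this codebase. A hypothetical snippet showing what E731 and E741 would otherwise flag:

    import numpy as np

    # E731 (ignored): assigning a lambda is accepted here for brevity
    scale = lambda x: x / x.sum()

    # E741 (ignored): I is allowed as a name, e.g. for an identity matrix
    I = np.eye(3)
    print(scale(I.diagonal()))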
9 changes: 2 additions & 7 deletions rapids_singlecell/__init__.py
@@ -1,8 +1,3 @@
-from . import cunnData
-from . import pp
-from . import dcg
-from . import tl
-from . import pl
-from . import gr
+from . import cunnData, dcg, gr, pl, pp, tl

__version__ = "0.7.2"
__version__ = "0.7.5"
51 changes: 25 additions & 26 deletions rapids_singlecell/cunnData/__init__.py
@@ -1,34 +1,31 @@
+import warnings
+from collections import OrderedDict
+from itertools import repeat
+from typing import Any, List, Mapping, MutableMapping, Optional, Union
+
+import anndata
import cupy as cp
import cupyx as cpx
-from anndata import AnnData
-from anndata._core.index import _normalize_indices
-import anndata
-
import numpy as np
import pandas as pd
-from scipy import sparse
-from collections import OrderedDict
-from typing import Any, Union, Optional, Mapping, MutableMapping, List
-from pandas.api.types import infer_dtype, is_string_dtype
-from itertools import repeat
-import warnings

+from anndata import AnnData
+from anndata._core.index import _normalize_indices
+from cupyx.scipy.sparse import issparse as issparse_gpu
from natsort import natsorted
-
+from pandas.api.types import infer_dtype, is_string_dtype
+from scipy import sparse
from scipy.sparse import issparse as issparse_cpu
-from cupyx.scipy.sparse import issparse as issparse_gpu


class Layer_Mapping(dict):
"""
Dictonary subclass for layers handeling in cunnData
"""
"""Dictonary subclass for layers handeling in cunnData"""

def __init__(self, shape=None):
super().__init__({})
self.shape = shape

def update_shape(self, shape):
"""Updates Shape for Layers"""
self.shape = shape

def __setitem__(self, key, item):
@@ -56,15 +53,14 @@ def __setitem__(self, key, item):


class obsm_Mapping(dict):
"""
Dictonary subclass for obsm handeling in cunnData
"""
"""Dictonary subclass for obsm handeling in cunnData"""

def __init__(self, shape=None):
super().__init__({})
self.shape = shape

def update_shape(self, shape):
"""Updates Shape for obsm"""
self.shape = shape

def __setitem__(self, key, item):
@@ -75,15 +71,14 @@ def __setitem__(self, key, item):


class varm_Mapping(dict):
"""
Dictonary subclass for obsm handeling in cunnData
"""
"""Dictonary subclass for obsm handeling in cunnData"""

def __init__(self, shape=None):
super().__init__({})
self.shape = shape

def update_shape(self, shape):
"""Updates Shape for varm"""
self.shape = shape

def __setitem__(self, key, item):
@@ -94,10 +89,11 @@ def __setitem__(self, key, item):


class cunnData:
"""
"""\
The cunnData objects can be used as an AnnData replacement for the inital preprocessing
of single cell Datasets. It replaces some of the most common preprocessing steps within
scanpy for annData objects.
+
It can be initalized with a preexisting annData object or with a countmatrix and seperate
Dataframes for var and obs. Index of var will be used as gene_names. Initalization with an
AnnData object is advised.
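As the docstring recommends, construction from an existing AnnData object is the usual path. A minimal sketch, assuming the constructor accepts an `adata` keyword and that scanpy is available for a demo dataset:

    import scanpy as sc
    import rapids_singlecell as rsc

    adata = sc.datasets.pbmc3k()                 # any AnnData object
    cudata = rsc.cunnData.cunnData(adata=adata)  # counts move to GPU memory
    print(cudata.shape)                          # mirrors adata.shape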
@@ -310,8 +306,9 @@ def uns(self):

@property
def layers(self):
"""\
"""
Dictionary-like object with values of the same dimensions as :attr:`.X`.
+
Layers in cunnData are inspired by AnnData.
Return the layer named `"unspliced"`::
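The docstring's own example is truncated in this view. Continuing the sketch above, and assuming cunnData layers mirror AnnData semantics:

    # store a copy of X as a layer; Layer_Mapping rejects mismatched shapes
    cudata.layers["unspliced"] = cudata.X.copy()
    unspliced = cudata.layers["unspliced"]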
@@ -329,9 +326,10 @@ def layers(self):

@property
def obsm(self):
"""\
"""
Multi-dimensional annotation of observations
(mutable structured :class:`~numpy.ndarray`).
+
Stores for each key a two or higher-dimensional :class:`~numpy.ndarray`
of length :attr:`n_obs`.
Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`.
@@ -343,6 +341,7 @@ def varm(self):
"""\
Multi-dimensional annotation of variables/features
(mutable structured :class:`~numpy.ndarray`).
+
Stores for each key a two or higher-dimensional :class:`~numpy.ndarray`
of length :attr:`n_vars`.
Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`.
@@ -614,7 +613,7 @@ def varm_keys(self) -> List[str]:

def uns_keys(self) -> List[str]:
"""List keys of unstructured annotation."""
-        return sorted(list(self._uns.keys()))
+        return sorted(self._uns.keys())

def to_AnnData(self):
"""
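The section closes with `to_AnnData` (truncated above), which hands the processed data back for downstream scanpy work. Continuing the sketch:

    adata_processed = cudata.to_AnnData()  # back to a CPU-resident AnnData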
15 changes: 10 additions & 5 deletions rapids_singlecell/cunnData_funcs/__init__.py
@@ -1,7 +1,12 @@
+from ._hvg import highly_variable_genes
+from ._normalize import log1p, normalize_pearson_residuals, normalize_total
+from ._pca import pca
from ._regress_out import regress_out
from ._scale import scale
-from ._pca import pca
-from ._hvg import highly_variable_genes
-from ._normalize import normalize_pearson_residuals, log1p, normalize_total
-from ._simple import filter_cells, filter_genes, filter_highly_variable
-from ._simple import calculate_qc_metrics, flag_gene_family
+from ._simple import (
+    calculate_qc_metrics,
+    filter_cells,
+    filter_genes,
+    filter_highly_variable,
+    flag_gene_family,
+)
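These re-exports are the `rsc.pp` surface. The headline change sits behind the `pca` entry: per the commit title, PCA gains sparse-input support, so it can presumably run without densifying the GPU matrix first. A hedged sketch, with parameter names assumed to follow the scanpy convention:

    import scanpy as sc
    import rapids_singlecell as rsc

    cudata = rsc.cunnData.cunnData(adata=sc.datasets.pbmc3k())
    rsc.pp.pca(cudata, n_comps=50)     # X may stay a sparse GPU matrix
    print(cudata.obsm["X_pca"].shape)  # expected: (n_obs, 50)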
58 changes: 31 additions & 27 deletions rapids_singlecell/cunnData_funcs/_hvg.py
@@ -1,12 +1,13 @@
-import cupy as cp
-import cupyx as cpx
-import numpy as np
-import pandas as pd
import math
import warnings
+from typing import Optional

-from ..cunnData import cunnData
+import cupy as cp
+import numpy as np
+import pandas as pd
+
+from rapids_singlecell.cunnData import cunnData

from ._utils import _check_nonnegative_integers, _get_mean_var


@@ -203,15 +204,15 @@ def highly_variable_genes(
df = pd.concat(df, axis=0)
df["highly_variable"] = df["highly_variable"].astype(int)
df = df.groupby("gene").agg(
-        dict(
-            means=np.nanmean,
-            dispersions=np.nanmean,
-            dispersions_norm=np.nanmean,
-            highly_variable=np.nansum,
-        )
+        {
+            "means": np.nanmean,
+            "dispersions": np.nanmean,
+            "dispersions_norm": np.nanmean,
+            "highly_variable": np.nansum,
+        }
)
df.rename(
-        columns=dict(highly_variable="highly_variable_nbatches"), inplace=True
+        columns={"highly_variable": "highly_variable_nbatches"}, inplace=True
)
df["highly_variable_intersection"] = df["highly_variable_nbatches"] == len(
batches
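A toy illustration (hypothetical numbers) of what this aggregation computes when HVG detection ran per batch: the statistics are averaged across batches, while summing the 0/1 `highly_variable` flags counts how many batches selected each gene:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "gene": ["A", "B", "A", "B"],  # two genes observed in two batches
        "means": [1.0, 2.0, 1.2, 1.8],
        "dispersions": [3.0, 0.5, 2.6, 0.7],
        "dispersions_norm": [1.1, -0.2, 0.9, 0.1],
        "highly_variable": [1, 0, 1, 1],
    })
    combined = df.groupby("gene").agg({
        "means": np.nanmean,
        "dispersions": np.nanmean,
        "dispersions_norm": np.nanmean,
        "highly_variable": np.nansum,  # becomes highly_variable_nbatches
    })
    print(combined)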
@@ -269,11 +270,12 @@ def _highly_variable_genes_single_batch(
):
"""\
See `highly_variable_genes`.
-    Returns
-    -------
-    A DataFrame that contains the columns
-    `highly_variable`, `means`, `dispersions`, and `dispersions_norm`.
-    """
+
+    Returns
+    -------
+    A DataFrame that contains the columns
+    `highly_variable`, `means`, `dispersions`, and `dispersions_norm`.
+    """
if flavor == "seurat":
X = X.expm1()
mean, var = _get_mean_var(X, axis=1)
@@ -297,7 +299,7 @@
# only a single gene fell in the bin and implicitly set them to have
# a normalized disperion of 1
one_gene_per_bin = disp_std_bin.isnull()
-    gen_indices = np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist()
+    np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist()

# Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32,
# but there’s still a dtype error without “.value”.
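For orientation: this function z-scores each gene's dispersion against other genes with similar mean expression, and the `one_gene_per_bin` case exists because a bin holding a single gene has an undefined standard deviation. A small sketch of the idea on made-up values:

    import pandas as pd

    df = pd.DataFrame({
        "means": [0.1, 0.3, 1.2, 1.5, 7.0],
        "dispersions": [0.8, 1.9, 2.2, 6.0, 3.1],
    })
    df["mean_bin"] = pd.cut(df["means"], bins=3)
    grouped = df.groupby("mean_bin")["dispersions"]
    # the last gene sits alone in its bin, so its std is NaN; that is the
    # case the code above patches up
    df["dispersions_norm"] = (
        df["dispersions"] - grouped.transform("mean")
    ) / grouped.transform("std")
    print(df)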
@@ -368,6 +370,7 @@ def _highly_variable_genes_seurat_v3(
"""\
See `highly_variable_genes`.
+
For further implementation details see https://www.overleaf.com/read/ckptrbgzzzpg
Returns
-------
updates `.var` with the following fields:
@@ -663,14 +666,14 @@ def _highly_variable_pearson_residuals(
means, variances = _get_mean_var(X, axis=1)
means, variances = means.get(), variances.get()
df = pd.DataFrame.from_dict(
-        dict(
-            means=means,
-            variances=variances,
-            residual_variances=cp.mean(residual_gene_vars, axis=0).get(),
-            highly_variable_rank=medianrank_residual_var,
-            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
-            highly_variable_intersection=highly_variable_nbatches == n_batches,
-        )
+        {
+            "means": means,
+            "variances": variances,
+            "residual_variances": cp.mean(residual_gene_vars, axis=0).get(),
+            "highly_variable_rank": medianrank_residual_var,
+            "highly_variable_nbatches": highly_variable_nbatches.astype(np.int64),
+            "highly_variable_intersection": highly_variable_nbatches == n_batches,
+        }
)
df = df.set_index(cudata.var_names)
df.sort_values(
@@ -715,6 +718,7 @@ def _poisson_gene_selection(
The method accounts for library size internally, a raw count matrix should be provided.
Instead of Z-test, enrichment of zeros is quantified by posterior
probabilites from a binomial model, computed through sampling.
+
Parameters
----------
cudata
@@ -733,6 +737,7 @@
Size of temporary matrix for incremental calculation. Larger is faster but
requires more RAM or GPU memory. (The default should be fine unless
there are hundreds of millions cells or millions of genes.)
+
Returns
-------
Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or
@@ -750,7 +755,6 @@
prob_zero_enriched_nbatches : int
If batch_key is given, this denotes in how many batches genes are detected as zero enriched
"""

try:
import torch
except ImportError:
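A much-simplified sketch of the zero-enrichment idea the docstring describes (not the library's torch-based implementation): under a Poisson model the expected zero fraction per gene is exp(-rate); treating the observed zeros as binomial draws yields a Beta posterior that can be sampled and compared against that expectation:

    import numpy as np

    rng = np.random.default_rng(0)
    counts = rng.poisson(lam=[0.5, 2.0, 8.0], size=(500, 3))  # toy cells x genes

    # expected zero fraction per gene under a library-size-scaled Poisson
    lib = counts.sum(1, keepdims=True)
    rate = lib * (counts.sum(0) / counts.sum())
    expected_zero = np.exp(-rate).mean(0)

    # observed zeros + flat prior -> Beta posterior; sample and compare
    n_cells = counts.shape[0]
    zeros = (counts == 0).sum(0)
    samples = rng.beta(zeros + 1, n_cells - zeros + 1, size=(1000, 3))
    prob_zero_enriched = (samples > expected_zero).mean(0)
    print(prob_zero_enriched)  # high values suggest zero-enriched genes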