adds Sparse PCA for rapids-singlecell (#36)
* adds ruff & Sparse PCA

* ruff updates

* updates PCA_sparse

* adds test for sparse PCA

* fixes plotting api

* fixes plotting

* version bump
Intron7 authored Aug 4, 2023
1 parent 404fd70 commit b0bc71d
Showing 42 changed files with 582 additions and 293 deletions.
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -9,3 +9,8 @@ repos:
rev: 23.7.0
hooks:
- id: black
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.0.282
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
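The new hook runs ruff with `--fix`, so fixable violations are rewritten in place, and `--exit-non-zero-on-fix` still fails the commit so the rewritten files can be reviewed and re-staged. Many hunks below are exactly such auto-fixes. As a hypothetical before/after (not taken from this diff) for the import-sorting rules (`I`) enabled in pyproject.toml below:

    # Before: stdlib and third-party imports interleaved; ruff flags this as I001
    from scipy import sparse
    import warnings
    import cupy as cp

    # After `ruff --fix`: stdlib first, then third-party, each group alphabetized
    import warnings

    import cupy as cp
    from scipy import sparse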
10 changes: 5 additions & 5 deletions docs/conf.py
@@ -121,11 +121,11 @@
#

html_theme = "scanpydoc"
-html_theme_options = dict(
-    repository_url=repository_url,
-    repository_branch=os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main"),
-    use_repository_button=True,
-)
+html_theme_options = {
+    "repository_url": repository_url,
+    "repository_branch": os.environ.get("READTHEDOCS_GIT_IDENTIFIER", "main"),
+    "use_repository_button": True,
+}

html_show_sphinx = False
html_logo = "_static/logo3.svg"
33 changes: 33 additions & 0 deletions pyproject.toml
@@ -54,6 +54,39 @@ line-length = 88
target-version = ['py38']
include = '^rapids_singlecell/.*\.py$'

+[tool.ruff]
+src = ["rapids_singlecell"]
+exclude = ["rapids_singlecell/tests"]
+line-length = 88
+select = [
+    "F",  # Errors detected by Pyflakes
+    "E",  # Error detected by Pycodestyle
+    "W",  # Warning detected by Pycodestyle
+    "I",  # isort
+    "TID",  # flake8-tidy-imports
+    "C4",  # flake8-comprehensions
+    "BLE",  # flake8-blind-except
+    "UP",  # pyupgrade
+    "RUF100",  # Report unused noqa directives
+]
+ignore = [
+    # line too long -> we accept long comment lines; black gets rid of long code lines
+    "E501",
+    # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient
+    "E731",
+    # allow I, O, l as variable names -> I is the identity matrix
+    "E741",
+    # Missing docstring in public package
+    "F403",
+    # First line should be in imperative mood; try rephrasing
+]
+
+
+[tool.ruff.per-file-ignores]
+"docs/*" = ["I"]
+"tests/*" = ["D"]
+"*/__init__.py" = ["F401"]

[tool.flit.sdist]
exclude = [
"rapids_singlecell/tests",
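The ignore list codifies two deliberate habits in this codebase. A hypothetical snippet showing what E731 and E741 would otherwise flag:

    import numpy as np

    # E731 (ignored): assigning a lambda is accepted here for brevity
    scale = lambda x: x / x.sum()

    # E741 (ignored): I is allowed as a name, e.g. for an identity matrix
    I = np.eye(3)
    print(scale(I.diagonal()))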
9 changes: 2 additions & 7 deletions rapids_singlecell/__init__.py
@@ -1,8 +1,3 @@
-from . import cunnData
-from . import pp
-from . import dcg
-from . import tl
-from . import pl
-from . import gr
+from . import cunnData, dcg, gr, pl, pp, tl

__version__ = "0.7.2"
__version__ = "0.7.5"
51 changes: 25 additions & 26 deletions rapids_singlecell/cunnData/__init__.py
@@ -1,34 +1,31 @@
+import warnings
+from collections import OrderedDict
+from itertools import repeat
+from typing import Any, List, Mapping, MutableMapping, Optional, Union
+
+import anndata
import cupy as cp
import cupyx as cpx
-from anndata import AnnData
-from anndata._core.index import _normalize_indices
-import anndata
-
import numpy as np
import pandas as pd
-from scipy import sparse
-from collections import OrderedDict
-from typing import Any, Union, Optional, Mapping, MutableMapping, List
-from pandas.api.types import infer_dtype, is_string_dtype
-from itertools import repeat
-import warnings

+from anndata import AnnData
+from anndata._core.index import _normalize_indices
+from cupyx.scipy.sparse import issparse as issparse_gpu
from natsort import natsorted
-
+from pandas.api.types import infer_dtype, is_string_dtype
+from scipy import sparse
from scipy.sparse import issparse as issparse_cpu
-from cupyx.scipy.sparse import issparse as issparse_gpu


class Layer_Mapping(dict):
"""
Dictonary subclass for layers handeling in cunnData
"""
"""Dictonary subclass for layers handeling in cunnData"""

def __init__(self, shape=None):
super().__init__({})
self.shape = shape

def update_shape(self, shape):
"""Updates Shape for Layers"""
self.shape = shape

def __setitem__(self, key, item):
@@ -56,15 +53,14 @@ def __setitem__(self, key, item):


class obsm_Mapping(dict):
"""
Dictonary subclass for obsm handeling in cunnData
"""
"""Dictonary subclass for obsm handeling in cunnData"""

def __init__(self, shape=None):
super().__init__({})
self.shape = shape

def update_shape(self, shape):
"""Updates Shape for obsm"""
self.shape = shape

def __setitem__(self, key, item):
@@ -75,15 +71,14 @@ def __setitem__(self, key, item):


class varm_Mapping(dict):
"""
Dictonary subclass for obsm handeling in cunnData
"""
"""Dictonary subclass for obsm handeling in cunnData"""

def __init__(self, shape=None):
super().__init__({})
self.shape = shape

def update_shape(self, shape):
"""Updates Shape for varm"""
self.shape = shape

def __setitem__(self, key, item):
@@ -94,10 +89,11 @@ def __setitem__(self, key, item):


class cunnData:
"""
"""\
The cunnData objects can be used as an AnnData replacement for the inital preprocessing
of single cell Datasets. It replaces some of the most common preprocessing steps within
scanpy for annData objects.
+
It can be initalized with a preexisting annData object or with a countmatrix and seperate
Dataframes for var and obs. Index of var will be used as gene_names. Initalization with an
AnnData object is advised.
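As the docstring recommends, construction from an existing AnnData object is the usual path. A minimal sketch, assuming the constructor accepts an `adata` keyword and that scanpy is available for a demo dataset:

    import scanpy as sc
    import rapids_singlecell as rsc

    adata = sc.datasets.pbmc3k()                 # any AnnData object
    cudata = rsc.cunnData.cunnData(adata=adata)  # counts move to GPU memory
    print(cudata.shape)                          # mirrors adata.shape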
@@ -310,8 +306,9 @@ def uns(self):

@property
def layers(self):
"""\
"""
Dictionary-like object with values of the same dimensions as :attr:`.X`.
+
Layers in cunnData are inspired by AnnData.
Return the layer named `"unspliced"`::
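The docstring's own example is truncated in this view. Continuing the sketch above, and assuming cunnData layers mirror AnnData semantics:

    # store a copy of X as a layer; Layer_Mapping rejects mismatched shapes
    cudata.layers["unspliced"] = cudata.X.copy()
    unspliced = cudata.layers["unspliced"]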
@@ -329,9 +326,10 @@ def layers(self):

@property
def obsm(self):
"""\
"""
Multi-dimensional annotation of observations
(mutable structured :class:`~numpy.ndarray`).
+
Stores for each key a two or higher-dimensional :class:`~numpy.ndarray`
of length :attr:`n_obs`.
Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`.
@@ -343,6 +341,7 @@ def varm(self):
"""\
Multi-dimensional annotation of variables/features
(mutable structured :class:`~numpy.ndarray`).
+
Stores for each key a two or higher-dimensional :class:`~numpy.ndarray`
of length :attr:`n_vars`.
Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`.
@@ -614,7 +613,7 @@ def varm_keys(self) -> List[str]:

def uns_keys(self) -> List[str]:
"""List keys of unstructured annotation."""
-        return sorted(list(self._uns.keys()))
+        return sorted(self._uns.keys())

def to_AnnData(self):
"""
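The section closes with `to_AnnData` (truncated above), which hands the processed data back for downstream scanpy work. Continuing the sketch:

    adata_processed = cudata.to_AnnData()  # back to a CPU-resident AnnData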
15 changes: 10 additions & 5 deletions rapids_singlecell/cunnData_funcs/__init__.py
@@ -1,7 +1,12 @@
+from ._hvg import highly_variable_genes
+from ._normalize import log1p, normalize_pearson_residuals, normalize_total
+from ._pca import pca
from ._regress_out import regress_out
from ._scale import scale
-from ._pca import pca
-from ._hvg import highly_variable_genes
-from ._normalize import normalize_pearson_residuals, log1p, normalize_total
-from ._simple import filter_cells, filter_genes, filter_highly_variable
-from ._simple import calculate_qc_metrics, flag_gene_family
+from ._simple import (
+    calculate_qc_metrics,
+    filter_cells,
+    filter_genes,
+    filter_highly_variable,
+    flag_gene_family,
+)
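These re-exports are the `rsc.pp` surface. The headline change sits behind the `pca` entry: per the commit title, PCA gains sparse-input support, so it can presumably run without densifying the GPU matrix first. A hedged sketch, with parameter names assumed to follow the scanpy convention:

    import scanpy as sc
    import rapids_singlecell as rsc

    cudata = rsc.cunnData.cunnData(adata=sc.datasets.pbmc3k())
    rsc.pp.pca(cudata, n_comps=50)     # X may stay a sparse GPU matrix
    print(cudata.obsm["X_pca"].shape)  # expected: (n_obs, 50)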
58 changes: 31 additions & 27 deletions rapids_singlecell/cunnData_funcs/_hvg.py
@@ -1,12 +1,13 @@
-import cupy as cp
-import cupyx as cpx
-import numpy as np
-import pandas as pd
import math
import warnings
+from typing import Optional

-from ..cunnData import cunnData
+import cupy as cp
+import numpy as np
+import pandas as pd
+
+from rapids_singlecell.cunnData import cunnData

from ._utils import _check_nonnegative_integers, _get_mean_var


@@ -203,15 +204,15 @@ def highly_variable_genes(
df = pd.concat(df, axis=0)
df["highly_variable"] = df["highly_variable"].astype(int)
df = df.groupby("gene").agg(
-        dict(
-            means=np.nanmean,
-            dispersions=np.nanmean,
-            dispersions_norm=np.nanmean,
-            highly_variable=np.nansum,
-        )
+        {
+            "means": np.nanmean,
+            "dispersions": np.nanmean,
+            "dispersions_norm": np.nanmean,
+            "highly_variable": np.nansum,
+        }
)
df.rename(
-        columns=dict(highly_variable="highly_variable_nbatches"), inplace=True
+        columns={"highly_variable": "highly_variable_nbatches"}, inplace=True
)
df["highly_variable_intersection"] = df["highly_variable_nbatches"] == len(
batches
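A toy illustration (hypothetical numbers) of what this aggregation computes when HVG detection ran per batch: the statistics are averaged across batches, while summing the 0/1 `highly_variable` flags counts how many batches selected each gene:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "gene": ["A", "B", "A", "B"],  # two genes observed in two batches
        "means": [1.0, 2.0, 1.2, 1.8],
        "dispersions": [3.0, 0.5, 2.6, 0.7],
        "dispersions_norm": [1.1, -0.2, 0.9, 0.1],
        "highly_variable": [1, 0, 1, 1],
    })
    combined = df.groupby("gene").agg({
        "means": np.nanmean,
        "dispersions": np.nanmean,
        "dispersions_norm": np.nanmean,
        "highly_variable": np.nansum,  # becomes highly_variable_nbatches
    })
    print(combined)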
@@ -269,11 +270,12 @@ def _highly_variable_genes_single_batch(
):
"""\
See `highly_variable_genes`.
-    Returns
-    -------
-    A DataFrame that contains the columns
-    `highly_variable`, `means`, `dispersions`, and `dispersions_norm`.
-    """
+
+    Returns
+    -------
+    A DataFrame that contains the columns
+    `highly_variable`, `means`, `dispersions`, and `dispersions_norm`.
+    """
if flavor == "seurat":
X = X.expm1()
mean, var = _get_mean_var(X, axis=1)
@@ -297,7 +299,7 @@
# only a single gene fell in the bin and implicitly set them to have
# a normalized disperion of 1
one_gene_per_bin = disp_std_bin.isnull()
-    gen_indices = np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist()
+    np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist()

# Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32,
# but there’s still a dtype error without “.value”.
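For orientation: this function z-scores each gene's dispersion against other genes with similar mean expression, and the `one_gene_per_bin` case exists because a bin holding a single gene has an undefined standard deviation. A small sketch of the idea on made-up values:

    import pandas as pd

    df = pd.DataFrame({
        "means": [0.1, 0.3, 1.2, 1.5, 7.0],
        "dispersions": [0.8, 1.9, 2.2, 6.0, 3.1],
    })
    df["mean_bin"] = pd.cut(df["means"], bins=3)
    grouped = df.groupby("mean_bin")["dispersions"]
    # the last gene sits alone in its bin, so its std is NaN; that is the
    # case the code above patches up
    df["dispersions_norm"] = (
        df["dispersions"] - grouped.transform("mean")
    ) / grouped.transform("std")
    print(df)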
@@ -368,6 +370,7 @@ def _highly_variable_genes_seurat_v3(
"""\
See `highly_variable_genes`.
+
For further implementation details see https://www.overleaf.com/read/ckptrbgzzzpg
Returns
-------
updates `.var` with the following fields:
@@ -663,14 +666,14 @@ def _highly_variable_pearson_residuals(
means, variances = _get_mean_var(X, axis=1)
means, variances = means.get(), variances.get()
df = pd.DataFrame.from_dict(
-        dict(
-            means=means,
-            variances=variances,
-            residual_variances=cp.mean(residual_gene_vars, axis=0).get(),
-            highly_variable_rank=medianrank_residual_var,
-            highly_variable_nbatches=highly_variable_nbatches.astype(np.int64),
-            highly_variable_intersection=highly_variable_nbatches == n_batches,
-        )
+        {
+            "means": means,
+            "variances": variances,
+            "residual_variances": cp.mean(residual_gene_vars, axis=0).get(),
+            "highly_variable_rank": medianrank_residual_var,
+            "highly_variable_nbatches": highly_variable_nbatches.astype(np.int64),
+            "highly_variable_intersection": highly_variable_nbatches == n_batches,
+        }
)
df = df.set_index(cudata.var_names)
df.sort_values(
@@ -715,6 +718,7 @@ def _poisson_gene_selection(
The method accounts for library size internally, a raw count matrix should be provided.
Instead of Z-test, enrichment of zeros is quantified by posterior
probabilites from a binomial model, computed through sampling.
+
Parameters
----------
cudata
@@ -733,6 +737,7 @@
Size of temporary matrix for incremental calculation. Larger is faster but
requires more RAM or GPU memory. (The default should be fine unless
there are hundreds of millions cells or millions of genes.)
+
Returns
-------
Depending on `inplace` returns calculated metrics (:class:`~pd.DataFrame`) or
@@ -750,7 +755,6 @@
prob_zero_enriched_nbatches : int
If batch_key is given, this denotes in how many batches genes are detected as zero enriched
"""

try:
import torch
except ImportError:
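A much-simplified sketch of the zero-enrichment idea the docstring describes (not the library's torch-based implementation): under a Poisson model the expected zero fraction per gene is exp(-rate); treating the observed zeros as binomial draws yields a Beta posterior that can be sampled and compared against that expectation:

    import numpy as np

    rng = np.random.default_rng(0)
    counts = rng.poisson(lam=[0.5, 2.0, 8.0], size=(500, 3))  # toy cells x genes

    # expected zero fraction per gene under a library-size-scaled Poisson
    lib = counts.sum(1, keepdims=True)
    rate = lib * (counts.sum(0) / counts.sum())
    expected_zero = np.exp(-rate).mean(0)

    # observed zeros + flat prior -> Beta posterior; sample and compare
    n_cells = counts.shape[0]
    zeros = (counts == 0).sum(0)
    samples = rng.beta(zeros + 1, n_cells - zeros + 1, size=(1000, 3))
    prob_zero_enriched = (samples > expected_zero).mean(0)
    print(prob_zero_enriched)  # high values suggest zero-enriched genes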