documentation

mims-harvard · Mar 5, 2024 · 87776c9 · 87776c9
1 parent 2d4e4f4
commit 87776c9
Showing 1 changed file with 63 additions and 37 deletions.
diff --git a/tdc/resource/cellxgene-census.py b/tdc/resource/cellxgene-census.py
@@ -20,7 +20,7 @@ class decorators:
 
         @classmethod
         def check_dataset_is_census_data(cls, func):
-            # @wraps(func)
+            """Sets self.dataset to census_data"""
             def check(*args, **kwargs):
                 self = args[0]
                 self.dataset = self._CENSUS_DATA
@@ -30,7 +30,7 @@ def check(*args, **kwargs):
 
         @classmethod
         def check_dataset_is_census_info(cls, func):
-
+            """Sets self.dataset to census_data"""
             def check(*args, **kwargs):
                 self = args[0]
                 self.dataset = self._CENSUS_META
@@ -40,41 +40,62 @@ def check(*args, **kwargs):
 
         @classmethod
         def slice_checks_X_and_FM(cls, func):
-            """Decorator for functions that need X and feature presence matrix apply slicing if not filtering"""
+            """Decorator for:
+            1. functions that need X and feature presence matrix apply slicing if not filtering
+            2. functions with a todense() option abide by required formatting
+            3. functions requiring a measurement name provide a measurement name
+            4. fmt is a valid format
+            asserts these requirements hold in input arguments."""
             def check(*args, **kwargs):
-                upper, lower = kwargs.get('upper', None), kwargs.get("lower", None)
-                measurement_name = kwargs.get("measurement_name")
+                if "upper" in kwargs:
+                    upper, lower = kwargs.get('upper', None), kwargs.get("lower", None)
+                    if upper is None or lower is None:
+                        raise Exception(
+                            "No upper and/or lower bound for slicing was provided. Dataset is too large to fit in memory. \
+                            Memory-Efficient queries are not supported yet.")
                 fmt = kwargs.get("fmt")
-                todense = kwargs.get("todense")
-                if upper is None or lower is None:
-                    raise Exception(
-                        "No upper and/or lower bound for slicing was provided. Dataset is too large to fit in memory. \
-                        Memory-Efficient queries are not supported yet.")
-                elif measurement_name is None:
+                fmt = fmt if fmt is not None else "pandas"
+                if "todense" in kwargs:
+                    todense = kwargs.get("todense")
+                    kwargs["todense"] = todense if todense is not None else False
+                    if todense and fmt != "scipy":
+                        raise ValueError(
+                            "dense representation only available in scipy format")
+                measurement_name = kwargs.get("measurement_name")
+                if measurement_name is None:
                     raise Exception("measurement_name was not provided.")
                 elif fmt is not None and fmt not in ["scipy", "pyarrow"]:
                     raise ValueError(
                         "measurement_matrix only supports 'scipy' or 'pyarrow' format")
-                kwargs["todense"] = todense if todense is not None else False
-                if todense and fmt != "scipy":
-                    raise ValueError(
-                        "dense representation only available in scipy format")
-                kwargs["fmt"] = fmt if fmt is not None else "scipy"
+                kwargs["fmt"] = fmt if fmt is not None else "pandas"
                 return func(*args, **kwargs)
             return check
 
     def __init__(self, census_version=None, organism=None):
         """Initialize the Census Resource.
         
         Args:
-            census_version (str): The date of the census data release in YYYY- 
-            TODO: complete
+            census_version (str): The date of the census data release to use 
+            organism (str): string for census data organism to query data for. defaults to human.
         """
         self.census_version = census_version if census_version is not None else self._LATEST_CENSUS
         self.organism = organism if organism is not None else self._HUMAN
         self.dataset = None  # variable to set target census collection to either info or data
 
     def fmt_cellxgene_data(self, tiledb_ptr, fmt=None):
+        """Transform TileDB DataFrame or SparseNDArray to one of the supported API formats.
+
+        Args:
+            tiledb_ptr (TileDB DataFrame or SparseNDArray): pointer to the TileDB DataFrame
+            fmt (str, optional): deisgnates a format to transfowm TileDB data to. Defaults to None.
+
+        Raises:
+            Exception: if no format is provided
+            Exception: if format is not a valid option
+
+        Returns:
+            The dataset in selected format if it's a valid format
+        """
         if fmt is None:
             raise Exception(
                 "format not provided to fmt_cellxgene_data(), please provide fmt variable"
@@ -145,14 +166,13 @@ def get_measurement_matrix(self,
         """Count matrix for an input measurement by slice
 
         Args:
-            upper (_type_, optional): _description_. Defaults to None.
-            lower (_type_, optional): _description_. Defaults to None.
-            value_adjustment (_type_, optional): _description_. Defaults to None.
-            measurement_name (_type_, optional): _description_. Defaults to None.
+            upper (int, optional): upper bound on the slice to obtain. Defaults to None.
+            lower (int, optional): lower bound on the slice to obtain. Defaults to None.
+            value_adjustment (str, optional): designates the type of count desired for this measurement. Defaults to None.
+            measurement_name (str, optional): name of measurement, i.e. 'raw'. Defaults to None.
 
-        Raises:
-            Exception: _description_
-            Exception: _description_
+        Returns:
+            A slice from the count matrix in the specified format. If `todense` is True, then a dense scipy array will be returned.
         """
         value_adjustment = value_adjustment if value_adjustment is not None else "raw"
         with cellxgene_census.open_soma(
@@ -174,14 +194,26 @@ def get_feature_dataset_presence_matrix(self,
                                             measurement_name=None,
                                             fmt=None,
                                             todense=None):
+        """Gets a slice from the feature_dataset_presence_matrix for a given measurement_name
+
+        Args:
+            upper (int, optional): upper bound on the slice. Defaults to None.
+            lower (int, optional): lower bound on the slice. Defaults to None.
+            measurement_name (str, optional): measurment_name for the query i.e. 'rna'. Defaults to None.
+            fmt (str, optional): output format desired for the output dataset. Defaults to None.
+            todense (bool, optional): if True, returns scipy dense representation. Defaults to None.
+
+        Returns:
+            dataset in desired format
+        """
         with cellxgene_census.open_soma(
                 census_version=self.census_version) as census:
             n_obs = len(census[self.dataset][self.organism].obs)
             n_var = len(
                 census[self.dataset][self.organism].ms[measurement_name].var)
             fMatrix = census[self.dataset][self.organism].ms[measurement_name][
                 "feature_dataset_presence_matrix"]
-            slc = fMatrix.read((slice(0, 5),)).coos((n_obs, n_var))
+            slc = fMatrix.read((slice(lower, upper),)).coos((n_obs, n_var))
             out = self.fmt_cellxgene_data(slc, fmt)
             return out if not todense else out.todense()
 
@@ -217,18 +249,12 @@ def query_measurement_matrix(self,
         """Query the Census Measurement Matrix. Function returns a Python generator.
 
         Args:
-            value_filter (_type_, optional): _description_. Defaults to None.
-            value_adjustment (_type_, optional): _description_. Defaults to None.
-            measurement_name (_type_, optional): _description_. Defaults to None.
-            fmt (_type_, optional): _description_. Defaults to None.
-            todense (_type_, optional): _description_. Defaults to None.
+            value_filter (str, optional): a valuer filter (obs) to apply to the query. Defaults to None.
+            value_adjustment (str, optional): the type of count to obtain from count matricx for this measurement. Defaults to None.
+            measurement_name (str, optional): measurement name to query, i.e. "RNA". Defaults to None.
+            fmt (str, optional): output format for the output dataset. Defaults to None.
+            todense (bool, optional): if True, will output a dense scipy array as the representation. Defaults to None.
 
-        Raises:
-            ValueError: _description_
-            Exception: _description_
-            ValueError: _description_
-            ValueError: _description_
-            
         Yields:
             a slice of the output query in the specified format
         """