diff --git a/src/dummy_anndata/generate_dataframe.py b/src/dummy_anndata/generate_dataframe.py index 8fad0bb..8d3c1c1 100644 --- a/src/dummy_anndata/generate_dataframe.py +++ b/src/dummy_anndata/generate_dataframe.py @@ -1,18 +1,21 @@ import pandas as pd + from .generate_vector import vector_generators -def generate_dataframe(n_rows, types=None): +def generate_dataframe(n_rows: int, types: list[str] | None = None) -> pd.DataFrame: """ Generate a pandas DataFrame with specified number of rows and column types. - Parameters: + Parameters + ---------- n_rows (int): The number of rows in the DataFrame. types (list, optional): A list of column types to include in the DataFrame. Choose from the list of vector_generators keys. If not provided, all available column types will be included. - Returns: + Returns + ------- pandas.DataFrame: The generated DataFrame. """ diff --git a/src/dummy_anndata/generate_dataset.py b/src/dummy_anndata/generate_dataset.py index 2460f49..cb3511a 100644 --- a/src/dummy_anndata/generate_dataset.py +++ b/src/dummy_anndata/generate_dataset.py @@ -9,19 +9,19 @@ def generate_dataset( - n_obs=10, - n_vars=20, - x_type=None, - layer_types=None, - obs_types=None, - var_types=None, - obsm_types=None, - varm_types=None, - obsp_types=None, - varp_types=None, - uns_types=None, - nested_uns_types=None, -): + n_obs: int = 10, + n_vars: int = 20, + x_type: str | None = None, + layer_types: list[str] | None = None, + obs_types: list[str] | None = None, + var_types: list[str] | None = None, + obsm_types: list[str] | None = None, + varm_types: list[str] | None = None, + obsp_types: list[str] | None = None, + varp_types: list[str] | None = None, + uns_types: list[str] | None = None, + nested_uns_types: list[str] | None = None, +) -> ad.AnnData: """ Generate a synthetic AnnData dataset with specified dimensions and data types. 
@@ -105,7 +105,7 @@ def generate_dataset( "nullable_boolean_array", ] ) - obsm_types = set(matrix_generators.keys()) - vector_not_allowed + obsm_types = sorted(set(matrix_generators.keys()) - vector_not_allowed) if varm_types is None: # varm_types are all matrices or vectors, except for categoricals and nullables vector_not_allowed = set( [ @@ -117,7 +117,7 @@ def generate_dataset( "nullable_boolean_array", ] ) - varm_types = set(matrix_generators.keys()) - vector_not_allowed + varm_types = sorted(set(matrix_generators.keys()) - vector_not_allowed) if obsp_types is None: # obsp_types are all matrices obsp_types = list(matrix_generators.keys()) if varp_types is None: # varp_types are all matrices diff --git a/src/dummy_anndata/generate_dict.py b/src/dummy_anndata/generate_dict.py index d2e7fa2..b7c8eaa 100644 --- a/src/dummy_anndata/generate_dict.py +++ b/src/dummy_anndata/generate_dict.py @@ -1,7 +1,7 @@ import numpy as np -from .generate_matrix import matrix_generators -from .generate_vector import vector_generators +from .generate_matrix import matrix_generators, generated_matrix_types +from .generate_vector import vector_generators, generated_vector_types scalar_generators = { "string": "version", @@ -14,6 +14,7 @@ "nan": np.nan, } +generated_scalar_types = str | int | float | bool | None def generate_scalar(scalar_type): if scalar_type[:7] == "scalar_": @@ -30,8 +31,24 @@ def generate_type(type, n_rows, n_cols): return matrix_generators[type](n_rows, n_cols) return None +all_types = generated_scalar_types | generated_vector_types | generated_matrix_types +generated_dict_types = dict[str, all_types | dict[str, all_types]] -def generate_dict(n_rows, n_cols, types=None, nested_uns_types=None): +def generate_dict( + n_rows: int, n_cols: int, types: list[str] | None = None, nested_uns_types: list[str] | None = None +) -> generated_dict_types: + """ + Generates a dictionary with specified types of data. 
+ + Parameters: + n_rows (int): Number of rows for the generated data. + n_cols (int): Number of columns for the generated data. + types (list[str] | None): List of types to generate. If None, defaults to all available types. + nested_uns_types (list[str] | None): List of types for nested 'uns' data. If None, defaults to all available types. + + Returns: + A dictionary containing the generated data. + """ if types is None: # types are all vectors and all matrices types = ( list(scalar_generators.keys()) @@ -52,6 +69,6 @@ def generate_dict(n_rows, n_cols, types=None, nested_uns_types=None): if types: # types is not empty data = {t: generate_type(t, n_rows, n_cols) for t in types} if nested_uns_types: - data["nested"] = generate_dict(n_rows, n_cols, types = nested_uns_types, nested_uns_types=[]) + data["nested"] = generate_dict(n_rows, n_cols, types=nested_uns_types, nested_uns_types=[]) return data diff --git a/src/dummy_anndata/generate_matrix.py b/src/dummy_anndata/generate_matrix.py index 17b4c45..e10d8de 100644 --- a/src/dummy_anndata/generate_matrix.py +++ b/src/dummy_anndata/generate_matrix.py @@ -19,44 +19,35 @@ def int_mtx(n_obs, n_vars): # integer matrices do not support NAs in Python matrix_generators = { "float_matrix": lambda n_obs, n_vars: float_mtx(n_obs, n_vars), - "float_matrix_nas": lambda n_obs, n_vars: float_mtx( - n_obs, n_vars, NAs=True - ), - "float_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix( - float_mtx(n_obs, n_vars) - ), - "float_csparse_nas": lambda n_obs, n_vars: sp.sparse.csc_matrix( - float_mtx(n_obs, n_vars, NAs=True) - ), - "float_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix( - float_mtx(n_obs, n_vars) - ), - "float_rsparse_nas": lambda n_obs, n_vars: sp.sparse.csr_matrix( - float_mtx(n_obs, n_vars, NAs=True) - ), + "float_matrix_nas": lambda n_obs, n_vars: float_mtx(n_obs, n_vars, NAs=True), + "float_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(float_mtx(n_obs, n_vars)), + "float_csparse_nas": lambda n_obs, 
n_vars: sp.sparse.csc_matrix(float_mtx(n_obs, n_vars, NAs=True)), + "float_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(float_mtx(n_obs, n_vars)), + "float_rsparse_nas": lambda n_obs, n_vars: sp.sparse.csr_matrix(float_mtx(n_obs, n_vars, NAs=True)), "integer_matrix": lambda n_obs, n_vars: int_mtx(n_obs, n_vars), - "integer_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix( - int_mtx(n_obs, n_vars) - ), - "integer_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix( - int_mtx(n_obs, n_vars) - ), + "integer_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(int_mtx(n_obs, n_vars)), + "integer_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(int_mtx(n_obs, n_vars)), } +generated_matrix_types = np.ndarray | sp.sparse.csc_matrix | sp.sparse.csr_matrix -def generate_matrix(n_obs, n_vars, matrix_type): +def generate_matrix(n_obs: int, n_vars: int, matrix_type: str) -> generated_matrix_types: """ Generate a matrix of given dimensions and type. - Parameters: + Parameters + ---------- n_obs (int): The number of observations (rows) in the matrix. n_vars (int): The number of variables (columns) in the matrix. matrix_type (str): The type of matrix to generate. - Returns: - The generated matrix, either numpy.ndarray or scipy.sparse.csc_matrix or scipy.sparse.csr_matrix. + Returns + ------- + np.ndarray | sp.sparse.csc_matrix | sp.sparse.csr_matrix: + The generated matrix. - Raises: + Raises + ------ AssertionError: If the matrix_type is unknown. 
""" diff --git a/src/dummy_anndata/generate_vector.py b/src/dummy_anndata/generate_vector.py index 09132c9..2e5bac9 100644 --- a/src/dummy_anndata/generate_vector.py +++ b/src/dummy_anndata/generate_vector.py @@ -1,5 +1,5 @@ -import pandas as pd import numpy as np +import pandas as pd def nullable_integer_array(n): @@ -31,18 +31,10 @@ def missing_values_categorical(n, ordered=True): vector_generators = { - "categorical": lambda n: pd.Categorical( - [["Value1", "Value2"][i % 2] for i in range(n)] - ), - "categorical_ordered": lambda n: pd.Categorical( - [["Value1", "Value2"][i % 2] for i in range(n)], ordered=True - ), - "categorical_missing_values": lambda n: missing_values_categorical( - n, ordered=False - ), - "categorical_ordered_missing_values": lambda n: missing_values_categorical( - n, ordered=True - ), + "categorical": lambda n: pd.Categorical([["Value1", "Value2"][i % 2] for i in range(n)]), + "categorical_ordered": lambda n: pd.Categorical([["Value1", "Value2"][i % 2] for i in range(n)], ordered=True), + "categorical_missing_values": lambda n: missing_values_categorical(n, ordered=False), + "categorical_ordered_missing_values": lambda n: missing_values_categorical(n, ordered=True), "string_array": lambda n: np.array([f"value_{i}" for i in range(n)]), # should we also check a 1d sparse array? We should probably leave it for the matrix generation? "dense_array": lambda n: np.arange(n, dtype=float) + 0.5, @@ -52,21 +44,24 @@ def missing_values_categorical(n, ordered=True): "nullable_boolean_array": nullable_boolean_array, } +generated_vector_types = np.ndarray | pd.Categorical | pd.arrays.IntegerArray | pd.arrays.BooleanArray -def generate_vector(n, vector_type): +def generate_vector(n: int, vector_type: str) -> generated_vector_types: """ - Generate a vector of a specified type. + Generate a vector of a specified type and length. Parameters: - vector_type (str): The type of vector to generate. - n (int): The length of the vector. 
+ n (int): The length of the vector to generate. + vector_type (str): The type of vector to generate. Must be one of the keys in the `vector_generators` dictionary. Returns: - list: The generated vector. + np.ndarray | pd.Categorical | pd.arrays.IntegerArray | pd.arrays.BooleanArray: + A vector of the specified type and length. Raises: - AssertionError: If the vector_type is unknown. + AssertionError: If `vector_type` is not a valid key in `vector_generators`. """ + # check if vector_type is valid assert vector_type in vector_generators, f"Unknown vector type: {vector_type}"