Skip to content

Commit

Permalink
Add generator scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
LouiseDck committed Sep 19, 2024
1 parent 0f5ec6a commit e15fcf0
Show file tree
Hide file tree
Showing 12 changed files with 326 additions and 103 deletions.
8 changes: 5 additions & 3 deletions src/dummy_anndata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from importlib.metadata import version

from . import pl, pp, tl

__all__ = ["pl", "pp", "tl"]
from .generate_dataframe import generate_dataframe
from .generate_dataset import generate_dataset
from .generate_dict import generate_scalar, generate_type, generate_dict
from .generate_matrix import generate_matrix
from .generate_vector import generate_vector

__version__ = version("dummy-anndata")
23 changes: 23 additions & 0 deletions src/dummy_anndata/generate_dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd
from generate_vector import vector_generators


def generate_dataframe(n_rows, types=None):
    """
    Build a pandas DataFrame with one generated column per requested type.

    Parameters:
        n_rows (int): Number of rows; handed to every vector generator.
        types (list, optional): Keys of ``vector_generators`` to include.
            Each key doubles as the column name. When omitted, every
            registered vector generator contributes a column.

    Returns:
        pandas.DataFrame: The assembled DataFrame.

    Raises:
        KeyError: If an entry of ``types`` is not a key of ``vector_generators``.
    """
    selected = list(vector_generators.keys()) if types is None else types

    columns = {}
    for column_name in selected:
        columns[column_name] = vector_generators[column_name](n_rows)
    return pd.DataFrame(columns)
110 changes: 110 additions & 0 deletions src/dummy_anndata/generate_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import anndata as ad

from generate_matrix import matrix_generators
from generate_vector import vector_generators
from generate_dataframe import generate_dataframe
from generate_dict import scalar_generators, generate_dict


def generate_dataset(
    n_obs=10,
    n_vars=20,
    x_type="generate_integer_matrix",
    layer_types=None,
    obs_types=None,
    var_types=None,
    obsm_types=None,
    varm_types=None,
    obsp_types=None,
    varp_types=None,
    uns_types=None,
):
    """
    Assemble an AnnData object whose slots are filled by the named generators.

    Parameters:
        n_obs (int): Number of observations (rows of ``X``).
        n_vars (int): Number of variables (columns of ``X``).
        x_type (str): Key of ``matrix_generators`` used to build ``X``.
        layer_types (list, optional): Matrix generator keys, one layer each.
        obs_types (list, optional): Vector generator keys for the obs columns.
        var_types (list, optional): Vector generator keys for the var columns.
        obsm_types (list, optional): Matrix or vector generator keys for obsm.
        varm_types (list, optional): Matrix or vector generator keys for varm.
        obsp_types (list, optional): Matrix generator keys for obsp.
        varp_types (list, optional): Matrix generator keys for varp.
        uns_types (list, optional): Type names forwarded to ``generate_dict``.

        Every ``*_types`` argument left as None falls back to all generators
        valid for that slot. For ``uns_types`` that is every vector, matrix
        and scalar key.

    Returns:
        anndata.AnnData: The generated dataset.

    Raises:
        AssertionError: If ``x_type`` or any entry of the validated
            ``*_types`` lists is not a known generator key. ``uns_types``
            is not validated.
    """

    def _all_known(requested, *registries):
        # None means "use the defaults", which are valid by construction.
        if requested is None:
            return True
        return all(any(t in registry for registry in registries) for t in requested)

    def _square_or_vector(requested, size):
        # Matrix generators get a square (size x size) shape, vector
        # generators get length ``size``. Names in neither registry are skipped.
        entries = {}
        for t in requested:
            if t in matrix_generators:
                entries[t] = matrix_generators[t](size, size)
            elif t in vector_generators:
                entries[t] = vector_generators[t](size)
        return entries

    assert x_type in matrix_generators, f"Unknown matrix type: {x_type}"
    assert _all_known(layer_types, matrix_generators), "Unknown layer type"
    assert _all_known(obs_types, vector_generators), "Unknown obs type"
    assert _all_known(var_types, vector_generators), "Unknown var type"
    assert _all_known(
        obsm_types, matrix_generators, vector_generators
    ), "Unknown obsm type"
    assert _all_known(
        varm_types, matrix_generators, vector_generators
    ), "Unknown varm type"
    assert _all_known(obsp_types, matrix_generators), "Unknown obsp type"
    assert _all_known(varp_types, matrix_generators), "Unknown varp type"
    # TODO uns types

    matrix_names = list(matrix_generators.keys())
    vector_names = list(vector_generators.keys())

    # Fill in the defaults; each slot gets its own fresh list.
    if layer_types is None:
        layer_types = list(matrix_names)
    if obs_types is None:
        obs_types = list(vector_names)
    if var_types is None:
        var_types = list(vector_names)
    if obsm_types is None:
        obsm_types = matrix_names + vector_names
    if varm_types is None:
        varm_types = matrix_names + vector_names
    if obsp_types is None:
        obsp_types = list(matrix_names)
    if varp_types is None:
        varp_types = list(matrix_names)
    if uns_types is None:
        uns_types = vector_names + matrix_names + list(scalar_generators.keys())

    X = matrix_generators[x_type](n_obs, n_vars)
    layers = {}
    for t in layer_types:
        layers[t] = matrix_generators[t](n_obs, n_vars)

    obs = generate_dataframe(n_obs, obs_types)
    var = generate_dataframe(n_vars, var_types)
    obs.index = [f"Cell{i:03d}" for i in range(n_obs)]
    var.index = [f"Gene{i:03d}" for i in range(n_vars)]

    obsm = _square_or_vector(obsm_types, n_obs)
    varm = _square_or_vector(varm_types, n_vars)

    obsp = {}
    for t in obsp_types:
        obsp[t] = matrix_generators[t](n_obs, n_obs)
    varp = {}
    for t in varp_types:
        varp[t] = matrix_generators[t](n_vars, n_vars)

    uns = generate_dict(n_obs, n_vars, uns_types)

    return ad.AnnData(
        X,
        layers=layers,
        obs=obs,
        var=var,
        obsm=obsm,
        varm=varm,
        obsp=obsp,
        varp=varp,
        uns=uns,
    )
50 changes: 50 additions & 0 deletions src/dummy_anndata/generate_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from generate_vector import vector_generators
from generate_matrix import matrix_generators

import pandas as pd
import numpy as np

# Fixed example value for each supported scalar type, keyed by type name.
# The values are constants, not callables; generate_scalar returns them as-is.
scalar_generators = {
    "string": "version",
    "char": "a",
    "integer": 1,
    "float": 1.0,
    "boolean": True,
    "none": None,
    # pd.NA is deliberately left out: it cannot be written to an h5 group.
    # "NA": pd.NA,
    "nan": np.nan,
}


def generate_scalar(scalar_type):
    """
    Return the example value for a scalar type name.

    Parameters:
        scalar_type (str): Either a key of ``scalar_generators`` or
            ``"scalar_<name>"``, where ``<name>`` is a key of
            ``vector_generators``.

    Returns:
        The constant stored in ``scalar_generators``, or, for the
        ``"scalar_"`` form, the output of the named vector generator
        called with length 1.

    Raises:
        KeyError: If the name is not found in the relevant registry.
    """
    prefix = "scalar_"
    if scalar_type[: len(prefix)] != prefix:
        return scalar_generators[scalar_type]

    # Drop the prefix and ask the matching vector generator for one element.
    vector_name = scalar_type[len(prefix):]
    return vector_generators[vector_name](1)


def generate_type(type, n_rows, n_cols):
    """
    Generate a value for a type name, whichever registry it belongs to.

    Lookup order is scalar (including the ``"scalar_"`` prefixed form),
    then vector, then matrix.

    Parameters:
        type (str): Name of the scalar, vector or matrix type.
        n_rows (int): Vector length, or number of matrix rows.
        n_cols (int): Number of matrix columns; unused for scalars and vectors.

    Returns:
        The generated value, or None when the name matches no registry.
    """
    is_scalar = type in scalar_generators or type[:7] == "scalar_"
    if is_scalar:
        generated = generate_scalar(type)
    elif type in vector_generators:
        generated = vector_generators[type](n_rows)
    elif type in matrix_generators:
        generated = matrix_generators[type](n_rows, n_cols)
    else:
        generated = None
    return generated


def generate_dict(n_rows, n_cols, types=None, nested=True):
    """
    Build a dict mapping each requested type name to a generated value.

    Parameters:
        n_rows (int): Vector length, or number of matrix rows.
        n_cols (int): Number of matrix columns.
        types (list, optional): Type names to generate. When None, every
            scalar, every ``"scalar_<vector>"`` variant, every vector and
            every matrix type is included, in that order.
        nested (bool): When true, add a ``"nested"`` entry holding a second,
            non-nested dict generated from the same ``types``.

    Returns:
        dict: Type name -> generated value, plus ``"nested"`` if requested.
    """
    if types is None:
        types = [
            *scalar_generators.keys(),
            *(f"scalar_{t}" for t in vector_generators.keys()),
            *vector_generators.keys(),
            *matrix_generators.keys(),
        ]

    data = {}
    for type_name in types:
        data[type_name] = generate_type(type_name, n_rows, n_cols)

    if nested:
        # Recurse exactly one level: the inner call switches nesting off.
        data["nested"] = generate_dict(n_rows, n_cols, types, False)

    return data
65 changes: 65 additions & 0 deletions src/dummy_anndata/generate_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import numpy as np
import scipy as sp


def float_mtx(n_obs, n_vars, NAs=False):
    """
    Return an (n_obs, n_vars) float matrix holding 0.5, 1.5, 2.5, ... row by row.

    Parameters:
        n_obs (int): Number of rows.
        n_vars (int): Number of columns.
        NAs (bool): When true, the top-left entry is replaced by NaN.

    Returns:
        numpy.ndarray: The generated matrix.
    """
    counter = np.arange(n_obs * n_vars, dtype=float).reshape(n_obs, n_vars)
    # The 0.5 offset makes an accidental float -> int conversion easy to spot.
    mtx = counter + 0.5
    if NAs:
        # NumPy arrays cannot hold pd.NA, so NaN is the missing-value marker.
        mtx[0, 0] = np.nan
    return mtx


def int_mtx(n_obs, n_vars):
    """
    Return an (n_obs, n_vars) integer matrix holding 0, 1, 2, ... row by row.

    Parameters:
        n_obs (int): Number of rows.
        n_vars (int): Number of columns.

    Returns:
        numpy.ndarray: The generated matrix.
    """
    return np.arange(n_obs * n_vars).reshape(n_obs, n_vars)


# Registry of matrix generators: name -> callable(n_obs, n_vars).
# Dense entries return the output of float_mtx / int_mtx directly; the
# "csparse" / "rsparse" entries wrap that output in a scipy csc_matrix /
# csr_matrix respectively. The "_nas" variants pass NAs=True to float_mtx.
# There are no integer "_nas" variants: integer matrices do not support NAs
# in Python.
matrix_generators = {
    "generate_float_matrix": lambda n_obs, n_vars: float_mtx(n_obs, n_vars),
    "generate_float_matrix_nas": lambda n_obs, n_vars: float_mtx(
        n_obs, n_vars, NAs=True
    ),
    "generate_float_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(
        float_mtx(n_obs, n_vars)
    ),
    "generate_float_csparse_nas": lambda n_obs, n_vars: sp.sparse.csc_matrix(
        float_mtx(n_obs, n_vars, NAs=True)
    ),
    "generate_float_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(
        float_mtx(n_obs, n_vars)
    ),
    "generate_float_rsparse_nas": lambda n_obs, n_vars: sp.sparse.csr_matrix(
        float_mtx(n_obs, n_vars, NAs=True)
    ),
    "generate_integer_matrix": lambda n_obs, n_vars: int_mtx(n_obs, n_vars),
    "generate_integer_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(
        int_mtx(n_obs, n_vars)
    ),
    "generate_integer_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(
        int_mtx(n_obs, n_vars)
    ),
}


def generate_matrix(n_obs, n_vars, matrix_type):
    """
    Generate a matrix of the given shape using a named generator.

    Parameters:
        n_obs (int): Number of observations (rows).
        n_vars (int): Number of variables (columns).
        matrix_type (str): Key of ``matrix_generators`` selecting the kind
            of matrix to build.

    Returns:
        The generated matrix: a numpy.ndarray, scipy.sparse.csc_matrix or
        scipy.sparse.csr_matrix depending on ``matrix_type``.

    Raises:
        AssertionError: If ``matrix_type`` is not a known generator key.
    """
    assert matrix_type in matrix_generators, f"Unknown matrix type: {matrix_type}"

    build = matrix_generators[matrix_type]
    return build(n_obs, n_vars)
73 changes: 73 additions & 0 deletions src/dummy_anndata/generate_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pandas as pd
import numpy as np


def nullable_integer_array(n):
    """
    Return a pandas ``Int64`` array of length ``n`` whose first entry is missing.

    The remaining entries are 1, 2, ..., n - 1.

    Parameters:
        n (int): Length of the array; must be positive.

    Returns:
        A pandas nullable-integer array.

    Raises:
        AssertionError: If ``n`` is not greater than zero.
    """
    assert n > 0, "an integer array must be at least one value"
    # np.nan, pd.NA and None should all end up as null values, masked in the
    # h5ad file; np.nan is the variant exercised here.
    values = [np.nan, *range(1, n)]
    return pd.array(values, dtype="Int64")


def nullable_boolean_array(n):
    """
    Return a pandas ``boolean`` array of length ``n`` whose first entry is missing.

    Entries alternate True (even positions) and False (odd positions) before
    position 0 is overwritten with ``pd.NA``.

    Parameters:
        n (int): Length of the array; must be positive.

    Returns:
        A pandas nullable-boolean array.

    Raises:
        AssertionError: If ``n`` is not greater than zero.
    """
    assert n > 0, "a boolean array must be at least one value"
    flags = [position % 2 == 0 for position in range(n)]
    result = pd.array(flags, dtype="boolean")
    # np.nan, pd.NA and None should all end up as null values, masked in the
    # h5ad file; pd.NA is the variant exercised here.
    result[0] = pd.NA
    return result


def missing_values_categorical(n, ordered=True):
    """
    Return a two-category Categorical of length ``n`` whose first entry is missing.

    Values alternate "Value1" / "Value2" before position 0 is overwritten
    with NaN.

    Parameters:
        n (int): Length of the categorical; must be positive.
        ordered (bool): Whether the categories are ordered.

    Returns:
        pandas.Categorical: The generated categorical.

    Raises:
        AssertionError: If ``n`` is not greater than zero.
    """
    assert n > 0, "a categorical must be at least one value"
    labels = ["Value1", "Value2"]
    result = pd.Categorical(
        [labels[position % 2] for position in range(n)],
        categories=labels,
        ordered=ordered,
    )
    # Missing values should all end up as code -1 in the h5ad file.
    result[0] = np.nan
    return result


# Registry of vector generators: name -> callable(n) returning ``n`` values.
# The plain categorical entries alternate "Value1" / "Value2"; the
# "*_missing_values" and "nullable_*" entries delegate to the helper functions,
# which set position 0 to a missing value and require n > 0.
vector_generators = {
    "categorical": lambda n: pd.Categorical(
        [["Value1", "Value2"][i % 2] for i in range(n)]
    ),
    "categorical_ordered": lambda n: pd.Categorical(
        [["Value1", "Value2"][i % 2] for i in range(n)], ordered=True
    ),
    "categorical_missing_values": lambda n: missing_values_categorical(
        n, ordered=False
    ),
    "categorical_ordered_missing_values": lambda n: missing_values_categorical(
        n, ordered=True
    ),
    # An explicit dtype keeps an empty result a string array; without it NumPy
    # infers float64 from an empty list.
    "string_array": lambda n: np.array(
        [f"value_{i}" for i in range(n)], dtype=str
    ),
    # should we also check a 1d sparse array? We should probably leave it for the matrix generation?
    # The 0.5 offset makes an accidental float -> int conversion easy to spot.
    "dense_array": lambda n: np.arange(n, dtype=float) + 0.5,
    # np.arange yields an integer dtype even for n == 0, unlike np.array([]).
    "integer_array": lambda n: np.arange(n),
    "nullable_integer_array": nullable_integer_array,
    # True at even positions, False at odd ones; stays boolean for n == 0.
    "boolean_array": lambda n: np.arange(n) % 2 == 0,
    "nullable_boolean_array": nullable_boolean_array,
}


def generate_vector(n, vector_type):
    """
    Generate a vector of the given length using a named generator.

    Parameters:
        n (int): The length of the vector.
        vector_type (str): Key of ``vector_generators`` selecting the kind
            of vector to build.

    Returns:
        The generated vector; its concrete type (NumPy array, pandas array
        or pandas Categorical) depends on ``vector_type``.

    Raises:
        AssertionError: If ``vector_type`` is not a known generator key.
    """
    # Reject unknown names before doing the lookup.
    assert vector_type in vector_generators, f"Unknown vector type: {vector_type}"

    build = vector_generators[vector_type]
    return build(n)
1 change: 0 additions & 1 deletion src/dummy_anndata/pl/__init__.py

This file was deleted.

Loading

0 comments on commit e15fcf0

Please sign in to comment.