Skip to content

Commit

Permalink
refactor dataset generators
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Sep 20, 2023
1 parent 70948b0 commit c0c64a2
Show file tree
Hide file tree
Showing 11 changed files with 403 additions and 206 deletions.
17 changes: 17 additions & 0 deletions R/generate_dataframe.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#' Generate a dataframe
#'
#' Generate a dataframe with different types of columns
#'
#' @param num_rows Number of rows to generate
#' @param types Types of columns to generate
#'

Check warning on line 7 in R/generate_dataframe.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_dataframe.R,line=7,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' @return A dataframe with the generated columns
#'

Check warning on line 9 in R/generate_dataframe.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_dataframe.R,line=9,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' @examples
#' generate_dataframe(10L)
generate_dataframe <- function(num_rows, types = names(vector_generators)) {
types <- match.arg(types)
data <- lapply(types, generate_vector, n = num_rows)
names(data) <- types
as.data.frame(data)
}

Check warning on line 17 in R/generate_dataframe.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_dataframe.R,line=17,col=2,[trailing_blank_lines_linter] Missing terminal newline.
214 changes: 214 additions & 0 deletions R/generate_dataset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#' Generate a dataset
#'
#' Generate a dataset with different types of columns and layers
#'
#' @param n_obs Number of observations to generate
#' @param n_vars Number of variables to generate
#' @param format Object type to output, one of "list", "SingleCellExperiment",
#' or "Seurat"
#' @param ... Arguments passed to generate_dataset_as_list
#'
#' @return Object containing the generated dataset as defined by `output`
#'

Check warning on line 12 in R/generate_dataset.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_dataset.R,line=12,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' @noRd
#'
#' @examples
#' dummy <- generate_dataset()
generate_dataset <- function(
n_obs = 10L,
n_vars = 20L,
format = c("list", "SingleCellExperiment", "Seurat"),
...
) {
format <- match.arg(format)

fun <- switch(format,
"list" = generate_dataset_as_list,
"SingleCellExperiment" = generate_dataset_as_sce,
"Seurat" = generate_dataset_as_seurat
)

fun(n_obs = n_obs, n_vars = n_vars, ...)
}

#' Dummy data list
#'
#' Generate a dummy dataset as a list
#'
#' @param n_obs Number of observations to generate
#' @param n_vars Number of variables to generate
#'
#' @return A list with the generated dataset
#'

Check warning on line 42 in R/generate_dataset.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_dataset.R,line=42,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' @noRd
generate_dataset_as_list <- function(
n_obs = 10L,
n_vars = 20L,
x_type = names(matrix_generators)[[1]],
layer_types = names(matrix_generators),
obs_types = names(vector_generators),
var_types = names(vector_generators),
obsm_types = c(names(matrix_generators), names(vector_generators)),
varm_types = c(names(matrix_generators), names(vector_generators)),
obsp_types = names(matrix_generators),
varp_types = names(matrix_generators)
) {
# generate X
X <- generate_matrix(n_obs, n_vars, x_type)

# generate layers
layers <- lapply(layer_types, generate_matrix, n_obs = n_obs, n_vars = n_vars)
names(layers) <- layer_types

# generate obs
obs <- generate_dataframe(n_obs, obs_types)

# generate var
var <- generate_dataframe(n_vars, var_types)

# generate obs_names
obs_names <- paste0("cell", seq_len(n_obs))

# generate var_names
var_names <- paste0("gene", seq_len(n_vars))

# generate obsm
obsm <- lapply(obsm_types, function(obsm_type) {
if (obsm_type %in% names(vector_generators)) {
generate_dataframe(n_obs, obsm_type)
} else {
generate_matrix(n_obs, n_vars = 10L, obsm_type)
}
})
names(obsm) <- obsm_types

# generate varm
varm <- lapply(varm_types, function(varm_type) {
if (varm_type %in% names(vector_generators)) {
generate_dataframe(n_vars, varm_type)
} else {
generate_matrix(n_vars, n_obs = 10L, varm_type)
}
})
names(varm) <- varm_types

# generate obsp
obsp <- lapply(obsp_types, generate_matrix, n_obs = n_obs, n_vars = n_obs)
names(obsp) <- obsp_types

# generate varp
varp <- lapply(varp_types, generate_matrix, n_obs = n_vars, n_vars = n_vars)
names(varp) <- varp_types

# generate uns by combining other classes
uns <- list(
integer = 1L,
numeric = 1,
character = "a",
factor = factor("a"),
logical = TRUE,
integer_na = NA_integer_,
numeric_na = NA_real_,
character_na = NA_character_,
factor_na = NA_character_,
logical_na = NA,
list = list(1L, 1, "a", factor("a"), TRUE)
)
vectors_for_uns <- lapply(names(vector_generators), generate_vector, n = 10L)
names(vectors_for_uns) <- paste0("vec_", names(vector_generators))
obsm_for_uns <- obsm
names(obsm_for_uns) <- paste0("obsm_", names(obsm_for_uns))

uns <- c(
uns,
vectors_for_uns,
obsm_for_uns
)

list(
X = X,
obs = obs,
obs_names = obs_names,
obsm = obsm,
obsp = obsp,
var = var,
var_names = var_names,
varm = varm,
varp = varp,
layers = layers,
uns = uns
)
}

#' Dummy SingleCellExperiment
#'
#' Generate a dummy dataset as a SingleCellExperiment object
#'
#' @param ... Parameters passed to `generate_dataset_as_list`
#'
#' @return SingleCellExperiment containing the generated data
#'

Check warning on line 150 in R/generate_dataset.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_dataset.R,line=150,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' @noRd
generate_dataset_as_sce <- function(...) {
if (!requireNamespace("SingleCellExperiment", quietly = TRUE)) {
stop(
"Creating a SingleCellExperiment requires the 'SingleCellExperiment'",
"package to be installed"
)
}

dummy <- generate_dataset_as_list(...)

assays_list <- c(
list(X = dummy$X),
dummy$layers
)
assays_list <- lapply(assays_list, Matrix::t)

sce <- SingleCellExperiment::SingleCellExperiment(
assays = assays_list,
rowData = dummy$var,
colData = dummy$obs
)
colnames(sce) <- dummy$obs_names
rownames(sce) <- dummy$var_names

return(sce)
}

#' Dummy Seurat
#'
#' Generate a dummy dataset as a Seurat object
#'
#' @param ... Parameters passed to `generate_dataset_as_list`
#'
#' @return Seurat containing the generated data
generate_dataset_as_seurat <- function(...) {
if (!requireNamespace("SeuratObject", quietly = TRUE)) {
stop(
"Creating a Seurat requires the 'SeuratObject' package to be installed"
)
}

dummy <- generate_dataset_as_list(...)

X <- t(dummy$layers[["integer_csparse"]])
colnames(X) <- dummy$obs_names
rownames(X) <- dummy$var_names

seurat <- SeuratObject::CreateSeuratObject(X)

X2 <- Matrix::t(dummy$layers[["numeric_csparse"]])
colnames(X2) <- dummy$obs_names
rownames(X2) <- dummy$var_names
seurat <- SeuratObject::SetAssayData(seurat, "data", X2)

X3 <- Matrix::t(dummy$layers[["numeric_matrix"]])
colnames(X3) <- dummy$obs_names
rownames(X3) <- dummy$var_names
seurat <- SeuratObject::SetAssayData(seurat, "scale.data", X3)

seurat <- SeuratObject::AddMetaData(seurat, dummy$obs)

return(seurat)
}
90 changes: 90 additions & 0 deletions R/generate_matrix.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
matrix_generators <- list(
numeric_matrix = function(n_obs, n_vars) {
matrix(runif(n_obs * n_vars), nrow = n_obs, ncol = n_vars)
},
numeric_dense = function(n_obs, n_vars) {
m <- matrix(runif(n_obs * n_vars), nrow = n_obs, ncol = n_vars)
as(m, "denseMatrix")
},
numeric_csparse = function(n_obs, n_vars) {
m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
as(m, "CsparseMatrix")
},
numeric_rsparse = function(n_obs, n_vars) {
m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
as(m, "RsparseMatrix")
},
numeric_matrix_with_nas = function(n_obs, n_vars) {
m <- matrix(runif(n_obs * n_vars), nrow = n_obs, ncol = n_vars)
m[sample.int(n_obs * n_vars, n_obs * n_vars * .1)] <- NA_real_
m
},
numeric_dense_with_nas = function(n_obs, n_vars) {
m <- matrix(runif(n_obs * n_vars), nrow = n_obs, ncol = n_vars)
m[sample.int(n_obs * n_vars, n_obs * n_vars * .1)] <- NA_real_
as(m, "denseMatrix")
},
numeric_csparse_with_nas = function(n_obs, n_vars) {
m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
m[sample.int(n_obs * n_vars, n_obs * n_vars * .1)] <- NA_real_
as(m, "CsparseMatrix")
},
numeric_rsparse_with_nas = function(n_obs, n_vars) {
m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
m[sample.int(n_obs * n_vars, n_obs * n_vars * .1)] <- NA_real_
as(m, "RsparseMatrix")
},
integer_matrix = function(n_obs, n_vars) {
matrix(sample.int(100L, n_obs * n_vars, replace = TRUE), nrow = n_obs, ncol = n_vars)
},
integer_dense = function(n_obs, n_vars) {
m <- matrix(sample.int(100L, n_obs * n_vars, replace = TRUE), nrow = n_obs, ncol = n_vars)
as(m, "denseMatrix")
},
integer_csparse = function(n_obs, n_vars) {
m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
as(m, "CsparseMatrix")
},
integer_rsparse = function(n_obs, n_vars) {
m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
as(m, "RsparseMatrix")
},
integer_matrix_with_nas = function(n_obs, n_vars) {
m <- matrix(sample.int(100L, n_obs * n_vars, replace = TRUE), nrow = n_obs, ncol = n_vars)
m[sample.int(n_obs * n_vars, n_obs * n_vars * .1)] <- NA_integer_
m
},
integer_dense_with_nas = function(n_obs, n_vars) {
m <- matrix(sample.int(100L, n_obs * n_vars, replace = TRUE), nrow = n_obs, ncol = n_vars)
m[sample.int(n_obs * n_vars, n_obs * n_vars * .1)] <- NA_integer_
as(m, "denseMatrix")
},
integer_csparse_with_nas = function(n_obs, n_vars) {
m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
m[sample.int(n_obs * n_vars, n_obs * n_vars * .1)] <- NA_integer_
as(m, "CsparseMatrix")
},
integer_rsparse_with_nas = function(n_obs, n_vars) {
m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
m[sample.int(n_obs * n_vars, n_obs * n_vars * .1)] <- NA_integer_
as(m, "RsparseMatrix")
}
)

#' Generate a matrix
#'

Check warning on line 75 in R/generate_matrix.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_matrix.R,line=75,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' Generate a matrix of a given type
#'

Check warning on line 77 in R/generate_matrix.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_matrix.R,line=77,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' @param n_obs Number of observations to generate
#' @param n_vars Number of variables to generate
#'

Check warning on line 80 in R/generate_matrix.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_matrix.R,line=80,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' @return A matrix of the given type
#'

Check warning on line 82 in R/generate_matrix.R

View workflow job for this annotation

GitHub Actions / lint

file=R/generate_matrix.R,line=82,col=3,[trailing_whitespace_linter] Trailing whitespace is superfluous.
#' @noRd
#'
#' @examples
#' generate_matrix(10L, 20L)
generate_matrix <- function(n_obs, n_vars, type = names(matrix_generators)) {
type <- match.arg(type)
matrix_generators[[type]](n_obs, n_vars)
}
56 changes: 56 additions & 0 deletions R/generate_vector.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
vector_generators <- list(
character = function(n) paste0("value", seq_len(n)),
integer = function(n) seq_len(n),
factor = function(n) factor(paste0("value", seq_len(n))),
factor_ordered = function(n) factor(paste0("value", seq_len(n)), ordered = TRUE),
logical = function(n) sample(c(TRUE, FALSE), n, replace = TRUE),
numeric = function(n) runif(n),
character_with_nas = function(n, pct = .5) {
x <- paste0("value", seq_len(n))
x[sample.int(n, pct * n)] <- NA_character_
x
},
integer_with_nas = function(n, pct = .5) {
x <- seq_len(n)
x[sample.int(n, pct * n)] <- NA_integer_
x
},
factor_with_nas = function(n, pct = .5) {
x <- factor(paste0("value", seq_len(n)))
x[sample.int(n, pct * n)] <- NA_character_
x
},
factor_ordered_with_nas = function(n, pct = .5) {
x <- factor(paste0("value", seq_len(n)), ordered = TRUE)
x[sample.int(n, pct * n)] <- NA_character_
x
},
logical_with_nas = function(n, pct = .5) {
x <- sample(c(TRUE, FALSE), n, replace = TRUE)
x[sample.int(n, pct * n)] <- NA
x
},
numeric_with_nas = function(n, pct = .5) {
x <- runif(n)
x[sample.int(n, pct * n)] <- NA_real_
x
}
)

#' Generate a vector
#'
#' Generate a vector of a given type
#'
#' @param n Number of elements to generate
#' @param type Type of vector to generate
#'
#' @return A vector of the given type
#'
#' @noRd
#'
#' @examples
#' generate_vector(10L)
generate_vector <- function(n, type = names(vector_generators)) {
type <- match.arg(type)
vector_generators[[type]](n)
}
Loading

0 comments on commit c0c64a2

Please sign in to comment.