Skip to content

Commit

Permalink
restructure api
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Aug 29, 2024
1 parent c545462 commit 86729ec
Show file tree
Hide file tree
Showing 11 changed files with 110 additions and 43 deletions.
51 changes: 51 additions & 0 deletions src/api/comp_data_loader_sc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

info:
type: data_loader
type_info:
label: SC Data Loader
summary: A component to download and store single-cell data.
description: |
This component downloads data from a resource and stores it as an h5ad file.
argument_groups:
- name: Outputs
arguments:
- name: "--output"
__merge__: file_common_singlecell.yaml
direction: output
required: true
- name: Metadata
arguments:
- type: string
name: --dataset_id
description: "A unique identifier for the dataset"
required: true
- name: --dataset_name
type: string
description: Nicely formatted name.
required: true
- type: string
name: --dataset_url
description: Link to the original source of the dataset.
required: false
- name: --dataset_reference
type: string
description: BibTeX reference of the paper in which the dataset was published.
required: false
- name: --dataset_summary
type: string
description: Short description of the dataset.
required: true
- name: --dataset_description
type: string
description: Long description of the dataset.
required: true
- name: --dataset_organism
type: string
description: The organism of the sample in the dataset.
required: false
# test_resources:
# - path: /resources_test/common/pancreas
# dest: resources_test/common/pancreas
# - type: python_script
# path: /common/component_tests/run_and_check_output.py

Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@

info:
type: data_loader
type_info:
label: Data loader
summary: A data loader component which downloads data from a resource and stores it as a zarr file.
label: iST Data Loader
summary: A component to download and store iST data.
description: |
TODO: fill in
This component downloads data from a resource and stores it as a zarr file.
argument_groups:
- name: Outputs
arguments:
- name: "--output"
__merge__: file_spatialdata_raw.yaml
__merge__: file_common_spatialdata.yaml
direction: output
required: true
- name: Metadata
Expand Down
24 changes: 16 additions & 8 deletions src/api/comp_data_preprocessor.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,30 @@

info:
type: data_processor
type_info:
label: Data preprocessor
summary: A data loader component which downloads data from a resource and stores it as a zarr file.
summary: Preprocess a common dataset for the benchmark.
description: |
TODO: fill in
This component processes a common single-cell and a common spatial transcriptomics
dataset for the benchmark.
arguments:
- name: "--input"
__merge__: file_spatialdata_raw.yaml
- name: "--input_sp"
__merge__: file_common_spatialdata.yaml
direction: input
required: true
- name: "--output"
__merge__: file_spatialdata_processed.yaml
- name: "--input_sc"
__merge__: file_common_singlecell.yaml
direction: input
required: true
- name: "--output_sp"
__merge__: file_spatialdata.yaml
direction: output
required: true
- name: "--output_sc"
__merge__: file_singlecell.yaml
direction: output
required: true
# test_resources:
# - path: /resources_test/common/pancreas
# dest: resources_test/common/pancreas
# - type: python_script
# path: /common/component_tests/run_and_check_output.py

Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
type: file
example: "resources_test/common/pancreas/raw.h5ad" #TODO: change this to abc atlas (crop)?
example: "resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
label: "Raw SC Dataset"
summary: An unprocessed dataset as output by a dataset loader.
description: |
This dataset contains raw counts and metadata as output by a dataset loader.
The format of this file is mainly derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
info:
label: "Raw scRNA-seq"
summary: An unprocessed dataset as output by a dataset loader.
format:
variables:
- name: cell_type_suffix
type: string
description: Suffix for specific cell type annotations, e.g. to indicate level of granularity
required: false
description: |
This dataset contains raw counts and metadata as output by a dataset loader.
The format of this file is mainly derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
slots:
layers:
- type: integer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@

type: file
example: "resources_test/common/2023_10x_mouse_brain_xenium/dataset.zarr"
label: "Raw Dataset"
label: "Raw Spatial Dataset"
summary: An unprocessed spatial imaging dataset stored as a zarr file.
description: |
This dataset contains raw images, labels, points, shapes, and tables as output by a dataset loader.
info:
format:
type: spatialdata_zarr
Expand Down
7 changes: 7 additions & 0 deletions src/api/file_singlecell.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
type: file
example: "resources_test/preprocessing_imagingbased_st/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
label: "SC Dataset"
summary: A single-cell reference dataset, preprocessed for this benchmark.
description: |
This dataset contains preprocessed counts and metadata for single-cell RNA-seq data.
__merge__: file_common_singlecell.yaml
7 changes: 7 additions & 0 deletions src/api/file_spatialdata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
type: file
example: "resources_test/preprocessing_imagingbased_st/2023_10x_mouse_brain_xenium/dataset.zarr"
label: "Spatial Dataset"
summary: A spatial transcriptomics dataset, preprocessed for this benchmark.
description: |
This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data.
__merge__: file_common_spatialdata.yaml
2 changes: 1 addition & 1 deletion src/data_loaders/download_10x_xenium/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__merge__: ../../api/comp_data_loader.yaml
__merge__: ../../api/comp_data_loader_sp.yaml
name: download_10x_xenium
namespace: data_loaders

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# __merge__: ../../api/comp_data_loader.yaml # TODO: loader for scrnaseq h5ad instead of spatialdata zarr
__merge__: ../../api/comp_data_loader_sc.yaml
name: download_allen_brain_cell_atlas
namespace: data_loaders

Expand Down
9 changes: 2 additions & 7 deletions src/data_loaders/download_allen_brain_cell_atlas/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,8 @@
adata.var = adata.var.rename(columns={"gene_symbol":"feature_name"})

# Uns
adata.uns["dataset_id"] = "allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2"
adata.uns["dataset_name"] = "ABCA Mouse Brain scRNAseq"
adata.uns["dataset_url"] = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717"
adata.uns["dataset_reference"] = "10.1038/s41586-023-06812-z"
adata.uns["dataset_summary"] = "A high-resolution scRNAseq atlas of cell types in the whole mouse brain"
adata.uns["dataset_description"] = "See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset."
adata.uns["dataset_organism"] = "Mus musculus"
for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
adata.uns[key] = par[key]

# Write data
adata.write_h5ad(par["output"])
24 changes: 10 additions & 14 deletions src/workflows/process_datasets/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,22 @@ namespace: workflows
argument_groups:
- name: Inputs
arguments:
- name: "--input"
__merge__: /src/api/file_common_dataset.yaml
- name: "--input_sp"
__merge__: /src/api/file_common_spatialdata.yaml
required: true
direction: input
- name: "--input_sc"
__merge__: /src/api/file_common_singlecell.yaml
required: true
direction: input
- name: Outputs
arguments:
- name: "--output_train"
__merge__: /src/api/file_train_h5ad.yaml
required: true
direction: output
- name: "--output_test"
__merge__: /src/api/file_test_h5ad.yaml
- name: "--output_sp"
__merge__: /src/api/file_spatialdata.yaml
required: true
direction: output
- name: "--output_solution"
__merge__: /src/api/file_solution.yaml
- name: "--output_sc"
__merge__: /src/api/file_singlecell.yaml
required: true
direction: output

Expand All @@ -30,10 +30,6 @@ resources:
- path: /common/nextflow_helpers/helper.nf

dependencies:
- name: common/check_dataset_schema
repository: openproblems-v2
- name: common/extract_metadata
repository: openproblems-v2
- name: data_processors/process_dataset

runners:
Expand Down

0 comments on commit 86729ec

Please sign in to comment.