restructure api

openproblems-bio · Aug 29, 2024 · 86729ec · 86729ec
1 parent c545462
commit 86729ec
Show file tree

Hide file tree

Showing 11 changed files with 110 additions and 43 deletions.
diff --git a/src/api/comp_data_loader_sc.yaml b/src/api/comp_data_loader_sc.yaml
@@ -0,0 +1,51 @@
+# API spec for single-cell data loader components: one output file plus dataset metadata.
+info:
+  type: data_loader
+  type_info:
+    label: SC Data Loader
+    summary: A component to download and store single-cell data.
+    description: |
+      This component downloads data from a resource and stores it as a h5ad file.
+argument_groups:
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        __merge__: file_common_singlecell.yaml  # merges in the common single-cell file spec
+        direction: output
+        required: true
+  - name: Metadata
+    arguments:
+      - name: "--dataset_id"
+        type: string
+        description: A unique identifier for the dataset
+        required: true
+      - name: "--dataset_name"
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - name: "--dataset_url"
+        type: string
+        description: Link to the original source of the dataset.
+        required: false
+      - name: "--dataset_reference"
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: "--dataset_summary"
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: "--dataset_description"
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: "--dataset_organism"
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+# test_resources:
+#   - path: /resources_test/common/pancreas
+#     dest: resources_test/common/pancreas
+#   - type: python_script
+#     path: /common/component_tests/run_and_check_output.py
+
diff --git a/src/api/comp_data_loader.yaml → src/api/comp_data_loader_sp.yaml b/src/api/comp_data_loader.yaml → src/api/comp_data_loader_sp.yaml
@@ -1,15 +1,16 @@
 
 info:
+  type: data_loader
   type_info:
-    label: Data loader
-    summary: A data loader component which downloads data from a resource and stores it as a zarr file.
+    label: iST Data Loader
+    summary: A component to download and store iST data.
     description: |
-      TODO: fill in
+      This component downloads data from a resource and stores it as a zarr file.
 argument_groups:
   - name: Outputs
     arguments:
     - name: "--output"
-      __merge__: file_spatialdata_raw.yaml
+      __merge__: file_common_spatialdata.yaml
       direction: output
       required: true
   - name: Metadata

diff --git a/src/api/comp_data_preprocessor.yaml b/src/api/comp_data_preprocessor.yaml
@@ -1,22 +1,30 @@
-
 info:
+  type: data_processor
   type_info:
     label: Data preprocessor
-    summary: A data loader component which downloads data from a resource and stores it as a zarr file.
+    summary: Preprocess a common dataset for the benchmark.
     description: |
-      TODO: fill in
+      This component processes a common single-cell and a common spatial transcriptomics
+      dataset for the benchmark.
   arguments:
-  - name: "--input"
-    __merge__: file_spatialdata_raw.yaml
+  - name: "--input_sp"
+    __merge__: file_common_spatialdata.yaml
     direction: input
     required: true
-  - name: "--output"
-    __merge__: file_spatialdata_processed.yaml
+  - name: "--input_sc"
+    __merge__: file_common_singlecell.yaml
+    direction: input
+    required: true
+  - name: "--output_sp"
+    __merge__: file_spatialdata.yaml
+    direction: output
+    required: true
+  - name: "--output_sc"
+    __merge__: file_singlecell.yaml
     direction: output
     required: true
 # test_resources:
 #   - path: /resources_test/common/pancreas
 #     dest: resources_test/common/pancreas
 #   - type: python_script
 #     path: /common/component_tests/run_and_check_output.py
-
diff --git a/src/api/file_singlecell_raw.yaml → src/api/file_common_singlecell.yaml b/src/api/file_singlecell_raw.yaml → src/api/file_common_singlecell.yaml
@@ -1,18 +1,18 @@
 type: file
-example: "resources_test/common/pancreas/raw.h5ad" #TODO: change this to abc atlas (crop)?
+example: "resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
+label: "Raw SC Dataset"
+summary: An unprocessed dataset as output by a dataset loader.
+description: |
+  This dataset contains raw counts and metadata as output by a dataset loader.
+
+  The format of this file is mainly derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
 info:
-  label: "Raw scRNA-seq"
-  summary: An unprocessed dataset as output by a dataset loader.
   format:
     variables:
       - name: cell_type_suffix
         type: string
         description: Suffix for specific cell type annotations, e.g. to indicate level of granularity
         required: false
-  description: |
-    This dataset contains raw counts and metadata as output by a dataset loader.
-
-    The format of this file is mainly derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
   slots:
     layers: 
       - type: integer

diff --git a/src/api/file_spatialdata_raw.yaml → src/api/file_common_spatialdata.yaml b/src/api/file_spatialdata_raw.yaml → src/api/file_common_spatialdata.yaml
@@ -23,8 +23,10 @@
 
 type: file
 example: "resources_test/common/2023_10x_mouse_brain_xenium/dataset.zarr"
-label: "Raw Dataset"
+label: "Raw Spatial Dataset"
 summary: An unprocessed spatial imaging dataset stored as a zarr file.
+description: |
+  This dataset contains raw images, labels, points, shapes, and tables as output by a dataset loader.
 info:
   format:
     type: spatialdata_zarr

diff --git a/src/api/file_singlecell.yaml b/src/api/file_singlecell.yaml
@@ -0,0 +1,7 @@
+__merge__: file_common_singlecell.yaml  # merge in the common single-cell file spec
+type: file
+label: SC Dataset
+summary: A single-cell reference dataset, preprocessed for this benchmark.
+description: |
+  This dataset contains preprocessed counts and metadata for single-cell RNA-seq data.
+example: resources_test/preprocessing_imagingbased_st/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad
diff --git a/src/api/file_spatialdata.yaml b/src/api/file_spatialdata.yaml
@@ -0,0 +1,7 @@
+__merge__: file_common_spatialdata.yaml  # merge in the common spatial file spec
+type: file
+label: Spatial Dataset
+summary: A spatial transcriptomics dataset, preprocessed for this benchmark.
+description: |
+  This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data.
+example: resources_test/preprocessing_imagingbased_st/2023_10x_mouse_brain_xenium/dataset.zarr
diff --git a/src/data_loaders/download_10x_xenium/config.vsh.yaml b/src/data_loaders/download_10x_xenium/config.vsh.yaml
@@ -1,4 +1,4 @@
-__merge__: ../../api/comp_data_loader.yaml
+__merge__: ../../api/comp_data_loader_sp.yaml
 name: download_10x_xenium
 namespace: data_loaders
 

diff --git a/src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml b/src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml
@@ -1,4 +1,4 @@
-# __merge__: ../../api/comp_data_loader.yaml # TODO: loader for scrnaseq h5ad instead of spatialdata zarr 
+__merge__: ../../api/comp_data_loader_sc.yaml
 name: download_allen_brain_cell_atlas
 namespace: data_loaders
 

diff --git a/src/data_loaders/download_allen_brain_cell_atlas/script.py b/src/data_loaders/download_allen_brain_cell_atlas/script.py
@@ -116,13 +116,8 @@
 adata.var = adata.var.rename(columns={"gene_symbol":"feature_name"})
 
 # Uns
-adata.uns["dataset_id"] = "allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2"
-adata.uns["dataset_name"] = "ABCA Mouse Brain scRNAseq"
-adata.uns["dataset_url"] = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717"
-adata.uns["dataset_reference"] = "10.1038/s41586-023-06812-z"
-adata.uns["dataset_summary"] = "A high-resolution scRNAseq atlas of cell types in the whole mouse brain"
-adata.uns["dataset_description"] = "See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset."
-adata.uns["dataset_organism"] = "Mus musculus"
+for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
+    adata.uns[key] = par[key]
 
 # Write data
 adata.write_h5ad(par["output"])
diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml
@@ -4,22 +4,22 @@ namespace: workflows
 argument_groups:
   - name: Inputs
     arguments:
-      - name: "--input"
-        __merge__: /src/api/file_common_dataset.yaml
+      - name: "--input_sp"
+        __merge__: /src/api/file_common_spatialdata.yaml
+        required: true
+        direction: input
+      - name: "--input_sc"
+        __merge__: /src/api/file_common_singlecell.yaml
         required: true
         direction: input
   - name: Outputs
     arguments:
-      - name: "--output_train"
-        __merge__: /src/api/file_train_h5ad.yaml
-        required: true
-        direction: output
-      - name: "--output_test"
-        __merge__: /src/api/file_test_h5ad.yaml
+      - name: "--output_sp"
+        __merge__: /src/api/file_spatialdata.yaml
         required: true
         direction: output
-      - name: "--output_solution"
-        __merge__: /src/api/file_solution.yaml
+      - name: "--output_sc"
+        __merge__: /src/api/file_singlecell.yaml
         required: true
         direction: output
 
@@ -30,10 +30,6 @@ resources:
   - path: /common/nextflow_helpers/helper.nf
 
 dependencies:
-  - name: common/check_dataset_schema
-    repository: openproblems-v2
-  - name: common/extract_metadata
-    repository: openproblems-v2
   - name: data_processors/process_dataset
 
 runners: