From e2acdec8a5bc40e9c015e99892983c62a2e591fe Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Mon, 15 Jul 2024 13:42:38 +0200 Subject: [PATCH] Update components (#3) * Update docker setup * remove accelerator parameter * clean up docker setup * update common resources * update ci * update scripts * move dataset simulator to process datasets * change to viash 0.9 * update readme * update test resources dest * add organisation * fix typo * update common submodule * update submodule * Update test.yml --------- Co-authored-by: Robrecht Cannoodt --- .github/workflows/build.yml | 121 ++--------- .github/workflows/test.yml | 110 +--------- README.md | 10 +- _viash.yaml | 35 +-- common | 2 +- scripts/add_component.sh | 21 -- scripts/add_control_method.sh | 12 ++ scripts/add_method.sh | 12 ++ scripts/add_metric.sh | 12 ++ scripts/create_test_resources.sh | 39 ++++ scripts/download_resources.sh | 5 +- scripts/render_readme.sh | 6 +- scripts/run_benchmark.sh | 2 +- scripts/run_benchmark_tw.sh | 2 +- scripts/test_components.sh | 19 +- src/api/comp_control_method.yaml | 75 ++++--- src/api/comp_method.yaml | 57 +++-- src/api/comp_metric.yaml | 61 +++--- src/api/comp_process_dataset.yaml | 63 +++--- src/api/file_common_dataset.yaml | 10 +- src/api/file_simulated_dataset.yaml | 75 +++++++ .../random_proportions/config.vsh.yaml | 27 +-- .../random_proportions/script.py | 4 +- .../true_proportions/config.vsh.yaml | 27 +-- .../true_proportions/script.py | 4 +- src/dataset_simulator/config.vsh.yaml | 200 ------------------ src/methods/cell2location/config.vsh.yaml | 136 ++++++------ src/methods/cell2location/script.py | 4 +- src/methods/destvi/config.vsh.yaml | 57 +++-- src/methods/destvi/script.py | 4 +- src/methods/nmfreg/config.vsh.yaml | 43 ++-- src/methods/nmfreg/script.py | 4 +- src/methods/nnls/config.vsh.yaml | 36 ++-- src/methods/nnls/script.py | 4 +- src/methods/rctd/config.vsh.yaml | 52 ++--- src/methods/rctd/script.R | 4 +- src/methods/seurat/config.vsh.yaml | 50 ++--- src/methods/seurat/script.R | 4 +- src/methods/stereoscope/config.vsh.yaml | 56 +++-- src/methods/stereoscope/script.py | 4 +- src/methods/tangram/config.vsh.yaml | 51 ++--- src/methods/tangram/script.py | 4 +- src/methods/vanillanmf/config.vsh.yaml | 45 ++-- src/methods/vanillanmf/script.py | 4 +- src/metrics/r2/config.vsh.yaml | 43 ++-- src/metrics/r2/script.py | 2 +- .../dataset_simulator/config.vsh.yaml | 76 +++++++ .../dataset_simulator/script.py | 4 +- .../split_dataset/config.vsh.yaml | 46 +++- src/process_dataset/split_dataset/script.py | 6 +- .../process_datasets/config.vsh.yaml | 95 ++++----- src/workflows/process_datasets/run_test.sh | 15 ++ src/workflows/run_benchmark/config.vsh.yaml | 149 +++++++------ src/workflows/run_benchmark/main.nf | 20 +- src/workflows/run_benchmark/run_test.sh | 19 ++ 55 files changed, 957 insertions(+), 1091 deletions(-) delete mode 100644 scripts/add_component.sh create mode 100644 scripts/add_control_method.sh create mode 100644 scripts/add_method.sh create mode 100644 scripts/add_metric.sh create mode 100644 scripts/create_test_resources.sh create mode 100644 src/api/file_simulated_dataset.yaml delete mode 100644 src/dataset_simulator/config.vsh.yaml create mode 100644 src/process_dataset/dataset_simulator/config.vsh.yaml rename src/{ => process_dataset}/dataset_simulator/script.py (98%) create mode 100644 src/workflows/process_datasets/run_test.sh create mode 100644 src/workflows/run_benchmark/run_test.sh diff --git 
a/.github/workflows/build.yml b/.github/workflows/build.yml index 7bc6da6..f5bc898 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,114 +1,21 @@ -name: main build +name: Build on: push: branches: [ 'main' ] + workflow_dispatch: + inputs: + version: + description: | + The version of the project to build. Example: `1.0.3`. + + If not provided, a development build with a version name + based on the branch name will be built. Otherwise, a release + build with the provided version will be built. + required: false jobs: - # phase 1 - list: - runs-on: ubuntu-latest - - outputs: - component_matrix: ${{ steps.set_matrix.outputs.matrix }} - cache_key: ${{ steps.cache.outputs.cache_key }} - - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - - uses: viash-io/viash-actions/setup@v5 - - - name: Remove target folder from .gitignore - run: | - # allow publishing the target folder - sed -i 's#^/target/$##g' .gitignore - - - uses: viash-io/viash-actions/ns-build@v5 - with: - config_mod: .functionality.version := 'main_build' - parallel: true - - # - name: Build nextflow schemas - # uses: viash-io/viash-actions/pro/build-nextflow-schemas@v4 - # with: - # workflows: src - # components: src - # viash_pro_token: ${{ secrets.GTHB_PAT }} - # tools_version: 'main_build' - - # - name: Build parameter files - # uses: viash-io/viash-actions/pro/build-nextflow-params@v4 - # with: - # workflows: src - # components: src - # viash_pro_token: ${{ secrets.GTHB_PAT }} - # tools_version: 'main_build' - - - name: Deploy to target branch - uses: peaceiris/actions-gh-pages@v4 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: . - publish_branch: main_build - - - id: ns_list - uses: viash-io/viash-actions/ns-list@v5 - with: - platform: docker - src: src - format: json - - - id: set_matrix - run: | - echo "matrix=$(jq -c '[ .[] | - { - "name": (.functionality.namespace + "/" + .functionality.name), - "dir": .info.config | capture("^(?.*\/)").dir - } - ]' ${{ steps.ns_list.outputs.output_file }} )" >> $GITHUB_OUTPUT - - # phase 2 build: - needs: list - - runs-on: ubuntu-latest - - strategy: - fail-fast: false - matrix: - component: ${{ fromJson(needs.list.outputs.component_matrix) }} - - steps: - # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' 
- - uses: data-intuitive/reclaim-the-bytes@v2 - - - uses: actions/checkout@v4 - with: - submodules: true - - - uses: viash-io/viash-actions/setup@v5 - - - name: Build container - uses: viash-io/viash-actions/ns-build@v5 - with: - config_mod: .functionality.version := 'main_build' - platform: docker - src: ${{ matrix.component.dir }} - setup: build - - - name: Login to container registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ secrets.GTHB_USER }} - password: ${{ secrets.GTHB_PAT }} - - - name: Push container - uses: viash-io/viash-actions/ns-build@v5 - with: - config_mod: .functionality.version := 'main_build' - platform: docker - src: ${{ matrix.component.dir }} - setup: push \ No newline at end of file + uses: openproblems-bio/actions/.github/workflows/build.yml@main + with: + version: ${{ github.event.inputs.version }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index aeabf9e..bd7e925 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,110 +1,10 @@ -name: viash test +name: Test on: - pull_request: push: - branches: [ '**' ] + branches: ["main"] + pull_request: jobs: - run_ci_check_job: - runs-on: ubuntu-latest - outputs: - run_ci: ${{ steps.github_cli.outputs.check }} - steps: - - name: 'Check if branch has an existing pull request and the trigger was a push' - id: github_cli - run: | - pull_request=$(gh pr list -R ${{ github.repository }} -H ${{ github.ref_name }} --json url --state open --limit 1 | jq '.[0].url') - # If the branch has a PR and this run was triggered by a push event, do not run - if [[ "$pull_request" != "null" && "$GITHUB_REF_NAME" != "main" && "${{ github.event_name == 'push' }}" == "true" && "${{ !contains(github.event.head_commit.message, 'ci force') }}" == "true" ]]; then - echo "check=false" >> $GITHUB_OUTPUT - else - echo "check=true" >> $GITHUB_OUTPUT - fi - env: - GITHUB_TOKEN: ${{ secrets.GTHB_PAT }} - - # phase 1 - list: - needs: run_ci_check_job - env: - s3_bucket: s3://openproblems-data/resources_test/ - runs-on: ubuntu-latest - if: ${{needs.run_ci_check_job.outputs.run_ci == 'true'}} - - outputs: - matrix: ${{ steps.set_matrix.outputs.matrix }} - cache_key: ${{ steps.cache.outputs.cache_key }} - - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: true - - - uses: viash-io/viash-actions/setup@v5 - - - uses: viash-io/viash-actions/project/sync-and-cache-s3@v5 - id: cache - with: - s3_bucket: $s3_bucket - dest_path: resources_test - cache_key_prefix: resources_test__ - - - id: ns_list - uses: viash-io/viash-actions/ns-list@v5 - with: - platform: docker - format: json - - - id: ns_list_filtered - uses: viash-io/viash-actions/project/detect-changed-components@v5 - with: - input_file: "${{ steps.ns_list.outputs.output_file }}" - - - id: set_matrix - run: | - echo "matrix=$(jq -c '[ .[] | - { - "name": (.functionality.namespace + "/" + .functionality.name), - "config": .info.config - } - ]' ${{ steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT - - # phase 2 - viash_test: - needs: list - if: ${{ needs.list.outputs.matrix != '[]' && needs.list.outputs.matrix != '' }} - runs-on: ubuntu-latest - - strategy: - fail-fast: false - matrix: - component: ${{ fromJson(needs.list.outputs.matrix) }} - - steps: - # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' 
- - uses: data-intuitive/reclaim-the-bytes@v2 - - - uses: actions/checkout@v4 - with: - submodules: true - - - uses: viash-io/viash-actions/setup@v5 - - # use cache - - name: Cache resources data - uses: actions/cache@v4 - timeout-minutes: 10 - with: - path: resources_test - key: ${{ needs.list.outputs.cache_key }} - - - name: Run test - timeout-minutes: 30 - run: | - VIASH_TEMP=$RUNNER_TEMP/viash viash test \ - "${{ matrix.component.config }}" \ - --cpus 2 \ - --memory "5gb" - + build: + uses: openproblems-bio/actions/.github/workflows/test.yml@main diff --git a/README.md b/README.md index 40c7287..9ddb773 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Estimation of cell type proportions per spot in 2D space from spatial transcriptomic data coupled with corresponding single-cell data Path to source: -[`src`](https://github.com/openproblems-bio/task-spatial-decomposition/tree/main/src) +[`src`](https://github.com/openproblems-bio/task_spatial_decomposition/src) ## Motivation @@ -83,7 +83,7 @@ flowchart LR A subset of the common dataset. Example file: -`resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/dataset_simulated.h5ad` +`resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad` Format: @@ -92,7 +92,7 @@ Format: AnnData object obs: 'cell_type', 'batch' var: 'hvg', 'hvg_score' - obsm: 'X_pca', 'coordinates', 'proportions_true' + obsm: 'X_pca' layers: 'counts' uns: 'cell_type_names', 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -108,9 +108,7 @@ Slot description: | `obs["batch"]` | `string` | A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | | `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | | `var["hvg_score"]` | `double` | A ranking of the features by hvg. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `obsm["coordinates"]` | `double` | (*Optional*) XY coordinates for each spot. | -| `obsm["proportions_true"]` | `double` | (*Optional*) True cell type proportions for each spot. | +| `obsm["X_pca"]` | `double` | (*Optional*) The resulting PCA embedding. | | `layers["counts"]` | `integer` | Raw counts. | | `uns["cell_type_names"]` | `string` | (*Optional*) Cell type names corresponding to values in `cell_type`. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | diff --git a/_viash.yaml b/_viash.yaml index 18431f9..22e647d 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1,18 +1,25 @@ -viash_version: 0.8.6 +viash_version: 0.9.0-RC6 -# name: task_spatial_decomposition -# description: | -# An OpenProblems benchmark task for spatial decomposition. +name: task_spatial_decomposition +description: | + An OpenProblems benchmark task for spatial decomposition. 
+license: MIT +keywords: [single-cell, openproblems, benchmark, spatial decomposition] +links: + issue_tracker: https://github.com/openproblems-bio/task_spatial_decomposition/issues + repository: https://github.com/openproblems-bio/task_spatial_decomposition + docker_registry: ghcr.io +info: + test_resources: + - type: s3 + path: s3://openproblems-data/resources_test/common/ + dest: resources_test/common + - type: s3 + path: s3://openproblems-data/resources_test/spatial_decomposition/ + dest: resources_test/spatial_decomposition -source: src -target: target +organization: openproblems-bio +version: dev config_mods: | - .functionality.version := 'dev' - .platforms[.type == 'docker'].target_registry := 'ghcr.io' - .platforms[.type == 'docker'].target_organization := 'openproblems-bio/task-spatial-decomposition' - .platforms[.type == 'docker'].target_image_source := 'https://github.com/openproblems-bio/task-spatial-decomposition' - .platforms[.type == "nextflow"].directives.tag := "$id" - .platforms[.type == "nextflow"].auto.simplifyOutput := false - .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } - .platforms[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'" + .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } diff --git a/common b/common index 81d4268..28c2b27 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 81d42682169dcf3990f26a9865d658baf67e6335 +Subproject commit 28c2b271687dca388d1c1ed448f464e653af2c24 diff --git a/scripts/add_component.sh b/scripts/add_component.sh deleted file mode 100644 index f222478..0000000 --- a/scripts/add_component.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -echo "This script is not supposed to be run directly." -echo "Please run the script step-by-step." 
-exit 1 - -# create a new component (types: method, metric, control_method) -type="method" -lang="python" # change this to "r" if need be - -common/create_component/create_component \ - --task spatial_decomposition \ - --type $type \ - --name component_name \ - --language $lang - -# TODO: fill in required fields in src/methods/foo/config.vsh.yaml -# TODO: edit src/methods/foo/script.py/R - -# test the component -viash test src/$type/component_name/config.vsh.yaml diff --git a/scripts/add_control_method.sh b/scripts/add_control_method.sh new file mode 100644 index 0000000..bf6174c --- /dev/null +++ b/scripts/add_control_method.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +task_name="spatial_decomposition" +component_name="my_control_method" +component_lang="python" # change this to "r" if need be + +common/create_component/create_component \ + --task $task_name \ + --language "$component_lang" \ + --name "$component_name" \ + --api_file src/api/comp_control_method.yaml \ + --output "src/control_methods/$component_name" diff --git a/scripts/add_method.sh b/scripts/add_method.sh new file mode 100644 index 0000000..24a9a19 --- /dev/null +++ b/scripts/add_method.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +task_name="spatial_decomposition" +component_name="my_method" +component_lang="python" # change this to "r" if need be + +common/create_component/create_component \ + --task $task_name \ + --language "$component_lang" \ + --name "$component_name" \ + --api_file src/api/comp_method.yaml \ + --output "src/methods/$component_name" diff --git a/scripts/add_metric.sh b/scripts/add_metric.sh new file mode 100644 index 0000000..aa916cd --- /dev/null +++ b/scripts/add_metric.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +task_name="spatial_decomposition" +component_name="my_metric" +component_lang="python" # change this to "r" if need be + +common/create_component/create_component \ + --task $task_name \ + --language "$component_lang" \ + --name "$component_name" \ + --api_file src/api/comp_metric.yaml \ + --output "src/metrics/$component_name" diff --git a/scripts/create_test_resources.sh b/scripts/create_test_resources.sh new file mode 100644 index 0000000..7d19bc3 --- /dev/null +++ b/scripts/create_test_resources.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/spatial_decomposition + +mkdir -p $DATASET_DIR + +# process dataset +echo "Running process_dataset" +nextflow run . 
\ + -main-script target/nextflow/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --input_states "$RAW_DATA/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_spatial_masked": "$id/spatial_masked.h5ad", "output_single_cell": "$id/single_cell_ref.h5ad", "output_solution": "$id/solution.h5ad", "alpha": 1.0, "simulated_data": "$id/simulated_dataset.h5ad"}' \ + --publish_dir "$DATASET_DIR" \ + --output_state '$id/state.yaml' + +# run one method +viash run src/methods/stereoscope/config.vsh.yaml -- \ + --input_single_cell $DATASET_DIR/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad \ + --input_spatial_masked $DATASET_DIR/cxg_mouse_pancreas_atlas/spatial_masked.h5ad \ + --output $DATASET_DIR/cxg_mouse_pancreas_atlas/output.h5ad + +# run one metric +viash run src/metrics/r2/config.vsh.yaml -- \ + --input_method $DATASET_DIR/cxg_mouse_pancreas_atlas/output.h5ad \ + --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ + --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh index 9b012f1..c446b7b 100644 --- a/scripts/download_resources.sh +++ b/scripts/download_resources.sh @@ -5,5 +5,6 @@ directories=("common" "spatial_decomposition") for dir in ${directories[@]}; do common/sync_resources/sync_resources \ --input "s3://openproblems-data/resources_test/$dir/cxg_mouse_pancreas_atlas" \ - --output "resources_test/$dir/cxg_mouse_pancreas_atlas" -done \ No newline at end of file + --output "resources_test/$dir/cxg_mouse_pancreas_atlas" \ + --delete +done diff --git a/scripts/render_readme.sh b/scripts/render_readme.sh index abafdc1..e0aca71 100644 --- a/scripts/render_readme.sh +++ b/scripts/render_readme.sh @@ -1,7 +1,5 @@ #!/bin/bash common/create_task_readme/create_task_readme \ - --task "spatial_decomposition" \ - --task_dir "src/" \ - --github_url "https://github.com/openproblems-bio/task-spatial-decomposition/tree/main/" \ - --output "README.md" \ No newline at end of file + --task_dir src \ + --output README.md \ No newline at end of file diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh index 2aeb014..310ab5d 100644 --- a/scripts/run_benchmark.sh +++ b/scripts/run_benchmark.sh @@ -29,7 +29,7 @@ nextflow run . 
\ -entry auto \ -c common/nextflow_helpers/labels_ci.config \ --input_states "$DATASETS_DIR/**/state.yaml" \ - --rename_keys 'input_single_cell:output_single_cell,input_spatial_masked:output_spatial_masked,input_solution:output_solution' \ + --rename_keys 'input_single_cell:output_single_cell;input_spatial_masked:output_spatial_masked;input_solution:output_solution' \ --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ --publish_dir "$OUTPUT_DIR" \ --output_state "state.yaml" \ No newline at end of file diff --git a/scripts/run_benchmark_tw.sh b/scripts/run_benchmark_tw.sh index 989226f..50a3e44 100644 --- a/scripts/run_benchmark_tw.sh +++ b/scripts/run_benchmark_tw.sh @@ -6,7 +6,7 @@ cat > /tmp/params.yaml << 'HERE' id: spatial_decomposition_process_datasets input_states: s3://openproblems-data/resources/datasets/**/log_cp10k/state.yaml -settings: '{"output_spatial_masked": "$id/spatial_masked.h5ad", "output_single_cell": "$id/single_cell_ref.h5ad", "output_solution": "$id/solution.h5ad", "alpha": 1.0, "simulated_data": "$id/dataset_simulated.h5ad"}' +settings: '{"output_spatial_masked": "$id/spatial_masked.h5ad", "output_single_cell": "$id/single_cell_ref.h5ad", "output_solution": "$id/solution.h5ad", "alpha": 1.0, "simulated_data": "$id/simulated_dataset.h5ad"}' rename_keys: 'input:output_dataset' output_state: "$id/state.yaml" publish_dir: s3://openproblems-data/resources/spatial_decomposition/datasets diff --git a/scripts/test_components.sh b/scripts/test_components.sh index b8f78b3..34895f6 100644 --- a/scripts/test_components.sh +++ b/scripts/test_components.sh @@ -3,20 +3,7 @@ # Test all components in a namespace (refer https://viash.io/reference/cli/ns_test.html) viash ns test --parallel -# Test individual components -viash test src/methods/nnls/config.vsh.yaml - - -DATASET_DIR=resources_test/spatial_decomposition +component_name="my_component_name" -# run one method -viash run src/methods/nnls/config.vsh.yaml -- \ - --input_single_cell $DATASET_DIR/cxg_mouse_pancreas_atlas/single_cell_ref.h5ad \ - --input_spatial_masked $DATASET_DIR/cxg_mouse_pancreas_atlas/spatial_masked.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/output.h5ad - -# run one metric -viash run src/metrics/r2/config.vsh.yaml -- \ - --input_method $DATASET_DIR/cxg_mouse_pancreas_atlas/output.h5ad \ - --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad +# Test individual components +viash test src/methods/$component_name/config.vsh.yaml diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index dde7893..19713a5 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -1,38 +1,37 @@ -functionality: - namespace: "control_methods" - info: - type: control_method - type_info: - label: Control method - summary: Quality control methods for verifying the pipeline. - description: | - Control methods have the same interface as the regular methods - but also receive the solution object as input. It serves as a - starting point to test the relative accuracy of new methods in - the task, and also as a quality control for the metrics defined - in the task. 
- arguments: - - name: "--input_single_cell" - __merge__: file_single_cell.yaml - direction: input - required: true - - name: "--input_spatial_masked" - __merge__: file_spatial_masked.yaml - direction: input - required: true - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - - name: "--output" - __merge__: file_output.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /common/component_tests/check_method_config.py - - type: python_script - path: /common/component_tests/run_and_check_output.py - - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas - dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas - - path: /common/library.bib +namespace: "control_methods" +info: + type: control_method + type_info: + label: Control method + summary: Quality control methods for verifying the pipeline. + description: | + Control methods have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. +arguments: + - name: "--input_single_cell" + __merge__: file_single_cell.yaml + direction: input + required: true + - name: "--input_spatial_masked" + __merge__: file_spatial_masked.yaml + direction: input + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_output.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/check_method_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + - path: /common/library.bib diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index dfc13d1..ac26ebc 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -1,29 +1,28 @@ -functionality: - namespace: "methods" - info: - type: method - type_info: - label: Method - summary: A spatial composition method. - description: "Method to estimate cell type proportions from spatial and single cell data" - arguments: - - name: "--input_single_cell" - __merge__: file_single_cell.yaml - direction: input - required: true - - name: "--input_spatial_masked" - __merge__: file_spatial_masked.yaml - direction: input - required: true - - name: "--output" - __merge__: file_output.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /common/component_tests/check_method_config.py - - type: python_script - path: /common/component_tests/run_and_check_output.py - - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas - dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas - - path: /common/library.bib \ No newline at end of file +namespace: "methods" +info: + type: method + type_info: + label: Method + summary: A spatial composition method. 
+ description: "Method to estimate cell type proportions from spatial and single cell data" +arguments: + - name: "--input_single_cell" + __merge__: file_single_cell.yaml + direction: input + required: true + - name: "--input_spatial_masked" + __merge__: file_spatial_masked.yaml + direction: input + required: true + - name: "--output" + __merge__: file_output.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/check_method_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + - path: /common/library.bib \ No newline at end of file diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index d333524..121b166 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -1,31 +1,30 @@ -functionality: - namespace: "metrics" - info: - type: metric - type_info: - label: Metric - summary: A spatial decomposition metric. - description: | - A metric for evaluating accuracy of cell type proportion estimate - arguments: - - name: "--input_method" - __merge__: file_output.yaml - direction: input - required: true - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - - name: "--output" - __merge__: file_score.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /common/component_tests/check_metric_config.py - - type: python_script - path: /common/component_tests/run_and_check_output.py - - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas - dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas - - path: /common/library.bib - \ No newline at end of file +namespace: "metrics" +info: + type: metric + type_info: + label: Metric + summary: A spatial decomposition metric. + description: | + A metric for evaluating accuracy of cell type proportion estimate +arguments: + - name: "--input_method" + __merge__: file_output.yaml + direction: input + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/check_metric_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + - path: /common/library.bib + \ No newline at end of file diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml index a72684c..0bf4e83 100644 --- a/src/api/comp_process_dataset.yaml +++ b/src/api/comp_process_dataset.yaml @@ -1,33 +1,30 @@ -functionality: - namespace: process_dataset - info: - type: process_dataset - type_info: - label: Data processor - summary: A spatial decomposition dataset processor. - description: | - Prepare a common dataset for the spatial_decomposition task. 
- arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - direction: input - required: true - - name: "--output_single_cell" - __merge__: file_single_cell.yaml - direction: output - required: true - - name: "--output_spatial_masked" - __merge__: file_spatial_masked.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /common/component_tests/run_and_check_output.py - # - path: /resources_test/common/cxg_mouse_pancreas_atlas - # dest: resources_test/common/cxg_mouse_pancreas_atlas - - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas - dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas \ No newline at end of file +namespace: process_dataset +info: + type: process_dataset + type_info: + label: Data processor + summary: A spatial decomposition dataset processor. + description: | + Prepare a common dataset for the spatial_decomposition task. +arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_single_cell" + __merge__: file_single_cell.yaml + direction: output + required: true + - name: "--output_spatial_masked" + __merge__: file_spatial_masked.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/common/cxg_mouse_pancreas_atlas + dest: resources_test/common/cxg_mouse_pancreas_atlas \ No newline at end of file diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index b5399a3..0320c7f 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/dataset_simulated.h5ad" +example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" info: label: "Common Dataset" summary: A subset of the common dataset. @@ -31,14 +31,6 @@ info: - type: double name: X_pca description: The resulting PCA embedding. - required: true - - type: double - name: coordinates - description: XY coordinates for each spot. - required: false - - type: double - name: proportions_true - description: True cell type proportions for each spot required: false uns: - type: string diff --git a/src/api/file_simulated_dataset.yaml b/src/api/file_simulated_dataset.yaml new file mode 100644 index 0000000..202435c --- /dev/null +++ b/src/api/file_simulated_dataset.yaml @@ -0,0 +1,75 @@ +type: file +example: "resources_test/cxg_mouse_pancreas_atlas/simulated_dataset.h5ad" +info: + label: "Common Dataset" + summary: A subset of the common dataset. + slots: + layers: + - type: integer + name: counts + description: Raw counts. + required: true + obs: + - type: string + name: cell_type + description: Cell type label IDs. + required: true + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. 
+ required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + - type: double + name: coordinates + description: XY coordinates for each spot. + required: false + - type: double + name: proportions_true + description: True cell type proportions for each spot + required: false + uns: + - type: string + name: cell_type_names + description: Cell type names corresponding to values in `cell_type` + required: false + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false \ No newline at end of file diff --git a/src/control_methods/random_proportions/config.vsh.yaml b/src/control_methods/random_proportions/config.vsh.yaml index a0ccc70..6cd801b 100644 --- a/src/control_methods/random_proportions/config.vsh.yaml +++ b/src/control_methods/random_proportions/config.vsh.yaml @@ -1,25 +1,26 @@ __merge__: ../../api/comp_control_method.yaml -functionality: - name: random_proportions - info: - label: Random Proportions - summary: "Negative control method that randomly assigns celltype proportions from a Dirichlet distribution." - description: | - A negative control method with random assignment of predicted celltype proportions from a Dirichlet distribution. - preferred_normalization: counts +name: random_proportions +info: + label: Random Proportions + summary: "Negative control method that randomly assigns celltype proportions from a Dirichlet distribution." + description: | + A negative control method with random assignment of predicted celltype proportions from a Dirichlet distribution. 
+ preferred_normalization: counts - resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: numpy - - type: native + +runners: + - type: executable - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/control_methods/random_proportions/script.py b/src/control_methods/random_proportions/script.py index 17af41d..4331122 100644 --- a/src/control_methods/random_proportions/script.py +++ b/src/control_methods/random_proportions/script.py @@ -9,7 +9,7 @@ 'output': 'output.h5ad' } meta = { - 'functionality_name': 'random_proportions' + 'name': 'random_proportions' } ## VIASH END @@ -29,7 +29,7 @@ uns={ 'cell_type_names': input_spatial_masked.uns['cell_type_names'], 'dataset_id': input_spatial_masked.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] }, obsm={ 'coordinates': input_spatial_masked.obsm['coordinates'], diff --git a/src/control_methods/true_proportions/config.vsh.yaml b/src/control_methods/true_proportions/config.vsh.yaml index 7979f59..b3d980f 100644 --- a/src/control_methods/true_proportions/config.vsh.yaml +++ b/src/control_methods/true_proportions/config.vsh.yaml @@ -1,22 +1,23 @@ __merge__: ../../api/comp_control_method.yaml -functionality: - name: true_proportions - info: - label: True Proportions - summary: "Positive control method that assigns celltype proportions from the ground truth." - description: | - A positive control method with perfect assignment of predicted celltype proportions from the ground truth. - preferred_normalization: counts +name: true_proportions +info: + label: True Proportions + summary: "Positive control method that assigns celltype proportions from the ground truth." + description: | + A positive control method with perfect assignment of predicted celltype proportions from the ground truth. + preferred_normalization: counts - resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 - - type: native + +runners: + - type: executable - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/control_methods/true_proportions/script.py b/src/control_methods/true_proportions/script.py index e4c47e3..53b9f4f 100644 --- a/src/control_methods/true_proportions/script.py +++ b/src/control_methods/true_proportions/script.py @@ -8,7 +8,7 @@ 'output': 'output.h5ad' } meta = { - 'functionality_name': 'true_proportions' + 'name': 'true_proportions' } ## VIASH END @@ -27,7 +27,7 @@ uns={ 'cell_type_names': input_spatial_masked.uns['cell_type_names'], 'dataset_id': input_spatial_masked.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] }, obsm={ 'coordinates': input_spatial_masked.obsm['coordinates'], diff --git a/src/dataset_simulator/config.vsh.yaml b/src/dataset_simulator/config.vsh.yaml deleted file mode 100644 index af9b429..0000000 --- a/src/dataset_simulator/config.vsh.yaml +++ /dev/null @@ -1,200 +0,0 @@ -functionality: - name: "dataset_simulator" - info: - type: dataset_simulator - type_info: - label: Dataset simulator - summary: Simulate cell aggregates from single-cell data. - description: | - The dataset simulator creates cell-aggregates from the single-cell dataset by sampling from a Dirichlet distribution. 
The simulated data consists of the the spatial expression matrix, the XY coordinates of the spots, the cell-type proportions in each spot, and the reference single-cell data. - variants: - alpha_1: - alpha: 1 - alpha_5: - alpha: 5 - alpha_0_5: - alpha: 0.5 - arguments: - - name: "--input" - type: file - description: Single-cell reference dataset - direction: input - example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" - info: - slots: - layers: - - type: integer - name: counts - description: Raw counts. - required: true - obs: - - type: string - name: cell_type - description: Cell type label IDs. - required: true - - type: string - name: batch - description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: false - - type: integer - name: hvg_score - description: A ranking of the features by hvg. - required: false - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: false - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: "--alpha" - type: double - description: Alpha value to use for generating synthetic dataset - default: 1.0 - - name: "--n_obs" - type: integer - description: Number of spatial observations to generate. Default value is 100. - default: 100 - - name: "--cell_lb" - type: integer - description: Lower bound for number of cells at each spot. Default value is 10. - default: 10 - - name: "--cell_ub" - type: integer - description: Upper bound for number of cells at each spot. Default value is 30. - default: 30 - - name: "--umi_lb" - type: integer - description: Lower bound for number of cells at each spot. Default value is 1000. - default: 1000 - - name: "--umi_ub" - type: integer - description: Upper bound for number of UMIs at each spot. Default value is 5000. - default: 5000 - - name: "--simulated_data" - type: file - direction: output - description: Simulated dataset - required: false - example: dataset_simulated.h5ad - info: - slots: - layers: - - type: integer - name: counts - description: Raw counts. - required: true - obs: - - type: string - name: cell_type - description: Cell type label IDs. - required: true - - type: string - name: batch - description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: false - - type: integer - name: hvg_score - description: A ranking of the features by hvg. 
- required: false - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: false - - type: double - name: coordinates - description: XY coordinates for each spot. - required: true - - type: double - name: proportions_true - description: True cell type proportions for each spot. - required: true - uns: - - type: string - name: cell_type_names - description: Cell type names corresponding to values in `cell_type`. - required: true - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - resources: - - type: python_script - path: script.py - test_resources: - - type: python_script - path: /common/component_tests/run_and_check_output.py - - path: /resources_test/common/cxg_mouse_pancreas_atlas - dest: resources_test/common/cxg_mouse_pancreas_atlas -platforms: - - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - packages: [numpy, scanpy] - - type: nextflow - directives: - label: [midtime, highmem, highcpu] - - type: native diff --git a/src/methods/cell2location/config.vsh.yaml b/src/methods/cell2location/config.vsh.yaml index 42b99b0..bbcd662 100644 --- a/src/methods/cell2location/config.vsh.yaml +++ b/src/methods/cell2location/config.vsh.yaml @@ -1,75 +1,74 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: cell2location +name: cell2location - info: - label: Cell2Location - summary: "Cell2location uses a Bayesian model to resolve cell types in spatial transcriptomic data and create comprehensive cellular maps of diverse tissues." - description: | - Cell2location is a decomposition method based on Negative Binomial regression that is able to account for batch effects in estimating the single-cell gene expression signature used for the spatial decomposition step. - Note that when batch information is unavailable for this task, we can use either a hard-coded reference, or a negative-binomial learned reference without batch labels. The parameter alpha refers to the detection efficiency prior. - preferred_normalization: counts - variants: - cell2location_amortised_detection_alpha_20: - detection_alpha: 20 - amortised: true - cell2location_detection_alpha_1: - detection_alpha: 1 - cell2location_detection_alpha_20: - detection_alpha: 20 - cell2location_detection_alpha_20_nb: - detection_alpha: 20 - hard_coded_reference: false - cell2location_detection_alpha_200: - detection_alpha: 200 - reference: "kleshchevnikov2022cell2location" - documentation_url: https://cell2location.readthedocs.io/en/latest/ - repository_url: https://github.com/BayraktarLab/cell2location - - # Component-specific parameters (optional) - arguments: - - name: "--detection_alpha" - type: double - default: 20.0 - description: Hyperparameter controlling normalisation of within-experiment variation in RNA detection. 
- - name: "--n_cells_per_location" - type: integer - default: 20 - description: The expected average cell abundance. It is a tissue-dependent hyper-prior which can be estimated from histology images - - name: "--hard_coded_reference" - type: boolean - default: true - description: Whether to use hard-coded reference or negative binomial regression model to account for batch effects. Hard-coded reference used by default. - - name: "--amortised" - type: boolean - default: false - description: Whether to use amortised inference. - - name: "--num_samples" - type: integer - default: 1000 - description: Number of samples to use for summarising posterior distribution. - - name: "--sc_batch_size" - type: integer - default: 2500 - description: Batch size used to train regression model for estimation of reference single-cell gene expression signature. - - name: "--st_batch_size" - type: integer - description: Batch size used to train cell2location model for spatial mapping. - - name: "--max_epochs_sc" - type: integer - default: 250 - description: Maximum number of epochs to train regression model for estimation of reference single-cell gene expression signature. - - name: "--max_epochs_st" - type: integer - default: 30000 - description: Maximum number of epochs to train cell2location model for spatial mapping. +info: + label: Cell2Location + summary: "Cell2location uses a Bayesian model to resolve cell types in spatial transcriptomic data and create comprehensive cellular maps of diverse tissues." + description: | + Cell2location is a decomposition method based on Negative Binomial regression that is able to account for batch effects in estimating the single-cell gene expression signature used for the spatial decomposition step. + Note that when batch information is unavailable for this task, we can use either a hard-coded reference, or a negative-binomial learned reference without batch labels. The parameter alpha refers to the detection efficiency prior. + preferred_normalization: counts + variants: + cell2location_amortised_detection_alpha_20: + detection_alpha: 20 + amortised: true + cell2location_detection_alpha_1: + detection_alpha: 1 + cell2location_detection_alpha_20: + detection_alpha: 20 + cell2location_detection_alpha_20_nb: + detection_alpha: 20 + hard_coded_reference: false + cell2location_detection_alpha_200: + detection_alpha: 200 + reference: "kleshchevnikov2022cell2location" + documentation_url: https://cell2location.readthedocs.io/en/latest/ + repository_url: https://github.com/BayraktarLab/cell2location + +# Component-specific parameters (optional) +arguments: + - name: "--detection_alpha" + type: double + default: 20.0 + description: Hyperparameter controlling normalisation of within-experiment variation in RNA detection. + - name: "--n_cells_per_location" + type: integer + default: 20 + description: The expected average cell abundance. It is a tissue-dependent hyper-prior which can be estimated from histology images + - name: "--hard_coded_reference" + type: boolean + default: true + description: Whether to use hard-coded reference or negative binomial regression model to account for batch effects. Hard-coded reference used by default. + - name: "--amortised" + type: boolean + default: false + description: Whether to use amortised inference. + - name: "--num_samples" + type: integer + default: 1000 + description: Number of samples to use for summarising posterior distribution. 
+ - name: "--sc_batch_size" + type: integer + default: 2500 + description: Batch size used to train regression model for estimation of reference single-cell gene expression signature. + - name: "--st_batch_size" + type: integer + description: Batch size used to train cell2location model for spatial mapping. + - name: "--max_epochs_sc" + type: integer + default: 250 + description: Maximum number of epochs to train regression model for estimation of reference single-cell gene expression signature. + - name: "--max_epochs_st" + type: integer + default: 30000 + description: Maximum number of epochs to train cell2location model for spatial mapping. - resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: @@ -81,7 +80,8 @@ platforms: - jaxlib==0.4.23 - scipy<1.13 # The scipy.linalg functions tri, triu & tril are deprecated and will be removed in SciPy 1.13. - - type: native +runners: + - type: executable - type: nextflow directives: label: [hightime, midmem, midcpu] diff --git a/src/methods/cell2location/script.py b/src/methods/cell2location/script.py index 3d47991..a8264b6 100644 --- a/src/methods/cell2location/script.py +++ b/src/methods/cell2location/script.py @@ -21,7 +21,7 @@ 'max_epochs_st': 5000 } meta = { - 'functionality_name': 'cell2location' + 'name': 'cell2location' } ## VIASH END @@ -144,7 +144,7 @@ uns={ 'cell_type_names': input_spatial.uns['cell_type_names'], 'dataset_id': input_spatial.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] } ) diff --git a/src/methods/destvi/config.vsh.yaml b/src/methods/destvi/config.vsh.yaml index a415bd7..a5b9b62 100644 --- a/src/methods/destvi/config.vsh.yaml +++ b/src/methods/destvi/config.vsh.yaml @@ -1,43 +1,40 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: destvi +name: destvi +info: + label: DestVI + summary: "DestVI is a probabilistic method for multi-resolution analysis for spatial transcriptomics that explicitly models continuous variation within cell types" + description: | + Deconvolution of Spatial Transcriptomics profiles using Variational Inference (DestVI) is a spatial decomposition method that leverages a conditional generative model of spatial transcriptomics down to the sub-cell-type variation level, which is then used to decompose the cell-type proportions determining the spatial organization of a tissue. + preferred_normalization: counts + reference: "lopez2022destvi" + documentation_url: https://docs.scvi-tools.org/en/stable/user_guide/models/destvi.html + repository_url: https://github.com/scverse/scvi-tools - info: - label: DestVI - summary: "DestVI is a probabilistic method for multi-resolution analysis for spatial transcriptomics that explicitly models continuous variation within cell types" - description: | - Deconvolution of Spatial Transcriptomics profiles using Variational Inference (DestVI) is a spatial decomposition method that leverages a conditional generative model of spatial transcriptomics down to the sub-cell-type variation level, which is then used to decompose the cell-type proportions determining the spatial organization of a tissue. 
- preferred_normalization: counts - reference: "lopez2022destvi" - documentation_url: https://docs.scvi-tools.org/en/stable/user_guide/models/destvi.html - repository_url: https://github.com/scverse/scvi-tools +arguments: + - name: "--max_epochs_sc" + type: integer + default: 500 + description: Number of epochs to train the Conditional version of single-cell Variational Inference (CondSCVI) model using MAP inference. + - name: "--max_epochs_sp" + type: integer + default: 10000 + description: Number of epochs to train the DestVI model using MAP inference. - arguments: - - name: "--max_epochs_sc" - type: integer - default: 500 - description: Number of epochs to train the Conditional version of single-cell Variational Inference (CondSCVI) model using MAP inference. - - name: "--max_epochs_sp" - type: integer - default: 2000 - description: Number of epochs to train the DestVI model using MAP inference. +resources: + - type: python_script + path: script.py - resources: - - type: python_script - path: script.py - -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 setup: - type: python packages: - - scvi-tools>=1.1.0 - - type: docker - run: | - pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - - type: native + - scvi-tools + +runners: + - type: executable - type: nextflow directives: label: [hightime, midmem, midcpu, gpu] diff --git a/src/methods/destvi/script.py b/src/methods/destvi/script.py index 4682e74..8b0d895 100644 --- a/src/methods/destvi/script.py +++ b/src/methods/destvi/script.py @@ -11,7 +11,7 @@ 'max_epochs_sp': 5000 } meta = { - 'functionality_name': 'destvi' + 'name': 'destvi' } ## VIASH END @@ -47,7 +47,7 @@ uns={ 'cell_type_names': input_spatial.uns['cell_type_names'], 'dataset_id': input_spatial.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] }, obsm={ 'coordinates': input_spatial.obsm['coordinates'], diff --git a/src/methods/nmfreg/config.vsh.yaml b/src/methods/nmfreg/config.vsh.yaml index 97320cb..c2ba51c 100644 --- a/src/methods/nmfreg/config.vsh.yaml +++ b/src/methods/nmfreg/config.vsh.yaml @@ -1,28 +1,27 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: nmfreg - info: - label: NMFreg - summary: "NMFreg reconstructs gene expression as a weighted combination of cell type signatures defined by scRNA-seq." - description: | - Non-Negative Matrix Factorization regression (NMFreg) is a decomposition method that reconstructs expression of each spatial location as a weighted combination of cell-type signatures defined by scRNA-seq. It was originally developed for Slide-seq data. This is a re-implementation from https://github.com/tudaga/NMFreg_tutorial. - preferred_normalization: counts - reference: "rodriques2019slide" - documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html - repository_url: https://github.com/tudaga/NMFreg_tutorial/tree/master?tab=readme-ov-file +name: nmfreg +info: + label: NMFreg + summary: "NMFreg reconstructs gene expression as a weighted combination of cell type signatures defined by scRNA-seq." + description: | + Non-Negative Matrix Factorization regression (NMFreg) is a decomposition method that reconstructs expression of each spatial location as a weighted combination of cell-type signatures defined by scRNA-seq. It was originally developed for Slide-seq data. This is a re-implementation from https://github.com/tudaga/NMFreg_tutorial. 
+ preferred_normalization: counts + reference: "rodriques2019slide" + documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + repository_url: https://github.com/tudaga/NMFreg_tutorial/tree/master?tab=readme-ov-file - arguments: - - name: "--n_components" - type: integer - default: 30 - description: Number of components to use for non-negative matrix factorization. +arguments: + - name: "--n_components" + type: integer + default: 30 + description: Number of components to use for non-negative matrix factorization. - resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: @@ -31,7 +30,9 @@ platforms: - numpy - scipy - scikit-learn - - type: native + +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/methods/nmfreg/script.py b/src/methods/nmfreg/script.py index 1cc0fd1..b53bd2a 100644 --- a/src/methods/nmfreg/script.py +++ b/src/methods/nmfreg/script.py @@ -13,7 +13,7 @@ 'n_components': 30 } meta = { - 'functionality_name': 'nmfreg' + 'name': 'nmfreg' } ## VIASH END @@ -78,7 +78,7 @@ uns={ 'cell_type_names': input_spatial.uns['cell_type_names'], 'dataset_id': input_spatial.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] }, obsm={ 'coordinates': input_spatial.obsm['coordinates'], diff --git a/src/methods/nnls/config.vsh.yaml b/src/methods/nnls/config.vsh.yaml index 537c714..9dd1a2d 100644 --- a/src/methods/nnls/config.vsh.yaml +++ b/src/methods/nnls/config.vsh.yaml @@ -1,30 +1,32 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: nnls - info: - label: NNLS - summary: "NNLS is a decomposition method based on Non-Negative Least Square Regression." - description: | - NonNegative Least Squares (NNLS), is a convex optimization problem with convex constraints. It was used by the AutoGeneS method to infer cellular proporrtions by solvong a multi-objective optimization problem. - preferred_normalization: counts - reference: "aliee2021autogenes" - documentation_url: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.nnls.html - repository_url: https://github.com/scipy/scipy +name: nnls +info: + label: NNLS + summary: "NNLS is a decomposition method based on Non-Negative Least Square Regression." + description: | + NonNegative Least Squares (NNLS), is a convex optimization problem with convex constraints. It was used by the AutoGeneS method to infer cellular proporrtions by solvong a multi-objective optimization problem. 
+ preferred_normalization: counts + reference: "aliee2021autogenes" + documentation_url: https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.nnls.html + repository_url: https://github.com/scipy/scipy - resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: - - numpy + - numpy<2.0 - scipy - - type: native + + +runners: + - type: executable - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/methods/nnls/script.py b/src/methods/nnls/script.py index 069ba57..1b3d9bc 100644 --- a/src/methods/nnls/script.py +++ b/src/methods/nnls/script.py @@ -10,7 +10,7 @@ 'output': 'output.h5ad' } meta = { - 'functionality_name': 'nnls' + 'name': 'nnls' } ## VIASH END @@ -50,7 +50,7 @@ uns={ 'cell_type_names': input_spatial.uns['cell_type_names'], 'dataset_id': input_spatial.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] }, obsm={ 'coordinates': input_spatial.obsm['coordinates'], diff --git a/src/methods/rctd/config.vsh.yaml b/src/methods/rctd/config.vsh.yaml index 2d2f82c..d97e16b 100644 --- a/src/methods/rctd/config.vsh.yaml +++ b/src/methods/rctd/config.vsh.yaml @@ -1,31 +1,31 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: rctd - info: - label: RCTD - summary: "RCTD learns cell type profiles from scRNA-seq to decompose cell type mixtures while correcting for differences across sequencing technologies." - description: | - RCTD (Robust Cell Type Decomposition) is a decomposition method that uses signatures learnt from single-cell data to decompose spatial expression of tissues. It is able to use a platform effect normalization step, which normalizes the scRNA-seq cell type profiles to match the platform effects of the spatial transcriptomics dataset. - preferred_normalization: counts - reference: cable2021robust - documentation_url: https://raw.githack.com/dmcable/spacexr/master/vignettes/spatial-transcriptomics.html - repository_url: https://github.com/dmcable/spacexr +name: rctd +info: + label: RCTD + summary: "RCTD learns cell type profiles from scRNA-seq to decompose cell type mixtures while correcting for differences across sequencing technologies." + description: | + RCTD (Robust Cell Type Decomposition) is a decomposition method that uses signatures learnt from single-cell data to decompose spatial expression of tissues. It is able to use a platform effect normalization step, which normalizes the scRNA-seq cell type profiles to match the platform effects of the spatial transcriptomics dataset. + preferred_normalization: counts + reference: cable2021robust + documentation_url: https://raw.githack.com/dmcable/spacexr/master/vignettes/spatial-transcriptomics.html + repository_url: https://github.com/dmcable/spacexr - arguments: - - name: "--fc_cutoff" - type: double - default: 0.5 - description: Minimum log-fold-change (across cell types) for genes to be included in the platform effect normalization step. - - name: "--fc_cutoff_reg" - type: double - default: 0.75 - description: Minimum log-fold-change (across cell types) for genes to be included in the RCTD step. - resources: - - type: r_script - path: script.R +arguments: + - name: "--fc_cutoff" + type: double + default: 0.5 + description: Minimum log-fold-change (across cell types) for genes to be included in the platform effect normalization step. 
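RCTD itself ships as the R package spacexr, so no Python is involved in the component; purely as an illustration of what a log-fold-change cutoff such as `--fc_cutoff` / `--fc_cutoff_reg` selects, the sketch below filters genes by their maximum log-fold-change across cell types. It is not the spacexr implementation, and `mean_expr` is an assumed (cell types x genes) array of mean normalized expression.

```python
# Illustrative only: gene selection by a log-fold-change cutoff (not spacexr/RCTD code).
import numpy as np

def genes_passing_fc_cutoff(mean_expr, fc_cutoff=0.5, pseudocount=1e-9):
    log_expr = np.log(mean_expr + pseudocount)
    # log-fold-change of each gene between its highest- and lowest-expressing cell type
    lfc = log_expr.max(axis=0) - log_expr.min(axis=0)
    return np.where(lfc >= fc_cutoff)[0]       # indices of genes kept
```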
+  - name: "--fc_cutoff_reg" +    type: double +    default: 0.75 +    description: Minimum log-fold-change (across cell types) for genes to be included in the RCTD step. -platforms: +resources: + - type: r_script + path: script.R + +engines: - type: docker image: ghcr.io/openproblems-bio/base_r:1.0.4 setup: @@ -33,7 +33,9 @@ cran: [ Matrix, pak ] - type: r script: 'pak::pkg_install("dmcable/spacexr")' - - type: native + +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu]
diff --git a/src/methods/rctd/script.R b/src/methods/rctd/script.R index f5878ae..dd8c69c 100644 --- a/src/methods/rctd/script.R +++ b/src/methods/rctd/script.R @@ -11,7 +11,7 @@ par <- list( fc_cutoff_reg = 0.75 ) meta <- list( - functionality_name = "rctd", + name = "rctd", cpus = 1 ) ## VIASH END @@ -81,7 +81,7 @@ output <- anndata::AnnData( uns = list( cell_type_names = input_spatial$uns['cell_type_names'], dataset_id = input_spatial$uns[["dataset_id"]], - method_id = meta[["functionality_name"]] + method_id = meta[["name"]] ), obsm = list( coordinates = coordinates,
diff --git a/src/methods/seurat/config.vsh.yaml b/src/methods/seurat/config.vsh.yaml index c82a1c9..3fcd2c6 100644 --- a/src/methods/seurat/config.vsh.yaml +++ b/src/methods/seurat/config.vsh.yaml @@ -1,39 +1,39 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: seurat - info: - label: Seurat - summary: "Seurat method that is based on Canonical Correlation Analysis (CCA)." - description: | - This method applies the 'anchor'-based integration workflow introduced in Seurat v3, that enables the probabilistic transfer of annotations from a reference to a query set. First, mutual nearest neighbors (anchors) are identified from the reference scRNA-seq and query spatial datasets. Then, annotations are transfered from the single cell reference data to the sptial data along with prediction scores for each spot. - preferred_normalization: counts - reference: stuart2019comprehensive - documentation_url: https://satijalab.org/seurat/articles/spatial_vignette - repository_url: https://github.com/satijalab/seurat +name: seurat +info: + label: Seurat + summary: "Seurat method that is based on Canonical Correlation Analysis (CCA)." + description: | + This method applies the 'anchor'-based integration workflow introduced in Seurat v3, which enables the probabilistic transfer of annotations from a reference to a query set. First, mutual nearest neighbors (anchors) are identified from the reference scRNA-seq and query spatial datasets. Then, annotations are transferred from the single-cell reference data to the spatial data along with prediction scores for each spot. + preferred_normalization: counts + reference: stuart2019comprehensive + documentation_url: https://satijalab.org/seurat/articles/spatial_vignette + repository_url: https://github.com/satijalab/seurat - arguments: - - name: "--n_pcs" - type: integer - default: 30 - description: Number of principal components. - - name: "--sctransform_n_cells" - type: integer - default: 5000 - description: Number of cells sampled to build NB regression. +arguments: + - name: "--n_pcs" + type: integer + default: 30 + description: Number of principal components. + - name: "--sctransform_n_cells" + type: integer + default: 5000 + description: Number of cells sampled to build NB regression. 
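The Seurat component runs in R via `script.R`; as a loose Python analogue only (NOT Seurat's CCA/anchor algorithm), the sketch below transfers labels through nearest neighbours in a shared PCA space, which is roughly the role the `--n_pcs` argument plays. All inputs are placeholders.

```python
# Loose analogue of reference-to-query label transfer, for intuition only.
# ref_expr (cells x genes), ref_labels (per-cell labels), query_expr (spots x genes).
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

def transfer_labels(ref_expr, ref_labels, query_expr, n_pcs=30, k=30):
    pca = PCA(n_components=n_pcs).fit(ref_expr)
    ref_emb, query_emb = pca.transform(ref_expr), pca.transform(query_expr)

    types = np.unique(ref_labels)
    onehot = (np.asarray(ref_labels)[:, None] == types[None, :]).astype(float)

    nn = NearestNeighbors(n_neighbors=k).fit(ref_emb)
    _, idx = nn.kneighbors(query_emb)
    scores = onehot[idx].mean(axis=1)          # spots x cell types, rows sum to 1
    return types, scores
```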
- resources: - - type: r_script - path: script.R +resources: + - type: r_script + path: script.R -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_r:1.0.4 setup: - type: r cran: [Matrix, Seurat] - - type: native +runners: + - type: executable - type: nextflow directives: label: [midtime, highmem, midcpu]
diff --git a/src/methods/seurat/script.R b/src/methods/seurat/script.R index 77917dd..04a0a4e 100644 --- a/src/methods/seurat/script.R +++ b/src/methods/seurat/script.R @@ -10,7 +10,7 @@ par <- list( sctransform_n_cells = 500 ) meta <- list( - functionality_name = "seurat" + name = "seurat" ) ## VIASH END @@ -86,7 +86,7 @@ output <- anndata::AnnData( uns = list( cell_type_names = input_spatial$uns['cell_type_names'], dataset_id = input_spatial$uns[["dataset_id"]], - method_id = meta[["functionality_name"]] + method_id = meta[["name"]] ), obsm = list( coordinates = sp_coords,
diff --git a/src/methods/stereoscope/config.vsh.yaml b/src/methods/stereoscope/config.vsh.yaml index f9a74dd..99fa90f 100644 --- a/src/methods/stereoscope/config.vsh.yaml +++ b/src/methods/stereoscope/config.vsh.yaml @@ -1,43 +1,41 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: stereoscope +name: stereoscope - info: - label: Stereoscope - summary: "Stereoscope is a decomposition method based on Negative Binomial regression." - description: | - Stereoscope is a decomposition method based on Negative Binomial regression. It is similar in scope and implementation to cell2location but less flexible to incorporate additional covariates such as batch effects and other type of experimental design annotations. - preferred_normalization: counts - reference: andersson2020single - documentation_url: https://docs.scvi-tools.org/en/stable/user_guide/models/stereoscope.html - repository_url: https://github.com/scverse/scvi-tools +info: + label: Stereoscope + summary: "Stereoscope is a decomposition method based on Negative Binomial regression." + description: | + Stereoscope is a decomposition method based on Negative Binomial regression. It is similar in scope and implementation to cell2location but less flexible in incorporating additional covariates such as batch effects and other types of experimental design annotations. + preferred_normalization: counts + reference: andersson2020single + documentation_url: https://docs.scvi-tools.org/en/stable/user_guide/models/stereoscope.html + repository_url: https://github.com/scverse/scvi-tools - arguments: - - name: "--max_epochs_sc" - type: integer - default: 100 - description: Number of of epochs to train RNAStereoscope model. - - name: "--max_epochs_sp" - type: integer - default: 1000 - description: Number of of epochs to train SpatialStereoscope model. +arguments: + - name: "--max_epochs_sc" + type: integer + default: 100 + description: Number of epochs to train the RNAStereoscope model. + - name: "--max_epochs_sp" + type: integer + default: 1000 + description: Number of epochs to train the SpatialStereoscope model. 
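The two `max_epochs` arguments above map onto the two training stages of the scvi-tools Stereoscope implementation. The sketch below follows the scvi-tools user guide; exact call signatures may differ between versions, and `adata_sc` / `adata_sp` and the `"cell_type"` label key are placeholder assumptions rather than the component's actual inputs.

```python
# Sketch of the two-stage Stereoscope fit via scvi-tools (API per its user guide;
# may vary across versions). Assumes raw counts in .X and a "cell_type" obs column.
from scvi.external import RNAStereoscope, SpatialStereoscope

def stereoscope_proportions(adata_sc, adata_sp, max_epochs_sc=100, max_epochs_sp=1000):
    # stage 1: negative-binomial model of the scRNA-seq reference (--max_epochs_sc)
    RNAStereoscope.setup_anndata(adata_sc, labels_key="cell_type")
    sc_model = RNAStereoscope(adata_sc)
    sc_model.train(max_epochs=max_epochs_sc)

    # stage 2: deconvolve the spatial data against the trained reference (--max_epochs_sp)
    SpatialStereoscope.setup_anndata(adata_sp)
    sp_model = SpatialStereoscope.from_rna_model(adata_sp, sc_model)
    sp_model.train(max_epochs=max_epochs_sp)

    return sp_model.get_proportions()          # spots x cell types
```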
- resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 setup: - type: python packages: - - scvi-tools>=1.1.0 - - type: docker - run: | - pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - - type: native + - scvi-tools + +runners: + - type: executable - type: nextflow directives: label: [hightime, midmem, midcpu, gpu]
diff --git a/src/methods/stereoscope/script.py b/src/methods/stereoscope/script.py index e69bb5f..7a6382f 100644 --- a/src/methods/stereoscope/script.py +++ b/src/methods/stereoscope/script.py @@ -11,7 +11,7 @@ 'max_epochs_sp': 1000 } meta = { - 'functionality_name': 'stereoscope' + 'name': 'stereoscope' } ## VIASH END @@ -55,7 +55,7 @@ uns={ 'cell_type_names': input_spatial.uns['cell_type_names'], 'dataset_id': input_spatial.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] } ) output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/methods/tangram/config.vsh.yaml b/src/methods/tangram/config.vsh.yaml index e1320e7..a004bf8 100644 --- a/src/methods/tangram/config.vsh.yaml +++ b/src/methods/tangram/config.vsh.yaml @@ -1,38 +1,39 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: tangram - info: - label: Tangram - summary: "Tanagram maps single-cell gene expression data onto spatial gene expression data by fitting gene expression on shared genes" - description: | - Tangram is a method to map gene expression signatures from scRNA-seq data to spatial data. It performs the cell type mapping by learning a similarity matrix between single-cell and spatial locations based on gene expression profiles. - preferred_normalization: counts - reference: biancalani2021deep - documentation_url: https://tangram-sc.readthedocs.io/en/latest/index.html - repository_url: https://github.com/broadinstitute/Tangram +name: tangram +info: + label: Tangram + summary: "Tangram maps single-cell gene expression data onto spatial gene expression data by fitting gene expression on shared genes." + description: | + Tangram is a method to map gene expression signatures from scRNA-seq data to spatial data. It performs the cell type mapping by learning a similarity matrix between single-cell and spatial locations based on gene expression profiles. + preferred_normalization: counts + reference: biancalani2021deep + documentation_url: https://tangram-sc.readthedocs.io/en/latest/index.html + repository_url: https://github.com/broadinstitute/Tangram - arguments: - - name: "--num_epochs" - type: integer - default: 1000 - description: Number of epochs to use while mapping single cells to spatial locations. - - name: "--n_markers" - type: integer - default: 100 - description: Number of marker genes to use. +arguments: + - name: "--num_epochs" + type: integer + default: 1000 + description: Number of epochs to use while mapping single cells to spatial locations. + - name: "--n_markers" + type: integer + default: 100 + description: Number of marker genes to use. 
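As a pointer for how `--num_epochs` and `--n_markers` above feed into the tangram-sc package, here is a hedged sketch following Tangram's documented API (function names may shift between releases). `adata_sc` / `adata_sp` and the `"cell_type"` annotation are placeholders, not the component's exact inputs.

```python
# Sketch of the Tangram workflow (per the tangram-sc docs; not the component's script).
import scanpy as sc
import tangram as tg

def tangram_proportions(adata_sc, adata_sp, num_epochs=1000, n_markers=100):
    # pick n_markers marker genes per cell type from the reference
    sc.tl.rank_genes_groups(adata_sc, groupby="cell_type", use_raw=False)
    markers = set()
    for group in adata_sc.obs["cell_type"].unique():
        df = sc.get.rank_genes_groups_df(adata_sc, group=group)
        markers.update(df.head(n_markers)["names"])

    tg.pp_adatas(adata_sc, adata_sp, genes=list(markers))
    ad_map = tg.map_cells_to_space(adata_sc, adata_sp, num_epochs=num_epochs)
    tg.project_cell_annotations(ad_map, adata_sp, annotation="cell_type")
    # per-spot cell-type scores land in adata_sp.obsm["tangram_ct_pred"]
    return adata_sp.obsm["tangram_ct_pred"]
```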
- resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: tangram-sc - - type: native + +runners: + - type: executable - type: nextflow directives: label: [midtime,midmem, midcpu] diff --git a/src/methods/tangram/script.py b/src/methods/tangram/script.py index 544664f..1de1b82 100644 --- a/src/methods/tangram/script.py +++ b/src/methods/tangram/script.py @@ -13,7 +13,7 @@ 'n_markers': 100 } meta = { - 'functionality_name': 'tangram' + 'name': 'tangram' } ## VIASH END @@ -78,7 +78,7 @@ uns={ 'cell_type_names': input_spatial.uns['cell_type_names'], 'dataset_id': input_spatial.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] } ) output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/vanillanmf/config.vsh.yaml b/src/methods/vanillanmf/config.vsh.yaml index 7341cee..a7d8ba5 100644 --- a/src/methods/vanillanmf/config.vsh.yaml +++ b/src/methods/vanillanmf/config.vsh.yaml @@ -1,37 +1,38 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: vanillanmf - info: - label: NMF - summary: "NMF reconstructs gene expression as a weighted combination of cell type signatures defined by scRNA-seq." - description: | - NMF is a decomposition method based on Non-negative Matrix Factorization (NMF) that reconstructs expression of each spatial location as a weighted combination of cell-type signatures defined by scRNA-seq. It is a simpler baseline than NMFreg as it only performs the NMF step based on mean expression signatures of cell types, returning the weights loading of the NMF as (normalized) cell type proportions, without the regression step. - preferred_normalization: counts - reference: cichocki2009fast - documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html - repository_url: https://github.com/scikit-learn/scikit-learn/blob/92c9b1866/sklearn/decomposition/ +name: vanillanmf +info: + label: NMF + summary: "NMF reconstructs gene expression as a weighted combination of cell type signatures defined by scRNA-seq." + description: | + NMF is a decomposition method based on Non-negative Matrix Factorization (NMF) that reconstructs expression of each spatial location as a weighted combination of cell-type signatures defined by scRNA-seq. It is a simpler baseline than NMFreg as it only performs the NMF step based on mean expression signatures of cell types, returning the weights loading of the NMF as (normalized) cell type proportions, without the regression step. + preferred_normalization: counts + reference: cichocki2009fast + documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html + repository_url: https://github.com/scikit-learn/scikit-learn/blob/92c9b1866/sklearn/decomposition/ - arguments: - - name: "--max_iter" - type: integer - default: 4000 - description: Maximum number of iterations before timing out. +arguments: + - name: "--max_iter" + type: integer + default: 4000 + description: Maximum number of iterations before timing out. 
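The vanillaNMF description above amounts to seeding an NMF with the cell-type mean signatures and reading the loadings as proportions; a hedged sketch of that idea (not the component's exact script) follows. `spot_counts` and `signatures` are placeholder dense arrays, and the custom initialization is an assumption about how the seeding could be done with scikit-learn.

```python
# Hedged sketch of the "vanilla" NMF baseline (placeholder inputs, assumed seeding).
# spot_counts: spots x genes; signatures: cell_types x genes (mean expression per type).
import numpy as np
from sklearn.decomposition import NMF

def vanillanmf_proportions(spot_counts, signatures, max_iter=4000):
    k = signatures.shape[0]
    # seed H with the cell-type signatures so each factor corresponds to one cell type
    W0 = np.full((spot_counts.shape[0], k), 1.0 / k)
    H0 = signatures.astype(float) + 1e-12
    nmf = NMF(n_components=k, init="custom", max_iter=max_iter)
    W = nmf.fit_transform(spot_counts, W=W0, H=H0)     # spots x cell_types loadings
    return W / (W.sum(axis=1, keepdims=True) + 1e-12)  # normalize rows to proportions
```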
- resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: - - numpy + - numpy<2.0 - scipy - scikit-learn - - type: native + +runners: + - type: executable - type: nextflow directives: label: [midtime,midmem, midcpu] diff --git a/src/methods/vanillanmf/script.py b/src/methods/vanillanmf/script.py index ff55079..eae0c1a 100644 --- a/src/methods/vanillanmf/script.py +++ b/src/methods/vanillanmf/script.py @@ -11,7 +11,7 @@ 'max_iter': 4000 } meta = { - 'functionality_name': 'vanillanmf' + 'name': 'vanillanmf' } ## VIASH END @@ -64,7 +64,7 @@ uns={ 'cell_type_names': input_spatial.uns['cell_type_names'], 'dataset_id': input_spatial.uns['dataset_id'], - 'method_id': meta['functionality_name'] + 'method_id': meta['name'] }, obsm={ 'coordinates': input_spatial.obsm['coordinates'], diff --git a/src/metrics/r2/config.vsh.yaml b/src/metrics/r2/config.vsh.yaml index bc5585f..73efeed 100644 --- a/src/metrics/r2/config.vsh.yaml +++ b/src/metrics/r2/config.vsh.yaml @@ -1,32 +1,33 @@ __merge__: ../../api/comp_metric.yaml -functionality: - name: r2 - info: - metrics: - - name: r2 - label: R2 - summary: "R2 represents the proportion of variance in the true proportions which is explained by the predicted proportions." - description: | - R2, or the “coefficient of determination”, reports the fraction of the true proportion values' variance that can be explained by the predicted proportion values. The best score, and upper bound, is 1.0. There is no fixed lower bound for the metric. The uniform/non-weighted average across all cell types/states is used to summarise performance. By default, cases resulting in a score of NaN (perfect predictions) or -Inf (imperfect predictions) are replaced with 1.0 (perfect predictions) or 0.0 (imperfect predictions) respectively. - reference: miles2005rsquared - documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html - repository_url: https://github.com/scikit-learn/scikit-learn/tree/5c4aa5d0d90ba66247d675d4c3fc2fdfba3c39ff - min: -inf - max: 1 - maximize: true +name: r2 +info: + metrics: + - name: r2 + label: R2 + summary: "R2 represents the proportion of variance in the true proportions which is explained by the predicted proportions." + description: | + R2, or the “coefficient of determination”, reports the fraction of the true proportion values' variance that can be explained by the predicted proportion values. The best score, and upper bound, is 1.0. There is no fixed lower bound for the metric. The uniform/non-weighted average across all cell types/states is used to summarise performance. By default, cases resulting in a score of NaN (perfect predictions) or -Inf (imperfect predictions) are replaced with 1.0 (perfect predictions) or 0.0 (imperfect predictions) respectively. 
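The scoring rule spelled out in the R2 metric description above is short enough to sketch directly: per-cell-type R2 via scikit-learn, NaN replaced by 1.0 and -Inf by 0.0, then an unweighted mean across cell types. This is a sketch of the described rule, not the component's `script.py`; `true_props` / `pred_props` are placeholder (spots x cell types) arrays.

```python
# Sketch of the R2 metric as described (not the component's script).
import numpy as np
from sklearn.metrics import r2_score

def r2_metric(true_props, pred_props):
    scores = r2_score(true_props, pred_props, multioutput="raw_values")  # one per cell type
    scores = np.where(np.isnan(scores), 1.0, scores)       # perfect predictions
    scores = np.where(np.isneginf(scores), 0.0, scores)    # degenerate predictions
    return float(scores.mean())                            # unweighted average
```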
+      reference: miles2005rsquared +      documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html +      repository_url: https://github.com/scikit-learn/scikit-learn/tree/5c4aa5d0d90ba66247d675d4c3fc2fdfba3c39ff +      min: -inf +      max: 1 +      maximize: true - resources: - - type: python_script - path: script.py +resources: + - type: python_script + path: script.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: scikit-learn - - type: native + +runners: + - type: executable - type: nextflow directives: - label: [midtime,midmem, midcpu] + label: [midtime, midmem, midcpu]
diff --git a/src/metrics/r2/script.py b/src/metrics/r2/script.py index 35420e0..aca21a4 100644 --- a/src/metrics/r2/script.py +++ b/src/metrics/r2/script.py @@ -8,7 +8,7 @@ 'output': 'score.h5ad' } meta = { - 'functionality_name': 'r2' + 'name': 'r2' } ## VIASH END
diff --git a/src/process_dataset/dataset_simulator/config.vsh.yaml b/src/process_dataset/dataset_simulator/config.vsh.yaml new file mode 100644 index 0000000..383f4e1 --- /dev/null +++ b/src/process_dataset/dataset_simulator/config.vsh.yaml @@ -0,0 +1,76 @@ +namespace: process_dataset +name: dataset_simulator +info: + type: process_dataset + type_info: + label: Dataset simulator + summary: Simulate cell aggregates from single-cell data. + description: | + The dataset simulator creates cell-aggregates from the single-cell dataset by sampling from a Dirichlet distribution. The simulated data consists of the spatial expression matrix, the XY coordinates of the spots, the cell-type proportions in each spot, and the reference single-cell data. + variants: + alpha_1: + alpha: 1 + alpha_5: + alpha: 5 + alpha_0_5: + alpha: 0.5 +arguments: + - name: "--input" + type: file + description: Single-cell reference dataset + direction: input + required: true + __merge__: ../../api/file_common_dataset.yaml + - name: "--alpha" + type: double + description: Alpha value to use for generating synthetic dataset + default: 1.0 + - name: "--n_obs" + type: integer + description: Number of spatial observations to generate. Default value is 100. + default: 100 + - name: "--cell_lb" + type: integer + description: Lower bound for number of cells at each spot. Default value is 10. + default: 10 + - name: "--cell_ub" + type: integer + description: Upper bound for number of cells at each spot. Default value is 30. + default: 30 + - name: "--umi_lb" + type: integer + description: Lower bound for number of UMIs at each spot. Default value is 1000. + default: 1000 + - name: "--umi_ub" + type: integer + description: Upper bound for number of UMIs at each spot. 
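To make the simulator arguments above concrete (alpha, n_obs, cell_lb/cell_ub, umi_lb/umi_ub), here is a hedged numpy sketch of Dirichlet-based spot simulation. It is not the component's actual `script.py`; `sc_counts` and `cell_types` are placeholders for the single-cell reference, and the UMI downsampling step is an assumption about one reasonable implementation.

```python
# Hedged sketch of Dirichlet-based cell aggregation (not the component's script).
# sc_counts: dense cells x genes counts; cell_types: per-cell labels.
import numpy as np

def simulate_spots(sc_counts, cell_types, alpha=1.0, n_obs=100,
                   cell_lb=10, cell_ub=30, umi_lb=1000, umi_ub=5000, seed=0):
    rng = np.random.default_rng(seed)
    labels = np.asarray(cell_types)
    types = np.unique(labels)
    spots, props = [], []
    for _ in range(n_obs):
        p = rng.dirichlet(np.full(len(types), alpha))          # cell-type proportions
        n_cells = rng.integers(cell_lb, cell_ub + 1)           # cells pooled in this spot
        counts_per_type = rng.multinomial(n_cells, p)
        idx = np.concatenate([
            rng.choice(np.where(labels == t)[0], size=c, replace=True)
            for t, c in zip(types, counts_per_type) if c > 0
        ])
        agg = sc_counts[idx].sum(axis=0)                       # pooled gene counts
        target = rng.integers(umi_lb, umi_ub + 1)              # downsample to a UMI depth
        agg = rng.multinomial(target, agg / agg.sum())
        spots.append(agg)
        props.append(counts_per_type / n_cells)
    return np.vstack(spots), np.vstack(props), types
```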
+ default: 5000 + - name: "--simulated_data" + type: file + direction: output + description: Simulated dataset + required: true + __merge__: ../../api/file_simulated_dataset.yaml +resources: + - type: python_script + path: script.py +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/common/cxg_mouse_pancreas_atlas + dest: resources_test/common/cxg_mouse_pancreas_atlas + +engines: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - numpy<2.0 + - scanpy + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, highcpu] \ No newline at end of file diff --git a/src/dataset_simulator/script.py b/src/process_dataset/dataset_simulator/script.py similarity index 98% rename from src/dataset_simulator/script.py rename to src/process_dataset/dataset_simulator/script.py index 901d7de..0069219 100644 --- a/src/dataset_simulator/script.py +++ b/src/process_dataset/dataset_simulator/script.py @@ -14,10 +14,10 @@ "cell_ub": 30, "umi_lb": 1000, "umi_ub": 5000, - "simulated_data": "dataset_simulated.h5ad" + "simulated_data": "simulated_dataset.h5ad" } meta = { - "functionality_name": "dataset_simulator", + "name": "dataset_simulator", "resources_dir": "src/tasks/spatial_decomposition/dataset_simulator", } ## VIASH END diff --git a/src/process_dataset/split_dataset/config.vsh.yaml b/src/process_dataset/split_dataset/config.vsh.yaml index c430b01..0f0070d 100644 --- a/src/process_dataset/split_dataset/config.vsh.yaml +++ b/src/process_dataset/split_dataset/config.vsh.yaml @@ -1,13 +1,47 @@ -__merge__: ../../api/comp_process_dataset.yaml -functionality: - name: split_dataset - resources: +namespace: process_dataset +name: split_dataset +info: + type: process_dataset + type_info: + label: Data processor + summary: A spatial decomposition dataset processor. + description: | + Prepare a common dataset for the spatial_decomposition task. 
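Purely for illustration of what the split component's three outputs correspond to, the sketch below splits one AnnData into a single-cell reference, a masked spatial file, and a solution file. The field names (`obs["modality"]`, `obsm["proportions_true"]`) are assumptions for the sake of the example, not the task's actual file schema.

```python
# Illustrative split only; field names are assumptions, not the real schema.
import anndata as ad

def split_dataset(path_in, path_sc, path_spatial_masked, path_solution):
    adata = ad.read_h5ad(path_in)
    single_cell = adata[adata.obs["modality"] == "sc"].copy()
    solution = adata[adata.obs["modality"] == "sp"].copy()
    spatial_masked = solution.copy()
    del spatial_masked.obsm["proportions_true"]     # hide the ground truth from methods
    single_cell.write_h5ad(path_sc, compression="gzip")
    spatial_masked.write_h5ad(path_spatial_masked, compression="gzip")
    solution.write_h5ad(path_solution, compression="gzip")
```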
+arguments: + - name: "--input" + __merge__: ../../api/file_simulated_dataset.yaml + direction: input + required: true + - name: "--output_single_cell" + __merge__: ../../api/file_single_cell.yaml + direction: output + required: true + - name: "--output_spatial_masked" + __merge__: ../../api/file_spatial_masked.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: ../../api/file_solution.yaml + direction: output + required: true + +resources: - type: python_script path: script.py - path: /common/helper_functions/subset_anndata.py -platforms: + +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + dest: resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas + +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 + +runners: + - type: executable - type: nextflow directives: - label: [midtime, highmem, highcpu] + label: [midtime, highmem, highcpu] \ No newline at end of file diff --git a/src/process_dataset/split_dataset/script.py b/src/process_dataset/split_dataset/script.py index c1ae956..af81bdb 100644 --- a/src/process_dataset/split_dataset/script.py +++ b/src/process_dataset/split_dataset/script.py @@ -4,13 +4,13 @@ ## VIASH START par = { - "input": "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/dataset_simulated.h5ad", + "input": "resources_test/spatial_decomposition/cxg_mouse_pancreas_atlas/simulated_dataset.h5ad", "output_spatial_masked": "spatial_masked.h5ad", "output_single_cell": "single_cell_ref.h5ad", "output_solution": "solution.h5ad", } meta = { - "functionality_name": "split_dataset", + "name": "split_dataset", "resources_dir": "src/process_dataset/split_dataset", "config": "target/nextflow/process_dataset/split_dataset/.config.vsh.yaml" } @@ -22,7 +22,7 @@ print(">> Load dataset", flush=True) adata = ad.read_h5ad(par["input"]) -# TO DO: Non-integer values in the counts layer are detected as un-normalized data by some methods, thereby causing them to fail. +# Non-integer values in the counts layer are detected as un-normalized data by some methods, thereby causing them to fail. Using floor() function to avoid this. 
adata.layers['counts'] = adata.layers['counts'].floor() print(">> Figuring out which data needs to be copied to which output file", flush=True) diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index 47f3332..ce30fd2 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -1,49 +1,48 @@ -functionality: - name: "process_datasets" - namespace: "workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: "/src/api/file_common_dataset.yaml" - required: true - direction: input - - name: "--alpha" - type: double - required: false - direction: input - - name: Outputs - arguments: - - name: "--output_single_cell" - __merge__: /src/api/file_single_cell.yaml - required: true - direction: output - - name: "--output_spatial_masked" - __merge__: /src/api/file_spatial_masked.yaml - required: true - direction: output - - name: "--output_solution" - __merge__: /src/api/file_solution.yaml - required: true - direction: output - - name: "--simulated_data" - type: file - required: false - direction: output - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /common/nextflow_helpers/helper.nf - dependencies: - - name: common/check_dataset_schema - repository: openproblems_v2 - - name: dataset_simulator - - name: process_dataset/split_dataset - repositories: - - name: openproblems_v2 - type: github - repo: openproblems-bio/openproblems-v2 - tag: main_build -platforms: +namespace: "workflows" +name: "process_datasets" +argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: "/src/api/file_common_dataset.yaml" + required: true + direction: input + - name: "--alpha" + type: double + required: false + direction: input + - name: Outputs + arguments: + - name: "--output_single_cell" + __merge__: /src/api/file_single_cell.yaml + required: true + direction: output + - name: "--output_spatial_masked" + __merge__: /src/api/file_spatial_masked.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/api/file_solution.yaml + required: true + direction: output + - name: "--simulated_data" + type: file + required: false + direction: output +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: common/check_dataset_schema + repository: openproblems_v2 + - name: process_dataset/dataset_simulator + - name: process_dataset/split_dataset +repositories: + - name: openproblems_v2 + type: github + repo: openproblems-bio/openproblems-v2 + tag: main_build +runners: - type: nextflow diff --git a/src/workflows/process_datasets/run_test.sh b/src/workflows/process_datasets/run_test.sh new file mode 100644 index 0000000..1fae69c --- /dev/null +++ b/src/workflows/process_datasets/run_test.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/spatial_decomposition +echo "Running process_dataset" +nextflow run . 
\ + -main-script target/nextflow/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --input_states "$RAW_DATA/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_spatial_masked": "$id/spatial_masked.h5ad", "output_single_cell": "$id/single_cell_ref.h5ad", "output_solution": "$id/solution.h5ad", "alpha": 1.0, "simulated_data": "$id/simulated_dataset.h5ad"}' \ + --publish_dir "$DATASET_DIR" \ + --output_state '$id/state.yaml' \ No newline at end of file diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 590ca6b..fe39aa1 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -1,76 +1,75 @@ -functionality: - name: "run_benchmark" - namespace: "workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input_single_cell" - __merge__: "/src/api/file_single_cell.yaml" - required: true - direction: input - - name: "--input_spatial_masked" - __merge__: "/src/api/file_spatial_masked.yaml" - required: true - direction: input - - name: "--input_solution" - __merge__: "/src/api/file_solution.yaml" - required: true - direction: input - - name: Outputs - arguments: - - name: "--output_scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: score_uns.yaml - - name: "--output_method_configs" - type: file - required: true - direction: output - default: method_configs.yaml - - name: "--output_metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - - name: "--output_dataset_info" - type: file - required: true - direction: output - default: dataset_uns.yaml - - name: "--output_task_info" - type: file - required: true - direction: output - default: task_info.yaml - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - type: file - path: "../../api/task_info.yaml" - dependencies: - - name: common/check_dataset_schema - repository: openproblems_v2 - - name: common/extract_metadata - repository: openproblems_v2 - - name: control_methods/random_proportions - - name: control_methods/true_proportions - - name: methods/cell2location - - name: methods/destvi - - name: methods/nmfreg - - name: methods/nnls - - name: methods/rctd - - name: methods/seurat - - name: methods/stereoscope - - name: methods/tangram - - name: methods/vanillanmf - - name: metrics/r2 - repositories: - - name: openproblems_v2 - type: github - repo: openproblems-bio/openproblems-v2 - tag: main_build -platforms: +name: "run_benchmark" +namespace: "workflows" +argument_groups: + - name: Inputs + arguments: + - name: "--input_single_cell" + __merge__: "/src/api/file_single_cell.yaml" + required: true + direction: input + - name: "--input_spatial_masked" + __merge__: "/src/api/file_spatial_masked.yaml" + required: true + direction: input + - name: "--input_solution" + __merge__: "/src/api/file_solution.yaml" + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - 
name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "../../api/task_info.yaml" +dependencies: + - name: common/check_dataset_schema + repository: openproblems_v2 + - name: common/extract_metadata + repository: openproblems_v2 + - name: control_methods/random_proportions + - name: control_methods/true_proportions + - name: methods/cell2location + - name: methods/destvi + - name: methods/nmfreg + - name: methods/nnls + - name: methods/rctd + - name: methods/seurat + - name: methods/stereoscope + - name: methods/tangram + - name: methods/vanillanmf + - name: metrics/r2 +repositories: + - name: openproblems_v2 + type: github + repo: openproblems-bio/openproblems-v2 + tag: main_build +runners: - type: nextflow \ No newline at end of file diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 82a29fe..ca2216c 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -63,18 +63,18 @@ workflow run_wf { // use the 'filter' argument to only run a method on the normalisation the component is asking for filter: { id, state, comp -> def norm = state.dataset_uns.normalization_id - def pref = comp.config.functionality.info.preferred_normalization + def pref = comp.config.info.preferred_normalization // if the preferred normalisation is none at all, // we can pass whichever dataset we want def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref - def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) + def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) method_check && norm_check }, // define a new 'id' by appending the method name to the dataset id id: { id, state, comp -> - id + "." + comp.config.functionality.name + id + "." + comp.config.name }, // use 'fromState' to fetch the arguments the component requires from the overall state @@ -83,7 +83,7 @@ workflow run_wf { input_single_cell: state.input_single_cell, input_spatial_masked: state.input_spatial_masked ] - if (comp.config.functionality.info.type == "control_method") { + if (comp.config.info.type == "control_method") { new_args.input_solution = state.input_solution } new_args @@ -92,7 +92,7 @@ workflow run_wf { // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp -> state + [ - method_id: comp.config.functionality.name, + method_id: comp.config.name, method_output: output.output ] } @@ -102,7 +102,7 @@ workflow run_wf { | runEach( components: metrics, id: { id, state, comp -> - id + "." + comp.config.functionality.name + id + "." 
+ comp.config.name }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: { id, state, comp -> @@ -114,7 +114,7 @@ workflow run_wf { // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp -> state + [ - metric_id: comp.config.functionality.name, + metric_id: comp.config.name, metric_output: output.output ] } @@ -127,6 +127,12 @@ workflow run_wf { // extract the dataset metadata dataset_meta_ch = dataset_ch + + // only keep one of the normalization methods + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + | joinStates { ids, states -> // store the dataset metadata in a file def dataset_uns = states.collect{state -> diff --git a/src/workflows/run_benchmark/run_test.sh b/src/workflows/run_benchmark/run_test.sh new file mode 100644 index 0000000..633e106 --- /dev/null +++ b/src/workflows/run_benchmark/run_test.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +DATASETS_DIR="resources_test/spatial_decomposition" +OUTPUT_DIR="output/temp" +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi +echo "Running the benchmark" +nextflow run . \ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_single_cell:output_single_cell,input_spatial_masked:output_spatial_masked,input_solution:output_solution' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" \ No newline at end of file
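After running the test script above, a quick way to inspect the benchmark output is to load the published scores file; the path follows the `--settings` / `--publish_dir` values in the script, and the exact structure of `scores.yaml` (a list of per-run entries with dataset, method and metric fields) is an assumption here.

```python
# Hedged helper for eyeballing the benchmark scores produced by run_test.sh.
import yaml

with open("output/temp/scores.yaml") as f:
    scores = yaml.safe_load(f)
for entry in scores:
    print(entry.get("dataset_id"), entry.get("method_id"), entry.get("metric_values"))
```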