From f4dcca2160d5e278673c8d9ef136f6c2c91e68a9 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Wed, 18 Sep 2024 21:39:31 +0200 Subject: [PATCH] Update workflows (#2) * use cxg_mouse_pancreas_atlas instead of pancreas * update dependencies and remove unnecessary arguments * update scripts and test resources * fix paths * update benchmarking workflow * update benchmark workflow * fix default * Rename "functionality_name" to "name" * Update benchmark workflow Now runs on local test * Update run_benchmark workflow config * Set numpy<2.0.0 for pymde and phate methods Avoids "numpy.ndarray size changed, may indicate binary incompatibility" error * also create a state.yaml file * Update run_test_seqeracloud.sh script * Update run full benchmark scripts * Update CHANGELOG * Add all methods/metrics to benchmark workflow * Add dependencies to benchmark workflow config --------- Co-authored-by: Luke Zappia --- CHANGELOG.md | 6 + README.md | 10 +- _viash.yaml | 15 +- common | 2 +- scripts/.gitignore | 3 - scripts/create_component/.gitignore | 2 + .../create_component/create_python_method.sh | 8 + .../create_component/create_python_metric.sh | 8 + scripts/create_component/create_r_method.sh | 8 + scripts/create_component/create_r_metric.sh | 8 + scripts/create_readme.sh | 4 +- scripts/create_resources/resources.sh | 26 ++ scripts/create_resources/test_resources.sh | 44 ++++ scripts/create_test_resources.sh | 38 --- scripts/download_resources.sh | 9 - scripts/project/build_all_components.sh | 6 + .../project/build_all_docker_containers.sh | 7 + scripts/{ => project}/test_all_components.sh | 4 +- scripts/run_benchmark.sh | 23 -- scripts/run_benchmark/run_full_local.sh | 47 ++++ scripts/run_benchmark/run_full_seqeracloud.sh | 40 +++ scripts/run_benchmark/run_test_local.sh | 32 +++ scripts/run_benchmark/run_test_seqeracloud.sh | 35 +++ scripts/run_benchmark_test.sh | 19 -- scripts/sync_resources.sh | 5 + src/api/comp_control_method.yaml | 4 +- src/api/comp_method.yaml | 4 +- src/api/comp_metric.yaml | 4 +- src/api/comp_process_dataset.yaml | 4 +- src/api/file_common_dataset.yaml | 2 +- src/api/file_dataset.yaml | 2 +- src/api/file_embedding.yaml | 2 +- src/api/file_score.yaml | 2 +- src/api/file_solution.yaml | 2 +- src/control_methods/random_features/script.py | 6 +- .../spectral_features/script.py | 6 +- src/control_methods/true_features/script.py | 6 +- .../process_dataset/config.vsh.yaml | 10 - src/data_processors/process_dataset/script.py | 20 +- src/methods/densmap/script.py | 6 +- src/methods/diffusion_map/script.R | 4 +- src/methods/ivis/script.py | 6 +- src/methods/lmds/script.R | 4 +- src/methods/neuralee/script.py | 6 +- src/methods/pca/script.py | 6 +- src/methods/phate/config.vsh.yaml | 1 + src/methods/phate/script.py | 6 +- src/methods/pymde/config.vsh.yaml | 4 +- src/methods/pymde/script.py | 6 +- src/methods/simlr/script.R | 6 +- src/methods/tsne/script.py | 6 +- src/methods/umap/script.py | 6 +- src/metrics/clustering_performance/script.py | 6 +- src/metrics/coranking/script.R | 4 +- src/metrics/density_preservation/script.py | 4 +- src/metrics/distance_correlation/script.py | 4 +- src/metrics/trustworthiness/script.py | 4 +- .../process_datasets/config.vsh.yaml | 44 ++-- src/workflows/process_datasets/main.nf | 129 +--------- src/workflows/run_benchmark/config.vsh.yaml | 130 +++++----- src/workflows/run_benchmark/main.nf | 229 +++++------------- 61 files changed, 529 insertions(+), 575 deletions(-) delete mode 100644 scripts/.gitignore create mode 100644
scripts/create_component/.gitignore create mode 100755 scripts/create_component/create_python_method.sh create mode 100755 scripts/create_component/create_python_metric.sh create mode 100755 scripts/create_component/create_r_method.sh create mode 100755 scripts/create_component/create_r_metric.sh create mode 100755 scripts/create_resources/resources.sh create mode 100755 scripts/create_resources/test_resources.sh delete mode 100644 scripts/create_test_resources.sh delete mode 100755 scripts/download_resources.sh create mode 100755 scripts/project/build_all_components.sh create mode 100755 scripts/project/build_all_docker_containers.sh rename scripts/{ => project}/test_all_components.sh (75%) delete mode 100644 scripts/run_benchmark.sh create mode 100755 scripts/run_benchmark/run_full_local.sh create mode 100755 scripts/run_benchmark/run_full_seqeracloud.sh create mode 100755 scripts/run_benchmark/run_test_local.sh create mode 100755 scripts/run_benchmark/run_test_seqeracloud.sh delete mode 100644 scripts/run_benchmark_test.sh create mode 100755 scripts/sync_resources.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 873699a..fe0b62b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,12 @@ ## BUGFIXES --> +# dimensionality_reduction 0.1.1 2024-09-18 + +## NEW FUNCTIONALITY + +* Updated workflows to work correctly for this task (PR #2) + # dimensionality_reduction 0.1.0 2024-09-05 ## NEW FUNCTIONALITY diff --git a/README.md b/README.md index 729e6ea..7497578 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ flowchart LR The dataset to pass to a method. -Example file: `resources_test/common/pancreas/dataset.h5ad` +Example file: `resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad` Format: @@ -149,7 +149,7 @@ Arguments: The dataset to pass to a method. Example file: -`resources_test/dimensionality_reduction/pancreas/dataset.h5ad` +`resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad` Format: @@ -181,7 +181,7 @@ Data structure: The data for evaluating a dimensionality reduction. Example file: -`resources_test/dimensionality_reduction/pancreas/solution.h5ad` +`resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/solution.h5ad` Format: @@ -268,7 +268,7 @@ Arguments: A dataset with dimensionality reduction embedding. Example file: -`resources_test/dimensionality_reduction/pancreas/embedding.h5ad` +`resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/embedding.h5ad` Format: @@ -298,7 +298,7 @@ Data structure: Metric score file Example file: -`resources_test/dimensionality_reduction/pancreas/score.h5ad` +`resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/score.h5ad` Format: diff --git a/_viash.yaml b/_viash.yaml index 09c27e5..d3142cf 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -67,11 +67,11 @@ info: # Step 5: Replace the task_template to the name of the task. test_resources: - type: s3 - path: s3://openproblems-data/resources_test/common/pancreas/ - dest: resources_test/common/pancreas/ + path: s3://openproblems-data/resources_test/common/cxg_mouse_pancreas_atlas/ + dest: resources_test/common/cxg_mouse_pancreas_atlas/ - type: s3 - path: s3://openproblems-data/resources_test/dimensionality_reduction/ - dest: resources_test/dimensionality_reduction + path: s3://openproblems-data/resources_test/task_dimensionality_reduction/ + dest: resources_test/task_dimensionality_reduction # Step 6: Update the authors of the task. 
authors: @@ -121,7 +121,8 @@ config_mods: | .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } repositories: - - name: openproblems-v2 + - name: core type: github - repo: openproblems-bio/openproblems-v2 - tag: main_build + repo: openproblems-bio/core + tag: build/main + path: viash/core diff --git a/common b/common index 1660eef..f264283 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 1660eef0b1172c1059270fff77f9abc0a5fc1ea4 +Subproject commit f2642835c89264e0a43e87e3f6c588c6be4902e7 diff --git a/scripts/.gitignore b/scripts/.gitignore deleted file mode 100644 index 2f7ffd3..0000000 --- a/scripts/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -add_a_method.sh -add_a_control_method.sh -add_a_metric.sh \ No newline at end of file diff --git a/scripts/create_component/.gitignore b/scripts/create_component/.gitignore new file mode 100644 index 0000000..09380f9 --- /dev/null +++ b/scripts/create_component/.gitignore @@ -0,0 +1,2 @@ +# if users change the scripts, the changes should not be committed. +/create_*_*.sh \ No newline at end of file diff --git a/scripts/create_component/create_python_method.sh b/scripts/create_component/create_python_method.sh new file mode 100755 index 0000000..b96c05d --- /dev/null +++ b/scripts/create_component/create_python_method.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +common/scripts/create_component \ + --name my_python_method \ + --language python \ + --type method diff --git a/scripts/create_component/create_python_metric.sh b/scripts/create_component/create_python_metric.sh new file mode 100755 index 0000000..d36bc7a --- /dev/null +++ b/scripts/create_component/create_python_metric.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +common/scripts/create_component \ + --name my_python_metric \ + --language python \ + --type metric diff --git a/scripts/create_component/create_r_method.sh b/scripts/create_component/create_r_method.sh new file mode 100755 index 0000000..0ab0394 --- /dev/null +++ b/scripts/create_component/create_r_method.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +common/scripts/create_component \ + --name my_r_method \ + --language r \ + --type method diff --git a/scripts/create_component/create_r_metric.sh b/scripts/create_component/create_r_metric.sh new file mode 100755 index 0000000..1a4794e --- /dev/null +++ b/scripts/create_component/create_r_metric.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +common/scripts/create_component \ + --name my_r_metric \ + --language r \ + --type metric diff --git a/scripts/create_readme.sh b/scripts/create_readme.sh index 5a5544a..0ed7aaf 100755 --- a/scripts/create_readme.sh +++ b/scripts/create_readme.sh @@ -1,3 +1,5 @@ #!/bin/bash -common/scripts/create_task_readme \ No newline at end of file +set -e + +common/scripts/create_task_readme --input src/api diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh new file mode 100755 index 0000000..a733e4c --- /dev/null +++ b/scripts/create_resources/resources.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +cat > /tmp/params.yaml << 'HERE' +input_states: s3://openproblems-data/resources/datasets/**/state.yaml 
+rename_keys: 'input:output_dataset' +output_state: '$id/state.yaml' +settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' +publish_dir: s3://openproblems-data/resources/task_dimensionality_reduction/datasets/ +HERE + +tw launch https://github.com/openproblems-bio/task_dimensionality_reduction.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config common/nextflow_helpers/labels_tw.config \ + --labels task_dimensionality_reduction,process_datasets diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh new file mode 100755 index 0000000..287504a --- /dev/null +++ b/scripts/create_resources/test_resources.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/task_dimensionality_reduction + +mkdir -p $DATASET_DIR + +# process dataset +echo Running process_dataset +viash run src/data_processors/process_dataset/config.vsh.yaml -- \ + --input $RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad \ + --output_dataset $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ + --output_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad + +# run one method +viash run src/methods/pca/config.vsh.yaml -- \ + --input $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ + --output $DATASET_DIR/cxg_mouse_pancreas_atlas/embedding.h5ad + +# run one metric +viash run src/metrics/clustering_performance/config.vsh.yaml -- \ + --input_embedding $DATASET_DIR/cxg_mouse_pancreas_atlas/embedding.h5ad \ + --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ + --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad + +cat > $DATASET_DIR/cxg_mouse_pancreas_atlas/state.yaml << HERE +id: cxg_mouse_pancreas_atlas +output_dataset: !file dataset.h5ad +output_solution: !file solution.h5ad +HERE + +# only run this if you have access to the openproblems-data bucket +aws s3 sync --profile op \ + "resources_test/task_dimensionality_reduction" \ + s3://openproblems-data/resources_test/task_dimensionality_reduction \ + --delete --dryrun diff --git a/scripts/create_test_resources.sh b/scripts/create_test_resources.sh deleted file mode 100644 index a39f8c4..0000000 --- a/scripts/create_test_resources.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/task_template - -mkdir -p $DATASET_DIR - -# process dataset -echo Running process_dataset -nextflow run . 
\ - -main-script target/nextflow/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - --input_states "$RAW_DATA/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' \ - --publish_dir "$DATASET_DIR" \ - --output_state '$id/state.yaml' - -# run one method -viash run src/methods/logistic_regression/config.vsh.yaml -- \ - --input_train $DATASET_DIR/pancreas/train.h5ad \ - --input_test $DATASET_DIR/pancreas/test.h5ad \ - --output $DATASET_DIR/pancreas/denoised.h5ad - -# run one metric -viash run src/metrics/accuracy/config.vsh.yaml -- \ - --input_predicition $DATASET_DIR/pancreas/predicted.h5ad \ - --input_solution $DATASET_DIR/pancreas/solution.h5ad \ - --output $DATASET_DIR/pancreas/score.h5ad \ No newline at end of file diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh deleted file mode 100755 index 74cc033..0000000 --- a/scripts/download_resources.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -set -e - -echo ">> Downloading resources" - -# the sync_resources script uses the test_resources S3 URI's in the _viash.yaml to download the resources. -common/scripts/sync_resources \ - --delete diff --git a/scripts/project/build_all_components.sh b/scripts/project/build_all_components.sh new file mode 100755 index 0000000..4e90d91 --- /dev/null +++ b/scripts/project/build_all_components.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -e + +# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html) +viash ns build --parallel diff --git a/scripts/project/build_all_docker_containers.sh b/scripts/project/build_all_docker_containers.sh new file mode 100755 index 0000000..5d43639 --- /dev/null +++ b/scripts/project/build_all_docker_containers.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -e + +# Build all components in a namespace (refer https://viash.io/reference/cli/ns_build.html) +# and set up the container via a cached build +viash ns build --parallel --setup cachedbuild diff --git a/scripts/test_all_components.sh b/scripts/project/test_all_components.sh similarity index 75% rename from scripts/test_all_components.sh rename to scripts/project/test_all_components.sh index cd016e9..8a08afd 100755 --- a/scripts/test_all_components.sh +++ b/scripts/project/test_all_components.sh @@ -1,4 +1,6 @@ #!/bin/bash +set -e + # Test all components in a namespace (refer https://viash.io/reference/cli/ns_test.html) -viash ns test --parallel \ No newline at end of file +viash ns test --parallel diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh deleted file mode 100644 index cc4275e..0000000 --- a/scripts/run_benchmark.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" - -# make sure only log_cp10k is used -cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 'input_train:output_train;input_test:output_test' -output_state: "state.yaml" -publish_dir: "$publish_dir" -HERE - -tw launch https://github.com/openproblems-bio/task_template.git \ - --revision build/main \ - --pull-latest \ - --main-script target/nextflow/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config common/nextflow_helpers/labels_tw.config \ - --labels task_template,full \ No 
newline at end of file diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh new file mode 100755 index 0000000..dcc2c2f --- /dev/null +++ b/scripts/run_benchmark/run_full_local.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# NOTE: depending on the datasets and components, you may need to launch this workflow +# on a different compute platform (e.g. an HPC, AWS Cloud, Azure Cloud, Google Cloud). +# please refer to the Nextflow documentation for more details: +# https://www.nextflow.io/docs/latest/ + +# remove this when you have implemented the script +# echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." +# echo " Step 1: replace 'task_template' with the name of the task in the following command." +# echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" +# echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" +# echo " Step 4: remove this message" +# exit 1 + +set -e + +echo "Running benchmark on full data" +echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!" + +# generate a unique id +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="resources/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +input_states: resources/datasets/**/state.yaml +rename_keys: 'input_dataset:output_dataset;input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +# run the benchmark +nextflow run openproblems-bio/task_dimensionality_reduction \ + -revision build/main \ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + -params-file /tmp/params.yaml diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh new file mode 100755 index 0000000..3bae258 --- /dev/null +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# remove this when you have implemented the script +# echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." +# echo " Step 1: replace 'task_template' with the name of the task in the following command."
+# echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" +# echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" +# echo " Step 4: remove this message" +# exit 1 + +set -e + +# generate a unique id +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/task_dimensionality_reduction/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +input_states: s3://openproblems-data/resources/task_dimensionality_reduction/datasets/**/state.yaml +rename_keys: 'input_dataset:output_dataset;input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/task_dimensionality_reduction.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config common/nextflow_helpers/labels_tw.config \ + --labels task_dimensionality_reduction,full diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh new file mode 100755 index 0000000..f1a3b27 --- /dev/null +++ b/scripts/run_benchmark/run_test_local.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +echo "Running benchmark on test data" +echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!" + +# generate a unique id +RUN_ID="testrun_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="temp/results/${RUN_ID}" + +# write the parameters to file +cat > /tmp/params.yaml << HERE +id: cxg_mouse_pancreas_atlas +input_dataset: "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad" +input_solution: "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/solution.h5ad" +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +nextflow run . \ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -c common/nextflow_helpers/labels_ci.config \ + -params-file /tmp/params.yaml diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh new file mode 100755 index 0000000..77e0481 --- /dev/null +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# # remove this when you have implemented the script +# echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." +# echo " Step 1: replace 'task_template' with the name of the task in the following command." 
+# echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" +# echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" +# echo " Step 4: remove this message" +# exit 1 + +set -e + +# write the parameters to file +cat > /tmp/params.yaml << 'HERE' +input_dataset: s3://openproblems-data/resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad +input_solution: s3://openproblems-data/resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/solution.h5ad +output_state: "state.yaml" +publish_dir: s3://openproblems-nextflow/temp/task_dimensionality_reduction/ +HERE + +tw launch https://github.com/openproblems-bio/task_dimensionality_reduction.git \ + --revision build/main \ + --pull-latest \ + --main-script target/nextflow/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --config common/nextflow_helpers/labels_tw.config \ + --labels task_dimensionality_reduction,test diff --git a/scripts/run_benchmark_test.sh b/scripts/run_benchmark_test.sh deleted file mode 100644 index 6c03d42..0000000 --- a/scripts/run_benchmark_test.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -input_states: s3://openproblems-data/resources_test/task_template/**/state.yaml -rename_keys: 'input_train:output_train;input_test:output_test' -output_state: "state.yaml" -publish_dir: s3://openproblems-nextflow/temp/task_template/ -HERE - -tw launch https://github.com/openproblems-bio/task_template.git \ - --revision build/main \ - --pull-latest \ - --main-script target/nextflow/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config common/nextflow_helpers/labels_tw.config \ - --labels task_template,test \ No newline at end of file diff --git a/scripts/sync_resources.sh b/scripts/sync_resources.sh new file mode 100755 index 0000000..20b87e7 --- /dev/null +++ b/scripts/sync_resources.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +set -e + +common/scripts/sync_resources diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 32e36e1..2394ee8 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -24,8 +24,8 @@ arguments: direction: output required: true test_resources: - - path: /resources_test/dimensionality_reduction/pancreas/ - dest: resources_test/dimensionality_reduction/pancreas/ + - path: /resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/ + dest: resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/ - type: python_script path: /common/component_tests/check_config.py - type: python_script diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index 9d5a856..8c435c4 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -17,8 +17,8 @@ arguments: direction: output required: true test_resources: - - path: /resources_test/dimensionality_reduction/pancreas/ - dest: resources_test/dimensionality_reduction/pancreas/ + - path: /resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/ + dest: resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/ - type: python_script path: /common/component_tests/check_config.py - type: python_script diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index d8fc778..dc2bdff 100644 --- a/src/api/comp_metric.yaml 
+++ b/src/api/comp_metric.yaml @@ -20,8 +20,8 @@ arguments: direction: output required: true test_resources: - - path: /resources_test/dimensionality_reduction/pancreas/ - dest: resources_test/dimensionality_reduction/pancreas/ + - path: /resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/ + dest: resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/ - type: python_script path: /common/component_tests/check_config.py - type: python_script diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml index a8a3555..8db8527 100644 --- a/src/api/comp_process_dataset.yaml +++ b/src/api/comp_process_dataset.yaml @@ -20,7 +20,7 @@ arguments: direction: output required: true test_resources: - - path: /resources_test/common/pancreas/ - dest: resources_test/common/pancreas/ + - path: /resources_test/common/cxg_mouse_pancreas_atlas/ + dest: resources_test/common/cxg_mouse_pancreas_atlas/ - type: python_script path: /common/component_tests/run_and_check_output.py diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index 05a1d46..eb7462e 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/common/pancreas/dataset.h5ad" +example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" label: "Dataset" summary: "The dataset to pass to a method." info: diff --git a/src/api/file_dataset.yaml b/src/api/file_dataset.yaml index fb94287..3f467ed 100644 --- a/src/api/file_dataset.yaml +++ b/src/api/file_dataset.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/dimensionality_reduction/pancreas/dataset.h5ad" +example: "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad" label: "Dataset" summary: "The dataset to pass to a method." info: diff --git a/src/api/file_embedding.yaml b/src/api/file_embedding.yaml index cb6254c..fcaaa41 100644 --- a/src/api/file_embedding.yaml +++ b/src/api/file_embedding.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/dimensionality_reduction/pancreas/embedding.h5ad" +example: "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/embedding.h5ad" label: "Embedding" summary: "A dataset with dimensionality reduction embedding." info: diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml index 286ab81..c5e139b 100644 --- a/src/api/file_score.yaml +++ b/src/api/file_score.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/dimensionality_reduction/pancreas/score.h5ad" +example: "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/score.h5ad" label: "Score" summary: "Metric score file" info: diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml index 3f69f1c..b8fe4ad 100644 --- a/src/api/file_solution.yaml +++ b/src/api/file_solution.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/dimensionality_reduction/pancreas/solution.h5ad" +example: "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/solution.h5ad" label: "Test data" summary: "The data for evaluating a dimensionality reduction." 
info: diff --git a/src/control_methods/random_features/script.py b/src/control_methods/random_features/script.py index 0fcac25..22de821 100644 --- a/src/control_methods/random_features/script.py +++ b/src/control_methods/random_features/script.py @@ -3,11 +3,11 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/test.h5ad", "output": "reduced.h5ad", } meta = { - "functionality_name": "random_features", + "name": "random_features", } ## VIASH END @@ -24,7 +24,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/control_methods/spectral_features/script.py b/src/control_methods/spectral_features/script.py index a68e40c..6a2f142 100644 --- a/src/control_methods/spectral_features/script.py +++ b/src/control_methods/spectral_features/script.py @@ -3,12 +3,12 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/test.h5ad", "output": "reduced.h5ad", "n_comps": 2, } meta = { - "functionality_name": "spectral_features", + "name": "spectral_features", } ## VIASH END @@ -64,7 +64,7 @@ def diffusion_map(graph, n_comps, t, n_retries): uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/control_methods/true_features/script.py b/src/control_methods/true_features/script.py index 52f701d..883b500 100644 --- a/src/control_methods/true_features/script.py +++ b/src/control_methods/true_features/script.py @@ -2,11 +2,11 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/test.h5ad", "output": "reduced.h5ad", } meta = { - "functionality_name": "true_features", + "name": "true_features", } ## VIASH END @@ -23,7 +23,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 9663538..f672e4e 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -3,18 +3,8 @@ __merge__: /src/api/comp_process_dataset.yaml # Component configuration name: process_dataset -status: disabled # Script configuration -arguments: - - name: "--obs_label" - type: "string" - description: "Which .obs slot to use as label." - default: "cell_type" - - name: "--var_hvg_score" - type: "string" - description: "Which .var slot to use as the hvg score." 
- default: "hvg_score" resources: - type: python_script path: script.py diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index ffc4710..531a353 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -5,9 +5,7 @@ ## VIASH START par = { - "input": "resources_test/common/pancreas/dataset.h5ad", - "obs_label": "cell_type", - "var_hvg_score": "hvg_score", + "input": "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad", "output_dataset": "train.h5ad", "output_solution": "test.h5ad", } @@ -29,23 +27,11 @@ print(adata) -# Subset the different adatas -print(">> Figuring which data needs to be copied to which output file", flush=True) -# Use par arguments to look for values in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - }, - "var": { - "hvg_score": par["var_hvg_score"], - }, -} - print(">> Creating input data", flush=True) -output_dataset = subset_h5ad_by_format(adata, config, "output_dataset", slot_mapping) +output_dataset = subset_h5ad_by_format(adata, config, "output_dataset") print(">> Creating solution data", flush=True) -output_solution = subset_h5ad_by_format(adata, config, "output_solution", slot_mapping) +output_solution = subset_h5ad_by_format(adata, config, "output_solution") print(">> Writing data", flush=True) output_dataset.write_h5ad(par["output_dataset"]) diff --git a/src/methods/densmap/script.py b/src/methods/densmap/script.py index 2510507..d331f04 100644 --- a/src/methods/densmap/script.py +++ b/src/methods/densmap/script.py @@ -4,13 +4,13 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/train.h5ad", "output": "reduced.h5ad", "n_pca_dims": 50, "n_hvg": 1000, } meta = { - "functionality_name": "foo", + "name": "densmap", } ## VIASH END @@ -40,7 +40,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/methods/diffusion_map/script.R b/src/methods/diffusion_map/script.R index eaa7cba..5108ab2 100644 --- a/src/methods/diffusion_map/script.R +++ b/src/methods/diffusion_map/script.R @@ -1,6 +1,6 @@ ## VIASH START par <- list( - input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + input = "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad", output = "output.h5ad", n_dim = 2 ) @@ -24,7 +24,7 @@ output <- anndata::AnnData( uns = list( dataset_id = input$uns[["dataset_id"]], normalization_id = input$uns[["normalization_id"]], - method_id = meta$functionality_name + method_id = meta$name ), obsm = list( X_emb = X_emb diff --git a/src/methods/ivis/script.py b/src/methods/ivis/script.py index 6c67efc..9620325 100644 --- a/src/methods/ivis/script.py +++ b/src/methods/ivis/script.py @@ -6,13 +6,13 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad", "output": "reduced.h5ad", "n_hvg": 1000, "n_pca_dims": 50, } meta = { - "functionality_name": "foo", + "name": "ivis", } ## VIASH END @@ -47,7 +47,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff 
--git a/src/methods/lmds/script.R b/src/methods/lmds/script.R index 71167ef..26771f8 100644 --- a/src/methods/lmds/script.R +++ b/src/methods/lmds/script.R @@ -1,6 +1,6 @@ ## VIASH START par <- list( - input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + input = "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad", output = "output.h5ad", n_dim = 3, n_landmarks = 1000, @@ -25,7 +25,7 @@ message("Write output AnnData to file") output <- anndata::AnnData( uns = list( dataset_id = input$uns[["dataset_id"]], - method_id = meta$functionality_name, + method_id = meta$name, normalization_id = input$uns[["normalization_id"]] ), obsm = list( diff --git a/src/methods/neuralee/script.py b/src/methods/neuralee/script.py index 61e05c4..78d6e61 100644 --- a/src/methods/neuralee/script.py +++ b/src/methods/neuralee/script.py @@ -8,14 +8,14 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/train.h5ad", "output": "reduced.h5ad", "n_hvg": 1000, "n_iter": 10, "normalize": True, } meta = { - "functionality_name": "foo", + "name": "neuralee", } ## VIASH END @@ -68,7 +68,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/methods/pca/script.py b/src/methods/pca/script.py index b04ab01..8137a8f 100644 --- a/src/methods/pca/script.py +++ b/src/methods/pca/script.py @@ -3,12 +3,12 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/train.h5ad", "output": "reduced.h5ad", "n_hvg": 1000, } meta = { - "functionality_name": "foo", + "name": "pca", } ## VIASH END @@ -31,7 +31,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/methods/phate/config.vsh.yaml b/src/methods/phate/config.vsh.yaml index 038c4e7..853fda6 100644 --- a/src/methods/phate/config.vsh.yaml +++ b/src/methods/phate/config.vsh.yaml @@ -58,6 +58,7 @@ engines: - phate==1.0.* - scprep - "scikit-learn<1.2" + - numpy<2.0.0 # Avoid "numpy.ndarray size changed, may indicate binary incompatibility" error runners: - type: executable - type: nextflow diff --git a/src/methods/phate/script.py b/src/methods/phate/script.py index 003b467..0daa002 100644 --- a/src/methods/phate/script.py +++ b/src/methods/phate/script.py @@ -3,14 +3,14 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/train.h5ad", "output": "reduced.h5ad", "n_pca_dims": 50, "n_hvg": 1000, "gamma": 1, } meta = { - "functionality_name": "foo", + "name": "phate", } ## VIASH END @@ -35,7 +35,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/methods/pymde/config.vsh.yaml b/src/methods/pymde/config.vsh.yaml index 2bb7987..7d7ed8f 100644 --- a/src/methods/pymde/config.vsh.yaml +++ b/src/methods/pymde/config.vsh.yaml @@ -46,7 +46,9 @@ engines: image: openproblems/base_python:1.0.0 setup: - type: python - packages: pymde + packages: + - pymde + - 
numpy<2.0.0 # Avoid "numpy.ndarray size changed, may indicate binary incompatibility" error, see https://github.com/cvxgrp/pymde/issues/19 runners: - type: executable - type: nextflow diff --git a/src/methods/pymde/script.py b/src/methods/pymde/script.py index 0483dad..8e42ac2 100644 --- a/src/methods/pymde/script.py +++ b/src/methods/pymde/script.py @@ -4,14 +4,14 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad", "output": "reduced.h5ad", "embed_method": "neighbors", "n_hvg": 1000, "n_pca_dims": 50, } meta = { - "functionality_name": "foo", + "name": "pymde", } ## VIASH END @@ -46,7 +46,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/methods/simlr/script.R b/src/methods/simlr/script.R index 9591690..e194b02 100644 --- a/src/methods/simlr/script.R +++ b/src/methods/simlr/script.R @@ -1,6 +1,6 @@ ## VIASH START par <- list( - input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", + input = "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad", output = "output.h5ad", n_clusters = NULL, n_dim = NA, @@ -10,7 +10,7 @@ par <- list( cores_ratio = 1 ) meta <- list( - functionality_name = "simlr" + name = "simlr" ) ## VIASH END @@ -55,7 +55,7 @@ message("Write output AnnData to file") output <- anndata::AnnData( uns = list( dataset_id = input$uns[["dataset_id"]], - method_id = meta$functionality_name, + method_id = meta$name, normalization_id = input$uns[["normalization_id"]] ), obsm = list( diff --git a/src/methods/tsne/script.py b/src/methods/tsne/script.py index 82e0367..50e5d82 100644 --- a/src/methods/tsne/script.py +++ b/src/methods/tsne/script.py @@ -3,13 +3,13 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/train.h5ad", "output": "reduced.h5ad", "n_pca_dims": 50, "n_hvg": 1000, } meta = { - "functionality_name": "foo", + "name": "tsne", } ## VIASH END @@ -37,7 +37,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/methods/umap/script.py b/src/methods/umap/script.py index 69b885d..128222f 100644 --- a/src/methods/umap/script.py +++ b/src/methods/umap/script.py @@ -4,13 +4,13 @@ ## VIASH START par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", + "input": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/train.h5ad", "output": "reduced.h5ad", "n_pca_dims": 50, "n_hvg": 1000, } meta = { - "functionality_name": "umap", + "name": "umap", } ## VIASH END @@ -40,7 +40,7 @@ uns={ "dataset_id": input.uns["dataset_id"], "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"], + "method_id": meta["name"], }, ) diff --git a/src/metrics/clustering_performance/script.py b/src/metrics/clustering_performance/script.py index de66550..5f33886 100644 --- a/src/metrics/clustering_performance/script.py +++ b/src/metrics/clustering_performance/script.py @@ -4,12 +4,12 @@ ## VIASH START par = { - "input_embedding": "resources_test/dimensionality_reduction/pancreas/embedding.h5ad", - "input_solution": 
"resources_test/dimensionality_reduction/pancreas/solution.h5ad", + "input_embedding": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/embedding.h5ad", + "input_solution": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/solution.h5ad", "output": "output.h5ad", "nmi_avg_method": "arithmetic", } -meta = {"functionality_name": "clustering_performance"} +meta = {"name": "clustering_performance"} ## VIASH END print("Reading input files", flush=True) diff --git a/src/metrics/coranking/script.R b/src/metrics/coranking/script.R index 74c1f1b..a835bdb 100644 --- a/src/metrics/coranking/script.R +++ b/src/metrics/coranking/script.R @@ -3,8 +3,8 @@ library(coRanking) ## VIASH START par <- list( - "input_embedding" = "resources_test/dimensionality_reduction/pancreas/reduced.h5ad", - "input_solution" = "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "input_embedding" = "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/reduced.h5ad", + "input_solution" = "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/test.h5ad", "output" = "score.h5ad" ) ## VIASH END diff --git a/src/metrics/density_preservation/script.py b/src/metrics/density_preservation/script.py index 5635902..37cf27b 100644 --- a/src/metrics/density_preservation/script.py +++ b/src/metrics/density_preservation/script.py @@ -6,8 +6,8 @@ ## VIASH START par = { - "input_embedding": "resources_test/dimensionality_reduction/pancreas/reduced.h5ad", - "input_solution": "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "input_embedding": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/reduced.h5ad", + "input_solution": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/test.h5ad", "output": "score.h5ad", "n_neighbors": 30, "seed": 42, diff --git a/src/metrics/distance_correlation/script.py b/src/metrics/distance_correlation/script.py index a925584..b6a6117 100644 --- a/src/metrics/distance_correlation/script.py +++ b/src/metrics/distance_correlation/script.py @@ -8,8 +8,8 @@ ## VIASH START par = { - "input_embedding": "resources_test/dimensionality_reduction/pancreas/embedding.h5ad", - "input_solution": "resources_test/dimensionality_reduction/pancreas/solution.h5ad", + "input_embedding": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/embedding.h5ad", + "input_solution": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/solution.h5ad", "output": "score.h5ad", } ## VIASH END diff --git a/src/metrics/trustworthiness/script.py b/src/metrics/trustworthiness/script.py index cbd7d39..b731b45 100644 --- a/src/metrics/trustworthiness/script.py +++ b/src/metrics/trustworthiness/script.py @@ -3,8 +3,8 @@ ## VIASH START par = { - "input_embedding": "resources_test/dimensionality_reduction/pancreas/reduced.h5ad", - "input_solution": "resources_test/dimensionality_reduction/pancreas/test.h5ad", + "input_embedding": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/reduced.h5ad", + "input_solution": "resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/test.h5ad", "output": "score.h5ad", } ## VIASH END diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index 7000eb8..032cc8e 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -1,29 +1,23 @@ name: process_datasets namespace: workflows -status: disabled - 
argument_groups: - # - name: Inputs - # arguments: - # - name: "--input" - # __merge__: /src/api/file_common_dataset.yaml - # required: true - # direction: input - # - name: Outputs - # arguments: - # - name: "--output_train" - # __merge__: /src/api/file_train_h5ad.yaml - # required: true - # direction: output - # - name: "--output_test" - # __merge__: /src/api/file_test_h5ad.yaml - # required: true - # direction: output - # - name: "--output_solution" - # __merge__: /src/api/file_solution.yaml - # required: true - # direction: output + - name: Inputs + arguments: + - name: "--input" + __merge__: /src/api/file_common_dataset.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_dataset" + __merge__: /src/api/file_dataset.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/api/file_solution.yaml + required: true + direction: output resources: - type: nextflow_script @@ -32,10 +26,8 @@ resources: - path: /common/nextflow_helpers/helper.nf dependencies: - - name: common/check_dataset_schema - repository: openproblems-v2 - - name: common/extract_metadata - repository: openproblems-v2 + - name: schema/verify_data_structure + repository: core - name: data_processors/process_dataset runners: diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index eae19f7..4459118 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -1,7 +1,7 @@ include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" workflow auto { - findStatesTemp(params, meta.config) + findStates(params, meta.config) | meta.workflow.run( auto: [publish: "state"] ) @@ -14,7 +14,7 @@ workflow run_wf { main: output_ch = input_ch - | check_dataset_schema.run( + | verify_data_structure.run( fromState: { id, state -> def schema = findArgumentSchema(meta.config, "input") def schemaYaml = tempFile("schema.yaml") @@ -39,135 +39,16 @@ workflow run_wf { } | process_dataset.run( - fromState: [ input: "dataset" ], + fromState: [ input: "input" ], toState: [ - output_train: "output_train", - output_test: "output_test", + output_dataset: "output_dataset", output_solution: "output_solution" ] ) // only output the files for which an output file was specified - | setState(["output_train", "output_test", "output_solution"]) + | setState(["output_dataset", "output_solution"]) emit: output_ch } - - -// temp fix for rename_keys typo - -def findStatesTemp(Map params, Map config) { - def auto_config = deepClone(config) - def auto_params = deepClone(params) - - auto_config = auto_config.clone() - // override arguments - auto_config.argument_groups = [] - auto_config.arguments = [ - [ - type: "string", - name: "--id", - description: "A dummy identifier", - required: false - ], - [ - type: "file", - name: "--input_states", - example: "/path/to/input/directory/**/state.yaml", - description: "Path to input directory containing the datasets to be integrated.", - required: true, - multiple: true, - multiple_sep: ";" - ], - [ - type: "string", - name: "--filter", - example: "foo/.*/state.yaml", - description: "Regex to filter state files by path.", - required: false - ], - // to do: make this a yaml blob? - [ - type: "string", - name: "--rename_keys", - example: ["newKey1:oldKey1", "newKey2:oldKey2"], - description: "Rename keys in the detected input files. 
This is useful if the input files do not match the set of input arguments of the workflow.", - required: false, - multiple: true, - multiple_sep: ";" - ], - [ - type: "string", - name: "--settings", - example: '{"output_dataset": "dataset.h5ad", "k": 10}', - description: "Global arguments as a JSON glob to be passed to all components.", - required: false - ] - ] - if (!(auto_params.containsKey("id"))) { - auto_params["id"] = "auto" - } - - // run auto config through processConfig once more - auto_config = processConfig(auto_config) - - workflow findStatesTempWf { - helpMessage(auto_config) - - output_ch = - channelFromParams(auto_params, auto_config) - | flatMap { autoId, args -> - - def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] - - // look for state files in input dir - def stateFiles = args.input_states - - // filter state files by regex - if (args.filter) { - stateFiles = stateFiles.findAll{ stateFile -> - def stateFileStr = stateFile.toString() - def matcher = stateFileStr =~ args.filter - matcher.matches()} - } - - // read in states - def states = stateFiles.collect { stateFile -> - def state_ = readTaggedYaml(stateFile) - [state_.id, state_] - } - - // construct renameMap - if (args.rename_keys) { - def renameMap = args.rename_keys.collectEntries{renameString -> - def split = renameString.split(":") - assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" - split - } - - // rename keys in state, only let states through which have all keys - // also add global settings - states = states.collectMany{id, state -> - def newState = [:] - - for (key in renameMap.keySet()) { - def origKey = renameMap[key] - if (!(state.containsKey(origKey))) { - return [] - } - newState[key] = state[origKey] - } - - [[id, globalSettings + newState]] - } - } - - states - } - emit: - output_ch - } - - return findStatesTempWf -} \ No newline at end of file diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 1976ed4..25e153a 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -1,60 +1,53 @@ name: run_benchmark namespace: workflows -status: disabled - argument_groups: - # - name: Inputs - # arguments: - # - name: "--input_train" - # __merge__: /src/api/file_train_h5ad.yaml - # type: file - # direction: input - # required: true - # - name: "--input_test" - # __merge__: /src/api/file_test_h5ad.yaml - # type: file - # direction: input - # required: true - # - name: "--input_solution" - # __merge__: /src/api/file_solution.yaml - # type: file - # direction: input - # required: true - # - name: Outputs - # arguments: - # - name: "--output_scores" - # type: file - # required: true - # direction: output - # description: A yaml file containing the scores of each of the methods - # default: score_uns.yaml - # - name: "--output_method_configs" - # type: file - # required: true - # direction: output - # default: method_configs.yaml - # - name: "--output_metric_configs" - # type: file - # required: true - # direction: output - # default: metric_configs.yaml - # - name: "--output_dataset_info" - # type: file - # required: true - # direction: output - # default: dataset_uns.yaml - # - name: "--output_task_info" - # type: file - # required: true - # direction: output - # default: task_info.yaml - # - name: Methods - # arguments: - # - name: "--method_ids" - # type: string - # multiple: true - # description: A list of method ids to run. 
If not specified, all methods will be run. + - name: Inputs + arguments: + - name: "--input_dataset" + __merge__: /src/api/file_dataset.yaml + type: file + direction: input + required: true + - name: "--input_solution" + __merge__: /src/api/file_solution.yaml + type: file + direction: input + required: true + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. resources: - type: nextflow_script @@ -64,13 +57,30 @@ resources: path: /_viash.yaml dependencies: - - name: common/check_dataset_schema - repository: openproblems-v2 - - name: common/extract_metadata - repository: openproblems-v2 - - name: control_methods/true_labels - - name: methods/logistic_regression - - name: metrics/accuracy + - name: h5ad/extract_uns_metadata + repository: core + # Control methods + - name: control_methods/random_features + - name: control_methods/spectral_features + - name: control_methods/true_features + # Methods + - name: methods/densmap + - name: methods/diffusion_map + - name: methods/ivis + - name: methods/lmds + - name: methods/neuralee + - name: methods/pca + - name: methods/phate + - name: methods/pymde + - name: methods/simlr + - name: methods/tsne + - name: methods/umap + # Metrics + - name: metrics/clustering_performance + - name: metrics/coranking + - name: metrics/density_preservation + - name: metrics/distance_correlation + - name: metrics/trustworthiness runners: - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 68e5ecd..631f4ed 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -1,38 +1,56 @@ workflow auto { - findStatesTemp(params, meta.config) + findStates(params, meta.config) | meta.workflow.run( auto: [publish: "state"] ) } +// construct list of methods and control methods +methods = [ + // Control methods + random_features, + spectral_features, + true_features, + // Real methods + densmap, + diffusion_map, + ivis, + lmds, + neuralee, + pca, + phate, + pymde, + simlr, + tsne, + umap +] + +// construct list of metrics +metrics = [ + clustering_performance, + coranking, + density_preservation, + distance_correlation, + trustworthiness +] + workflow run_wf { take: input_ch main: - // construct list of methods - methods = [ - true_labels, - logistic_regression - ] - - // construct list of metrics - metrics = [ - accuracy - ] - /**************************** * EXTRACT DATASET METADATA * ****************************/ dataset_ch = input_ch // store join id - | map{ id, state -> + | map{ id, state -> [id, state + ["_meta": [join_id: id]]] } // extract the dataset metadata - | extract_metadata.run( + | extract_uns_metadata.run( fromState: [input: "input_solution"], toState: { id, output, state -> state + [ @@ -70,8 +88,7 @@ 
workflow run_wf { // use 'fromState' to fetch the arguments the component requires from the overall state fromState: { id, state, comp -> def new_args = [ - input_train: state.input_train, - input_test: state.input_test + input: state.input_dataset, ] if (comp.config.info.type == "control_method") { new_args.input_solution = state.input_solution @@ -96,8 +113,8 @@ workflow run_wf { }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ - input_solution: "input_solution", - input_prediction: "method_output" + input_solution: "input_solution", + input_embedding: "method_output" ], // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp -> @@ -108,6 +125,26 @@ workflow run_wf { } ) + // extract the scores + | extract_uns_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + ["output", [output_scores: score_uns_file]] + } /****************************** * GENERATE OUTPUT YAML FILES * @@ -115,7 +152,7 @@ workflow run_wf { // TODO: can we store everything below in a separate helper function? // extract the dataset metadata - dataset_meta_ch = dataset_ch + meta_ch = dataset_ch // only keep one of the normalization methods | filter{ id, state -> state.dataset_uns.normalization_id == "log_cp10k" @@ -131,23 +168,6 @@ workflow run_wf { def dataset_uns_file = tempFile("dataset_uns.yaml") dataset_uns_file.write(dataset_uns_yaml_blob) - ["output", [output_dataset_info: dataset_uns_file]] - } - - output_ch = score_ch - - // extract the scores - | extract_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> // store the method configs in a file def method_configs = methods.collect{it.config} def method_configs_yaml_blob = toYamlBlob(method_configs) @@ -160,30 +180,24 @@ workflow run_wf { def metric_configs_file = tempFile("metric_configs.yaml") metric_configs_file.write(metric_configs_yaml_blob) + // store the task info in a file def viash_file = meta.resources_dir.resolve("_viash.yaml") - def viash_file_content = toYamlBlob(readYaml(viash_file).info) - def task_info_file = tempFile("task_info.yaml") - task_info_file.write(viash_file_content) - - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) + // create output state def new_state = [ + output_dataset_info: dataset_uns_file, output_method_configs: method_configs_file, output_metric_configs: metric_configs_file, - output_task_info: task_info_file, - output_scores: score_uns_file, + output_task_info: viash_file, _meta: states[0]._meta ] ["output", new_state] } - // merge all of the output data - | mix(dataset_meta_ch) + // merge all of the output data + output_ch = score_ch + | mix(meta_ch) | joinStates{ ids, states -> def mergedStates = states.inject([:]) { acc, m -> acc + m } [ids[0], mergedStates] @@ -192,120 +206,3 @@ workflow run_wf { emit: output_ch 
} - -// temp fix for rename_keys typo - -def findStatesTemp(Map params, Map config) { - def auto_config = deepClone(config) - def auto_params = deepClone(params) - - auto_config = auto_config.clone() - // override arguments - auto_config.argument_groups = [] - auto_config.arguments = [ - [ - type: "string", - name: "--id", - description: "A dummy identifier", - required: false - ], - [ - type: "file", - name: "--input_states", - example: "/path/to/input/directory/**/state.yaml", - description: "Path to input directory containing the datasets to be integrated.", - required: true, - multiple: true, - multiple_sep: ";" - ], - [ - type: "string", - name: "--filter", - example: "foo/.*/state.yaml", - description: "Regex to filter state files by path.", - required: false - ], - // to do: make this a yaml blob? - [ - type: "string", - name: "--rename_keys", - example: ["newKey1:oldKey1", "newKey2:oldKey2"], - description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", - required: false, - multiple: true, - multiple_sep: ";" - ], - [ - type: "string", - name: "--settings", - example: '{"output_dataset": "dataset.h5ad", "k": 10}', - description: "Global arguments as a JSON glob to be passed to all components.", - required: false - ] - ] - if (!(auto_params.containsKey("id"))) { - auto_params["id"] = "auto" - } - - // run auto config through processConfig once more - auto_config = processConfig(auto_config) - - workflow findStatesTempWf { - helpMessage(auto_config) - - output_ch = - channelFromParams(auto_params, auto_config) - | flatMap { autoId, args -> - - def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] - - // look for state files in input dir - def stateFiles = args.input_states - - // filter state files by regex - if (args.filter) { - stateFiles = stateFiles.findAll{ stateFile -> - def stateFileStr = stateFile.toString() - def matcher = stateFileStr =~ args.filter - matcher.matches()} - } - - // read in states - def states = stateFiles.collect { stateFile -> - def state_ = readTaggedYaml(stateFile) - [state_.id, state_] - } - - // construct renameMap - if (args.rename_keys) { - def renameMap = args.rename_keys.collectEntries{renameString -> - def split = renameString.split(":") - assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" - split - } - - // rename keys in state, only let states through which have all keys - // also add global settings - states = states.collectMany{id, state -> - def newState = [:] - - for (key in renameMap.keySet()) { - def origKey = renameMap[key] - if (!(state.containsKey(origKey))) { - return [] - } - newState[key] = state[origKey] - } - - [[id, globalSettings + newState]] - } - } - - states - } - emit: - output_ch - } - - return findStatesTempWf -} \ No newline at end of file
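A minimal end-to-end smoke test of this patch might look as follows. This is a sketch, not part of the patch itself: it assumes Docker, Viash, Nextflow, and the AWS CLI are installed, and the patch file name is hypothetical (git format-patch derives it from the subject line). The script paths are the ones added by this patch.

    # apply the patch and refresh the 'common' submodule it bumps
    git am 0001-Update-workflows-2.patch
    git submodule update --init

    # download the test resources listed under test_resources in _viash.yaml
    scripts/sync_resources.sh

    # build all components and set up their Docker containers
    scripts/project/build_all_docker_containers.sh

    # run the benchmark workflow on the cxg_mouse_pancreas_atlas test data
    scripts/run_benchmark/run_test_local.sh

If the test run succeeds, scores land in temp/results/testrun_<timestamp>/, matching the publish_dir set in run_test_local.sh.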