Update workflows (#2)
* use cxg_mouse_pancreas_atlas instead of pancreas

* update dependencies and remove unnecessary arguments

* update scripts and test resources

* fix paths

* update benchmarking workflow

* update benchmark workflow

* fix default

* Rename "functionality_name" to "name"

* Update benchmark workflow

Now runs on local test

* Update run_benchmark workflow config

* Set numpy<2.0.0 for pymde and phate methods

Avoids "numpy.ndarray size changed, may indicate binary incompatibility" error

* also create a state.yaml file

* Update run_test_seqeracloud.sh script

* Update run full benchmark scripts

* Update CHANGELOG

* Add all methods/metrics to benchmark workflow

* Add dependencies to benchmark workflow config

---------

Co-authored-by: Luke Zappia <[email protected]>
rcannood and lazappi authored Sep 18, 2024
1 parent ab268cc commit f4dcca2
Showing 61 changed files with 529 additions and 575 deletions.
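
The `numpy<2.0.0` pin called out in the commit message works around extensions (pymde, phate) compiled against the numpy 1.x ABI, which can fail to load under numpy 2.x with the "numpy.ndarray size changed" error. As a hypothetical illustration (not part of this commit), the constraint amounts to a simple major-version check:

```python
# Illustrative only: express the `numpy<2.0.0` pin as a version guard.
def numpy_version_ok(version: str) -> bool:
    """Return True if the given numpy version satisfies the `numpy<2.0.0` pin."""
    major = int(version.split(".")[0])
    return major < 2
```

For example, `numpy_version_ok("1.26.4")` passes while any 2.x release fails the check.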
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -21,6 +21,12 @@
## BUGFIXES -->

# dimensionality_reduction 0.1.1 2024-09-18

## NEW FUNCTIONALITY

* Updated workflows to work correctly for this task (PR #2)

# dimensionality_reduction 0.1.0 2024-09-05

## NEW FUNCTIONALITY
10 changes: 5 additions & 5 deletions README.md
@@ -93,7 +93,7 @@ flowchart LR

The dataset to pass to a method.

-Example file: `resources_test/common/pancreas/dataset.h5ad`
+Example file: `resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad`

Format:

@@ -149,7 +149,7 @@ Arguments:
The dataset to pass to a method.

Example file:
-`resources_test/dimensionality_reduction/pancreas/dataset.h5ad`
+`resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/dataset.h5ad`

Format:

@@ -181,7 +181,7 @@ Data structure:
The data for evaluating a dimensionality reduction.

Example file:
-`resources_test/dimensionality_reduction/pancreas/solution.h5ad`
+`resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/solution.h5ad`

Format:

@@ -268,7 +268,7 @@ Arguments:
A dataset with dimensionality reduction embedding.

Example file:
-`resources_test/dimensionality_reduction/pancreas/embedding.h5ad`
+`resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/embedding.h5ad`

Format:

@@ -298,7 +298,7 @@ Data structure:
Metric score file

Example file:
-`resources_test/dimensionality_reduction/pancreas/score.h5ad`
+`resources_test/task_dimensionality_reduction/cxg_mouse_pancreas_atlas/score.h5ad`

Format:

15 changes: 8 additions & 7 deletions _viash.yaml
@@ -67,11 +67,11 @@ info:
# Step 5: Replace task_template with the name of the task.
test_resources:
   - type: s3
-    path: s3://openproblems-data/resources_test/common/pancreas/
-    dest: resources_test/common/pancreas/
+    path: s3://openproblems-data/resources_test/common/cxg_mouse_pancreas_atlas/
+    dest: resources_test/common/cxg_mouse_pancreas_atlas/
   - type: s3
-    path: s3://openproblems-data/resources_test/dimensionality_reduction/
-    dest: resources_test/dimensionality_reduction
+    path: s3://openproblems-data/resources_test/task_dimensionality_reduction/
+    dest: resources_test/task_dimensionality_reduction

# Step 6: Update the authors of the task.
authors:
@@ -121,7 +121,8 @@ config_mods: |
.runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
repositories:
-  - name: openproblems-v2
+  - name: core
     type: github
-    repo: openproblems-bio/openproblems-v2
-    tag: main_build
+    repo: openproblems-bio/core
+    tag: build/main
+    path: viash/core
3 changes: 0 additions & 3 deletions scripts/.gitignore

This file was deleted.

2 changes: 2 additions & 0 deletions scripts/create_component/.gitignore
@@ -0,0 +1,2 @@
# if users change the scripts, the changes should not be committed.
/create_*_*.sh
8 changes: 8 additions & 0 deletions scripts/create_component/create_python_method.sh
@@ -0,0 +1,8 @@
#!/bin/bash

set -e

common/scripts/create_component \
--name my_python_method \
--language python \
--type method
8 changes: 8 additions & 0 deletions scripts/create_component/create_python_metric.sh
@@ -0,0 +1,8 @@
#!/bin/bash

set -e

common/scripts/create_component \
--name my_python_metric \
--language python \
--type metric
8 changes: 8 additions & 0 deletions scripts/create_component/create_r_method.sh
@@ -0,0 +1,8 @@
#!/bin/bash

set -e

common/scripts/create_component \
--name my_r_method \
--language r \
--type method
8 changes: 8 additions & 0 deletions scripts/create_component/create_r_metric.sh
@@ -0,0 +1,8 @@
#!/bin/bash

set -e

common/scripts/create_component \
--name my_r_metric \
--language r \
--type metric
4 changes: 3 additions & 1 deletion scripts/create_readme.sh
@@ -1,3 +1,5 @@
#!/bin/bash

-common/scripts/create_task_readme
+set -e
+
+common/scripts/create_task_readme --input src/api
26 changes: 26 additions & 0 deletions scripts/create_resources/resources.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

cat > /tmp/params.yaml << 'HERE'
input_states: s3://openproblems-data/resources/datasets/**/state.yaml
rename_keys: 'input:output_dataset'
output_state: '$id/state.yaml'
settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}'
publish_dir: s3://openproblems-data/resources/task_dimensionality_reduction/datasets/
HERE

tw launch https://github.com/openproblems-bio/task_dimensionality_reduction.git \
--revision build/main \
--pull-latest \
--main-script target/nextflow/workflows/process_datasets/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--entry-name auto \
--config common/nextflow_helpers/labels_tw.config \
--labels task_dimensionality_reduction,process_datasets
44 changes: 44 additions & 0 deletions scripts/create_resources/test_resources.sh
@@ -0,0 +1,44 @@
#!/bin/bash

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

set -e

RAW_DATA=resources_test/common
DATASET_DIR=resources_test/task_dimensionality_reduction

mkdir -p $DATASET_DIR

# process dataset
echo Running process_dataset
viash run src/data_processors/process_dataset/config.vsh.yaml -- \
--input $RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad \
--output_dataset $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \
--output_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad

# run one method
viash run src/methods/pca/config.vsh.yaml -- \
--input $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \
--output $DATASET_DIR/cxg_mouse_pancreas_atlas/embedding.h5ad

# run one metric
viash run src/metrics/clustering_performance/config.vsh.yaml -- \
--input_embedding $DATASET_DIR/cxg_mouse_pancreas_atlas/embedding.h5ad \
--input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \
--output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad

cat > $DATASET_DIR/cxg_mouse_pancreas_atlas/state.yaml << HERE
id: cxg_mouse_pancreas_atlas
output_dataset: !file dataset.h5ad
output_solution: !file solution.h5ad
HERE

# only run this if you have access to the openproblems-data bucket
aws s3 sync --profile op \
"resources_test/task_dimensionality_reduction" \
s3://openproblems-data/resources_test/task_dimensionality_reduction \
--delete --dryrun
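
The `state.yaml` heredoc in `test_resources.sh` above records the dataset id and its output files. A minimal Python sketch of the same output (the `render_state` helper is hypothetical, not part of the repo; the `!file` tag is assumed to mark values as paths relative to the state file):

```python
# Hypothetical helper: render the state.yaml content written by the
# heredoc in scripts/create_resources/test_resources.sh.
def render_state(dataset_id: str) -> str:
    # `!file` is a custom YAML tag; we assume the workflow resolves these
    # values as files next to the state.yaml itself.
    return (
        f"id: {dataset_id}\n"
        "output_dataset: !file dataset.h5ad\n"
        "output_solution: !file solution.h5ad\n"
    )
```

For instance, `render_state("cxg_mouse_pancreas_atlas")` reproduces the file written by the script.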
38 changes: 0 additions & 38 deletions scripts/create_test_resources.sh

This file was deleted.

9 changes: 0 additions & 9 deletions scripts/download_resources.sh

This file was deleted.

6 changes: 6 additions & 0 deletions scripts/project/build_all_components.sh
@@ -0,0 +1,6 @@
#!/bin/bash

set -e

# Build all components in a namespace (see https://viash.io/reference/cli/ns_build.html)
viash ns build --parallel
7 changes: 7 additions & 0 deletions scripts/project/build_all_docker_containers.sh
@@ -0,0 +1,7 @@
#!/bin/bash

set -e

# Build all components in a namespace (see https://viash.io/reference/cli/ns_build.html)
# and set up the containers via a cached build
viash ns build --parallel --setup cachedbuild
@@ -1,4 +1,6 @@
#!/bin/bash

set -e

# Test all components in a namespace (see https://viash.io/reference/cli/ns_test.html)
viash ns test --parallel
23 changes: 0 additions & 23 deletions scripts/run_benchmark.sh

This file was deleted.

47 changes: 47 additions & 0 deletions scripts/run_benchmark/run_full_local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

# NOTE: depending on the datasets and components, you may need to launch this workflow
# on a different compute platform (e.g. an HPC, AWS Cloud, Azure Cloud, Google Cloud).
# please refer to the nextflow information for more details:
# https://www.nextflow.io/docs/latest/

# remove this when you have implemented the script
# echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it."
# echo " Step 1: replace 'task_template' with the name of the task in the following command."
# echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs"
# echo " Step 3: replace the settings parameter to fit your run_benchmark outputs"
# echo " Step 4: remove this message"
# exit 1

set -e

echo "Running benchmark on test data"
echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!"

# generate a unique id
RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="resources/results/${RUN_ID}"

# write the parameters to file
cat > /tmp/params.yaml << HERE
input_states: resources/datasets/**/state.yaml
rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

# run the benchmark
nextflow run openproblems-bio/task_dimensionality_reduction \
  -r build/main \
-main-script target/nextflow/workflows/run_benchmark/main.nf \
-profile docker \
-resume \
-entry auto \
-c common/nextflow_helpers/labels_ci.config \
-params-file /tmp/params.yaml
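
The `rename_keys` string in the params.yaml above pairs each workflow input key with the key that holds its value in a dataset's state.yaml. A plausible sketch of how such a `new:old;new:old` spec parses (illustrative only, not the actual workflow implementation):

```python
# Hypothetical parser for a rename_keys spec such as
# 'input_dataset:output_dataset;input_solution:output_solution'.
def parse_rename_keys(spec: str) -> dict:
    """Parse 'new:old' pairs separated by ';' into {new_key: old_key}."""
    pairs = (item.split(":", 1) for item in spec.split(";") if item)
    return {new: old for new, old in pairs}
```

Under this reading, the benchmark's `input_dataset` is filled from each state file's `output_dataset` entry, and `input_solution` from `output_solution`.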