From e649c2909f3d9852d728f6c02f3b0d051e025998 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 21 Jun 2024 15:30:57 +0200
Subject: [PATCH 001/103] Fix resources

---
 scripts/download_resources.sh | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh
index 945c47e..9eb1285 100644
--- a/scripts/download_resources.sh
+++ b/scripts/download_resources.sh
@@ -4,7 +4,13 @@ set -e
 
 echo ">> Downloading resources"
 
-viash run common/src/sync_resources/config.vsh.yaml -- \
+common/sync_resources/sync_resources \
   --input "s3://openproblems-data/resources_test/common/" \
-  --output "resources_test" \
-  --delete
\ No newline at end of file
+  --output "resources_test/common" \
+  --delete
+
+# After finishing the task and the task specific test_resources are uploaded to s3, uncomment:
+# common/sync_resources/sync_resources \
+#   --input "s3://openproblems-data/resources_test/<task_name>/" \
+#   --output "resources_test/<task_name>" \
+#   --delete
\ No newline at end of file

From d6e499ecef95682123612cf4060741163cd0c946 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 21 Jun 2024 15:34:52 +0200
Subject: [PATCH 002/103] add submodule initialisation

---
 scripts/init_submodule.sh | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 scripts/init_submodule.sh

diff --git a/scripts/init_submodule.sh b/scripts/init_submodule.sh
new file mode 100644
index 0000000..ef2a754
--- /dev/null
+++ b/scripts/init_submodule.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+git submodule update --init --recursive
\ No newline at end of file

From 3c5b4d6eec82d81b31687fee28cf173c15d7a85a Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 21 Jun 2024 15:35:17 +0200
Subject: [PATCH 003/103] add `task_` prefix

---
 src/api/task_info.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/api/task_info.yaml b/src/api/task_info.yaml
index 5166700..5fcce7b 100644
--- a/src/api/task_info.yaml
+++ b/src/api/task_info.yaml
@@ -18,7 +18,7 @@ readme: |
   To get started, you can run the following commands:
 
   ```bash
-  git clone git@github.com:openproblems-bio/<task_name>.git
+  git clone git@github.com:openproblems-bio/task_<task_name>.git
 
   cd <task_name>

From 38d4f073b8183203a69deef8c915de635766bfc5 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 21 Jun 2024 16:41:30 +0200
Subject: [PATCH 004/103] update gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index fe80ce0..cc7020c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 resources
+resources_test
 work
 .nextflow*
 target

From 3cb1f64d03fe250798b358a027b8ed3d63f69703 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 27 Jun 2024 10:47:17 +0200
Subject: [PATCH 005/103] WIP instructions

---
 INSTRUCTIONS.md | 73 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 6 deletions(-)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 74287af..dcb539e 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -2,16 +2,77 @@
 
 This is a guide on what to do after you have created a new task repository from the template. More in depth information about how to create a new task can be found in the [OpenProblems Documentation](https://openproblems.bio/documentation/create_task/).
 
+## Requirements
+
+A list of required software to start developing a new task can be found in the [OpenProblems Requirements](https://openproblems.bio/documentation/create_task/requirements).
+
 ## First things first
 
-* Update the `_viash.yaml` file with the correct task information.
-* Update the `src/api/task_info.yaml` file with the information you have provied in the task issue.
+### `_viash.yaml`
+
+* Update the `name` field to the name of the task in snake_case.
+* Update the `description` field to a short description of the task.
+* Add a keyword to the `keywords` field that describes the task.
+* Update the `<task_name>` in the links field to the name of the task in snake_case.
+
+### `task_info.yaml`
+
+
+* Update the `src/api/task_info.yaml` file with the information you have provided in the task issue.
+* Update the `<task_name>` in the `readme` field to the name of the task.
+
+### `common` submodule
+
+Initialize the `common` submodule by running the following command:
+
+```bash
+scripts/init_submodule.sh
+```
 
 ## Resources
 
-THe OpenProblems team has provided some test resources that can be used to test the task. These resources are stored in the `resources` folder. The `scripts/download_resources.sh` script can be used to download these resources.
+The OpenProblems team has provided some test resources that can be used to test the task. These resources are stored in the `resources_test` folder. The `scripts/download_resources.sh` script can be used to download these resources.
+If these resources are not sufficient, you can add more resources to the `resources_test` folder. The `scripts/download_resources.sh` script can be updated to download these resources. When using new test_resources let the OP team know so they can be added to the s3 bucket.
+
+```bash
+scripts/download_resources.sh
+```
+
+## Next steps
+
+### API files ([docs](https://openproblems.bio/documentation/create_task/design_api))
+
+Update the API files in the `src/api` folder. These files define the input and output of the methods and metrics.
+
+### Components ([docs](https://openproblems.bio/documentation/create_task/create_components))
+
+To create a component, you can run the respective script in the `script` directory. Before running the script make sure to update the variables `task_name`, `component_name` and `component_lang` and save the file. For additional components you will only need to update the `component_name` and `component_lang` variables.
+
+```bash
+scripts/add_a_control_method.sh
+```
+
+```bash
+scripts/add_a_method.sh
+```
+
+```bash
+scripts/add_a_metric.sh
+```
+
+For each type of component there already is a first component created that you can modify.
+
+1. Update the `.info` fields in the `config.vsh.yaml`.
+2. Add any component specific arguments to the `config.vsh.yaml` file.
+3. Add any additional resources that are required for the component.
+4. Update the docker engine image setup if additional packages are required.
+5. If you know the required memory and or CPU you can adjust the nextflow `.directive.labels` field. In addition if your component requires a GPU you can add the `gpu` label to the field.
+6. Update the `script.py` or `script.R` file with the code for the component.
+
+> [!NOTE]
+> You can remove the comments in the `config.vsh.yaml` file after you have updated the file.
+
-If these resources are not sufficient, you can add more resources to the `resources` folder. The `scripts/download_resources.sh` script can be updated to download these resources.
@@ -23,7 +84,7 @@ If these resources are not sufficient, you can add more resources to the `resour
 
 <!--
 ---
 
 ## Resources
 
 * update scripts/download_resources
 -->
 
-#!/bin/bash
+
\ No newline at end of file

From 41679cf3c94934a3c5d109b1620f3135dc795594 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 12:48:02 +0200
Subject: [PATCH 006/103] add solution api file

---
 src/api/file_solution.yaml | 54 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 src/api/file_solution.yaml

diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml
new file mode 100644
index 0000000..6209a19
--- /dev/null
+++ b/src/api/file_solution.yaml
@@ -0,0 +1,54 @@
+type: file
+example: "resources_test/label_projection/pancreas/solution.h5ad"
+info:
+  label: "Solution"
+  summary: "The solution for the test data"
+  slots:
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+      - type: double
+        name: normalized
+        description: Normalized counts
+        required: true
+    obs:
+      - ...
+    var:
+      - ...
+    obsm:
+      - ...
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+      - type: string
+        name: normalization_id
+        description: "Which normalization was used"
+        required: true

From 1fbb9619d3bd3d88459bc91102795248c6147af0 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 12:50:14 +0200
Subject: [PATCH 007/103] add process dataset api file

---
 src/api/comp_process_dataset.yaml | 32 ++++++++++++++++++++++++
 src/api/file_common_dataset.yaml  | 41 +++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 src/api/comp_process_dataset.yaml
 create mode 100644 src/api/file_common_dataset.yaml

diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml
new file mode 100644
index 0000000..90bf545
--- /dev/null
+++ b/src/api/comp_process_dataset.yaml
@@ -0,0 +1,32 @@
+functionality:
+  namespace: "label_projection"
+  info:
+    type: process_dataset
+    type_info:
+      label: Data processor
+      summary: A label projection dataset processor.
+      description: |
+        A component for processing a Common Dataset into a task-specific dataset.
+  arguments:
+    - name: "--input"
+      __merge__: file_common_dataset.yaml
+      direction: input
+      required: true
+    - name: "--output_train"
+      __merge__: file_train.yaml
+      direction: output
+      required: true
+    - name: "--output_test"
+      __merge__: file_test.yaml
+      direction: output
+      required: true
+    - name: "--output_solution"
+      __merge__: file_solution.yaml
+      direction: output
+      required: true
+  test_resources:
+    - path: /resources_test/common/pancreas
+      dest: resources_test/common/pancreas
+    - type: python_script
+      path: /common/component_tests/run_and_check_output.py

diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml
new file mode 100644
index 0000000..0a5a05f
--- /dev/null
+++ b/src/api/file_common_dataset.yaml
@@ -0,0 +1,41 @@
+#TODO: Change to the required and/or optional fields of the anndata
+type: file
+example: "resources_test/common/pancreas/dataset.h5ad"
+info:
+  label: "Common Dataset"
+  summary: A subset of the common dataset.
+  slots:
+    layers:
+      - type: integer
+        name: counts
+        description: Raw counts
+        required: true
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - type: string
+        name: dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false

From d7d11af1f4375a3bd6bc08179389d534528aac08 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 12:50:29 +0200
Subject: [PATCH 008/103] fix FP in metric api

---
 src/api/comp_metric.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
index 9dc8c29..61a138b 100644
--- a/src/api/comp_metric.yaml
+++ b/src/api/comp_metric.yaml
@@ -23,7 +23,7 @@ test_resources:
   - type: python_script
     path: /common/comp_tests/check_metric_config.py
   - type: python_script
-    path: /common/comp_tests/run_and_check_adata.py
+    path: /common/component_tests/run_and_check_output.py
   - path: /common/library.bib
 #TODO: - path: fill in e.g. /resources/denoising/pancreas
 #TODO: dest: fill in e.g. resources/denoising/pancreas

From 814cb5e07e4a62f297241b5223c82246e8252c78 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 13:28:13 +0200
Subject: [PATCH 009/103] Update instructions

---
 INSTRUCTIONS.md | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index dcb539e..649b87c 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -72,7 +72,40 @@ For each type of component there already is a first component created that you c
 
 > [!NOTE]
 > You can remove the comments in the `config.vsh.yaml` file after you have updated the file.
+### Testing Components ([docs](https://openproblems.bio/documentation/create_component/run_tests))
+
+You can test the component by running the following command:
+
+```bash
+viash test /path/to/config.vsh.yaml
+```
+
+You can also test all components by running the following command:
+
+```bash
+scripts/test_all_components.sh
+```
+
+It is possible to customise the command in the above script by adding a `-q` argument to only perform the test on for example methods e.g. ` -q methods`.
+
+
+## Dataset processor ([docs](https://openproblems.bio/documentation/create_task/dataset_processor))
+
+The dataset processor is a script that removes all unnecessary info from the dataset for your task. This info is defined in the `api/file_common_dataset.yaml` file. From this filtered dataset several files are created that are used by the methods and metrics. This safeguards against data leaks and makes sure the structure of the data cannot be altered for a method or a metric.
+
+To create the data processor there is no template available. You can follow the guideline in the documentation. Store the processor in the `src/process_dataset` folder.
+
+Be sure to update the `file_common_dataset.yaml` file with the correct information required for the methods/metrics.
+
+> [!IMPORTANT]
+> When using your own datasets please advise the openproblems team on how to add these datasets to the s3 bucket.
+> As the dataset processor should make use of the `common` datasets folder in the `resources` or `resources_test` directory.
+
+To create the resources and test_resources for the task we will create a nextflow workflow that will process the datasets. This workflow will be created together with the openproblems team.
+
+## Benchmarking ([docs](https://openproblems.bio/documentation/create_task/create_workflow))
+
+When you are finished with creating your components and dataset processor you can create a workflow to benchmark the components. This workflow will be created together with the openproblems team.

From 3129d78ace414131ebc960c2ca9ac9f1c66238e5 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 13:31:26 +0200
Subject: [PATCH 010/103] add readme

---
 INSTRUCTIONS.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 649b87c..2a4a036 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -103,6 +103,14 @@ Be sure to update the `file_common_dataset.yaml` file with the correct informati
 
 To create the resources and test_resources for the task we will create a nextflow workflow that will process the datasets. This workflow will be created together with the openproblems team.
 
+## README
+
+To create the task `README` file perform the following command:
+
+```bash
+scripts/create_readme.sh
+```
+
 ## Benchmarking ([docs](https://openproblems.bio/documentation/create_task/create_workflow))
 
 When you are finished with creating your components and dataset processor you can create a workflow to benchmark the components. This workflow will be created together with the openproblems team.
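The component-testing commands that patches 005–010 add to INSTRUCTIONS.md compose into a short workflow. The following is an illustrative sketch only: the component path and the `-q methods` filter value are invented examples, not taken from the patches.

```bash
#!/bin/bash
# Sketch: build all components once, then exercise the test suites.
# Assumes viash and Docker are installed, per the linked requirements page.

# Build all components; cachedbuild reuses Docker layers between runs
viash ns build --parallel --setup cachedbuild

# Test a single component by pointing viash at its config
# (src/methods/my_method is a hypothetical component directory)
viash test src/methods/my_method/config.vsh.yaml

# Or test every component, optionally filtered to one namespace
scripts/test_all_components.sh -q methods
```
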
From f1494dc5bff2c5abe3904ef89c8fb5a9efd58e4a Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 16:22:23 +0200
Subject: [PATCH 011/103] remove github_url from readme

---
 scripts/create_readme.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/create_readme.sh b/scripts/create_readme.sh
index 0857cb6..e5dec6f 100644
--- a/scripts/create_readme.sh
+++ b/scripts/create_readme.sh
@@ -2,5 +2,4 @@
 
 common/create_task_readme/create_task_readme \
   --task_dir src \
-  --github_url https://github.com/openproblems-bio/task-template \
   --output README.md

From 0ea712e6928c73a7eb6e773465f98356f17dc671 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 16:37:37 +0200
Subject: [PATCH 012/103] update process dataset to viash 0.9

---
 src/api/comp_process_dataset.yaml | 63 +++++++++++++++----------------
 1 file changed, 31 insertions(+), 32 deletions(-)

diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml
index 90bf545..2dd3fd3 100644
--- a/src/api/comp_process_dataset.yaml
+++ b/src/api/comp_process_dataset.yaml
@@ -1,32 +1,31 @@
-functionality:
-  namespace: "label_projection"
-  info:
-    type: process_dataset
-    type_info:
-      label: Data processor
-      summary: A label projection dataset processor.
-      description: |
-        A component for processing a Common Dataset into a task-specific dataset.
-  arguments:
-    - name: "--input"
-      __merge__: file_common_dataset.yaml
-      direction: input
-      required: true
-    - name: "--output_train"
-      __merge__: file_train.yaml
-      direction: output
-      required: true
-    - name: "--output_test"
-      __merge__: file_test.yaml
-      direction: output
-      required: true
-    - name: "--output_solution"
-      __merge__: file_solution.yaml
-      direction: output
-      required: true
-  test_resources:
-    - path: /resources_test/common/pancreas
-      dest: resources_test/common/pancreas
-    - type: python_script
-      path: /common/component_tests/run_and_check_output.py
-
+namespace: ""
+info:
+  type: process_dataset
+  type_info:
+    label: Data processor
+    summary: A label projection dataset processor.
+    description: |
+      A component for processing a Common Dataset into a task-specific dataset.
+arguments:
+  - name: "--input"
+    __merge__: file_common_dataset.yaml
+    direction: input
+    required: true
+  - name: "--output_train"
+    __merge__: file_train.yaml
+    direction: output
+    required: true
+  - name: "--output_test"
+    __merge__: file_test.yaml
+    direction: output
+    required: true
+  - name: "--output_solution"
+    __merge__: file_solution.yaml
+    direction: output
+    required: true
+test_resources:
+  - path: /resources_test/common/pancreas
+    dest: resources_test/common/pancreas
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py

From cae2c1208511db9340330eb7c27335d6ba0c7365 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 16:43:53 +0200
Subject: [PATCH 013/103] update task_info readme section

---
 src/api/task_info.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/api/task_info.yaml b/src/api/task_info.yaml
index 5fcce7b..71c60ac 100644
--- a/src/api/task_info.yaml
+++ b/src/api/task_info.yaml
@@ -21,6 +21,12 @@ readme: |
   git clone git@github.com:openproblems-bio/task_<task_name>.git
 
   cd <task_name>
+
+  # make scripts executable
+  chmod +x scripts/*
+
+  # initialise submodule
+  scripts/init_submodule.sh
 
   # download resources
   scripts/download_resources.sh

From c5cacb70674a9df7d7f44ebe7d79ebece2182c84 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 16:44:58 +0200
Subject: [PATCH 014/103] rename arg method api files

---
 src/api/comp_control_method.yaml | 4 ++--
 src/api/comp_method.yaml         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml
index fd3ac29..03a4d0e 100644
--- a/src/api/comp_control_method.yaml
+++ b/src/api/comp_control_method.yaml
@@ -7,11 +7,11 @@ info:
   description: |
     A control method to predict effects.
 arguments:
-  - name: --train_h5ad
+  - name: --input_train
    __merge__: file_train_h5ad.yaml
     required: false
     direction: input
-  - name: --test_h5ad
+  - name: --input_test
    __merge__: file_test_h5ad.yaml
     required: true
     direction: input

diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml
index 10316b5..afc305c 100644
--- a/src/api/comp_method.yaml
+++ b/src/api/comp_method.yaml
@@ -7,7 +7,7 @@ info:
   description: |
     A method to predict the task effects.
 arguments:
-  - name: --train_h5ad
+  - name: --input_train
    __merge__: file_train_h5ad.yaml
     required: false
     direction: input

From 84a041de79bbed8b493e2adc6ff618f18b3c175b Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 16:46:15 +0200
Subject: [PATCH 015/103] add chmod cmd to instructions

---
 INSTRUCTIONS.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 2a4a036..4bbf9a1 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -21,6 +21,14 @@ A list of required software to start developing a new task can be found in the [
 * Update the `src/api/task_info.yaml` file with the information you have provided in the task issue.
 * Update the `<task_name>` in the `readme` field to the name of the task.
+### chmod `scripts`
+
+Make the scripts executable with the following command:
+
+```bash
+ chmod +x scripts/*
+```
+
 ### `common` submodule
 
 Initialize the `common` submodule by running the following command:

From 66468773800cf79a5bc8051959d16065a36629e8 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 16:49:28 +0200
Subject: [PATCH 016/103] `chmod +x scripts/*`

---
 scripts/add_a_control_method.sh | 0
 scripts/add_a_method.sh         | 0
 scripts/add_a_metric.sh         | 0
 scripts/create_readme.sh        | 0
 scripts/download_resources.sh   | 0
 scripts/init_submodule.sh       | 0
 scripts/test_all_components.sh  | 0
 7 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/add_a_control_method.sh
 mode change 100644 => 100755 scripts/add_a_method.sh
 mode change 100644 => 100755 scripts/add_a_metric.sh
 mode change 100644 => 100755 scripts/create_readme.sh
 mode change 100644 => 100755 scripts/download_resources.sh
 mode change 100644 => 100755 scripts/init_submodule.sh
 mode change 100644 => 100755 scripts/test_all_components.sh

diff --git a/scripts/add_a_control_method.sh b/scripts/add_a_control_method.sh
old mode 100644
new mode 100755
diff --git a/scripts/add_a_method.sh b/scripts/add_a_method.sh
old mode 100644
new mode 100755
diff --git a/scripts/add_a_metric.sh b/scripts/add_a_metric.sh
old mode 100644
new mode 100755
diff --git a/scripts/create_readme.sh b/scripts/create_readme.sh
old mode 100644
new mode 100755
diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh
old mode 100644
new mode 100755
diff --git a/scripts/init_submodule.sh b/scripts/init_submodule.sh
old mode 100644
new mode 100755
diff --git a/scripts/test_all_components.sh b/scripts/test_all_components.sh
old mode 100644
new mode 100755

From 9bfdd56fde95a655ddd79c98d84e27d0ced61819 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 17:32:16 +0200
Subject: [PATCH 017/103] remove chmod cmd

---
 src/api/task_info.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/api/task_info.yaml b/src/api/task_info.yaml
index 71c60ac..0a14da1 100644
--- a/src/api/task_info.yaml
+++ b/src/api/task_info.yaml
@@ -22,9 +22,6 @@ readme: |
 
   cd <task_name>
 
-  # make scripts executable
-  chmod +x scripts/*
-
   # initialise submodule
   scripts/init_submodule.sh

From 33e6cace6cceefd7af6b1457a306510886df313c Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 17:32:40 +0200
Subject: [PATCH 018/103] add final line in _viash

---
 _viash.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_viash.yaml b/_viash.yaml
index 923e418..0b10f6b 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -13,4 +13,4 @@ links:
 version: dev
 
 config_mods: |
-  .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
\ No newline at end of file
+  .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }

From 2e9acc204ac9a41431e46c99077d9f54582632e5 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 22:47:21 +0200
Subject: [PATCH 019/103] remove chmod instructions

---
 INSTRUCTIONS.md | 8 --------
 1 file changed, 8 deletions(-)
diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 4bbf9a1..2a4a036 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -21,14 +21,6 @@ A list of required software to start developing a new task can be found in the [
 * Update the `src/api/task_info.yaml` file with the information you have provided in the task issue.
 * Update the `<task_name>` in the `readme` field to the name of the task.
 
-### chmod `scripts`
-
-Make the scripts executable with the following command:
-
-```bash
- chmod +x scripts/*
-```
-
 ### `common` submodule
 
 Initialize the `common` submodule by running the following command:

From d0f1a9f54582632c078c4aa1e0d444665a13e7ef Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 28 Jun 2024 22:50:02 +0200
Subject: [PATCH 020/103] remove comments

---
 INSTRUCTIONS.md | 59 ------------------------------------------------
 1 file changed, 59 deletions(-)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 2a4a036..7a49542 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -114,62 +114,3 @@ scripts/create_readme.sh
 
 ## Benchmarking ([docs](https://openproblems.bio/documentation/create_task/create_workflow))
 
 When you are finished with creating your components and dataset processor you can create a workflow to benchmark the components. This workflow will be created together with the openproblems team.
-
-
-
-
-
-
-
\ No newline at end of file

From 617889e53cc6f04c67d3c505d8d996d17e6d3754 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 1 Jul 2024 14:35:43 +0200
Subject: [PATCH 021/103] update CI

---
 .github/workflows/build.yaml | 121 +++--------------------------------
 .github/workflows/test.yaml  | 112 ++-------------------------------
 2 files changed, 14 insertions(+), 219 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index b6b568d..b33cbdb 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -1,122 +1,21 @@
-name: build
+name: Build
 
 on:
   push:
     branches: [ 'main' ]
 
   workflow_dispatch:
     inputs:
-      target_branch:
-        description: 'Branch to deploy to. If not specified, `build-${BRANCH_NAME}` will be used.'
-        required: false
       version:
-        description: 'Version name to use for the build. If not specified, `build-${BRANCH_NAME}` will be used.'
+        description: |
+          The version of the project to build. Example: `1.0.3`.
+
+          If not provided, a development build with a version name
+          based on the branch name will be built. Otherwise, a release
+          build with the provided version will be built.
       required: false
 
 jobs:
-  # phase 1
-  list:
-    runs-on: ubuntu-latest
-
-    outputs:
-      target_branch: ${{ steps.defaults.outputs.target_branch }}
-      version: ${{ steps.defaults.outputs.version }}
-      component_matrix: ${{ steps.set_matrix.outputs.matrix }}
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: 'recursive'
-
-      - uses: viash-io/viash-actions/setup@v5
-
-      - name: Determine version tag from branch name
-        id: defaults
-        run: |
-          BRANCH_NAME=$(echo $GITHUB_REF | sed 's/refs\/heads\///')
-
-          VERSION=${{ github.event.inputs.version }}
-          if [ -z "$VERSION" ]; then
-            VERSION="build-$BRANCH_NAME"
-          fi
-          echo "version=$VERSION" >> $GITHUB_OUTPUT
-
-          TARGET_BRANCH=${{ github.event.inputs.target_branch }}
-          if [ -z "$TARGET_BRANCH" ]; then
-            TARGET_BRANCH="build-$BRANCH_NAME"
-          fi
-          echo "target_branch=$TARGET_BRANCH" >> $GITHUB_OUTPUT
-
-      - name: Remove target folder from .gitignore
-        run: |
-          # allow publishing the target folder
-          sed -i '/^\/target.*/d' .gitignore
-
-      - uses: viash-io/viash-actions/ns-build@v5
-        with:
-          config_mod: .version := '${{ steps.defaults.outputs.version }}'
-          parallel: true
-
-      - name: Deploy to target branch
-        uses: peaceiris/actions-gh-pages@v4
-        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          publish_dir: .
-          publish_branch: ${{ steps.defaults.outputs.target_branch }}
-
-      - id: ns_list
-        uses: viash-io/viash-actions/ns-list@v5
-        with:
-          platform: docker
-          src: src
-          format: json
-
-      - id: set_matrix
-        run: |
-          echo "matrix=$(jq -c '[ .[] |
-            {
-              "name": (.namespace + "/" + .name),
-              "dir": .info.config | capture("^(?<dir>.*\/)").dir
-            }
-          ]' ${{ steps.ns_list.outputs.output_file }} )" >> $GITHUB_OUTPUT
-
-  # phase 2
-  build:
-    needs: list
-
-    runs-on: ubuntu-latest
-
-    strategy:
-      fail-fast: false
-      matrix:
-        component: ${{ fromJson(needs.list.outputs.component_matrix) }}
-
-    steps:
-      # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.'
-      - uses: data-intuitive/reclaim-the-bytes@v2
-
-      - uses: actions/checkout@v4
-
-      - uses: viash-io/viash-actions/setup@v5
-
-      - name: Build container
-        uses: viash-io/viash-actions/ns-build@v5
-        with:
-          config_mod: .version := '${{ needs.list.outputs.version }}'
-          platform: docker
-          src: ${{ matrix.component.dir }}
-          setup: build
-
-      - name: Login to container registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ secrets.GTHB_USER }}
-          password: ${{ secrets.GTHB_PAT }}
-
-      - name: Push container
-        uses: viash-io/viash-actions/ns-build@v5
-        with:
-          config_mod: .version := '${{ needs.list.outputs.version }}'
-          platform: docker
-          src: ${{ matrix.component.dir }}
-          setup: push
\ No newline at end of file
+  build:
+    uses: openproblems-bio/actions/.github/workflows/build.yml@main
+    with:
+      version: ${{ github.event.inputs.version }}
\ No newline at end of file

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 0abad5c..96811dd 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -1,113 +1,9 @@
-name: test
+name: Test
 
 on:
-  pull_request:
   push:
-    branches: [ '**' ]
+  pull_request:
 
 jobs:
-  run_ci_check_job:
-    runs-on: ubuntu-latest
-    outputs:
-      run_ci: ${{ steps.github_cli.outputs.check }}
-    steps:
-      - name: 'Check if branch has an existing pull request and the trigger was a push'
-        id: github_cli
-        run: |
-          pull_request=$(gh pr list -R ${{ github.repository }} -H ${{ github.ref_name }} --json url --state open --limit 1 | jq '.[0].url')
-          # If the branch has a PR and this run was triggered by a push event, do not run
-          if [[ "$pull_request" != "null" && "$GITHUB_REF_NAME" != "main" && "${{ github.event_name == 'push' }}" == "true" && "${{ !contains(github.event.head_commit.message, 'ci force') }}" == "true" ]]; then
-            echo "check=false" >> $GITHUB_OUTPUT
-          else
-            echo "check=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GITHUB_TOKEN: ${{ secrets.GTHB_PAT }}
-
-  # phase 1
-  list:
-    needs: run_ci_check_job
-    env:
-      s3_bucket: s3://openproblems-data/resources_test
-    runs-on: ubuntu-latest
-    if: ${{ needs.run_ci_check_job.outputs.run_ci == 'true' }}
-
-    outputs:
-      matrix: ${{ steps.set_matrix.outputs.matrix }}
-      cache_key: ${{ steps.cache.outputs.cache_key }}
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          submodules: 'recursive'
-
-      - uses: christian-ci/action-yaml-github-output@v2
-        with:
-          file_path: _viash.yaml
-
-      - uses: viash-io/viash-actions/setup@v5
-
-      - uses: viash-io/viash-actions/project/sync-and-cache-s3@v5
-        id: cache
-        with:
-          s3_bucket: $s3_bucket/$NAME
-          dest_path: resources
-          cache_key_prefix: resources__
-
-      - id: ns_list
-        uses: viash-io/viash-actions/ns-list@v5
-        with:
-          engine: docker
-          format: json
-
-      - id: ns_list_filtered
-        uses: viash-io/viash-actions/project/detect-changed-components@v5
-        with:
-          input_file: "${{ steps.ns_list.outputs.output_file }}"
-
-      - id: set_matrix
-        run: |
-          echo "matrix=$(jq -c '[ .[] |
-            {
-              "name": (.namespace + "/" + .name),
-              "config": .info.config
-            }
-          ]' ${{ steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT
-
-  # phase 2
-  viash_test:
-    needs: list
-    if: ${{ needs.list.outputs.matrix != '[]' && needs.list.outputs.matrix != '' }}
-    runs-on: ubuntu-latest
-
-    strategy:
-      fail-fast: false
-      matrix:
-        component: ${{ fromJson(needs.list.outputs.matrix) }}
-
-    steps:
-      # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.'
-      - uses: data-intuitive/reclaim-the-bytes@v2
-
-      - uses: actions/checkout@v4
-        with:
-          submodules: 'recursive'
-
-      - uses: viash-io/viash-actions/setup@v5
-
-      # use cache
-      - name: Cache resources data
-        uses: actions/cache@v4
-        timeout-minutes: 10
-        with:
-          path: resources
-          key: ${{ needs.list.outputs.cache_key }}
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          VIASH_TEMP=$RUNNER_TEMP/viash viash test \
-            "${{ matrix.component.config }}" \
-            --cpus 2 \
-            --memory "16gb"
+  build:
+    uses: openproblems-bio/actions/.github/workflows/test.yml@main
\ No newline at end of file

From 4ec22d823ddd3e11e489775e02f8c4a90cad1dfb Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 1 Jul 2024 14:46:49 +0200
Subject: [PATCH 022/103] add s3 to _viash

---
 INSTRUCTIONS.md | 2 +-
 _viash.yaml     | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 7a49542..032d525 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -13,7 +13,7 @@ A list of required software to start developing a new task can be found in the [
 * Update the `name` field to the name of the task in snake_case.
 * Update the `description` field to a short description of the task.
 * Add a keyword to the `keywords` field that describes the task.
-* Update the `<task_name>` in the links field to the name of the task in snake_case.
+* Update the `<task_name>` in the links/info field to the name of the task in snake_case.
 
 ### `task_info.yaml`
 
diff --git a/_viash.yaml b/_viash.yaml
index 0b10f6b..fd7758a 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -9,6 +9,11 @@ links:
   issue_tracker: https://github.com/openproblems-bio/task_<task_name>/issues
   repository: https://github.com/openproblems-bio/task_<task_name>
   docker_registry: ghcr.io/openproblems-bio
+info:
+  test_resources:
+    - type: s3
+      path: s3://openproblems-data/resources_test/<task_name>
+      dest: test_resources
 
 version: dev

From 5a08ceb65a0cef007cc32c642ba266d65525bf36 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 1 Jul 2024 15:18:43 +0200
Subject: [PATCH 023/103] Add github templates

---
 .github/ISSUE_TEMPLATE/bug_report.md      | 24 +++++++++++++++++++++
 .github/ISSUE_TEMPLATE/config.yml         |  1 +
 .github/ISSUE_TEMPLATE/feature_request.md | 20 +++++++++++++++++++
 .github/PULL_REQUEST_TEMPLATE.md          | 17 ++++++++++++++++
 4 files changed, 62 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/ISSUE_TEMPLATE/config.yml
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md
 create mode 100644 .github/PULL_REQUEST_TEMPLATE.md

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..9a8a64b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,24 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: [bug]
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000..a49eab2
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: true
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..c17d3c0
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: [enhancement]
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..3717137
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,17 @@
+## Describe your changes
+
+
+
+## Checklist before requesting a review
+- [ ] I have performed a self-review of my code
+
+- Check the correct box. Does this PR contain:
+  - [ ] Breaking changes
+  - [ ] New functionality
+  - [ ] Major changes
+  - [ ] Minor changes
+  - [ ] Bug fixes
+
+- [ ] Proposed changes are described in the CHANGELOG.md
+
+- [ ] CI Tests succeed and look good!
\ No newline at end of file

From 97f74b32e9b60f777ff917bc34d134c8c3e8a900 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 1 Jul 2024 16:12:17 +0200
Subject: [PATCH 024/103] update s3 link

---
 INSTRUCTIONS.md | 2 +-
 _viash.yaml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 032d525..892412c 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -13,7 +13,7 @@ A list of required software to start developing a new task can be found in the [
 * Update the `name` field to the name of the task in snake_case.
 * Update the `description` field to a short description of the task.
 * Add a keyword to the `keywords` field that describes the task.
-* Update the `<task_name>` in the links/info field to the name of the task in snake_case.
+* Update the `<task_name>`/`task_template` in the links/info field to the name of the task in snake_case.
 ### `task_info.yaml`
 
diff --git a/_viash.yaml b/_viash.yaml
index fd7758a..b61e9cf 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -12,7 +12,7 @@ links:
 info:
   test_resources:
     - type: s3
-      path: s3://openproblems-data/resources_test/<task_name>
+      path: s3://openproblems-data/resources_test/task_template
       dest: test_resources
 
 version: dev

From 361e7e5627584f04a07924283534f97e85a3eb7e Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 1 Jul 2024 16:12:53 +0200
Subject: [PATCH 025/103] update common submodule

---
 common | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common b/common
index ecbb47c..95dcc63 160000
--- a/common
+++ b/common
@@ -1 +1 @@
-Subproject commit ecbb47ca0cb36e9350760cf126d5c7e3125f26de
+Subproject commit 95dcc63c124ab358ce7a7c48f916c51d55181172

From b74c327580b83082653b7639d05e3a24db481361 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Wed, 3 Jul 2024 10:32:08 +0200
Subject: [PATCH 026/103] fix resource test path

---
 src/api/comp_control_method.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml
index 03a4d0e..23c56a8 100644
--- a/src/api/comp_control_method.yaml
+++ b/src/api/comp_control_method.yaml
@@ -21,7 +21,7 @@ arguments:
     direction: output
 test_resources:
   - type: python_script
-    path: /common/src/component_tests/run_and_check_output.py
+    path: /common/component_tests/run_and_check_output.py
   - type: python_script
     path: /common/component_tests/check_method_config.py
   - path: /common/library.bib

From a4abf7f84577fbcbbbbb7a1dc7a82ecfe066dc8b Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Wed, 3 Jul 2024 10:42:03 +0200
Subject: [PATCH 027/103] Fix resource test path

---
 src/api/comp_metric.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
index 61a138b..62631dc 100644
--- a/src/api/comp_metric.yaml
+++ b/src/api/comp_metric.yaml
@@ -21,7 +21,7 @@ arguments:
     required: true
 test_resources:
   - type: python_script
-    path: /common/comp_tests/check_metric_config.py
+    path: /common/component_tests/check_metric_config.py
   - type: python_script
     path: /common/component_tests/run_and_check_output.py
   - path: /common/library.bib

From 692c02e77a4040ad98cc927ce63dfc85eade4a9f Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Wed, 3 Jul 2024 13:59:06 +0200
Subject: [PATCH 028/103] Update task name

---
 INSTRUCTIONS.md | 7 +++----
 _viash.yaml     | 4 ++--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index 892412c..ecd5e52 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -10,14 +10,13 @@ A list of required software to start developing a new task can be found in the [
 
 ### `_viash.yaml`
 
-* Update the `name` field to the name of the task in snake_case.
+* Update the `name` field to the name of the task in snake_case; the name should start with `task_`.
 * Update the `description` field to a short description of the task.
 * Add a keyword to the `keywords` field that describes the task.
-* Update the `<task_name>`/`task_template` in the links/info field to the name of the task in snake_case.
+* Update the `task_template` in the links/info field to the name of the task in snake_case.
 
 ### `task_info.yaml`
-
 * Update the `src/api/task_info.yaml` file with the information you have provided in the task issue.
 * Update the `<task_name>` in the `readme` field to the name of the task.
@@ -32,7 +31,7 @@ scripts/init_submodule.sh ## Resources The OpenProblems team has provided some test resources that can be used to test the task. These resources are stored in the `resources_test` folder. The `scripts/download_resources.sh` script can be used to download these resources. -If these resources are not sufficient, you can add more resources to the `resources_test` folder. The `scripts/download_resources.sh` script can be updated to download these resources. When using new test_resources let the OP team know so they can be added to the s3 bucket. +If these resources are not sufficient, you can add more resources to the `resources_test` folder. The `scripts/download_resources.sh` script can be updated to download these resources. When using new test resources let the OP team know so they can be added to the s3 bucket. ```bash scripts/download_resources.sh diff --git a/_viash.yaml b/_viash.yaml index b61e9cf..83fb187 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -6,8 +6,8 @@ description: | license: MIT keywords: [single-cell, openproblems, benchmark] links: - issue_tracker: https://github.com/openproblems-bio/task_/issues - repository: https://github.com/openproblems-bio/task_ + issue_tracker: https://github.com/openproblems-bio/task_template/issues + repository: https://github.com/openproblems-bio/task_template docker_registry: ghcr.io/openproblems-bio info: test_resources: From c13fea4b5e7953f12d065289483fe441a9cb0420 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 4 Jul 2024 10:22:10 +0200 Subject: [PATCH 029/103] remove `readme` field from task_info --- common | 2 +- src/api/task_info.yaml | 46 ------------------------------------------ 2 files changed, 1 insertion(+), 47 deletions(-) diff --git a/common b/common index 95dcc63..b3db30d 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 95dcc63c124ab358ce7a7c48f916c51d55181172 +Subproject commit b3db30dc136f37c8b8b4cb866f1e5ca6c2304ee6 diff --git a/src/api/task_info.yaml b/src/api/task_info.yaml index 0a14da1..4899361 100644 --- a/src/api/task_info.yaml +++ b/src/api/task_info.yaml @@ -2,52 +2,6 @@ name: A unique identifier. Can only contain lowercase letters, numbers or unders label: A unique, human-readable, short label. Used for creating summary tables and visualisations. summary: A one sentence summary of purpose and methodology. Used for creating an overview tables. image: The name of the image file to use for the component on the website. -readme: | - ## Installation - - You need to have Docker, Java, and Viash installed. Follow - [these instructions](https://openproblems.bio/documentation/fundamentals/requirements) - to install the required dependencies. - - ## Add a method - - To add a method to the repository, follow the instructions in the `scripts/add_a_method.sh` script. - - ## Frequently used commands - - To get started, you can run the following commands: - - ```bash - git clone git@github.com:openproblems-bio/task_.git - - cd - - # initialise submodule - scripts/init_submodule.sh - - # download resources - scripts/download_resources.sh - ``` - - To run the benchmark, you first need to build the components. 
-
-  ```bash
-  viash ns build --parallel --setup cachedbuild
-
-  scripts/run_benchmark.sh
-  ```
-
-  After adding a component, it is recommended to run the tests to ensure that the component is working correctly:
-
-  ```bash
-  viash ns test --parallel
-  ```
-
-  Optionally, you can provide the `--query` argument to test only a subset of components:
-
-  ```bash
-  viash ns test --parallel --query "component_name"
-  ```
 motivation: |
   Explain the motivation behind your proposed task. Describe the biological or computational problem you aim to address and why it’s important. Discuss the current state of research in

From f3fb01fe282c7a786fe34125c319605c79ec17c2 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 4 Jul 2024 17:33:08 +0200
Subject: [PATCH 030/103] update submodule

---
 common | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common b/common
index b3db30d..a6e861b 160000
--- a/common
+++ b/common
@@ -1 +1 @@
-Subproject commit b3db30dc136f37c8b8b4cb866f1e5ca6c2304ee6
+Subproject commit a6e861b8fcaef67f9de9a566efc0a4599f208a86

From 0e31cc0cf002321355b5fda93a1d404e9b93c143 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 4 Jul 2024 21:04:32 +0200
Subject: [PATCH 031/103] update submodule

---
 common | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common b/common
index a6e861b..a6ea822 160000
--- a/common
+++ b/common
@@ -1 +1 @@
-Subproject commit a6e861b8fcaef67f9de9a566efc0a4599f208a86
+Subproject commit a6ea822566a66de8d0d80b8ac7c80cbcd8ad9b6d

From 6900fb422811fbafd8277c70459ec1a1b7372285 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 4 Jul 2024 21:16:27 +0200
Subject: [PATCH 032/103] update submodule documentation

---
 INSTRUCTIONS.md           | 10 ++++++++--
 README.md                 |  9 +++++++++
 scripts/init_submodule.sh |  3 ---
 3 files changed, 17 insertions(+), 5 deletions(-)
 delete mode 100755 scripts/init_submodule.sh

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
index ecd5e52..e33c28b 100644
--- a/INSTRUCTIONS.md
+++ b/INSTRUCTIONS.md
@@ -22,10 +22,16 @@ A list of required software to start developing a new task can be found in the [
 
 ### `common` submodule
 
-Initialize the `common` submodule by running the following command:
+If the submodule does not show any files, you will need to initialize the `common` submodule by running the following command:
 
 ```bash
-scripts/init_submodule.sh
+git submodule update --init --recursive
+```
+
+To update the repository with the latest changes from the submodule, you can run the following command:
+
+```bash
+git pull --recurse-submodules
 ```

diff --git a/README.md b/README.md
index 0c87796..bcc18d0 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,15 @@ The instructions below will guide you through creating a new repository from thi
 * Set the repository visibility to public.
 * Click "Create repository from template".
 
+## Clone the repository
+
+To clone the repository with the submodule files, you can use the following command:
+
+```bash
+git clone --recurse-submodules git@github.com:openproblems-bio/<task_name>.git
+```
+
+
 ## What to do next
 
 Check out the [instructions](INSTRUCTIONS.md) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands.
diff --git a/scripts/init_submodule.sh b/scripts/init_submodule.sh
deleted file mode 100755
index ef2a754..0000000
--- a/scripts/init_submodule.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-git submodule update --init --recursive
\ No newline at end of file

From 5708aa3d8d1b9f2d4db8310dc83256b424396b59 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 4 Jul 2024 22:02:53 +0200
Subject: [PATCH 033/103] update submodule

---
 common | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common b/common
index a6ea822..04a1b2a 160000
--- a/common
+++ b/common
@@ -1 +1 @@
-Subproject commit a6ea822566a66de8d0d80b8ac7c80cbcd8ad9b6d
+Subproject commit 04a1b2abb8dbb71d45f2ec60b98d8e4b3ccce4a9

From 98367cd2ef6fb4ee87069929b6e0588ebf114e8b Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 4 Jul 2024 22:06:15 +0200
Subject: [PATCH 034/103] update readme

---
 README.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index bcc18d0..71dfc9f 100644
--- a/README.md
+++ b/README.md
@@ -25,12 +25,11 @@ The instructions below will guide you through creating a new repository from thi
 To clone the repository with the submodule files, you can use the following command:
 
 ```bash
-git clone --recurse-submodules git@github.com:openproblems-bio/<task_name>.git
+git clone --recursive git@github.com:openproblems-bio/<task_name>.git
 ```
 
-
 ## What to do next
 
-Check out the [instructions](INSTRUCTIONS.md) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands.
+Check out the [instructions](common/INSTRUCTIONS.md) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands.
 
-For more information on the OpenProblems v2, check out the [Documentation](https://openproblems.bio/documentation/) on the Open Problems website.
+For more information on the OpenProblems v2, check out the [Documentation](https://openproblems.bio/documentation/).
\ No newline at end of file

From 01ee29ecd260dab74213273b0cbdbdb236446261 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 5 Jul 2024 08:02:34 +0200
Subject: [PATCH 035/103] add changelog

---
 CHANGELOG.MD | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 CHANGELOG.MD

diff --git a/CHANGELOG.MD b/CHANGELOG.MD
new file mode 100644
index 0000000..e69de29

From 49370785daaa82f4a0d63edb2de783fe7f3918cb Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 5 Jul 2024 10:29:43 +0200
Subject: [PATCH 036/103] update _viash

---
 _viash.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/_viash.yaml b/_viash.yaml
index 83fb187..c6536ac 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -1,6 +1,7 @@
 viash_version: 0.9.0-RC6
 
 name: task_template
+organization: openproblems-bio
 description: |
   An OpenProblems benchmark task.
 license: MIT
 keywords: [single-cell, openproblems, benchmark]
 links:
   issue_tracker: https://github.com/openproblems-bio/task_template/issues
   repository: https://github.com/openproblems-bio/task_template
-  docker_registry: ghcr.io/openproblems-bio
+  docker_registry: ghcr.io

From 4e3faaa697c169dfae807a4da9a4770a91fedb2d Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Fri, 5 Jul 2024 10:30:57 +0200
Subject: [PATCH 037/103] Update s3 buckets in _viash

---
 _viash.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/_viash.yaml b/_viash.yaml
index c6536ac..0000009 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -14,7 +14,10 @@ info:
   test_resources:
     - type: s3
       path: s3://openproblems-data/resources_test/task_template
-      dest: test_resources
+      dest: resources_test/task_template
+    - type: s3
+      path: s3://openproblems-data/resources_test/common/
+      dest: resources_test/common
 
 version: dev

From 2d8f3c95991e1d433c908ff23be71a0739d2838c Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 8 Jul 2024 15:15:47 +0200
Subject: [PATCH 038/103] rename changelog

---
 CHANGELOG.MD => CHANGELOG.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename CHANGELOG.MD => CHANGELOG.md (100%)

diff --git a/CHANGELOG.MD b/CHANGELOG.md
similarity index 100%
rename from CHANGELOG.MD
rename to CHANGELOG.md

From 8d7adaa7fd044be39b4fa6f769a584adb7d37417 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 8 Jul 2024 15:40:40 +0200
Subject: [PATCH 039/103] Add changelog template

---
 CHANGELOG.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e69de29..3a2488a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -0,0 +1,14 @@
+# task_template x.y.z
+
+## BREAKING CHANGES
+
+
+
+## NEW FUNCTIONALITY
+
+## MAJOR CHANGES
+
+## MINOR CHANGES
+
+## BUGFIXES
+

From 9293510761949fb6ec2df73e13d33dca7f1a1b72 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 8 Jul 2024 20:51:50 +0200
Subject: [PATCH 040/103] relocate instructions to common

---
 INSTRUCTIONS.md | 121 ------------------------------------------------
 1 file changed, 121 deletions(-)
 delete mode 100644 INSTRUCTIONS.md

diff --git a/INSTRUCTIONS.md b/INSTRUCTIONS.md
deleted file mode 100644
index e33c28b..0000000
--- a/INSTRUCTIONS.md
+++ /dev/null
@@ -1,121 +0,0 @@
-# Instructions
-
-This is a guide on what to do after you have created a new task repository from the template. More in depth information about how to create a new task can be found in the [OpenProblems Documentation](https://openproblems.bio/documentation/create_task/).
-
-## Requirements
-
-A list of required software to start developing a new task can be found in the [OpenProblems Requirements](https://openproblems.bio/documentation/create_task/requirements).
-
-## First things first
-
-### `_viash.yaml`
-
-* Update the `name` field to the name of the task in snake_case; the name should start with `task_`.
-* Update the `description` field to a short description of the task.
-* Add a keyword to the `keywords` field that describes the task.
-* Update the `task_template` in the links/info field to the name of the task in snake_case.
-
-### `task_info.yaml`
-
-* Update the `src/api/task_info.yaml` file with the information you have provided in the task issue.
-* Update the `<task_name>` in the `readme` field to the name of the task.
-### `common` submodule
-
-If the submodule does not show any files, you will need to initialize the `common` submodule by running the following command:
-
-```bash
-git submodule update --init --recursive
-```
-
-To update the repository with the latest changes from the submodule, you can run the following command:
-
-```bash
-git pull --recurse-submodules
-```
-
-## Resources
-
-The OpenProblems team has provided some test resources that can be used to test the task. These resources are stored in the `resources_test` folder. The `scripts/download_resources.sh` script can be used to download these resources.
-If these resources are not sufficient, you can add more resources to the `resources_test` folder. The `scripts/download_resources.sh` script can be updated to download these resources. When using new test resources let the OP team know so they can be added to the s3 bucket.
-
-```bash
-scripts/download_resources.sh
-```
-
-## Next steps
-
-### API files ([docs](https://openproblems.bio/documentation/create_task/design_api))
-
-Update the API files in the `src/api` folder. These files define the input and output of the methods and metrics.
-
-### Components ([docs](https://openproblems.bio/documentation/create_task/create_components))
-
-To create a component, you can run the respective script in the `script` directory. Before running the script make sure to update the variables `task_name`, `component_name` and `component_lang` and save the file. For additional components you will only need to update the `component_name` and `component_lang` variables.
-
-```bash
-scripts/add_a_control_method.sh
-```
-
-```bash
-scripts/add_a_method.sh
-```
-
-```bash
-scripts/add_a_metric.sh
-```
-
-For each type of component there already is a first component created that you can modify.
-
-1. Update the `.info` fields in the `config.vsh.yaml`.
-2. Add any component specific arguments to the `config.vsh.yaml` file.
-3. Add any additional resources that are required for the component.
-4. Update the docker engine image setup if additional packages are required.
-5. If you know the required memory and or CPU you can adjust the nextflow `.directive.labels` field. In addition if your component requires a GPU you can add the `gpu` label to the field.
-6. Update the `script.py` or `script.R` file with the code for the component.
-
-> [!NOTE]
-> You can remove the comments in the `config.vsh.yaml` file after you have updated the file.
-
-### Testing Components ([docs](https://openproblems.bio/documentation/create_component/run_tests))
-
-You can test the component by running the following command:
-
-```bash
-viash test /path/to/config.vsh.yaml
-```
-
-You can also test all components by running the following command:
-
-```bash
-scripts/test_all_components.sh
-```
-
-It is possible to customise the command in the above script by adding a `-q` argument to only perform the test on for example methods e.g. ` -q methods`.
-
-
-## Dataset processor ([docs](https://openproblems.bio/documentation/create_task/dataset_processor))
-
-The dataset processor is a script that removes all unnecessary info from the dataset for your task. This info is defined in the `api/file_common_dataset.yaml` file. From this filtered dataset several files are created that are used by the methods and metrics. This safeguards against data leaks and makes sure the structure of the data cannot be altered for a method or a metric.
-
-To create the data processor there is no template available. You can follow the guideline in the documentation. Store the processor in the `src/process_dataset` folder.
Store the processor in the `src/process_dataset` folder.
-
-Be sure to update the `file_common_dataset.yaml` file with the correct information required for the methods/metrics.
-
-> [!IMPORTANT]
-> When using your own datasets please advise the openproblems team on how to add these datasets to the s3 bucket.
-> As the dataset processor should make use of the `common` datasets folder in the `resources` or `resources_test` directory.
-
-To create the resources and test_resources for the task we will create a nextflow workflow that will process the datasets. This workflow will be created together with the openproblems team.
-
-## README
-
-To create the task `README` file preform following command:
-
-```bash
-scripts/create_readme.sh
-```
-
-## Benchmarking ([docs](https://openproblems.bio/documentation/create_task/create_workflow))
-
-When you are finished with creating your components and datset processor you can create a workflow to benchmark the components. This workflow will be created together with the openproblems team.

From b88f6ed488b47b989cfb047e240da9f442832e3e Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 8 Jul 2024 20:53:11 +0200
Subject: [PATCH 041/103] change submodule link to https

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index a7ec0e3..c07c083 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "common"]
 	path = common
-	url = git@github.com:openproblems-bio/common-resources.git
+	url = https://github.com/openproblems-bio/common_resources.git

From 7eeb2f768a23fd3e70d006a993e485ece99b5ea9 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Mon, 8 Jul 2024 20:53:56 +0200
Subject: [PATCH 042/103] add reference to submodule instructions

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 71dfc9f..1489187 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This repo is a template to create a new task for the OpenProblems v2. This repo
 ## Create a repository from this template
 
 > [!IMPORTANT]
-> Before creating a new repository, make sure you are part of the openProblems task team. This will be done when you create an issue for the task and you got the go ahead to create the task.
+> Before creating a new repository, make sure you are part of the openProblems task team. This will be done when you create an issue for the task and you get the go ahead to create the task.
 > For more information on how to create a new task, check out the [Create a new task](https://openproblems.bio/documentation/create_task/) documentation.
 
 The instructions below will guide you through creating a new repository from this template ([creating-a-repository-from-a-template](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template)).
@@ -27,6 +27,8 @@ To clone the repository with the submodule files, you can use the following comm
 ```bash
 git clone --recursive git@github.com:openproblems-bio/.git
 ```
+>[!NOTE]
+> If no files are visible in the submodule after cloning with the above command, check the instructions [here](common/README.md).
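A minimal sketch of the manual fallback that note points to, reusing the same commands the removed INSTRUCTIONS.md documented (assuming the clone succeeded but the `common/` directory came up empty):

```bash
# populate the common submodule in an already-cloned repository
git submodule update --init --recursive

# afterwards, keep the submodule in sync when pulling upstream changes
git pull --recurse-submodules
```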
## What to do next From 793c05e474cbbffc0b321023a41cdd5d1b95e72d Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Mon, 8 Jul 2024 21:16:16 +0200 Subject: [PATCH 043/103] Relocate task_info to _viash [WIP] --- _viash.yaml | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/_viash.yaml b/_viash.yaml index 0000009..7a5dda3 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1,16 +1,27 @@ viash_version: 0.9.0-RC6 +# Step 1: Change the name of the task. +# example: task_name_of_this_task name: task_template organization: openproblems-bio +version: dev +# Step 2: Update the description to a short description of the task. description: | An OpenProblems benchmark task. license: MIT +# Step 3: Add keywords to describe the task. keywords: [single-cell, openproblems, benchmark] +# Step 4: Update the `task_template` to the name of the task from step 1. links: issue_tracker: https://github.com/openproblems-bio/task_template/issues repository: https://github.com/openproblems-bio/task_template docker_registry: ghcr.io info: + summary: + motivation: + label: + image: + test_resources: - type: s3 path: s3://openproblems-data/resources_test/task_template @@ -19,7 +30,25 @@ info: path: s3://openproblems-data/resources_test/common/ dest: resources_test/common -version: dev +# Step 5: Update te authors of the task. +authors: + # Full name of the author, usually in the name of FirstName MiddleName LastName. + - name: ... + # Role of the author. Possible values: + # + # * `"author"`: Authors who have made substantial contributions to the component. + # * `"maintainer"`: The maintainer of the component. + # * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). + roles: [ ... ] + # Additional information on the author + info: + github: ... + orcid: ... + email: ... + twitter: ... + linkedin: ... + + config_mods: | .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } From 21afc2c0f2dc648791f878b67a30cb3f053c6ebe Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 09:31:39 +0200 Subject: [PATCH 044/103] Update _viash [WIP] --- _viash.yaml | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index 7a5dda3..abd5d08 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -16,11 +16,21 @@ links: issue_tracker: https://github.com/openproblems-bio/task_template/issues repository: https://github.com/openproblems-bio/task_template docker_registry: ghcr.io +# Step 5: Update the info: - summary: - motivation: - label: - image: + name: A unique identifier. Can only contain lowercase letters, numbers or underscores. (This is the same as the name above but without the `task_` prefix) + label: A unique, human-readable, short label. Used for creating summary tables and visualisations. + description: | + Provide a clear and concise description of your task, detailing the specific problem it aims + to solve. Outline the input data types, the expected output, and any assumptions or constraints. + Be sure to explain any terminology or concepts that are essential for understanding the task. + summary: A one sentence summary of purpose and methodology. Used for creating an overview tables. 
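  # (illustrative sketch, not part of this patch) a filled-in summary for a
  # hypothetical label-projection task could read:
  # summary: "Predict cell type labels of held-out test cells from an annotated training set."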
+ motivation: | + Explain the motivation behind your proposed task. Describe the biological or computational + problem you aim to address and why it’s important. Discuss the current state of research in + this area and any gaps or challenges that your task could help address. This section + should convince readers of the significance and relevance of your task. + image: The name of the image file to use for the component on the website. test_resources: - type: s3 @@ -48,7 +58,5 @@ authors: twitter: ... linkedin: ... - - config_mods: | .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } From a675aa8cb98ed9da707628f7eb357b9f76425200 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 09:40:44 +0200 Subject: [PATCH 045/103] add last step comments to _viash --- _viash.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index abd5d08..3aa30aa 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -16,7 +16,7 @@ links: issue_tracker: https://github.com/openproblems-bio/task_template/issues repository: https://github.com/openproblems-bio/task_template docker_registry: ghcr.io -# Step 5: Update the +# Step 5: Update the info fields to the text from the task issue. info: name: A unique identifier. Can only contain lowercase letters, numbers or underscores. (This is the same as the name above but without the `task_` prefix) label: A unique, human-readable, short label. Used for creating summary tables and visualisations. @@ -31,7 +31,7 @@ info: this area and any gaps or challenges that your task could help address. This section should convince readers of the significance and relevance of your task. image: The name of the image file to use for the component on the website. - + # Step 6: Replace the task_template to the name of the task in `info.name`. test_resources: - type: s3 path: s3://openproblems-data/resources_test/task_template @@ -40,7 +40,7 @@ info: path: s3://openproblems-data/resources_test/common/ dest: resources_test/common -# Step 5: Update te authors of the task. +# Step 7: Update te authors of the task. authors: # Full name of the author, usually in the name of FirstName MiddleName LastName. - name: ... 
From cb5730a6dcec32a740e2c50f2fc12a17c83a2911 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 16:19:56 +0200 Subject: [PATCH 046/103] Update common submodule --- common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common b/common index 04a1b2a..38aca0d 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 04a1b2abb8dbb71d45f2ec60b98d8e4b3ccce4a9 +Subproject commit 38aca0d2360a1e7a05032e1255a9081439f00162 From aafaa760422906946444a5a0c56d10cfa1e5f925 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 16:29:35 +0200 Subject: [PATCH 047/103] Update download resources --- scripts/download_resources.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh index 9eb1285..3781a29 100755 --- a/scripts/download_resources.sh +++ b/scripts/download_resources.sh @@ -5,8 +5,6 @@ set -e echo ">> Downloading resources" common/sync_resources/sync_resources \ - --input "s3://openproblems-data/resources_test/common/" \ - --output "resources_test/common" \ --delete # After finishing the task and the task specific test_resources are uploaded to s3, uncomment: From affce64c2bdcf3596f5de464a079a006357d45d6 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 17:11:26 +0200 Subject: [PATCH 048/103] Create working components [WIP] --- src/api/comp_control_method.yaml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 23c56a8..b5186b5 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -3,13 +3,18 @@ info: type: control_method type_info: label: Control Method - summary: A control method. + summary: Quality control methods for verifying the pipeline. description: | - A control method to predict effects. + This folder contains control components for the task. + These components have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. arguments: - name: --input_train __merge__: file_train_h5ad.yaml - required: false + required: true direction: input - name: --input_test __merge__: file_test_h5ad.yaml @@ -25,5 +30,5 @@ test_resources: - type: python_script path: /common/component_tests/check_method_config.py - path: /common/library.bib - #TODO: - path: fill in e.g. /resources/denoising/pancreas - #TODO: dest: fill in e.g. 
resources/denoising/pancreas \ No newline at end of file + - path: /resources_test/task_template/cxg_mouse_pancreas_atlas + dest: resources_test/task_template/cxg_mouse_pancreas_atlas \ No newline at end of file From 98ab68a9a8b7c8eff1a8f180e87bf697938f9f3d Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 17:13:40 +0200 Subject: [PATCH 049/103] Create working components [WIP]: Update file train api --- src/api/file_train_h5ad.yaml | 37 ++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/src/api/file_train_h5ad.yaml b/src/api/file_train_h5ad.yaml index 9ec0e86..61a3679 100644 --- a/src/api/file_train_h5ad.yaml +++ b/src/api/file_train_h5ad.yaml @@ -1,19 +1,48 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/denoising/pancreas/train.h5ad" +example: "resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad" info: label: "Training data" - summary: The subset of molecules used for the training dataset + summary: "The training data in h5ad format" slots: layers: - type: integer name: counts description: Raw counts required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true uns: - type: string name: dataset_id description: "A unique identifier for the dataset" required: true - # obs: - # ... \ No newline at end of file + - type: string + name: normalization_id + description: "Which normalization was used" + required: true \ No newline at end of file From f763df05e1f3870d15c15dd0e6d16397991c0141 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 17:15:14 +0200 Subject: [PATCH 050/103] Create working components [WIP]: Update file test api --- src/api/file_test_h5ad.yaml | 53 ++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/src/api/file_test_h5ad.yaml b/src/api/file_test_h5ad.yaml index d373b84..908add2 100644 --- a/src/api/file_test_h5ad.yaml +++ b/src/api/file_test_h5ad.yaml @@ -1,6 +1,6 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/denoising/pancreas/test.h5ad" +example: "resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad" info: label: "Test data" summary: The subset of molecules used for the test dataset @@ -10,36 +10,35 @@ info: name: counts description: Raw counts required: true + - type: double + name: normalized + description: Normalized counts + required: true + obs: + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. 
+ required: true uns: - type: string name: dataset_id description: "A unique identifier for the dataset" required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: train_sum - type: integer - description: The total number of counts in the training dataset. + name: normalization_id + description: "Which normalization was used" required: true \ No newline at end of file From 7a1d0e5926d5e4b6789458c1a29d5e12d0bfe278 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 17:18:15 +0200 Subject: [PATCH 051/103] Create working components [WIP]: Update file prediction API --- src/api/file_prediction.yaml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml index b473e75..629d146 100644 --- a/src/api/file_prediction.yaml +++ b/src/api/file_prediction.yaml @@ -1,20 +1,24 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/denoising/pancreas/denoised.h5ad" +example: "resources_test/task_template/pancreas/prediction.h5ad" info: label: "Predicted data" summary: A predicted dataset as output by a method. slots: - layers: - - type: integer - name: prediction - description: predicted data + obs: + - type: string + name: label_pred + description: Predicted labels for the test cells. 
required: true uns: - type: string name: dataset_id description: "A unique identifier for the dataset" required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true - type: string name: method_id description: "A unique identifier for the method" From 782c2eb035f1528db17d057fa9f3d385a893b33d Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 20:14:22 +0200 Subject: [PATCH 052/103] Create working components [WIP]: update resources_test path --- src/api/file_test_h5ad.yaml | 2 +- src/api/file_train_h5ad.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/file_test_h5ad.yaml b/src/api/file_test_h5ad.yaml index 908add2..f53a396 100644 --- a/src/api/file_test_h5ad.yaml +++ b/src/api/file_test_h5ad.yaml @@ -1,6 +1,6 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad" +example: "resources_test/task_template/pancreas/test.h5ad" info: label: "Test data" summary: The subset of molecules used for the test dataset diff --git a/src/api/file_train_h5ad.yaml b/src/api/file_train_h5ad.yaml index 61a3679..e9c2bb4 100644 --- a/src/api/file_train_h5ad.yaml +++ b/src/api/file_train_h5ad.yaml @@ -1,6 +1,6 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad" +example: "resources_test/task_template/pancreas/train.h5ad" info: label: "Training data" summary: "The training data in h5ad format" From 2aca12035e9f2dcbdd404b3e578dc59bbf5a539e Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 20:22:35 +0200 Subject: [PATCH 053/103] Update config control_method to working --- .../my_control_method/config.vsh.yaml | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/control_methods/my_control_method/config.vsh.yaml b/src/control_methods/my_control_method/config.vsh.yaml index bba79d9..2438332 100644 --- a/src/control_methods/my_control_method/config.vsh.yaml +++ b/src/control_methods/my_control_method/config.vsh.yaml @@ -8,20 +8,27 @@ __merge__: ../../api/comp_control_method.yaml # A unique identifier for your component (required). # Can contain only lowercase letters or underscores. -name: my_control_method +name: logistic_regression # Metadata for your component info: # A relatively short label, used when rendering visualisations (required) - label: My Control Method + label: Logistic Regression # A one sentence summary of how this method works (required). Used when # rendering summary tables. - summary: "FILL IN: A one sentence summary of this method." + summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." # A multi-line description of how this component works (required). Used # when rendering reference documentation. description: | - FILL IN: A (multi-line) description of how this method works. - # Which normalisation method this component prefers to use (required). + Logistic Regression estimates parameters of a logistic function for + multivariate classification tasks. Here, we use 100-dimensional whitened PCA + coordinates as independent variables, and the model minimises the cross + entropy loss over all cell type classes. 
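  # (editor's note) "cross entropy loss over all cell type classes" refers to the
  # multinomial objective L = -sum_c y_c * log(p_c) per cell, where p is the
  # softmax over the class logits; this is scikit-learn's default behaviour for
  # LogisticRegression with the lbfgs solver.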
+ + reference: "hosmer2013applied" + repository_url: https://github.com/scikit-learn/scikit-learn + documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" + # Which normalization method this component prefers to use (required). preferred_normalization: log_cp10k # Component-specific parameters (optional) @@ -43,12 +50,12 @@ resources: engines: # Specifications for the Docker image for this component. - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_python:1.1.0 # Add custom dependencies here (optional). For more information, see # https://viash.io/reference/config/engines/docker/#setup . - # setup: - # - type: python - # packages: scib==1.1.5 + setup: + - type: python + packages: scikit-learn runners: # This platform allows running the component natively @@ -56,4 +63,4 @@ runners: # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: - label: [midtime,midmem,midcpu] + label: [midtime,midmem,lowcpu] From 045c4b10aaa10003f94b5cd4753c0ad46e2d5982 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 20:28:18 +0200 Subject: [PATCH 054/103] Create working components [WIP]: Update control_method script --- .../my_control_method/script.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/control_methods/my_control_method/script.py b/src/control_methods/my_control_method/script.py index f97215f..426043b 100644 --- a/src/control_methods/my_control_method/script.py +++ b/src/control_methods/my_control_method/script.py @@ -1,39 +1,43 @@ import anndata as ad +import sklearn.linear_model ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { - 'train_h5ad': 'resources_test/task_template/pancreas/train_h5ad.h5ad', - 'test_h5ad': 'resources_test/task_template/pancreas/test_h5ad.h5ad', + 'input_train': 'resources_test/task_template/pancreas/train.h5ad', + 'input_test': 'resources_test/task_template/pancreas/test.h5ad', 'output': 'output.h5ad' } meta = { - 'name': 'my_control_method' + 'name': 'logistic_regression' } ## VIASH END print('Reading input files', flush=True) -train_h5ad = ad.read_h5ad(par['train_h5ad']) -test_h5ad = ad.read_h5ad(par['test_h5ad']) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) print('Preprocess data', flush=True) # ... preprocessing ... print('Train model', flush=True) # ... train model ... +classifier = sklearn.linear_model.LogisticRegression() +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) print('Generate predictions', flush=True) # ... generate predictions ... 
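# (editor's sketch) predict() yields one label per row of X_pca; if class
# probabilities were wanted instead, scikit-learn's LogisticRegression also
# exposes classifier.predict_proba(input_test.obsm["X_pca"]).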
+obs = classifier.predict(input_test.obsm["X_pca"]) print("Write output AnnData to file", flush=True) output = ad.AnnData( uns={ - 'dataset_id': train_h5ad.uns['dataset_id'], + 'dataset_id': input_train.uns['dataset_id'], 'method_id': meta['name'] }, - layers={ - 'prediction': layers_prediction + obs={ + 'label_pred': obs } ) output.write_h5ad(par['output'], compression='gzip') From 703be1d7a6b70d96d422ab79b380bcfd66c9380c Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 20:30:22 +0200 Subject: [PATCH 055/103] Create working components [WIP]: fix control method resource test path --- src/api/comp_control_method.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index b5186b5..e8d4251 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -30,5 +30,5 @@ test_resources: - type: python_script path: /common/component_tests/check_method_config.py - path: /common/library.bib - - path: /resources_test/task_template/cxg_mouse_pancreas_atlas - dest: resources_test/task_template/cxg_mouse_pancreas_atlas \ No newline at end of file + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas \ No newline at end of file From ac59cbc89c42209845ed431324c56b9122114e4c Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 20:33:37 +0200 Subject: [PATCH 056/103] Create working components [WIP]: fix control method docker image --- src/control_methods/my_control_method/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/control_methods/my_control_method/config.vsh.yaml b/src/control_methods/my_control_method/config.vsh.yaml index 2438332..2b3b692 100644 --- a/src/control_methods/my_control_method/config.vsh.yaml +++ b/src/control_methods/my_control_method/config.vsh.yaml @@ -50,7 +50,7 @@ resources: engines: # Specifications for the Docker image for this component. - type: docker - image: ghcr.io/openproblems-bio/base_python:1.1.0 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 # Add custom dependencies here (optional). For more information, see # https://viash.io/reference/config/engines/docker/#setup . 
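    # (illustrative, not part of this patch) dependencies can also be version-pinned
    # for reproducibility, e.g. packages: [ scikit-learn==1.5.0 ]; the exact pin
    # shown here is an assumption.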
setup: From b2c4bd05d4c875a81248472c78a33de6549760a2 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:29:50 +0200 Subject: [PATCH 057/103] Update common submodule --- common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common b/common index 38aca0d..28c2b27 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 38aca0d2360a1e7a05032e1255a9081439f00162 +Subproject commit 28c2b271687dca388d1c1ed448f464e653af2c24 From 3be418ba09bd0388c1812674a5619085270f412f Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:43:34 +0200 Subject: [PATCH 058/103] Update comp_method api file --- src/api/comp_method.yaml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index afc305c..651f81e 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -9,8 +9,12 @@ info: arguments: - name: --input_train __merge__: file_train_h5ad.yaml - required: false + required: true + direction: input + - name: "--input_test" + __merge__: file_test_h5ad.yaml direction: input + required: true - name: --output __merge__: file_prediction.yaml required: true @@ -21,5 +25,5 @@ test_resources: - type: python_script path: /common/component_tests/check_method_config.py - path: /common/library.bib - #TODO: - path: fill in e.g. /resources/denoising/pancreas - #TODO: dest: fill in e.g. resources/denoising/pancreas \ No newline at end of file + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas \ No newline at end of file From 8fbc4d18d532663b39bab87566eef359ad3d337d Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:43:58 +0200 Subject: [PATCH 059/103] Add working method --- .../config.vsh.yaml | 27 ++++++++++--------- .../script.py | 19 ++++++++----- 2 files changed, 28 insertions(+), 18 deletions(-) rename src/methods/{my_method => logistic_regression}/config.vsh.yaml (66%) rename src/methods/{my_method => logistic_regression}/script.py (51%) diff --git a/src/methods/my_method/config.vsh.yaml b/src/methods/logistic_regression/config.vsh.yaml similarity index 66% rename from src/methods/my_method/config.vsh.yaml rename to src/methods/logistic_regression/config.vsh.yaml index 743101f..cb60d48 100644 --- a/src/methods/my_method/config.vsh.yaml +++ b/src/methods/logistic_regression/config.vsh.yaml @@ -8,27 +8,30 @@ __merge__: ../../api/comp_method.yaml # A unique identifier for your component (required). # Can contain only lowercase letters or underscores. -name: my_method +name: logistic_regression # Metadata for your component info: # A relatively short label, used when rendering visualisations (required) - label: My Method + label: Logistic Regression # A one sentence summary of how this method works (required). Used when # rendering summary tables. - summary: "FILL IN: A one sentence summary of this method." + summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." # A multi-line description of how this component works (required). Used # when rendering reference documentation. description: | - FILL IN: A (multi-line) description of how this method works. + Logistic Regression estimates parameters of a logistic function for + multivariate classification tasks. 
Here, we use 100-dimensional whitened PCA + coordinates as independent variables, and the model minimises the cross + entropy loss over all cell type classes. # Which normalisation method this component prefers to use (required). preferred_normalization: log_cp10k # A reference key from the bibtex library at src/common/library.bib (required). - reference: bibtex_reference_key + reference: "hosmer2013applied" # URL to the documentation for this method (required). - documentation_url: https://url.to/the/documentation + documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" # URL to the code repository for this method (required). - repository_url: https://github.com/organisation/repository + repository_url: https://github.com/scikit-learn/scikit-learn # Component-specific parameters (optional) # arguments: @@ -49,12 +52,12 @@ resources: engines: # Specifications for the Docker image for this component. - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 # Add custom dependencies here (optional). For more information, see # https://viash.io/reference/config/engines/docker/#setup . - # setup: - # - type: python - # packages: scib==1.1.5 + setup: + - type: python + packages: scikit-learn runners: # This platform allows running the component natively @@ -62,4 +65,4 @@ runners: # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: - label: [midtime,midmem,midcpu] + label: [midtime,midmem,lowcpu] diff --git a/src/methods/my_method/script.py b/src/methods/logistic_regression/script.py similarity index 51% rename from src/methods/my_method/script.py rename to src/methods/logistic_regression/script.py index b0ed7f1..a80f152 100644 --- a/src/methods/my_method/script.py +++ b/src/methods/logistic_regression/script.py @@ -1,37 +1,44 @@ import anndata as ad +import sklearn.linear_model ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { - 'train_h5ad': 'resources_test/task_template/pancreas/train_h5ad.h5ad', + 'input_train': 'resources_test/task_template/pancreas/train.h5ad', + 'input_test': 'resources_test/task_template/pancreas/test.h5ad', 'output': 'output.h5ad' } meta = { - 'name': 'my_method' + 'name': 'logistic_regression' } ## VIASH END print('Reading input files', flush=True) -train_h5ad = ad.read_h5ad(par['train_h5ad']) +input_train = ad.read_h5ad(par['input_train']) +input_test = ad.read_h5ad(par['input_test']) print('Preprocess data', flush=True) # ... preprocessing ... print('Train model', flush=True) # ... train model ... +classifier = sklearn.linear_model.LogisticRegression() +classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) print('Generate predictions', flush=True) # ... generate predictions ... 
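# (editor's note) fit() above consumed the 100-dimensional PCA embedding from
# .obsm["X_pca"]; predict() returns an array aligned with input_test.obs_names,
# which is what the prediction file's obs["label_pred"] column expects.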
+obs_label_pred = classifier.predict(input_test.obsm["X_pca"]) print("Write output AnnData to file", flush=True) output = ad.AnnData( uns={ - 'dataset_id': train_h5ad.uns['dataset_id'], + 'dataset_id': input_train.uns['dataset_id'], + 'normalization_id': input_train.uns['normalization_id'], 'method_id': meta['name'] }, - layers={ - 'prediction': layers_prediction + obs={ + 'label_pred': obs_label_pred } ) output.write_h5ad(par['output'], compression='gzip') From 862d48d6853cd2175002d5cf27a398bd00c48c93 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:44:54 +0200 Subject: [PATCH 060/103] Update control_method api --- src/api/comp_control_method.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index e8d4251..0bfd973 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -20,6 +20,10 @@ arguments: __merge__: file_test_h5ad.yaml required: true direction: input + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true - name: --output __merge__: file_prediction.yaml required: true From 2c81f68b631721d0e6fd038804d8fcc2f6c22a92 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:51:29 +0200 Subject: [PATCH 061/103] add working control method --- .../config.vsh.yaml | 27 +++++++------------ .../script.py | 12 ++++----- 2 files changed, 16 insertions(+), 23 deletions(-) rename src/control_methods/{my_control_method => true_labels}/config.vsh.yaml (62%) rename src/control_methods/{my_control_method => true_labels}/script.py (76%) diff --git a/src/control_methods/my_control_method/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml similarity index 62% rename from src/control_methods/my_control_method/config.vsh.yaml rename to src/control_methods/true_labels/config.vsh.yaml index 2b3b692..d32eaec 100644 --- a/src/control_methods/my_control_method/config.vsh.yaml +++ b/src/control_methods/true_labels/config.vsh.yaml @@ -8,28 +8,21 @@ __merge__: ../../api/comp_control_method.yaml # A unique identifier for your component (required). # Can contain only lowercase letters or underscores. -name: logistic_regression +name: true_labels # Metadata for your component info: # A relatively short label, used when rendering visualisations (required) - label: Logistic Regression + label: True Labels # A one sentence summary of how this method works (required). Used when # rendering summary tables. - summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." + summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." # A multi-line description of how this component works (required). Used # when rendering reference documentation. description: | - Logistic Regression estimates parameters of a logistic function for - multivariate classification tasks. Here, we use 100-dimensional whitened PCA - coordinates as independent variables, and the model minimises the cross - entropy loss over all cell type classes. - - reference: "hosmer2013applied" - repository_url: https://github.com/scikit-learn/scikit-learn - documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" - # Which normalization method this component prefers to use (required). 
- preferred_normalization: log_cp10k + A positive control, where the solution labels are copied 1 to 1 to the predicted data. + # Which normalisation method this component prefers to use (required). + preferred_normalization: counts # Component-specific parameters (optional) # arguments: @@ -53,9 +46,9 @@ engines: image: ghcr.io/openproblems-bio/base_images/python:1.1.0 # Add custom dependencies here (optional). For more information, see # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: scikit-learn + # setup: + # - type: python + # packages: scib==1.1.5 runners: # This platform allows running the component natively @@ -63,4 +56,4 @@ runners: # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: - label: [midtime,midmem,lowcpu] + label: [midtime,lowmem,lowcpu] diff --git a/src/control_methods/my_control_method/script.py b/src/control_methods/true_labels/script.py similarity index 76% rename from src/control_methods/my_control_method/script.py rename to src/control_methods/true_labels/script.py index 426043b..6bb9e30 100644 --- a/src/control_methods/my_control_method/script.py +++ b/src/control_methods/true_labels/script.py @@ -1,5 +1,4 @@ import anndata as ad -import sklearn.linear_model ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes @@ -7,37 +6,38 @@ par = { 'input_train': 'resources_test/task_template/pancreas/train.h5ad', 'input_test': 'resources_test/task_template/pancreas/test.h5ad', + 'input_solution': 'resources_test/task_template/pancreas/solution.h5ad', 'output': 'output.h5ad' } meta = { - 'name': 'logistic_regression' + 'name': 'true_labels' } ## VIASH END print('Reading input files', flush=True) input_train = ad.read_h5ad(par['input_train']) input_test = ad.read_h5ad(par['input_test']) +input_solution = ad.read_h5ad(par['input_solution']) print('Preprocess data', flush=True) # ... preprocessing ... print('Train model', flush=True) # ... train model ... -classifier = sklearn.linear_model.LogisticRegression() -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) print('Generate predictions', flush=True) # ... generate predictions ... 
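# (editor's note) this positive control copies the ground-truth labels verbatim,
# assuming input_solution holds the same cells in the same order as input_test;
# a defensive sketch would first assert
# (input_test.obs_names == input_solution.obs_names).all(), as the accuracy
# metric later does for prediction and solution.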
-obs = classifier.predict(input_test.obsm["X_pca"]) +obs_label_pred = input_solution.obs["label"] print("Write output AnnData to file", flush=True) output = ad.AnnData( uns={ 'dataset_id': input_train.uns['dataset_id'], + 'normalization_id': input_train.uns['normalization_id'], 'method_id': meta['name'] }, obs={ - 'label_pred': obs + 'label_pred': obs_label_pred } ) output.write_h5ad(par['output'], compression='gzip') From e71c07cda469aab9dd4dbcc8c4cd69c727dbf854 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:52:48 +0200 Subject: [PATCH 062/103] update solution file api --- src/api/file_solution.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml index 6209a19..e4fff4e 100644 --- a/src/api/file_solution.yaml +++ b/src/api/file_solution.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/label_projection/pancreas/solution.h5ad" +example: "resources_test/task_template/pancreas/solution.h5ad" info: label: "Solution" summary: "The solution for the test data" From ac944d442d65f74368558d0e1931ca1da69fa182 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:54:06 +0200 Subject: [PATCH 063/103] update slots in solution file api --- src/api/file_solution.yaml | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml index e4fff4e..e160651 100644 --- a/src/api/file_solution.yaml +++ b/src/api/file_solution.yaml @@ -14,11 +14,28 @@ info: description: Normalized counts required: true obs: - - ... + - type: string + name: label + description: Ground truth cell type labels + required: true + - type: string + name: batch + description: Batch information + required: true var: - - ... + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true obsm: - - ... + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true uns: - type: string name: dataset_id From 8f97e66192730c539155eaad6b5c863bd5330ca6 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:57:47 +0200 Subject: [PATCH 064/103] update matric api --- src/api/comp_metric.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 62631dc..03cf9fd 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -3,12 +3,12 @@ info: type: metric type_info: label: Metric - summary: A metric. + summary: A task template metric. description: | A metric for evaluating method predictions. arguments: - - name: "--input_test" - __merge__: file_test_h5ad.yaml + - name: "--input_solution" + __merge__: file_solution.yaml direction: input required: true - name: "--input_prediction" @@ -25,5 +25,5 @@ test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - path: /common/library.bib - #TODO: - path: fill in e.g. /resources/denoising/pancreas - #TODO: dest: fill in e.g. 
resources/denoising/pancreas + - path: /resources/task_template/pancreas + dest: resources/task_template/pancreas From df8888d764d12b912402bf07f752f4f7e697d725 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 21:58:28 +0200 Subject: [PATCH 065/103] update score file api --- src/api/file_score.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml index 8aefeba..a2567dc 100644 --- a/src/api/file_score.yaml +++ b/src/api/file_score.yaml @@ -10,6 +10,10 @@ info: name: dataset_id description: "A unique identifier for the dataset" required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true - type: string name: method_id description: "A unique identifier for the method" From 3022be8a31c6583d26d4e072a000e591c83b5223 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 22:11:17 +0200 Subject: [PATCH 066/103] fix metric test_resources path --- src/api/comp_metric.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 03cf9fd..47286ee 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -25,5 +25,5 @@ test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - path: /common/library.bib - - path: /resources/task_template/pancreas - dest: resources/task_template/pancreas + - path: /resources_test/task_template/pancreas + dest: resources_test/task_template/pancreas From fdb4f2611b59045e92ad4f27d67fdd9c25563cc5 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Wed, 10 Jul 2024 22:12:17 +0200 Subject: [PATCH 067/103] add working metric --- .../{my_metric => accuracy}/config.vsh.yaml | 50 +++++++++---------- src/metrics/accuracy/script.py | 47 +++++++++++++++++ src/metrics/my_metric/script.py | 35 ------------- 3 files changed, 70 insertions(+), 62 deletions(-) rename src/metrics/{my_metric => accuracy}/config.vsh.yaml (51%) create mode 100644 src/metrics/accuracy/script.py delete mode 100644 src/metrics/my_metric/script.py diff --git a/src/metrics/my_metric/config.vsh.yaml b/src/metrics/accuracy/config.vsh.yaml similarity index 51% rename from src/metrics/my_metric/config.vsh.yaml rename to src/metrics/accuracy/config.vsh.yaml index d998cf0..f336d1c 100644 --- a/src/metrics/my_metric/config.vsh.yaml +++ b/src/metrics/accuracy/config.vsh.yaml @@ -8,35 +8,31 @@ __merge__: ../../api/comp_metric.yaml # A unique identifier for your component (required). # Can contain only lowercase letters or underscores. -name: my_metric +name: accuracy # Metadata for your component info: metrics: # A unique identifier for your metric (required). # Can contain only lowercase letters or underscores. - name: my_metric - # A relatively short label, used when rendering visualisarions (required) - label: My Metric - # A one sentence summary of how this metric works (required). Used when - # rendering summary tables. - summary: "FILL IN: A one sentence summary of this metric." - # A multi-line description of how this component works (required). Used - # when rendering reference documentation. - description: | - FILL IN: A (multi-line) description of how this metric works. - # A reference key from the bibtex library at src/common/library.bib (required). - reference: bibtex_reference_key - # URL to the documentation for this metric (required). - documentation_url: https://url.to/the/documentation - # URL to the code repository for this metric (required). 
-    repository_url: https://github.com/organisation/repository
-    # The minimum possible value for this metric (required)
-    min: 0
-    # The maximum possible value for this metric (required)
-    max: 1
-    # Whether a higher value represents a 'better' solution (required)
-    maximize: true
+    - name: accuracy
+      # A relatively short label, used when rendering visualisations (required)
+      label: Accuracy
+      # A one sentence summary of how this metric works (required). Used when
+      # rendering summary tables.
+      summary: "The percentage of correctly predicted labels."
+      # A multi-line description of how this component works (required). Used
+      # when rendering reference documentation.
+      description: |
+        The percentage of correctly predicted labels.
+      # A reference key from the bibtex library at src/common/library.bib (required).
+      reference: grandini2020metrics
+      # The minimum possible value for this metric (required)
+      min: 0
+      # The maximum possible value for this metric (required)
+      max: 1
+      # Whether a higher value represents a 'better' solution (required)
+      maximize: true
 
 # Component-specific parameters (optional)
 # arguments:
@@ -57,12 +53,12 @@ resources:
 engines:
   # Specifications for the Docker image for this component.
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
     # Add custom dependencies here (optional). For more information, see
     # https://viash.io/reference/config/engines/docker/#setup .
-    # setup:
-    #   - type: python
-    #     packages: scib==1.1.5
+    setup:
+      - type: python
+        packages: scikit-learn
 
 runners:
   # This platform allows running the component natively
diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py
new file mode 100644
index 0000000..72dcb1e
--- /dev/null
+++ b/src/metrics/accuracy/script.py
@@ -0,0 +1,47 @@
+import anndata as ad
+import numpy as np
+import sklearn.preprocessing
+
+## VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
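# (editor's note) after changing config.vsh.yaml, this block would be refreshed
# with e.g. `viash config inject src/metrics/accuracy/config.vsh.yaml`; the
# config path is inferred from this patch, not stated in it.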
+par = {
+  'input_solution': 'resources_test/task_template/pancreas/solution.h5ad',
+  'input_prediction': 'resources_test/task_template/pancreas/prediction.h5ad',
+  'output': 'output.h5ad'
+}
+meta = {
+  'name': 'accuracy'
+}
+## VIASH END
+
+print('Reading input files', flush=True)
+input_solution = ad.read_h5ad(par['input_solution'])
+input_prediction = ad.read_h5ad(par['input_prediction'])
+
+assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs"
+
+print("Encode labels", flush=True)
+cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories)
+encoder = sklearn.preprocessing.LabelEncoder().fit(cats)
+input_solution.obs["label"] = encoder.transform(input_solution.obs["label"])
+input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"])
+
+
+print('Compute metrics', flush=True)
+# metric_ids and metric_values can have length > 1
+# but should be of equal length
+uns_metric_ids = [ 'accuracy' ]
+uns_metric_values = [ np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) ]
+
+print("Write output AnnData to file", flush=True)
+output = ad.AnnData(
+  uns={
+    'dataset_id': input_prediction.uns['dataset_id'],
+    'normalization_id': input_prediction.uns['normalization_id'],
+    'method_id': input_prediction.uns['method_id'],
+    'metric_ids': uns_metric_ids,
+    'metric_values': uns_metric_values
+  }
+)
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/metrics/my_metric/script.py b/src/metrics/my_metric/script.py
deleted file mode 100644
index 08dc74d..0000000
--- a/src/metrics/my_metric/script.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import anndata as ad
-
-## VIASH START
-# Note: this section is auto-generated by viash at runtime. To edit it, make changes
-# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
-par = {
-  'input_test': 'resources_test/task_template/pancreas/test.h5ad',
-  'input_prediction': 'resources_test/task_template/pancreas/prediction.h5ad',
-  'output': 'output.h5ad'
-}
-meta = {
-  'name': 'my_metric'
-}
-## VIASH END
-
-print('Reading input files', flush=True)
-input_test = ad.read_h5ad(par['input_test'])
-input_prediction = ad.read_h5ad(par['input_prediction'])
-
-print('Compute metrics', flush=True)
-# metric_ids and metric_values can have length > 1
-# but should be of equal length
-uns_metric_ids = [ 'my_metric' ]
-uns_metric_values = [ 0.5 ]
-
-print("Write output AnnData to file", flush=True)
-output = ad.AnnData(
-  uns={
-    'dataset_id': input_prediction.uns['dataset_id'],
-    'method_id': input_prediction.uns['method_id'],
-    'metric_ids': uns_metric_ids,
-    'metric_values': uns_metric_values
-  }
-)
-output.write_h5ad(par['output'], compression='gzip')

From b4e832544bd093f16b1efb404d130604a1ae7bea Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 11 Jul 2024 09:45:10 +0200
Subject: [PATCH 068/103] force ci


From 5bc6929f94499badf31109b983e459ea7badb714 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 11 Jul 2024 09:48:13 +0200
Subject: [PATCH 069/103] try fix for resources test s3 path in _viash

---
 _viash.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_viash.yaml b/_viash.yaml
index 3aa30aa..3a30572 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -34,7 +34,7 @@ info:
   # Step 6: Replace the task_template to the name of the task in `info.name`.
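  # (illustrative, hypothetical task name) once renamed, the entries below would
  # read e.g. path: s3://openproblems-data/resources_test/task_denoising/ with
  # dest: resources_test/task_denoising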
test_resources: - type: s3 - path: s3://openproblems-data/resources_test/task_template + path: s3://openproblems-data/resources_test/task_template/ dest: resources_test/task_template - type: s3 path: s3://openproblems-data/resources_test/common/ From 28e4c8a591bfcd9a06c10af532faf4890af30a5e Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 10:36:51 +0200 Subject: [PATCH 070/103] update instructions url --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1489187..107254c 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,6 @@ git clone --recursive git@github.com:openproblems-bio/.git ## What to do next -Check out the [instructions](common/INSTRUCTIONS.md) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands. +Check out the [instructions]([common/INSTRUCTIONS.md](https://github.com/openproblems-bio/common_resources/blob/main/INSTRUCTIONS.md)) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands. -For more information on the OpenProblems v2, check out the [Documentation](https://openproblems.bio/documentation/). \ No newline at end of file +For more information on the OpenProblems v2, check out the [documentation](https://openproblems.bio/documentation/). \ No newline at end of file From bb9b3bc504969d3840447cb372a616b34af49254 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 10:38:27 +0200 Subject: [PATCH 071/103] update instructions url --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 107254c..4dbfde8 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,6 @@ git clone --recursive git@github.com:openproblems-bio/.git ## What to do next -Check out the [instructions]([common/INSTRUCTIONS.md](https://github.com/openproblems-bio/common_resources/blob/main/INSTRUCTIONS.md)) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands. +Check out the [instructions](https://github.com/openproblems-bio/common_resources/blob/main/INSTRUCTIONS.md) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands. For more information on the OpenProblems v2, check out the [documentation](https://openproblems.bio/documentation/). \ No newline at end of file From f578c8887b5aad03e926ccbc7fe3f183ceee90be Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 10:38:45 +0200 Subject: [PATCH 072/103] Update README.md Co-authored-by: Robrecht Cannoodt --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4dbfde8..da3ffe5 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This repo is a template to create a new task for the OpenProblems v2. This repo ## Create a repository from this template > [!IMPORTANT] -> Before creating a new repository, make sure you are part of the openProblems task team. This will be done when you create an issue for the task and you get the go ahead to create the task. +> Before creating a new repository, make sure you are part of the OpenProblems task team. 
This will be done when you create an issue for the task and you get the go ahead to create the task. > For more information on how to create a new task, check out the [Create a new task](https://openproblems.bio/documentation/create_task/) documentation. The instructions below will guide you through creating a new repository from this template ([creating-a-repository-from-a-template](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template)). From 6a8f3719645a20e4fd80779223ae289b7a615354 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 11:12:27 +0200 Subject: [PATCH 073/103] add gitignore for scripts --- scripts/.gitignore | 3 +++ scripts/add_a_control_method.sh | 12 ------------ scripts/add_a_method.sh | 12 ------------ scripts/add_a_metric.sh | 12 ------------ 4 files changed, 3 insertions(+), 36 deletions(-) create mode 100644 scripts/.gitignore delete mode 100755 scripts/add_a_control_method.sh delete mode 100755 scripts/add_a_method.sh delete mode 100755 scripts/add_a_metric.sh diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 0000000..2f7ffd3 --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1,3 @@ +add_a_method.sh +add_a_control_method.sh +add_a_metric.sh \ No newline at end of file diff --git a/scripts/add_a_control_method.sh b/scripts/add_a_control_method.sh deleted file mode 100755 index d853907..0000000 --- a/scripts/add_a_control_method.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -task_name="task_template" -component_name="my_control_method" -component_lang="python" # change this to "r" if need be - -common/create_component/create_component \ - --task $task_name \ - --language "$component_lang" \ - --name "$component_name" \ - --api_file src/api/comp_control_method.yaml \ - --output "src/control_methods/$component_name" \ No newline at end of file diff --git a/scripts/add_a_method.sh b/scripts/add_a_method.sh deleted file mode 100755 index 8812644..0000000 --- a/scripts/add_a_method.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -task_name="task_template" -component_name="my_method" -component_lang="python" # change this to "r" if need be - -common/create_component/create_component \ - --task $task_name \ - --language "$component_lang" \ - --name "$component_name" \ - --api_file src/api/comp_method.yaml \ - --output "src/methods/$component_name" \ No newline at end of file diff --git a/scripts/add_a_metric.sh b/scripts/add_a_metric.sh deleted file mode 100755 index 71d6067..0000000 --- a/scripts/add_a_metric.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -task_name="task_template" -component_name="my_metric" -component_lang="python" # change this to "r" if need be - -common/create_component/create_component \ - --task $task_name \ - --language "$component_lang" \ - --name "$component_name" \ - --api_file src/api/comp_metric.yaml \ - --output "src/metrics/$component_name" \ No newline at end of file From 06d1da391c0949b5dfe0afc6622769a6bd92ebbc Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 11:19:27 +0200 Subject: [PATCH 074/103] add config for benchmark workflow --- src/workflows/run_benchmark/config.vsh.yaml | 79 +++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 src/workflows/run_benchmark/config.vsh.yaml diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000..e62a12b --- /dev/null +++ 
b/src/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,79 @@ +name: run_benchmark +namespace: workflows + +argument_groups: + - name: Inputs + arguments: + - name: "--input_train" + __merge__: /src/tasks/label_projection/api/file_train.yaml + type: file + direction: input + required: true + - name: "--input_test" + __merge__: /src/tasks/label_projection/api/file_test.yaml + type: file + direction: input + required: true + - name: "--input_solution" + __merge__: /src/tasks/label_projection/api/file_solution.yaml + type: file + direction: input + required: true + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + - name: Methods + arguments: + - name: "--method_ids" + type: string + multiple: true + description: A list of method ids to run. If not specified, all methods will be run. + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "../_viash.yaml" + +repositories: + - name: openproblems-v2 + type: github + repo: openproblems-bio/openproblems-v2 + tag: main_build +dependencies: + - name: common/check_dataset_schema + repository: openproblems-v2 + - name: common/extract_metadata + repository: openproblems-v2 + - name: control_methods/true_labels + - name: methods/logistic_regression + - name: metrics/accuracy + +runners: + - type: nextflow From 6510489b0b7f2641b829d140c810b70317a5fbf0 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 11:24:25 +0200 Subject: [PATCH 075/103] add run benchmark workflow [WIP] --- src/workflows/run_benchmark/main.nf | 308 ++++++++++++++++++++++++++++ 1 file changed, 308 insertions(+) create mode 100644 src/workflows/run_benchmark/main.nf diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf new file mode 100644 index 0000000..4d036c5 --- /dev/null +++ b/src/workflows/run_benchmark/main.nf @@ -0,0 +1,308 @@ +workflow auto { + findStatesTempWf(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + true_labels, + logistic_regression + ] + + // construct list of metrics + metrics = [ + accuracy + ] + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // use the 'filter' argument to only run a method on the normalisation the component is asking for + filter: { id, state, comp -> + def norm = 
state.dataset_uns.normalization_id + def pref = comp.config.info.preferred_normalization + // if the preferred normalisation is none at all, + // we can pass whichever dataset we want + def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref + def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) + + method_check && norm_check + }, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." + comp.config.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + def new_args = [ + input_train: state.input_train, + input_test: state.input_test + ] + if (comp.config.info.type == "control_method") { + new_args.input_solution = state.input_solution + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." + comp.config.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: [ + input_solution: "input_solution", + input_prediction: "method_output" + ], + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.name, + metric_output: output.output + ] + } + ) + + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? + + // extract the dataset metadata + dataset_meta_ch = dataset_ch + // only keep one of the normalization methods + | filter{ id, state -> + state.dataset_uns.normalization_id == "log_cp10k" + } + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("_viash.yaml") + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: 
metric_configs_file, + output_task_info: task_info_file.info, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], mergedStates] + } + + emit: + output_ch +} + +// temp fix for rename_keys typo + +def findStatesTemp(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesTempWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesTempWf +} \ No newline at end of file From 9645f2b5aabe189fd06cddf576d672ca3bb3d9c9 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 11:28:33 +0200 Subject: [PATCH 076/103] Fix paths in benchmark config --- src/workflows/run_benchmark/config.vsh.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index e62a12b..f298617 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -5,17 +5,17 @@ argument_groups: - name: Inputs arguments: - name: "--input_train" - __merge__: /src/tasks/label_projection/api/file_train.yaml + __merge__: /src/api/file_train_h5ad.yaml type: file direction: input required: true - name: "--input_test" - __merge__: /src/tasks/label_projection/api/file_test.yaml + __merge__: /src/api/file_test_h5ad.yaml type: file direction: input required: true - name: "--input_solution" - __merge__: /src/tasks/label_projection/api/file_solution.yaml + __merge__: /src/api/file_solution.yaml type: file direction: input required: true @@ -59,7 +59,7 @@ resources: path: main.nf entrypoint: run_wf - type: file - path: "../_viash.yaml" + path: /_viash.yaml repositories: - name: openproblems-v2 From 8167c6339ce927aaaaaa0df3ccd8547afe3c9d49 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 13:50:38 +0200 Subject: [PATCH 077/103] fix findstatestemp typo --- src/workflows/run_benchmark/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 4d036c5..6350aad 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -1,5 +1,5 @@ workflow auto { - findStatesTempWf(params, meta.config) + findStatesTemp(params, meta.config) | meta.workflow.run( auto: [publish: "state"] ) From c1d2033fca6fd3ffc6be48549b14a1dac98a1338 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 13:50:57 +0200 Subject: [PATCH 078/103] add workflow test [WIP] --- src/workflows/run_benchmark/test.sh | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 src/workflows/run_benchmark/test.sh diff --git a/src/workflows/run_benchmark/test.sh b/src/workflows/run_benchmark/test.sh new file mode 100755 index 
0000000..b0dbc24 --- /dev/null +++ b/src/workflows/run_benchmark/test.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +# export TOWER_WORKSPACE_ID=53907369739130 + +DATASETS_DIR="resources_test/task_template" +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=24.04.3 +nextflow run . \ + -main-script target/nextflow/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_train:output_train;input_test:output_test;input_solution:output_solution' \ + --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" From 676d2a8cf942ee4056020a64270f1f373ce2fcbb Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 14:57:23 +0200 Subject: [PATCH 079/103] update submodule --- common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common b/common index 28c2b27..a5643f9 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 28c2b271687dca388d1c1ed448f464e653af2c24 +Subproject commit a5643f932bdb55c64a57555f3319649f5e4d9a5b From 5ef354f1d4d8b738841ec808e2e767be9ca4068f Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 15:50:01 +0200 Subject: [PATCH 080/103] Add obs_names to method output --- src/methods/logistic_regression/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/methods/logistic_regression/script.py b/src/methods/logistic_regression/script.py index a80f152..cc851f8 100644 --- a/src/methods/logistic_regression/script.py +++ b/src/methods/logistic_regression/script.py @@ -41,4 +41,6 @@ 'label_pred': obs_label_pred } ) +output.obs_names = input_test.obs_names + output.write_h5ad(par['output'], compression='gzip') From 65bc1c3160354edc72958999e223e3d486e924f7 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 15:55:31 +0200 Subject: [PATCH 081/103] add obs_names to control_method output --- src/control_methods/true_labels/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py index 6bb9e30..0a04aaf 100644 --- a/src/control_methods/true_labels/script.py +++ b/src/control_methods/true_labels/script.py @@ -40,4 +40,6 @@ 'label_pred': obs_label_pred } ) +output.obs_names = input_test.obs_names + output.write_h5ad(par['output'], compression='gzip') From c1f9ba19f9421d3b094d353b01e7beb554f4fa43 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 15:55:39 +0200 Subject: [PATCH 082/103] update submodule --- common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common b/common index a5643f9..19a6b8f 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit a5643f932bdb55c64a57555f3319649f5e4d9a5b +Subproject commit 19a6b8fef711600bcb882d3b764d067168b8bf8a From cdd16e0506860d0a22f01e8557af04993c2e27bd Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 16:39:37 +0200 Subject: [PATCH 083/103] fix task_info extraction from _viash in workflow --- src/workflows/run_benchmark/main.nf | 7 +++++-- 
1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 6350aad..68e5ecd 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -160,7 +160,10 @@ workflow run_wf { def metric_configs_file = tempFile("metric_configs.yaml") metric_configs_file.write(metric_configs_yaml_blob) - def task_info_file = meta.resources_dir.resolve("_viash.yaml") + def viash_file = meta.resources_dir.resolve("_viash.yaml") + def viash_file_content = toYamlBlob(readYaml(viash_file).info) + def task_info_file = tempFile("task_info.yaml") + task_info_file.write(viash_file_content) // store the scores in a file def score_uns = states.collect{it.score_uns} @@ -171,7 +174,7 @@ workflow run_wf { def new_state = [ output_method_configs: method_configs_file, output_metric_configs: metric_configs_file, - output_task_info: task_info_file.info, + output_task_info: task_info_file, output_scores: score_uns_file, _meta: states[0]._meta ] From 3cd3efecd5ddb9987732dbc77d38f4d1b6abffaa Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 16:39:54 +0200 Subject: [PATCH 084/103] update _viash info --- _viash.yaml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index 3a30572..042fe25 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -18,8 +18,10 @@ links: docker_registry: ghcr.io # Step 5: Update the info fields to the text from the task issue. info: - name: A unique identifier. Can only contain lowercase letters, numbers or underscores. (This is the same as the name above but without the `task_` prefix) - label: A unique, human-readable, short label. Used for creating summary tables and visualisations. + # A unique identifier. Can only contain lowercase letters, numbers or underscores. (This is the same as the name above but without the `task_` prefix) + name: template + # A unique, human-readable, short label. Used for creating summary tables and visualisations. + label: Template description: | Provide a clear and concise description of your task, detailing the specific problem it aims to solve. Outline the input data types, the expected output, and any assumptions or constraints. @@ -43,17 +45,17 @@ info: # Step 7: Update te authors of the task. authors: # Full name of the author, usually in the name of FirstName MiddleName LastName. - - name: ... + - name: Kai Waldrant # Role of the author. Possible values: # # * `"author"`: Authors who have made substantial contributions to the component. # * `"maintainer"`: The maintainer of the component. # * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). - roles: [ ... ] + roles: [ "author", "maintainer" ] # Additional information on the author info: - github: ... - orcid: ... + github: KaiWaldrant + orcid: 0009-0003-8555-1361 email: ... twitter: ... linkedin: ... 
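[Editor's note] Patch 083 above fixes a real bug: `task_info_file.info` tried to read a nonexistent `info` property on a file path. The corrected workflow instead parses `_viash.yaml`, serialises only its `info` block, and writes that to a temporary `task_info.yaml`. As a rough shell sketch of the same extraction — not part of the patch series, and assuming the mikefarah `yq` CLI is installed, which this repository does not actually require:

```bash
# Sketch only: mirrors what run_benchmark/main.nf now does in Groovy with
# readYaml()/toYamlBlob(), i.e. keep just the `info` block of _viash.yaml.
yq '.info' _viash.yaml > task_info.yaml
```
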
From 820facc2d60e7d40688ff1d8671b12d61043489f Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 17:10:24 +0200 Subject: [PATCH 085/103] update process_dataset api --- src/api/comp_process_dataset.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml index 2dd3fd3..019c9ba 100644 --- a/src/api/comp_process_dataset.yaml +++ b/src/api/comp_process_dataset.yaml @@ -1,9 +1,9 @@ -namespace: "" +namespace: "data_processors" info: - type: process_dataset + type: data_processor type_info: label: Data processor - summary: A label projection dataset processor. + summary: A data processor. description: | A component for processing a Common Dataset into a task-specific dataset. arguments: From dac82f754954ea20f84d6a109a1ece16580d9d32 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 17:11:04 +0200 Subject: [PATCH 086/103] rename process_dataset api file --- src/api/{comp_process_dataset.yaml => comp_data_processor.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/api/{comp_process_dataset.yaml => comp_data_processor.yaml} (100%) diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_data_processor.yaml similarity index 100% rename from src/api/comp_process_dataset.yaml rename to src/api/comp_data_processor.yaml From 2d74c719aa060ce0d0a167adeaeefec264c411cb Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 17:14:49 +0200 Subject: [PATCH 087/103] add process_dataset config file [WIP] --- .../process_dataset/config.vsh.yaml | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 src/dataset_processors/process_dataset/config.vsh.yaml diff --git a/src/dataset_processors/process_dataset/config.vsh.yaml b/src/dataset_processors/process_dataset/config.vsh.yaml new file mode 100644 index 0000000..1a0b36b --- /dev/null +++ b/src/dataset_processors/process_dataset/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: ../../api/comp_data_processor.yaml +name: process_dataset +arguments: + - name: "--method" + type: "string" + description: "The process method to assign train/test." + choices: ["batch", "random"] + default: "batch" + - name: "--obs_label" + type: "string" + description: "Which .obs slot to use as label." + default: "cell_type" + - name: "--obs_batch" + type: "string" + description: "Which .obs slot to use as batch covariate." + default: "batch" + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." 
+ example: 123 +resources: + - type: python_script + path: script.py + - path: common/helper_functions/subset_anndata.py + +engines: + - type: docker + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem,midcpu,midtime] \ No newline at end of file From 21a5f4f452f837cc5eff4c6401f5357f0b1c20c8 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 17:16:38 +0200 Subject: [PATCH 088/103] relocate dataset_processors to data_processors --- .../process_dataset/config.vsh.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{dataset_processors => data_processors}/process_dataset/config.vsh.yaml (100%) diff --git a/src/dataset_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml similarity index 100% rename from src/dataset_processors/process_dataset/config.vsh.yaml rename to src/data_processors/process_dataset/config.vsh.yaml From e081d1716d1f2a11ec924112d6b0447904595ab3 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 17:20:25 +0200 Subject: [PATCH 089/103] add dataset_processor script --- src/api/comp_data_processor.yaml | 4 +- .../process_dataset/config.vsh.yaml | 2 +- src/data_processors/process_dataset/script.py | 78 +++++++++++++++++++ 3 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 src/data_processors/process_dataset/script.py diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 019c9ba..184bc54 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -12,11 +12,11 @@ arguments: direction: input required: true - name: "--output_train" - __merge__: file_train.yaml + __merge__: file_train_h5ad.yaml direction: output required: true - name: "--output_test" - __merge__: file_test.yaml + __merge__: file_test_h5ad.yaml direction: output required: true - name: "--output_solution" diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 1a0b36b..6f35e96 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -21,7 +21,7 @@ arguments: resources: - type: python_script path: script.py - - path: common/helper_functions/subset_anndata.py + - path: /common/helper_functions/subset_anndata.py engines: - type: docker diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py new file mode 100644 index 0000000..4348dc8 --- /dev/null +++ b/src/data_processors/process_dataset/script.py @@ -0,0 +1,78 @@ +import sys +import random +import numpy as np +import anndata as ad + +## VIASH START +par = { + 'input': 'resources_test/common/pancreas/dataset.h5ad', + 'method': 'batch', + 'seed': None, + 'obs_batch': 'batch', + 'obs_label': 'cell_type', + 'output_train': 'train.h5ad', + 'output_test': 'test.h5ad', + 'output_solution': 'solution.h5ad' +} +meta = { + 'resources_dir': 'data_processors/process_dataset', + 'config': 'data_processors/process_dataset/.config.vsh.yaml' +} +## VIASH END + +# import helper functions +sys.path.append(meta['resources_dir']) +from subset_anndata import read_config_slots_info, subset_anndata + +# set seed if need be +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print(">> Load data", flush=True) +adata = ad.read_h5ad(par["input"]) +print("input:", adata) + +print(f">> Process data using 
{par['method']} method") +if par["method"] == "batch": + batch_info = adata.obs[par["obs_batch"]] + batch_categories = batch_info.dtype.categories + test_batches = random.sample(list(batch_categories), 1) + is_test = [ x in test_batches for x in batch_info ] +elif par["method"] == "random": + train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) + is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] + +# subset the different adatas +print(">> Figuring which data needs to be copied to which output file", flush=True) +# use par arguments to look for label and batch value in different slots +slot_mapping = { + "obs": { + "label": par["obs_label"], + "batch": par["obs_batch"], + } +} +slot_info = read_config_slots_info(meta["config"], slot_mapping) + +print(">> Creating train data", flush=True) +output_train = subset_anndata( + adata[[not x for x in is_test]], + slot_info["output_train"] +) + +print(">> Creating test data", flush=True) +output_test = subset_anndata( + adata[is_test], + slot_info["output_test"] +) + +print(">> Creating solution data", flush=True) +output_solution = subset_anndata( + adata[is_test], + slot_info['output_solution'] +) + +print(">> Writing data", flush=True) +output_train.write_h5ad(par["output_train"]) +output_test.write_h5ad(par["output_test"]) +output_solution.write_h5ad(par["output_solution"]) From ade957e825ff654e43d316e83260609dc0cbbbbe Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 20:28:59 +0200 Subject: [PATCH 090/103] add process_datasets workflow config --- .../process_datasets/config.vsh.yaml | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 src/workflows/process_datasets/config.vsh.yaml diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000..10d9a36 --- /dev/null +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,34 @@ +name: process_datasets +namespace: workflows +argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: /src/api/file_common_dataset.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output_train" + __merge__: /src/api/file_train_h5ad.yaml + required: true + direction: output + - name: "--output_test" + __merge__: /src/api/file_test_h5ad.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/api/file_solution.yaml + required: true + direction: output +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /common/nextflow_helpers/helper.nf +dependencies: + - name: common/check_dataset_schema + - name: common/extract_metadata + - name: predict_modality/process_dataset +runners: + - type: nextflow \ No newline at end of file From b0b4273f89938b474312f382dfb3b1ab9dfe35ba Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 20:31:18 +0200 Subject: [PATCH 091/103] Update process datasets workflow config --- src/workflows/process_datasets/config.vsh.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index 10d9a36..ed41e20 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -26,9 +26,16 @@ resources: path: main.nf entrypoint: run_wf - path: /common/nextflow_helpers/helper.nf +repositories: + - name: openproblems-v2 + type: github + repo: 
openproblems-bio/openproblems-v2 + tag: main_build dependencies: - name: common/check_dataset_schema + repository: openproblems-v2 - name: common/extract_metadata - - name: predict_modality/process_dataset + repository: openproblems-v2 + - name: data_processors/process_dataset runners: - type: nextflow \ No newline at end of file From 787920da7d2748413cca252cf6286ea2a2492c25 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 20:31:55 +0200 Subject: [PATCH 092/103] Add process datasets workflow script --- src/workflows/process_datasets/main.nf | 55 ++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 src/workflows/process_datasets/main.nf diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf new file mode 100644 index 0000000..88cf249 --- /dev/null +++ b/src/workflows/process_datasets/main.nf @@ -0,0 +1,55 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | process_dataset.run( + fromState: [ input: "dataset" ], + toState: [ + output_train: "output_train", + output_test: "output_test", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_train", "output_test", "output_solution"]) + + emit: + output_ch +} From d062ddc5acfbdb44e0ba3982bbd64a9cb6d3fec9 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 20:38:15 +0200 Subject: [PATCH 093/103] add process datasets test --- src/workflows/process_datasets/test.sh | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100755 src/workflows/process_datasets/test.sh diff --git a/src/workflows/process_datasets/test.sh b/src/workflows/process_datasets/test.sh new file mode 100755 index 0000000..d918102 --- /dev/null +++ b/src/workflows/process_datasets/test.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Run this prior to executing this script: +# bin/viash_build -q 'batch_integration' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +DATASETS_DIR="resources_test/common" +OUTPUT_DIR="output/process_datasets_test" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +export NXF_VER=24.04.3 + +nextflow run . 
\ + -main-script target/nextflow/workflows/process_datasets/main.nf \ + -profile docker \ + -entry auto \ + -c common/nextflow_helpers/labels_ci.config \ + --id run_test \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input:output_dataset' \ + --settings '{"output_train": "train.h5ad", "output_test": "test.h5ad"}' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" \ No newline at end of file From 3b15a12eeae5d6716ffba66fe6ac1931d372c514 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 20:39:51 +0200 Subject: [PATCH 094/103] Update submodule --- common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common b/common index 19a6b8f..f82ff10 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 19a6b8fef711600bcb882d3b764d067168b8bf8a +Subproject commit f82ff105c986474651240f4e7aef53fd5019b8a4 From 7da438f9b8409507463e402c28da2c602c7bf6bd Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 20:50:07 +0200 Subject: [PATCH 095/103] update common dataset file --- src/api/file_common_dataset.yaml | 34 ++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index 0a5a05f..f3bd931 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -1,15 +1,41 @@ -#TODO: Change to the required and/or optional fields of the anndata type: file example: "resources_test/common/pancreas/dataset.h5ad" info: label: "Common Dataset" summary: A subset of the common dataset. slots: - layers: + layers: - type: integer name: counts description: Raw counts required: true + - type: double + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Cell type information + required: true + - type: string + name: batch + description: Batch information + required: true + var: + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true uns: - type: string name: dataset_id @@ -39,3 +65,7 @@ info: type: string description: The organism of the sample in the dataset. 
required: false + - type: string + name: normalization_id + description: "Which normalization was used" + required: true From aacfdb0afe910e79dad8e47d6f20bcfb3fa75c1f Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 20:50:19 +0200 Subject: [PATCH 096/103] add temp fix to workflow --- src/workflows/process_datasets/main.nf | 120 ++++++++++++++++++++++++- 1 file changed, 119 insertions(+), 1 deletion(-) diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 88cf249..eae19f7 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -1,7 +1,7 @@ include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" workflow auto { - findStates(params, meta.config) + findStatesTemp(params, meta.config) | meta.workflow.run( auto: [publish: "state"] ) @@ -53,3 +53,121 @@ workflow run_wf { emit: output_ch } + + +// temp fix for rename_keys typo + +def findStatesTemp(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesTempWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? 
readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey;newKey:oldKey'" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesTempWf +} \ No newline at end of file From ee9606717fe912b5fcf69e6c42f4d8c1c50ac33f Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 21:02:47 +0200 Subject: [PATCH 097/103] Add additional entries to changelog --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a2488a..77b0339 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,19 @@ ## NEW FUNCTIONALITY +* Added `control_methods/true_labels` component (PR #5). + +* Added `methods/logistic_regression` component (PR #5). + +* Added `metrics/accuracy` component (PR #5). + ## MAJOR CHANGES +* Updated `api` files (PR #5). + ## MINOR CHANGES +* Updated `README.md` (PR #5). + ## BUGFIXES From c5129bff2f680b18143eca5a6dd66f935fafdd44 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 21:04:28 +0200 Subject: [PATCH 098/103] Remove task_info --- src/api/task_info.yaml | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 src/api/task_info.yaml diff --git a/src/api/task_info.yaml b/src/api/task_info.yaml deleted file mode 100644 index 4899361..0000000 --- a/src/api/task_info.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: A unique identifier. Can only contain lowercase letters, numbers or underscores. -label: A unique, human-readable, short label. Used for creating summary tables and visualisations. -summary: A one sentence summary of purpose and methodology. Used for creating an overview tables. -image: The name of the image file to use for the component on the website. -motivation: | - Explain the motivation behind your proposed task. Describe the biological or computational - problem you aim to address and why it’s important. Discuss the current state of research in - this area and any gaps or challenges that your task could help address. This section - should convince readers of the significance and relevance of your task. -description: | - Provide a clear and concise description of your task, detailing the specific problem it aims - to solve. Outline the input data types, the expected output, and any assumptions or constraints. - Be sure to explain any terminology or concepts that are essential for understanding the task. - -authors: - # Full name of the author, usually in the name of FirstName MiddleName LastName. - - name: ... - # Role of the author. 
Possible values: - # - # * `"author"`: Authors who have made substantial contributions to the component. - # * `"maintainer"`: The maintainer of the component. - # * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). - roles: [ ... ] - # Additional information on the author - info: - github: ... - orcid: ... - email: ... - twitter: ... - linkedin: ... \ No newline at end of file From 18a58299999ed4ea22c535cd9e54c1970e29b3f7 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 21:06:01 +0200 Subject: [PATCH 099/103] remove comment from download_resources script --- scripts/download_resources.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh index 3781a29..2740e7f 100755 --- a/scripts/download_resources.sh +++ b/scripts/download_resources.sh @@ -6,9 +6,3 @@ echo ">> Downloading resources" common/sync_resources/sync_resources \ --delete - -# After finishing the task and the task specific test_resources are uploaded to s3, uncomment: -# common/sync_resources/sync_resources \ -# --input "s3://openproblems-data/resources_test//" \ -# --output "resources_test/" \ -# --delete \ No newline at end of file From 2909f3489da790567d9f1e0ff28c8c052154a11b Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 21:12:46 +0200 Subject: [PATCH 100/103] Add comment to download resources --- scripts/download_resources.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/download_resources.sh b/scripts/download_resources.sh index 2740e7f..8e50685 100755 --- a/scripts/download_resources.sh +++ b/scripts/download_resources.sh @@ -4,5 +4,6 @@ set -e echo ">> Downloading resources" +# the sync_resources script uses the test_resources S3 URI's in the _viash.yaml to download the resources. common/sync_resources/sync_resources \ --delete From 890aa64f808f78416aa2963d8f322f97dcb61330 Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Thu, 11 Jul 2024 21:41:52 +0200 Subject: [PATCH 101/103] Add create_test_resources script --- scripts/create_test_resources.sh | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 scripts/create_test_resources.sh diff --git a/scripts/create_test_resources.sh b/scripts/create_test_resources.sh new file mode 100644 index 0000000..a39f8c4 --- /dev/null +++ b/scripts/create_test_resources.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/task_template + +mkdir -p $DATASET_DIR + +# process dataset +echo Running process_dataset +nextflow run . 
\
+  -main-script target/nextflow/workflows/process_datasets/main.nf \
+  -profile docker \
+  -entry auto \
+  --input_states "$RAW_DATA/**/state.yaml" \
+  --rename_keys 'input:output_dataset' \
+  --settings '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' \
+  --publish_dir "$DATASET_DIR" \
+  --output_state '$id/state.yaml'
+
+# run one method
+viash run src/methods/logistic_regression/config.vsh.yaml -- \
+  --input_train $DATASET_DIR/pancreas/train.h5ad \
+  --input_test $DATASET_DIR/pancreas/test.h5ad \
+  --output $DATASET_DIR/pancreas/predicted.h5ad
+
+# run one metric
+viash run src/metrics/accuracy/config.vsh.yaml -- \
+  --input_prediction $DATASET_DIR/pancreas/predicted.h5ad \
+  --input_solution $DATASET_DIR/pancreas/solution.h5ad \
+  --output $DATASET_DIR/pancreas/score.h5ad
\ No newline at end of file

From ecbd4be0d92a59cad3a076afe761e2b911dde325 Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 11 Jul 2024 21:51:28 +0200
Subject: [PATCH 102/103] add run_benchmark scripts

---
 scripts/run_benchmark.sh      | 23 +++++++++++++++++++++
 scripts/run_benchmark_test.sh | 19 +++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 scripts/run_benchmark.sh
 create mode 100644 scripts/run_benchmark_test.sh

diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
new file mode 100644
index 0000000..cc4275e
--- /dev/null
+++ b/scripts/run_benchmark.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
+publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}"
+
+# make sure only log_cp10k is used
+cat > /tmp/params.yaml << HERE
+input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml
+rename_keys: 'input_train:output_train;input_test:output_test'
+output_state: "state.yaml"
+publish_dir: "$publish_dir"
+HERE
+
+tw launch https://github.com/openproblems-bio/task_template.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/workflows/run_benchmark/main.nf \
+  --workspace 53907369739130 \
+  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
+  --params-file /tmp/params.yaml \
+  --entry-name auto \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels task_template,full
\ No newline at end of file

diff --git a/scripts/run_benchmark_test.sh b/scripts/run_benchmark_test.sh
new file mode 100644
index 0000000..6c03d42
--- /dev/null
+++ b/scripts/run_benchmark_test.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+cat > /tmp/params.yaml << 'HERE'
+input_states: s3://openproblems-data/resources_test/task_template/**/state.yaml
+rename_keys: 'input_train:output_train;input_test:output_test'
+output_state: "state.yaml"
+publish_dir: s3://openproblems-nextflow/temp/task_template/
+HERE
+
+tw launch https://github.com/openproblems-bio/task_template.git \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/workflows/run_benchmark/main.nf \
+  --workspace 53907369739130 \
+  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
+  --params-file /tmp/params.yaml \
+  --entry-name auto \
+  --config common/nextflow_helpers/labels_tw.config \
+  --labels task_template,test
\ No newline at end of file

From 643f6ae952d6d348d61a9fbe222ccbb6c7b35f2b Mon Sep 17 00:00:00 2001
From: Kai Waldrant
Date: Thu, 11 Jul 2024 22:07:34 +0200
Subject: [PATCH 103/103] Apply suggestions from code review

Co-authored-by: Robrecht Cannoodt

---
 _viash.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/_viash.yaml b/_viash.yaml
index 042fe25..6399f74 100644
--- a/_viash.yaml
+++ 
b/_viash.yaml @@ -18,8 +18,6 @@ links: docker_registry: ghcr.io # Step 5: Update the info fields to the text from the task issue. info: - # A unique identifier. Can only contain lowercase letters, numbers or underscores. (This is the same as the name above but without the `task_` prefix) - name: template # A unique, human-readable, short label. Used for creating summary tables and visualisations. label: Template description: | @@ -42,7 +40,7 @@ info: path: s3://openproblems-data/resources_test/common/ dest: resources_test/common -# Step 7: Update te authors of the task. +# Step 7: Update the authors of the task. authors: # Full name of the author, usually in the name of FirstName MiddleName LastName. - name: Kai Waldrant @@ -59,6 +57,8 @@ authors: email: ... twitter: ... linkedin: ... +# Step 8: Remove all of the comments of the steps you completed +# Step 9: High five yourself! config_mods: | .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
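[Editor's note] With all 103 patches applied, the template contains a dataset-processing workflow, a benchmark workflow, and helper scripts for test resources and cloud runs. A minimal local smoke test of the finished state might look like the sketch below; it only chains commands introduced by the patches above, and assumes viash, Docker and Nextflow 24.04+ are installed and that `scripts/download_resources.sh` has already populated `resources_test/`:

```bash
#!/bin/bash
set -e

# build all components and workflows into target/, caching docker images
viash ns build --parallel --setup cachedbuild

# regenerate the task-specific test resources (patch 101)
bash scripts/create_test_resources.sh

# run the benchmark workflow locally on those resources (patch 078)
bash src/workflows/run_benchmark/test.sh
```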