diff --git a/.github/workflows/viash-test.yml b/.github/workflows/viash-test.yml index e7ebcf4875..b6bab576ed 100644 --- a/.github/workflows/viash-test.yml +++ b/.github/workflows/viash-test.yml @@ -3,34 +3,15 @@ name: viash test on: pull_request: push: - branches: [ '**' ] + branches: [ main ] jobs: - run_ci_check_job: - runs-on: ubuntu-latest - outputs: - run_ci: ${{ steps.github_cli.outputs.check }} - steps: - - name: 'Check if branch has an existing pull request and the trigger was a push' - id: github_cli - run: | - pull_request=$(gh pr list -R ${{ github.repository }} -H ${{ github.ref_name }} --json url --state open --limit 1 | jq '.[0].url') - # If the branch has a PR and this run was triggered by a push event, do not run - if [[ "$pull_request" != "null" && "$GITHUB_REF_NAME" != "main" && "${{ github.event_name == 'push' }}" == "true" && "${{ !contains(github.event.head_commit.message, 'ci force') }}" == "true" ]]; then - echo "check=false" >> $GITHUB_OUTPUT - else - echo "check=true" >> $GITHUB_OUTPUT - fi - env: - GITHUB_TOKEN: ${{ secrets.GTHB_PAT }} # phase 1 list: - needs: run_ci_check_job env: s3_bucket: s3://openproblems-data/resources_test/ runs-on: ubuntu-latest - if: "needs.run_ci_check_job.outputs.run_ci == 'true'" outputs: matrix: ${{ steps.set_matrix.outputs.matrix }} diff --git a/.gitignore b/.gitignore index b27efa26e7..c19f926ba4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +README.html +README_files/ *.DS_Store *__pycache__ *.h5ad diff --git a/CONTRIBUTING.qmd b/CONTRIBUTING.qmd index 995c02d361..6b6e33ae07 100644 --- a/CONTRIBUTING.qmd +++ b/CONTRIBUTING.qmd @@ -185,7 +185,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: [scikit-learn] diff --git a/src/common/check_dataset_schema/config.vsh.yaml b/src/common/check_dataset_schema/config.vsh.yaml index d25bf5766e..08449c3e7d 100644 --- 
a/src/common/check_dataset_schema/config.vsh.yaml +++ b/src/common/check_dataset_schema/config.vsh.yaml @@ -36,7 +36,7 @@ functionality: path: test.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 test_setup: - type: python packages: viashpy diff --git a/src/common/check_yaml_schema/config.vsh.yaml b/src/common/check_yaml_schema/config.vsh.yaml index 40a62fd2c8..b87bec5429 100644 --- a/src/common/check_yaml_schema/config.vsh.yaml +++ b/src/common/check_yaml_schema/config.vsh.yaml @@ -18,7 +18,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/common/create_component/script.py b/src/common/create_component/script.py index 822d0eac87..8c954a66d4 100644 --- a/src/common/create_component/script.py +++ b/src/common/create_component/script.py @@ -141,11 +141,11 @@ def generate_resources(par, script_path) -> str: def generate_docker_platform(par) -> str: """Set up the docker platform for Python.""" if par["language"] == "python": - image_str = "ghcr.io/openproblems-bio/base_python:1.0.4" + image_str = "openproblems/base_python:1.0.0" setup_type = "python" package_example = "scib==1.1.5" elif par["language"] == "r": - image_str = "ghcr.io/openproblems-bio/base_r:1.0.4" + image_str = "openproblems/base_r:1.0.0" setup_type = "r" package_example = "tidyverse" return strip_margin(f'''\ diff --git a/src/common/create_task_readme/config.vsh.yaml b/src/common/create_task_readme/config.vsh.yaml index 6ba0a726c7..d268974ce8 100644 --- a/src/common/create_task_readme/config.vsh.yaml +++ b/src/common/create_task_readme/config.vsh.yaml @@ -48,7 +48,7 @@ functionality: dest: openproblems-v2/_viash.yaml platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r packages: [dplyr, purrr, rlang, glue, yaml, 
fs, cli, igraph, rmarkdown, processx] diff --git a/src/common/extract_metadata/config.vsh.yaml b/src/common/extract_metadata/config.vsh.yaml index 0636812619..76e73cb975 100644 --- a/src/common/extract_metadata/config.vsh.yaml +++ b/src/common/extract_metadata/config.vsh.yaml @@ -31,7 +31,7 @@ functionality: path: test.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 test_setup: - type: python packages: viashpy diff --git a/src/common/extract_scores/config.vsh.yaml b/src/common/extract_scores/config.vsh.yaml index 46fb174924..72270b7a95 100644 --- a/src/common/extract_scores/config.vsh.yaml +++ b/src/common/extract_scores/config.vsh.yaml @@ -26,7 +26,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ tidyverse ] diff --git a/src/common/helper_functions/read_anndata_partial.py b/src/common/helper_functions/read_anndata_partial.py new file mode 100644 index 0000000000..efbea0592d --- /dev/null +++ b/src/common/helper_functions/read_anndata_partial.py @@ -0,0 +1,77 @@ +import warnings +from pathlib import Path +import anndata as ad +import h5py +from scipy.sparse import csr_matrix +from anndata.experimental import read_elem, sparse_dataset + + +def read_anndata( + file: str, + backed: bool = False, + **kwargs +) -> ad.AnnData: + """ + Read anndata file + :param file: path to anndata file in h5ad format + :param kwargs: AnnData parameter to group mapping + """ + assert Path(file).exists(), f'File not found: {file}' + + f = h5py.File(file, 'r') + kwargs = {x: x for x in f} if not kwargs else kwargs + if len(f.keys()) == 0: + return ad.AnnData() + # check if keys are available + for name, slot in kwargs.items(): + if slot not in f: + warnings.warn( + f'Cannot find "{slot}" for AnnData parameter `{name}` from "{file}"' + ) + adata = read_partial(f, backed=backed, **kwargs) + if not 
backed: + f.close() + + return adata + + +def read_partial( + group: h5py.Group, + backed: bool = False, + force_sparse_types: [str, list] = None, + **kwargs +) -> ad.AnnData: + """ + Partially read h5py groups + :params group: file group + :params force_sparse_types: encoding types to convert to sparse_dataset via csr_matrix + :params backed: read sparse matrix as sparse_dataset + :params **kwargs: dict of slot_name: slot, by default use all available slot for the h5py file + :return: AnnData object + """ + if force_sparse_types is None: + force_sparse_types = [] + elif isinstance(force_sparse_types, str): + force_sparse_types = [force_sparse_types] + slots = {} + if backed: + print('Read as backed sparse matrix...') + + for slot_name, slot in kwargs.items(): + print(f'Read slot "{slot}", store as "{slot_name}"...') + if slot not in group: + warnings.warn(f'Slot "{slot}" not found, skip...') + slots[slot_name] = None + else: + elem = group[slot] + iospec = ad._io.specs.get_spec(elem) + if iospec.encoding_type in ("csr_matrix", "csc_matrix") and backed: + slots[slot_name] = sparse_dataset(elem) + elif iospec.encoding_type in force_sparse_types: + slots[slot_name] = csr_matrix(read_elem(elem)) + if backed: + slots[slot_name] = sparse_dataset(slots[slot_name]) + else: + slots[slot_name] = read_elem(elem) + return ad.AnnData(**slots) + diff --git a/src/common/ontology/check_obsolete_terms/config.vsh.yaml b/src/common/ontology/check_obsolete_terms/config.vsh.yaml index dbb0506098..fc006f6cf9 100644 --- a/src/common/ontology/check_obsolete_terms/config.vsh.yaml +++ b/src/common/ontology/check_obsolete_terms/config.vsh.yaml @@ -70,7 +70,7 @@ functionality: - path: /resources_test/common/cellxgene_census platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r packages: [ dplyr, tidyr, tibble, ontologyIndex, processx ] \ No newline at end of file diff --git 
a/src/common/process_task_results/generate_qc/config.vsh.yaml b/src/common/process_task_results/generate_qc/config.vsh.yaml index 9b3b07dc01..68a5d19682 100644 --- a/src/common/process_task_results/generate_qc/config.vsh.yaml +++ b/src/common/process_task_results/generate_qc/config.vsh.yaml @@ -33,7 +33,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [lowmem, lowtime, lowcpu] diff --git a/src/common/process_task_results/get_api_info/config.vsh.yaml b/src/common/process_task_results/get_api_info/config.vsh.yaml index 2026c007ab..0e7eb1696e 100644 --- a/src/common/process_task_results/get_api_info/config.vsh.yaml +++ b/src/common/process_task_results/get_api_info/config.vsh.yaml @@ -8,7 +8,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ purrr, dplyr, yaml, rlang, processx ] diff --git a/src/common/process_task_results/get_dataset_info/config.vsh.yaml b/src/common/process_task_results/get_dataset_info/config.vsh.yaml index 75f4952738..10247a22ba 100644 --- a/src/common/process_task_results/get_dataset_info/config.vsh.yaml +++ b/src/common/process_task_results/get_dataset_info/config.vsh.yaml @@ -11,7 +11,7 @@ functionality: dest: test_file.yaml platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ purrr, yaml, rlang, processx ] diff --git a/src/common/process_task_results/get_method_info/config.vsh.yaml b/src/common/process_task_results/get_method_info/config.vsh.yaml index ee606c852e..053bbac53c 100644 --- a/src/common/process_task_results/get_method_info/config.vsh.yaml +++ b/src/common/process_task_results/get_method_info/config.vsh.yaml @@ -11,7 +11,7 @@ functionality: dest: test_file.yaml platforms: - type: docker - image: 
ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ purrr, yaml, rlang, processx ] diff --git a/src/common/process_task_results/get_metric_info/config.vsh.yaml b/src/common/process_task_results/get_metric_info/config.vsh.yaml index e6555bca36..ee5833b5b9 100644 --- a/src/common/process_task_results/get_metric_info/config.vsh.yaml +++ b/src/common/process_task_results/get_metric_info/config.vsh.yaml @@ -11,7 +11,7 @@ functionality: dest: test_file.yaml platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ purrr, yaml, rlang, processx ] diff --git a/src/common/process_task_results/get_results/config.vsh.yaml b/src/common/process_task_results/get_results/config.vsh.yaml index 5e1b716731..cd639fad4d 100644 --- a/src/common/process_task_results/get_results/config.vsh.yaml +++ b/src/common/process_task_results/get_results/config.vsh.yaml @@ -42,7 +42,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ purrr, yaml, rlang, dplyr, tidyr, readr, lubridate, dynutils, processx ] diff --git a/src/common/process_task_results/get_task_info/config.vsh.yaml b/src/common/process_task_results/get_task_info/config.vsh.yaml index b74c67c3e7..2e8fbd2b66 100644 --- a/src/common/process_task_results/get_task_info/config.vsh.yaml +++ b/src/common/process_task_results/get_task_info/config.vsh.yaml @@ -11,7 +11,7 @@ functionality: dest: test_file.yaml platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ purrr, yaml, rlang, processx ] diff --git a/src/common/process_task_results/yaml_to_json/config.vsh.yaml b/src/common/process_task_results/yaml_to_json/config.vsh.yaml index de54b44cce..7231cdcdbf 100644 --- a/src/common/process_task_results/yaml_to_json/config.vsh.yaml +++ 
b/src/common/process_task_results/yaml_to_json/config.vsh.yaml @@ -11,6 +11,6 @@ functionality: dest: test_file.yaml platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow - type: native diff --git a/src/datasets/loaders/cellxgene_census/config.vsh.yaml b/src/datasets/loaders/cellxgene_census/config.vsh.yaml index f59547174a..667e1c6a6b 100644 --- a/src/datasets/loaders/cellxgene_census/config.vsh.yaml +++ b/src/datasets/loaders/cellxgene_census/config.vsh.yaml @@ -151,7 +151,7 @@ functionality: path: test.py platforms: - type: docker - #image: ghcr.io/openproblems-bio/base_python:1.0.4 + #image: openproblems/base_python:1.0.0 image: python:3.11 setup: - type: python diff --git a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml b/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml index 91fad8769f..7ee4166d9d 100644 --- a/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml +++ b/src/datasets/loaders/cellxgene_census_from_source_h5ad/config.vsh.yaml @@ -114,7 +114,7 @@ functionality: path: test.py platforms: - type: docker - #image: ghcr.io/openproblems-bio/base_python:1.0.4 + #image: openproblems/base_python:1.0.0 image: python:3.11 setup: - type: python diff --git a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml index f8837cba6d..96dad30e76 100644 --- a/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml +++ b/src/datasets/loaders/openproblems_neurips2021_bmmc/config.vsh.yaml @@ -68,7 +68,7 @@ functionality: # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: - label: [highmem, midcpu , midtime] \ No newline at end of file + label: [highmem, midcpu, 
midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml b/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml index a6d79c701d..b2141482f1 100644 --- a/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml +++ b/src/datasets/loaders/openproblems_neurips2022_pbmc/config.vsh.yaml @@ -74,7 +74,7 @@ functionality: # path: /resources_test/common/openproblems_neurips2021/neurips2021_bmmc_cite.h5ad platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/datasets/loaders/openproblems_v1/config.vsh.yaml b/src/datasets/loaders/openproblems_v1/config.vsh.yaml index 5a53755b82..a07a60d0ac 100644 --- a/src/datasets/loaders/openproblems_v1/config.vsh.yaml +++ b/src/datasets/loaders/openproblems_v1/config.vsh.yaml @@ -72,7 +72,7 @@ functionality: path: test.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: apt packages: git diff --git a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml b/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml index 0f07dbff62..812e52be62 100644 --- a/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml +++ b/src/datasets/loaders/openproblems_v1_multimodal/config.vsh.yaml @@ -80,7 +80,7 @@ functionality: path: test.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: apt packages: git diff --git a/src/datasets/normalization/atac_tfidf/config.vsh.yaml b/src/datasets/normalization/atac_tfidf/config.vsh.yaml index 3240fec43f..5a8f56306a 100644 --- a/src/datasets/normalization/atac_tfidf/config.vsh.yaml +++ b/src/datasets/normalization/atac_tfidf/config.vsh.yaml @@ -12,7 +12,7 @@ functionality: 
path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/datasets/normalization/l1_sqrt/config.vsh.yaml b/src/datasets/normalization/l1_sqrt/config.vsh.yaml index a133bdee48..212eadc968 100644 --- a/src/datasets/normalization/l1_sqrt/config.vsh.yaml +++ b/src/datasets/normalization/l1_sqrt/config.vsh.yaml @@ -16,12 +16,12 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: - scprep - - numpy + - numpy<2 - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_cp/config.vsh.yaml b/src/datasets/normalization/log_cp/config.vsh.yaml index 25b4d2a9eb..89b2a283f9 100644 --- a/src/datasets/normalization/log_cp/config.vsh.yaml +++ b/src/datasets/normalization/log_cp/config.vsh.yaml @@ -12,7 +12,7 @@ functionality: description: "Number of counts per cell. When set to -1, will use None." 
platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml index 3431e8174b..4cbf81ff5a 100644 --- a/src/datasets/normalization/log_scran_pooling/config.vsh.yaml +++ b/src/datasets/normalization/log_scran_pooling/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ Matrix, rlang, scran, BiocParallel ] diff --git a/src/datasets/normalization/prot_clr/config.vsh.yaml b/src/datasets/normalization/prot_clr/config.vsh.yaml index 351cc0569a..8f6bbe269f 100644 --- a/src/datasets/normalization/prot_clr/config.vsh.yaml +++ b/src/datasets/normalization/prot_clr/config.vsh.yaml @@ -16,7 +16,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/datasets/normalization/sqrt_cp/config.vsh.yaml b/src/datasets/normalization/sqrt_cp/config.vsh.yaml index 7da0165229..4d95636f4c 100644 --- a/src/datasets/normalization/sqrt_cp/config.vsh.yaml +++ b/src/datasets/normalization/sqrt_cp/config.vsh.yaml @@ -12,7 +12,7 @@ functionality: description: "Number of counts per cell" platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/datasets/processors/hvg/config.vsh.yaml b/src/datasets/processors/hvg/config.vsh.yaml index d2702d90d6..aed18c6d38 100644 --- a/src/datasets/processors/hvg/config.vsh.yaml +++ b/src/datasets/processors/hvg/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: path: script.py platforms: - type: 
docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/knn/config.vsh.yaml b/src/datasets/processors/knn/config.vsh.yaml index 652676ab90..9908fe9086 100644 --- a/src/datasets/processors/knn/config.vsh.yaml +++ b/src/datasets/processors/knn/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/pca/config.vsh.yaml b/src/datasets/processors/pca/config.vsh.yaml index 027faf1e08..7f0213b922 100644 --- a/src/datasets/processors/pca/config.vsh.yaml +++ b/src/datasets/processors/pca/config.vsh.yaml @@ -11,7 +11,7 @@ functionality: # - path: "../../../resources_test/common/pancreas" platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, highmem, midcpu] diff --git a/src/datasets/processors/subsample/config.vsh.yaml b/src/datasets/processors/subsample/config.vsh.yaml index bbfacdf832..4e52e93db5 100644 --- a/src/datasets/processors/subsample/config.vsh.yaml +++ b/src/datasets/processors/subsample/config.vsh.yaml @@ -41,7 +41,7 @@ functionality: - path: /resources_test/common/pancreas platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 test_setup: - type: python packages: diff --git a/src/datasets/processors/svd/config.vsh.yaml b/src/datasets/processors/svd/config.vsh.yaml index e59865da5e..bbad17f58c 100644 --- a/src/datasets/processors/svd/config.vsh.yaml +++ b/src/datasets/processors/svd/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: 
openproblems/base_python:1.0.0 setup: - type: python pypi: [scikit-learn] diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh index af32b8c853..8fd7e3a72d 100755 --- a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh +++ b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh @@ -9,7 +9,7 @@ param_list: input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fcite%5FBMMC%5Fprocessed%2Eh5ad%2Egz" mod1: GEX mod2: ADT - dataset_name: OpenProblems NeurIPS2021 CITE-Seq + dataset_name: NeurIPS2021 CITE-Seq dataset_organism: homo_sapiens dataset_summary: Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors. dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." @@ -19,7 +19,7 @@ param_list: input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fmultiome%5FBMMC%5Fprocessed%2Eh5ad%2Egz" mod1: GEX mod2: ATAC - dataset_name: OpenProblems NeurIPS2021 Multiome + dataset_name: NeurIPS2021 Multiome dataset_organism: homo_sapiens dataset_summary: Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors. 
dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site." @@ -35,15 +35,6 @@ output_state: '$id/state.yaml' publish_dir: s3://openproblems-data/resources/datasets HERE -cat > /tmp/nextflow.config << HERE -process { - withName:'.*publishStatesProc' { - memory = '16GB' - disk = '100GB' - } -} -HERE - tw launch https://github.com/openproblems-bio/openproblems-v2.git \ --revision main_build \ --pull-latest \ @@ -51,5 +42,5 @@ tw launch https://github.com/openproblems-bio/openproblems-v2.git \ --workspace 53907369739130 \ --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ --params-file "$params_file" \ - --config /tmp/nextflow.config \ - --labels openproblems_neurips2021_bmmc,dataset_loader \ + --config src/wf_utils/labels_tw.config \ + --labels neurips2021,dataset_loader \ diff --git a/src/migration/check_migration_status/config.vsh.yaml b/src/migration/check_migration_status/config.vsh.yaml index a2dea51e00..bd8107381c 100644 --- a/src/migration/check_migration_status/config.vsh.yaml +++ b/src/migration/check_migration_status/config.vsh.yaml @@ -25,6 +25,6 @@ functionality: path: test.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow - type: native diff --git a/src/migration/list_git_shas/config.vsh.yaml b/src/migration/list_git_shas/config.vsh.yaml index 53fc63aabb..c70366612a 100644 --- a/src/migration/list_git_shas/config.vsh.yaml +++ 
b/src/migration/list_git_shas/config.vsh.yaml @@ -32,7 +32,7 @@ functionality: path: test.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 test_setup: - type: docker run: "git clone https://github.com/openproblems-bio/openproblems-v2.git" diff --git a/src/migration/update_bibtex/config.vsh.yaml b/src/migration/update_bibtex/config.vsh.yaml index 0df07b66c2..147e0b6c22 100644 --- a/src/migration/update_bibtex/config.vsh.yaml +++ b/src/migration/update_bibtex/config.vsh.yaml @@ -19,7 +19,7 @@ functionality: path: test.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: git+https://github.com/sciunto-org/python-bibtexparser@main diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml index 67d74ae8ab..c2484fbaa2 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml @@ -14,9 +14,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py index 7fbb4a537e..801440ce65 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py +++ b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py @@ -1,3 +1,4 @@ +import sys import scanpy as sc 
import numpy as np @@ -15,9 +16,18 @@ ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) -adata.X = adata.layers["normalized"] +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) adata.var["highly_variable"] = adata.var["hvg"] print("Process dataset", flush=True) @@ -27,7 +37,7 @@ n_comps = min(50, np.sum(batch_idx)) solver = "full" if n_comps == np.sum(batch_idx) else "arpack" adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca( - adata[batch_idx], + adata[batch_idx].copy(), n_comps=n_comps, use_highly_variable=True, svd_solver=solver, diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml index 6b2f724ed9..95212518c5 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml @@ -14,9 +14,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py index 4b16b82525..f45038806b 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py +++ b/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py @@ -1,3 +1,4 @@ +import sys import scanpy as sc ## VIASH START @@ -15,8 +16,17 @@ ## VIASH END 
+sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) print("process dataset", flush=True) adata.obsm["X_emb"] = adata.obsm["X_pca"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml index 7b1013221e..b20701c8f1 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml @@ -14,9 +14,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py index 9ddbab0432..2acdbf9b7a 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py +++ b/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py @@ -1,3 +1,4 @@ +import sys import scanpy as sc ## VIASH START @@ -15,12 +16,22 @@ ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) # no processing, subset matrix to highly variable genes adata_hvg = adata[:, adata.var["hvg"]].copy() -adata.layers['corrected_counts'] = 
adata_hvg.layers["normalized"].copy() +adata.layers['corrected_counts'] = adata_hvg.X.copy() print("Store outputs", flush=True) adata.uns['method_id'] = meta['functionality_name'] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml index ead6281806..86886ce263 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py index 22b39d10d5..4824c8f443 100644 --- a/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py +++ b/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py @@ -19,10 +19,16 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _set_uns +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) print("process dataset", flush=True) neighbors_map = adata.uns['knn'] diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml index 9d50f13aaf..6c853a7719 100644 --- 
a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py index b15ce33047..ca16a60ab2 100644 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py @@ -16,10 +16,15 @@ ## VIASH END sys.path.append(meta["resources_dir"]) from utils import _perfect_embedding +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + uns='uns' +) print('Process data...', flush=True) adata.obsm["X_emb"] = _perfect_embedding(partition=adata.obs["label"]) diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml index e0af4e4a5b..e945e3bc58 100644 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml @@ -18,10 +18,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + 
path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py index 75f5889f8d..8f88f77472 100644 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py +++ b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py @@ -17,10 +17,15 @@ ## VIASH END sys.path.append(meta["resources_dir"]) from utils import _perfect_embedding +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + uns='uns' +) print('Process data...', flush=True) adata.obsm["X_emb"] = _perfect_embedding( diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml index 717d14ab42..d8bcee01d4 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git 
a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py index 3cc476b863..175a449a49 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py @@ -19,9 +19,15 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) print("process dataset", flush=True) adata.obsm["X_emb"] = _randomize_features( diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml index ad1957b070..5f98284bb9 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py index 755f4782f9..630871e780 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py +++ 
b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py @@ -19,12 +19,20 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) adata.layers['corrected_counts'] = _randomize_features( - adata.layers["normalized"], + adata.X, partition=adata.obs["batch"], ) diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml index 553e7431a8..72a12c5031 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py index d07e3b339e..d5c20aa185 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py @@ -18,10 +18,16 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_graph +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = 
ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) print('Randomize graph...', flush=True) adata = _randomize_graph( diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml index d591b2a1df..b4457498c9 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py index bf793fad75..bf26568079 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py @@ -16,10 +16,16 @@ ## VIASH END sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) print('Process data...', flush=True) adata.obsm["X_emb"] = _randomize_features( diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml 
b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml index 2719a68d87..7c483739c2 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py index a06e6c1ab7..9f1302df0d 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py @@ -19,14 +19,21 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) print("Process data...", flush=True) adata.layers['corrected_counts'] = _randomize_features( - adata.layers["normalized"], + adata.X, partition=adata.obs["label"] ) diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml index 948bcacf29..6015185616 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml +++ 
b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py index 7b02353ed4..3634d55dbd 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py @@ -19,9 +19,15 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_graph +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) print("Process data...", flush=True) adata = _randomize_graph( diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml index b17174744f..0343c37817 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: 
openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py index fc7ba6cee5..ca626600b8 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py @@ -19,9 +19,15 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) print("process dataset", flush=True) adata.obsm["X_emb"] = _randomize_features(adata.obsm["X_pca"]) diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml index 8dd71aec93..f49ee146a1 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py index 1c7c838b6e..c74c7d2a5e 
100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py @@ -19,11 +19,18 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_features +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) - -adata.layers['corrected_counts'] = _randomize_features(adata.layers["normalized"]) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +adata.layers['corrected_counts'] = _randomize_features(adata.X) print("Store outputs", flush=True) adata.uns['method_id'] = meta['functionality_name'] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml index 9780485e92..1b92cbc70a 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml @@ -14,10 +14,12 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py - path: ../../utils.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py index c0277c74b7..cd4d64f043 100644 --- a/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py +++ b/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py 
@@ -18,10 +18,16 @@ # add helper scripts to path sys.path.append(meta["resources_dir"]) from utils import _randomize_graph +from read_anndata_partial import read_anndata print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsp='obsp', + uns='uns' +) print('Randomize graph...', flush=True) adata = _randomize_graph(adata, neighbors_key="knn") diff --git a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml index 1d1d42aa89..8eff37339f 100644 --- a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml @@ -37,9 +37,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/bbknn/script.py b/src/tasks/batch_integration/methods/bbknn/script.py index d2a6e464ae..1496fda0bb 100644 --- a/src/tasks/batch_integration/methods/bbknn/script.py +++ b/src/tasks/batch_integration/methods/bbknn/script.py @@ -1,4 +1,6 @@ +import sys import anndata as ad +import scanpy as sc import bbknn ## VIASH START @@ -15,13 +17,24 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] adata = adata[:, idx].copy() + sc.pp.pca(adata) print('Run BBKNN', flush=True) kwargs = dict(batch_key='batch', copy=True) diff --git 
a/src/tasks/batch_integration/methods/combat/config.vsh.yaml b/src/tasks/batch_integration/methods/combat/config.vsh.yaml index dbb4b042ec..f94333627d 100644 --- a/src/tasks/batch_integration/methods/combat/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/combat/config.vsh.yaml @@ -32,9 +32,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, highmem, lowcpu] diff --git a/src/tasks/batch_integration/methods/combat/script.py b/src/tasks/batch_integration/methods/combat/script.py index c5f0ed8dd5..9f282efb9c 100644 --- a/src/tasks/batch_integration/methods/combat/script.py +++ b/src/tasks/batch_integration/methods/combat/script.py @@ -1,3 +1,4 @@ +import sys import scanpy as sc from scipy.sparse import csr_matrix @@ -15,8 +16,18 @@ ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -25,7 +36,6 @@ print('Run Combat', flush=True) -adata.X = adata.layers['normalized'] adata.X = sc.pp.combat(adata, key='batch', inplace=False) diff --git a/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml index 1fc6910a81..cd885da3cd 100644 --- a/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml @@ -26,7 +26,7 @@ functionality: path: ../fastmnn_feature/script.R platforms: - type: docker - image: 
ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r bioc: diff --git a/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml index 4336f93c8e..e28406eb54 100644 --- a/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml @@ -25,7 +25,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r bioc: batchelor diff --git a/src/tasks/batch_integration/methods/liger/config.vsh.yaml b/src/tasks/batch_integration/methods/liger/config.vsh.yaml index d0db8e2996..4c638d467b 100644 --- a/src/tasks/batch_integration/methods/liger/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/liger/config.vsh.yaml @@ -19,7 +19,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: apt packages: cmake diff --git a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml index 7a795fc759..1c999fa540 100644 --- a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml @@ -17,7 +17,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r bioc: diff --git a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml index de1894ab68..649672dac1 100644 --- a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml @@ -30,6 +30,8 @@ functionality: resources: - type: python_script path: script.py + - type: 
python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: # Due to a [ gcc-8 ] dependency in the mnnpy package, we need to use a python:3.8 image - type: docker diff --git a/src/tasks/batch_integration/methods/mnnpy/script.py b/src/tasks/batch_integration/methods/mnnpy/script.py index 34e726133e..56d5cce3a2 100644 --- a/src/tasks/batch_integration/methods/mnnpy/script.py +++ b/src/tasks/batch_integration/methods/mnnpy/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import mnnpy @@ -13,8 +14,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -22,7 +33,6 @@ adata = adata[:, idx].copy() print('Run mnn', flush=True) -adata.X = adata.layers['normalized'] split = [] batch_categories = adata.obs['batch'].cat.categories for i in batch_categories: diff --git a/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml b/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml index 0d8f262620..cf16b2e684 100644 --- a/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml @@ -21,9 +21,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/pyliger/script.py b/src/tasks/batch_integration/methods/pyliger/script.py index aa2250a857..2066e6965b 100644 --- a/src/tasks/batch_integration/methods/pyliger/script.py +++ 
b/src/tasks/batch_integration/methods/pyliger/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import numpy as np import pyliger @@ -12,21 +13,24 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('>> Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) +adata.layers['norm_data'] = read_anndata(par['input'], X='layers/normalized').X print('>> Prepare data', flush=True) adata_per_batch = [] for batch in adata.obs['batch'].unique(): adb = adata[adata.obs['batch'] == batch].copy() - - # move counts - adb.X = adb.layers['counts'] - del adb.layers['counts'] - - # move normalized data - adb.layers["norm_data"] = adb.layers["normalized"] - del adb.layers["normalized"] # save row sum and sum of squares for further use norm_sum = np.ravel(np.sum(adb.layers["norm_data"], axis=0)) diff --git a/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml b/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml index 179d478412..3437df19c9 100644 --- a/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml @@ -25,9 +25,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scalex_embed/script.py b/src/tasks/batch_integration/methods/scalex_embed/script.py index 1259fd130a..9974eba4b3 100644 --- a/src/tasks/batch_integration/methods/scalex_embed/script.py +++ b/src/tasks/batch_integration/methods/scalex_embed/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import scalex @@ -13,8 +14,19 @@ } ## VIASH END 
+sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -22,7 +34,6 @@ adata = adata[:, idx].copy() print('Run SCALEX', flush=True) -adata.X = adata.layers['normalized'] adata = scalex.SCALEX( adata, batch_key="batch", @@ -42,6 +53,9 @@ output = ad.AnnData( obs=adata.obs[[]], var=adata.var[[]], + layers={ + 'corrected_counts': adata.layers["impute"], + }, obsm={ 'X_emb': adata.obsm['latent'], }, diff --git a/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml index 2d8d05a98f..1874bc190e 100644 --- a/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml @@ -24,10 +24,12 @@ functionality: description: Number of highly variable genes to use. 
resources: - type: python_script - path: script.py + path: ../scalex_embed/script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scalex_feature/script.py b/src/tasks/batch_integration/methods/scalex_feature/script.py deleted file mode 100644 index ef33ee2a43..0000000000 --- a/src/tasks/batch_integration/methods/scalex_feature/script.py +++ /dev/null @@ -1,56 +0,0 @@ -import anndata as ad -import scanpy as sc -import scalex - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'n_hvg': 2000, -} -meta = { - 'functionality_name' : 'foo', - 'config': 'bar' -} -## VIASH END - -print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - -print('Run SCALEX', flush=True) -adata.X = adata.layers['normalized'] -adata = scalex.SCALEX( - adata, - batch_key="batch", - ignore_umap=True, - impute=adata.obs["batch"].cat.categories[0], - processed=True, - max_iteration=40, - min_features=None, - min_cells=None, - n_top_features=0, - outdir=None, - gpu=0, -) - -print("Store output", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - layers={ - 'corrected_counts': adata.layers["impute"], - }, - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml 
b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml index 387745fc38..b5dcd8f54a 100644 --- a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml @@ -27,9 +27,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scanorama_embed/script.py b/src/tasks/batch_integration/methods/scanorama_embed/script.py index 950aa3b193..db12b458d5 100644 --- a/src/tasks/batch_integration/methods/scanorama_embed/script.py +++ b/src/tasks/batch_integration/methods/scanorama_embed/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import scanorama @@ -13,6 +14,10 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + # based on scib # -> https://github.com/theislab/scib/blob/59ae6eee5e611d9d3db067685ec96c28804e9127/scib/utils.py#L51C1-L72C62 def merge_adata(*adata_list, **kwargs): @@ -40,7 +45,13 @@ def merge_adata(*adata_list, **kwargs): print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) if par['n_hvg']: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -48,7 +59,6 @@ def merge_adata(*adata_list, **kwargs): adata = adata[:, idx].copy() print('Run scanorama', flush=True) -adata.X = adata.layers['normalized'] split = [] batch_categories = adata.obs['batch'].cat.categories for i in batch_categories: @@ -65,6 +75,9 @@ def merge_adata(*adata_list, **kwargs): 'normalization_id': adata.uns['normalization_id'], 'method_id': meta['functionality_name'], }, + layers={ + 'corrected_counts': 
corrected.X, + }, obsm={ 'X_emb': corrected.obsm["X_scanorama"], } diff --git a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml index 50246875ae..3f735ddffd 100644 --- a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml @@ -26,10 +26,12 @@ functionality: description: Number of highly variable genes to use. resources: - type: python_script - path: script.py + path: ../scanorama_embed/script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scanorama_feature/script.py b/src/tasks/batch_integration/methods/scanorama_feature/script.py deleted file mode 100644 index 614180ec99..0000000000 --- a/src/tasks/batch_integration/methods/scanorama_feature/script.py +++ /dev/null @@ -1,74 +0,0 @@ -import anndata as ad -import scanorama - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'hvg': True, -} -meta = { - 'functionality_name': 'foo', - 'config': 'bar' -} -## VIASH END - -# based on scib -# -> https://github.com/theislab/scib/blob/59ae6eee5e611d9d3db067685ec96c28804e9127/scib/utils.py#L51C1-L72C62 -def merge_adata(*adata_list, **kwargs): - """Merge adatas from list while remove duplicated ``obs`` and ``var`` columns - - :param adata_list: ``anndata`` objects to be concatenated - :param kwargs: arguments to be passed to ``anndata.AnnData.concatenate`` - """ - - if len(adata_list) == 1: - return adata_list[0] - - # Make sure that adatas do not contain duplicate columns - for _adata in adata_list: - for attr in ("obs", "var"): - df = getattr(_adata, attr) - dup_mask = 
df.columns.duplicated() - if dup_mask.any(): - print( - f"Deleting duplicated keys `{list(df.columns[dup_mask].unique())}` from `adata.{attr}`." - ) - setattr(_adata, attr, df.loc[:, ~dup_mask]) - - return ad.AnnData.concatenate(*adata_list, **kwargs) - - -print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - -print('Run scanorama', flush=True) -adata.X = adata.layers['normalized'] -split = [] -batch_categories = adata.obs['batch'].cat.categories -for i in batch_categories: - split.append(adata[adata.obs['batch'] == i].copy()) -corrected = scanorama.correct_scanpy(split, return_dimred=True) -corrected = merge_adata(*corrected, batch_key='batch', batch_categories=batch_categories, index_unique=None) - -print("Store output", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - }, - layers={ - 'corrected_counts': corrected.X, - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml index 3801d5bbe7..5615fd72cd 100644 --- a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml @@ -44,9 +44,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git 
a/src/tasks/batch_integration/methods/scanvi/script.py b/src/tasks/batch_integration/methods/scanvi/script.py index 9c0886816d..35d5b80f32 100644 --- a/src/tasks/batch_integration/methods/scanvi/script.py +++ b/src/tasks/batch_integration/methods/scanvi/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scvi.model import SCVI, SCANVI @@ -17,8 +18,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) if par["n_hvg"]: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -26,7 +37,7 @@ adata = adata[:, idx].copy() print("Processing data", flush=True) -SCVI.setup_anndata(adata, layer="counts", batch_key="batch") +SCVI.setup_anndata(adata, batch_key="batch") print("Run scVI", flush=True) model_kwargs = { diff --git a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml index 86f9e919b2..45eb09d5cf 100644 --- a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml +++ b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml @@ -42,9 +42,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/methods/scvi/script.py b/src/tasks/batch_integration/methods/scvi/script.py index 3c5feb6f9c..26490737a5 100644 --- a/src/tasks/batch_integration/methods/scvi/script.py +++ b/src/tasks/batch_integration/methods/scvi/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scvi.model import SCVI @@ -16,8 +17,17 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) 
+from read_anndata_partial import read_anndata + print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) if par["n_hvg"]: print(f"Select top {par['n_hvg']} high variable genes", flush=True) @@ -25,7 +35,7 @@ adata = adata[:, idx].copy() print("Processing data", flush=True) -SCVI.setup_anndata(adata, layer="counts", batch_key="batch") +SCVI.setup_anndata(adata, batch_key="batch") print("Run scVI", flush=True) model_kwargs = { diff --git a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml index bb2f7b48c7..be6567271c 100644 --- a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml @@ -36,9 +36,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/asw_batch/script.py b/src/tasks/batch_integration/metrics/asw_batch/script.py index fceda260d9..35b110b895 100644 --- a/src/tasks/batch_integration/metrics/asw_batch/script.py +++ b/src/tasks/batch_integration/metrics/asw_batch/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import silhouette_batch @@ -11,14 +12,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.obsm["X_emb"] = input_integrated.obsm["X_emb"] +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = 
read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = silhouette_batch( - input_solution, + adata, batch_key='batch', label_key='label', embed='X_emb', @@ -27,9 +32,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml index 4fd0d7ac32..068381b9e3 100644 --- a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml @@ -24,9 +24,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/asw_label/script.py b/src/tasks/batch_integration/metrics/asw_label/script.py index 938efef5ac..01a7a2ad41 100644 --- a/src/tasks/batch_integration/metrics/asw_label/script.py +++ b/src/tasks/batch_integration/metrics/asw_label/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import silhouette @@ -12,14 +13,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) 
-input_solution.obsm["X_emb"] = input_integrated.obsm["X_emb"] +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = silhouette( - input_solution, + adata, label_key='label', embed='X_emb' ) @@ -27,9 +32,9 @@ print("Create output AnnData object", flush=True) output = ad.AnnData( uns={ - "dataset_id": input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - "method_id": input_integrated.uns['method_id'], + "dataset_id": adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + "method_id": adata.uns['method_id'], "metric_ids": [meta['functionality_name']], "metric_values": [score] } diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml index 1e8edd5ee7..3852029a60 100644 --- a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml @@ -33,9 +33,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py b/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py index 6114defd81..fa432a21c6 100644 --- a/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py +++ b/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import cell_cycle import numpy as np @@ -12,15 +13,27 @@ 
'functionality_name': 'foo' } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.X = input_solution.layers['normalized'] +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + obs='obs', + obsm='obsm', + uns='uns' +) print('Use gene symbols for features', flush=True) -input_solution.var_names = input_solution.var['feature_name'] -input_integrated.var_names = input_integrated.var['feature_name'] +adata_solution.var_names = adata_solution.var['feature_name'] translator = { "homo_sapiens": "human", @@ -28,13 +41,13 @@ } print('Compute score', flush=True) -if input_solution.uns['dataset_organism'] not in translator: +if adata_solution.uns['dataset_organism'] not in translator: score = np.nan else: - organism = translator[input_solution.uns['dataset_organism']] + organism = translator[adata_solution.uns['dataset_organism']] score = cell_cycle( - input_solution, - input_integrated, + adata_solution, + adata_integrated, batch_key='batch', embed='X_emb', organism=organism, @@ -43,9 +56,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata_solution.uns['dataset_id'], + 'normalization_id': adata_solution.uns['normalization_id'], + 'method_id': adata_integrated.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml index 
6fa7b9c9a9..8d92033e40 100644 --- a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml @@ -47,9 +47,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/script.py b/src/tasks/batch_integration/metrics/clustering_overlap/script.py index b92ecd66cb..7bb9e533c8 100644 --- a/src/tasks/batch_integration/metrics/clustering_overlap/script.py +++ b/src/tasks/batch_integration/metrics/clustering_overlap/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad import scanpy as sc from scib.metrics.clustering import cluster_optimal_resolution @@ -5,7 +6,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_graph.h5ad', + 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_graph.h5ad', 'output': 'output.h5ad', } @@ -14,36 +15,35 @@ } ## VIASH END -print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata -input_solution.obsp["connectivities"] = input_integrated.obsp["connectivities"] -input_solution.obsp["distances"] = input_integrated.obsp["distances"] -# TODO: if we don't copy neighbors over, the metric doesn't work -input_solution.uns["neighbors"] = input_integrated.uns["neighbors"] +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns
print('Run optimal Leiden clustering', flush=True) cluster_optimal_resolution( - adata=input_solution, + adata=adata, label_key='label', cluster_key='cluster', cluster_function=sc.tl.leiden, ) print('Compute ARI score', flush=True) -ari_score = ari(input_solution, group1='cluster', group2='label') +ari_score = ari(adata, cluster_key='cluster', label_key='label') print('Compute NMI score', flush=True) -nmi_score = nmi(input_solution, group1='cluster', group2='label') +nmi_score = nmi(adata, cluster_key='cluster', label_key='label') print("Create output AnnData object", flush=True) output = ad.AnnData( uns={ - "dataset_id": input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - "method_id": input_integrated.uns['method_id'], + "dataset_id": adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + "method_id": adata.uns['method_id'], "metric_ids": [ "ari", "nmi" ], "metric_values": [ ari_score, nmi_score ] } diff --git a/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml b/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml index 627f480e4c..6384feca62 100644 --- a/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml @@ -33,9 +33,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/graph_connectivity/script.py b/src/tasks/batch_integration/metrics/graph_connectivity/script.py index 35a1b2367c..ead8f146bc 100644 --- a/src/tasks/batch_integration/metrics/graph_connectivity/script.py +++ b/src/tasks/batch_integration/metrics/graph_connectivity/script.py @@ -1,3 +1,4 @@ +import sys 
import anndata as ad import scib @@ -11,28 +12,27 @@ } ## VIASH END -print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata -input_solution.obsp["connectivities"] = input_integrated.obsp["connectivities"] -input_solution.obsp["distances"] = input_integrated.obsp["distances"] -# TODO: if we don't copy neighbors over, the metric doesn't work -input_solution.uns["neighbors"] = input_integrated.uns["neighbors"] +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = scib.metrics.graph_connectivity( - input_solution, + adata, label_key='label' ) print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml index 1076f03619..a8025783d6 100644 --- a/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml @@ -32,9 +32,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: 
openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/hvg_overlap/script.py b/src/tasks/batch_integration/metrics/hvg_overlap/script.py index e3221765fd..b7d177e991 100644 --- a/src/tasks/batch_integration/metrics/hvg_overlap/script.py +++ b/src/tasks/batch_integration/metrics/hvg_overlap/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import hvg_overlap @@ -12,25 +13,39 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.X = input_solution.layers["normalized"] -input_integrated.X = input_integrated.layers["corrected_counts"] +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + X='layers/corrected_counts', + obs='obs', + var='var', + uns='uns' +) print('compute score', flush=True) score = hvg_overlap( - input_solution, - input_integrated, + adata_solution, + adata_integrated, batch_key="batch" ) print("Create output AnnData object", flush=True) output = ad.AnnData( uns={ - "dataset_id": input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - "method_id": input_integrated.uns['method_id'], + "dataset_id": adata_solution.uns['dataset_id'], + 'normalization_id': adata_solution.uns['normalization_id'], + "method_id": adata_integrated.uns['method_id'], "metric_ids": [meta['functionality_name']], "metric_values": [score] } diff --git a/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml b/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml index cf1702fb93..65e1970c4f 100644 --- a/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml +++ 
b/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml @@ -26,9 +26,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/isolated_label_asw/script.py b/src/tasks/batch_integration/metrics/isolated_label_asw/script.py index 176239665b..094937e687 100644 --- a/src/tasks/batch_integration/metrics/isolated_label_asw/script.py +++ b/src/tasks/batch_integration/metrics/isolated_label_asw/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import isolated_labels_asw @@ -12,15 +13,19 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.obsm["X_emb"] = input_integrated.obsm["X_emb"] +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = isolated_labels_asw( - input_solution, + adata, label_key='label', batch_key='batch', embed='X_emb', @@ -32,9 +37,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git 
a/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml b/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml index 4208e502ec..6b8f0703bf 100644 --- a/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml @@ -38,9 +38,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/isolated_label_f1/script.py b/src/tasks/batch_integration/metrics/isolated_label_f1/script.py index 8c89b98f8f..30fe25bccf 100644 --- a/src/tasks/batch_integration/metrics/isolated_label_f1/script.py +++ b/src/tasks/batch_integration/metrics/isolated_label_f1/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import isolated_labels_f1 @@ -12,19 +13,18 @@ } ## VIASH END -print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata -input_solution.obsp["connectivities"] = input_integrated.obsp["connectivities"] -input_solution.obsp["distances"] = input_integrated.obsp["distances"] -# TODO: if we don't copy neighbors over, the metric doesn't work -input_solution.uns["neighbors"] = input_integrated.uns["neighbors"] +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = isolated_labels_f1( - input_solution, + adata, label_key='label', batch_key='batch', embed=None, @@ 
-36,9 +36,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml b/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml index 39bd895680..aca556a8fc 100644 --- a/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml @@ -38,9 +38,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r github: theislab/kBET @@ -49,6 +51,7 @@ platforms: - scib==1.1.5 - rpy2>=3 - anndata2ri + - scipy<=1.13 - type: nextflow directives: label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/kbet/script.py b/src/tasks/batch_integration/metrics/kbet/script.py index 24cf8bdf69..9834f525d5 100644 --- a/src/tasks/batch_integration/metrics/kbet/script.py +++ b/src/tasks/batch_integration/metrics/kbet/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import kBET @@ -12,14 +13,18 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.obsm["X_emb"] = input_integrated.obsm["X_emb"] +adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') +adata.obs = 
read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute score', flush=True) score = kBET( - input_solution, + adata, batch_key="batch", label_key="label", type_="embed", @@ -32,9 +37,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml b/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml index 1687dc5c1c..750574f84a 100644 --- a/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml @@ -42,13 +42,15 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: - - git+https://github.com/theislab/scib.git@v1.1.4 + - git+https://github.com/theislab/scib.git@v1.1.5 - type: nextflow directives: label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/lisi/script.py b/src/tasks/batch_integration/metrics/lisi/script.py index bdc9ed4e1a..44181dab71 100644 --- a/src/tasks/batch_integration/metrics/lisi/script.py +++ b/src/tasks/batch_integration/metrics/lisi/script.py @@ -1,6 +1,7 @@ +import sys import numpy as np import anndata as ad -from scib.metrics.lisi import recompute_knn, lisi_graph_py +from scib.metrics.lisi import lisi_graph_py ## VIASH START par = { @@ -12,19 +13,18 @@ } ## VIASH END -print('Read 
input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata -input_solution.obsp["connectivities"] = input_integrated.obsp["connectivities"] -input_solution.obsp["distances"] = input_integrated.obsp["distances"] -# TODO: if we don't copy neighbors over, the metric doesn't work -input_solution.uns["neighbors"] = input_integrated.uns["neighbors"] +print('Read input', flush=True) +adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') +adata.obs = read_anndata(par['input_solution'], obs='obs').obs +adata.uns |= read_anndata(par['input_solution'], uns='uns').uns print('compute iLISI score...', flush=True) ilisi_scores = lisi_graph_py( - adata=input_solution, + adata=adata, obs_key='batch', n_neighbors=90, perplexity=None, @@ -33,11 +33,11 @@ verbose=False, ) ilisi = np.nanmedian(ilisi_scores) -ilisi = (ilisi - 1) / (input_solution.obs['batch'].nunique() - 1) +ilisi = (ilisi - 1) / (adata.obs['batch'].nunique() - 1) print('compute cLISI scores...', flush=True) clisi_scores = lisi_graph_py( - adata=input_solution, + adata=adata, obs_key='label', n_neighbors=90, perplexity=None, @@ -46,15 +46,15 @@ verbose=False, ) clisi = np.nanmedian(clisi_scores) -nlabs = input_solution.obs['label'].nunique() +nlabs = adata.obs['label'].nunique() clisi = (nlabs - clisi) / (nlabs - 1) print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': adata.uns['method_id'], 'metric_ids': [ 'ilisi', 'clisi' ], 'metric_values': [ ilisi, clisi ] } diff --git a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml 
b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml index 8644120657..d3391fb528 100644 --- a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml +++ b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml @@ -30,9 +30,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/metrics/pcr/script.py b/src/tasks/batch_integration/metrics/pcr/script.py index 392332963c..512b3dff6b 100644 --- a/src/tasks/batch_integration/metrics/pcr/script.py +++ b/src/tasks/batch_integration/metrics/pcr/script.py @@ -1,3 +1,4 @@ +import sys import anndata as ad from scib.metrics import pcr_comparison @@ -12,15 +13,31 @@ } ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_integrated = ad.read_h5ad(par['input_integrated']) -input_solution.X = input_solution.layers['normalized'] +adata_solution = read_anndata( + par['input_solution'], + X='layers/normalized', + obs='obs', + var='var', + # obsm='obsm', + # varm='varm', + uns='uns' +) +adata_integrated = read_anndata( + par['input_integrated'], + obs='obs', + obsm='obsm', + uns='uns' +) print('compute score', flush=True) score = pcr_comparison( - input_solution, - input_integrated, + adata_solution, + adata_integrated, embed='X_emb', covariate='batch', verbose=False @@ -29,9 +46,9 @@ print('Create output AnnData object', flush=True) output = ad.AnnData( uns={ - 'dataset_id': input_solution.uns['dataset_id'], - 'normalization_id': input_solution.uns['normalization_id'], - 'method_id': input_integrated.uns['method_id'], + 'dataset_id': adata_solution.uns['dataset_id'], + 'normalization_id': 
adata_solution.uns['normalization_id'], + 'method_id': adata_integrated.uns['method_id'], 'metric_ids': [ meta['functionality_name'] ], 'metric_values': [ score ] } diff --git a/src/tasks/batch_integration/process_dataset/config.vsh.yaml b/src/tasks/batch_integration/process_dataset/config.vsh.yaml index 0dbe5f3bcd..73ea5815c3 100644 --- a/src/tasks/batch_integration/process_dataset/config.vsh.yaml +++ b/src/tasks/batch_integration/process_dataset/config.vsh.yaml @@ -8,7 +8,7 @@ functionality: - path: /src/common/helper_functions/subset_anndata.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml b/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml index 7f881da214..e841081a91 100644 --- a/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml +++ b/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml @@ -9,12 +9,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - pypi: scanpy + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/transformers/embed_to_graph/script.py b/src/tasks/batch_integration/transformers/embed_to_graph/script.py index 1731e82066..74166eb77c 100644 --- a/src/tasks/batch_integration/transformers/embed_to_graph/script.py +++ b/src/tasks/batch_integration/transformers/embed_to_graph/script.py @@ -1,4 +1,4 @@ -import yaml +import sys import scanpy as sc ## VIASH START @@ -6,12 +6,27 @@ 'input': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', 'ouput': 'output.h5ad' } + +meta = { + 'functionality': 
'foo', + 'config': 'bar' +} ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata = sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + obs='obs', + obsm='obsm', + uns='uns' +) + -print('Run kNN', flush=True) +print('Run kNN...', flush=True) sc.pp.neighbors(adata, use_rep='X_emb') print("Store outputs", flush=True) diff --git a/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml b/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml index 8ec4da8170..e08013c63b 100644 --- a/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml +++ b/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml @@ -10,12 +10,11 @@ functionality: resources: - type: python_script path: script.py + - type: python_script + path: /src/common/helper_functions/read_anndata_partial.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 - setup: - - type: python - pypi: scanpy + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/transformers/feature_to_embed/script.py b/src/tasks/batch_integration/transformers/feature_to_embed/script.py index f7793bb153..0e022db8b1 100644 --- a/src/tasks/batch_integration/transformers/feature_to_embed/script.py +++ b/src/tasks/batch_integration/transformers/feature_to_embed/script.py @@ -1,22 +1,38 @@ +import sys import scanpy as sc -import yaml ## VIASH START par = { 'input': 'resources_test/batch_integration/pancreas/integrated_feature.h5ad', 'ouput': 'output.h5ad' } + +meta = { + 'functionality': 'foo', + 'config': 'bar' +} + ## VIASH END +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + print('Read input', flush=True) -adata= sc.read_h5ad(par['input']) +adata = read_anndata( + par['input'], + X='layers/corrected_counts', + 
obs='obs', + var='var', + uns='uns' +) print('Run PCA', flush=True) adata.obsm['X_emb'] = sc.pp.pca( - adata.layers["corrected_counts"], + adata.X, n_comps=50, - use_highly_variable=False, + use_highly_variable=False, # Do we want to set this to True? svd_solver='arpack', return_info=False ) diff --git a/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml b/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml index ec3ed2469d..64a35f9986 100644 --- a/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml +++ b/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml @@ -16,7 +16,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml b/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml index c36581e205..b16862360b 100644 --- a/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml +++ b/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml @@ -16,7 +16,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/tasks/denoising/methods/alra/config.vsh.yaml b/src/tasks/denoising/methods/alra/config.vsh.yaml index 1354b5c00c..374d317fce 100644 --- a/src/tasks/denoising/methods/alra/config.vsh.yaml +++ b/src/tasks/denoising/methods/alra/config.vsh.yaml @@ -33,7 +33,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ Matrix, rsvd ] diff --git a/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml 
b/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml index 975d729990..b0c55ae0d8 100644 --- a/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml +++ b/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml @@ -29,7 +29,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/denoising/methods/magic/config.vsh.yaml b/src/tasks/denoising/methods/magic/config.vsh.yaml index 0d2ff14c98..380666a1b5 100644 --- a/src/tasks/denoising/methods/magic/config.vsh.yaml +++ b/src/tasks/denoising/methods/magic/config.vsh.yaml @@ -54,7 +54,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pip: [scprep, magic-impute, scipy, scikit-learn<1.2] diff --git a/src/tasks/denoising/methods/saver/config.vsh.yaml b/src/tasks/denoising/methods/saver/config.vsh.yaml index fcd3e4b88a..3c997fc36f 100644 --- a/src/tasks/denoising/methods/saver/config.vsh.yaml +++ b/src/tasks/denoising/methods/saver/config.vsh.yaml @@ -23,7 +23,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r github: mohuangx/SAVER diff --git a/src/tasks/denoising/metrics/mse/config.vsh.yaml b/src/tasks/denoising/metrics/mse/config.vsh.yaml index 3260c2694a..8330a8de31 100644 --- a/src/tasks/denoising/metrics/mse/config.vsh.yaml +++ b/src/tasks/denoising/metrics/mse/config.vsh.yaml @@ -19,7 +19,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/denoising/metrics/poisson/config.vsh.yaml b/src/tasks/denoising/metrics/poisson/config.vsh.yaml index 5d239b0c92..e523a9306e 100644 --- 
a/src/tasks/denoising/metrics/poisson/config.vsh.yaml +++ b/src/tasks/denoising/metrics/poisson/config.vsh.yaml @@ -19,7 +19,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pip: scprep diff --git a/src/tasks/denoising/process_dataset/config.vsh.yaml b/src/tasks/denoising/process_dataset/config.vsh.yaml index 6c30e6ab12..c9b5b06c1a 100644 --- a/src/tasks/denoising/process_dataset/config.vsh.yaml +++ b/src/tasks/denoising/process_dataset/config.vsh.yaml @@ -26,7 +26,7 @@ functionality: - path: helper.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml index cc5a4be442..6c0d36ad44 100644 --- a/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml @@ -16,7 +16,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, highmem, highcpu] \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml index 8e409f7bbc..b3ae5aa95b 100644 --- a/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml @@ -29,7 +29,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: 
openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml index 43f660d34d..a83d393072 100644 --- a/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml @@ -16,7 +16,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml index 778ee410bb..ff5764a561 100644 --- a/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml @@ -33,7 +33,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml index e8ad2b1f99..ced082c708 100644 --- a/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml @@ -22,7 +22,7 @@ functionality: default: 3 platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r bioc: destiny diff --git a/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml index 9f72001546..aa3c5ca0b4 100644 --- a/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml +++ 
b/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml @@ -33,7 +33,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml index 98a7cd53a4..2b651271a9 100644 --- a/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml @@ -35,7 +35,7 @@ functionality: platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ Matrix, lmds ] diff --git a/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml index 130cd7faf2..0d3d0234c4 100644 --- a/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml @@ -44,7 +44,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml index ec86b6725f..11d3841fb6 100644 --- a/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml @@ -31,7 +31,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: scanpy diff --git a/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml index 1c2dd68e2b..ff63659780 
100644 --- a/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml @@ -46,7 +46,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml index 1948fd51ea..2f733bb714 100644 --- a/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml @@ -32,7 +32,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: pymde diff --git a/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml index 210801ac0e..ba4b7b3b84 100644 --- a/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml @@ -45,7 +45,7 @@ functionality: platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r packages: [ grDevices ] diff --git a/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml index 9dbc917e0f..cedaba0484 100644 --- a/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml @@ -35,7 +35,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: apt packages: diff --git a/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml 
b/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml index 6b4b222fef..a073e9dbe3 100644 --- a/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml @@ -39,7 +39,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/metrics/clustering_performance/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/clustering_performance/config.vsh.yaml index 56757c0a02..67f1078f13 100644 --- a/src/tasks/dimensionality_reduction/metrics/clustering_performance/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/metrics/clustering_performance/config.vsh.yaml @@ -51,7 +51,7 @@ functionality: platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: [ scikit-learn, scanpy, leidenalg ] diff --git a/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml index b7a82fe3d7..6787e88f7e 100644 --- a/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml @@ -157,7 +157,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ coRanking ] diff --git a/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml index c421f6a479..4b1e9f3a32 100644 --- a/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml @@ -30,7 +30,7 @@ 
functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml index 44ff1950e5..b08c93db2c 100644 --- a/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml @@ -36,7 +36,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml index 2e66527bd4..5f75fa8e26 100644 --- a/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml @@ -20,7 +20,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml b/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml index 292318947d..d6f62e0c7e 100644 --- a/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml +++ b/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml @@ -7,7 +7,7 @@ functionality: - path: /src/common/helper_functions/subset_anndata.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, highmem, highcpu] diff --git 
a/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml b/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml index 641cc89b71..8f0915a1dd 100644 --- a/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml +++ b/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml @@ -16,7 +16,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml b/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml index 52f4c6a072..728157a644 100644 --- a/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml +++ b/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml @@ -16,7 +16,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: scanpy diff --git a/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml b/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml index c464a3a9dc..ec536fcc7d 100644 --- a/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml +++ b/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml @@ -16,7 +16,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/label_projection/methods/knn/config.vsh.yaml b/src/tasks/label_projection/methods/knn/config.vsh.yaml index a083f1ff20..499fa69e81 100644 --- a/src/tasks/label_projection/methods/knn/config.vsh.yaml +++ b/src/tasks/label_projection/methods/knn/config.vsh.yaml @@ 
-28,7 +28,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: [scikit-learn, jsonschema] diff --git a/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml b/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml index 2497d5c803..88f4c2d5af 100644 --- a/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml +++ b/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml @@ -25,7 +25,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: scikit-learn diff --git a/src/tasks/label_projection/methods/mlp/config.vsh.yaml b/src/tasks/label_projection/methods/mlp/config.vsh.yaml index 944ed6e4f7..9c7e92fc68 100644 --- a/src/tasks/label_projection/methods/mlp/config.vsh.yaml +++ b/src/tasks/label_projection/methods/mlp/config.vsh.yaml @@ -38,7 +38,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: scikit-learn diff --git a/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml b/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml index 2a09c7fa5d..90f6e72a52 100644 --- a/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml +++ b/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml @@ -24,7 +24,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: scikit-learn diff --git a/src/tasks/label_projection/methods/scanvi/config.vsh.yaml b/src/tasks/label_projection/methods/scanvi/config.vsh.yaml index b271878a5c..6c36ead072 100644 --- 
a/src/tasks/label_projection/methods/scanvi/config.vsh.yaml +++ b/src/tasks/label_projection/methods/scanvi/config.vsh.yaml @@ -32,7 +32,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 + image: openproblems/base_pytorch_nvidia:1.0.0 setup: - type: python packages: diff --git a/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml b/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml index ddd18f4e91..ccf2f449b4 100644 --- a/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml +++ b/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml @@ -41,7 +41,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 + image: openproblems/base_pytorch_nvidia:1.0.0 setup: - type: python pypi: scvi-tools>=1.1.0 diff --git a/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml b/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml index 2bdac1c370..d51b532917 100644 --- a/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml +++ b/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml @@ -27,7 +27,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ Matrix>=1.5.3, Seurat, rlang ] diff --git a/src/tasks/label_projection/methods/xgboost/config.vsh.yaml b/src/tasks/label_projection/methods/xgboost/config.vsh.yaml index d0892b9e8c..516308fbdd 100644 --- a/src/tasks/label_projection/methods/xgboost/config.vsh.yaml +++ b/src/tasks/label_projection/methods/xgboost/config.vsh.yaml @@ -25,7 +25,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: xgboost diff --git 
a/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml b/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml index cad211fd77..8fc7021ffa 100644 --- a/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml +++ b/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml @@ -19,7 +19,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: scikit-learn diff --git a/src/tasks/label_projection/metrics/f1/config.vsh.yaml b/src/tasks/label_projection/metrics/f1/config.vsh.yaml index d059a236bb..f5abc0caa6 100644 --- a/src/tasks/label_projection/metrics/f1/config.vsh.yaml +++ b/src/tasks/label_projection/metrics/f1/config.vsh.yaml @@ -41,7 +41,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: scikit-learn diff --git a/src/tasks/label_projection/process_dataset/config.vsh.yaml b/src/tasks/label_projection/process_dataset/config.vsh.yaml index 63724de9cd..aa010876cb 100644 --- a/src/tasks/label_projection/process_dataset/config.vsh.yaml +++ b/src/tasks/label_projection/process_dataset/config.vsh.yaml @@ -25,7 +25,7 @@ functionality: - path: /src/common/helper_functions/subset_anndata.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [highmem, midcpu , midtime] diff --git a/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml b/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml index efe3dd5e42..8c021c3bdf 100644 --- a/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml +++ b/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml @@ -15,7 +15,7 @@ functionality: path: script.py platforms: - type: 
docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml b/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml index 0fefd7f49b..bc897dd821 100644 --- a/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml +++ b/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml @@ -15,7 +15,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml b/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml index db343218b7..4e143ec67b 100644 --- a/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml +++ b/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml @@ -20,7 +20,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r bioc: batchelor diff --git a/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml b/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml index 466c26d160..3146db56e0 100644 --- a/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml +++ b/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml @@ -27,7 +27,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python github: diff --git a/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml b/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml index e0c95afcc2..db7b49383b 100644 --- 
a/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml +++ b/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml @@ -19,7 +19,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python pypi: diff --git a/src/tasks/match_modalities/methods/scot/config.vsh.yaml b/src/tasks/match_modalities/methods/scot/config.vsh.yaml index 5ea1a45164..e86fe4438a 100644 --- a/src/tasks/match_modalities/methods/scot/config.vsh.yaml +++ b/src/tasks/match_modalities/methods/scot/config.vsh.yaml @@ -19,7 +19,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: apt packages: git diff --git a/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml b/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml index d629f20c38..e7067a20b5 100644 --- a/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml +++ b/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml @@ -25,7 +25,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/match_modalities/metrics/mse/config.vsh.yaml b/src/tasks/match_modalities/metrics/mse/config.vsh.yaml index a6fadd43f1..b1dfc15746 100644 --- a/src/tasks/match_modalities/metrics/mse/config.vsh.yaml +++ b/src/tasks/match_modalities/metrics/mse/config.vsh.yaml @@ -20,11 +20,11 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: - - numpy + - numpy<2 - scipy - scprep - type: nextflow diff --git a/src/tasks/match_modalities/process_dataset/config.vsh.yaml b/src/tasks/match_modalities/process_dataset/config.vsh.yaml index 
4560b350b3..35dc757809 100644 --- a/src/tasks/match_modalities/process_dataset/config.vsh.yaml +++ b/src/tasks/match_modalities/process_dataset/config.vsh.yaml @@ -12,7 +12,7 @@ functionality: - path: /src/common/helper_functions/subset_anndata.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [highmem, midcpu , midtime] diff --git a/src/tasks/predict_modality/api/comp_method_predict.yaml b/src/tasks/predict_modality/api/comp_method_predict.yaml index ebd56aed51..a43cd1e5c5 100644 --- a/src/tasks/predict_modality/api/comp_method_predict.yaml +++ b/src/tasks/predict_modality/api/comp_method_predict.yaml @@ -11,11 +11,11 @@ functionality: - name: "--input_train_mod1" __merge__: file_train_mod1.yaml direction: input - required: true + required: false - name: "--input_train_mod2" __merge__: file_train_mod2.yaml direction: input - required: true + required: false - name: "--input_test_mod1" __merge__: file_test_mod1.yaml direction: input diff --git a/src/tasks/predict_modality/api/file_common_dataset_mod1.yaml b/src/tasks/predict_modality/api/file_common_dataset_mod1.yaml index c82e0be026..4824a05c46 100644 --- a/src/tasks/predict_modality/api/file_common_dataset_mod1.yaml +++ b/src/tasks/predict_modality/api/file_common_dataset_mod1.yaml @@ -44,6 +44,16 @@ info: name: hvg_score description: A score for the feature indicating how highly variable it is. required: true + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A ranking of the features by hvg. 
+ required: true uns: - type: string name: dataset_id diff --git a/src/tasks/predict_modality/api/file_common_dataset_mod2.yaml b/src/tasks/predict_modality/api/file_common_dataset_mod2.yaml index 1a447b24c0..e0b1b3bae9 100644 --- a/src/tasks/predict_modality/api/file_common_dataset_mod2.yaml +++ b/src/tasks/predict_modality/api/file_common_dataset_mod2.yaml @@ -44,6 +44,16 @@ info: name: hvg_score description: A score for the feature indicating how highly variable it is. required: true + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A ranking of the features by hvg. + required: true uns: - type: string name: dataset_id diff --git a/src/tasks/predict_modality/api/task_info.yaml b/src/tasks/predict_modality/api/task_info.yaml index ba3b567d01..e0d1ed9da7 100644 --- a/src/tasks/predict_modality/api/task_info.yaml +++ b/src/tasks/predict_modality/api/task_info.yaml @@ -53,4 +53,15 @@ authors: roles: [ contributor ] info: email: dengkw@umich.edu - github: nonztalk \ No newline at end of file + github: nonztalk + - name: Xueer Chen + roles: [ contributor ] + info: + github: xuerchen + email: xc2579@columbia.edu + - name: Jiwei Liu + roles: [ contributor ] + info: + github: daxiongshu + email: jiweil@nvidia.com + orcid: "0000-0002-8799-9763" diff --git a/src/tasks/predict_modality/control_methods/meanpergene/config.vsh.yaml b/src/tasks/predict_modality/control_methods/meanpergene/config.vsh.yaml index 87696bd678..9521b90508 100644 --- a/src/tasks/predict_modality/control_methods/meanpergene/config.vsh.yaml +++ b/src/tasks/predict_modality/control_methods/meanpergene/config.vsh.yaml @@ -10,7 +10,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git 
a/src/tasks/predict_modality/control_methods/random_predict/config.vsh.yaml b/src/tasks/predict_modality/control_methods/random_predict/config.vsh.yaml index d36e02d3c0..3324c53a91 100644 --- a/src/tasks/predict_modality/control_methods/random_predict/config.vsh.yaml +++ b/src/tasks/predict_modality/control_methods/random_predict/config.vsh.yaml @@ -10,7 +10,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/control_methods/solution/config.vsh.yaml b/src/tasks/predict_modality/control_methods/solution/config.vsh.yaml index 64cf880a97..350b0e79ea 100644 --- a/src/tasks/predict_modality/control_methods/solution/config.vsh.yaml +++ b/src/tasks/predict_modality/control_methods/solution/config.vsh.yaml @@ -10,7 +10,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/control_methods/zeros/config.vsh.yaml b/src/tasks/predict_modality/control_methods/zeros/config.vsh.yaml index d557556c32..344df9c338 100644 --- a/src/tasks/predict_modality/control_methods/zeros/config.vsh.yaml +++ b/src/tasks/predict_modality/control_methods/zeros/config.vsh.yaml @@ -10,7 +10,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/methods/guanlab_dengkw_pm/config.vsh.yaml b/src/tasks/predict_modality/methods/guanlab_dengkw_pm/config.vsh.yaml index a81ed56cb3..8663123ad9 100644 --- a/src/tasks/predict_modality/methods/guanlab_dengkw_pm/config.vsh.yaml +++ 
b/src/tasks/predict_modality/methods/guanlab_dengkw_pm/config.vsh.yaml @@ -31,7 +31,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 setup: - type: python packages: diff --git a/src/tasks/predict_modality/methods/knnr_py/config.vsh.yaml b/src/tasks/predict_modality/methods/knnr_py/config.vsh.yaml index 4ef88dd62b..543ee71fa1 100644 --- a/src/tasks/predict_modality/methods/knnr_py/config.vsh.yaml +++ b/src/tasks/predict_modality/methods/knnr_py/config.vsh.yaml @@ -27,7 +27,7 @@ functionality: path: script.py platforms: - type: docker - image: ghcr.io/openproblems-bio/base_python:1.0.4 + image: openproblems/base_python:1.0.0 - type: nextflow directives: label: [hightime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/methods/knnr_r/config.vsh.yaml b/src/tasks/predict_modality/methods/knnr_r/config.vsh.yaml index cda809b109..448b3ca0b8 100644 --- a/src/tasks/predict_modality/methods/knnr_r/config.vsh.yaml +++ b/src/tasks/predict_modality/methods/knnr_r/config.vsh.yaml @@ -27,7 +27,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ lmds, FNN, proxyC] diff --git a/src/tasks/predict_modality/methods/lm/config.vsh.yaml b/src/tasks/predict_modality/methods/lm/config.vsh.yaml index 98d3268abd..3fdbc0f243 100644 --- a/src/tasks/predict_modality/methods/lm/config.vsh.yaml +++ b/src/tasks/predict_modality/methods/lm/config.vsh.yaml @@ -23,7 +23,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ lmds, RcppArmadillo, pbapply] diff --git a/src/tasks/predict_modality/methods/lmds_irlba_rf/config.vsh.yaml b/src/tasks/predict_modality/methods/lmds_irlba_rf/config.vsh.yaml index 37906c9c96..ba86f0631e 100644 --- 
a/src/tasks/predict_modality/methods/lmds_irlba_rf/config.vsh.yaml +++ b/src/tasks/predict_modality/methods/lmds_irlba_rf/config.vsh.yaml @@ -28,7 +28,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [lmds, ranger, pbapply, irlba] diff --git a/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml b/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml index 1e01b4fc6a..385f1234bb 100644 --- a/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml +++ b/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml @@ -32,7 +32,7 @@ functionality: path: script.R platforms: - type: docker - image: ghcr.io/openproblems-bio/base_r:1.0.4 + image: openproblems/base_r:1.0.0 setup: - type: r cran: [ lmds, FNN, proxy, proxyC ] diff --git a/src/tasks/predict_modality/methods/novel/helper_functions.py b/src/tasks/predict_modality/methods/novel/helper_functions.py new file mode 100644 index 0000000000..17c57c9b3b --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/helper_functions.py @@ -0,0 +1,247 @@ +import torch + +from torch import nn +import torch.nn.functional as F + +from torch.utils.data import Dataset + +from typing import Optional + +import anndata +import numpy as np +import pandas as pd +import scipy.sparse +import sklearn.decomposition +import sklearn.feature_extraction.text +import sklearn.preprocessing +import sklearn.neighbors +import sklearn.utils.extmath + +class tfidfTransformer(): + def __init__(self): + self.idf = None + self.fitted = False + + def fit(self, X): + self.idf = X.shape[0] / X.sum(axis=0) + self.fitted = True + + def transform(self, X): + if not self.fitted: + raise RuntimeError('Transformer was not fitted on any data') + if scipy.sparse.issparse(X): + tf = X.multiply(1 / X.sum(axis=1)) + return tf.multiply(self.idf) + else: + tf = X / X.sum(axis=1, keepdims=True) + return tf * self.idf + + 
def fit_transform(self, X): + self.fit(X) + return self.transform(X) + +class lsiTransformer(): + def __init__(self, + n_components: int = 20, + use_highly_variable = None + ): + self.n_components = n_components + self.use_highly_variable = use_highly_variable + self.tfidfTransformer = tfidfTransformer() + self.normalizer = sklearn.preprocessing.Normalizer(norm="l1") + self.pcaTransformer = sklearn.decomposition.TruncatedSVD(n_components = self.n_components, random_state=777) + # self.lsi_mean = None + # self.lsi_std = None + self.fitted = None + + def fit(self, adata: anndata.AnnData): + if self.use_highly_variable is None: + self.use_highly_variable = "hvg" in adata.var + adata_use = adata[:, adata.var["hvg"]] if self.use_highly_variable else adata + X = self.tfidfTransformer.fit_transform(adata_use.X) + X_norm = self.normalizer.fit_transform(X) + X_norm = np.log1p(X_norm * 1e4) + X_lsi = self.pcaTransformer.fit_transform(X_norm) + # self.lsi_mean = X_lsi.mean(axis=1, keepdims=True) + # self.lsi_std = X_lsi.std(axis=1, ddof=1, keepdims=True) + self.fitted = True + + def transform(self, adata): + if not self.fitted: + raise RuntimeError('Transformer was not fitted on any data') + adata_use = adata[:, adata.var["hvg"]] if self.use_highly_variable else adata + X = self.tfidfTransformer.transform(adata_use.X) + X_norm = self.normalizer.transform(X) + X_norm = np.log1p(X_norm * 1e4) + X_lsi = self.pcaTransformer.transform(X_norm) + X_lsi -= X_lsi.mean(axis=1, keepdims=True) + X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True) + lsi_df = pd.DataFrame(X_lsi, index = adata_use.obs_names) + return lsi_df + + def fit_transform(self, adata): + self.fit(adata) + return self.transform(adata) + +class ModalityMatchingDataset(Dataset): + def __init__( + self, df_modality1, df_modality2, is_train=True + ): + super().__init__() + self.df_modality1 = df_modality1 + self.df_modality2 = df_modality2 + self.is_train = is_train + def __len__(self): + return self.df_modality1.shape[0] + 
+ def __getitem__(self, index: int): + if self.is_train == True: + x = self.df_modality1.iloc[index].values + y = self.df_modality2.iloc[index].values + return x, y + else: + x = self.df_modality1.iloc[index].values + return x + +class Swish(torch.autograd.Function): + @staticmethod + def forward(ctx, i): + result = i * sigmoid(i) + ctx.save_for_backward(i) + return result + @staticmethod + def backward(ctx, grad_output): + i = ctx.saved_variables[0] + sigmoid_i = sigmoid(i) + return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) + +class Swish_module(nn.Module): + def forward(self, x): + return Swish.apply(x) + +sigmoid = torch.nn.Sigmoid() + +class ModelRegressionGex2Atac(nn.Module): + def __init__(self, dim_mod1, dim_mod2): + super(ModelRegressionGex2Atac, self).__init__() + #self.bn = torch.nn.BatchNorm1d(1024) + self.input_ = nn.Linear(dim_mod1, 1024) + self.fc = nn.Linear(1024, 256) + self.fc1 = nn.Linear(256, 2048) + self.dropout1 = nn.Dropout(p=0.298885630228993) + self.dropout2 = nn.Dropout(p=0.11289717442776658) + self.dropout3 = nn.Dropout(p=0.13523634924414762) + self.output = nn.Linear(2048, dim_mod2) + def forward(self, x): + x = F.gelu(self.input_(x)) + x = self.dropout1(x) + x = F.gelu(self.fc(x)) + x = self.dropout2(x) + x = F.gelu(self.fc1(x)) + x = self.dropout3(x) + x = F.gelu(self.output(x)) + return x + +class ModelRegressionAtac2Gex(nn.Module): # + def __init__(self, dim_mod1, dim_mod2): + super(ModelRegressionAtac2Gex, self).__init__() + self.input_ = nn.Linear(dim_mod1, 2048) + self.fc = nn.Linear(2048, 2048) + self.fc1 = nn.Linear(2048, 512) + self.dropout1 = nn.Dropout(p=0.2649138776004753) + self.dropout2 = nn.Dropout(p=0.1769628308148758) + self.dropout3 = nn.Dropout(p=0.2516791883012817) + self.output = nn.Linear(512, dim_mod2) + def forward(self, x): + x = F.gelu(self.input_(x)) + x = self.dropout1(x) + x = F.gelu(self.fc(x)) + x = self.dropout2(x) + x = F.gelu(self.fc1(x)) + x = self.dropout3(x) + x = F.gelu(self.output(x)) + 
return x + +class ModelRegressionAdt2Gex(nn.Module): + def __init__(self, dim_mod1, dim_mod2): + super(ModelRegressionAdt2Gex, self).__init__() + self.input_ = nn.Linear(dim_mod1, 512) + self.dropout1 = nn.Dropout(p=0.0) + self.swish = Swish_module() + self.fc = nn.Linear(512, 512) + self.fc1 = nn.Linear(512, 512) + self.fc2 = nn.Linear(512, 512) + self.output = nn.Linear(512, dim_mod2) + def forward(self, x): + x = F.gelu(self.input_(x)) + x = F.gelu(self.fc(x)) + x = F.gelu(self.fc1(x)) + x = F.gelu(self.fc2(x)) + x = F.gelu(self.output(x)) + return x + +class ModelRegressionGex2Adt(nn.Module): + def __init__(self, dim_mod1, dim_mod2): + super(ModelRegressionGex2Adt, self).__init__() + self.input_ = nn.Linear(dim_mod1, 512) + self.dropout1 = nn.Dropout(p=0.20335661386636347) + self.dropout2 = nn.Dropout(p=0.15395289261127876) + self.dropout3 = nn.Dropout(p=0.16902655078832815) + self.fc = nn.Linear(512, 512) + self.fc1 = nn.Linear(512, 2048) + self.output = nn.Linear(2048, dim_mod2) + def forward(self, x): + # x = self.batchswap_noise(x) + x = F.gelu(self.input_(x)) + x = self.dropout1(x) + x = F.gelu(self.fc(x)) + x = self.dropout2(x) + x = F.gelu(self.fc1(x)) + x = self.dropout3(x) + x = F.gelu(self.output(x)) + return x + +def rmse(y, y_pred): + return np.sqrt(np.mean(np.square(y - y_pred))) + +def train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, name_model, device): + best_score = 100000 + for i in range(100): + train_losses = [] + test_losses = [] + model.train() + + for x, y in dataloader_train: + optimizer.zero_grad() + output = model(x.float().to(device)) + loss = torch.sqrt(loss_fn(output, y.float().to(device))) + loss.backward() + train_losses.append(loss.item()) + optimizer.step() + + model.eval() + with torch.no_grad(): + for x, y in dataloader_test: + output = model(x.float().to(device)) + output[output<0] = 0.0 + loss = torch.sqrt(loss_fn(output, y.float().to(device))) + test_losses.append(loss.item()) + + outputs = [] + 
targets = [] + model.eval() + with torch.no_grad(): + for x, y in dataloader_test: + output = model(x.float().to(device)) + + outputs.append(output.detach().cpu().numpy()) + targets.append(y.float().detach().cpu().numpy()) + cat_outputs = np.concatenate(outputs) + cat_targets = np.concatenate(targets) + cat_outputs[cat_outputs<0.0] = 0 + + if best_score > rmse(cat_targets,cat_outputs): + torch.save(model.state_dict(), name_model) + best_score = rmse(cat_targets,cat_outputs) + print("best rmse: ", best_score) + diff --git a/src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml b/src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml new file mode 100644 index 0000000000..72e3292407 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../../api/comp_method_predict.yaml +functionality: + name: novel_predict + arguments: + - name: "--input_transform" + type: file + direction: input + required: false + example: "lsi_transformer.pickle" + resources: + - type: python_script + path: script.py + - path: ../helper_functions.py +platforms: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + packages: + - scikit-learn + - networkx + - type: nextflow + directives: + label: [highmem, hightime, midcpu, highsharedmem, gpu] + diff --git a/src/tasks/predict_modality/methods/novel/predict/run_test.sh b/src/tasks/predict_modality/methods/novel/predict/run_test.sh new file mode 100644 index 0000000000..af5550e5d7 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/predict/run_test.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +viash run src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml -- \ + --input_train_mod2 'resources/predict_modality/datasets/openproblems_neurips2021/bmmc_cite/normal/log_cp10k/train_mod2.h5ad' \ + --input_test_mod1 'resources/predict_modality/datasets/openproblems_neurips2021/bmmc_cite/normal/log_cp10k/test_mod1.h5ad' \ + 
--input_model output/novel/model.pt \ + --input_transform output/novel/lsi_transform.pickle \ + --output 'output/novel/novel_test.h5ad' \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/predict/script.py b/src/tasks/predict_modality/methods/novel/predict/script.py new file mode 100644 index 0000000000..5f336ce7b0 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/predict/script.py @@ -0,0 +1,119 @@ +import sys +import torch +from torch.utils.data import DataLoader + +import anndata as ad +import pickle +import numpy as np +from scipy.sparse import csc_matrix + +#check gpu available +if (torch.cuda.is_available()): + device = 'cuda:0' #switch to current device + print('current device: gpu', flush=True) +else: + device = 'cpu' + print('current device: cpu', flush=True) + + +## VIASH START + +par = { + 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad', + 'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad', + 'input_model': 'resources_test/predict_modality/neurips2021_bmmc_cite/model.pt', + 'input_transform': 'transformer.pickle' +} +meta = { + 'resources_dir': 'src/tasks/predict_modality/methods/novel', + 'functionality_name': '171129' +} +## VIASH END + +sys.path.append(meta['resources_dir']) +from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac, ModalityMatchingDataset + +print("Load data", flush=True) + +input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) +input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) + +mod1 = input_test_mod1.uns['modality'] +mod2 = input_train_mod2.uns['modality'] + +n_vars_mod1 = input_train_mod2.uns["model_dim"]["mod1"] +n_vars_mod2 = input_train_mod2.uns["model_dim"]["mod2"] + +input_test_mod1.X = input_test_mod1.layers['normalized'].tocsr() + +# Remove vars that were removed from training set. 
Mostlyy only applicable for testing. +if input_train_mod2.uns.get("removed_vars"): + rem_var = input_train_mod2.uns["removed_vars"] + input_test_mod1 = input_test_mod1[:, ~input_test_mod1.var_names.isin(rem_var)] + +del input_train_mod2 + + +model_fp = par['input_model'] + +print("Start predict", flush=True) + +if mod1 == 'GEX' and mod2 == 'ADT': + model = ModelRegressionGex2Adt(n_vars_mod1,n_vars_mod2) + weight = torch.load(model_fp, map_location='cpu') + with open(par['input_transform'], 'rb') as f: + lsi_transformer_gex = pickle.load(f) + + model.load_state_dict(weight) + input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) + +elif mod1 == 'GEX' and mod2 == 'ATAC': + model = ModelRegressionGex2Atac(n_vars_mod1,n_vars_mod2) + weight = torch.load(model_fp, map_location='cpu') + with open(par['input_transform'], 'rb') as f: + lsi_transformer_gex = pickle.load(f) + + model.load_state_dict(weight) + input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) + +elif mod1 == 'ATAC' and mod2 == 'GEX': + model = ModelRegressionAtac2Gex(n_vars_mod1,n_vars_mod2) + weight = torch.load(model_fp, map_location='cpu') + with open(par['input_transform'], 'rb') as f: + lsi_transformer_gex = pickle.load(f) + + model.load_state_dict(weight) + input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) + +elif mod1 == 'ADT' and mod2 == 'GEX': + model = ModelRegressionAdt2Gex(n_vars_mod1,n_vars_mod2) + weight = torch.load(model_fp, map_location='cpu') + + model.load_state_dict(weight) + input_test_mod1_ = input_test_mod1.to_df() + +dataset_test = ModalityMatchingDataset(input_test_mod1_, None, is_train=False) +dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4) + +outputs = [] +model.eval() +with torch.no_grad(): + for x in dataloader_test: + output = model(x.float()) + outputs.append(output.detach().cpu().numpy()) + +outputs = np.concatenate(outputs) +outputs[outputs<0] = 0 +outputs = csc_matrix(outputs) + +adata = ad.AnnData( + 
layers={"normalized": outputs}, + shape=outputs.shape, + uns={ + 'dataset_id': input_test_mod1.uns['dataset_id'], + 'method_id': meta['functionality_name'], + }, +) +adata.write_h5ad(par['output'], compression = "gzip") + + diff --git a/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml b/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml new file mode 100644 index 0000000000..682782e059 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml @@ -0,0 +1,21 @@ +__merge__: ../../../api/comp_method.yaml +functionality: + name: novel + info: + label: Novel + summary: A method using encoder-decoder MLP model + description: This method trains an encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework. + documentation_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/novel#readme + repository_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/novel + reference: pmlr-v176-lance2022multimodal + submission_id: "169769" + preferred_normalization: log_cp10k + resources: + - path: main.nf + type: nextflow_script + entrypoint: run_wf + dependencies: + - name: predict_modality/methods/novel_train + - name: predict_modality/methods/novel_predict +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/run/main.nf b/src/tasks/predict_modality/methods/novel/run/main.nf new file mode 100644 index 0000000000..59111194cb --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/run/main.nf @@ -0,0 +1,25 @@ +workflow run_wf { + take: input_ch + main: + output_ch = input_ch + | novel_train.run( + fromState: 
["input_train_mod1", "input_train_mod2"], + toState: ["input_model": "output", "input_transform": "output_transform", "output_train_mod2": "output_train_mod2"] + ) + | novel_predict.run( + fromState: { id, state -> + [ + "input_train_mod2": state.output_train_mod2, + "input_test_mod1": state.input_test_mod1, + "input_model": state.input_model, + "input_transform": state.input_transform, + "output": state.output]}, + toState: ["output": "output"] + ) + + | map { tup -> + [tup[0], [output: tup[1].output]] + } + + emit: output_ch +} \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/run/run_test.sh b/src/tasks/predict_modality/methods/novel/run/run_test.sh new file mode 100644 index 0000000000..f6da6b0863 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/run/run_test.sh @@ -0,0 +1,15 @@ +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +nextflow run . \ + -main-script target/nextflow/predict_modality/methods/novel/main.nf \ + -profile docker \ + -c src/wf_utils/labels_ci.config \ + --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad \ + --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad \ + --input_test_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad \ + --publish_dir output/novel/nextflow diff --git a/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml b/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml new file mode 100644 index 0000000000..87ea471301 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../../../api/comp_method_train.yaml +functionality: + name: novel_train + arguments: + - name: --output_transform + type: file + description: "The output transform file" + required: false + 
default: "lsi_transformer.pickle" + direction: output + - name: --output_train_mod2 + type: file + description: copy of the input with model dim in `.uns` + direction: output + default: "train_mod2.h5ad" + required: false + resources: + - path: script.py + type: python_script + - path: ../helper_functions.py +platforms: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + packages: + - scikit-learn + - networkx + - type: nextflow + directives: + label: [highmem, hightime, midcpu, highsharedmem, gpu] \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/train/run_test.sh b/src/tasks/predict_modality/methods/novel/train/run_test.sh new file mode 100644 index 0000000000..08630b1ac0 --- /dev/null +++ b/src/tasks/predict_modality/methods/novel/train/run_test.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Run script for all test resources + +echo "GEX2ADT" +viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ + --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad \ + --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad \ + --output output/model.pt + +# echo "ADT2GEX" +# viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ +# --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad \ +# --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad \ +# --output output/model.pt + +# echo "GEX2ATAC" +# viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ +# --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod1.h5ad \ +# --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod2.h5ad \ +# --output output/model.pt + +# echo "ATAC2GEX" +# 
"""Train the 'novel' predict_modality method.

Reads paired mod1/mod2 training AnnData files, optionally LSI-transforms mod1,
trains the direction-specific regression MLP, and writes:
  - the trained model checkpoint (par['output']),
  - a copy of mod2 with model dimensions in .uns (par['output_train_mod2']),
  - the fitted LSI transformer pickle (par['output_transform'], non-ADT only).
"""
import sys
import pickle

import numpy as np
import torch
from torch.utils.data import DataLoader

import anndata as ad

# Pick the compute device: first GPU when available, else CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
    print('current device: gpu', flush=True)
else:
    device = 'cpu'
    print('current device: cpu', flush=True)

## VIASH START
par = {
    'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad',
    'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad',
    'output_train_mod2': 'train_mod2.h5ad',
    # added: the script reads this key below; without it local debug runs KeyError
    'output_transform': 'lsi_transformer.pickle',
    'output': 'model.pt'
}
meta = {
    'resources_dir': 'src/tasks/predict_modality/methods/novel',
}
## VIASH END

sys.path.append(meta['resources_dir'])
from helper_functions import train_and_valid, lsiTransformer, ModalityMatchingDataset
from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac

print('Load data', flush=True)

input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])

# Keep a copy of mod2 so the model metadata can be written into its .uns later.
adata = input_train_mod2.copy()

mod1 = input_train_mod1.uns['modality']
mod2 = input_train_mod2.uns['modality']

input_train_mod1.X = input_train_mod1.layers['normalized']
input_train_mod2.X = input_train_mod2.layers['normalized']

input_train_mod2_df = input_train_mod2.to_df()
del input_train_mod2

print('Start train', flush=True)

# Drop all-zero variables up front so the LSI transform cannot divide by zero.
# np.asarray(...).ravel() flattens the (1, n_vars) np.matrix that a sparse
# .sum(axis=0) returns, giving a plain 1-D boolean mask safe for AnnData slicing.
zero_row = np.asarray(input_train_mod1.X.sum(axis=0)).ravel() == 0

rem_var = None
if zero_row.any():
    rem_var = input_train_mod1[:, zero_row].var_names
    input_train_mod1 = input_train_mod1[:, ~zero_row]

# Cap the number of LSI components by the number of available variables.
n_comp = input_train_mod1.n_vars - 1 if input_train_mod1.n_vars < 256 else 256

if mod1 != 'ADT':
    lsi_transformer_gex = lsiTransformer(n_components=n_comp)
    input_train_mod1_df = lsi_transformer_gex.fit_transform(input_train_mod1)
else:
    # ADT data is low-dimensional already; use it as-is.
    input_train_mod1_df = input_train_mod1.to_df()

# Reproduce the phase-1 train/validation split: batches s1d2 and s3d7 held out.
heldout = {'s1d2', 's3d7'}
batch = input_train_mod1.obs["batch"]
train_ix = [k for k, v in enumerate(batch) if v not in heldout]
test_ix = [k for k, v in enumerate(batch) if v in heldout]

train_mod1 = input_train_mod1_df.iloc[train_ix, :]
train_mod2 = input_train_mod2_df.iloc[train_ix, :]
test_mod1 = input_train_mod1_df.iloc[test_ix, :]
test_mod2 = input_train_mod2_df.iloc[test_ix, :]

n_vars_mod1 = input_train_mod1_df.shape[1]
n_vars_mod2 = input_train_mod2_df.shape[1]

# Direction-specific model, batch sizes and optimizer hyperparameters
# (values tuned during phase 1 of the competition).
if mod1 == 'ATAC' and mod2 == 'GEX':
    dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
    dataloader_train = DataLoader(dataset_train, 256, shuffle=True, num_workers=8)

    dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
    dataloader_test = DataLoader(dataset_test, 64, shuffle=False, num_workers=8)

    model = ModelRegressionAtac2Gex(n_vars_mod1, n_vars_mod2).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.00008386597445284492, weight_decay=0.000684887347727808)

elif mod1 == 'ADT' and mod2 == 'GEX':
    dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
    dataloader_train = DataLoader(dataset_train, 64, shuffle=True, num_workers=4)

    dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
    dataloader_test = DataLoader(dataset_test, 32, shuffle=False, num_workers=4)

    model = ModelRegressionAdt2Gex(n_vars_mod1, n_vars_mod2).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00041, weight_decay=0.0000139)

elif mod1 == 'GEX' and mod2 == 'ADT':
    dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
    dataloader_train = DataLoader(dataset_train, 32, shuffle=True, num_workers=8)

    dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
    dataloader_test = DataLoader(dataset_test, 64, shuffle=False, num_workers=8)

    model = ModelRegressionGex2Adt(n_vars_mod1, n_vars_mod2).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.000034609210829678734, weight_decay=0.0009965881574697426)

elif mod1 == 'GEX' and mod2 == 'ATAC':
    dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
    dataloader_train = DataLoader(dataset_train, 64, shuffle=True, num_workers=8)

    dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
    dataloader_test = DataLoader(dataset_test, 64, shuffle=False, num_workers=8)

    model = ModelRegressionGex2Atac(n_vars_mod1, n_vars_mod2).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001806762345275399, weight_decay=0.0004084171379280058)

else:
    # Fail fast instead of hitting a NameError on `model` further down.
    raise ValueError(f"Unsupported modality combination: {mod1} -> {mod2}")

loss_fn = torch.nn.MSELoss()
train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, par['output'], device)

# Record the model input/output dimensions for the predict component.
adata.uns["model_dim"] = {"mod1": n_vars_mod1, "mod2": n_vars_mod2}
# rem_var is a pandas Index when set; truth-testing an Index raises ValueError,
# so compare against None explicitly.
if rem_var is not None:
    adata.uns["removed_vars"] = [rem_var[0]]
adata.write_h5ad(par['output_train_mod2'], compression="gzip")

if mod1 != 'ADT':
    with open(par['output_transform'], 'wb') as f:
        pickle.dump(lsi_transformer_gex, f)
__merge__: ../../../api/comp_method_predict.yaml
functionality:
  name: simplemlp_predict
  resources:
    - type: python_script
      path: script.py
    - path: ../resources/
platforms:
  # GPU-capable base image; the predict script selects the device at runtime.
  - type: docker
    image: openproblems/base_pytorch_nvidia:1.0.0
    setup:
      - type: python
        pypi:
          - scikit-learn
          - scanpy
          - pytorch-lightning
  - type: nextflow
    directives:
      label: [highmem, hightime, midcpu, gpu, highsharedmem]
"""Predict mod2 from mod1 with the simple_mlp ensemble.

Loads the per-fold MLP checkpoints trained for the detected task direction,
averages their predictions over the test cells, and writes the result as a
sparse AnnData. GEX2ATAC has no trained model and predicts the training mean.
"""
from glob import glob
import sys
import numpy as np
from scipy.sparse import csc_matrix
import anndata as ad
import torch
from torch.utils.data import TensorDataset, DataLoader

## VIASH START
par = {
    'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad',
    'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad',
    'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad',
    'input_model': 'output/model',
    'output': 'output/prediction'
}
meta = {
    'resources_dir': 'src/tasks/predict_modality/methods/simple_mlp',
    # added: read below when writing .uns; viash injects it in real runs
    'functionality_name': 'simplemlp_predict',
    'cpus': 10
}
## VIASH END

resources_dir = f"{meta['resources_dir']}/resources"
sys.path.append(resources_dir)
from models import MLP
import utils

# Use the GPU when present, otherwise fall back to CPU — the previous
# hard-coded .cuda() calls crashed on machines without CUDA.
# NOTE(review): models.MLP also calls .cuda() internally; full CPU support
# additionally requires fixing that class — confirm before relying on CPU runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _predict(model, dl):
    """Run `model` over every batch of `dl`; return stacked predictions as ndarray."""
    model = model.to(device)
    model.eval()
    batches = []
    with torch.no_grad():
        for (xb,) in dl:
            batches.append(model(xb.to(device)).cpu().numpy())
    return np.vstack(batches)


print('Load data', flush=True)
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])

# The task direction is encoded as "<mod1>2<mod2>", e.g. "GEX2ADT".
mod_1 = input_test_mod1.uns['modality']
mod_2 = input_train_mod2.uns['modality']
task = f'{mod_1}2{mod_2}'

print('Load ymean', flush=True)
ymean_path = f"{par['input_model']}/{task}_ymean.npy"
ymean = np.load(ymean_path)

print('Start predict', flush=True)
if task == 'GEX2ATAC':
    # No trained model for this direction: predict the training mean everywhere.
    y_pred = ymean * np.ones([input_test_mod1.n_obs, input_test_mod1.n_vars])
else:
    folds = [0, 1, 2]

    ymean = torch.from_numpy(ymean).float()
    yaml_path = f"{resources_dir}/yaml/mlp_{task}.yaml"
    config = utils.load_yaml(yaml_path)
    X = input_test_mod1.layers["normalized"].toarray()
    X = torch.from_numpy(X).float()

    te_ds = TensorDataset(X)

    # Ensemble: average predictions over the three cross-validation folds.
    yp = 0
    for fold in folds:
        load_path = f"{par['input_model']}/{task}_fold_{fold}/**.ckpt"
        print(load_path)
        ckpt = glob(load_path)[0]
        model_inf = MLP.load_from_checkpoint(
            ckpt,
            in_dim=X.shape[1],
            out_dim=input_test_mod1.n_vars,
            ymean=ymean,
            config=config
        )
        te_loader = DataLoader(
            te_ds,
            batch_size=config.batch_size,
            num_workers=0,
            shuffle=False,
            drop_last=False
        )
        yp = yp + _predict(model_inf, te_loader)

    y_pred = yp / len(folds)

# Store as sparse CSC to keep the output file small.
y_pred = csc_matrix(y_pred)

adata = ad.AnnData(
    layers={"normalized": y_pred},
    shape=y_pred.shape,
    uns={
        'dataset_id': input_test_mod1.uns['dataset_id'],
        'method_id': meta['functionality_name'],
    },
)

print('Write data', flush=True)
adata.write_h5ad(par['output'], compression="gzip")