diff --git a/CHANGELOG.md b/CHANGELOG.md index 45247e658ae..3443a5d382b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -129,6 +129,8 @@ * `metadata/duplicate_var` component: Added a component to make a copy from one .var field or index to another .var field within the same MuData object (PR #877). +* `filter/subset_obsp` component: Added a component to subset an .obsp matrix by column based on the value of an .obs field. The resulting subset is moved to an .obsm field (PR #888). + ## MINOR CHANGES * `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726). diff --git a/src/filter/subset_obsp/config.vsh.yaml b/src/filter/subset_obsp/config.vsh.yaml new file mode 100644 index 00000000000..27da5d9f800 --- /dev/null +++ b/src/filter/subset_obsp/config.vsh.yaml @@ -0,0 +1,75 @@ +name: subset_obsp +namespace: "filter" +description: | + Create a subset of an .obsp field in a mudata file, by filtering the columns based on the values of an .obs column. The resulting subset is moved to an .obsm slot. +authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] +argument_groups: + - name: Input + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + type: string + default: "rna" + required: false + - name: "--input_obsp_key" + type: string + required: true + description: The .obsp field to be filtered. + - name: "--input_obs_key" + type: string + required: true + description: The .obs column to filter on. + - name: "--input_obs_value" + type: string + required: true + description: The value to filter on in the .obs column. + - name: Output + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--output_obsm_key" + type: string + required: true + description: The .obsm key to store the subset in. + - name: "--output_compression" + type: string + description: The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + +resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py +test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu + +engines: +- type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + __merge__: /src/base/requirements/anndata_mudata.yaml + __merge__: [ /src/base/requirements/python_test_setup.yaml, .] + +runners: +- type: executable +- type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/filter/subset_obsp/script.py b/src/filter/subset_obsp/script.py new file mode 100644 index 00000000000..71c9d481131 --- /dev/null +++ b/src/filter/subset_obsp/script.py @@ -0,0 +1,54 @@ +import mudata as mu + +### VIASH START +par = { + 'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu', + 'modality': 'rna', + 'input_obsp_key': 'distances', + 'input_obs_key': 'leiden', + 'input_obs_value': '1', + 'output_obsm_key': "leiden_1", + 'output': 'subset_obsp_output.h5mu', + 'output_compression': None, +} +### VIASH END + +# START TEMPORARY WORKAROUND setup_logger +# reason: resources aren't available when using Nextflow fusion +# from setup_logger import setup_logger +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger +# END TEMPORARY WORKAROUND setup_logger +logger = setup_logger() + +def main(): + logger.info(f"Reading {par['input']}") + mdata = mu.read_h5mu(par["input"]) + adata = mdata.mod[par["modality"]] + + logger.info(f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}") + # .obsp, .obs and .obsm index and .obsp columns all have a dimension length of `n_obs` + # the index dimensions remain unaltered, but .obsp columns will be subset + obsp = adata.obsp[par["input_obsp_key"]] + idx = adata.obs[par["input_obs_key"]].astype(str) == par["input_obs_value"] + obsm_subset = obsp[:, idx] + + logger.info(f"Writing subset obsp matrix to .obsm {par['output_obsm_key']}") + adata.obsm[par["output_obsm_key"]] = obsm_subset + + logger.info(f"Writing output to {par['output']}") + mdata.write_h5mu(par["output"], compression=par["output_compression"]) + + +if __name__ == '__main__': + main() diff --git a/src/filter/subset_obsp/test.py b/src/filter/subset_obsp/test.py new file mode 100644 index 00000000000..ce72563275d --- /dev/null +++ b/src/filter/subset_obsp/test.py @@ -0,0 +1,48 @@ +import sys +import pytest +import mudata as mu + +## VIASH START +meta = { + 'resources_dir': 'resources_test/pbmc_1k_protein_v3/' +} +## VIASH END + + +@pytest.fixture +def input_h5mu(): + input = mu.read_h5mu(f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu") + input.mod["rna"].obs["filter_column"] = "group_2" + input.mod["rna"].obs["filter_column"][:50] = "group_1" + return input + + +@pytest.fixture +def input_path(write_mudata_to_file, input_h5mu): + return write_mudata_to_file(input_h5mu) + + +def test_subset_obsp(input_path, run_component, tmp_path): + output_path = tmp_path / "output.h5mu" + + # run component + run_component([ + "--input", input_path, + "--output", str(output_path), + "--input_obsp_key", "distances", + "--input_obs_key", "filter_column", + "--input_obs_value", "group_1", + "--output_obsm_key", "group_1" + ]) + + assert output_path.is_file(), "Output file not found" + + # check output file + mu_out = mu.read_h5mu(output_path) + + assert "group_1" in mu_out.mod["rna"].obsm, "Output should contain group_1 in .obsm" + assert mu_out.mod["rna"].obsm["group_1"].shape[1] == 50, "Obsm should only contain a subset of the original obsp matrix" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__]))