Skip to content

Commit

Permalink
Add component to subset obsp (#888)
Browse files Browse the repository at this point in the history
* add component to subset obsp

* update changelog

* update descriptions

* fix tests

* add comment
  • Loading branch information
dorien-er authored Oct 11, 2024
1 parent d194bf5 commit 4c373aa
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@

* `metadata/duplicate_var` component: Added a component to make a copy from one .var field or index to another .var field within the same MuData object (PR #877).

* `filter/subset_obsp` component: Added a component to subset an .obsp matrix by column based on the value of an .obs field. The resulting subset is moved to an .obsm field (PR #888).

## MINOR CHANGES

* `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726).
Expand Down
75 changes: 75 additions & 0 deletions src/filter/subset_obsp/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
name: subset_obsp
namespace: "filter"
description: |
Create a subset of an .obsp field in a mudata file, by filtering the columns based on the values of an .obs column. The resulting subset is moved to an .obsm slot.
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author, maintainer ]
argument_groups:
- name: Input
arguments:
- name: "--input"
type: file
description: Input h5mu file
direction: input
required: true
example: input.h5mu
- name: "--modality"
type: string
default: "rna"
required: false
- name: "--input_obsp_key"
type: string
required: true
description: The .obsp field to be filtered.
- name: "--input_obs_key"
type: string
required: true
description: The .obs column to filter on.
- name: "--input_obs_value"
type: string
required: true
description: The value to filter on in the .obs column.
- name: Output
arguments:
- name: "--output"
type: file
description: Output h5mu file.
direction: output
example: output.h5mu
- name: "--output_obsm_key"
type: string
required: true
description: The .obsm key to store the subset in.
- name: "--output_compression"
type: string
description: The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"

resources:
- type: python_script
path: script.py
- path: /src/utils/setup_logger.py
test_resources:
- type: python_script
path: test.py
- path: /resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu

engines:
- type: docker
image: python:3.12-slim
setup:
- type: apt
packages:
- procps
- type: python
__merge__: /src/base/requirements/anndata_mudata.yaml
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]

runners:
- type: executable
- type: nextflow
directives:
label: [singlecpu, lowmem]
54 changes: 54 additions & 0 deletions src/filter/subset_obsp/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import mudata as mu

### VIASH START
par = {
'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_mms.h5mu',
'modality': 'rna',
'input_obsp_key': 'distances',
'input_obs_key': 'leiden',
'input_obs_value': '1',
'output_obsm_key': "leiden_1",
'output': 'subset_obsp_output.h5mu',
'output_compression': None,
}
### VIASH END

# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
import logging
from sys import stdout

logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler(stdout)
logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
console_handler.setFormatter(logFormatter)
logger.addHandler(console_handler)

return logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()

def main():
logger.info(f"Reading {par['input']}")
mdata = mu.read_h5mu(par["input"])
adata = mdata.mod[par["modality"]]

logger.info(f"Subset columns of obsp matrix under {par['input_obsp_key']} based on {par['input_obs_key']} == {par['input_obs_value']}")
# .obsp, .obs and .obsm index and .obsp columns all have a dimension length of `n_obs`
# the index dimensions remain unaltered, but .obsp columns will be subset
obsp = adata.obsp[par["input_obsp_key"]]
idx = adata.obs[par["input_obs_key"]].astype(str) == par["input_obs_value"]
obsm_subset = obsp[:, idx]

logger.info(f"Writing subset obsp matrix to .obsm {par['output_obsm_key']}")
adata.obsm[par["output_obsm_key"]] = obsm_subset

logger.info(f"Writing output to {par['output']}")
mdata.write_h5mu(par["output"], compression=par["output_compression"])


if __name__ == '__main__':
main()
48 changes: 48 additions & 0 deletions src/filter/subset_obsp/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import sys
import pytest
import mudata as mu

## VIASH START
meta = {
'resources_dir': 'resources_test/pbmc_1k_protein_v3/'
}
## VIASH END


@pytest.fixture
def input_h5mu():
input = mu.read_h5mu(f"{meta['resources_dir']}/pbmc_1k_protein_v3_mms.h5mu")
input.mod["rna"].obs["filter_column"] = "group_2"
input.mod["rna"].obs["filter_column"][:50] = "group_1"
return input


@pytest.fixture
def input_path(write_mudata_to_file, input_h5mu):
return write_mudata_to_file(input_h5mu)


def test_subset_obsp(input_path, run_component, tmp_path):
output_path = tmp_path / "output.h5mu"

# run component
run_component([
"--input", input_path,
"--output", str(output_path),
"--input_obsp_key", "distances",
"--input_obs_key", "filter_column",
"--input_obs_value", "group_1",
"--output_obsm_key", "group_1"
])

assert output_path.is_file(), "Output file not found"

# check output file
mu_out = mu.read_h5mu(output_path)

assert "group_1" in mu_out.mod["rna"].obsm, "Output should contain group_1 in .obsm"
assert mu_out.mod["rna"].obsm["group_1"].shape[1] == 50, "Obsm should only contain a subset of the original obsp matrix"


if __name__ == "__main__":
sys.exit(pytest.main([__file__]))

0 comments on commit 4c373aa

Please sign in to comment.