Add subworkflow for merging SpatialData #96

Closed · wants to merge 7 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -18,6 +18,7 @@ compatible with further downstream analyses and/or exploration in _e.g._

### `Added`

- Add subworkflow for merging per-sample SpatialData into one [[#96](https://github.com/nf-core/spatialvi/pull/96)]
- Add quality control metrics as custom MultiQC content [[#88](https://github.com/nf-core/spatialvi/pull/88)]
- Add MultiQC support for Space Ranger outputs [[#70](https://github.com/nf-core/spatialvi/pull/70)]
- Use the QUARTONOTEBOOK nf-core module instead of local Quarto-based modules [[#68](https://github.com/nf-core/spatialvi/pull/68)]
32 changes: 32 additions & 0 deletions bin/merge_sdata.py
@@ -0,0 +1,32 @@
#!/usr/bin/env python

import argparse
import spatialdata

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Merge SpatialData objects")
    parser.add_argument("files", nargs="+", help="List of SpatialData files to merge")
    parser.add_argument("output", help="Output file name")
    args = parser.parse_args()

    # Read all input SpatialData Zarr stores
    sdatas = []
    for file in args.files:
        sdata = spatialdata.read_zarr(file)
        sdatas.append(sdata)

    # Merge the data; concatenate_tables=False keeps one table per input sample
    output_sdata = spatialdata.concatenate(
        sdatas,
        region_key=None,
        instance_key=None,
        concatenate_tables=False,
        obs_names_make_unique=True,
        modify_tables_inplace=False,
    )

    # Save the concatenated data
    output_sdata.write(args.output, overwrite=True)
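
For reference, a minimal sketch of invoking the script and inspecting the merged store; the per-sample input names below are placeholders, not files from this PR:

# A minimal sketch, assuming two per-sample Zarr stores (names are hypothetical):
#
#     merge_sdata.py sample1.zarr sample2.zarr aggregated-sdata.zarr
#
import spatialdata

# Read the merged store back and print a summary of its images, shapes, and tables
merged = spatialdata.read_zarr("aggregated-sdata.zarr")
print(merged)
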
2 changes: 1 addition & 1 deletion bin/spatially_variable_genes.qmd
@@ -115,5 +115,5 @@ adata.uns[svg_autocorr_method_string].to_csv(os.path.join(artifact_dir, output_c
adata.write(output_adata)
del sdata.tables["table"]
sdata.tables["table"] = adata
sdata.write("./" + output_sdata)
sdata.write(os.path.join(artifact_dir, output_sdata))
```
4 changes: 4 additions & 0 deletions conf/modules.config
@@ -97,4 +97,8 @@ process {
]
}

withName: 'MERGE_SDATA' {
ext.when = params.merge_sdata
}

}
13 changes: 8 additions & 5 deletions conf/test_downstream.config
@@ -10,15 +10,18 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '12.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Downstream test profile'
config_profile_description = 'Test pipeline for downstream (post-Space Ranger) functionality'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '3.GB'
max_time = '2.h'

// Input and output
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/spatialvi/testdata/human-brain-cancer-11-mm-capture-area-ffpe-2-standard_v2_ffpe_cytassist/samplesheet_downstream.csv'
spaceranger_probeset = "https://raw.githubusercontent.com/nf-core/test-datasets/spatialvi/testdata/human-brain-cancer-11-mm-capture-area-ffpe-2-standard_v2_ffpe_cytassist/outs/probe_set.csv"
13 changes: 8 additions & 5 deletions conf/test_spaceranger_v1.config
@@ -10,15 +10,18 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '12.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Space Ranger v1 test profile'
config_profile_description = 'Test pipeline functionality, including Space Ranger v1'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '3.GB'
max_time = '2.h'

// Input and output
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/spatialvi/testdata/human-ovarian-cancer-1-standard_v1_ffpe/samplesheet_spaceranger.csv'
spaceranger_probeset = 'https://raw.githubusercontent.com/nf-core/test-datasets/spatialvi/testdata/human-ovarian-cancer-1-standard_v1_ffpe/Visium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv'
38 changes: 38 additions & 0 deletions modules/local/merge_sdata.nf
@@ -0,0 +1,38 @@
//
// Merge per-sample SpatialData into a single SpatialData
//
process MERGE_SDATA {

    label 'process_low'
    container "docker.io/erikfas/spatialvi"

    input:
    path(sdata, stageAs: "?/*")

    output:
    path("aggregated-sdata.zarr"), emit: sdata
    path("versions.yml")         , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
        exit 1, "The MERGE_SDATA module does not support Conda/Mamba, please use Docker / Singularity / Podman instead."
    }
    """
    # Set environment variables
    export XDG_CACHE_HOME="./.xdg_cache_home"
    export XDG_DATA_HOME="./.xdg_data_home"

    # Execute script
    merge_sdata.py \\
        ${sdata} \\
        aggregated-sdata.zarr

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        spatialdata: \$(python -c "import spatialdata; print(spatialdata.__version__)")
    END_VERSIONS
    """
}
3 changes: 3 additions & 0 deletions nextflow.config
@@ -33,6 +33,9 @@ params {
svg_autocorr_method = "moran"
n_top_svgs = 14

// Data aggregation
merge_sdata = false

// MultiQC options
multiqc_config = null
multiqc_title = null
17 changes: 17 additions & 0 deletions nextflow_schema.json
@@ -164,6 +164,20 @@
}
},

"aggregation_options": {
"title": "Data aggregation options",
"type": "object",
"fa_icon": "fas fa-rocket",
"description": "Options related to aggregation of final data objects",
"properties": {
"merge_sdata": {
"type": "boolean",
"description": "Merge per-sample SpatialData objects into one",
"fa_icon": "fas fa-arrows-to-dot"
}
}
},

"institutional_config_options": {
"title": "Institutional config options",
"type": "object",
@@ -317,6 +331,9 @@
{
"$ref": "#/$defs/analysis_options"
},
{
"$ref": "#/$defs/aggregation_options"
},
{
"$ref": "#/$defs/institutional_config_options"
},
34 changes: 34 additions & 0 deletions subworkflows/local/aggregation.nf
@@ -0,0 +1,34 @@
//
// Subworkflow for aggregation of sample data
//

include { MERGE_SDATA } from '../../modules/local/merge_sdata'

workflow AGGREGATION {

    take:
    ch_sdata // Channel: [ meta, zarr ]

    main:

    ch_versions = Channel.empty()

    //
    // MODULE: Merge per-sample SpatialData objects into one
    //
    ch_sdata_files = ch_sdata
        | map { meta, zarr ->
            return [zarr]
        }
    MERGE_SDATA (
        ch_sdata_files.collect()
    )
    ch_versions = ch_versions.mix(MERGE_SDATA.out.versions)
    ch_merged_sdata = MERGE_SDATA.out.sdata

    emit:
    merged_sdata = ch_merged_sdata // channel: [ aggregated-sdata.zarr ]
    versions     = ch_versions     // channel: [ versions.yml ]

}
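
Because the merge script passes concatenate_tables=False, the store emitted here keeps one AnnData table per input sample. A short sketch of consuming that output downstream, assuming a hypothetical published location under the results directory:

import spatialdata

# Path is hypothetical; the actual location depends on the pipeline's publishDir settings
merged = spatialdata.read_zarr("results/aggregation/aggregated-sdata.zarr")

# One AnnData table per input sample, since tables were not concatenated
for name, table in merged.tables.items():
    print(f"{name}: {table.n_obs} spots x {table.n_vars} genes")
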
42 changes: 28 additions & 14 deletions subworkflows/local/downstream.nf
@@ -85,6 +85,10 @@ workflow DOWNSTREAM {
extensions
)
ch_versions = ch_versions.mix(CLUSTERING.out.versions)
ch_clustering_html = CLUSTERING.out.html
ch_clustering_sdata = CLUSTERING.out.artifacts
ch_clustering_nb = CLUSTERING.out.notebook
ch_clustering_params = CLUSTERING.out.params_yaml

//
// Spatially variable genes
@@ -109,23 +113,33 @@
extensions
)
ch_versions = ch_versions.mix(SPATIALLY_VARIABLE_GENES.out.versions)
ch_svg_html = SPATIALLY_VARIABLE_GENES.out.html
ch_svg_nb = SPATIALLY_VARIABLE_GENES.out.notebook
ch_svg_params = SPATIALLY_VARIABLE_GENES.out.params_yaml
ch_svg_artifacts = SPATIALLY_VARIABLE_GENES.out.artifacts
| transpose ( )
| branch {
csv: it[1].name.endsWith('.csv')
sdata: it[1].name.endsWith('.zarr')
}

emit:
qc_html = ch_qc_html // channel: [ meta, html ]
qc_sdata = ch_qc_sdata // channel: [ meta, zarr ]
qc_mqc = ch_qc_mqc // channel: [ meta, csv ]
qc_nb = ch_qc_nb // channel: [ meta, qmd ]
qc_params = ch_qc_yml // channel: [ meta, yml ]
qc_html = ch_qc_html // channel: [ meta, html ]
qc_sdata = ch_qc_sdata // channel: [ meta, zarr ]
qc_mqc = ch_qc_mqc // channel: [ meta, csv ]
qc_nb = ch_qc_nb // channel: [ meta, qmd ]
qc_params = ch_qc_yml // channel: [ meta, yml ]

clustering_html = CLUSTERING.out.html // channel: [ html ]
clustering_sdata = CLUSTERING.out.artifacts // channel: [ meta, h5ad]
clustering_nb = CLUSTERING.out.notebook // channel: [ meta, qmd ]
clustering_params = CLUSTERING.out.params_yaml // channel: [ meta, yml ]
clustering_html = ch_clustering_html // channel: [ html ]
clustering_sdata = ch_clustering_sdata // channel: [ meta, zarr]
clustering_nb = ch_clustering_nb // channel: [ meta, qmd ]
clustering_params = ch_clustering_params // channel: [ meta, yml ]

svg_html = SPATIALLY_VARIABLE_GENES.out.html // channel: [ meta, html ]
svg_csv = SPATIALLY_VARIABLE_GENES.out.artifacts // channel: [ meta, csv ]
svg_nb = SPATIALLY_VARIABLE_GENES.out.notebook // channel: [ meta, qmd ]
svg_params = SPATIALLY_VARIABLE_GENES.out.params_yaml // channel: [ meta, yml ]
svg_html = ch_svg_html // channel: [ meta, html ]
svg_csv = ch_svg_artifacts.csv // channel: [ meta, csv ]
svg_sdata = ch_svg_artifacts.sdata // channel: [ meta, zarr ]
svg_nb = ch_svg_nb // channel: [ meta, qmd ]
svg_params = ch_svg_params // channel: [ meta, yml ]

versions = ch_versions // channel: [ versions.yml ]
versions = ch_versions // channel: [ versions.yml ]
}
13 changes: 11 additions & 2 deletions workflows/spatialvi.nf
@@ -10,6 +10,7 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main'
include { INPUT_CHECK } from '../subworkflows/local/input_check'
include { SPACERANGER } from '../subworkflows/local/spaceranger'
include { DOWNSTREAM } from '../subworkflows/local/downstream'
include { AGGREGATION } from '../subworkflows/local/aggregation'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { paramsSummaryMap } from 'plugin/nf-schema'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
@@ -83,6 +84,14 @@ workflow SPATIALVI {
)
ch_versions = ch_versions.mix(DOWNSTREAM.out.versions)

//
// SUBWORKFLOW: Sample aggregation (optional)
//
AGGREGATION (
DOWNSTREAM.out.svg_sdata
)
ch_versions = ch_versions.mix(AGGREGATION.out.versions)

//
// Collate and save software versions
//
@@ -94,7 +103,6 @@
newLine: true
).set { ch_collated_versions }


//
// MODULE: MultiQC
//
@@ -140,7 +148,8 @@
[]
)

emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
emit:
multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
versions = ch_versions // channel: [ path(versions.yml) ]

}