Add subworkflow for merging SpatialData #96

Closed · wants to merge 7 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -18,6 +18,7 @@ compatible with further downstream analyses and/or exploration in _e.g._

### `Added`

- Add subworkflow for merging per-sample SpatialData into one [[#96](https://github.com/nf-core/spatialvi/pull/96)]
- Add quality control metrics as custom MultiQC content [[#88](https://github.com/nf-core/spatialvi/pull/88)]
- Add MultiQC support for Space Ranger outputs [[#70](https://github.com/nf-core/spatialvi/pull/70)]
- Use the QUARTONOTEBOOK nf-core module instead of local Quarto-based modules [[#68](https://github.com/nf-core/spatialvi/pull/68)]
32 changes: 32 additions & 0 deletions bin/merge_sdata.py
@@ -0,0 +1,32 @@
#!/usr/bin/env python

import argparse
import spatialdata

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Merge SpatialData objects")
    parser.add_argument("files", nargs="+", help="List of SpatialData files to merge")
    parser.add_argument("output", help="Output file name")
    args = parser.parse_args()

    # Read all input SpatialData Zarr stores
    sdatas = []
    for file in args.files:
        sdata = spatialdata.read_zarr(file)
        sdatas.append(sdata)

    # Merge the data; concatenate_tables=False keeps one table per input sample
    output_sdata = spatialdata.concatenate(
        sdatas,
        region_key=None,
        instance_key=None,
        concatenate_tables=False,
        obs_names_make_unique=True,
        modify_tables_inplace=False,
    )

    # Save the concatenated data
    output_sdata.write(args.output, overwrite=True)
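
For reference, a minimal sketch of invoking the script and inspecting the merged store; the per-sample input names below are placeholders, not files from this PR:

# A minimal sketch, assuming two per-sample Zarr stores (names are hypothetical):
#
#     merge_sdata.py sample1.zarr sample2.zarr aggregated-sdata.zarr
#
import spatialdata

# Read the merged store back and print a summary of its images, shapes, and tables
merged = spatialdata.read_zarr("aggregated-sdata.zarr")
print(merged)
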
2 changes: 1 addition & 1 deletion bin/spatially_variable_genes.qmd
@@ -115,5 +115,5 @@ adata.uns[svg_autocorr_method_string].to_csv(os.path.join(artifact_dir, output_c
adata.write(output_adata)
del sdata.tables["table"]
sdata.tables["table"] = adata
sdata.write("./" + output_sdata)
sdata.write(os.path.join(artifact_dir, output_sdata))
```
4 changes: 4 additions & 0 deletions conf/modules.config
@@ -97,4 +97,8 @@ process {
]
}

withName: 'MERGE_SDATA' {
ext.when = params.merge_sdata
}

}
13 changes: 8 additions & 5 deletions conf/test_downstream.config
@@ -10,15 +10,18 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '12.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Downstream test profile'
config_profile_description = 'Test pipeline for downstream (post-Space Ranger) functionality'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '3.GB'
max_time = '2.h'

// Input and output
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/spatialvi/testdata/human-brain-cancer-11-mm-capture-area-ffpe-2-standard_v2_ffpe_cytassist/samplesheet_downstream.csv'
spaceranger_probeset = "https://raw.githubusercontent.com/nf-core/test-datasets/spatialvi/testdata/human-brain-cancer-11-mm-capture-area-ffpe-2-standard_v2_ffpe_cytassist/outs/probe_set.csv"
13 changes: 8 additions & 5 deletions conf/test_spaceranger_v1.config
@@ -10,15 +10,18 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 4,
memory: '12.GB',
time: '1.h'
]
}

params {
config_profile_name = 'Space Ranger v1 test profile'
config_profile_description = 'Test pipeline functionality, including Space Ranger v1'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '3.GB'
max_time = '2.h'

// Input and output
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/spatialvi/testdata/human-ovarian-cancer-1-standard_v1_ffpe/samplesheet_spaceranger.csv'
spaceranger_probeset = 'https://raw.githubusercontent.com/nf-core/test-datasets/spatialvi/testdata/human-ovarian-cancer-1-standard_v1_ffpe/Visium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv'
38 changes: 38 additions & 0 deletions modules/local/merge_sdata.nf
@@ -0,0 +1,38 @@
//
// Merge per-sample SpatialData into a single SpatialData
//
process MERGE_SDATA {

    label 'process_low'
    container "docker.io/erikfas/spatialvi"

    input:
    path(sdata, stageAs: "?/*")

    output:
    path("aggregated-sdata.zarr"), emit: sdata
    path("versions.yml")         , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
        exit 1, "The MERGE_SDATA module does not support Conda/Mamba, please use Docker / Singularity / Podman instead."
    }
    """
    # Set environment variables
    export XDG_CACHE_HOME="./.xdg_cache_home"
    export XDG_DATA_HOME="./.xdg_data_home"

    # Execute script
    merge_sdata.py \\
        ${sdata} \\
        aggregated-sdata.zarr

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        spatialdata: \$(python -c "import spatialdata; print(spatialdata.__version__)")
    END_VERSIONS
    """
}
3 changes: 3 additions & 0 deletions nextflow.config
@@ -33,6 +33,9 @@ params {
svg_autocorr_method = "moran"
n_top_svgs = 14

// Data aggregation
merge_sdata = false

// MultiQC options
multiqc_config = null
multiqc_title = null
17 changes: 17 additions & 0 deletions nextflow_schema.json
@@ -164,6 +164,20 @@
}
},

"aggregation_options": {
"title": "Data aggregation options",
"type": "object",
"fa_icon": "fas fa-rocket",
"description": "Options related to aggregation of final data objects",
"properties": {
"merge_sdata": {
"type": "boolean",
"description": "Merge per-sample SpatialData objects into one",
"fa_icon": "fas fa-arrows-to-dot"
}
}
},

"institutional_config_options": {
"title": "Institutional config options",
"type": "object",
@@ -317,6 +331,9 @@
{
"$ref": "#/$defs/analysis_options"
},
{
"$ref": "#/$defs/aggregation_options"
},
{
"$ref": "#/$defs/institutional_config_options"
},
34 changes: 34 additions & 0 deletions subworkflows/local/aggregation.nf
@@ -0,0 +1,34 @@
//
// Subworkflow for aggregation of sample data
//

include { MERGE_SDATA } from '../../modules/local/merge_sdata'

workflow AGGREGATION {

    take:
    ch_sdata // Channel: [ meta, zarr ]

    main:

    ch_versions = Channel.empty()

    //
    // MODULE: Merge per-sample SpatialData objects into one
    //
    ch_sdata_files = ch_sdata
        | map { meta, zarr ->
            return [zarr]
        }
    MERGE_SDATA (
        ch_sdata_files.collect()
    )
    ch_versions = ch_versions.mix(MERGE_SDATA.out.versions)
    ch_merged_sdata = MERGE_SDATA.out.sdata

    emit:
    merged_sdata = ch_merged_sdata // channel: [ aggregated-sdata.zarr ]
    versions     = ch_versions     // channel: [ versions.yml ]

}
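
Because the merge script passes concatenate_tables=False, the store emitted here keeps one AnnData table per input sample. A short sketch of consuming that output downstream, assuming a hypothetical published location under the results directory:

import spatialdata

# Path is hypothetical; the actual location depends on the pipeline's publishDir settings
merged = spatialdata.read_zarr("results/aggregation/aggregated-sdata.zarr")

# One AnnData table per input sample, since tables were not concatenated
for name, table in merged.tables.items():
    print(f"{name}: {table.n_obs} spots x {table.n_vars} genes")
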
42 changes: 28 additions & 14 deletions subworkflows/local/downstream.nf
@@ -85,6 +85,10 @@ workflow DOWNSTREAM {
extensions
)
ch_versions = ch_versions.mix(CLUSTERING.out.versions)
ch_clustering_html = CLUSTERING.out.html
ch_clustering_sdata = CLUSTERING.out.artifacts
ch_clustering_nb = CLUSTERING.out.notebook
ch_clustering_params = CLUSTERING.out.params_yaml

//
// Spatially variable genes
@@ -109,23 +113,33 @@
extensions
)
ch_versions = ch_versions.mix(SPATIALLY_VARIABLE_GENES.out.versions)
ch_svg_html = SPATIALLY_VARIABLE_GENES.out.html
ch_svg_nb = SPATIALLY_VARIABLE_GENES.out.notebook
ch_svg_params = SPATIALLY_VARIABLE_GENES.out.params_yaml
ch_svg_artifacts = SPATIALLY_VARIABLE_GENES.out.artifacts
| transpose ( )
| branch {
csv: it[1].name.endsWith('.csv')
sdata: it[1].name.endsWith('.zarr')
}

emit:
qc_html = ch_qc_html // channel: [ meta, html ]
qc_sdata = ch_qc_sdata // channel: [ meta, zarr ]
qc_mqc = ch_qc_mqc // channel: [ meta, csv ]
qc_nb = ch_qc_nb // channel: [ meta, qmd ]
qc_params = ch_qc_yml // channel: [ meta, yml ]
qc_html = ch_qc_html // channel: [ meta, html ]
qc_sdata = ch_qc_sdata // channel: [ meta, zarr ]
qc_mqc = ch_qc_mqc // channel: [ meta, csv ]
qc_nb = ch_qc_nb // channel: [ meta, qmd ]
qc_params = ch_qc_yml // channel: [ meta, yml ]

clustering_html = CLUSTERING.out.html // channel: [ html ]
clustering_sdata = CLUSTERING.out.artifacts // channel: [ meta, h5ad]
clustering_nb = CLUSTERING.out.notebook // channel: [ meta, qmd ]
clustering_params = CLUSTERING.out.params_yaml // channel: [ meta, yml ]
clustering_html = ch_clustering_html // channel: [ html ]
clustering_sdata = ch_clustering_sdata // channel: [ meta, zarr]
clustering_nb = ch_clustering_nb // channel: [ meta, qmd ]
clustering_params = ch_clustering_params // channel: [ meta, yml ]

svg_html = SPATIALLY_VARIABLE_GENES.out.html // channel: [ meta, html ]
svg_csv = SPATIALLY_VARIABLE_GENES.out.artifacts // channel: [ meta, csv ]
svg_nb = SPATIALLY_VARIABLE_GENES.out.notebook // channel: [ meta, qmd ]
svg_params = SPATIALLY_VARIABLE_GENES.out.params_yaml // channel: [ meta, yml ]
svg_html = ch_svg_html // channel: [ meta, html ]
svg_csv = ch_svg_artifacts.csv // channel: [ meta, csv ]
svg_sdata = ch_svg_artifacts.sdata // channel: [ meta, zarr ]
svg_nb = ch_svg_nb // channel: [ meta, qmd ]
svg_params = ch_svg_params // channel: [ meta, yml ]

versions = ch_versions // channel: [ versions.yml ]
versions = ch_versions // channel: [ versions.yml ]
}
13 changes: 11 additions & 2 deletions workflows/spatialvi.nf
@@ -10,6 +10,7 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main'
include { INPUT_CHECK } from '../subworkflows/local/input_check'
include { SPACERANGER } from '../subworkflows/local/spaceranger'
include { DOWNSTREAM } from '../subworkflows/local/downstream'
include { AGGREGATION } from '../subworkflows/local/aggregation'
include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
include { paramsSummaryMap } from 'plugin/nf-schema'
include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
@@ -83,6 +84,14 @@ workflow SPATIALVI {
)
ch_versions = ch_versions.mix(DOWNSTREAM.out.versions)

//
// SUBWORKFLOW: Sample aggregation (optional)
//
AGGREGATION (
DOWNSTREAM.out.svg_sdata
)
ch_versions = ch_versions.mix(AGGREGATION.out.versions)

//
// Collate and save software versions
//
@@ -94,7 +103,6 @@
newLine: true
).set { ch_collated_versions }


//
// MODULE: MultiQC
//
@@ -140,7 +148,8 @@
[]
)

emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
emit:
multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
versions = ch_versions // channel: [ path(versions.yml) ]

}