AlexsLemonade · allyhawkins · Jan 3, 2024 · Dec 20, 2023 · Dec 20, 2023 · Dec 20, 2023
diff --git a/.github/workflows/nextflow-stub-check.yaml b/.github/workflows/nextflow-stub-check.yaml
@@ -1,4 +1,3 @@
-
 name: Check nextflow stub
 
 on:
@@ -11,7 +10,6 @@ jobs:
   nf-stub-check:
     runs-on: ubuntu-22.04
     steps:
-
       - name: Checkout repo
         uses: actions/checkout@v3
 
@@ -30,14 +28,18 @@ jobs:
         with:
           args: nextflow -log celltype-ref-run.log run build-celltype-ref.nf -stub -profile stub -ansi-log false
 
+      - name: Check Nextflow workflow for merging objects
+        uses: docker://nextflow/nextflow:21.10.6
+        with:
+          args: nextflow -log merge-run.log run merge.nf -stub -profile stub -ansi-log false --project STUBP01
+
       - name: Join log files
         if: ${{ !cancelled() }}
-        run: cat stub-run.log checkpoint-run.log celltype-ref-run.log > nextflow-runs.log
+        run: cat stub-run.log checkpoint-run.log celltype-ref-run.log merge-run.log > nextflow-runs.log
 
       - name: Upload nextflow log
         if: ${{ !cancelled() }}
-        uses:  actions/upload-artifact@v3
+        uses: actions/upload-artifact@v3
         with:
           name: nextflow-log
           path: nextflow-runs.log
-
diff --git a/bin/merge_sces.R b/bin/merge_sces.R
@@ -34,6 +34,13 @@ option_list <- list(
     default = FALSE,
     help = "Keep any altExp present in the merged object."
   ),
+  make_option(
+    opt_str = c("--multiplexed"),
+    action = "store_true",
+    default = FALSE,
+    help = "Indicates if the provided SCE's contain multiplexed data.
+      If so, the sample metadata will not be added to the colData."
+  ),
   make_option(
     opt_str = c("-t", "--threads"),
     type = "integer",
@@ -118,6 +125,36 @@ merged_sce <- scpcaTools::merge_sce_list(
   include_altexp = opt$include_altexp
 )
 
+# add sample metadata to colData as long as there are no multiplexed data
+if (!opt$multiplexed) {
+  merged_sce <- scpcaTools::metadata_to_coldata(
+    merged_sce,
+    join_columns = "library_id"
+  )
+
+  # drop the "sample_metadata" entry from the merged object's metadata list
+  metadata(merged_sce) <- metadata(merged_sce)[names(metadata(merged_sce)) != "sample_metadata"]
+}
+
+# grab technology, EFO, and seq_unit for each library from metadata$library_metadata
+library_df <- names(input_sce_files) |>
+  purrr::map(\(library_id){
+    lib_meta <- metadata(merged_sce) |>
+      purrr::pluck("library_metadata", library_id)
+    data.frame(
+      library_id = library_id,
+      tech_version = lib_meta$tech_version,
+      assay_ontology_term_id = lib_meta$assay_ontology_term_id,
+      seq_unit = lib_meta$seq_unit
+    )
+  }) |>
+  dplyr::bind_rows()
+
+# join tech and EFO with colData; many cells (rows) share each library_df row
+colData(merged_sce) <- colData(merged_sce) |>
+  as.data.frame() |>
+  dplyr::left_join(library_df, by = c("library_id"), relationship = "many-to-one") |>
+  DataFrame(row.names = rownames(colData(merged_sce)))
 
 # HVG selection ----------------------------------------------------------------
 

diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
@@ -71,11 +71,13 @@ format_czi <- function(sce) {
     sce$library_id <- metadata(sce)$library_id
   }
 
-  # add sample metadata to colData sce
-  sce <- scpcaTools::metadata_to_coldata(
-    sce,
-    join_columns = "library_id"
-  )
+  # if sample metadata is present, add to colData
+  if ("sample_metadata" %in% names(metadata(sce))) {
+    sce <- scpcaTools::metadata_to_coldata(
+      sce,
+      join_columns = "library_id"
+    )
+  }
 
   # modify colData to be AnnData and CZI compliant
   coldata_df <- colData(sce) |>

diff --git a/merge.nf b/merge.nf
@@ -32,9 +32,9 @@ process merge_sce {
   label 'mem_32'
   publishDir "${params.results_dir}/merged/${merge_group_id}"
   input:
-    tuple val(merge_group_id), val(has_adt), val(library_ids), path(scpca_nf_file)
+    tuple val(merge_group_id), val(has_adt), val(multiplexed), val(library_ids), path(scpca_nf_file)
   output:
-    tuple val(merge_group_id), val(has_adt), path(merged_sce_file)
+    tuple path(merged_sce_file), val(merge_group_id), val(has_adt), val(multiplexed)
   script:
     input_library_ids = library_ids.join(',')
     input_sces = scpca_nf_file.join(',')
@@ -46,10 +46,11 @@ process merge_sce {
       --output_sce_file "${merged_sce_file}" \
       --n_hvg ${params.num_hvg} \
       ${has_adt ? "--include_altexp" : ''} \
+      ${multiplexed ? "--multiplexed" : '' } \
       --threads ${task.cpus}
     """
   stub:
-    merged_sce_file = "${merge_group}_merged.rds"
+    merged_sce_file = "${merge_group_id}_merged.rds"
     """
     touch ${merged_sce_file}
     """
@@ -59,15 +60,15 @@ process merge_sce {
 // create merge report
 process generate_merge_report {
   container params.SCPCATOOLS_CONTAINER
-  publishDir "${params.results_dir}/merged/${merge_group}"
+  publishDir "${params.results_dir}/merged/${merge_group_id}"
   label 'mem_16'
   input:
-    tuple val(merge_group_id), val(has_adt), path(merged_sce_file)
+    tuple path(merged_sce_file), val(merge_group_id), val(has_adt), val(multiplexed)
     path(report_template)
   output:
     path(merge_report)
   script:
-    merge_report = "${merge_group}_summary_report.html"
+    merge_report = "${merge_group_id}_summary_report.html"
     """
     Rscript -e "rmarkdown::render( \
       '${report_template}', \
@@ -87,15 +88,15 @@ process generate_merge_report {
 process export_anndata{
     container params.SCPCATOOLS_CONTAINER
     label 'mem_32'
-    tag "${merge_group}"
-    publishDir "${params.results_dir}/merged/${merge_group}", mode: 'copy'
+    tag "${merge_group_id}"
+    publishDir "${params.results_dir}/merged/${merge_group_id}", mode: 'copy'
     input:
-      tuple val(merge_group), val(has_adt), path(merged_sce_file)
+      tuple path(merged_sce_file), val(merge_group_id), val(has_adt)
     output:
-      tuple val(merge_group), path("${merge_group}_merged_*.hdf5")
+      tuple val(merge_group_id), path("${merge_group_id}_merged_*.hdf5")
     script:
-      rna_hdf5_file = "${merge_group}_merged_rna.hdf5"
-      feature_hdf5_file = "${merge_group}_merged_adt.hdf5"
+      rna_hdf5_file = "${merge_group_id}_merged_rna.hdf5"
+      feature_hdf5_file = "${merge_group_id}_merged_adt.hdf5"
       """
       sce_to_anndata.R \
         --input_sce_file ${merged_sce_file} \
@@ -108,8 +109,8 @@ process export_anndata{
       ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_hdf5_file}" : ''}
       """
     stub:
-      rna_hdf5_file = "${merge_group}_merged_rna.hdf5"
-      feature_hdf5_file = "${merge_group}_merged_adt.hdf5"
+      rna_hdf5_file = "${merge_group_id}_merged_rna.hdf5"
+      feature_hdf5_file = "${merge_group_id}_merged_adt.hdf5"
       """
       touch ${rna_hdf5_file}
       ${has_adt ? "touch ${feature_hdf5_file}" : ''}
@@ -133,14 +134,19 @@ workflow {
       .collect{it.scpca_project_id}
       .unique()
 
+    multiplex_projects = libraries_ch
+      .filter{it.technology.startsWith('cellhash')}
+      .collect{it.scpca_project_id}
+      .unique() // NOTE(review): if this is a channel rather than a list, `project_id in multiplex_projects` may not test membership - TODO confirm
+
     grouped_libraries_ch = libraries_ch
       // only include single-cell/single-nuclei which ensures we don't try to merge libraries from spatial or bulk data
       .filter{it.seq_unit in ['cell', 'nucleus']}
       // create tuple of [project id, library_id, processed_sce_file]
       .map{[
         it.scpca_project_id,
         it.scpca_library_id,
-        "${params.results_dir}/${it.scpca_project_id}/${it.scpca_sample_id}/${it.scpca_library_id}_processed.rds"
+        file("${params.results_dir}/${it.scpca_project_id}/${it.scpca_sample_id}/${it.scpca_library_id}_processed.rds")
       ]}
       // only include libraries that have been processed through scpca-nf
       .filter{file(it[2]).exists()}
@@ -152,6 +158,7 @@ workflow {
       .map{project_id, library_id_list, sce_file_list -> tuple(
         project_id,
         project_id in adt_projects, // determines if altExp should be included in the merged object
+        project_id in multiplex_projects, // determines if sample metadata should be added to colData and to skip anndata
         library_id_list,
         sce_file_list
       )}
@@ -162,5 +169,9 @@ workflow {
     generate_merge_report(merge_sce.out, file(merge_template))
 
     // export merged objects to AnnData
-    export_anndata(merge_sce.out)
+    anndata_ch = merge_sce.out
+      .filter{!it[3]} // remove multiplexed samples before export
+      .map{it.take(3)} // trim each tuple to [merged_sce_file, merge_group_id, has_adt]; channel-level take(3) would instead keep only the first 3 emitted tuples
+
+    export_anndata(anndata_ch)
 }
diff --git a/test/output/results/STUBP01/STUBS01/STUBL01_processed.rds b/test/output/results/STUBP01/STUBS01/STUBL01_processed.rds
diff --git a/test/output/results/STUBP01/STUBS16/STUBL16_processed.rds b/test/output/results/STUBP01/STUBS16/STUBL16_processed.rds
diff --git a/test/stub-run-metadata.tsv b/test/stub-run-metadata.tsv
@@ -14,3 +14,4 @@ STUBR12	STUBL11	STUBS12,STUBS13	STUBP06	10Xv3.1	cell	stub	test/runs/STUBR12	NA	N
 STUBR13	STUBL11	STUBS12,STUBS13	STUBP06	cellhash_10Xv3	cell	stub	test/runs/STUBR13	test/references/barcodes/cellhash.stub.tsv	2[1-15]	NA	NA	NA
 STUBR14	STUBL12	STUBS14	STUBP07	10Xv3.1	cell	stub	test/runs/STUBR14	NA	NA	NA	NA	test/celltypes/STUBL12-submitter-celltypes.tsv
 STUBR15	STUBL13	STUBS15	STUBP08	10Xv3.1	cell	stub	test/runs/STUBR15	NA	NA	NA	NA	NA
+STUBR16	STUBL16	STUBS16	STUBP01	10Xv2	cell	stub	test/runs/STUBR16	NA	NA	NA	NA	NA