From c79530eff1f9b8d2bc42c3097ba058a3797bbffc Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Sat, 15 Feb 2025 17:02:11 -0800
Subject: [PATCH 1/7] Support tarballs as FASTQ input

---
 workflows/cellranger/cellranger_count.wdl | 44 +++++++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl
index a92dc3ec..7269d4b2 100644
--- a/workflows/cellranger/cellranger_count.wdl
+++ b/workflows/cellranger/cellranger_count.wdl
@@ -140,6 +140,22 @@ task run_cellranger_count {
         from subprocess import check_call, CalledProcessError, DEVNULL, STDOUT
         from packaging import version
 
+        def rename_fastq_file(path, sample_name):
+            folder = os.path.dirname(path)
+            filename = os.path.basename(path)
+            pattern = r"(_S\d+_L\d+_R\d+_001\.fastq\.gz)"
+            match = re.search(pattern, filename)
+            if match:
+                idx = match.start()
+                cur_name = filename[:idx]
+                suffix = filename[idx:]
+                if cur_name != sample_name:
+                    call_args = ["mv", path, folder+"/"+sample_name+suffix]
+                    print(' '.join(call_args))
+                    check_call(call_args)
+            else:
+                raise Exception(path + " does not follow Illumina naming convention!")
+
         samples = data_types = fbfs = None
         fastqs_dirs = []
 
@@ -188,9 +204,31 @@ task run_cellranger_count {
                     except CalledProcessError:
                         if not os.path.exists(target):
                             os.mkdir(target)
-                        call_args = ['strato', 'cp', '-m', directory + '/' + samples[i] + '_S*_L*_*_001.fastq.gz' , target]
-                        print(' '.join(call_args))
-                        check_call(call_args)
+                        try:
+                            call_args = ['strato', 'cp', '-m', directory + '/' + samples[i] + '_S*_L*_*_001.fastq.gz' , target]
+                            print(' '.join(call_args))
+                            check_call(call_args, stdout=DEVNULL, stderr=STDOUT)
+                        except CalledProcessError:
+                            # Localize tar file
+                            call_args = ['strato', 'cp', '-m', directory + '/*.tar', target]
+                            print(' '.join(call_args))
+                            check_call(call_args)
+
+                            # Untar
+                            tar_file = glob.glob(target+"/*.tar")[0]
+                            call_args = ["tar", "--strip-components=1", "-xf", tar_file, "-C", target]
+                            print(' '.join(call_args))
+                            check_call(call_args)
+
+                            # Remove tar file
+                            call_args = ["rm", tar_file]
+                            print(' '.join(call_args))
+                            check_call(call_args)
+
+                            # Rename FASTQ files if needed
+                            fastq_files = glob.glob(target+"/*.fastq.gz")
+                            for fastq_f in fastq_files:
+                                rename_fastq_file(fastq_f, samples[i])
                     feature_type = ''
                     if data_types[i] == 'rna':
                         feature_type = 'Gene Expression'

From 001a0f91610852d105fd7bdd78441ea222a7e605 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Sat, 15 Feb 2025 17:55:06 -0800
Subject: [PATCH 2/7] wrap FASTQ localization as a function

---
 workflows/cellranger/cellranger_count.wdl | 88 +++++++++++------------
 1 file changed, 40 insertions(+), 48 deletions(-)

diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl
index 7269d4b2..e3f3cbea 100644
--- a/workflows/cellranger/cellranger_count.wdl
+++ b/workflows/cellranger/cellranger_count.wdl
@@ -156,6 +156,43 @@ task run_cellranger_count {
             else:
                 raise Exception(path + " does not follow Illumina naming convention!")
 
+        def localize_fastqs(directory, target, sample_name):
+            try:
+                call_args = ['strato', 'exists', directory + '/' + sample_name + '/']
+                print(' '.join(call_args))
+                check_call(call_args, stdout=DEVNULL, stderr=STDOUT)
+                call_args = ['strato', 'sync', '-m', directory + '/' + sample_name, target]
+                print(' '.join(call_args))
+                check_call(call_args)
+            except CalledProcessError:
+                if not os.path.exists(target):
+                    os.mkdir(target)
+                try:
+                    call_args = ['strato', 'cp', '-m', directory + '/' + sample_name + '_S*_L*_*_001.fastq.gz' , target]
+                    print(' '.join(call_args))
+                    check_call(call_args, stdout=DEVNULL, stderr=STDOUT)
+                except CalledProcessError:
+                    # Localize tar file
+                    call_args = ['strato', 'cp', '-m', directory + '/' + "*.tar", target]
+                    print(' '.join(call_args))
+                    check_call(call_args)
+
+                    # Untar
+                    tar_file = glob.glob(target+"/*.tar")[0]
+                    call_args = ["tar", "--strip-components=1", "-xf", tar_file, "-C", target]
+                    print(' '.join(call_args))
+                    check_call(call_args)
+
+                    # Remove tar file
+                    call_args = ["rm", tar_file]
+                    print(' '.join(call_args))
+                    check_call(call_args)
+
+                    # Rename FASTQ files if needed
+                    fastq_files = glob.glob(target+"/*.fastq.gz")
+                    for fastq_f in fastq_files:
+                        rename_fastq_file(fastq_f, sample_name)
+
         samples = data_types = fbfs = None
         fastqs_dirs = []
 
@@ -194,41 +231,8 @@ task run_cellranger_count {
                 for i, directory in enumerate('~{input_fastqs_directories}'.split(',')):
                     directory = re.sub('/+$', '', directory) # remove trailing slashes
                     target = samples[i] + "_" + str(i)
-                    try:
-                        call_args = ['strato', 'exists', directory + '/' + samples[i] + '/']
-                        print(' '.join(call_args))
-                        check_call(call_args, stdout=DEVNULL, stderr=STDOUT)
-                        call_args = ['strato', 'sync', '-m', directory + '/' + samples[i], target]
-                        print(' '.join(call_args))
-                        check_call(call_args)
-                    except CalledProcessError:
-                        if not os.path.exists(target):
-                            os.mkdir(target)
-                        try:
-                            call_args = ['strato', 'cp', '-m', directory + '/' + samples[i] + '_S*_L*_*_001.fastq.gz' , target]
-                            print(' '.join(call_args))
-                            check_call(call_args, stdout=DEVNULL, stderr=STDOUT)
-                        except CalledProcessError:
-                            # Localize tar file
-                            call_args = ['strato', 'cp', '-m', directory + '/*.tar', target]
-                            print(' '.join(call_args))
-                            check_call(call_args)
-
-                            # Untar
-                            tar_file = glob.glob(target+"/*.tar")[0]
-                            call_args = ["tar", "--strip-components=1", "-xf", tar_file, "-C", target]
-                            print(' '.join(call_args))
-                            check_call(call_args)
-
-                            # Remove tar file
-                            call_args = ["rm", tar_file]
-                            print(' '.join(call_args))
-                            check_call(call_args)
-
-                            # Rename FASTQ files if needed
-                            fastq_files = glob.glob(target+"/*.fastq.gz")
-                            for fastq_f in fastq_files:
-                                rename_fastq_file(fastq_f, samples[i])
+                    localize_fastqs(directory, target, samples[i])
+
                     feature_type = ''
                     if data_types[i] == 'rna':
                         feature_type = 'Gene Expression'
@@ -246,19 +250,7 @@ task run_cellranger_count {
             for i, directory in enumerate('~{input_fastqs_directories}'.split(',')):
                 directory = re.sub('/+$', '', directory) # remove trailing slashes
                 target = '~{sample_id}_' + str(i)
-                try:
-                    call_args = ['strato', 'exists', directory + '/~{sample_id}/']
-                    print(' '.join(call_args))
-                    check_call(call_args, stdout=DEVNULL, stderr=STDOUT)
-                    call_args = ['strato', 'sync', '-m', directory + '/~{sample_id}', target]
-                    print(' '.join(call_args))
-                    check_call(call_args)
-                except CalledProcessError:
-                    if not os.path.exists(target):
-                        os.mkdir(target)
-                    call_args = ['strato', 'cp', '-m', directory + '/~{sample_id}' + '_S*_L*_*_001.fastq.gz' , target]
-                    print(' '.join(call_args))
-                    check_call(call_args)
+                localize_fastqs(directory, target, '~{input_samples}')
                 fastqs_dirs.append(target)
 
         mem_size = re.findall(r"\d+", "~{memory}")[0]

From 7ad3bbd3cfe142ea878f5a673053695c321d94fc Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Sat, 15 Feb 2025 18:07:34 -0800
Subject: [PATCH 3/7] Fix bug: pass sample_id (not input_samples) to localize_fastqs

---
 workflows/cellranger/cellranger_count.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl
index e3f3cbea..856dea7d 100644
--- a/workflows/cellranger/cellranger_count.wdl
+++ b/workflows/cellranger/cellranger_count.wdl
@@ -250,7 +250,7 @@ task run_cellranger_count {
             for i, directory in enumerate('~{input_fastqs_directories}'.split(',')):
                 directory = re.sub('/+$', '', directory) # remove trailing slashes
                 target = '~{sample_id}_' + str(i)
-                localize_fastqs(directory, target, '~{input_samples}')
+                localize_fastqs(directory, target, '~{sample_id}')
                 fastqs_dirs.append(target)
 
         mem_size = re.findall(r"\d+", "~{memory}")[0]

From cfa1118df54c3ba4584ffd91d9ce356596601efe Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Sat, 15 Feb 2025 18:33:12 -0800
Subject: [PATCH 4/7] add missing import

---
 workflows/cellranger/cellranger_count.wdl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl
index 856dea7d..bf8678f1 100644
--- a/workflows/cellranger/cellranger_count.wdl
+++ b/workflows/cellranger/cellranger_count.wdl
@@ -137,6 +137,7 @@ task run_cellranger_count {
         import re
         import os
         import sys
+        import glob
         from subprocess import check_call, CalledProcessError, DEVNULL, STDOUT
         from packaging import version
 

From a573b70f7fcbeb9c2a1d07c27852115beb799af1 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Sat, 15 Feb 2025 19:21:26 -0800
Subject: [PATCH 5/7] accept I1/I2 as well

---
 workflows/cellranger/cellranger_count.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl
index bf8678f1..3b9aa7f3 100644
--- a/workflows/cellranger/cellranger_count.wdl
+++ b/workflows/cellranger/cellranger_count.wdl
@@ -144,7 +144,7 @@ task run_cellranger_count {
         def rename_fastq_file(path, sample_name):
             folder = os.path.dirname(path)
             filename = os.path.basename(path)
-            pattern = r"(_S\d+_L\d+_R\d+_001\.fastq\.gz)"
+            pattern = r"(_S\d+_L\d+_[RI]\d+_001\.fastq\.gz)"
             match = re.search(pattern, filename)
             if match:
                 idx = match.start()

From d2cdfc10984cc9c7d4ed6c0ba70072208e51340b Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Mon, 17 Feb 2025 02:02:10 -0800
Subject: [PATCH 6/7] remove mkfastq part from Cellranger workflow

---
 docs/cellranger/feature_barcoding.rst        |  10 +-
 docs/cellranger/general_steps.rst            |  27 +-
 docs/cellranger/index.rst                    |  12 +-
 docs/cellranger/sc_atac.rst                  |  24 -
 docs/cellranger/sc_sn_rnaseq.rst             |  96 +--
 docs/cellranger/sc_vdj.rst                   |  16 -
 docs/spaceranger.rst                         |   2 +-
 workflows/cellranger/cellranger_workflow.wdl | 687 ++++++-------------
 8 files changed, 250 insertions(+), 624 deletions(-)

diff --git a/docs/cellranger/feature_barcoding.rst b/docs/cellranger/feature_barcoding.rst
index 1ff27f24..6e30ccd3 100644
--- a/docs/cellranger/feature_barcoding.rst
+++ b/docs/cellranger/feature_barcoding.rst
@@ -193,11 +193,11 @@ For feature barcoding data, ``cellranger_workflow`` takes Illumina outputs as in
 		  - 0.1
 		  - 0.1
 		* - cellranger_version
-		  - cellranger version, could be: 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0, 6.1.2, 6.1.1, 6.0.2, 6.0.1, 6.0.0, 5.0.1, 5.0.0
+		  - cellranger version, could be: 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0
 		  - "9.0.0"
 		  - "9.0.0"
 		* - cumulus_feature_barcoding_version
-		  - Cumulus_feature_barcoding version for extracting feature barcode matrix. Version available: 0.11.4, 0.11.3, 0.11.2, 0.11.1, 0.11.0, 0.10.0, 0.9.0, 0.8.0, 0.7.0, 0.6.0, 0.5.0, 0.4.0, 0.3.0, 0.2.0.
+		  - Cumulus_feature_barcoding version for extracting feature barcode matrix.
 		  - "0.11.4"
 		  - "0.11.4"
 		* - docker_registry
@@ -208,12 +208,6 @@ For feature barcoding data, ``cellranger_workflow`` takes Illumina outputs as in
 		  	- "cumulusprod" for backup images on Docker Hub.
 		  - "quay.io/cumulus"
 		  - "quay.io/cumulus"
-		* - mkfastq_docker_registry
-		  - Docker registry to use for ``cellranger mkfastq``.
-		    Default is the registry to which only Broad users have access.
-		    See :ref:`bcl2fastq-docker` for making your own registry.
-		  - "gcr.io/broad-cumulus"
-		  - "gcr.io/broad-cumulus"
 		* - acronym_file
 		  - | The link/path of an index file in TSV format for fetching preset genome references, chemistry whitelists, etc. by their names.
 		    | Set an GS URI if *backend* is ``gcp``; an S3 URI for ``aws`` backend; an absolute file path for ``local`` backend.
diff --git a/docs/cellranger/general_steps.rst b/docs/cellranger/general_steps.rst
index 933aa5c3..b6d456b3 100644
--- a/docs/cellranger/general_steps.rst
+++ b/docs/cellranger/general_steps.rst
@@ -66,12 +66,6 @@ Alternatively, users can submit jobs through command line interface (CLI) using
 		    | If starts with FASTQ files, this should be Google bucket URLs of uploaded FASTQ folders.
 		    | The FASTQ folders should contain one subfolder for each sample in the flowcell with the sample name as the subfolder name.
 		    | Each subfolder contains FASTQ files for that sample.
-		* - **Lane**
-		  -
-		    | Tells which lanes the sample was pooled into.
-		    | Can be either single lane (e.g. 8) or a range (e.g. 7-8) or all (e.g. \*).
-		* - **Index**
-		  - Sample index (e.g. SI-GA-A12).
 		* - Chemistry
 		  - Describes the 10x chemistry used for the sample. This column is optional.
 		* - DataType
@@ -108,15 +102,15 @@ Alternatively, users can submit jobs through command line interface (CLI) using
 
 	Example::
 
-		Sample,Reference,Flowcell,Lane,Index,Chemistry,DataType
-		sample_1,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,1-2,SI-GA-A8,threeprime,rna
-		sample_2,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,3-4,SI-GA-B8,SC3Pv3,rna
-		sample_3,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,5-6,SI-GA-C8,fiveprime,rna
-		sample_4,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,7-8,SI-GA-D8,fiveprime,rna
-		sample_1,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,1-2,SI-GA-A8,threeprime,rna
-		sample_2,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,3-4,SI-GA-B8,SC3Pv3,rna
-		sample_3,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,5-6,SI-GA-C8,fiveprime,rna
-		sample_4,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,7-8,SI-GA-D8,fiveprime,rna
+		Sample,Reference,Flowcell,Chemistry,DataType
+		sample_1,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,threeprime,rna
+		sample_2,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,SC3Pv3,rna
+		sample_3,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,fiveprime,rna
+		sample_4,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,fiveprime,rna
+		sample_1,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,threeprime,rna
+		sample_2,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,SC3Pv3,rna
+		sample_3,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,fiveprime,rna
+		sample_4,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,fiveprime,rna
 
 	**3.2 Upload your sample sheet to the workspace bucket:**
 
@@ -183,9 +177,6 @@ Alternatively, users can submit jobs through command line interface (CLI) using
 		* - Name
 		  - Type
 		  - Description
-		* - fastq_outputs
-		  - Array[Array[String]?]
-		  - The top-level array contains results (as arrays) for different data modalities. The inner-level array contains cloud locations of FASTQ files, one url per flowcell.
 		* - count_outputs
 		  - Array[Array[String]?]
 		  - The top-level array contains results (as arrays) for different data modalities. The inner-level array contains cloud locations of count matrices, one url per sample.
diff --git a/docs/cellranger/index.rst b/docs/cellranger/index.rst
index fddb0166..7e94049e 100644
--- a/docs/cellranger/index.rst
+++ b/docs/cellranger/index.rst
@@ -24,17 +24,17 @@ Feature barcoding assays (cell & nucleus hashing, CITE-seq and Perturb-seq)
 
 ---------------------------------
 
-Single-cell ATAC-seq
-^^^^^^^^^^^^^^^^^^^^
+Single-cell immune profiling
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. include:: sc_atac.rst
+.. include:: sc_vdj.rst
 
 ---------------------------------
 
-Single-cell immune profiling
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Single-cell ATAC-seq
+^^^^^^^^^^^^^^^^^^^^
 
-.. include:: sc_vdj.rst
+.. include:: sc_atac.rst
 
 ---------------------------------
 
diff --git a/docs/cellranger/sc_atac.rst b/docs/cellranger/sc_atac.rst
index 15880efb..5694cb1d 100644
--- a/docs/cellranger/sc_atac.rst
+++ b/docs/cellranger/sc_atac.rst
@@ -19,30 +19,6 @@ Sample sheet
 		  - Mouse mm10, cellranger-arc/atac reference 2.0.0
 		* - **GRCh38_and_mm10-2020-A_atac_v2.0.0**
 		  - Human GRCh38 and mouse mm10, cellranger-atac reference 2.0.0
-		* - **GRCh38_atac_v1.2.0**
-		  - Human GRCh38, cellranger-atac reference 1.2.0
-		* - **mm10_atac_v1.2.0**
-		  - Mouse mm10, cellranger-atac reference 1.2.0
-		* - **hg19_atac_v1.2.0**
-		  - Human hg19, cellranger-atac reference 1.2.0
-		* - **b37_atac_v1.2.0**
-		  - Human b37 build, cellranger-atac reference 1.2.0
-		* - **GRCh38_and_mm10_atac_v1.2.0**
-		  - Human GRCh38 and mouse mm10, cellranger-atac reference 1.2.0
-		* - **hg19_and_mm10_atac_v1.2.0**
-		  - Human hg19 and mouse mm10, cellranger-atac reference 1.2.0
-		* - **GRCh38_atac_v1.1.0**
-		  - Human GRCh38, cellranger-atac reference 1.1.0
-		* - **mm10_atac_v1.1.0**
-		  - Mouse mm10, cellranger-atac reference 1.1.0
-		* - **hg19_atac_v1.1.0**
-		  - Human hg19, cellranger-atac reference 1.1.0
-		* - **b37_atac_v1.1.0**
-		  - Human b37 build, cellranger-atac reference 1.1.0
-		* - **GRCh38_and_mm10_atac_v1.1.0**
-		  - Human GRCh38 and mouse mm10, cellranger-atac reference 1.1.0
-		* - **hg19_and_mm10_atac_v1.1.0**
-		  - Human hg19 and mouse mm10, cellranger-atac reference 1.1.0
 
 #. **Index** column.
 
diff --git a/docs/cellranger/sc_sn_rnaseq.rst b/docs/cellranger/sc_sn_rnaseq.rst
index 0ed7ac73..510e997f 100644
--- a/docs/cellranger/sc_sn_rnaseq.rst
+++ b/docs/cellranger/sc_sn_rnaseq.rst
@@ -25,49 +25,6 @@ Sample sheet
 		  - Mouse mm10 (GENCODE vM23/Ensembl 98)
 		* - **GRCh38_and_mm10-2020-A**
 		  - Human GRCh38 (GENCODE v32/Ensembl 98) and mouse mm10 (GENCODE vM23/Ensembl 98)
-		* - **GRCh38_v3.0.0**
-		  - Human GRCh38, cellranger reference 3.0.0, Ensembl v93 gene annotation
-		* - **hg19_v3.0.0**
-		  - Human hg19, cellranger reference 3.0.0, Ensembl v87 gene annotation
-		* - **mm10_v3.0.0**
-		  - Mouse mm10, cellranger reference 3.0.0, Ensembl v93 gene annotation
-		* - **GRCh38_and_mm10_v3.1.0**
-		  - Human (GRCh38) and mouse (mm10), cellranger references 3.1.0, Ensembl v93 gene annotations for both human and mouse
-		* - **hg19_and_mm10_v3.0.0**
-		  - Human (hg19) and mouse (mm10), cellranger reference 3.0.0, Ensembl v93 gene annotations for both human and mouse
-		* - **GRCh38_v1.2.0** or **GRCh38**
-		  - Human GRCh38, cellranger reference 1.2.0, Ensembl v84 gene annotation
-		* - **hg19_v1.2.0** or **hg19**
-		  - Human hg19, cellranger reference 1.2.0, Ensembl v82 gene annotation
-		* - **mm10_v1.2.0** or **mm10**
-		  - Mouse mm10, cellranger reference 1.2.0, Ensembl v84 gene annotation
-		* - **GRCh38_and_mm10_v1.2.0** or **GRCh38_and_mm10**
-		  - Human and mouse, built from GRCh38 and mm10 cellranger references, Ensembl v84 gene annotations are used
-		* - **GRCh38_and_SARSCoV2**
-		  - Human GRCh38 and SARS-COV-2 RNA genome, cellranger reference 3.0.0, generated by `Carly Ziegler`_. The SARS-COV-2 viral sequence and gtf are as described in `[Kim et al. Cell 2020]`_ (https://github.com/hyeshik/sars-cov-2-transcriptome, BetaCov/South Korea/KCDC03/2020 based on NC_045512.2). The GTF was edited to include only CDS regions, and regions were added to describe the 5' UTR ("SARSCoV2_5prime"), the 3' UTR ("SARSCoV2_3prime"), and reads aligning to anywhere within the Negative Strand("SARSCoV2_NegStrand"). Additionally, trailing A's at the 3' end of the virus were excluded from the SARSCoV2 fasta, as these were found to drive spurious viral alignment in pre-COVID19 samples.
-
-	Pre-built snRNA-seq references are summarized below.
-
-	.. list-table::
-		:widths: 5 20
-		:header-rows: 1
-
-		* - Keyword
-		  - Description
-		* - **GRCh38_premrna_v3.0.0**
-		  - Human, introns included, built from GRCh38 cellranger reference 3.0.0, Ensembl v93 gene annotation, treating annotated transcripts as exons
-		* - **GRCh38_premrna_v1.2.0** or **GRCh38_premrna**
-		  - Human, introns included, built from GRCh38 cellranger reference 1.2.0, Ensembl v84 gene annotation, treating annotated transcripts as exons
-		* - **mm10_premrna_v1.2.0** or **mm10_premrna**
-		  - Mouse, introns included, built from mm10 cellranger reference 1.2.0, Ensembl v84 gene annotation, treating annotated transcripts as exons
-		* - **GRCh38_premrna_and_mm10_premrna_v1.2.0** or **GRCh38_premrna_and_mm10_premrna**
-		  - Human and mouse, introns included, built from GRCh38_premrna_v1.2.0 and mm10_premrna_v1.2.0
-		* - **GRCh38_premrna_and_SARSCoV2**
-		  - Human, introns included, built from GRCh38_premrna_v3.0.0, and SARS-COV-2 RNA genome. This reference was generated by `Carly Ziegler`_. The SARS-COV-2 RNA genome is from `[Kim et al. Cell 2020]`_ (https://github.com/hyeshik/sars-cov-2-transcriptome, BetaCov/South Korea/KCDC03/2020 based on NC_045512.2). Please see the description of *GRCh38_and_SARSCoV2* above for details.
-
-#. **Index** column.
-
-	Put `10x single cell RNA-seq sample index set names`_ (e.g. SI-GA-A12) here.
 
 #. *Chemistry* column.
 
@@ -85,22 +42,9 @@ Sample sheet
 		  - Single Cell 3′
 		* - **fiveprime**
 		  - Single Cell 5′
-		* - **SC3Pv1**
-		  - Single Cell 3′ v1
-		* - **SC3Pv2**
-		  - Single Cell 3′ v2
-		* - **SC3Pv3**
-		  - Single Cell 3′ v3. You should set cellranger version input parameter to >= 3.0.2
-		* - **SC3Pv4**
-		  - Single Cell 3' v4. **Notice:** This is GEM-X chemistry, and only works for Cell Ranger v8.0.0+
-		* - **SC5P-PE**
-		  - Single Cell 5′ paired-end (both R1 and R2 are used for alignment)
-		* - **SC5P-PE-v3**
-		  - Single Cell 5' paired-end v3 (both R1 and R2 are used for alignment). **Notice:** This is GEM-X chemistry, and only works for Cell Ranger v8.0.0+
-		* - **SC5P-R2**
-		  - Single Cell 5′ R2-only (where only R2 is used for alignment)
-		* - **SC5P-R2-v3**
-		  - Single Cell 5' R2-only v3 (where only R2 is used for alignment). **Notice:** This is GEM-X chemistry, and only works for Cell Rangrer v8.0.0+
+
+#. *Flowcell* column.
+
 
 #. *DataType* column.
 
@@ -140,38 +84,6 @@ For sc/snRNA-seq data, ``cellranger_workflow`` takes Illumina outputs as input a
 		  - Output directory
 		  - "gs://fc-e0000000-0000-0000-0000-000000000000/cellranger_output"
 		  - Results are written under directory *output_directory* and will overwrite any existing files at this location.
-		* - run_mkfastq
-		  - If you want to run ``cellranger mkfastq``
-		  - true
-		  - true
-		* - run_count
-		  - If you want to run ``cellranger count``
-		  - true
-		  - true
-		* - delete_input_bcl_directory
-		  - If delete BCL directories after demux. If false, you should delete this folder yourself so as to not incur storage charges
-		  - false
-		  - false
-		* - mkfastq_barcode_mismatches
-		  - Number of mismatches allowed in matching barcode indices (bcl2fastq2 default is 1)
-		  - 0
-		  -
-		* - mkfastq_force_single_index
-		  - If 10x-supplied i7/i5 paired indices are specified, but the flowcell was run with only one sample index, allow the demultiplex to proceed using the i7 half of the sample index pair
-		  - false
-		  - false
-		* - mkfastq_filter_single_index
-		  - Only demultiplex samples identified by an i7-only sample index, ignoring dual-indexed samples. Dual-indexed samples will not be demultiplexed
-		  - false
-		  - false
-		* - mkfastq_use_bases_mask
-		  - Override the read lengths as specified in *RunInfo.xml*
-		  - "Y28n*,I8n*,N10,Y90n*"
-		  -
-		* - mkfastq_delete_undetermined
-		  - Delete undetermined FASTQ files generated by bcl2fastq2
-		  - true
-		  - false
 		* - force_cells
 		  - Force pipeline to use this number of cells, bypassing the cell detection algorithm, mutually exclusive with expect_cells
 		  - 6000
@@ -193,7 +105,7 @@ For sc/snRNA-seq data, ``cellranger_workflow`` takes Illumina outputs as input a
 		  - false
 		  - false
 		* - cellranger_version
-		  - cellranger version, could be: 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0, 6.1.2, 6.1.1, 6.0.2, 6.0.1, 6.0.0, 5.0.1, 5.0.0
+		  - cellranger version, could be: 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0
 		  - "9.0.0"
 		  - "9.0.0"
 		* - config_version
diff --git a/docs/cellranger/sc_vdj.rst b/docs/cellranger/sc_vdj.rst
index d5656a08..4fd348b5 100644
--- a/docs/cellranger/sc_vdj.rst
+++ b/docs/cellranger/sc_vdj.rst
@@ -19,22 +19,6 @@ Sample sheet
 		  - Human GRCh38 V(D)J sequences, cellranger reference 7.0.0, annotation built from Ensembl *Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf*
 		* - **GRCm38_vdj_v7.0.0**
 		  - Mouse GRCm38 V(D)J sequences, cellranger reference 7.0.0, annotation built from Ensembl *Mus_musculus.GRCm38.94.gtf*
-		* - **GRCh38_vdj_v5.0.0**
-		  - Human GRCh38 V(D)J sequences, cellranger reference 5.0.0, annotation built from Ensembl *Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf*
-		* - **GRCm38_vdj_v5.0.0**
-		  - Mouse GRCm38 V(D)J sequences, cellranger reference 5.0.0, annotation built from Ensembl *Mus_musculus.GRCm38.94.gtf*
-		* - **GRCh38_vdj_v4.0.0**
-		  - Human GRCh38 V(D)J sequences, cellranger reference 4.0.0, annotation built from Ensembl *Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf*
-		* - **GRCm38_vdj_v4.0.0**
-		  - Mouse GRCm38 V(D)J sequences, cellranger reference 4.0.0, annotation built from Ensembl *Mus_musculus.GRCm38.94.gtf*
-		* - **GRCh38_vdj_v3.1.0**
-		  - Human GRCh38 V(D)J sequences, cellranger reference 3.1.0, annotation built from Ensembl *Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf*
-		* - **GRCm38_vdj_v3.1.0**
-		  - Mouse GRCm38 V(D)J sequences, cellranger reference 3.1.0, annotation built from Ensembl *Mus_musculus.GRCm38.94.gtf*
-		* - **GRCh38_vdj_v2.0.0** or **GRCh38_vdj**
-		  - Human GRCh38 V(D)J sequences, cellranger reference 2.0.0, annotation built from Ensembl *Homo_sapiens.GRCh38.87.chr_patch_hapl_scaff.gtf* and *vdj_GRCh38_alts_ensembl_10x_genes-2.0.0.gtf*
-		* - **GRCm38_vdj_v2.2.0** or **GRCm38_vdj**
-		  - Mouse GRCm38 V(D)J sequences, cellranger reference 2.2.0, annotation built from Ensembl *Mus_musculus.GRCm38.90.chr_patch_hapl_scaff.gtf*
 
 #. **Index** column.
 
diff --git a/docs/spaceranger.rst b/docs/spaceranger.rst
index 7827f6a6..d9975fa7 100644
--- a/docs/spaceranger.rst
+++ b/docs/spaceranger.rst
@@ -276,7 +276,7 @@ For spatial data, ``spaceranger_workflow`` takes Illumina outputs and related im
 		  - 50
 		  -
 		* - spaceranger_version
-		  - spaceranger version, could be: 3.1.2, 3.0.1, 3.0.0, 2.1.1, 2.0.1, 2.0.0, 1.3.1, 1.3.0
+		  - spaceranger version, could be: 3.1.2, 3.0.1, 3.0.0
 		  - "3.1.2"
 		  - "3.1.2"
 		* - config_version
diff --git a/workflows/cellranger/cellranger_workflow.wdl b/workflows/cellranger/cellranger_workflow.wdl
index 6dbaa614..0f0b5ec0 100644
--- a/workflows/cellranger/cellranger_workflow.wdl
+++ b/workflows/cellranger/cellranger_workflow.wdl
@@ -1,43 +1,19 @@
 version 1.0
 
-import "cellranger_mkfastq.wdl" as crm
 import "cellranger_count.wdl" as crc
 import "cellranger_multi.wdl" as crmulti
 import "cellranger_vdj.wdl" as crv
 import "../cumulus/cumulus_adt.wdl" as ca
-import "cellranger_atac_mkfastq.wdl" as cram
 import "cellranger_atac_count.wdl" as crac
-import "cellranger_arc_mkfastq.wdl" as crarm
 import "cellranger_arc_count.wdl" as crarc
 
 workflow cellranger_workflow {
     input {
-        # 5 - 9 columns (Sample, Reference, Flowcell, Lane, Index, [Chemistry, DataType, FeatureBarcodeFile, Link]). gs URL
+        # Columns: Sample, Reference, Flowcell, [Chemistry, DataType, FeatureBarcodeFile, Link]).
         File input_csv_file
-        # Output directory, gs URL
+        # Output directory, AWS or GCP URI
         String output_directory
 
-        # If run mkfastq
-        Boolean run_mkfastq = true
-        # If run count
-        Boolean run_count = true
-
-        # for mkfastq
-
-        # Whether to delete input_bcl_directory, default: false
-        Boolean delete_input_bcl_directory = false
-        # Number of allowed mismatches per index
-        Int? mkfastq_barcode_mismatches
-        # If 10x-supplied i7/i5 paired indices are specified, but the flowcell was run with only one sample index, allow the demultiplex to proceed using the i7 half of the sample index pair.
-        Boolean mkfastq_force_single_index = false
-        # Only demultiplex samples identified by an i7-only sample index, ignoring dual-indexed samples.  Dual-indexed samples will not be demultiplexed.
-        Boolean mkfastq_filter_single_index = false
-        # Override the read lengths as specified in RunInfo.xml
-        String? mkfastq_use_bases_mask
-        # Delete undetermined FASTQ files generated by bcl2fastq2
-        Boolean mkfastq_delete_undetermined = false
-
-
         # For cellranger count
 
         # Force pipeline to use this number of cells, bypassing the cell detection algorithm, mutually exclusive with expect_cells.
@@ -98,21 +74,18 @@ workflow cellranger_workflow {
         # Index TSV file
         File acronym_file = "gs://regev-lab/resources/cellranger/index.tsv"
 
-        # 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0, 6.1.2, 6.1.1, 6.0.2, 6.0.1, 6.0.0, 5.0.1, 5.0.0
+        # 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0
         String cellranger_version = "9.0.0"
-        # 0.11.4, 0.11.3, 0.11.2, 0.11.1, 0.11.0, 0.10.0, 0.9.0, 0.8.0, 0.7.0, 0.6.0, 0.5.0, 0.4.0, 0.3.0, 0.2.0
         String cumulus_feature_barcoding_version = "0.11.4"
-        # 2.1.0, 2.0.0, 1.2.0, 1.1.0
+        # 2.1.0, 2.0.0
         String cellranger_atac_version = "2.1.0"
-        # 2.0.2.strato, 2.0.2.custom-max-cell, 2.0.2, 2.0.1, 2.0.0, 1.0.1, 1.0.0
+        # 2.0.2.strato, 2.0.2.custom-max-cell, 2.0.2, 2.0.1, 2.0.0
         String cellranger_arc_version = "2.0.2.strato"
         # config version
         String config_version = "0.3"
 
         # Which docker registry to use: quay.io/cumulus (default) or cumulusprod
         String docker_registry = "quay.io/cumulus"
-        # cellranger/cellranger-atac/cellranger-arc mkfastq registry, default to gcr.io/broad-cumulus
-        String mkfastq_docker_registry = "gcr.io/broad-cumulus"
         # Google cloud zones, default to "us-central1-a us-central1-b us-central1-c us-central1-f us-east1-b us-east1-c us-east1-d us-west1-a us-west1-b us-west1-c"
         String zones = "us-central1-a us-central1-b us-central1-c us-central1-f us-east1-b us-east1-c us-east1-d us-west1-a us-west1-b us-west1-c"
         # Backend
@@ -137,8 +110,6 @@ workflow cellranger_workflow {
         # Memory string for cellranger-arc count
         String arc_memory = "160G"
 
-        # Optional disk space for mkfastq.
-        Int mkfastq_disk_space = 1500
         # Optional disk space needed for cell ranger count.
         Int count_disk_space = 500
         # Optional disk space needed for cell ranger multi.
@@ -162,16 +133,56 @@ workflow cellranger_workflow {
     String output_directory_stripped = sub(output_directory, "[/\\s]+$", "")
 
     String docker_registry_stripped = sub(docker_registry, "/+$", "")
-    String mkfastq_docker_registry_stripped = sub(mkfastq_docker_registry, "/+$", "")
 
     Map[String, String] acronym2gsurl = read_map(acronym_file)
     String null_file = acronym2gsurl["null_file"]
 
-    if (run_mkfastq) {
-        call generate_bcl_csv {
+
+    call generate_count_config {
+        input:
+            input_csv_file = input_csv_file,
+            output_dir = output_directory_stripped,
+            config_version = config_version,
+            docker_registry = docker_registry_stripped,
+            zones = zones,
+            preemptible = preemptible,
+            awsQueueArn = awsQueueArn,
+            backend = backend,
+            null_file = null_file
+    }
+
+    if (length(generate_count_config.sample_ids) > 0) {
+        scatter (sample_id in generate_count_config.sample_ids) {
+            call crc.cellranger_count as cellranger_count {
+                input:
+                    sample_id = sample_id,
+                    input_fastqs_directories = generate_count_config.sample2dir[sample_id],
+                    output_directory = output_directory_stripped,
+                    genome = generate_count_config.sample2genome[sample_id],
+                    target_panel = generate_count_config.sample2fbf[sample_id],
+                    chemistry = generate_count_config.sample2chemistry[sample_id],
+                    include_introns = include_introns,
+                    acronym_file = acronym_file,
+                    no_bam = no_bam,
+                    secondary = secondary,
+                    force_cells = force_cells,
+                    expect_cells = expect_cells,
+                    cellranger_version = cellranger_version,
+                    docker_registry = docker_registry_stripped,
+                    zones = zones,
+                    num_cpu = num_cpu,
+                    memory = memory,
+                    disk_space = count_disk_space,
+                    preemptible = preemptible,
+                    backend = backend,
+                    awsQueueArn = awsQueueArn
+            }
+        }
+
+        call collect_summaries {
             input:
-                input_csv_file = input_csv_file,
-                output_dir = output_directory_stripped,
+                summaries = cellranger_count.output_metrics_summary,
+                sample_ids = cellranger_count.output_count_directory,
                 config_version = config_version,
                 docker_registry = docker_registry_stripped,
                 zones = zones,
@@ -179,464 +190,239 @@ workflow cellranger_workflow {
                 awsQueueArn = awsQueueArn,
                 backend = backend
         }
+    }
 
-        if (length(generate_bcl_csv.bcl_csv_rna) > 0) {
-            scatter (bcl_csv in generate_bcl_csv.bcl_csv_rna) {
-                String rna_key = basename(bcl_csv)
-                call crm.cellranger_mkfastq as cellranger_mkfastq {
-                    input:
-                        input_bcl_directory = generate_bcl_csv.inpdirs[rna_key],
-                        input_csv_file = bcl_csv,
-                        output_directory = output_directory_stripped,
-                        delete_input_bcl_directory = delete_input_bcl_directory,
-                        barcode_mismatches = mkfastq_barcode_mismatches,
-                        force_single_index = mkfastq_force_single_index,
-                        filter_single_index = mkfastq_filter_single_index,
-                        use_bases_mask = mkfastq_use_bases_mask,
-                        delete_undetermined = mkfastq_delete_undetermined,
-                        cellranger_version = cellranger_version,
-                        docker_registry = mkfastq_docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = num_cpu,
-                        memory = memory,
-                        disk_space = mkfastq_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
-        }
-
-        if (length(generate_bcl_csv.bcl_csv_atac) > 0) {
-            scatter (bcl_csv in generate_bcl_csv.bcl_csv_atac) {
-                String atac_key = basename(bcl_csv)
-                call cram.cellranger_atac_mkfastq as cellranger_atac_mkfastq {
-                    input:
-                        input_bcl_directory = generate_bcl_csv.inpdirs[atac_key],
-                        input_csv_file = bcl_csv,
-                        output_directory = output_directory_stripped,
-                        delete_input_bcl_directory = delete_input_bcl_directory,
-                        barcode_mismatches = mkfastq_barcode_mismatches,
-                        force_single_index = mkfastq_force_single_index,
-                        filter_single_index = mkfastq_filter_single_index,
-                        use_bases_mask = mkfastq_use_bases_mask,
-                        delete_undetermined = mkfastq_delete_undetermined,
-                        cellranger_atac_version = cellranger_atac_version,
-                        docker_registry = mkfastq_docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = num_cpu,
-                        memory = memory,
-                        disk_space = mkfastq_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
-        }
-
-        if (length(generate_bcl_csv.bcl_csv_arc) > 0) {
-            scatter (bcl_csv in generate_bcl_csv.bcl_csv_arc) {
-                String arc_key = basename(bcl_csv)
-                call crarm.cellranger_arc_mkfastq as cellranger_arc_mkfastq {
-                    input:
-                        input_bcl_directory = generate_bcl_csv.inpdirs[arc_key],
-                        input_csv_file = bcl_csv,
-                        output_directory = output_directory_stripped,
-                        delete_input_bcl_directory = delete_input_bcl_directory,
-                        barcode_mismatches = mkfastq_barcode_mismatches,
-                        force_single_index = mkfastq_force_single_index,
-                        filter_single_index = mkfastq_filter_single_index,
-                        use_bases_mask = mkfastq_use_bases_mask,
-                        delete_undetermined = mkfastq_delete_undetermined,
-                        cellranger_arc_version = cellranger_arc_version,
-                        docker_registry = mkfastq_docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = num_cpu,
-                        memory = memory,
-                        disk_space = mkfastq_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
+    if (length(generate_count_config.sample_vdj_ids) > 0) {
+        scatter (sample_id in generate_count_config.sample_vdj_ids) {
+            call crv.cellranger_vdj as cellranger_vdj {
+                input:
+                    sample_id = sample_id,
+                    input_fastqs_directories = generate_count_config.sample2dir[sample_id],
+                    output_directory = output_directory_stripped,
+                    genome = generate_count_config.sample2genome[sample_id],
+                    acronym_file = acronym_file,
+                    denovo = vdj_denovo,
+                    chain = vdj_chain,
+                    cellranger_version = cellranger_version,
+                    docker_registry = docker_registry_stripped,
+                    zones = zones,
+                    num_cpu = num_cpu,
+                    memory = memory,
+                    disk_space = vdj_disk_space,
+                    preemptible = preemptible,
+                    backend = backend,
+                    awsQueueArn = awsQueueArn
             }
         }
-    }
 
-    if (run_count) {
-        call generate_count_config {
+        call collect_summaries as collect_summaries_vdj {
             input:
-                input_csv_file = input_csv_file,
-                output_dir = output_directory_stripped,
-                fastq_dirs = cellranger_mkfastq.output_fastqs_flowcell_directory,
-                fastq_dirs_atac = cellranger_atac_mkfastq.output_fastqs_flowcell_directory,
-                fastq_dirs_arc = cellranger_arc_mkfastq.output_fastqs_flowcell_directory,
+                summaries = cellranger_vdj.output_metrics_summary,
+                sample_ids = cellranger_vdj.output_vdj_directory,
                 config_version = config_version,
                 docker_registry = docker_registry_stripped,
                 zones = zones,
                 preemptible = preemptible,
                 awsQueueArn = awsQueueArn,
-                backend = backend,
-                null_file = null_file
+                backend = backend
         }
+    }
 
-        if (length(generate_count_config.sample_ids) > 0) {
-            scatter (sample_id in generate_count_config.sample_ids) {
-                call crc.cellranger_count as cellranger_count {
-                    input:
-                        sample_id = sample_id,
-                        input_fastqs_directories = generate_count_config.sample2dir[sample_id],
-                        output_directory = output_directory_stripped,
-                        genome = generate_count_config.sample2genome[sample_id],
-                        target_panel = generate_count_config.sample2fbf[sample_id],
-                        chemistry = generate_count_config.sample2chemistry[sample_id],
-                        include_introns = include_introns,
-                        acronym_file = acronym_file,
-                        no_bam = no_bam,
-                        secondary = secondary,
-                        force_cells = force_cells,
-                        expect_cells = expect_cells,
-                        cellranger_version = cellranger_version,
-                        docker_registry = docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = num_cpu,
-                        memory = memory,
-                        disk_space = count_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
-
-            call collect_summaries {
+    if (length(generate_count_config.sample_feature_ids) > 0) {
+        scatter (sample_id in generate_count_config.sample_feature_ids) {
+            call ca.cumulus_adt as cumulus_adt {
                 input:
-                    summaries = cellranger_count.output_metrics_summary,
-                    sample_ids = cellranger_count.output_count_directory,
-                    config_version = config_version,
+                    sample_id = sample_id,
+                    input_fastqs_directories = generate_count_config.sample2dir[sample_id],
+                    output_directory = output_directory_stripped,
+                    chemistry = generate_count_config.sample2chemistry[sample_id],
+                    data_type = generate_count_config.sample2datatype[sample_id],
+                    feature_barcode_file = generate_count_config.sample2fbf[sample_id],
+                    crispr_barcode_pos = crispr_barcode_pos,
+                    scaffold_sequence = scaffold_sequence,
+                    max_mismatch = max_mismatch,
+                    min_read_ratio = min_read_ratio,
+                    cumulus_feature_barcoding_version = cumulus_feature_barcoding_version,
                     docker_registry = docker_registry_stripped,
+                    acronym_file = acronym_file,
                     zones = zones,
+                    num_cpu = feature_num_cpu,
+                    memory = feature_memory,
+                    disk_space = feature_disk_space,
                     preemptible = preemptible,
-                    awsQueueArn = awsQueueArn,
-                    backend = backend
+                    backend = backend,
+                    awsQueueArn = awsQueueArn
             }
         }
+    }
 
-        if (length(generate_count_config.sample_vdj_ids) > 0) {
-            scatter (sample_id in generate_count_config.sample_vdj_ids) {
-                call crv.cellranger_vdj as cellranger_vdj {
-                    input:
-                        sample_id = sample_id,
-                        input_fastqs_directories = generate_count_config.sample2dir[sample_id],
-                        output_directory = output_directory_stripped,
-                        genome = generate_count_config.sample2genome[sample_id],
-                        acronym_file = acronym_file,
-                        denovo = vdj_denovo,
-                        chain = vdj_chain,
-                        cellranger_version = cellranger_version,
-                        docker_registry = docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = num_cpu,
-                        memory = memory,
-                        disk_space = vdj_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
-
-            call collect_summaries as collect_summaries_vdj {
+    if (length(generate_count_config.sample_atac_ids) > 0) {
+        scatter (sample_id in generate_count_config.sample_atac_ids) {
+            call crac.cellranger_atac_count as cellranger_atac_count {
                 input:
-                    summaries = cellranger_vdj.output_metrics_summary,
-                    sample_ids = cellranger_vdj.output_vdj_directory,
-                    config_version = config_version,
+                    sample_id = sample_id,
+                    input_fastqs_directories = generate_count_config.sample2dir[sample_id],
+                    output_directory = output_directory_stripped,
+                    genome = generate_count_config.sample2genome[sample_id],
+                    acronym_file = acronym_file,
+                    force_cells = force_cells,
+                    dim_reduce = atac_dim_reduce,
+                    peaks = peaks,
+                    chemistry = generate_count_config.sample2chemistry[sample_id],
+                    cellranger_atac_version = cellranger_atac_version,
                     docker_registry = docker_registry_stripped,
                     zones = zones,
+                    num_cpu = atac_num_cpu,
+                    memory = atac_memory,
+                    disk_space = atac_disk_space,
                     preemptible = preemptible,
-                    awsQueueArn = awsQueueArn,
-                    backend = backend
+                    backend = backend,
+                    awsQueueArn = awsQueueArn
             }
         }
 
-        if (length(generate_count_config.sample_feature_ids) > 0) {
-            scatter (sample_id in generate_count_config.sample_feature_ids) {
-                call ca.cumulus_adt as cumulus_adt {
-                    input:
-                        sample_id = sample_id,
-                        input_fastqs_directories = generate_count_config.sample2dir[sample_id],
-                        output_directory = output_directory_stripped,
-                        chemistry = generate_count_config.sample2chemistry[sample_id],
-                        data_type = generate_count_config.sample2datatype[sample_id],
-                        feature_barcode_file = generate_count_config.sample2fbf[sample_id],
-                        crispr_barcode_pos = crispr_barcode_pos,
-                        scaffold_sequence = scaffold_sequence,
-                        max_mismatch = max_mismatch,
-                        min_read_ratio = min_read_ratio,
-                        cumulus_feature_barcoding_version = cumulus_feature_barcoding_version,
-                        docker_registry = docker_registry_stripped,
-                        acronym_file = acronym_file,
-                        zones = zones,
-                        num_cpu = feature_num_cpu,
-                        memory = feature_memory,
-                        disk_space = feature_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
+        call collect_summaries as collect_summaries_atac {
+            input:
+                summaries = cellranger_atac_count.output_metrics_summary,
+                sample_ids = cellranger_atac_count.output_count_directory,
+                config_version = config_version,
+                docker_registry = docker_registry_stripped,
+                zones = zones,
+                preemptible = preemptible,
+                awsQueueArn = awsQueueArn,
+                backend = backend
         }
+    }
 
-        if (length(generate_count_config.sample_atac_ids) > 0) {
-            scatter (sample_id in generate_count_config.sample_atac_ids) {
-                call crac.cellranger_atac_count as cellranger_atac_count {
-                    input:
-                        sample_id = sample_id,
-                        input_fastqs_directories = generate_count_config.sample2dir[sample_id],
-                        output_directory = output_directory_stripped,
-                        genome = generate_count_config.sample2genome[sample_id],
-                        acronym_file = acronym_file,
-                        force_cells = force_cells,
-                        dim_reduce = atac_dim_reduce,
-                        peaks = peaks,
-                        chemistry = generate_count_config.sample2chemistry[sample_id],
-                        cellranger_atac_version = cellranger_atac_version,
-                        docker_registry = docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = atac_num_cpu,
-                        memory = atac_memory,
-                        disk_space = atac_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
-
-            call collect_summaries as collect_summaries_atac {
+    if (length(generate_count_config.link_arc_ids) > 0) {
+        scatter (link_id in generate_count_config.link_arc_ids) {
+            call crarc.cellranger_arc_count as cellranger_arc_count {
                 input:
-                    summaries = cellranger_atac_count.output_metrics_summary,
-                    sample_ids = cellranger_atac_count.output_count_directory,
-                    config_version = config_version,
+                    link_id = link_id,
+                    input_samples = generate_count_config.link2sample[link_id],
+                    input_fastqs_directories = generate_count_config.sample2dir[link_id],
+                    input_data_types = generate_count_config.sample2datatype[link_id],
+                    output_directory = output_directory_stripped,
+                    acronym_file = acronym_file,
+                    genome = generate_count_config.sample2genome[link_id],
+                    gex_exclude_introns = arc_gex_exclude_introns,
+                    no_bam = no_bam,
+                    min_atac_count = arc_min_atac_count,
+                    min_gex_count = arc_min_gex_count,
+                    peaks = peaks,
+                    cellranger_arc_version = cellranger_arc_version,
                     docker_registry = docker_registry_stripped,
                     zones = zones,
+                    num_cpu = arc_num_cpu,
+                    memory = arc_memory,
+                    disk_space = arc_disk_space,
                     preemptible = preemptible,
-                    awsQueueArn = awsQueueArn,
-                    backend = backend
+                    backend = backend,
+                    awsQueueArn = awsQueueArn
             }
         }
 
-        if (length(generate_count_config.link_arc_ids) > 0) {
-            scatter (link_id in generate_count_config.link_arc_ids) {
-                call crarc.cellranger_arc_count as cellranger_arc_count {
-                    input:
-                        link_id = link_id,
-                        input_samples = generate_count_config.link2sample[link_id],
-                        input_fastqs_directories = generate_count_config.sample2dir[link_id],
-                        input_data_types = generate_count_config.sample2datatype[link_id],
-                        output_directory = output_directory_stripped,
-                        acronym_file = acronym_file,
-                        genome = generate_count_config.sample2genome[link_id],
-                        gex_exclude_introns = arc_gex_exclude_introns,
-                        no_bam = no_bam,
-                        min_atac_count = arc_min_atac_count,
-                        min_gex_count = arc_min_gex_count,
-                        peaks = peaks,
-                        cellranger_arc_version = cellranger_arc_version,
-                        docker_registry = docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = arc_num_cpu,
-                        memory = arc_memory,
-                        disk_space = arc_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
+        call collect_summaries as collect_summaries_arc {
+            input:
+                summaries = cellranger_arc_count.output_metrics_summary,
+                sample_ids = cellranger_arc_count.output_count_directory,
+                config_version = config_version,
+                docker_registry = docker_registry_stripped,
+                zones = zones,
+                preemptible = preemptible,
+                awsQueueArn = awsQueueArn,
+                backend = backend
+        }
+    }
 
-            call collect_summaries as collect_summaries_arc {
+    if (length(generate_count_config.link_multi_ids) > 0) {
+        scatter (link_id in generate_count_config.link_multi_ids) {
+            call crmulti.cellranger_multi as cellranger_multi {
                 input:
-                    summaries = cellranger_arc_count.output_metrics_summary,
-                    sample_ids = cellranger_arc_count.output_count_directory,
-                    config_version = config_version,
+                    link_id = link_id,
+                    input_samples = generate_count_config.link2sample[link_id],
+                    input_fastqs_directories = generate_count_config.sample2dir[link_id],
+                    input_data_types = generate_count_config.sample2datatype[link_id],
+                    input_fbf = generate_count_config.sample2fbf[link_id],
+                    output_directory = output_directory_stripped,
+                    acronym_file = acronym_file,
+                    genome = generate_count_config.sample2genome[link_id],
+                    probe_set = generate_count_config.sample2probeset[link_id],
+                    cmo_set = cmo_set,
+                    include_introns = include_introns,
+                    no_bam = no_bam,
+                    secondary = secondary,
+                    force_cells = force_cells,
+                    expect_cells = expect_cells,
+                    cellranger_version = cellranger_version,
                     docker_registry = docker_registry_stripped,
                     zones = zones,
+                    num_cpu = num_cpu,
+                    memory = memory,
+                    disk_space = multi_disk_space,
                     preemptible = preemptible,
-                    awsQueueArn = awsQueueArn,
-                    backend = backend
+                    backend = backend,
+                    awsQueueArn = awsQueueArn
             }
         }
+    }
 
-        if (length(generate_count_config.link_multi_ids) > 0) {
-            scatter (link_id in generate_count_config.link_multi_ids) {
-                call crmulti.cellranger_multi as cellranger_multi {
-                    input:
-                        link_id = link_id,
-                        input_samples = generate_count_config.link2sample[link_id],
-                        input_fastqs_directories = generate_count_config.sample2dir[link_id],
-                        input_data_types = generate_count_config.sample2datatype[link_id],
-                        input_fbf = generate_count_config.sample2fbf[link_id],
-                        output_directory = output_directory_stripped,
-                        acronym_file = acronym_file,
-                        genome = generate_count_config.sample2genome[link_id],
-                        probe_set = generate_count_config.sample2probeset[link_id],
-                        cmo_set = cmo_set,
-                        include_introns = include_introns,
-                        no_bam = no_bam,
-                        secondary = secondary,
-                        force_cells = force_cells,
-                        expect_cells = expect_cells,
-                        cellranger_version = cellranger_version,
-                        docker_registry = docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = num_cpu,
-                        memory = memory,
-                        disk_space = multi_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
-        }
-
-        if (length(generate_count_config.link_fbc_ids) > 0) {
-            scatter (link_id in generate_count_config.link_fbc_ids) {
-                call crc.cellranger_count as cellranger_count_fbc {
-                    input:
-                        sample_id = link_id,
-                        input_samples = generate_count_config.link2sample[link_id],
-                        input_fastqs_directories = generate_count_config.sample2dir[link_id],
-                        input_data_types = generate_count_config.sample2datatype[link_id],
-                        input_fbf = generate_count_config.sample2fbf[link_id],
-                        output_directory = output_directory_stripped,
-                        acronym_file = acronym_file,
-                        genome = generate_count_config.sample2genome[link_id],
-                        include_introns = include_introns,
-                        no_bam = no_bam,
-                        secondary = secondary,
-                        force_cells = force_cells,
-                        expect_cells = expect_cells,
-                        cellranger_version = cellranger_version,
-                        docker_registry = docker_registry_stripped,
-                        zones = zones,
-                        num_cpu = num_cpu,
-                        memory = memory,
-                        disk_space = count_disk_space,
-                        preemptible = preemptible,
-                        backend = backend,
-                        awsQueueArn = awsQueueArn
-                }
-            }
-
-            call collect_summaries as collect_summaries_fbc {
+    if (length(generate_count_config.link_fbc_ids) > 0) {
+        scatter (link_id in generate_count_config.link_fbc_ids) {
+            call crc.cellranger_count as cellranger_count_fbc {
                 input:
-                    summaries = cellranger_count_fbc.output_metrics_summary,
-                    sample_ids = cellranger_count_fbc.output_count_directory,
-                    config_version = config_version,
+                    sample_id = link_id,
+                    input_samples = generate_count_config.link2sample[link_id],
+                    input_fastqs_directories = generate_count_config.sample2dir[link_id],
+                    input_data_types = generate_count_config.sample2datatype[link_id],
+                    input_fbf = generate_count_config.sample2fbf[link_id],
+                    output_directory = output_directory_stripped,
+                    acronym_file = acronym_file,
+                    genome = generate_count_config.sample2genome[link_id],
+                    include_introns = include_introns,
+                    no_bam = no_bam,
+                    secondary = secondary,
+                    force_cells = force_cells,
+                    expect_cells = expect_cells,
+                    cellranger_version = cellranger_version,
                     docker_registry = docker_registry_stripped,
                     zones = zones,
+                    num_cpu = num_cpu,
+                    memory = memory,
+                    disk_space = count_disk_space,
                     preemptible = preemptible,
-                    awsQueueArn = awsQueueArn,
-                    backend = backend
+                    backend = backend,
+                    awsQueueArn = awsQueueArn
             }
         }
+
+        call collect_summaries as collect_summaries_fbc {
+            input:
+                summaries = cellranger_count_fbc.output_metrics_summary,
+                sample_ids = cellranger_count_fbc.output_count_directory,
+                config_version = config_version,
+                docker_registry = docker_registry_stripped,
+                zones = zones,
+                preemptible = preemptible,
+                awsQueueArn = awsQueueArn,
+                backend = backend
+        }
     }
 
     output {
-        Array[Array[String]?] fastq_outputs = [cellranger_mkfastq.output_fastqs_flowcell_directory, cellranger_atac_mkfastq.output_fastqs_flowcell_directory, cellranger_arc_mkfastq.output_fastqs_flowcell_directory]
-        Map[String, Array[String]?] count_outputs = {"gex": cellranger_count.output_count_directory,
-                                                     "vdj": cellranger_vdj.output_vdj_directory,
-                                                     "adt": cumulus_adt.output_count_directory,
-                                                     "atac": cellranger_atac_count.output_count_directory,
-                                                     "arc": cellranger_arc_count.output_count_directory,
-                                                     "multi": cellranger_multi.output_multi_directory,
-                                                     "fbc": cellranger_count_fbc.output_count_directory
-                                                    }
+        Map[String, Array[String]?] count_outputs = {
+            "gex": cellranger_count.output_count_directory,
+            "vdj": cellranger_vdj.output_vdj_directory,
+            "adt": cumulus_adt.output_count_directory,
+            "atac": cellranger_atac_count.output_count_directory,
+            "arc": cellranger_arc_count.output_count_directory,
+            "multi": cellranger_multi.output_multi_directory,
+            "fbc": cellranger_count_fbc.output_count_directory
+        }
         File? count_matrix = generate_count_config.count_matrix
     }
 }
 
-task generate_bcl_csv {
-    input {
-        File input_csv_file
-        String output_dir
-        String config_version
-        String docker_registry
-        String zones
-        Int preemptible
-        String awsQueueArn
-        String backend
-    }
-
-    command {
-        set -e
-        export TMPDIR=/tmp
-
-        python /software/check_uri.py "~{backend}" "~{output_dir}"
-
-        python <<CODE
-        import os
-        import re
-        import sys
-        import pandas as pd
-        from collections import defaultdict
-
-        df = pd.read_csv('~{input_csv_file}', header = 0, dtype = str, index_col = False)
-        df.columns = df.columns.str.strip()
-
-        if 'DataType' not in df.columns:
-            df['DataType'] = 'rna'
-        else:
-            df.loc[df['DataType'].isna(), 'DataType'] = 'rna'
-
-        for c in df.columns:
-            df[c] = df[c].str.strip()
-
-        multiomics = defaultdict(set)
-        for idx, row in df.iterrows():
-            row['Flowcell'] = re.sub('/+$', '', row['Flowcell'])
-            if row['DataType'] not in ['rna', 'vdj', 'adt', 'citeseq', 'hashing', 'cmo', 'crispr', 'atac', 'frp']:
-                print("Unknown DataType " + row['DataType'] + " is detected!", file = sys.stderr)
-                sys.exit(1)
-            if ('Link' in row) and pd.notnull(row['Link']) and (row['Link'] != ''):
-                multiomics[row['Link']].add(row['DataType'])
-            if row['DataType'] in ['vdj', 'adt', 'citeseq', 'hashing', 'cmo', 'crispr', 'frp']:
-                row['DataType'] = 'rna'
-            if re.search('[^a-zA-Z0-9_-]', row['Sample']) is not None:
-                print('Sample must contain only alphanumeric characters, hyphens, and underscores.', file = sys.stderr)
-                print('Examples of common characters that are not allowed are the space character and the following: ?()[]/\=+<>:;"\',*^| &', file = sys.stderr)
-                sys.exit(1)
-        for idx, row in df.iterrows():
-            if ('Link' in row) and pd.notnull(row['Link']) and (row['Link'] != ''):
-                omics = multiomics[row['Link']]
-                if 'atac' in omics:
-                    if omics != set(['atac', 'rna']):
-                        print('CellRanger ARC only works with ATAC+RNA data! Link \'' + row['Link'] + '\' contains ' + ', '.join(list(omics)) + '.', file = sys.stderr)
-                        sys.exit(1)
-                    row['DataType'] = 'arc'
-
-        with open('inpdirs.txt', 'w') as fo:
-            for input_dir in df['Flowcell'].unique():
-                run_id = os.path.basename(input_dir)
-                flowcell_df = df.loc[df['Flowcell'] == input_dir]
-                for datatype in flowcell_df['DataType'].unique():
-                    bcl_df = flowcell_df.loc[flowcell_df['DataType'] == datatype, ['Lane', 'Sample', 'Index']]
-                    bcl_file = run_id + '_' + datatype + '_bcl.csv'
-                    bcl_df.to_csv(bcl_file, index = False)
-                    fo.write(bcl_file + '\t' + input_dir + '\n')
-        CODE
-    }
-
-    output {
-        Map[String, String] inpdirs = read_map('inpdirs.txt')
-        Array[File] bcl_csv_rna = glob('*_rna_bcl.csv')
-        Array[File] bcl_csv_atac = glob('*_atac_bcl.csv')
-        Array[File] bcl_csv_arc = glob('*_arc_bcl.csv')
-    }
-
-    runtime {
-        docker: "~{docker_registry}/config:~{config_version}"
-        zones: zones
-        preemptible: preemptible
-        queueArn: awsQueueArn
-    }
-}
 
 task generate_count_config {
     input {
@@ -693,20 +479,6 @@ task generate_count_config {
                 print('Examples of common characters that are not allowed are the space character and the following: ?()[]/\=+<>:;"\',*^| &', file = sys.stderr)
                 sys.exit(1)
 
-        def parse_fastq_dirs(dirs_str):
-            r2f = dict()
-            if dirs_str == '':
-                return r2f
-            dirs = dirs_str.split(',')
-            for dir in dirs:
-                run_id = dir.split('/')[-3].rpartition('_')[0]
-                r2f[run_id] = dir
-            return r2f
-
-        r2f = parse_fastq_dirs('~{sep="," fastq_dirs}')
-        r2f.update(parse_fastq_dirs('~{sep="," fastq_dirs_atac}'))
-        r2f.update(parse_fastq_dirs('~{sep="," fastq_dirs_arc}'))
-
         with open('sample_ids.txt', 'w') as fo1, open('sample_vdj_ids.txt', 'w') as fo2, open('sample_feature_ids.txt', 'w') as fo3, open('sample_atac_ids.txt', 'w') as fo4, \
              open('sample2dir.txt', 'w') as foo1, open('sample2datatype.txt', 'w') as foo2, open('sample2genome.txt', 'w') as foo3, \
              open('sample2chemistry.txt', 'w') as foo4, open('sample2fbf.txt', 'w') as foo5, open('count_matrix.csv', 'w') as foo6, \
@@ -733,10 +505,7 @@ task generate_count_config {
 
                 datatype = df_local['DataType'].iat[0]
 
-                if len(r2f) > 0:
-                    dirs = df_local['Flowcell'].map(lambda x: r2f[os.path.basename(x)]).values # if also run mkfastq
-                else:
-                    dirs = df_local['Flowcell'].values # if start from count step
+                dirs = df_local['Flowcell'].values
 
                 reference = 'null'
                 if datatype in ['rna', 'vdj', 'atac', 'frp']:

From 4d25800c9612f13e20653d0b81e1c21fa1809309 Mon Sep 17 00:00:00 2001
From: Yiming Yang <yang.yihming@gmail.com>
Date: Mon, 17 Feb 2025 02:02:37 -0800
Subject: [PATCH 7/7] check sample name prefix consistency instead of enforcing
 renaming

---
 workflows/cellranger/cellranger_count.wdl | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl
index 3b9aa7f3..2e553a1b 100644
--- a/workflows/cellranger/cellranger_count.wdl
+++ b/workflows/cellranger/cellranger_count.wdl
@@ -141,19 +141,16 @@ task run_cellranger_count {
         from subprocess import check_call, CalledProcessError, DEVNULL, STDOUT
         from packaging import version
 
-        def rename_fastq_file(path, sample_name):
+        def check_fastq_file(path, sample_name):
             folder = os.path.dirname(path)
             filename = os.path.basename(path)
             pattern = r"(_S\d+_L\d+_[RI]\d+_001\.fastq\.gz)"
             match = re.search(pattern, filename)
             if match:
                 idx = match.start()
-                cur_name = filename[:idx]
-                suffix = filename[idx:]
+                cur_name = filename[:match.start()]
                 if cur_name != sample_name:
-                    call_args = ["mv", path, folder+"/"+sample_name+suffix]
-                    print(' '.join(call_args))
-                    check_call(call_args)
+                    raise Exception("FASTQ sample name prefix mismatch! Expect " + sample_name + ". Get " + cur_name + ".")
             else:
                 raise Exception(path + " does not follow Illumina naming convention!")
 
@@ -174,12 +171,12 @@ task run_cellranger_count {
                     check_call(call_args, stdout=DEVNULL, stderr=STDOUT)
                 except CalledProcessError:
                     # Localize tar file
-                    call_args = ['strato', 'cp', '-m', directory + '/' + "*.tar", target]
+                    tar_file = target + "/" + sample_name + ".tar"  # local copy lands inside target; tar resolves -f against the cwd, so keep the full path
+                    call_args = ['strato', 'cp', '-m', directory + '/' + sample_name + ".tar", target]
                     print(' '.join(call_args))
                     check_call(call_args)
 
                     # Untar
-                    tar_file = glob.glob(target+"/*.tar")[0]
                     call_args = ["tar", "--strip-components=1", "-xf", tar_file, "-C", target]
                     print(' '.join(call_args))
                     check_call(call_args)
@@ -192,7 +189,7 @@ task run_cellranger_count {
                     # Rename FASTQ files if needed
                     fastq_files = glob.glob(target+"/*.fastq.gz")
                     for fastq_f in fastq_files:
-                        rename_fastq_file(fastq_f, sample_name)
+                        check_fastq_file(fastq_f, sample_name)
 
         samples = data_types = fbfs = None
         fastqs_dirs = []