From c79530eff1f9b8d2bc42c3097ba058a3797bbffc Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Sat, 15 Feb 2025 17:02:11 -0800 Subject: [PATCH 1/7] Support input as tar balls --- workflows/cellranger/cellranger_count.wdl | 44 +++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl index a92dc3ec..7269d4b2 100644 --- a/workflows/cellranger/cellranger_count.wdl +++ b/workflows/cellranger/cellranger_count.wdl @@ -140,6 +140,22 @@ task run_cellranger_count { from subprocess import check_call, CalledProcessError, DEVNULL, STDOUT from packaging import version + def rename_fastq_file(path, sample_name): + folder = os.path.dirname(path) + filename = os.path.basename(path) + pattern = r"(_S\d+_L\d+_R\d+_001\.fastq\.gz)" + match = re.search(pattern, filename) + if match: + idx = match.start() + cur_name = filename[:idx] + suffix = filename[idx:] + if cur_name != sample_name: + call_args = ["mv", path, folder+"/"+sample_name+suffix] + print(' '.join(call_args)) + check_call(call_args) + else: + raise Exception(path + " does not follow Illumina naming convention!") + samples = data_types = fbfs = None fastqs_dirs = [] @@ -188,9 +204,31 @@ task run_cellranger_count { except CalledProcessError: if not os.path.exists(target): os.mkdir(target) - call_args = ['strato', 'cp', '-m', directory + '/' + samples[i] + '_S*_L*_*_001.fastq.gz' , target] - print(' '.join(call_args)) - check_call(call_args) + try: + call_args = ['strato', 'cp', '-m', directory + '/' + samples[i] + '_S*_L*_*_001.fastq.gz' , target] + print(' '.join(call_args)) + check_call(call_args, stdout=DEVNULL, stderr=STDOUT) + except CalledProcessError: + # Localize tar file + call_args = ['strato', 'cp', '-m', directory + '/*.tar', target] + print(' '.join(call_args)) + check_call(call_args) + + # Untar + tar_file = glob.glob(target+"/*.tar")[0] + call_args = ["tar", "--strip-components=1", "-xf", tar_file, "-C", target] + print(' '.join(call_args)) + check_call(call_args) + + # Remove tar file + call_args = ["rm", tar_file] + print(' '.join(call_args)) + check_call(call_args) + + # Rename FASTQ files if needed + fastq_files = glob.glob(target+"/*.fastq.gz") + for fastq_f in fastq_files: + rename_fastq_file(fastq_f, samples[i]) feature_type = '' if data_types[i] == 'rna': feature_type = 'Gene Expression' From 001a0f91610852d105fd7bdd78441ea222a7e605 Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Sat, 15 Feb 2025 17:55:06 -0800 Subject: [PATCH 2/7] wrap FASTQ localization as a function --- workflows/cellranger/cellranger_count.wdl | 88 +++++++++++------------ 1 file changed, 40 insertions(+), 48 deletions(-) diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl index 7269d4b2..e3f3cbea 100644 --- a/workflows/cellranger/cellranger_count.wdl +++ b/workflows/cellranger/cellranger_count.wdl @@ -156,6 +156,43 @@ task run_cellranger_count { else: raise Exception(path + " does not follow Illumina naming convention!") + def localize_fastqs(directory, target, sample_name): + try: + call_args = ['strato', 'exists', directory + '/' + sample_name + '/'] + print(' '.join(call_args)) + check_call(call_args, stdout=DEVNULL, stderr=STDOUT) + call_args = ['strato', 'sync', '-m', directory + '/' + sample_name, target] + print(' '.join(call_args)) + check_call(call_args) + except CalledProcessError: + if not os.path.exists(target): + os.mkdir(target) + try: + call_args = ['strato', 'cp', '-m', directory + '/' + sample_name + '_S*_L*_*_001.fastq.gz' , target] + print(' '.join(call_args)) + check_call(call_args, stdout=DEVNULL, stderr=STDOUT) + except CalledProcessError: + # Localize tar file + call_args = ['strato', 'cp', '-m', directory + '/' + "*.tar", target] + print(' '.join(call_args)) + check_call(call_args) + + # Untar + tar_file = glob.glob(target+"/*.tar")[0] + call_args = ["tar", "--strip-components=1", "-xf", tar_file, "-C", target] + print(' '.join(call_args)) + check_call(call_args) + + # Remove tar file + call_args = ["rm", tar_file] + print(' '.join(call_args)) + check_call(call_args) + + # Rename FASTQ files if needed + fastq_files = glob.glob(target+"/*.fastq.gz") + for fastq_f in fastq_files: + rename_fastq_file(fastq_f, sample_name) + samples = data_types = fbfs = None fastqs_dirs = [] @@ -194,41 +231,8 @@ task run_cellranger_count { for i, directory in enumerate('~{input_fastqs_directories}'.split(',')): directory = re.sub('/+$', '', directory) # remove trailing slashes target = samples[i] + "_" + str(i) - try: - call_args = ['strato', 'exists', directory + '/' + samples[i] + '/'] - print(' '.join(call_args)) - check_call(call_args, stdout=DEVNULL, stderr=STDOUT) - call_args = ['strato', 'sync', '-m', directory + '/' + samples[i], target] - print(' '.join(call_args)) - check_call(call_args) - except CalledProcessError: - if not os.path.exists(target): - os.mkdir(target) - try: - call_args = ['strato', 'cp', '-m', directory + '/' + samples[i] + '_S*_L*_*_001.fastq.gz' , target] - print(' '.join(call_args)) - check_call(call_args, stdout=DEVNULL, stderr=STDOUT) - except CalledProcessError: - # Localize tar file - call_args = ['strato', 'cp', '-m', directory + '/*.tar', target] - print(' '.join(call_args)) - check_call(call_args) - - # Untar - tar_file = glob.glob(target+"/*.tar")[0] - call_args = ["tar", "--strip-components=1", "-xf", tar_file, "-C", target] - print(' '.join(call_args)) - check_call(call_args) - - # Remove tar file - call_args = ["rm", tar_file] - print(' '.join(call_args)) - check_call(call_args) - - # Rename FASTQ files if needed - fastq_files = glob.glob(target+"/*.fastq.gz") - for fastq_f in fastq_files: - rename_fastq_file(fastq_f, samples[i]) + localize_fastqs(directory, target, samples[i]) + feature_type = '' if data_types[i] == 'rna': feature_type = 'Gene Expression' @@ -246,19 +250,7 @@ task run_cellranger_count { for i, directory in enumerate('~{input_fastqs_directories}'.split(',')): directory = re.sub('/+$', '', directory) # remove trailing slashes target = '~{sample_id}_' + str(i) - try: - call_args = ['strato', 'exists', directory + '/~{sample_id}/'] - print(' '.join(call_args)) - check_call(call_args, stdout=DEVNULL, stderr=STDOUT) - call_args = ['strato', 'sync', '-m', directory + '/~{sample_id}', target] - print(' '.join(call_args)) - check_call(call_args) - except CalledProcessError: - if not os.path.exists(target): - os.mkdir(target) - call_args = ['strato', 'cp', '-m', directory + '/~{sample_id}' + '_S*_L*_*_001.fastq.gz' , target] - print(' '.join(call_args)) - check_call(call_args) + localize_fastqs(directory, target, '~{input_samples}') fastqs_dirs.append(target) mem_size = re.findall(r"\d+", "~{memory}")[0] From 7ad3bbd3cfe142ea878f5a673053695c321d94fc Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Sat, 15 Feb 2025 18:07:34 -0800 Subject: [PATCH 3/7] fix bug --- workflows/cellranger/cellranger_count.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl index e3f3cbea..856dea7d 100644 --- a/workflows/cellranger/cellranger_count.wdl +++ b/workflows/cellranger/cellranger_count.wdl @@ -250,7 +250,7 @@ task run_cellranger_count { for i, directory in enumerate('~{input_fastqs_directories}'.split(',')): directory = re.sub('/+$', '', directory) # remove trailing slashes target = '~{sample_id}_' + str(i) - localize_fastqs(directory, target, '~{input_samples}') + localize_fastqs(directory, target, '~{sample_id}') fastqs_dirs.append(target) mem_size = re.findall(r"\d+", "~{memory}")[0] From cfa1118df54c3ba4584ffd91d9ce356596601efe Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Sat, 15 Feb 2025 18:33:12 -0800 Subject: [PATCH 4/7] add missing import --- workflows/cellranger/cellranger_count.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl index 856dea7d..bf8678f1 100644 --- a/workflows/cellranger/cellranger_count.wdl +++ b/workflows/cellranger/cellranger_count.wdl @@ -137,6 +137,7 @@ task run_cellranger_count { import re import os import sys + import glob from subprocess import check_call, CalledProcessError, DEVNULL, STDOUT from packaging import version From a573b70f7fcbeb9c2a1d07c27852115beb799af1 Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Sat, 15 Feb 2025 19:21:26 -0800 Subject: [PATCH 5/7] accept I1/I2 as well --- workflows/cellranger/cellranger_count.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl index bf8678f1..3b9aa7f3 100644 --- a/workflows/cellranger/cellranger_count.wdl +++ b/workflows/cellranger/cellranger_count.wdl @@ -144,7 +144,7 @@ task run_cellranger_count { def rename_fastq_file(path, sample_name): folder = os.path.dirname(path) filename = os.path.basename(path) - pattern = r"(_S\d+_L\d+_R\d+_001\.fastq\.gz)" + pattern = r"(_S\d+_L\d+_[RI]\d+_001\.fastq\.gz)" match = re.search(pattern, filename) if match: idx = match.start() From d2cdfc10984cc9c7d4ed6c0ba70072208e51340b Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Mon, 17 Feb 2025 02:02:10 -0800 Subject: [PATCH 6/7] remove mkfastq part from Cellranger workflow --- docs/cellranger/feature_barcoding.rst | 10 +- docs/cellranger/general_steps.rst | 27 +- docs/cellranger/index.rst | 12 +- docs/cellranger/sc_atac.rst | 24 - docs/cellranger/sc_sn_rnaseq.rst | 96 +-- docs/cellranger/sc_vdj.rst | 16 - docs/spaceranger.rst | 2 +- workflows/cellranger/cellranger_workflow.wdl | 687 ++++++------------- 8 files changed, 250 insertions(+), 624 deletions(-) diff --git a/docs/cellranger/feature_barcoding.rst b/docs/cellranger/feature_barcoding.rst index 1ff27f24..6e30ccd3 100644 --- a/docs/cellranger/feature_barcoding.rst +++ b/docs/cellranger/feature_barcoding.rst @@ -193,11 +193,11 @@ For feature barcoding data, ``cellranger_workflow`` takes Illumina outputs as in - 0.1 - 0.1 * - cellranger_version - - cellranger version, could be: 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0, 6.1.2, 6.1.1, 6.0.2, 6.0.1, 6.0.0, 5.0.1, 5.0.0 + - cellranger version, could be: 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0 - "9.0.0" - "9.0.0" * - cumulus_feature_barcoding_version - - Cumulus_feature_barcoding version for extracting feature barcode matrix. Version available: 0.11.4, 0.11.3, 0.11.2, 0.11.1, 0.11.0, 0.10.0, 0.9.0, 0.8.0, 0.7.0, 0.6.0, 0.5.0, 0.4.0, 0.3.0, 0.2.0. + - Cumulus_feature_barcoding version for extracting feature barcode matrix. - "0.11.4" - "0.11.4" * - docker_registry @@ -208,12 +208,6 @@ For feature barcoding data, ``cellranger_workflow`` takes Illumina outputs as in - "cumulusprod" for backup images on Docker Hub. - "quay.io/cumulus" - "quay.io/cumulus" - * - mkfastq_docker_registry - - Docker registry to use for ``cellranger mkfastq``. - Default is the registry to which only Broad users have access. - See :ref:`bcl2fastq-docker` for making your own registry. - - "gcr.io/broad-cumulus" - - "gcr.io/broad-cumulus" * - acronym_file - | The link/path of an index file in TSV format for fetching preset genome references, chemistry whitelists, etc. by their names. | Set an GS URI if *backend* is ``gcp``; an S3 URI for ``aws`` backend; an absolute file path for ``local`` backend. diff --git a/docs/cellranger/general_steps.rst b/docs/cellranger/general_steps.rst index 933aa5c3..b6d456b3 100644 --- a/docs/cellranger/general_steps.rst +++ b/docs/cellranger/general_steps.rst @@ -66,12 +66,6 @@ Alternatively, users can submit jobs through command line interface (CLI) using | If starts with FASTQ files, this should be Google bucket URLs of uploaded FASTQ folders. | The FASTQ folders should contain one subfolder for each sample in the flowcell with the sample name as the subfolder name. | Each subfolder contains FASTQ files for that sample. - * - **Lane** - - - | Tells which lanes the sample was pooled into. - | Can be either single lane (e.g. 8) or a range (e.g. 7-8) or all (e.g. \*). - * - **Index** - - Sample index (e.g. SI-GA-A12). * - Chemistry - Describes the 10x chemistry used for the sample. This column is optional. * - DataType @@ -108,15 +102,15 @@ Alternatively, users can submit jobs through command line interface (CLI) using Example:: - Sample,Reference,Flowcell,Lane,Index,Chemistry,DataType - sample_1,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,1-2,SI-GA-A8,threeprime,rna - sample_2,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,3-4,SI-GA-B8,SC3Pv3,rna - sample_3,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,5-6,SI-GA-C8,fiveprime,rna - sample_4,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,7-8,SI-GA-D8,fiveprime,rna - sample_1,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,1-2,SI-GA-A8,threeprime,rna - sample_2,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,3-4,SI-GA-B8,SC3Pv3,rna - sample_3,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,5-6,SI-GA-C8,fiveprime,rna - sample_4,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,7-8,SI-GA-D8,fiveprime,rna + Sample,Reference,Flowcell,Chemistry,DataType + sample_1,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,threeprime,rna + sample_2,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,SC3Pv3,rna + sample_3,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,fiveprime,rna + sample_4,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK18WBC6Z4,fiveprime,rna + sample_1,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,threeprime,rna + sample_2,GRCh38-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,SC3Pv3,rna + sample_3,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,fiveprime,rna + sample_4,mm10-2020-A,gs://fc-e0000000-0000-0000-0000-000000000000/VK10WBC9Z2,fiveprime,rna **3.2 Upload your sample sheet to the workspace bucket:** @@ -183,9 +177,6 @@ Alternatively, users can submit jobs through command line interface (CLI) using * - Name - Type - Description - * - fastq_outputs - - Array[Array[String]?] - - The top-level array contains results (as arrays) for different data modalities. The inner-level array contains cloud locations of FASTQ files, one url per flowcell. * - count_outputs - Array[Array[String]?] - The top-level array contains results (as arrays) for different data modalities. The inner-level array contains cloud locations of count matrices, one url per sample. diff --git a/docs/cellranger/index.rst b/docs/cellranger/index.rst index fddb0166..7e94049e 100644 --- a/docs/cellranger/index.rst +++ b/docs/cellranger/index.rst @@ -24,17 +24,17 @@ Feature barcoding assays (cell & nucleus hashing, CITE-seq and Perturb-seq) --------------------------------- -Single-cell ATAC-seq -^^^^^^^^^^^^^^^^^^^^ +Single-cell immune profiling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. include:: sc_atac.rst +.. include:: sc_vdj.rst --------------------------------- -Single-cell immune profiling -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Single-cell ATAC-seq +^^^^^^^^^^^^^^^^^^^^ -.. include:: sc_vdj.rst +.. include:: sc_atac.rst --------------------------------- diff --git a/docs/cellranger/sc_atac.rst b/docs/cellranger/sc_atac.rst index 15880efb..5694cb1d 100644 --- a/docs/cellranger/sc_atac.rst +++ b/docs/cellranger/sc_atac.rst @@ -19,30 +19,6 @@ Sample sheet - Mouse mm10, cellranger-arc/atac reference 2.0.0 * - **GRCh38_and_mm10-2020-A_atac_v2.0.0** - Human GRCh38 and mouse mm10, cellranger-atac reference 2.0.0 - * - **GRCh38_atac_v1.2.0** - - Human GRCh38, cellranger-atac reference 1.2.0 - * - **mm10_atac_v1.2.0** - - Mouse mm10, cellranger-atac reference 1.2.0 - * - **hg19_atac_v1.2.0** - - Human hg19, cellranger-atac reference 1.2.0 - * - **b37_atac_v1.2.0** - - Human b37 build, cellranger-atac reference 1.2.0 - * - **GRCh38_and_mm10_atac_v1.2.0** - - Human GRCh38 and mouse mm10, cellranger-atac reference 1.2.0 - * - **hg19_and_mm10_atac_v1.2.0** - - Human hg19 and mouse mm10, cellranger-atac reference 1.2.0 - * - **GRCh38_atac_v1.1.0** - - Human GRCh38, cellranger-atac reference 1.1.0 - * - **mm10_atac_v1.1.0** - - Mouse mm10, cellranger-atac reference 1.1.0 - * - **hg19_atac_v1.1.0** - - Human hg19, cellranger-atac reference 1.1.0 - * - **b37_atac_v1.1.0** - - Human b37 build, cellranger-atac reference 1.1.0 - * - **GRCh38_and_mm10_atac_v1.1.0** - - Human GRCh38 and mouse mm10, cellranger-atac reference 1.1.0 - * - **hg19_and_mm10_atac_v1.1.0** - - Human hg19 and mouse mm10, cellranger-atac reference 1.1.0 #. **Index** column. diff --git a/docs/cellranger/sc_sn_rnaseq.rst b/docs/cellranger/sc_sn_rnaseq.rst index 0ed7ac73..510e997f 100644 --- a/docs/cellranger/sc_sn_rnaseq.rst +++ b/docs/cellranger/sc_sn_rnaseq.rst @@ -25,49 +25,6 @@ Sample sheet - Mouse mm10 (GENCODE vM23/Ensembl 98) * - **GRCh38_and_mm10-2020-A** - Human GRCh38 (GENCODE v32/Ensembl 98) and mouse mm10 (GENCODE vM23/Ensembl 98) - * - **GRCh38_v3.0.0** - - Human GRCh38, cellranger reference 3.0.0, Ensembl v93 gene annotation - * - **hg19_v3.0.0** - - Human hg19, cellranger reference 3.0.0, Ensembl v87 gene annotation - * - **mm10_v3.0.0** - - Mouse mm10, cellranger reference 3.0.0, Ensembl v93 gene annotation - * - **GRCh38_and_mm10_v3.1.0** - - Human (GRCh38) and mouse (mm10), cellranger references 3.1.0, Ensembl v93 gene annotations for both human and mouse - * - **hg19_and_mm10_v3.0.0** - - Human (hg19) and mouse (mm10), cellranger reference 3.0.0, Ensembl v93 gene annotations for both human and mouse - * - **GRCh38_v1.2.0** or **GRCh38** - - Human GRCh38, cellranger reference 1.2.0, Ensembl v84 gene annotation - * - **hg19_v1.2.0** or **hg19** - - Human hg19, cellranger reference 1.2.0, Ensembl v82 gene annotation - * - **mm10_v1.2.0** or **mm10** - - Mouse mm10, cellranger reference 1.2.0, Ensembl v84 gene annotation - * - **GRCh38_and_mm10_v1.2.0** or **GRCh38_and_mm10** - - Human and mouse, built from GRCh38 and mm10 cellranger references, Ensembl v84 gene annotations are used - * - **GRCh38_and_SARSCoV2** - - Human GRCh38 and SARS-COV-2 RNA genome, cellranger reference 3.0.0, generated by `Carly Ziegler`_. The SARS-COV-2 viral sequence and gtf are as described in `[Kim et al. Cell 2020]`_ (https://github.com/hyeshik/sars-cov-2-transcriptome, BetaCov/South Korea/KCDC03/2020 based on NC_045512.2). The GTF was edited to include only CDS regions, and regions were added to describe the 5' UTR ("SARSCoV2_5prime"), the 3' UTR ("SARSCoV2_3prime"), and reads aligning to anywhere within the Negative Strand("SARSCoV2_NegStrand"). Additionally, trailing A's at the 3' end of the virus were excluded from the SARSCoV2 fasta, as these were found to drive spurious viral alignment in pre-COVID19 samples. - - Pre-built snRNA-seq references are summarized below. - - .. list-table:: - :widths: 5 20 - :header-rows: 1 - - * - Keyword - - Description - * - **GRCh38_premrna_v3.0.0** - - Human, introns included, built from GRCh38 cellranger reference 3.0.0, Ensembl v93 gene annotation, treating annotated transcripts as exons - * - **GRCh38_premrna_v1.2.0** or **GRCh38_premrna** - - Human, introns included, built from GRCh38 cellranger reference 1.2.0, Ensembl v84 gene annotation, treating annotated transcripts as exons - * - **mm10_premrna_v1.2.0** or **mm10_premrna** - - Mouse, introns included, built from mm10 cellranger reference 1.2.0, Ensembl v84 gene annotation, treating annotated transcripts as exons - * - **GRCh38_premrna_and_mm10_premrna_v1.2.0** or **GRCh38_premrna_and_mm10_premrna** - - Human and mouse, introns included, built from GRCh38_premrna_v1.2.0 and mm10_premrna_v1.2.0 - * - **GRCh38_premrna_and_SARSCoV2** - - Human, introns included, built from GRCh38_premrna_v3.0.0, and SARS-COV-2 RNA genome. This reference was generated by `Carly Ziegler`_. The SARS-COV-2 RNA genome is from `[Kim et al. Cell 2020]`_ (https://github.com/hyeshik/sars-cov-2-transcriptome, BetaCov/South Korea/KCDC03/2020 based on NC_045512.2). Please see the description of *GRCh38_and_SARSCoV2* above for details. - -#. **Index** column. - - Put `10x single cell RNA-seq sample index set names`_ (e.g. SI-GA-A12) here. #. *Chemistry* column. @@ -85,22 +42,9 @@ Sample sheet - Single Cell 3′ * - **fiveprime** - Single Cell 5′ - * - **SC3Pv1** - - Single Cell 3′ v1 - * - **SC3Pv2** - - Single Cell 3′ v2 - * - **SC3Pv3** - - Single Cell 3′ v3. You should set cellranger version input parameter to >= 3.0.2 - * - **SC3Pv4** - - Single Cell 3' v4. **Notice:** This is GEM-X chemistry, and only works for Cell Ranger v8.0.0+ - * - **SC5P-PE** - - Single Cell 5′ paired-end (both R1 and R2 are used for alignment) - * - **SC5P-PE-v3** - - Single Cell 5' paired-end v3 (both R1 and R2 are used for alignment). **Notice:** This is GEM-X chemistry, and only works for Cell Ranger v8.0.0+ - * - **SC5P-R2** - - Single Cell 5′ R2-only (where only R2 is used for alignment) - * - **SC5P-R2-v3** - - Single Cell 5' R2-only v3 (where only R2 is used for alignment). **Notice:** This is GEM-X chemistry, and only works for Cell Rangrer v8.0.0+ + +#. *Flowcell* column. + #. *DataType* column. @@ -140,38 +84,6 @@ For sc/snRNA-seq data, ``cellranger_workflow`` takes Illumina outputs as input a - Output directory - "gs://fc-e0000000-0000-0000-0000-000000000000/cellranger_output" - Results are written under directory *output_directory* and will overwrite any existing files at this location. - * - run_mkfastq - - If you want to run ``cellranger mkfastq`` - - true - - true - * - run_count - - If you want to run ``cellranger count`` - - true - - true - * - delete_input_bcl_directory - - If delete BCL directories after demux. If false, you should delete this folder yourself so as to not incur storage charges - - false - - false - * - mkfastq_barcode_mismatches - - Number of mismatches allowed in matching barcode indices (bcl2fastq2 default is 1) - - 0 - - - * - mkfastq_force_single_index - - If 10x-supplied i7/i5 paired indices are specified, but the flowcell was run with only one sample index, allow the demultiplex to proceed using the i7 half of the sample index pair - - false - - false - * - mkfastq_filter_single_index - - Only demultiplex samples identified by an i7-only sample index, ignoring dual-indexed samples. Dual-indexed samples will not be demultiplexed - - false - - false - * - mkfastq_use_bases_mask - - Override the read lengths as specified in *RunInfo.xml* - - "Y28n*,I8n*,N10,Y90n*" - - - * - mkfastq_delete_undetermined - - Delete undetermined FASTQ files generated by bcl2fastq2 - - true - - false * - force_cells - Force pipeline to use this number of cells, bypassing the cell detection algorithm, mutually exclusive with expect_cells - 6000 @@ -193,7 +105,7 @@ For sc/snRNA-seq data, ``cellranger_workflow`` takes Illumina outputs as input a - false - false * - cellranger_version - - cellranger version, could be: 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0, 6.1.2, 6.1.1, 6.0.2, 6.0.1, 6.0.0, 5.0.1, 5.0.0 + - cellranger version, could be: 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0 - "9.0.0" - "9.0.0" * - config_version diff --git a/docs/cellranger/sc_vdj.rst b/docs/cellranger/sc_vdj.rst index d5656a08..4fd348b5 100644 --- a/docs/cellranger/sc_vdj.rst +++ b/docs/cellranger/sc_vdj.rst @@ -19,22 +19,6 @@ Sample sheet - Human GRCh38 V(D)J sequences, cellranger reference 7.0.0, annotation built from Ensembl *Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf* * - **GRCm38_vdj_v7.0.0** - Mouse GRCm38 V(D)J sequences, cellranger reference 7.0.0, annotation built from Ensembl *Mus_musculus.GRCm38.94.gtf* - * - **GRCh38_vdj_v5.0.0** - - Human GRCh38 V(D)J sequences, cellranger reference 5.0.0, annotation built from Ensembl *Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf* - * - **GRCm38_vdj_v5.0.0** - - Mouse GRCm38 V(D)J sequences, cellranger reference 5.0.0, annotation built from Ensembl *Mus_musculus.GRCm38.94.gtf* - * - **GRCh38_vdj_v4.0.0** - - Human GRCh38 V(D)J sequences, cellranger reference 4.0.0, annotation built from Ensembl *Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf* - * - **GRCm38_vdj_v4.0.0** - - Mouse GRCm38 V(D)J sequences, cellranger reference 4.0.0, annotation built from Ensembl *Mus_musculus.GRCm38.94.gtf* - * - **GRCh38_vdj_v3.1.0** - - Human GRCh38 V(D)J sequences, cellranger reference 3.1.0, annotation built from Ensembl *Homo_sapiens.GRCh38.94.chr_patch_hapl_scaff.gtf* - * - **GRCm38_vdj_v3.1.0** - - Mouse GRCm38 V(D)J sequences, cellranger reference 3.1.0, annotation built from Ensembl *Mus_musculus.GRCm38.94.gtf* - * - **GRCh38_vdj_v2.0.0** or **GRCh38_vdj** - - Human GRCh38 V(D)J sequences, cellranger reference 2.0.0, annotation built from Ensembl *Homo_sapiens.GRCh38.87.chr_patch_hapl_scaff.gtf* and *vdj_GRCh38_alts_ensembl_10x_genes-2.0.0.gtf* - * - **GRCm38_vdj_v2.2.0** or **GRCm38_vdj** - - Mouse GRCm38 V(D)J sequences, cellranger reference 2.2.0, annotation built from Ensembl *Mus_musculus.GRCm38.90.chr_patch_hapl_scaff.gtf* #. **Index** column. diff --git a/docs/spaceranger.rst b/docs/spaceranger.rst index 7827f6a6..d9975fa7 100644 --- a/docs/spaceranger.rst +++ b/docs/spaceranger.rst @@ -276,7 +276,7 @@ For spatial data, ``spaceranger_workflow`` takes Illumina outputs and related im - 50 - * - spaceranger_version - - spaceranger version, could be: 3.1.2, 3.0.1, 3.0.0, 2.1.1, 2.0.1, 2.0.0, 1.3.1, 1.3.0 + - spaceranger version, could be: 3.1.2, 3.0.1, 3.0.0 - "3.1.2" - "3.1.2" * - config_version diff --git a/workflows/cellranger/cellranger_workflow.wdl b/workflows/cellranger/cellranger_workflow.wdl index 6dbaa614..0f0b5ec0 100644 --- a/workflows/cellranger/cellranger_workflow.wdl +++ b/workflows/cellranger/cellranger_workflow.wdl @@ -1,43 +1,19 @@ version 1.0 -import "cellranger_mkfastq.wdl" as crm import "cellranger_count.wdl" as crc import "cellranger_multi.wdl" as crmulti import "cellranger_vdj.wdl" as crv import "../cumulus/cumulus_adt.wdl" as ca -import "cellranger_atac_mkfastq.wdl" as cram import "cellranger_atac_count.wdl" as crac -import "cellranger_arc_mkfastq.wdl" as crarm import "cellranger_arc_count.wdl" as crarc workflow cellranger_workflow { input { - # 5 - 9 columns (Sample, Reference, Flowcell, Lane, Index, [Chemistry, DataType, FeatureBarcodeFile, Link]). gs URL + # Columns: Sample, Reference, Flowcell, [Chemistry, DataType, FeatureBarcodeFile, Link]). File input_csv_file - # Output directory, gs URL + # Output directory, AWS or GCP URI String output_directory - # If run mkfastq - Boolean run_mkfastq = true - # If run count - Boolean run_count = true - - # for mkfastq - - # Whether to delete input_bcl_directory, default: false - Boolean delete_input_bcl_directory = false - # Number of allowed mismatches per index - Int? mkfastq_barcode_mismatches - # If 10x-supplied i7/i5 paired indices are specified, but the flowcell was run with only one sample index, allow the demultiplex to proceed using the i7 half of the sample index pair. - Boolean mkfastq_force_single_index = false - # Only demultiplex samples identified by an i7-only sample index, ignoring dual-indexed samples. Dual-indexed samples will not be demultiplexed. - Boolean mkfastq_filter_single_index = false - # Override the read lengths as specified in RunInfo.xml - String? mkfastq_use_bases_mask - # Delete undetermined FASTQ files generated by bcl2fastq2 - Boolean mkfastq_delete_undetermined = false - - # For cellranger count # Force pipeline to use this number of cells, bypassing the cell detection algorithm, mutually exclusive with expect_cells. @@ -98,21 +74,18 @@ workflow cellranger_workflow { # Index TSV file File acronym_file = "gs://regev-lab/resources/cellranger/index.tsv" - # 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0, 6.1.2, 6.1.1, 6.0.2, 6.0.1, 6.0.0, 5.0.1, 5.0.0 + # 9.0.0, 8.0.1, 8.0.0, 7.2.0, 7.1.0, 7.0.1, 7.0.0 String cellranger_version = "9.0.0" - # 0.11.4, 0.11.3, 0.11.2, 0.11.1, 0.11.0, 0.10.0, 0.9.0, 0.8.0, 0.7.0, 0.6.0, 0.5.0, 0.4.0, 0.3.0, 0.2.0 String cumulus_feature_barcoding_version = "0.11.4" - # 2.1.0, 2.0.0, 1.2.0, 1.1.0 + # 2.1.0, 2.0.0 String cellranger_atac_version = "2.1.0" - # 2.0.2.strato, 2.0.2.custom-max-cell, 2.0.2, 2.0.1, 2.0.0, 1.0.1, 1.0.0 + # 2.0.2.strato, 2.0.2.custom-max-cell, 2.0.2, 2.0.1, 2.0.0 String cellranger_arc_version = "2.0.2.strato" # config version String config_version = "0.3" # Which docker registry to use: quay.io/cumulus (default) or cumulusprod String docker_registry = "quay.io/cumulus" - # cellranger/cellranger-atac/cellranger-arc mkfastq registry, default to gcr.io/broad-cumulus - String mkfastq_docker_registry = "gcr.io/broad-cumulus" # Google cloud zones, default to "us-central1-a us-central1-b us-central1-c us-central1-f us-east1-b us-east1-c us-east1-d us-west1-a us-west1-b us-west1-c" String zones = "us-central1-a us-central1-b us-central1-c us-central1-f us-east1-b us-east1-c us-east1-d us-west1-a us-west1-b us-west1-c" # Backend @@ -137,8 +110,6 @@ workflow cellranger_workflow { # Memory string for cellranger-arc count String arc_memory = "160G" - # Optional disk space for mkfastq. - Int mkfastq_disk_space = 1500 # Optional disk space needed for cell ranger count. Int count_disk_space = 500 # Optional disk space needed for cell ranger multi. @@ -162,16 +133,56 @@ workflow cellranger_workflow { String output_directory_stripped = sub(output_directory, "[/\\s]+$", "") String docker_registry_stripped = sub(docker_registry, "/+$", "") - String mkfastq_docker_registry_stripped = sub(mkfastq_docker_registry, "/+$", "") Map[String, String] acronym2gsurl = read_map(acronym_file) String null_file = acronym2gsurl["null_file"] - if (run_mkfastq) { - call generate_bcl_csv { + + call generate_count_config { + input: + input_csv_file = input_csv_file, + output_dir = output_directory_stripped, + config_version = config_version, + docker_registry = docker_registry_stripped, + zones = zones, + preemptible = preemptible, + awsQueueArn = awsQueueArn, + backend = backend, + null_file = null_file + } + + if (length(generate_count_config.sample_ids) > 0) { + scatter (sample_id in generate_count_config.sample_ids) { + call crc.cellranger_count as cellranger_count { + input: + sample_id = sample_id, + input_fastqs_directories = generate_count_config.sample2dir[sample_id], + output_directory = output_directory_stripped, + genome = generate_count_config.sample2genome[sample_id], + target_panel = generate_count_config.sample2fbf[sample_id], + chemistry = generate_count_config.sample2chemistry[sample_id], + include_introns = include_introns, + acronym_file = acronym_file, + no_bam = no_bam, + secondary = secondary, + force_cells = force_cells, + expect_cells = expect_cells, + cellranger_version = cellranger_version, + docker_registry = docker_registry_stripped, + zones = zones, + num_cpu = num_cpu, + memory = memory, + disk_space = count_disk_space, + preemptible = preemptible, + backend = backend, + awsQueueArn = awsQueueArn + } + } + + call collect_summaries { input: - input_csv_file = input_csv_file, - output_dir = output_directory_stripped, + summaries = cellranger_count.output_metrics_summary, + sample_ids = cellranger_count.output_count_directory, config_version = config_version, docker_registry = docker_registry_stripped, zones = zones, @@ -179,464 +190,239 @@ workflow cellranger_workflow { awsQueueArn = awsQueueArn, backend = backend } + } - if (length(generate_bcl_csv.bcl_csv_rna) > 0) { - scatter (bcl_csv in generate_bcl_csv.bcl_csv_rna) { - String rna_key = basename(bcl_csv) - call crm.cellranger_mkfastq as cellranger_mkfastq { - input: - input_bcl_directory = generate_bcl_csv.inpdirs[rna_key], - input_csv_file = bcl_csv, - output_directory = output_directory_stripped, - delete_input_bcl_directory = delete_input_bcl_directory, - barcode_mismatches = mkfastq_barcode_mismatches, - force_single_index = mkfastq_force_single_index, - filter_single_index = mkfastq_filter_single_index, - use_bases_mask = mkfastq_use_bases_mask, - delete_undetermined = mkfastq_delete_undetermined, - cellranger_version = cellranger_version, - docker_registry = mkfastq_docker_registry_stripped, - zones = zones, - num_cpu = num_cpu, - memory = memory, - disk_space = mkfastq_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } - } - - if (length(generate_bcl_csv.bcl_csv_atac) > 0) { - scatter (bcl_csv in generate_bcl_csv.bcl_csv_atac) { - String atac_key = basename(bcl_csv) - call cram.cellranger_atac_mkfastq as cellranger_atac_mkfastq { - input: - input_bcl_directory = generate_bcl_csv.inpdirs[atac_key], - input_csv_file = bcl_csv, - output_directory = output_directory_stripped, - delete_input_bcl_directory = delete_input_bcl_directory, - barcode_mismatches = mkfastq_barcode_mismatches, - force_single_index = mkfastq_force_single_index, - filter_single_index = mkfastq_filter_single_index, - use_bases_mask = mkfastq_use_bases_mask, - delete_undetermined = mkfastq_delete_undetermined, - cellranger_atac_version = cellranger_atac_version, - docker_registry = mkfastq_docker_registry_stripped, - zones = zones, - num_cpu = num_cpu, - memory = memory, - disk_space = mkfastq_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } - } - - if (length(generate_bcl_csv.bcl_csv_arc) > 0) { - scatter (bcl_csv in generate_bcl_csv.bcl_csv_arc) { - String arc_key = basename(bcl_csv) - call crarm.cellranger_arc_mkfastq as cellranger_arc_mkfastq { - input: - input_bcl_directory = generate_bcl_csv.inpdirs[arc_key], - input_csv_file = bcl_csv, - output_directory = output_directory_stripped, - delete_input_bcl_directory = delete_input_bcl_directory, - barcode_mismatches = mkfastq_barcode_mismatches, - force_single_index = mkfastq_force_single_index, - filter_single_index = mkfastq_filter_single_index, - use_bases_mask = mkfastq_use_bases_mask, - delete_undetermined = mkfastq_delete_undetermined, - cellranger_arc_version = cellranger_arc_version, - docker_registry = mkfastq_docker_registry_stripped, - zones = zones, - num_cpu = num_cpu, - memory = memory, - disk_space = mkfastq_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } + if (length(generate_count_config.sample_vdj_ids) > 0) { + scatter (sample_id in generate_count_config.sample_vdj_ids) { + call crv.cellranger_vdj as cellranger_vdj { + input: + sample_id = sample_id, + input_fastqs_directories = generate_count_config.sample2dir[sample_id], + output_directory = output_directory_stripped, + genome = generate_count_config.sample2genome[sample_id], + acronym_file = acronym_file, + denovo = vdj_denovo, + chain = vdj_chain, + cellranger_version = cellranger_version, + docker_registry = docker_registry_stripped, + zones = zones, + num_cpu = num_cpu, + memory = memory, + disk_space = vdj_disk_space, + preemptible = preemptible, + backend = backend, + awsQueueArn = awsQueueArn } } - } - if (run_count) { - call generate_count_config { + call collect_summaries as collect_summaries_vdj { input: - input_csv_file = input_csv_file, - output_dir = output_directory_stripped, - fastq_dirs = cellranger_mkfastq.output_fastqs_flowcell_directory, - fastq_dirs_atac = cellranger_atac_mkfastq.output_fastqs_flowcell_directory, - fastq_dirs_arc = cellranger_arc_mkfastq.output_fastqs_flowcell_directory, + summaries = cellranger_vdj.output_metrics_summary, + sample_ids = cellranger_vdj.output_vdj_directory, config_version = config_version, docker_registry = docker_registry_stripped, zones = zones, preemptible = preemptible, awsQueueArn = awsQueueArn, - backend = backend, - null_file = null_file + backend = backend } + } - if (length(generate_count_config.sample_ids) > 0) { - scatter (sample_id in generate_count_config.sample_ids) { - call crc.cellranger_count as cellranger_count { - input: - sample_id = sample_id, - input_fastqs_directories = generate_count_config.sample2dir[sample_id], - output_directory = output_directory_stripped, - genome = generate_count_config.sample2genome[sample_id], - target_panel = generate_count_config.sample2fbf[sample_id], - chemistry = generate_count_config.sample2chemistry[sample_id], - include_introns = include_introns, - acronym_file = acronym_file, - no_bam = no_bam, - secondary = secondary, - force_cells = force_cells, - expect_cells = expect_cells, - cellranger_version = cellranger_version, - docker_registry = docker_registry_stripped, - zones = zones, - num_cpu = num_cpu, - memory = memory, - disk_space = count_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } - - call collect_summaries { + if (length(generate_count_config.sample_feature_ids) > 0) { + scatter (sample_id in generate_count_config.sample_feature_ids) { + call ca.cumulus_adt as cumulus_adt { input: - summaries = cellranger_count.output_metrics_summary, - sample_ids = cellranger_count.output_count_directory, - config_version = config_version, + sample_id = sample_id, + input_fastqs_directories = generate_count_config.sample2dir[sample_id], + output_directory = output_directory_stripped, + chemistry = generate_count_config.sample2chemistry[sample_id], + data_type = generate_count_config.sample2datatype[sample_id], + feature_barcode_file = generate_count_config.sample2fbf[sample_id], + crispr_barcode_pos = crispr_barcode_pos, + scaffold_sequence = scaffold_sequence, + max_mismatch = max_mismatch, + min_read_ratio = min_read_ratio, + cumulus_feature_barcoding_version = cumulus_feature_barcoding_version, docker_registry = docker_registry_stripped, + acronym_file = acronym_file, zones = zones, + num_cpu = feature_num_cpu, + memory = feature_memory, + disk_space = feature_disk_space, preemptible = preemptible, - awsQueueArn = awsQueueArn, - backend = backend + backend = backend, + awsQueueArn = awsQueueArn } } + } - if (length(generate_count_config.sample_vdj_ids) > 0) { - scatter (sample_id in generate_count_config.sample_vdj_ids) { - call crv.cellranger_vdj as cellranger_vdj { - input: - sample_id = sample_id, - input_fastqs_directories = generate_count_config.sample2dir[sample_id], - output_directory = output_directory_stripped, - genome = generate_count_config.sample2genome[sample_id], - acronym_file = acronym_file, - denovo = vdj_denovo, - chain = vdj_chain, - cellranger_version = cellranger_version, - docker_registry = docker_registry_stripped, - zones = zones, - num_cpu = num_cpu, - memory = memory, - disk_space = vdj_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } - - call collect_summaries as collect_summaries_vdj { + if (length(generate_count_config.sample_atac_ids) > 0) { + scatter (sample_id in generate_count_config.sample_atac_ids) { + call crac.cellranger_atac_count as cellranger_atac_count { input: - summaries = cellranger_vdj.output_metrics_summary, - sample_ids = cellranger_vdj.output_vdj_directory, - config_version = config_version, + sample_id = sample_id, + input_fastqs_directories = generate_count_config.sample2dir[sample_id], + output_directory = output_directory_stripped, + genome = generate_count_config.sample2genome[sample_id], + acronym_file = acronym_file, + force_cells = force_cells, + dim_reduce = atac_dim_reduce, + peaks = peaks, + chemistry = generate_count_config.sample2chemistry[sample_id], + cellranger_atac_version = cellranger_atac_version, docker_registry = docker_registry_stripped, zones = zones, + num_cpu = atac_num_cpu, + memory = atac_memory, + disk_space = atac_disk_space, preemptible = preemptible, - awsQueueArn = awsQueueArn, - backend = backend + backend = backend, + awsQueueArn = awsQueueArn } } - if (length(generate_count_config.sample_feature_ids) > 0) { - scatter (sample_id in generate_count_config.sample_feature_ids) { - call ca.cumulus_adt as cumulus_adt { - input: - sample_id = sample_id, - input_fastqs_directories = generate_count_config.sample2dir[sample_id], - output_directory = output_directory_stripped, - chemistry = generate_count_config.sample2chemistry[sample_id], - data_type = generate_count_config.sample2datatype[sample_id], - feature_barcode_file = generate_count_config.sample2fbf[sample_id], - crispr_barcode_pos = crispr_barcode_pos, - scaffold_sequence = scaffold_sequence, - max_mismatch = max_mismatch, - min_read_ratio = min_read_ratio, - cumulus_feature_barcoding_version = cumulus_feature_barcoding_version, - docker_registry = docker_registry_stripped, - acronym_file = acronym_file, - zones = zones, - num_cpu = feature_num_cpu, - memory = feature_memory, - disk_space = feature_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } + call collect_summaries as collect_summaries_atac { + input: + summaries = cellranger_atac_count.output_metrics_summary, + sample_ids = cellranger_atac_count.output_count_directory, + config_version = config_version, + docker_registry = docker_registry_stripped, + zones = zones, + preemptible = preemptible, + awsQueueArn = awsQueueArn, + backend = backend } + } - if (length(generate_count_config.sample_atac_ids) > 0) { - scatter (sample_id in generate_count_config.sample_atac_ids) { - call crac.cellranger_atac_count as cellranger_atac_count { - input: - sample_id = sample_id, - input_fastqs_directories = generate_count_config.sample2dir[sample_id], - output_directory = output_directory_stripped, - genome = generate_count_config.sample2genome[sample_id], - acronym_file = acronym_file, - force_cells = force_cells, - dim_reduce = atac_dim_reduce, - peaks = peaks, - chemistry = generate_count_config.sample2chemistry[sample_id], - cellranger_atac_version = cellranger_atac_version, - docker_registry = docker_registry_stripped, - zones = zones, - num_cpu = atac_num_cpu, - memory = atac_memory, - disk_space = atac_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } - - call collect_summaries as collect_summaries_atac { + if (length(generate_count_config.link_arc_ids) > 0) { + scatter (link_id in generate_count_config.link_arc_ids) { + call crarc.cellranger_arc_count as cellranger_arc_count { input: - summaries = cellranger_atac_count.output_metrics_summary, - sample_ids = cellranger_atac_count.output_count_directory, - config_version = config_version, + link_id = link_id, + input_samples = generate_count_config.link2sample[link_id], + input_fastqs_directories = generate_count_config.sample2dir[link_id], + input_data_types = generate_count_config.sample2datatype[link_id], + output_directory = output_directory_stripped, + acronym_file = acronym_file, + genome = generate_count_config.sample2genome[link_id], + gex_exclude_introns = arc_gex_exclude_introns, + no_bam = no_bam, + min_atac_count = arc_min_atac_count, + min_gex_count = arc_min_gex_count, + peaks = peaks, + cellranger_arc_version = cellranger_arc_version, docker_registry = docker_registry_stripped, zones = zones, + num_cpu = arc_num_cpu, + memory = arc_memory, + disk_space = arc_disk_space, preemptible = preemptible, - awsQueueArn = awsQueueArn, - backend = backend + backend = backend, + awsQueueArn = awsQueueArn } } - if (length(generate_count_config.link_arc_ids) > 0) { - scatter (link_id in generate_count_config.link_arc_ids) { - call crarc.cellranger_arc_count as cellranger_arc_count { - input: - link_id = link_id, - input_samples = generate_count_config.link2sample[link_id], - input_fastqs_directories = generate_count_config.sample2dir[link_id], - input_data_types = generate_count_config.sample2datatype[link_id], - output_directory = output_directory_stripped, - acronym_file = acronym_file, - genome = generate_count_config.sample2genome[link_id], - gex_exclude_introns = arc_gex_exclude_introns, - no_bam = no_bam, - min_atac_count = arc_min_atac_count, - min_gex_count = arc_min_gex_count, - peaks = peaks, - cellranger_arc_version = cellranger_arc_version, - docker_registry = docker_registry_stripped, - zones = zones, - num_cpu = arc_num_cpu, - memory = arc_memory, - disk_space = arc_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } + call collect_summaries as collect_summaries_arc { + input: + summaries = cellranger_arc_count.output_metrics_summary, + sample_ids = cellranger_arc_count.output_count_directory, + config_version = config_version, + docker_registry = docker_registry_stripped, + zones = zones, + preemptible = preemptible, + awsQueueArn = awsQueueArn, + backend = backend + } + } - call collect_summaries as collect_summaries_arc { + if (length(generate_count_config.link_multi_ids) > 0) { + scatter (link_id in generate_count_config.link_multi_ids) { + call crmulti.cellranger_multi as cellranger_multi { input: - summaries = cellranger_arc_count.output_metrics_summary, - sample_ids = cellranger_arc_count.output_count_directory, - config_version = config_version, + link_id = link_id, + input_samples = generate_count_config.link2sample[link_id], + input_fastqs_directories = generate_count_config.sample2dir[link_id], + input_data_types = generate_count_config.sample2datatype[link_id], + input_fbf = generate_count_config.sample2fbf[link_id], + output_directory = output_directory_stripped, + acronym_file = acronym_file, + genome = generate_count_config.sample2genome[link_id], + probe_set = generate_count_config.sample2probeset[link_id], + cmo_set = cmo_set, + include_introns = include_introns, + no_bam = no_bam, + secondary = secondary, + force_cells = force_cells, + expect_cells = expect_cells, + cellranger_version = cellranger_version, docker_registry = docker_registry_stripped, zones = zones, + num_cpu = num_cpu, + memory = memory, + disk_space = multi_disk_space, preemptible = preemptible, - awsQueueArn = awsQueueArn, - backend = backend + backend = backend, + awsQueueArn = awsQueueArn } } + } - if (length(generate_count_config.link_multi_ids) > 0) { - scatter (link_id in generate_count_config.link_multi_ids) { - call crmulti.cellranger_multi as cellranger_multi { - input: - link_id = link_id, - input_samples = generate_count_config.link2sample[link_id], - input_fastqs_directories = generate_count_config.sample2dir[link_id], - input_data_types = generate_count_config.sample2datatype[link_id], - input_fbf = generate_count_config.sample2fbf[link_id], - output_directory = output_directory_stripped, - acronym_file = acronym_file, - genome = generate_count_config.sample2genome[link_id], - probe_set = generate_count_config.sample2probeset[link_id], - cmo_set = cmo_set, - include_introns = include_introns, - no_bam = no_bam, - secondary = secondary, - force_cells = force_cells, - expect_cells = expect_cells, - cellranger_version = cellranger_version, - docker_registry = docker_registry_stripped, - zones = zones, - num_cpu = num_cpu, - memory = memory, - disk_space = multi_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } - } - - if (length(generate_count_config.link_fbc_ids) > 0) { - scatter (link_id in generate_count_config.link_fbc_ids) { - call crc.cellranger_count as cellranger_count_fbc { - input: - sample_id = link_id, - input_samples = generate_count_config.link2sample[link_id], - input_fastqs_directories = generate_count_config.sample2dir[link_id], - input_data_types = generate_count_config.sample2datatype[link_id], - input_fbf = generate_count_config.sample2fbf[link_id], - output_directory = output_directory_stripped, - acronym_file = acronym_file, - genome = generate_count_config.sample2genome[link_id], - include_introns = include_introns, - no_bam = no_bam, - secondary = secondary, - force_cells = force_cells, - expect_cells = expect_cells, - cellranger_version = cellranger_version, - docker_registry = docker_registry_stripped, - zones = zones, - num_cpu = num_cpu, - memory = memory, - disk_space = count_disk_space, - preemptible = preemptible, - backend = backend, - awsQueueArn = awsQueueArn - } - } - - call collect_summaries as collect_summaries_fbc { + if (length(generate_count_config.link_fbc_ids) > 0) { + scatter (link_id in generate_count_config.link_fbc_ids) { + call crc.cellranger_count as cellranger_count_fbc { input: - summaries = cellranger_count_fbc.output_metrics_summary, - sample_ids = cellranger_count_fbc.output_count_directory, - config_version = config_version, + sample_id = link_id, + input_samples = generate_count_config.link2sample[link_id], + input_fastqs_directories = generate_count_config.sample2dir[link_id], + input_data_types = generate_count_config.sample2datatype[link_id], + input_fbf = generate_count_config.sample2fbf[link_id], + output_directory = output_directory_stripped, + acronym_file = acronym_file, + genome = generate_count_config.sample2genome[link_id], + include_introns = include_introns, + no_bam = no_bam, + secondary = secondary, + force_cells = force_cells, + expect_cells = expect_cells, + cellranger_version = cellranger_version, docker_registry = docker_registry_stripped, zones = zones, + num_cpu = num_cpu, + memory = memory, + disk_space = count_disk_space, preemptible = preemptible, - awsQueueArn = awsQueueArn, - backend = backend + backend = backend, + awsQueueArn = awsQueueArn } } + + call collect_summaries as collect_summaries_fbc { + input: + summaries = cellranger_count_fbc.output_metrics_summary, + sample_ids = cellranger_count_fbc.output_count_directory, + config_version = config_version, + docker_registry = docker_registry_stripped, + zones = zones, + preemptible = preemptible, + awsQueueArn = awsQueueArn, + backend = backend + } } output { - Array[Array[String]?] fastq_outputs = [cellranger_mkfastq.output_fastqs_flowcell_directory, cellranger_atac_mkfastq.output_fastqs_flowcell_directory, cellranger_arc_mkfastq.output_fastqs_flowcell_directory] - Map[String, Array[String]?] count_outputs = {"gex": cellranger_count.output_count_directory, - "vdj": cellranger_vdj.output_vdj_directory, - "adt": cumulus_adt.output_count_directory, - "atac": cellranger_atac_count.output_count_directory, - "arc": cellranger_arc_count.output_count_directory, - "multi": cellranger_multi.output_multi_directory, - "fbc": cellranger_count_fbc.output_count_directory - } + Map[String, Array[String]?] count_outputs = { + "gex": cellranger_count.output_count_directory, + "vdj": cellranger_vdj.output_vdj_directory, + "adt": cumulus_adt.output_count_directory, + "atac": cellranger_atac_count.output_count_directory, + "arc": cellranger_arc_count.output_count_directory, + "multi": cellranger_multi.output_multi_directory, + "fbc": cellranger_count_fbc.output_count_directory + } File? count_matrix = generate_count_config.count_matrix } } -task generate_bcl_csv { - input { - File input_csv_file - String output_dir - String config_version - String docker_registry - String zones - Int preemptible - String awsQueueArn - String backend - } - - command { - set -e - export TMPDIR=/tmp - - python /software/check_uri.py "~{backend}" "~{output_dir}" - - python <:;"\',*^| &', file = sys.stderr) - sys.exit(1) - for idx, row in df.iterrows(): - if ('Link' in row) and pd.notnull(row['Link']) and (row['Link'] != ''): - omics = multiomics[row['Link']] - if 'atac' in omics: - if omics != set(['atac', 'rna']): - print('CellRanger ARC only works with ATAC+RNA data! Link \'' + row['Link'] + '\' contains ' + ', '.join(list(omics)) + '.', file = sys.stderr) - sys.exit(1) - row['DataType'] = 'arc' - - with open('inpdirs.txt', 'w') as fo: - for input_dir in df['Flowcell'].unique(): - run_id = os.path.basename(input_dir) - flowcell_df = df.loc[df['Flowcell'] == input_dir] - for datatype in flowcell_df['DataType'].unique(): - bcl_df = flowcell_df.loc[flowcell_df['DataType'] == datatype, ['Lane', 'Sample', 'Index']] - bcl_file = run_id + '_' + datatype + '_bcl.csv' - bcl_df.to_csv(bcl_file, index = False) - fo.write(bcl_file + '\t' + input_dir + '\n') - CODE - } - - output { - Map[String, String] inpdirs = read_map('inpdirs.txt') - Array[File] bcl_csv_rna = glob('*_rna_bcl.csv') - Array[File] bcl_csv_atac = glob('*_atac_bcl.csv') - Array[File] bcl_csv_arc = glob('*_arc_bcl.csv') - } - - runtime { - docker: "~{docker_registry}/config:~{config_version}" - zones: zones - preemptible: preemptible - queueArn: awsQueueArn - } -} task generate_count_config { input { @@ -693,20 +479,6 @@ task generate_count_config { print('Examples of common characters that are not allowed are the space character and the following: ?()[]/\=+<>:;"\',*^| &', file = sys.stderr) sys.exit(1) - def parse_fastq_dirs(dirs_str): - r2f = dict() - if dirs_str == '': - return r2f - dirs = dirs_str.split(',') - for dir in dirs: - run_id = dir.split('/')[-3].rpartition('_')[0] - r2f[run_id] = dir - return r2f - - r2f = parse_fastq_dirs('~{sep="," fastq_dirs}') - r2f.update(parse_fastq_dirs('~{sep="," fastq_dirs_atac}')) - r2f.update(parse_fastq_dirs('~{sep="," fastq_dirs_arc}')) - with open('sample_ids.txt', 'w') as fo1, open('sample_vdj_ids.txt', 'w') as fo2, open('sample_feature_ids.txt', 'w') as fo3, open('sample_atac_ids.txt', 'w') as fo4, \ open('sample2dir.txt', 'w') as foo1, open('sample2datatype.txt', 'w') as foo2, open('sample2genome.txt', 'w') as foo3, \ open('sample2chemistry.txt', 'w') as foo4, open('sample2fbf.txt', 'w') as foo5, open('count_matrix.csv', 'w') as foo6, \ @@ -733,10 +505,7 @@ task generate_count_config { datatype = df_local['DataType'].iat[0] - if len(r2f) > 0: - dirs = df_local['Flowcell'].map(lambda x: r2f[os.path.basename(x)]).values # if also run mkfastq - else: - dirs = df_local['Flowcell'].values # if start from count step + dirs = df_local['Flowcell'].values reference = 'null' if datatype in ['rna', 'vdj', 'atac', 'frp']: From 4d25800c9612f13e20653d0b81e1c21fa1809309 Mon Sep 17 00:00:00 2001 From: Yiming Yang Date: Mon, 17 Feb 2025 02:02:37 -0800 Subject: [PATCH 7/7] check sample name prefix consistency instead of enforcing renaming --- workflows/cellranger/cellranger_count.wdl | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/workflows/cellranger/cellranger_count.wdl b/workflows/cellranger/cellranger_count.wdl index 3b9aa7f3..2e553a1b 100644 --- a/workflows/cellranger/cellranger_count.wdl +++ b/workflows/cellranger/cellranger_count.wdl @@ -141,19 +141,16 @@ task run_cellranger_count { from subprocess import check_call, CalledProcessError, DEVNULL, STDOUT from packaging import version - def rename_fastq_file(path, sample_name): + def check_fastq_file(path, sample_name): folder = os.path.dirname(path) filename = os.path.basename(path) pattern = r"(_S\d+_L\d+_[RI]\d+_001\.fastq\.gz)" match = re.search(pattern, filename) if match: idx = match.start() - cur_name = filename[:idx] - suffix = filename[idx:] + cur_name = filename[:match.start()] if cur_name != sample_name: - call_args = ["mv", path, folder+"/"+sample_name+suffix] - print(' '.join(call_args)) - check_call(call_args) + raise Exception("FASTQ sample name prefix mismatch! Expect " + sample_name + ". Get " + cur_name + ".") else: raise Exception(path + " does not follow Illumina naming convention!") @@ -174,12 +171,12 @@ task run_cellranger_count { check_call(call_args, stdout=DEVNULL, stderr=STDOUT) except CalledProcessError: # Localize tar file - call_args = ['strato', 'cp', '-m', directory + '/' + "*.tar", target] + tar_file = sample_name + ".tar" + call_args = ['strato', 'cp', '-m', directory + '/' + tar_file, target] print(' '.join(call_args)) check_call(call_args) # Untar - tar_file = glob.glob(target+"/*.tar")[0] call_args = ["tar", "--strip-components=1", "-xf", tar_file, "-C", target] print(' '.join(call_args)) check_call(call_args) @@ -192,7 +189,7 @@ task run_cellranger_count { # Rename FASTQ files if needed fastq_files = glob.glob(target+"/*.fastq.gz") for fastq_f in fastq_files: - rename_fastq_file(fastq_f, sample_name) + check_fastq_file(fastq_f, sample_name) samples = data_types = fbfs = None fastqs_dirs = []