From 019b201bbbec6042e2f6d61ca3a228c3b8f87a1b Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 15 Dec 2023 09:17:10 -0700 Subject: [PATCH] fix: testing fixes for all use cases of input, add bclconvert docker --- README.md | 2 +- docker/{demux => bcl2fastq}/Dockerfile | 0 docker/bclconvert/Dockerfile | 9 +++++ docs/index.md | 2 +- scripts/files.py | 14 ++++--- weave | 20 ++++------ workflow/Snakefile | 22 ++++------- workflow/demux.smk | 30 +++++++-------- workflow/fastq.smk | 52 +++++++++++++------------- workflow/qc.smk | 41 ++++++++++---------- 10 files changed, 97 insertions(+), 95 deletions(-) rename docker/{demux => bcl2fastq}/Dockerfile (100%) create mode 100644 docker/bclconvert/Dockerfile diff --git a/README.md b/README.md index 01c7865..0ad37e6 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ The **`./weave`** pipeline is composed of two sub commands to setup and run the **weave** common use is to gauge the qualtiy of reads for potential downstream analysis. Since bioinformatic analysis requires robust and accurate data to draw scientific conclusions, this helps save time and resources when it comes to analyzing the volumous amount of sequencing data that is collected routinely. Several of the applications that **weave** uses to visualize and report quality metrics are: -- [Kraken](https://github.com/DerrickWood/kraken2)71, kmer analysis +- [Kraken](https://github.com/DerrickWood/kraken2)7, kmer analysis - [Kaiju](https://bioinformatics-centre.github.io/kaiju/)4, kmer analysis - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), fastq statistics - [fastp](https://github.com/OpenGene/fastp)6, fastq adapter removal (trimming) diff --git a/docker/demux/Dockerfile b/docker/bcl2fastq/Dockerfile similarity index 100% rename from docker/demux/Dockerfile rename to docker/bcl2fastq/Dockerfile diff --git a/docker/bclconvert/Dockerfile b/docker/bclconvert/Dockerfile new file mode 100644 index 0000000..c17a3c4 --- /dev/null +++ b/docker/bclconvert/Dockerfile @@ -0,0 +1,9 @@ +FROM ubuntu:23.04 +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qq update; apt-get -qq install pstack gdb wget vim curl alien +WORKDIR "/" +RUN wget -q https://hpc.nih.gov/~OpenOmics/weave/bcl-convert-4.2.4-2.el7.x86_64.rpm +RUN alien --scripts bcl-convert-4.2.4-2.el7.x86_64.rpm +RUN dpkg -i bcl-convert_4.2.4-3_amd64.deb +RUN mkdir /work; chmod -R 777 /work +WORKDIR "/work" diff --git a/docs/index.md b/docs/index.md index be51720..46847a6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -49,7 +49,7 @@ The **`./weave`** pipeline is composed of two sub commands to setup and run the **weave** common use is to gauge the qualtiy of reads for potential downstream analysis. Since bioinformatic analysis requires robust and accurate data to draw scientific conclusions, this helps save time and resources when it comes to analyzing the volumous amount of sequencing data that is collected routinely. Several of the applications that **weave** uses to visualize and report quality metrics are: -- [Kraken](https://github.com/DerrickWood/kraken2)71, kmer analysis +- [Kraken](https://github.com/DerrickWood/kraken2)7, kmer analysis - [Kaiju](https://bioinformatics-centre.github.io/kaiju/)4, kmer analysis - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), fastq statistics - [fastp](https://github.com/OpenGene/fastp)6, fastq adapter removal (trimming) diff --git a/scripts/files.py b/scripts/files.py index 17712ad..9f4c5ad 100644 --- a/scripts/files.py +++ b/scripts/files.py @@ -29,11 +29,11 @@ def get_all_seq_dirs(top_dir, server): def check_if_demuxed(data_dir): - is_demuxed = False + do_demuxed = True if Path(data_dir, 'Analysis').exists(): if list(Path(data_dir, 'Analysis').rglob('*.fastq*')): - is_demuxed = True - return is_demuxed + do_demuxed = False + return do_demuxed def valid_run_output(output_directory, dry_run=False): @@ -59,7 +59,11 @@ def runid2samplesheet(runid, top_dir=DIRECTORY_CONFIGS['bigsky']['seq']): ss_path = Path(top_dir, runid) if not ss_path.exists(): raise FileNotFoundError(f"Run directory does not exist: {ss_path}") - if Path(ss_path, f"SampleSheet_{runid}.txt").exists(): + if Path(ss_path, f"SampleSheet.txt").exists(): + ss_path = Path(ss_path, f"SampleSheet.txt") + elif Path(ss_path, f"SampleSheet.csv").exists(): + ss_path = Path(ss_path, f"SampleSheet.csv") + elif Path(ss_path, f"SampleSheet_{runid}.txt").exists(): ss_path = Path(ss_path, f"SampleSheet_{runid}.txt") elif Path(ss_path, f"SampleSheet_{runid}.csv").exists(): ss_path = Path(ss_path, f"SampleSheet_{runid}.csv") @@ -149,7 +153,7 @@ def get_run_directories(runids, seq_dir=None, sheetname=None): this_run_info = dict(run_id=rid) if Path(run_p, 'SampleSheet.csv').exists(): - sheet = parse_samplesheet(Path(run_p, 'SampleSheet.csv').absolute()) + sheet = Path(run_p, 'SampleSheet.csv').absolute() elif Path(run_p, f'SampleSheet_{rid}.csv').exists(): sheet = Path(run_p, f'SampleSheet_{rid}.csv').absolute() elif Path(run_p, f'SampleSheet_{rid}.csv').exists(): diff --git a/weave b/weave index 5ac8e3e..47fd4f3 100755 --- a/weave +++ b/weave @@ -25,25 +25,21 @@ def run(args): "Please file issue if this message is blocking: https://github.com/OpenOmics/weave/issues") pairs = ['1', '2'] if sample_sheet.is_paired_end else ['1'] - - # ~~~ general run configuration ~~~ - exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet)) - exec_config['run_ids'].append(rundir.name) - exec_config['demux_input_dir'].append(rundir.absolute()) - exec_config['sids'].append([x['sid'] for x in sample_list]) - exec_config['project'].append(project_list[0]) - exec_config['rnums'].append(pairs) - # ~~~ demultiplexing configuration ~~~ bcls = [x for x in Path(rundir).rglob('*.bcl.*') if not 'tmp' in str(x)] if not bcls: bcls = [x for x in Path(rundir).rglob('*.cbcl') if not 'tmp' in str(x)] + exec_config['sample_sheet'].append(str(sample_sheet.path)) exec_config['bcl_files'].append(bcls) exec_config['demux_data'].append(files.check_if_demuxed(rundir)) # ~~~ QC/QA configuration ~~~ - sample_sheet = run_infos['samplesheet'] - exec_config['sample_sheet'].append(str(sample_sheet.path.absolute())) + exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet)) + exec_config['run_ids'].append(rundir.name) + exec_config['demux_input_dir'].append(rundir.absolute()) + exec_config['sids'].append([x['sid'] for x in sample_list]) + exec_config['project'].append(project_list[0]) + exec_config['rnums'].append(pairs) exec_config['samples'].append(sample_list) # ~~~ output verification ~~~ @@ -85,7 +81,7 @@ if __name__ == '__main__': parser_run.add_argument('-d', '--dry-run', action='store_true', help='Dry run the demultiplexing workflow') parser_run.add_argument('-n', '--noqc', action='store_false', - help='Dry run the demultiplexing workflow') + help='Do not run the QC/QA portion of the workflow (Default is on)') parser_run.add_argument('--sheetname', metavar='Sample Sheet Filename', help='Name of the sample sheet file to look for (default is SampleSheet.csv)') parser_run.add_argument('-l', '--local', action='store_true', diff --git a/workflow/Snakefile b/workflow/Snakefile index 6675353..5828bbc 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -28,59 +28,53 @@ qa_qc_outputs = flatten( [ # ~~ fastqc on untrimmed reads ~~ expand( - "{out_dir}/{rid}/{project}/{sids}/fastqc_untrimmed/{sids}_R{rnum}_001_fastqc.zip", + "{out_dir}/{project}/{sids}/fastqc_untrimmed/{sids}_R{rnum}_001_fastqc.zip", out_dir=config["out_to"], project=config["project"], - rid=config["run_ids"], sids=config["sids"], rnum=config["rnums"], ), # ~~ fastqc on trimmed reads ~~ expand( - "{out_dir}/{rid}/{project}/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnum}_fastqc.zip", + "{out_dir}/{project}/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnum}_fastqc.zip", out_dir=config["out_to"], sids=config["sids"], project=config["project"], - rid=config["run_ids"], rnum=config["rnums"], ), # ~~ fastp trimming metrics ~~ expand( - "{out_dir}/{rid}/{project}/{sids}/fastp/{sids}_trimmed_R{rnum}.fastq.gz", + "{out_dir}/{project}/{sids}/fastp/{sids}_trimmed_R{rnum}.fastq.gz", out_dir=config["out_to"], sids=config["sids"], project=config["project"], - rid=config["run_ids"], rnum=config["rnums"], ), # ~~ fastq screen ~~ expand( - "{out_dir}/{rid}/{project}/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.html", + "{out_dir}/{project}/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.html", out_dir=config["out_to"], sids=config["sids"], rnum=config["rnums"], - rid=config["run_ids"], project=config["project"], ), # kraken2 expand( - "{out_dir}/{rid}/{project}/{sids}/kraken/{sids}.tsv", + "{out_dir}/{project}/{sids}/kraken/{sids}.tsv", out_dir=config["out_to"], sids=config["sids"], project=config["project"], - rid=config["run_ids"], ), # kaiju expand( - "{out_dir}/{rid}/{project}/{sids}/kaiju/{sids}.tsv", + "{out_dir}/{project}/{sids}/kaiju/{sids}.tsv", out_dir=config["out_to"], sids=config["sids"], project=config["project"], - rid=config["run_ids"], ), # multiqc expand( - "{out_dir}/{rid}/{project}/multiqc/Run-{rid}-Project-{project}_multiqc_report.html", + "{out_dir}/{project}/multiqc/Run-{rid}-Project-{project}_multiqc_report.html", out_dir=config["out_to"], project=config["project"], rid=config["run_ids"], @@ -116,7 +110,7 @@ dragen_linker_outputs = flatten( ] ) -if not config['demux_data']: +if config['demux_data']: if config['bclconvert']: all_outputs = bclconvert_outputs else: diff --git a/workflow/demux.smk b/workflow/demux.smk index 4ffedcf..41346e7 100644 --- a/workflow/demux.smk +++ b/workflow/demux.smk @@ -32,9 +32,10 @@ rule bcl2fastq: params: out_dir = config["out_to"] + "/demux", container: config["resources"]["sif"] + "bcl2fastq.sif", + log: config["out_to"] + "/logs/bcl2fastq/" + config["run_ids"] + "_" + config["project"] + ".log", threads: 26 resources: - mem_mb = "32g", + mem_mb = "32G", slurm_partition = "quick", runtime = 60*4, tasks = 1, @@ -49,7 +50,6 @@ rule bcl2fastq: --fastq-compression-level 9 \ --no-lane-splitting \ -o {params.out_dir} - find . > .fqlist touch {output.breadcrumb} """ @@ -81,7 +81,7 @@ rule bclconvert: input: run_dir = config['demux_input_dir'], binary_base_calls = expand("{files}", files=config['bcl_files'] if config['bclconvert'] else demux_noop_args), - samplesheets = expand("{run}/SampleSheet.csv", run=config['demux_input_dir'] if config['bclconvert'] else demux_noop_args), + samplesheet = expand("{ss}", ss=config['sample_sheet'] if config['bclconvert'] else demux_noop_args), runinfo = expand("{run}/RunInfo.xml", run=config['demux_input_dir'] if config['bclconvert'] else demux_noop_args), params: out_dir = config["out_to"] + "/demux/", @@ -95,7 +95,6 @@ rule bclconvert: top_unknown = expand("{out_to}/demux/Reports/Top_Unknown_Barcodes.csv", **demux_expand_args if config['bclconvert'] else demux_noop_args), breadcrumb = expand("{out_to}/demux/.BC_DEMUX_COMPLETE", **demux_expand_args if config['bclconvert'] else demux_noop_args), container: config["resources"]["sif"] + "weave_bclconvert_0.0.3.sif", - log: config["out_to"] + "/logs/bclconvert/" + config["run_ids"] + "_" + config["project"] + ".log", threads: 75 resources: mem_mb = int(64e3) shell: @@ -104,6 +103,7 @@ rule bclconvert: --bcl-input-directory {input.run_dir} \ --force \ --output-directory {params.out_dir} \ + --sample-sheet {input.samplesheet} \ --fastq-gzip-compression-level 9 \ --bcl-sampleproject-subdirectories true \ --bcl-num-conversion-threads 24 \ @@ -117,18 +117,18 @@ rule bclconvert: rule fastq_linker_from_dragen: input: - read1 = expand(config["demux_input_dir"] + "/Analysis/1/Data/fastq/{full_sid}_R1_001.fastq.gz", full_sid=config["sids"]), - read2 = expand(config["demux_input_dir"] + "/Analysis/1/Data/fastq/{full_sid}_R2_001.fastq.gz", full_sid=config["sids"]), - adapter_metrics = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Adapter_Metrics.csv", - qual_metrics = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Quality_Metrics.csv", - demux_stats = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Demultiplex_Stats.csv", + read1 = expand(config["demux_input_dir"] + "/Analysis/1/Data/fastq/{full_sid}_R1_001.fastq.gz", full_sid=config["sids"]) if not config['demux_data'] else [], + read2 = expand(config["demux_input_dir"] + "/Analysis/1/Data/fastq/{full_sid}_R2_001.fastq.gz", full_sid=config["sids"]) if not config['demux_data'] else [], + adapter_metrics = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Adapter_Metrics.csv" if not config['demux_data'] else [], + qual_metrics = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Quality_Metrics.csv" if not config['demux_data'] else [], + demux_stats = config["demux_input_dir"] + "/Analysis/1/Data/Reports/Demultiplex_Stats.csv" if not config['demux_data'] else [], output: - out_read1 = expand(config["out_to"] + "/demux/" + config["project"] + "/{full_sid}_R1_dragen.fastq.gz", full_sid=config["sids"]), - out_read2 = expand(config["out_to"] + "/demux/" + config["project"] + "/{full_sid}_R2_dragen.fastq.gz", full_sid=config["sids"]), - breadcrumb = expand(config["out_to"] + "/demux/.breadcrumb/{full_sid}", full_sid=config["sids"]), - adapter_metrics_out = config["out_to"] + "/demux/dragen_reports/Adapter_Metrics.csv", - qual_metrics_out = config["out_to"] + "/demux/dragen_reports/Quality_Metrics.csv", - demux_stats_out = config["out_to"] + "/demux/dragen_reports/Demultiplex_Stats.csv", + out_read1 = expand(config["out_to"] + "/demux/" + config["project"] + "/{full_sid}_R1_dragen.fastq.gz", full_sid=config["sids"]) if not config['demux_data'] else [], + out_read2 = expand(config["out_to"] + "/demux/" + config["project"] + "/{full_sid}_R2_dragen.fastq.gz", full_sid=config["sids"]) if not config['demux_data'] else [], + breadcrumb = expand(config["out_to"] + "/demux/.breadcrumb/{full_sid}", full_sid=config["sids"]) if not config['demux_data'] else [], + adapter_metrics_out = config["out_to"] + "/demux/dragen_reports/Adapter_Metrics.csv" if not config['demux_data'] else [], + qual_metrics_out = config["out_to"] + "/demux/dragen_reports/Quality_Metrics.csv" if not config['demux_data'] else [], + demux_stats_out = config["out_to"] + "/demux/dragen_reports/Demultiplex_Stats.csv" if not config['demux_data'] else [], run: demux_dir = Path(config["out_to"], 'demux').resolve() bc_dir = Path(demux_dir, '.breadcrumb').resolve() diff --git a/workflow/fastq.smk b/workflow/fastq.smk index ec37743..2d43a11 100644 --- a/workflow/fastq.smk +++ b/workflow/fastq.smk @@ -1,4 +1,4 @@ -if config['demux_data']: +if not config['demux_data']: trim_input_affix = 'dragen' else: trim_input_affix = '001' @@ -9,16 +9,16 @@ rule trim_w_fastp: in_read1 = config["out_to"] + "/demux/" + config["project"] + "/{sids}_R1_" + trim_input_affix + ".fastq.gz", in_read2 = config["out_to"] + "/demux/" + config["project"] + "/{sids}_R2_" + trim_input_affix + ".fastq.gz" if len(config['rnums']) == 2 else [], output: - html = config["out_to"] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastp/{sids}.html", - json = config["out_to"] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastp/{sids}_fastp.json", - out_read1 = config["out_to"] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz", - out_read2 = config["out_to"] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if len(config['rnums']) == 2 else [], + html = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}.html", + json = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_fastp.json", + out_read1 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz", + out_read2 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if len(config['rnums']) == 2 else [], containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" threads: 4, params: read_args = lambda _, output, input: f"--in2 {input.in_read2} --out2 {output.out_read2} --detect_adapter_for_pe""" if len(config['rnums']) == 2 else "" resources: mem_mb = 8192, - log: config["out_to"] + "/logs/" + config["run_ids"] + "/" + config["project"] + "/fastp/{sids}.log", + log: config["out_to"] + "/logs/" + config["project"] + "/fastp/{sids}.log", shell: """ fastp \ @@ -30,20 +30,20 @@ rule trim_w_fastp: rule fastq_screen: input: - read = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnum}.fastq.gz", + read = config['out_to'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnum}.fastq.gz", output: - txt = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.txt", - png = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.png", - html = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.html", + txt = config['out_to'] + "/" + config["project"] + "/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.txt", + png = config['out_to'] + "/" + config["project"] + "/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.png", + html = config['out_to'] + "/" + config["project"] + "/{sids}/fastq_screen/{sids}_trimmed_R{rnum}_screen.html", params: config_file = "/etc/fastq_screen.conf", subset = 1000000, aligner = "bowtie2", - output_dir = lambda w: config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/" + w.sids + "/fastq_screen/", + output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastq_screen/", containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" threads: 4, resources: mem_mb = 8192, - log: config['out_to'] + "/logs/" + config['run_ids'] + "/" + config["project"] + "/fastq_screen/{sids}_R{rnum}.log", + log: config['out_to'] + "/logs/" + config["project"] + "/fastq_screen/{sids}_R{rnum}.log", shell: """ fastq_screen --outdir {params.output_dir} \ @@ -58,22 +58,22 @@ rule fastq_screen: rule kaiju_annotation: input: - read1 = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz", - read2 = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if len(config['rnums']) == 2 else [], + read1 = config['out_to'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz", + read2 = config['out_to'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if len(config['rnums']) == 2 else [], output: - kaiju_report = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/kaiju/{sids}.tsv", - kaiju_order = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_order.tsv", - kaiju_family = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_family.tsv", - kaiju_species = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_species.tsv", - kaiju_phylum = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_phylum.tsv", - kaiju_genus = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_genus.tsv", + kaiju_report = config['out_to'] + "/" + config["project"] + "/{sids}/kaiju/{sids}.tsv", + kaiju_order = config['out_to'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_order.tsv", + kaiju_family = config['out_to'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_family.tsv", + kaiju_species = config['out_to'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_species.tsv", + kaiju_phylum = config['out_to'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_phylum.tsv", + kaiju_genus = config['out_to'] + "/" + config["project"] + "/{sids}/kaiju/{sids}_genus.tsv", params: nodes = config["resources"]["mounts"]["kaiju"]["to"] + "/nodes.dmp", names = config["resources"]["mounts"]["kaiju"]["to"] + "/names.dmp", database = config["resources"]["mounts"]["kaiju"]["to"] + "/kaiju_db_nr_euk.fmi", reads_in_arg = lambda wc, input, output: f"-j {input.read1} -i {input.read2}" if input.read2 else f"-j {input.read1}", containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" - log: config['out_to'] + "/logs/" + config['run_ids'] + "/" + config["project"] + "/kaiju/{sids}.log", + log: config['out_to'] + "/logs/" + config["project"] + "/kaiju/{sids}.log", threads: 24 resources: mem_mb = 220000, @@ -96,16 +96,16 @@ rule kaiju_annotation: rule kraken_annotation: input: - read1 = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz", - read2 = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if len(config['rnums']) == 2 else [], + read1 = config['out_to'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz", + read2 = config['out_to'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if len(config['rnums']) == 2 else [], output: - kraken_report = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/kraken/{sids}.tsv", - kraken_log = config['out_to'] + "/" + config['run_ids'] + "/" + config["project"] + "/{sids}/kraken/{sids}.log", + kraken_report = config['out_to'] + "/" + config["project"] + "/{sids}/kraken/{sids}.tsv", + kraken_log = config['out_to'] + "/" + config["project"] + "/{sids}/kraken/{sids}.log", params: kraken_db = config["resources"]["mounts"]["kraken2"]["to"], reads_in_arg = lambda wc, input, output: f"{input.read1} {input.read2}" if input.read2 else f"{input.read1}", containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif", - log: config['out_to'] + "/logs/" + config['run_ids'] + "/" + config["project"] + "/kraken/{sids}.log", + log: config['out_to'] + "/logs/" + config["project"] + "/kraken/{sids}.log", threads: 24 resources: mem_mb = 220000, diff --git a/workflow/qc.smk b/workflow/qc.smk index dbb2c51..e6f0942 100644 --- a/workflow/qc.smk +++ b/workflow/qc.smk @@ -4,7 +4,7 @@ qc_expand_args = { } -if config['demux_data']: +if not config['demux_data']: trim_input_suffix = 'dragen' demux_stats = config["out_to"] + "/demux/dragen_reports/Demultiplex_Stats.csv" else: @@ -19,11 +19,11 @@ rule fastqc_untrimmed: input: samples = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz", output: - html = config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_001_fastqc.html", - fqreport = config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_001_fastqc.zip", + html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_001_fastqc.html", + fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_001_fastqc.zip", params: - output_dir = lambda w: config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/" - log: config['out_to'] + "/logs/" + config["run_ids"] + "/" + config["project"] + "/fastqc_untrimmed/{sids}_R{rnums}.log" + output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/" + log: config['out_to'] + "/logs/" + "/" + config["project"] + "/fastqc_untrimmed/{sids}_R{rnums}.log" threads: 4 containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" resources: mem_mb = 8096 @@ -36,16 +36,16 @@ rule fastqc_untrimmed: rule fastqc_trimmed: input: - in_read = config["out_to"] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz", + in_read = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz", output: - html = config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html", - fqreport = config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip", + html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html", + fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip", params: - output_dir = lambda w: config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/" + output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/" containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" threads: 4 resources: mem_mb = 8096 - log: config['out_to'] + "/logs/" + config["run_ids"] + "/" + config["project"] + "/fastqc_trimmed/{sids}_R{rnums}.log" + log: config['out_to'] + "/logs/" + config["project"] + "/fastqc_trimmed/{sids}_R{rnums}.log" shell: """ mkdir -p {params.output_dir} @@ -58,32 +58,31 @@ rule multiqc_report: # demux status demux_stats, # fastqc on untrimmed reads - expand(config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_001_fastqc.zip", **qc_expand_args), + expand(config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_001_fastqc.zip", **qc_expand_args), # fastqc on trimmed reads - expand(config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip", **qc_expand_args), + expand(config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip", **qc_expand_args), # fastp trimming metrics - expand(config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz", **qc_expand_args), + expand(config['out_to'] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz", **qc_expand_args), # fastq screen - expand(config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/fastq_screen/{sids}_trimmed_R{rnums}_screen.html", **qc_expand_args), + expand(config['out_to'] + "/" + config["project"] + "/{sids}/fastq_screen/{sids}_trimmed_R{rnums}_screen.html", **qc_expand_args), # kraken2 - expand(config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/kraken/{sids}.tsv", **qc_expand_args), + expand(config['out_to'] + "/" + config["project"] + "/{sids}/kraken/{sids}.tsv", **qc_expand_args), # kaiju - expand(config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/{sids}/kaiju/{sids}.tsv", **qc_expand_args), + expand(config['out_to'] + "/" + config["project"] + "/{sids}/kaiju/{sids}.tsv", **qc_expand_args), output: - mqc_report = expand(config['out_to'] + "/{rid}/{project}" + \ + mqc_report = expand(config['out_to'] + "/{project}" + \ "/multiqc/Run-{rid}-Project-{project}_multiqc_report.html", project=config['project'], rid=config['run_ids']), params: input_dir = config['out_to'], - demux_dir = config['out_to'] + "/demux/" + config["run_ids"], - output_dir = config['out_to'] + "/" + config["run_ids"] + "/" + config["project"] + "/multiqc/", + output_dir = config['out_to'] + "/" + config["project"] + "/multiqc/", report_title = "Run: " + config["run_ids"] + ", Project: " + config["project"], containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" threads: 4 resources: mem_mb = 8096 log: expand( - config['out_to'] + "/logs/{project}/{rid}/multiqc/multiqc.log", + config['out_to'] + "/logs/multiqc/multiqc_{rid}_{project}.log", project=config['project'], rid=config['run_ids'] ) shell: @@ -91,6 +90,6 @@ rule multiqc_report: multiqc -q -ip \ --title \"{params.report_title}\" \ -o {params.output_dir} \ - {params.input_dir} {params.demux_dir} \ + {params.input_dir} \ --ignore ".cache" --ignore ".config" --ignore ".snakemake" --ignore ".slurm" --ignore ".singularity" --ignore ".logs" """