From 603f6eb8c3ea1ad747645d2a47510b1e501ff917 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 3 Jan 2024 17:38:23 -0600 Subject: [PATCH 1/9] feat: add disambiguate to docker image --- docker/NGS_QC/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/NGS_QC/Dockerfile b/docker/NGS_QC/Dockerfile index 3ad8ca4..38a2fed 100644 --- a/docker/NGS_QC/Dockerfile +++ b/docker/NGS_QC/Dockerfile @@ -3,10 +3,13 @@ ENV TZ=America/New_York ENV DEBIAN_FRONTEND=noninteractive RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone RUN apt-get update; apt-get upgrade -y bash && \ - apt-get install -y fastqc multiqc fastp kraken2 bowtie2 libgd-graph-perl git curl libz-dev build-essential + apt-get install -y fastqc python-is-python3 python3-pip samtools multiqc fastp kraken2 bowtie2 libgd-graph-perl git curl libz-dev build-essential RUN mkdir /opt2; curl -S -s -L https://github.com/StevenWingett/FastQ-Screen/archive/refs/tags/v0.15.3.tar.gz --output /opt2/v0.15.3.tar.gz RUN cd /opt2 && tar xvf v0.15.3.tar.gz RUN cd /opt2 && git clone https://github.com/bioinformatics-centre/kaiju.git RUN cd /opt2/kaiju/src; make +RUN cd /opt2 && git clone https://github.com/AstraZeneca-NGS/disambiguate.git +RUN pip install pysam +RUN chmod +x /opt2/disambiguate/disambiguate.py; ln -sf /opt2/disambiguate/disambiguate.py /usr/bin/disambiguate.py ENV PATH="${PATH}:/opt2/FastQ-Screen-0.15.3:/opt2/kaiju/bin" ADD docker/NGS_QC/fastq_screen.conf /etc/fastq_screen.conf From 679ad599154b1b695a64cb4154f1af88e970e4a6 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 5 Jan 2024 14:55:21 -0600 Subject: [PATCH 2/9] feat: add short read aligners to docker --- docker/NGS_QC/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/NGS_QC/Dockerfile b/docker/NGS_QC/Dockerfile index 38a2fed..2a34a09 100644 --- a/docker/NGS_QC/Dockerfile +++ b/docker/NGS_QC/Dockerfile @@ -3,7 +3,7 @@ ENV TZ=America/New_York ENV DEBIAN_FRONTEND=noninteractive RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone RUN apt-get update; apt-get upgrade -y bash && \ - apt-get install -y fastqc python-is-python3 python3-pip samtools multiqc fastp kraken2 bowtie2 libgd-graph-perl git curl libz-dev build-essential + apt-get install -y fastqc python-is-python3 rna-star bwa hisat2 python3-pip samtools multiqc fastp kraken2 bowtie2 libgd-graph-perl git curl libz-dev build-essential RUN mkdir /opt2; curl -S -s -L https://github.com/StevenWingett/FastQ-Screen/archive/refs/tags/v0.15.3.tar.gz --output /opt2/v0.15.3.tar.gz RUN cd /opt2 && tar xvf v0.15.3.tar.gz RUN cd /opt2 && git clone https://github.com/bioinformatics-centre/kaiju.git From e857d0f5d706944dc311e985e3f4323685234963 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 10 Jan 2024 15:41:17 -0700 Subject: [PATCH 3/9] feat: add disambiguate functionality to pipeline --- .github/workflows/doc.yml | 2 +- config/bigsky.json | 6 ++-- config/biowulf.json | 4 +-- scripts/config.py | 38 +++++++++++++++++++++++ scripts/utils.py | 47 +++++++++++++++++++++++++++-- weave | 24 +++++++++++---- workflow/Snakefile | 9 ++++++ workflow/qc.smk | 63 ++++++++++++++++++++++++++++++++++----- 8 files changed, 171 insertions(+), 22 deletions(-) diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml index f7df7ca..b50f4f2 100644 --- a/.github/workflows/doc.yml +++ b/.github/workflows/doc.yml @@ -28,4 +28,4 @@ jobs: restore-keys: | mkdocs-material- - run: pip install -r docs/requirements.txt - - run: mkdocs gh-deploy --force + - run: mkdocs gh-deploy --forcescreen -r diff --git a/config/bigsky.json b/config/bigsky.json index 44eb73e..50aabf2 100644 --- a/config/bigsky.json +++ b/config/bigsky.json @@ -3,17 +3,17 @@ "mounts": { "kaiju": { "to": "/opt/kaiju", - "from": "/gs1/RTS/OpenOmics/references/Dmux/kaiju/kaiju_db_nr_euk_2023-05-10", + "from": "/gs1/RTS/OpenOmics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10", "mode": "ro" }, "kraken2" : { "to": "/opt/kraken2", - "from": "/gs1/RTS/OpenOmics/references/Dmux/kraken2/k2_pluspfp_20230605", + "from": "/gs1/RTS/OpenOmics/references/weave/kraken2/k2_pluspfp_20230605", "mode": "ro" }, "fastq_screen" : { "to": "/fdb/fastq_screen/FastQ_Screen_Genomes", - "from": "/gs1/RTS/OpenOmics/references/Dmux/FastQ_Screen_Genomes", + "from": "/gs1/RTS/OpenOmics/references/weave/FastQ_Screen_Genomes", "mode": "ro" } } diff --git a/config/biowulf.json b/config/biowulf.json index d1f9f90..838a7e8 100644 --- a/config/biowulf.json +++ b/config/biowulf.json @@ -3,12 +3,12 @@ "mounts": { "kaiju": { "to": "/opt/kaiju", - "from": "/data/OpenOmics/references/Dmux/kaiju/kaiju_db_nr_euk_2023-05-10", + "from": "/data/OpenOmics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10", "mode": "ro" }, "kraken2" : { "to": "/opt/kraken2", - "from": "/data/OpenOmics/references/Dmux/kraken2/k2_pluspfp_20230605", + "from": "/data/OpenOmics/references/weave/kraken2/k2_pluspfp_20230605", "mode": "ro" }, "fastq_screen" : { diff --git a/scripts/config.py b/scripts/config.py index 81c84a4..e9f50e2 100644 --- a/scripts/config.py +++ b/scripts/config.py @@ -158,4 +158,42 @@ def get_bigsky_seq_dirs(): "seq": get_biowulf_seq_dirs(), "profile": Path(Path(__file__).parent.parent, "utils", "profiles", "biowulf").resolve(), } +} + + +GENOME_CONFIGS = { + "biowulf": { + "hg19": "/vf/users/OpenOmics/references/genomes/human/hg19/GRCh37.primary_assembly.genome.fa.gz", + "grch37": "/vf/users/OpenOmics/references/genomes/human/GRCh37/GRCh37.primary_assembly.genome.fa.gz", + "hg38": "/vf/users/OpenOmics/references/genomes/human/hg38/GRCh38.p14.genome.fa.gz", + "grch38": "/vf/users/OpenOmics/references/genomes/human/GRCh38/GRCh38.p14.genome.fa.gz", + "mm9": "/vf/users/OpenOmics/references/genomes/mouse/mm9/mm9.fa.gz", + "grcm37": "/vf/users/OpenOmics/references/genomes/mouse/GRCm37/mm9.fa.gz", + "mm10": "/vf/users/OpenOmics/references/genomes/mouse/mm10/GRCm38.p4.genome.fa.gz", + "grcm38": "/vf/users/OpenOmics/references/genomes/mouse/GRCm38/GRCm38.p4.genome.fa.gz", + "mm39": "/vf/users/OpenOmics/references/genomes/mouse/mm39/GRCm39.genome.fa.gz", + "grcm39": "/vf/users/OpenOmics/references/genomes/mouse/GRCm39/GRCm3s9.genome.fa.gz", + "rhemac10": "/vf/users/OpenOmics/references/genomes/rhesus/rhe10/rheMac10.fa.gz", + "mmul10": "/vf/users/OpenOmics/references/genomes/rhesus/mmul10/rheMac10.fa.gz", + "agm": "/vf/users/OpenOmics/references/genomes/agm/1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz", + "mesaur": "/vf/users/OpenOmics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz", + "cynomac": "/vf/users/OpenOmics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz", + }, + "bigsky": { + "hg19": "/gs1/RTS/OpenOmics/references/genomes/human/hg19/GRCh37.primary_assembly.genome.fa.gz", + "grch37": "/gs1/RTS/OpenOmics/references/genomes/human/GRCh37/GRCh37.primary_assembly.genome.fa.gz", + "hg38": "/gs1/RTS/OpenOmics/references/genomes/human/hg38/GRCh38.p14.genome.fa.gz", + "grch38": "/gs1/RTS/OpenOmics/references/genomes/human/GRCh38/GRCh38.p14.genome.fa.gz", + "mm9": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm9/mm9.fa.gz", + "grcm37": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm37/mm9.fa.gz", + "mm10": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm10/GRCm38.p4.genome.fa.gz", + "grcm38": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm38/GRCm38.p4.genome.fa.gz", + "mm39": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm39/GRCm39.genome.fa.gz", + "grcm39": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm39/GRCm39.genome.fa.gz", + "rhemac10": "/gs1/RTS/OpenOmics/references/genomes/rhesus/rhe10/rheMac10.fa.gz", + "mmul10": "/gs1/RTS/OpenOmics/references/genomes/rhesus/mmul10/rheMac10.fa.gz", + "agm": "/gs1/RTS/OpenOmics/references/genomes/agm/1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz", + "mesaur": "/gs1/RTS/OpenOmics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz", + "cynomac": "/gs1/RTS/OpenOmics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz", + }, } \ No newline at end of file diff --git a/scripts/utils.py b/scripts/utils.py index 131de42..12a6029 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -15,7 +15,7 @@ # ~~~ internals ~~~ from .files import parse_samplesheet, mk_or_pass_dirs -from .config import SNAKEFILE, DIRECTORY_CONFIGS, get_current_server, get_resource_config +from .config import SNAKEFILE, DIRECTORY_CONFIGS, GENOME_CONFIGS, get_current_server, get_resource_config host = get_current_server() @@ -40,6 +40,19 @@ def default(self, obj): return super(self).default(obj) +def get_alias_table(): + pp_tbl = lambda x: "\n".join([y.lstrip().rstrip() for y in x.split('\n')]) + return pp_tbl("""+----------------+-------------------------------------------+ + | Organism | Genomes supported (aka) | + +----------------+-------------------------------------------+ + | HUMAN | hg19(grch37) / hg38(grch38) | + +----------------+-------------------------------------------+ + | MOUSE | mm9(grcm37) / mm10(grcm38) / mm39(grcm39) | + +----------------+-------------------------------------------+ + | RHESUS MACAQUE | RHEMAC10(mmul10) | + +----------------+-------------------------------------------+""") + + def valid_runid(id_to_check): ''' Given an input ID get it's validity against the run id format: @@ -233,6 +246,9 @@ def exec_pipeline(configs, dry_run=False, local=False): if not bclcon_log_dir.exists(): bclcon_log_dir.mkdir(mode=0o755, parents=True) extra_to_mount.append(str(bclcon_log_dir) + ":" + "/var/log/bcl-convert:rw") + if this_config['disambiguate']: + extra_to_mount.append(Path(this_config['host_genome']).parent) + extra_to_mount.append(Path(this_config['pathogen_genome']).parent) singularity_binds = get_mounts(*extra_to_mount) config_file = Path(this_config['out_to'], '.config', f'config_job_{str(i)}.json').absolute() json.dump(this_config, open(config_file, 'w'), cls=PathJSONEncoder, indent=4) @@ -275,4 +291,31 @@ def is_bclconvert(samplesheet): check = False if samplesheet.instrument in BCLCONVERT_INSTRUMENTS or samplesheet.platform in BCLCONVERT_PLATFORMS: check = True - return check \ No newline at end of file + return check + + +def valid_host_pathogen_genomes(host, pathogen): + g1, g2 = False, False + genomes = GENOME_CONFIGS[get_current_server()] + + if Path(host).absolute().exists(): + g1 = True + host = str(Path(host).absolute()) + elif host.lower() in genomes: + g1 = True + host = genomes[host.lower()] + + if Path(g2).absolute().exists(): + g2 = True + pathogen = str(Path(pathogen).absolute()) + elif pathogen.lower() in genomes: + g2 = True + pathogen = genomes[pathogen.lower()] + + if not all([g1, g2]): + if not g1: + raise ValueError('Host genome does not exist on the file system or in the aliases.') + if not g2: + raise ValueError('Pathogen genome does not exist on the file system or in the aliases.') + + return host, pathogen \ No newline at end of file diff --git a/weave b/weave index 47fd4f3..02d4568 100755 --- a/weave +++ b/weave @@ -33,6 +33,17 @@ def run(args): exec_config['bcl_files'].append(bcls) exec_config['demux_data'].append(files.check_if_demuxed(rundir)) + # ~~~ disambiguate genome configuration ~~~ + exec_config['host_genome'] = None + exec_config['pathogen_genome'] = None + if all([args.host, args.pathogen]): + utils.valid_host_pathogen_genomes(args.host, args.pathogen) + exec_config['disambiguate'] = True + exec_config['host_genome'] = config.get_genome_path(args.host) + exec_config['pathogen_genome'] = config.get_genome_path(args.pathogen) + else: + assert(not any([args.host, args.pathogen])), 'Must specify both host and pathogen genometype!' + # ~~~ QC/QA configuration ~~~ exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet)) exec_config['run_ids'].append(rundir.name) @@ -86,12 +97,13 @@ if __name__ == '__main__': help='Name of the sample sheet file to look for (default is SampleSheet.csv)') parser_run.add_argument('-l', '--local', action='store_true', help='Execute pipeline locally without a dispatching executor') - # force endedness flags - endedness = parser_run.add_mutually_exclusive_group() - endedness.add_argument('--single_end', default=None, action='store_true', - help='Declare endedness of run (in cases in which it is not detectable from sample sheet), mutally exclusive to paired_end') - endedness.add_argument('--paired_end', default=None, action='store_true', - help='Declare endedness of run (in cases in which it is not detectable from sample sheet), mutally exclusive to single_end') + + # disambiguate arguments + parser_run.add_argument('-h', '--host', type=files.valid_fasta, default=None, + help='Execute pipeline locally without a dispatching executor.') + alias_table = "\n" + utils.get_alias_table() + "\n" + parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None, + help='Execute pipeline locally without a dispatching executor. ' + alias_table) parser_cache = sub_parsers.add_parser('cache') parser_cache.add_argument('cachedir', metavar='', type=cache.valid_dir, diff --git a/workflow/Snakefile b/workflow/Snakefile index 2a8d4cc..661c99a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -128,6 +128,15 @@ else: if config["runqc"]: all_outputs.extend(qa_qc_outputs) +if config['disambiguate']: + all_outputs.extend(flatten([ + expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam", **demux_expand_args), + expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam", **demux_expand_args), + expand("{out_to}/{project}/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam", **demux_expand_args), + expand("{out_to}/{project}/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam", **demux_expand_args), + expand("{out_to}/{project}/{sids}/disambiguate/{sids}_summary.txt", **demux_expand_args), + ])) + rule all: input: diff --git a/workflow/qc.smk b/workflow/qc.smk index 1775c22..a7b6394 100644 --- a/workflow/qc.smk +++ b/workflow/qc.smk @@ -17,12 +17,12 @@ else: rule fastqc_untrimmed: input: - samples = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz", + samples = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz", output: - html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.html", - fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.zip", + html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.html", + fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.zip", params: - output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/" + output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/" log: config['out_to'] + "/logs/" + "/" + config["project"] + "/fastqc_untrimmed/{sids}_R{rnums}.log" threads: 4 containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" @@ -36,12 +36,12 @@ rule fastqc_untrimmed: rule fastqc_trimmed: input: - in_read = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz", + in_read = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz", output: - html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html", - fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip", + html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html", + fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip", params: - output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/" + output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/" containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" threads: 4 resources: mem_mb = 8096 @@ -53,6 +53,53 @@ rule fastqc_trimmed: """ +rule bwa: + input: + in_read1 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz" if config['disambiguate'] else [], + in_read2 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if config['disambiguate'] and len(config['rnums']) == 2 else [], + output: + aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam" + aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam" + params: + host_genome = config['host_genome'] + path_genome = config['pathogen_genome'] + threads: 32 + resources: mem_mb = 64768 + containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.2.sif" + log: config['out_to'] + "/logs/" + config["project"] + "/bwa_mem/{sids}.log" + shell: + """ + bwa mem -t {threads} {params.host_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoA} - + bwa mem -t {threads} {params.path_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoB} - + """ + + +rule disambiguate: + input: + aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam" + aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam" + output: + ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam" + ambiguousB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam" + disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam" + disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam" + ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}_summary.txt" + params: + host_genome = config['host_genome'] + path_genome = config['pathogen_genome'] + this_sid = lambda wc: wc.sids + out_dir = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/" + containerized: config["resources"]["sif"] + "ngs_disambiguate_2018.05.03.sif" + log: config['out_to'] + "/logs/" + config["project"] + "/disambiguate/{sids}.log" + threads: 32 + resources: mem_mb = 64768 + shell: + """ + ngs_disambiguate -s {params.this_sid} -o {params.out_dir} -a bwa {input.aligntoA} {input.aligntoB} + """ + + + rule multiqc_report: input: # demux status From 539ed71199a805e4b99bd76eb150d47ccf810594 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Thu, 11 Jan 2024 09:22:14 -0700 Subject: [PATCH 4/9] fix: applying testing fixes for disambiguate feat --- scripts/files.py | 30 ++++++++++++++++++++++++++++-- scripts/utils.py | 14 ++++---------- weave | 17 +++++++++++------ workflow/Snakefile | 2 +- workflow/qc.smk | 34 +++++++++++++++++----------------- 5 files changed, 61 insertions(+), 36 deletions(-) diff --git a/scripts/files.py b/scripts/files.py index 9f4c5ad..adb415c 100644 --- a/scripts/files.py +++ b/scripts/files.py @@ -3,12 +3,12 @@ # ~~~~~~~~~~~~~~~ # file system helper functions for the Dmux software package # ~~~~~~~~~~~~~~~ -from pathlib import Path import xml.etree.ElementTree as ET +from pathlib import Path from os import access as check_access, R_OK, W_OK from functools import partial from .samplesheet import IllumniaSampleSheet -from .config import get_current_server, LABKEY_CONFIGS, DIRECTORY_CONFIGS +from .config import get_current_server, GENOME_CONFIGS, DIRECTORY_CONFIGS def get_all_seq_dirs(top_dir, server): @@ -48,6 +48,32 @@ def valid_run_output(output_directory, dry_run=False): return output_directory +def valid_fasta(suspect): + server_genomes = GENOME_CONFIGS[get_current_server()] + is_valid = False + if suspect.lower() in server_genomes: + is_valid = True + suspect = server_genomes[suspect.lower()] + else: + the_suspect = Path(suspect) + exts = the_suspect.suffixes + if any([ + '.fna' in exts, + '.fa' in exts, + '.fasta' in exts, + '.fna' in exts and '.gz' in exts, + '.fa' in exts and '.gz' in exts, + '.fasta' in exts and '.gz' in exts, + ]): + is_valid = True + suspect = str(Path(suspect).absolute()) + + if not is_valid: + raise ValueError + + return suspect + + def get_all_staged_dirs(top_dir, server): return list(filter(partial(is_dir_staged, server), get_all_seq_dirs(top_dir, server))) diff --git a/scripts/utils.py b/scripts/utils.py index 12a6029..9ead137 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -246,7 +246,7 @@ def exec_pipeline(configs, dry_run=False, local=False): if not bclcon_log_dir.exists(): bclcon_log_dir.mkdir(mode=0o755, parents=True) extra_to_mount.append(str(bclcon_log_dir) + ":" + "/var/log/bcl-convert:rw") - if this_config['disambiguate']: + if this_config.get('disambiguate', False): extra_to_mount.append(Path(this_config['host_genome']).parent) extra_to_mount.append(Path(this_config['pathogen_genome']).parent) singularity_binds = get_mounts(*extra_to_mount) @@ -301,21 +301,15 @@ def valid_host_pathogen_genomes(host, pathogen): if Path(host).absolute().exists(): g1 = True host = str(Path(host).absolute()) - elif host.lower() in genomes: - g1 = True - host = genomes[host.lower()] - if Path(g2).absolute().exists(): + if Path(pathogen).absolute().exists(): g2 = True pathogen = str(Path(pathogen).absolute()) - elif pathogen.lower() in genomes: - g2 = True - pathogen = genomes[pathogen.lower()] if not all([g1, g2]): if not g1: - raise ValueError('Host genome does not exist on the file system or in the aliases.') + raise ValueError('Host genome does not exist on the file system.') if not g2: - raise ValueError('Pathogen genome does not exist on the file system or in the aliases.') + raise ValueError('Pathogen genome does not exist on the file system.') return host, pathogen \ No newline at end of file diff --git a/weave b/weave index 02d4568..844332d 100755 --- a/weave +++ b/weave @@ -34,13 +34,18 @@ def run(args): exec_config['demux_data'].append(files.check_if_demuxed(rundir)) # ~~~ disambiguate genome configuration ~~~ - exec_config['host_genome'] = None - exec_config['pathogen_genome'] = None if all([args.host, args.pathogen]): + if 'disambiguate' not in exec_config: + exec_config['disambiguate'] = [] + if 'host_genome' not in exec_config: + exec_config['host_genome'] = [] + if 'pathogen_genome' not in exec_config: + exec_config['pathogen_genome'] = [] + utils.valid_host_pathogen_genomes(args.host, args.pathogen) - exec_config['disambiguate'] = True - exec_config['host_genome'] = config.get_genome_path(args.host) - exec_config['pathogen_genome'] = config.get_genome_path(args.pathogen) + exec_config['disambiguate'].append(True) + exec_config['host_genome'].append(args.host) + exec_config['pathogen_genome'].append(args.pathogen) else: assert(not any([args.host, args.pathogen])), 'Must specify both host and pathogen genometype!' @@ -99,7 +104,7 @@ if __name__ == '__main__': help='Execute pipeline locally without a dispatching executor') # disambiguate arguments - parser_run.add_argument('-h', '--host', type=files.valid_fasta, default=None, + parser_run.add_argument('-t', '--host', type=files.valid_fasta, default=None, help='Execute pipeline locally without a dispatching executor.') alias_table = "\n" + utils.get_alias_table() + "\n" parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None, diff --git a/workflow/Snakefile b/workflow/Snakefile index 661c99a..15f559c 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -128,7 +128,7 @@ else: if config["runqc"]: all_outputs.extend(qa_qc_outputs) -if config['disambiguate']: +if config.get('disambiguate', False): all_outputs.extend(flatten([ expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam", **demux_expand_args), expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam", **demux_expand_args), diff --git a/workflow/qc.smk b/workflow/qc.smk index a7b6394..824f16e 100644 --- a/workflow/qc.smk +++ b/workflow/qc.smk @@ -55,14 +55,14 @@ rule fastqc_trimmed: rule bwa: input: - in_read1 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz" if config['disambiguate'] else [], - in_read2 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if config['disambiguate'] and len(config['rnums']) == 2 else [], + in_read1 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz" if config.get('disambiguate', False) else [], + in_read2 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if config.get('disambiguate', False) and len(config['rnums']) == 2 else [], output: - aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam" - aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam" + aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam", + aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam", params: - host_genome = config['host_genome'] - path_genome = config['pathogen_genome'] + host_genome = config.get('host_genome', ''), + path_genome = config.get('pathogen_genome', ''), threads: 32 resources: mem_mb = 64768 containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.2.sif" @@ -76,19 +76,19 @@ rule bwa: rule disambiguate: input: - aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam" - aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam" + aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam", + aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam", output: - ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam" - ambiguousB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam" - disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam" - disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam" - ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}_summary.txt" + ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam", + ambiguousB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam", + disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam", + disambiguousB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam", + dis_summary = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}_summary.txt", params: - host_genome = config['host_genome'] - path_genome = config['pathogen_genome'] - this_sid = lambda wc: wc.sids - out_dir = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/" + host_genome = config.get('host_genome', ''), + path_genome = config.get('pathogen_genome', ''), + this_sid = lambda wc: wc.sids, + out_dir = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/", containerized: config["resources"]["sif"] + "ngs_disambiguate_2018.05.03.sif" log: config['out_to'] + "/logs/" + config["project"] + "/disambiguate/{sids}.log" threads: 32 From 340fc9a51bdc1b5eb1ab12b5093a85f4306bbda3 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 12 Jan 2024 08:55:28 -0700 Subject: [PATCH 5/9] fix: remove type --- .github/workflows/doc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml index b50f4f2..f7df7ca 100644 --- a/.github/workflows/doc.yml +++ b/.github/workflows/doc.yml @@ -28,4 +28,4 @@ jobs: restore-keys: | mkdocs-material- - run: pip install -r docs/requirements.txt - - run: mkdocs gh-deploy --forcescreen -r + - run: mkdocs gh-deploy --force From d4a439db0e6765de6ba2a167d07d0761d459e578 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 12 Jan 2024 09:00:07 -0700 Subject: [PATCH 6/9] fix: natural name sort bams --- workflow/qc.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/qc.smk b/workflow/qc.smk index 824f16e..68a8fb9 100644 --- a/workflow/qc.smk +++ b/workflow/qc.smk @@ -69,8 +69,8 @@ rule bwa: log: config['out_to'] + "/logs/" + config["project"] + "/bwa_mem/{sids}.log" shell: """ - bwa mem -t {threads} {params.host_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoA} - - bwa mem -t {threads} {params.path_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoB} - + bwa mem -t {threads} {params.host_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -n -o {output.aligntoA} - + bwa mem -t {threads} {params.path_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -n -o {output.aligntoB} - """ From 47814bc86572074e7bd3fba78fd96f448acd5f94 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 12 Jan 2024 09:34:01 -0700 Subject: [PATCH 7/9] fix: help genome alias table and remote cache assets --- config/remote.json | 2 ++ scripts/utils.py | 5 +++-- weave | 8 ++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/config/remote.json b/config/remote.json index 6371456..097d764 100644 --- a/config/remote.json +++ b/config/remote.json @@ -1,6 +1,8 @@ { "bcl2fastq": "docker://umccr/bcl2fastq:latest", "weave": "docker://rroutsong/weave_ngsqc:0.0.1", + "bclconvert": "docker://rroutsong/weave_bclconvert:0.0.3", + "disambiguate": "docker://quay.io/biocontainers/ngs-disambiguate:2018.05.03--hf393df8_7", "kraken": "https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_16gb_20231009.tar.gz", "kaiju": "https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_euk_2023-05-10.tgz", "fastq_screen": "filelist://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/genome_locations.txt" diff --git a/scripts/utils.py b/scripts/utils.py index 9ead137..29fda6a 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -8,6 +8,7 @@ import os import yaml import sys +import textwrap from argparse import ArgumentTypeError from dateutil.parser import parse as date_parser from subprocess import Popen, PIPE, STDOUT @@ -41,8 +42,8 @@ def default(self, obj): def get_alias_table(): - pp_tbl = lambda x: "\n".join([y.lstrip().rstrip() for y in x.split('\n')]) - return pp_tbl("""+----------------+-------------------------------------------+ + return textwrap.dedent("""Genome short name alias table: + +----------------+-------------------------------------------+ | Organism | Genomes supported (aka) | +----------------+-------------------------------------------+ | HUMAN | hg19(grch37) / hg38(grch38) | diff --git a/weave b/weave index 844332d..681fbf5 100755 --- a/weave +++ b/weave @@ -83,10 +83,10 @@ def unlock_dir(sub_args): # ~~~~ argument parsing commands ~~~~ if __name__ == '__main__': - main_parser = argparse.ArgumentParser(prog='weave') + main_parser = argparse.ArgumentParser(prog='weave', formatter_class=argparse.RawTextHelpFormatter, epilog=utils.get_alias_table()) sub_parsers = main_parser.add_subparsers(help='run subcommand help') - parser_run = sub_parsers.add_parser('run') + parser_run = sub_parsers.add_parser('run', formatter_class=argparse.RawTextHelpFormatter, epilog=utils.get_alias_table()) parser_run.add_argument('rundir', metavar='', nargs="+", type=utils.valid_run_input, help='Full & complete run id (format YYMMDD_INSTRUMENTID_TIME_FLOWCELLID) or absolute paths') parser_run.add_argument('-s', '--seq_dir', metavar='', default=None, type=str, @@ -106,9 +106,9 @@ if __name__ == '__main__': # disambiguate arguments parser_run.add_argument('-t', '--host', type=files.valid_fasta, default=None, help='Execute pipeline locally without a dispatching executor.') - alias_table = "\n" + utils.get_alias_table() + "\n" + parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None, - help='Execute pipeline locally without a dispatching executor. ' + alias_table) + help='Execute pipeline locally without a dispatching executor.') parser_cache = sub_parsers.add_parser('cache') parser_cache.add_argument('cachedir', metavar='', type=cache.valid_dir, From 820e0afaa26b907508de339596460b5183718651 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Fri, 12 Jan 2024 09:41:38 -0700 Subject: [PATCH 8/9] fix: add actual help messages to argparse arguments --- weave | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/weave b/weave index 681fbf5..95af79b 100755 --- a/weave +++ b/weave @@ -88,37 +88,37 @@ if __name__ == '__main__': parser_run = sub_parsers.add_parser('run', formatter_class=argparse.RawTextHelpFormatter, epilog=utils.get_alias_table()) parser_run.add_argument('rundir', metavar='', nargs="+", type=utils.valid_run_input, - help='Full & complete run id (format YYMMDD_INSTRUMENTID_TIME_FLOWCELLID) or absolute paths') + help='Full & complete run id (format YYMMDD_INSTRUMENTID_TIME_FLOWCELLID) or absolute paths.') parser_run.add_argument('-s', '--seq_dir', metavar='', default=None, type=str, help='Root directory for sequencing data (defaults for biowulf/bigsky/locus), must contain directories ' + \ 'matching run ids, if not using full paths.') parser_run.add_argument('-o', '--output', metavar='', default=None, type=str, - help='Top-level output directory for demultiplexing data (defaults to input directory + runid + "_demux")') + help='Top-level output directory for demultiplexing data (defaults to input directory + runid + "_demux").') parser_run.add_argument('-d', '--dry-run', action='store_true', - help='Dry run the demultiplexing workflow') + help='Dry run the demultiplexing workflow.') parser_run.add_argument('-n', '--noqc', action='store_false', - help='Do not run the QC/QA portion of the workflow (Default is on)') + help='Do not run the QC/QA portion of the workflow (Default is on).') parser_run.add_argument('--sheetname', metavar='Sample Sheet Filename', - help='Name of the sample sheet file to look for (default is SampleSheet.csv)') + help='Name of the sample sheet file to look for (default is SampleSheet.csv).') parser_run.add_argument('-l', '--local', action='store_true', - help='Execute pipeline locally without a dispatching executor') + help='Execute pipeline locally without a dispatching executor.') # disambiguate arguments parser_run.add_argument('-t', '--host', type=files.valid_fasta, default=None, - help='Execute pipeline locally without a dispatching executor.') + help='Full path to host genome for disambiguate to use or short name genome alias.') parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None, - help='Execute pipeline locally without a dispatching executor.') + help='Full path to pathogen/graft/parasite genome for disambiguate to use or short name genome alias.') parser_cache = sub_parsers.add_parser('cache') parser_cache.add_argument('cachedir', metavar='', type=cache.valid_dir, - help='Relative or absolute path to directory for cache storage') + help='Relative or absolute path to directory for cache storage.') parser_cache.add_argument('-l', '--local', action='store_true', help='Execute pipeline locally without a dispatching executor') parser_unlock = sub_parsers.add_parser('unlock') parser_unlock.add_argument('unlockdir', metavar='', type=cache.valid_dir, - help='Relative or absolute path to directory for cache storage') + help='Full path to directory to unlock.') parser_run.set_defaults(func = run) parser_cache.set_defaults(func = get_cache) From 76737ebb15c6d0ee1a7175c18fb66c657e76bdeb Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Tue, 16 Jan 2024 09:24:25 -0700 Subject: [PATCH 9/9] fix: correct readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0ad37e6..70e00bc 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@

weave 🔬

- **_An awesome metagenomic and metatranscriptomics pipeline_** + **_An awesome demultiplexing and quality control pipeline_** - [![tests](https://github.com/OpenOmics/weave/workflows/tests/badge.svg)](https://github.com/OpenOmics/weave/actions/workflows/main.yaml) [![docs](https://github.com/OpenOmics/weave/workflows/docs/badge.svg)](https://github.com/OpenOmics/weave/actions/workflows/docs.yml) [![GitHub issues](https://img.shields.io/github/issues/OpenOmics/weave?color=brightgreen)](https://github.com/OpenOmics/weave/issues) [![GitHub license](https://img.shields.io/github/license/OpenOmics/weave)](https://github.com/OpenOmics/weave/blob/main/LICENSE) + [![tests](https://github.com/OpenOmics/weave/workflows/tests/badge.svg)](https://github.com/OpenOmics/weave/actions/workflows/main.yaml) [![docs](https://github.com/OpenOmics/weave/actions/workflows/doc.yml/badge.svg)](https://github.com/OpenOmics/weave/actions/workflows/docs.yml) [![GitHub issues](https://img.shields.io/github/issues/OpenOmics/weave?color=brightgreen)](https://github.com/OpenOmics/weave/issues) [![GitHub license](https://img.shields.io/github/license/OpenOmics/weave)](https://github.com/OpenOmics/weave/blob/main/LICENSE) This is the home of the pipeline, weave. Its long-term goals: to provide accurate quantification, taxonomic classification, and functional profiling of assembled (bacteria and archaea) metagenomes!