From e857d0f5d706944dc311e985e3f4323685234963 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 10 Jan 2024 15:41:17 -0700 Subject: [PATCH] feat: add disambiguate functionality to pipeline --- .github/workflows/doc.yml | 2 +- config/bigsky.json | 6 ++-- config/biowulf.json | 4 +-- scripts/config.py | 38 +++++++++++++++++++++++ scripts/utils.py | 47 +++++++++++++++++++++++++++-- weave | 24 +++++++++++---- workflow/Snakefile | 9 ++++++ workflow/qc.smk | 63 ++++++++++++++++++++++++++++++++++----- 8 files changed, 171 insertions(+), 22 deletions(-) diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml index f7df7ca..b50f4f2 100644 --- a/.github/workflows/doc.yml +++ b/.github/workflows/doc.yml @@ -28,4 +28,4 @@ jobs: restore-keys: | mkdocs-material- - run: pip install -r docs/requirements.txt - - run: mkdocs gh-deploy --force + - run: mkdocs gh-deploy --forcescreen -r diff --git a/config/bigsky.json b/config/bigsky.json index 44eb73e..50aabf2 100644 --- a/config/bigsky.json +++ b/config/bigsky.json @@ -3,17 +3,17 @@ "mounts": { "kaiju": { "to": "/opt/kaiju", - "from": "/gs1/RTS/OpenOmics/references/Dmux/kaiju/kaiju_db_nr_euk_2023-05-10", + "from": "/gs1/RTS/OpenOmics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10", "mode": "ro" }, "kraken2" : { "to": "/opt/kraken2", - "from": "/gs1/RTS/OpenOmics/references/Dmux/kraken2/k2_pluspfp_20230605", + "from": "/gs1/RTS/OpenOmics/references/weave/kraken2/k2_pluspfp_20230605", "mode": "ro" }, "fastq_screen" : { "to": "/fdb/fastq_screen/FastQ_Screen_Genomes", - "from": "/gs1/RTS/OpenOmics/references/Dmux/FastQ_Screen_Genomes", + "from": "/gs1/RTS/OpenOmics/references/weave/FastQ_Screen_Genomes", "mode": "ro" } } diff --git a/config/biowulf.json b/config/biowulf.json index d1f9f90..838a7e8 100644 --- a/config/biowulf.json +++ b/config/biowulf.json @@ -3,12 +3,12 @@ "mounts": { "kaiju": { "to": "/opt/kaiju", - "from": "/data/OpenOmics/references/Dmux/kaiju/kaiju_db_nr_euk_2023-05-10", + "from": "/data/OpenOmics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10", "mode": "ro" }, "kraken2" : { "to": "/opt/kraken2", - "from": "/data/OpenOmics/references/Dmux/kraken2/k2_pluspfp_20230605", + "from": "/data/OpenOmics/references/weave/kraken2/k2_pluspfp_20230605", "mode": "ro" }, "fastq_screen" : { diff --git a/scripts/config.py b/scripts/config.py index 81c84a4..e9f50e2 100644 --- a/scripts/config.py +++ b/scripts/config.py @@ -158,4 +158,42 @@ def get_bigsky_seq_dirs(): "seq": get_biowulf_seq_dirs(), "profile": Path(Path(__file__).parent.parent, "utils", "profiles", "biowulf").resolve(), } +} + + +GENOME_CONFIGS = { + "biowulf": { + "hg19": "/vf/users/OpenOmics/references/genomes/human/hg19/GRCh37.primary_assembly.genome.fa.gz", + "grch37": "/vf/users/OpenOmics/references/genomes/human/GRCh37/GRCh37.primary_assembly.genome.fa.gz", + "hg38": "/vf/users/OpenOmics/references/genomes/human/hg38/GRCh38.p14.genome.fa.gz", + "grch38": "/vf/users/OpenOmics/references/genomes/human/GRCh38/GRCh38.p14.genome.fa.gz", + "mm9": "/vf/users/OpenOmics/references/genomes/mouse/mm9/mm9.fa.gz", + "grcm37": "/vf/users/OpenOmics/references/genomes/mouse/GRCm37/mm9.fa.gz", + "mm10": "/vf/users/OpenOmics/references/genomes/mouse/mm10/GRCm38.p4.genome.fa.gz", + "grcm38": "/vf/users/OpenOmics/references/genomes/mouse/GRCm38/GRCm38.p4.genome.fa.gz", + "mm39": "/vf/users/OpenOmics/references/genomes/mouse/mm39/GRCm39.genome.fa.gz", + "grcm39": "/vf/users/OpenOmics/references/genomes/mouse/GRCm39/GRCm3s9.genome.fa.gz", + "rhemac10": "/vf/users/OpenOmics/references/genomes/rhesus/rhe10/rheMac10.fa.gz", + "mmul10": "/vf/users/OpenOmics/references/genomes/rhesus/mmul10/rheMac10.fa.gz", + "agm": "/vf/users/OpenOmics/references/genomes/agm/1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz", + "mesaur": "/vf/users/OpenOmics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz", + "cynomac": "/vf/users/OpenOmics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz", + }, + "bigsky": { + "hg19": "/gs1/RTS/OpenOmics/references/genomes/human/hg19/GRCh37.primary_assembly.genome.fa.gz", + "grch37": "/gs1/RTS/OpenOmics/references/genomes/human/GRCh37/GRCh37.primary_assembly.genome.fa.gz", + "hg38": "/gs1/RTS/OpenOmics/references/genomes/human/hg38/GRCh38.p14.genome.fa.gz", + "grch38": "/gs1/RTS/OpenOmics/references/genomes/human/GRCh38/GRCh38.p14.genome.fa.gz", + "mm9": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm9/mm9.fa.gz", + "grcm37": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm37/mm9.fa.gz", + "mm10": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm10/GRCm38.p4.genome.fa.gz", + "grcm38": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm38/GRCm38.p4.genome.fa.gz", + "mm39": "/gs1/RTS/OpenOmics/references/genomes/mouse/mm39/GRCm39.genome.fa.gz", + "grcm39": "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm39/GRCm39.genome.fa.gz", + "rhemac10": "/gs1/RTS/OpenOmics/references/genomes/rhesus/rhe10/rheMac10.fa.gz", + "mmul10": "/gs1/RTS/OpenOmics/references/genomes/rhesus/mmul10/rheMac10.fa.gz", + "agm": "/gs1/RTS/OpenOmics/references/genomes/agm/1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz", + "mesaur": "/gs1/RTS/OpenOmics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz", + "cynomac": "/gs1/RTS/OpenOmics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz", + }, } \ No newline at end of file diff --git a/scripts/utils.py b/scripts/utils.py index 131de42..12a6029 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -15,7 +15,7 @@ # ~~~ internals ~~~ from .files import parse_samplesheet, mk_or_pass_dirs -from .config import SNAKEFILE, DIRECTORY_CONFIGS, get_current_server, get_resource_config +from .config import SNAKEFILE, DIRECTORY_CONFIGS, GENOME_CONFIGS, get_current_server, get_resource_config host = get_current_server() @@ -40,6 +40,19 @@ def default(self, obj): return super(self).default(obj) +def get_alias_table(): + pp_tbl = lambda x: "\n".join([y.lstrip().rstrip() for y in x.split('\n')]) + return pp_tbl("""+----------------+-------------------------------------------+ + | Organism | Genomes supported (aka) | + +----------------+-------------------------------------------+ + | HUMAN | hg19(grch37) / hg38(grch38) | + +----------------+-------------------------------------------+ + | MOUSE | mm9(grcm37) / mm10(grcm38) / mm39(grcm39) | + +----------------+-------------------------------------------+ + | RHESUS MACAQUE | RHEMAC10(mmul10) | + +----------------+-------------------------------------------+""") + + def valid_runid(id_to_check): ''' Given an input ID get it's validity against the run id format: @@ -233,6 +246,9 @@ def exec_pipeline(configs, dry_run=False, local=False): if not bclcon_log_dir.exists(): bclcon_log_dir.mkdir(mode=0o755, parents=True) extra_to_mount.append(str(bclcon_log_dir) + ":" + "/var/log/bcl-convert:rw") + if this_config['disambiguate']: + extra_to_mount.append(Path(this_config['host_genome']).parent) + extra_to_mount.append(Path(this_config['pathogen_genome']).parent) singularity_binds = get_mounts(*extra_to_mount) config_file = Path(this_config['out_to'], '.config', f'config_job_{str(i)}.json').absolute() json.dump(this_config, open(config_file, 'w'), cls=PathJSONEncoder, indent=4) @@ -275,4 +291,31 @@ def is_bclconvert(samplesheet): check = False if samplesheet.instrument in BCLCONVERT_INSTRUMENTS or samplesheet.platform in BCLCONVERT_PLATFORMS: check = True - return check \ No newline at end of file + return check + + +def valid_host_pathogen_genomes(host, pathogen): + g1, g2 = False, False + genomes = GENOME_CONFIGS[get_current_server()] + + if Path(host).absolute().exists(): + g1 = True + host = str(Path(host).absolute()) + elif host.lower() in genomes: + g1 = True + host = genomes[host.lower()] + + if Path(g2).absolute().exists(): + g2 = True + pathogen = str(Path(pathogen).absolute()) + elif pathogen.lower() in genomes: + g2 = True + pathogen = genomes[pathogen.lower()] + + if not all([g1, g2]): + if not g1: + raise ValueError('Host genome does not exist on the file system or in the aliases.') + if not g2: + raise ValueError('Pathogen genome does not exist on the file system or in the aliases.') + + return host, pathogen \ No newline at end of file diff --git a/weave b/weave index 47fd4f3..02d4568 100755 --- a/weave +++ b/weave @@ -33,6 +33,17 @@ def run(args): exec_config['bcl_files'].append(bcls) exec_config['demux_data'].append(files.check_if_demuxed(rundir)) + # ~~~ disambiguate genome configuration ~~~ + exec_config['host_genome'] = None + exec_config['pathogen_genome'] = None + if all([args.host, args.pathogen]): + utils.valid_host_pathogen_genomes(args.host, args.pathogen) + exec_config['disambiguate'] = True + exec_config['host_genome'] = config.get_genome_path(args.host) + exec_config['pathogen_genome'] = config.get_genome_path(args.pathogen) + else: + assert(not any([args.host, args.pathogen])), 'Must specify both host and pathogen genometype!' + # ~~~ QC/QA configuration ~~~ exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet)) exec_config['run_ids'].append(rundir.name) @@ -86,12 +97,13 @@ if __name__ == '__main__': help='Name of the sample sheet file to look for (default is SampleSheet.csv)') parser_run.add_argument('-l', '--local', action='store_true', help='Execute pipeline locally without a dispatching executor') - # force endedness flags - endedness = parser_run.add_mutually_exclusive_group() - endedness.add_argument('--single_end', default=None, action='store_true', - help='Declare endedness of run (in cases in which it is not detectable from sample sheet), mutally exclusive to paired_end') - endedness.add_argument('--paired_end', default=None, action='store_true', - help='Declare endedness of run (in cases in which it is not detectable from sample sheet), mutally exclusive to single_end') + + # disambiguate arguments + parser_run.add_argument('-h', '--host', type=files.valid_fasta, default=None, + help='Execute pipeline locally without a dispatching executor.') + alias_table = "\n" + utils.get_alias_table() + "\n" + parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None, + help='Execute pipeline locally without a dispatching executor. ' + alias_table) parser_cache = sub_parsers.add_parser('cache') parser_cache.add_argument('cachedir', metavar='', type=cache.valid_dir, diff --git a/workflow/Snakefile b/workflow/Snakefile index 2a8d4cc..661c99a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -128,6 +128,15 @@ else: if config["runqc"]: all_outputs.extend(qa_qc_outputs) +if config['disambiguate']: + all_outputs.extend(flatten([ + expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam", **demux_expand_args), + expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam", **demux_expand_args), + expand("{out_to}/{project}/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam", **demux_expand_args), + expand("{out_to}/{project}/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam", **demux_expand_args), + expand("{out_to}/{project}/{sids}/disambiguate/{sids}_summary.txt", **demux_expand_args), + ])) + rule all: input: diff --git a/workflow/qc.smk b/workflow/qc.smk index 1775c22..a7b6394 100644 --- a/workflow/qc.smk +++ b/workflow/qc.smk @@ -17,12 +17,12 @@ else: rule fastqc_untrimmed: input: - samples = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz", + samples = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz", output: - html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.html", - fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.zip", + html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.html", + fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.zip", params: - output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/" + output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/" log: config['out_to'] + "/logs/" + "/" + config["project"] + "/fastqc_untrimmed/{sids}_R{rnums}.log" threads: 4 containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" @@ -36,12 +36,12 @@ rule fastqc_untrimmed: rule fastqc_trimmed: input: - in_read = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz", + in_read = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz", output: - html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html", - fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip", + html = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html", + fqreport = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip", params: - output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/" + output_dir = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/" containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif" threads: 4 resources: mem_mb = 8096 @@ -53,6 +53,53 @@ rule fastqc_trimmed: """ +rule bwa: + input: + in_read1 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz" if config['disambiguate'] else [], + in_read2 = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if config['disambiguate'] and len(config['rnums']) == 2 else [], + output: + aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam" + aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam" + params: + host_genome = config['host_genome'] + path_genome = config['pathogen_genome'] + threads: 32 + resources: mem_mb = 64768 + containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.2.sif" + log: config['out_to'] + "/logs/" + config["project"] + "/bwa_mem/{sids}.log" + shell: + """ + bwa mem -t {threads} {params.host_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoA} - + bwa mem -t {threads} {params.path_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoB} - + """ + + +rule disambiguate: + input: + aligntoA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam" + aligntoB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam" + output: + ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam" + ambiguousB = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam" + disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam" + disambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam" + ambiguousA = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}_summary.txt" + params: + host_genome = config['host_genome'] + path_genome = config['pathogen_genome'] + this_sid = lambda wc: wc.sids + out_dir = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/" + containerized: config["resources"]["sif"] + "ngs_disambiguate_2018.05.03.sif" + log: config['out_to'] + "/logs/" + config["project"] + "/disambiguate/{sids}.log" + threads: 32 + resources: mem_mb = 64768 + shell: + """ + ngs_disambiguate -s {params.this_sid} -o {params.out_dir} -a bwa {input.aligntoA} {input.aligntoB} + """ + + + rule multiqc_report: input: # demux status