From 603f6eb8c3ea1ad747645d2a47510b1e501ff917 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Wed, 3 Jan 2024 17:38:23 -0600
Subject: [PATCH 1/9] feat: add disambiguate to docker image

---
 docker/NGS_QC/Dockerfile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docker/NGS_QC/Dockerfile b/docker/NGS_QC/Dockerfile
index 3ad8ca4..38a2fed 100644
--- a/docker/NGS_QC/Dockerfile
+++ b/docker/NGS_QC/Dockerfile
@@ -3,10 +3,13 @@ ENV TZ=America/New_York
 ENV DEBIAN_FRONTEND=noninteractive
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 RUN apt-get update; apt-get upgrade -y bash && \
-    apt-get install -y fastqc multiqc fastp kraken2 bowtie2 libgd-graph-perl git curl libz-dev build-essential
+    apt-get install -y fastqc python-is-python3 python3-pip samtools multiqc fastp kraken2 bowtie2 libgd-graph-perl git curl libz-dev build-essential
 RUN mkdir /opt2; curl -S -s -L https://github.com/StevenWingett/FastQ-Screen/archive/refs/tags/v0.15.3.tar.gz --output /opt2/v0.15.3.tar.gz
 RUN cd /opt2 && tar xvf v0.15.3.tar.gz 
 RUN cd /opt2 && git clone https://github.com/bioinformatics-centre/kaiju.git 
 RUN cd /opt2/kaiju/src; make
+RUN cd /opt2 && git clone https://github.com/AstraZeneca-NGS/disambiguate.git
+RUN pip install pysam
+RUN chmod +x /opt2/disambiguate/disambiguate.py; ln -sf /opt2/disambiguate/disambiguate.py /usr/bin/disambiguate.py
 ENV PATH="${PATH}:/opt2/FastQ-Screen-0.15.3:/opt2/kaiju/bin"
 ADD docker/NGS_QC/fastq_screen.conf /etc/fastq_screen.conf

From 679ad599154b1b695a64cb4154f1af88e970e4a6 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.gov>
Date: Fri, 5 Jan 2024 14:55:21 -0600
Subject: [PATCH 2/9] feat: add short read aligners to docker

---
 docker/NGS_QC/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/NGS_QC/Dockerfile b/docker/NGS_QC/Dockerfile
index 38a2fed..2a34a09 100644
--- a/docker/NGS_QC/Dockerfile
+++ b/docker/NGS_QC/Dockerfile
@@ -3,7 +3,7 @@ ENV TZ=America/New_York
 ENV DEBIAN_FRONTEND=noninteractive
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 RUN apt-get update; apt-get upgrade -y bash && \
-    apt-get install -y fastqc python-is-python3 python3-pip samtools multiqc fastp kraken2 bowtie2 libgd-graph-perl git curl libz-dev build-essential
+    apt-get install -y fastqc python-is-python3 rna-star bwa hisat2 python3-pip samtools multiqc fastp kraken2 bowtie2 libgd-graph-perl git curl libz-dev build-essential
 RUN mkdir /opt2; curl -S -s -L https://github.com/StevenWingett/FastQ-Screen/archive/refs/tags/v0.15.3.tar.gz --output /opt2/v0.15.3.tar.gz
 RUN cd /opt2 && tar xvf v0.15.3.tar.gz 
 RUN cd /opt2 && git clone https://github.com/bioinformatics-centre/kaiju.git 

From e857d0f5d706944dc311e985e3f4323685234963 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.giv>
Date: Wed, 10 Jan 2024 15:41:17 -0700
Subject: [PATCH 3/9] feat: add disambiguate functionality to pipeline

---
 .github/workflows/doc.yml |  2 +-
 config/bigsky.json        |  6 ++--
 config/biowulf.json       |  4 +--
 scripts/config.py         | 38 +++++++++++++++++++++++
 scripts/utils.py          | 47 +++++++++++++++++++++++++++--
 weave                     | 24 +++++++++++----
 workflow/Snakefile        |  9 ++++++
 workflow/qc.smk           | 63 ++++++++++++++++++++++++++++++++++-----
 8 files changed, 171 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
index f7df7ca..b50f4f2 100644
--- a/.github/workflows/doc.yml
+++ b/.github/workflows/doc.yml
@@ -28,4 +28,4 @@ jobs:
           restore-keys: |
             mkdocs-material-
       - run: pip install -r docs/requirements.txt
-      - run: mkdocs gh-deploy --force
+      - run: mkdocs gh-deploy --forcescreen -r 
diff --git a/config/bigsky.json b/config/bigsky.json
index 44eb73e..50aabf2 100644
--- a/config/bigsky.json
+++ b/config/bigsky.json
@@ -3,17 +3,17 @@
     "mounts": {
         "kaiju": {
             "to": "/opt/kaiju",
-            "from": "/gs1/RTS/OpenOmics/references/Dmux/kaiju/kaiju_db_nr_euk_2023-05-10",
+            "from": "/gs1/RTS/OpenOmics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10",
             "mode": "ro"
         },
         "kraken2" : {
             "to": "/opt/kraken2",
-            "from": "/gs1/RTS/OpenOmics/references/Dmux/kraken2/k2_pluspfp_20230605",
+            "from": "/gs1/RTS/OpenOmics/references/weave/kraken2/k2_pluspfp_20230605",
             "mode": "ro"
         }, 
         "fastq_screen" : {
             "to": "/fdb/fastq_screen/FastQ_Screen_Genomes",
-            "from": "/gs1/RTS/OpenOmics/references/Dmux/FastQ_Screen_Genomes",
+            "from": "/gs1/RTS/OpenOmics/references/weave/FastQ_Screen_Genomes",
             "mode": "ro"
         }
     }
diff --git a/config/biowulf.json b/config/biowulf.json
index d1f9f90..838a7e8 100644
--- a/config/biowulf.json
+++ b/config/biowulf.json
@@ -3,12 +3,12 @@
     "mounts": {
         "kaiju": {
             "to": "/opt/kaiju",
-            "from": "/data/OpenOmics/references/Dmux/kaiju/kaiju_db_nr_euk_2023-05-10",
+            "from": "/data/OpenOmics/references/weave/kaiju/kaiju_db_nr_euk_2023-05-10",
             "mode": "ro"
         },
         "kraken2" : {
             "to": "/opt/kraken2",
-            "from": "/data/OpenOmics/references/Dmux/kraken2/k2_pluspfp_20230605",
+            "from": "/data/OpenOmics/references/weave/kraken2/k2_pluspfp_20230605",
             "mode": "ro"
         }, 
         "fastq_screen" : {
diff --git a/scripts/config.py b/scripts/config.py
index 81c84a4..e9f50e2 100644
--- a/scripts/config.py
+++ b/scripts/config.py
@@ -158,4 +158,42 @@ def get_bigsky_seq_dirs():
         "seq": get_biowulf_seq_dirs(),
         "profile": Path(Path(__file__).parent.parent, "utils", "profiles", "biowulf").resolve(),
     }
+}
+
+
+GENOME_CONFIGS = {
+    "biowulf": {
+        "hg19": "/vf/users/OpenOmics/references/genomes/human/hg19/GRCh37.primary_assembly.genome.fa.gz",
+        "grch37": "/vf/users/OpenOmics/references/genomes/human/GRCh37/GRCh37.primary_assembly.genome.fa.gz",
+        "hg38": "/vf/users/OpenOmics/references/genomes/human/hg38/GRCh38.p14.genome.fa.gz",
+        "grch38":  "/vf/users/OpenOmics/references/genomes/human/GRCh38/GRCh38.p14.genome.fa.gz",
+        "mm9":  "/vf/users/OpenOmics/references/genomes/mouse/mm9/mm9.fa.gz",
+        "grcm37":  "/vf/users/OpenOmics/references/genomes/mouse/GRCm37/mm9.fa.gz",
+        "mm10":  "/vf/users/OpenOmics/references/genomes/mouse/mm10/GRCm38.p4.genome.fa.gz",
+        "grcm38":  "/vf/users/OpenOmics/references/genomes/mouse/GRCm38/GRCm38.p4.genome.fa.gz",
+        "mm39":  "/vf/users/OpenOmics/references/genomes/mouse/mm39/GRCm39.genome.fa.gz",
+        "grcm39":  "/vf/users/OpenOmics/references/genomes/mouse/GRCm39/GRCm3s9.genome.fa.gz",
+        "rhemac10": "/vf/users/OpenOmics/references/genomes/rhesus/rhe10/rheMac10.fa.gz",
+        "mmul10": "/vf/users/OpenOmics/references/genomes/rhesus/mmul10/rheMac10.fa.gz",
+        "agm": "/vf/users/OpenOmics/references/genomes/agm/1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz",
+        "mesaur": "/vf/users/OpenOmics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz",
+        "cynomac": "/vf/users/OpenOmics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz",
+    },
+    "bigsky": {
+        "hg19": "/gs1/RTS/OpenOmics/references/genomes/human/hg19/GRCh37.primary_assembly.genome.fa.gz",
+        "grch37": "/gs1/RTS/OpenOmics/references/genomes/human/GRCh37/GRCh37.primary_assembly.genome.fa.gz",
+        "hg38": "/gs1/RTS/OpenOmics/references/genomes/human/hg38/GRCh38.p14.genome.fa.gz",
+        "grch38":  "/gs1/RTS/OpenOmics/references/genomes/human/GRCh38/GRCh38.p14.genome.fa.gz",
+        "mm9":  "/gs1/RTS/OpenOmics/references/genomes/mouse/mm9/mm9.fa.gz",
+        "grcm37":  "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm37/mm9.fa.gz",
+        "mm10":  "/gs1/RTS/OpenOmics/references/genomes/mouse/mm10/GRCm38.p4.genome.fa.gz",
+        "grcm38":  "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm38/GRCm38.p4.genome.fa.gz",
+        "mm39":  "/gs1/RTS/OpenOmics/references/genomes/mouse/mm39/GRCm39.genome.fa.gz",
+        "grcm39":  "/gs1/RTS/OpenOmics/references/genomes/mouse/GRCm39/GRCm39.genome.fa.gz",
+        "rhemac10": "/gs1/RTS/OpenOmics/references/genomes/rhesus/rhe10/rheMac10.fa.gz",
+        "mmul10": "/gs1/RTS/OpenOmics/references/genomes/rhesus/mmul10/rheMac10.fa.gz",
+        "agm": "/gs1/RTS/OpenOmics/references/genomes/agm/1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz",
+        "mesaur": "/gs1/RTS/OpenOmics/references/genomes/mesaur/2.0/GCF_017639785.1_BCM_Maur_2.0_genomic.fna.gz",
+        "cynomac": "/gs1/RTS/OpenOmics/references/genomes/cynomac/v2/GCF_012559485.2_MFA1912RKSv2_genomic.fna.gz",
+    },
 }
\ No newline at end of file
diff --git a/scripts/utils.py b/scripts/utils.py
index 131de42..12a6029 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -15,7 +15,7 @@
 
 # ~~~ internals ~~~
 from .files import parse_samplesheet, mk_or_pass_dirs
-from .config import SNAKEFILE, DIRECTORY_CONFIGS, get_current_server, get_resource_config
+from .config import SNAKEFILE, DIRECTORY_CONFIGS, GENOME_CONFIGS, get_current_server, get_resource_config
 
 
 host = get_current_server()
@@ -40,6 +40,19 @@ def default(self, obj):
         return super(self).default(obj)
 
 
+def get_alias_table():
+    pp_tbl = lambda x: "\n".join([y.lstrip().rstrip() for y in x.split('\n')])
+    return pp_tbl("""+----------------+-------------------------------------------+
+                     | Organism       | Genomes supported (aka)                   |
+                     +----------------+-------------------------------------------+
+                     | HUMAN          | hg19(grch37) / hg38(grch38)               |
+                     +----------------+-------------------------------------------+
+                     | MOUSE          | mm9(grcm37) / mm10(grcm38) / mm39(grcm39) |
+                     +----------------+-------------------------------------------+
+                     | RHESUS MACAQUE | RHEMAC10(mmul10)                          |
+                     +----------------+-------------------------------------------+""")
+
+
 def valid_runid(id_to_check):
     '''
         Given an input ID get it's validity against the run id format:
@@ -233,6 +246,9 @@ def exec_pipeline(configs, dry_run=False, local=False):
             if not bclcon_log_dir.exists():
                 bclcon_log_dir.mkdir(mode=0o755, parents=True)
             extra_to_mount.append(str(bclcon_log_dir) + ":" + "/var/log/bcl-convert:rw")
+        if this_config['disambiguate']:
+            extra_to_mount.append(Path(this_config['host_genome']).parent)
+            extra_to_mount.append(Path(this_config['pathogen_genome']).parent)
         singularity_binds = get_mounts(*extra_to_mount)
         config_file = Path(this_config['out_to'], '.config', f'config_job_{str(i)}.json').absolute()
         json.dump(this_config, open(config_file, 'w'), cls=PathJSONEncoder, indent=4)
@@ -275,4 +291,31 @@ def is_bclconvert(samplesheet):
     check = False
     if samplesheet.instrument in BCLCONVERT_INSTRUMENTS or samplesheet.platform in BCLCONVERT_PLATFORMS:
         check = True
-    return check
\ No newline at end of file
+    return check
+
+
+def valid_host_pathogen_genomes(host, pathogen):
+    g1, g2 = False, False
+    genomes = GENOME_CONFIGS[get_current_server()]
+
+    if Path(host).absolute().exists():
+        g1 = True
+        host = str(Path(host).absolute())
+    elif host.lower() in genomes:
+        g1 = True
+        host = genomes[host.lower()]
+
+    if Path(g2).absolute().exists():
+        g2 = True
+        pathogen = str(Path(pathogen).absolute())
+    elif pathogen.lower() in genomes:
+        g2 = True
+        pathogen = genomes[pathogen.lower()]
+
+    if not all([g1, g2]):
+        if not g1:
+            raise ValueError('Host genome does not exist on the file system or in the aliases.')
+        if not g2:
+            raise ValueError('Pathogen genome does not exist on the file system or in the aliases.')
+
+    return host, pathogen
\ No newline at end of file
diff --git a/weave b/weave
index 47fd4f3..02d4568 100755
--- a/weave
+++ b/weave
@@ -33,6 +33,17 @@ def run(args):
         exec_config['bcl_files'].append(bcls)
         exec_config['demux_data'].append(files.check_if_demuxed(rundir))
 
+        # ~~~ disambiguate genome configuration ~~~
+        exec_config['host_genome'] = None
+        exec_config['pathogen_genome'] = None
+        if all([args.host, args.pathogen]):
+            utils.valid_host_pathogen_genomes(args.host, args.pathogen)
+            exec_config['disambiguate'] = True
+            exec_config['host_genome'] = config.get_genome_path(args.host)
+            exec_config['pathogen_genome'] = config.get_genome_path(args.pathogen)
+        else:
+            assert(not any([args.host, args.pathogen])), 'Must specify both host and pathogen genometype!'
+
         # ~~~ QC/QA configuration ~~~
         exec_config['bclconvert'].append(utils.is_bclconvert(sample_sheet))
         exec_config['run_ids'].append(rundir.name)
@@ -86,12 +97,13 @@ if __name__ == '__main__':
                             help='Name of the sample sheet file to look for (default is SampleSheet.csv)')
     parser_run.add_argument('-l', '--local', action='store_true',
                             help='Execute pipeline locally without a dispatching executor')
-    # force endedness flags
-    endedness = parser_run.add_mutually_exclusive_group()
-    endedness.add_argument('--single_end', default=None, action='store_true', 
-                            help='Declare endedness of run (in cases in which it is not detectable from sample sheet), mutally exclusive to paired_end')
-    endedness.add_argument('--paired_end', default=None, action='store_true',
-                           help='Declare endedness of run (in cases in which it is not detectable from sample sheet), mutally exclusive to single_end')
+    
+    # disambiguate arguments
+    parser_run.add_argument('-h', '--host', type=files.valid_fasta, default=None,
+                        help='Execute pipeline locally without a dispatching executor.')
+    alias_table = "\n" + utils.get_alias_table() + "\n"
+    parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None,
+                        help='Execute pipeline locally without a dispatching executor. ' + alias_table)
 
     parser_cache = sub_parsers.add_parser('cache')
     parser_cache.add_argument('cachedir', metavar='<cache directory>', type=cache.valid_dir, 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 2a8d4cc..661c99a 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -128,6 +128,15 @@ else:
 if config["runqc"]:
     all_outputs.extend(qa_qc_outputs)
 
+if config['disambiguate']:
+    all_outputs.extend(flatten([
+        expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam", **demux_expand_args),
+        expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam", **demux_expand_args),
+        expand("{out_to}/{project}/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam", **demux_expand_args),
+        expand("{out_to}/{project}/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam", **demux_expand_args),
+        expand("{out_to}/{project}/{sids}/disambiguate/{sids}_summary.txt", **demux_expand_args),
+    ]))
+
 
 rule all:
     input:
diff --git a/workflow/qc.smk b/workflow/qc.smk
index 1775c22..a7b6394 100644
--- a/workflow/qc.smk
+++ b/workflow/qc.smk
@@ -17,12 +17,12 @@ else:
 
 rule fastqc_untrimmed:
     input:
-        samples     = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz",
+        samples       = config['out_to'] + "/demux/" + config["project"] + "/{sids}_R{rnums}_" + trim_input_suffix + ".fastq.gz",
     output:   
-        html        = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.html",
-        fqreport    = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.zip",
+        html          = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.html",
+        fqreport      = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_untrimmed/{sids}_R{rnums}_" + trim_input_suffix + "_fastqc.zip",
     params:
-        output_dir  = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/"
+        output_dir    = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_untrimmed/"
     log: config['out_to'] + "/logs/" + "/" + config["project"] + "/fastqc_untrimmed/{sids}_R{rnums}.log"
     threads: 4
     containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif"
@@ -36,12 +36,12 @@ rule fastqc_untrimmed:
 
 rule fastqc_trimmed:
     input:
-        in_read     = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz",
+        in_read       = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R{rnums}.fastq.gz",
     output:
-        html        = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html",
-        fqreport    = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip",
+        html          = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.html",
+        fqreport      = config['out_to'] + "/" + config["project"] + "/{sids}/fastqc_trimmed/{sids}_trimmed_R{rnums}_fastqc.zip",
     params:
-        output_dir  = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/"
+        output_dir    = lambda w: config['out_to'] + "/" + config["project"] + "/" + w.sids + "/fastqc_trimmed/"
     containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.1.sif"
     threads: 4
     resources: mem_mb = 8096
@@ -53,6 +53,53 @@ rule fastqc_trimmed:
         """
 
 
+rule bwa:
+    input:
+        in_read1       = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz" if config['disambiguate'] else [],
+        in_read2       = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if config['disambiguate'] and len(config['rnums']) == 2 else [],
+    output:
+        aligntoA       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam"
+        aligntoB       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam"
+    params:
+        host_genome    = config['host_genome']
+        path_genome    = config['pathogen_genome']
+    threads: 32
+    resources: mem_mb = 64768
+    containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.2.sif"
+    log: config['out_to'] + "/logs/" + config["project"] + "/bwa_mem/{sids}.log"
+    shell:
+        """
+        bwa mem -t {threads} {params.host_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoA} -
+        bwa mem -t {threads} {params.path_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoB} -
+        """
+
+
+rule disambiguate:
+    input:
+        aligntoA       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam"
+        aligntoB       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam"
+    output:
+        ambiguousA     = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam"
+        ambiguousB     = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam"
+        disambiguousA  = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam"
+        disambiguousA  = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam"
+        ambiguousA     = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}_summary.txt"
+    params:
+        host_genome    = config['host_genome']
+        path_genome    = config['pathogen_genome']
+        this_sid       = lambda wc: wc.sids
+        out_dir        = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/"
+    containerized: config["resources"]["sif"] + "ngs_disambiguate_2018.05.03.sif"
+    log: config['out_to'] + "/logs/" + config["project"] + "/disambiguate/{sids}.log"
+    threads: 32
+    resources: mem_mb = 64768
+    shell:
+        """
+        ngs_disambiguate -s {params.this_sid} -o {params.out_dir} -a bwa {input.aligntoA} {input.aligntoB}
+        """
+
+
+
 rule multiqc_report:
     input:
         # demux status

From 539ed71199a805e4b99bd76eb150d47ccf810594 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.giv>
Date: Thu, 11 Jan 2024 09:22:14 -0700
Subject: [PATCH 4/9] fix: applying testing fixes for disambiguate feat

---
 scripts/files.py   | 30 ++++++++++++++++++++++++++++--
 scripts/utils.py   | 14 ++++----------
 weave              | 17 +++++++++++------
 workflow/Snakefile |  2 +-
 workflow/qc.smk    | 34 +++++++++++++++++-----------------
 5 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/scripts/files.py b/scripts/files.py
index 9f4c5ad..adb415c 100644
--- a/scripts/files.py
+++ b/scripts/files.py
@@ -3,12 +3,12 @@
 # ~~~~~~~~~~~~~~~
 #   file system helper functions for the Dmux software package
 # ~~~~~~~~~~~~~~~
-from pathlib import Path
 import xml.etree.ElementTree as ET
+from pathlib import Path
 from os import access as check_access, R_OK, W_OK
 from functools import partial
 from .samplesheet import IllumniaSampleSheet
-from .config import get_current_server, LABKEY_CONFIGS, DIRECTORY_CONFIGS
+from .config import get_current_server, GENOME_CONFIGS, DIRECTORY_CONFIGS
 
 
 def get_all_seq_dirs(top_dir, server):
@@ -48,6 +48,32 @@ def valid_run_output(output_directory, dry_run=False):
     return output_directory
 
 
+def valid_fasta(suspect):
+    server_genomes = GENOME_CONFIGS[get_current_server()]
+    is_valid = False
+    if suspect.lower() in server_genomes:
+        is_valid = True
+        suspect = server_genomes[suspect.lower()]
+    else:
+        the_suspect = Path(suspect)
+        exts = the_suspect.suffixes
+        if any([
+            '.fna' in exts,
+            '.fa' in exts,
+            '.fasta' in exts,
+            '.fna' in exts and '.gz' in exts,
+            '.fa' in exts and '.gz' in exts,
+            '.fasta' in exts and '.gz' in exts,
+        ]):
+            is_valid = True
+            suspect = str(Path(suspect).absolute())
+
+    if not is_valid:
+        raise ValueError
+            
+    return suspect
+
+
 def get_all_staged_dirs(top_dir, server):
     return list(filter(partial(is_dir_staged, server), get_all_seq_dirs(top_dir, server)))
 
diff --git a/scripts/utils.py b/scripts/utils.py
index 12a6029..9ead137 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -246,7 +246,7 @@ def exec_pipeline(configs, dry_run=False, local=False):
             if not bclcon_log_dir.exists():
                 bclcon_log_dir.mkdir(mode=0o755, parents=True)
             extra_to_mount.append(str(bclcon_log_dir) + ":" + "/var/log/bcl-convert:rw")
-        if this_config['disambiguate']:
+        if this_config.get('disambiguate', False):
             extra_to_mount.append(Path(this_config['host_genome']).parent)
             extra_to_mount.append(Path(this_config['pathogen_genome']).parent)
         singularity_binds = get_mounts(*extra_to_mount)
@@ -301,21 +301,15 @@ def valid_host_pathogen_genomes(host, pathogen):
     if Path(host).absolute().exists():
         g1 = True
         host = str(Path(host).absolute())
-    elif host.lower() in genomes:
-        g1 = True
-        host = genomes[host.lower()]
 
-    if Path(g2).absolute().exists():
+    if Path(pathogen).absolute().exists():
         g2 = True
         pathogen = str(Path(pathogen).absolute())
-    elif pathogen.lower() in genomes:
-        g2 = True
-        pathogen = genomes[pathogen.lower()]
 
     if not all([g1, g2]):
         if not g1:
-            raise ValueError('Host genome does not exist on the file system or in the aliases.')
+            raise ValueError('Host genome does not exist on the file system.')
         if not g2:
-            raise ValueError('Pathogen genome does not exist on the file system or in the aliases.')
+            raise ValueError('Pathogen genome does not exist on the file system.')
 
     return host, pathogen
\ No newline at end of file
diff --git a/weave b/weave
index 02d4568..844332d 100755
--- a/weave
+++ b/weave
@@ -34,13 +34,18 @@ def run(args):
         exec_config['demux_data'].append(files.check_if_demuxed(rundir))
 
         # ~~~ disambiguate genome configuration ~~~
-        exec_config['host_genome'] = None
-        exec_config['pathogen_genome'] = None
         if all([args.host, args.pathogen]):
+            if 'disambiguate' not in exec_config:
+                exec_config['disambiguate'] = []
+            if 'host_genome' not in exec_config:
+                exec_config['host_genome'] = []
+            if 'pathogen_genome' not in exec_config:
+                exec_config['pathogen_genome'] = []
+
             utils.valid_host_pathogen_genomes(args.host, args.pathogen)
-            exec_config['disambiguate'] = True
-            exec_config['host_genome'] = config.get_genome_path(args.host)
-            exec_config['pathogen_genome'] = config.get_genome_path(args.pathogen)
+            exec_config['disambiguate'].append(True)
+            exec_config['host_genome'].append(args.host)
+            exec_config['pathogen_genome'].append(args.pathogen)
         else:
             assert(not any([args.host, args.pathogen])), 'Must specify both host and pathogen genometype!'
 
@@ -99,7 +104,7 @@ if __name__ == '__main__':
                             help='Execute pipeline locally without a dispatching executor')
     
     # disambiguate arguments
-    parser_run.add_argument('-h', '--host', type=files.valid_fasta, default=None,
+    parser_run.add_argument('-t', '--host', type=files.valid_fasta, default=None,
                         help='Execute pipeline locally without a dispatching executor.')
     alias_table = "\n" + utils.get_alias_table() + "\n"
     parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None,
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 661c99a..15f559c 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -128,7 +128,7 @@ else:
 if config["runqc"]:
     all_outputs.extend(qa_qc_outputs)
 
-if config['disambiguate']:
+if config.get('disambiguate', False):
     all_outputs.extend(flatten([
         expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam", **demux_expand_args),
         expand("{out_to}/{project}/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam", **demux_expand_args),
diff --git a/workflow/qc.smk b/workflow/qc.smk
index a7b6394..824f16e 100644
--- a/workflow/qc.smk
+++ b/workflow/qc.smk
@@ -55,14 +55,14 @@ rule fastqc_trimmed:
 
 rule bwa:
     input:
-        in_read1       = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz" if config['disambiguate'] else [],
-        in_read2       = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if config['disambiguate'] and len(config['rnums']) == 2 else [],
+        in_read1       = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R1.fastq.gz" if config.get('disambiguate', False) else [],
+        in_read2       = config["out_to"] + "/" + config["project"] + "/{sids}/fastp/{sids}_trimmed_R2.fastq.gz" if config.get('disambiguate', False) and len(config['rnums']) == 2 else [],
     output:
-        aligntoA       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam"
-        aligntoB       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam"
+        aligntoA       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam",
+        aligntoB       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam",
     params:
-        host_genome    = config['host_genome']
-        path_genome    = config['pathogen_genome']
+        host_genome    = config.get('host_genome', ''),
+        path_genome    = config.get('pathogen_genome', ''),
     threads: 32
     resources: mem_mb = 64768
     containerized: config["resources"]["sif"] + "weave_ngsqc_0.0.2.sif"
@@ -76,19 +76,19 @@ rule bwa:
 
 rule disambiguate:
     input:
-        aligntoA       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam"
-        aligntoB       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam"
+        aligntoA       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeA.bam",
+        aligntoB       = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.AligntoGenomeB.bam",
     output:
-        ambiguousA     = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam"
-        ambiguousB     = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam"
-        disambiguousA  = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam"
-        disambiguousA  = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam"
-        ambiguousA     = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}_summary.txt"
+        ambiguousA     = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesA.bam",
+        ambiguousB     = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.ambiguousSpeciesB.bam",
+        disambiguousA  = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesA.bam",
+        disambiguousB  = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}.disambiguatedSpeciesB.bam",
+        dis_summary    = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/{sids}_summary.txt",
     params:
-        host_genome    = config['host_genome']
-        path_genome    = config['pathogen_genome']
-        this_sid       = lambda wc: wc.sids
-        out_dir        = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/"
+        host_genome    = config.get('host_genome', ''),
+        path_genome    = config.get('pathogen_genome', ''),
+        this_sid       = lambda wc: wc.sids,
+        out_dir        = config["out_to"] + "/" + config["project"] + "/{sids}/disambiguate/",
     containerized: config["resources"]["sif"] + "ngs_disambiguate_2018.05.03.sif"
     log: config['out_to'] + "/logs/" + config["project"] + "/disambiguate/{sids}.log"
     threads: 32

From 340fc9a51bdc1b5eb1ab12b5093a85f4306bbda3 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.giv>
Date: Fri, 12 Jan 2024 08:55:28 -0700
Subject: [PATCH 5/9] fix: remove type

---
 .github/workflows/doc.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
index b50f4f2..f7df7ca 100644
--- a/.github/workflows/doc.yml
+++ b/.github/workflows/doc.yml
@@ -28,4 +28,4 @@ jobs:
           restore-keys: |
             mkdocs-material-
       - run: pip install -r docs/requirements.txt
-      - run: mkdocs gh-deploy --forcescreen -r 
+      - run: mkdocs gh-deploy --force

From d4a439db0e6765de6ba2a167d07d0761d459e578 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.giv>
Date: Fri, 12 Jan 2024 09:00:07 -0700
Subject: [PATCH 6/9] fix: natural name sort bams

---
 workflow/qc.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/qc.smk b/workflow/qc.smk
index 824f16e..68a8fb9 100644
--- a/workflow/qc.smk
+++ b/workflow/qc.smk
@@ -69,8 +69,8 @@ rule bwa:
     log: config['out_to'] + "/logs/" + config["project"] + "/bwa_mem/{sids}.log"
     shell:
         """
-        bwa mem -t {threads} {params.host_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoA} -
-        bwa mem -t {threads} {params.path_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -o {output.aligntoB} -
+        bwa mem -t {threads} {params.host_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -n -o {output.aligntoA} -
+        bwa mem -t {threads} {params.path_genome} {input.in_read1} {input.in_read2} | samtools sort -@ {threads} -n -o {output.aligntoB} -
         """
 
 

From 47814bc86572074e7bd3fba78fd96f448acd5f94 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.giv>
Date: Fri, 12 Jan 2024 09:34:01 -0700
Subject: [PATCH 7/9] fix: help genome alias table and remote cache assets

---
 config/remote.json | 2 ++
 scripts/utils.py   | 5 +++--
 weave              | 8 ++++----
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/config/remote.json b/config/remote.json
index 6371456..097d764 100644
--- a/config/remote.json
+++ b/config/remote.json
@@ -1,6 +1,8 @@
 {
     "bcl2fastq": "docker://umccr/bcl2fastq:latest",
     "weave": "docker://rroutsong/weave_ngsqc:0.0.1",
+    "bclconvert": "docker://rroutsong/weave_bclconvert:0.0.3",
+    "disambiguate": "docker://quay.io/biocontainers/ngs-disambiguate:2018.05.03--hf393df8_7",
     "kraken": "https://genome-idx.s3.amazonaws.com/kraken/k2_pluspfp_16gb_20231009.tar.gz",
     "kaiju": "https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_euk_2023-05-10.tgz",
     "fastq_screen": "filelist://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/genome_locations.txt"
diff --git a/scripts/utils.py b/scripts/utils.py
index 9ead137..29fda6a 100644
--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -8,6 +8,7 @@
 import os
 import yaml
 import sys
+import textwrap
 from argparse import ArgumentTypeError
 from dateutil.parser import parse as date_parser
 from subprocess import Popen, PIPE, STDOUT
@@ -41,8 +42,8 @@ def default(self, obj):
 
 
 def get_alias_table():
-    pp_tbl = lambda x: "\n".join([y.lstrip().rstrip() for y in x.split('\n')])
-    return pp_tbl("""+----------------+-------------------------------------------+
+    return textwrap.dedent("""Genome short name alias table:
+                     +----------------+-------------------------------------------+
                      | Organism       | Genomes supported (aka)                   |
                      +----------------+-------------------------------------------+
                      | HUMAN          | hg19(grch37) / hg38(grch38)               |
diff --git a/weave b/weave
index 844332d..681fbf5 100755
--- a/weave
+++ b/weave
@@ -83,10 +83,10 @@ def unlock_dir(sub_args):
 
 # ~~~~ argument parsing commands ~~~~
 if __name__ == '__main__':
-    main_parser = argparse.ArgumentParser(prog='weave')
+    main_parser = argparse.ArgumentParser(prog='weave', formatter_class=argparse.RawTextHelpFormatter, epilog=utils.get_alias_table())
     sub_parsers = main_parser.add_subparsers(help='run subcommand help')
 
-    parser_run = sub_parsers.add_parser('run')
+    parser_run = sub_parsers.add_parser('run', formatter_class=argparse.RawTextHelpFormatter, epilog=utils.get_alias_table())
     parser_run.add_argument('rundir', metavar='<run directory>', nargs="+", type=utils.valid_run_input, 
                             help='Full & complete run id (format YYMMDD_INSTRUMENTID_TIME_FLOWCELLID) or absolute paths')
     parser_run.add_argument('-s', '--seq_dir', metavar='<sequencing directory>', default=None, type=str,
@@ -106,9 +106,9 @@ if __name__ == '__main__':
     # disambiguate arguments
     parser_run.add_argument('-t', '--host', type=files.valid_fasta, default=None,
                         help='Execute pipeline locally without a dispatching executor.')
-    alias_table = "\n" + utils.get_alias_table() + "\n"
+    
     parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None,
-                        help='Execute pipeline locally without a dispatching executor. ' + alias_table)
+                        help='Execute pipeline locally without a dispatching executor.')
 
     parser_cache = sub_parsers.add_parser('cache')
     parser_cache.add_argument('cachedir', metavar='<cache directory>', type=cache.valid_dir, 

From 820e0afaa26b907508de339596460b5183718651 Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.giv>
Date: Fri, 12 Jan 2024 09:41:38 -0700
Subject: [PATCH 8/9] fix: add actual help messages to argparse arguments

---
 weave | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/weave b/weave
index 681fbf5..95af79b 100755
--- a/weave
+++ b/weave
@@ -88,37 +88,37 @@ if __name__ == '__main__':
 
     parser_run = sub_parsers.add_parser('run', formatter_class=argparse.RawTextHelpFormatter, epilog=utils.get_alias_table())
     parser_run.add_argument('rundir', metavar='<run directory>', nargs="+", type=utils.valid_run_input, 
-                            help='Full & complete run id (format YYMMDD_INSTRUMENTID_TIME_FLOWCELLID) or absolute paths')
+                            help='Full & complete run id (format YYMMDD_INSTRUMENTID_TIME_FLOWCELLID) or absolute paths.')
     parser_run.add_argument('-s', '--seq_dir', metavar='<sequencing directory>', default=None, type=str,
                             help='Root directory for sequencing data (defaults for biowulf/bigsky/locus), must contain directories ' + \
                             'matching run ids, if not using full paths.')
     parser_run.add_argument('-o', '--output', metavar='<output directory>', default=None, type=str, 
-                            help='Top-level output directory for demultiplexing data (defaults to input directory + runid + "_demux")')
+                            help='Top-level output directory for demultiplexing data (defaults to input directory + runid + "_demux").')
     parser_run.add_argument('-d', '--dry-run', action='store_true',
-                            help='Dry run the demultiplexing workflow')
+                            help='Dry run the demultiplexing workflow.')
     parser_run.add_argument('-n', '--noqc', action='store_false',
-                            help='Do not run the QC/QA portion of the workflow (Default is on)')
+                            help='Do not run the QC/QA portion of the workflow (Default is on).')
     parser_run.add_argument('--sheetname', metavar='Sample Sheet Filename', 
-                            help='Name of the sample sheet file to look for (default is SampleSheet.csv)')
+                            help='Name of the sample sheet file to look for (default is SampleSheet.csv).')
     parser_run.add_argument('-l', '--local', action='store_true',
-                            help='Execute pipeline locally without a dispatching executor')
+                            help='Execute pipeline locally without a dispatching executor.')
     
     # disambiguate arguments
     parser_run.add_argument('-t', '--host', type=files.valid_fasta, default=None,
-                        help='Execute pipeline locally without a dispatching executor.')
+                        help='Full path to host genome for disambiguate to use or short name genome alias.')
     
     parser_run.add_argument('-p', '--pathogen', type=files.valid_fasta, default=None,
-                        help='Execute pipeline locally without a dispatching executor.')
+                        help='Full path to pathogen/graft/parasite genome for disambiguate to use or short name genome alias.')
 
     parser_cache = sub_parsers.add_parser('cache')
     parser_cache.add_argument('cachedir', metavar='<cache directory>', type=cache.valid_dir, 
-                            help='Relative or absolute path to directory for cache storage')
+                            help='Relative or absolute path to directory for cache storage.')
     parser_cache.add_argument('-l', '--local', action='store_true',
                             help='Execute pipeline locally without a dispatching executor')
     
     parser_unlock = sub_parsers.add_parser('unlock')
     parser_unlock.add_argument('unlockdir', metavar='<directory to unlock>', type=cache.valid_dir, 
-                            help='Relative or absolute path to directory for cache storage')
+                            help='Full path to directory to unlock.')
 
     parser_run.set_defaults(func = run)
     parser_cache.set_defaults(func = get_cache)

From 76737ebb15c6d0ee1a7175c18fb66c657e76bdeb Mon Sep 17 00:00:00 2001
From: Ryan Routsong <routsongrm@nih.giv>
Date: Tue, 16 Jan 2024 09:24:25 -0700
Subject: [PATCH 9/9] fix: correct readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0ad37e6..70e00bc 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
    
   <h1>weave 🔬</h1>
   
-  **_An awesome metagenomic and metatranscriptomics pipeline_**
+  **_An awesome demultiplexing and quality control pipeline_**
 
-  [![tests](https://github.com/OpenOmics/weave/workflows/tests/badge.svg)](https://github.com/OpenOmics/weave/actions/workflows/main.yaml) [![docs](https://github.com/OpenOmics/weave/workflows/docs/badge.svg)](https://github.com/OpenOmics/weave/actions/workflows/docs.yml) [![GitHub issues](https://img.shields.io/github/issues/OpenOmics/weave?color=brightgreen)](https://github.com/OpenOmics/weave/issues)  [![GitHub license](https://img.shields.io/github/license/OpenOmics/weave)](https://github.com/OpenOmics/weave/blob/main/LICENSE) 
+  [![tests](https://github.com/OpenOmics/weave/workflows/tests/badge.svg)](https://github.com/OpenOmics/weave/actions/workflows/main.yaml) [![docs](https://github.com/OpenOmics/weave/actions/workflows/doc.yml/badge.svg)](https://github.com/OpenOmics/weave/actions/workflows/docs.yml) [![GitHub issues](https://img.shields.io/github/issues/OpenOmics/weave?color=brightgreen)](https://github.com/OpenOmics/weave/issues)  [![GitHub license](https://img.shields.io/github/license/OpenOmics/weave)](https://github.com/OpenOmics/weave/blob/main/LICENSE) 
   
   <i>
     This is the home of the pipeline, weave. Its long-term goals: to provide accurate quantification, taxonomic classification, and functional profiling of assembled (bacteria and archaea) metagenomes!