From d176365f3a17418c795abef83ec3b1c5775f4c80 Mon Sep 17 00:00:00 2001 From: Chris Tomkins-Tinch Date: Mon, 10 Jun 2024 12:59:44 -0400 Subject: [PATCH] index bam before calling samtools idxstats; warn user if input lacks reads (#102) * index bam before calling samtools idxstats; warn user if input lacks reads in `read_utils.py::minimap2_idxstats()`, warn the user if the input bam file lacks reads, and index the post-alignment post-filtering bam before calling `samtools idxstats` so `minimap2_idxstats()` succeeds if the input bam lacks reads. The minimap2 wrapper, `tools/minimap2.py::Minimap2::align_bam()`, tolerates empty input and indexes bam/cram output, but the post-filtering final bam in minimap2_idxstats was not being indexed, which led to a problematic exit-on-failure condition in a WDL command invoking the function. Also fix import of the InvalidBamHeaderError class used in several tool modules (add it to errors.py, and import the error subclasses where needed). This also runs minimap2 alignment when a read group lacks reads, since the newer version of minimap2 seems to better tolerate empty RGs. This allows samtools idxstats to yield zeros across the board for all input sequences when no reads are present, rather than only emitting the catchall "*" (which can cause issues downstream where metrics are expected) * add pandas to deps * attempt to fix coverage reporting due to breaking changes in coveralls reporter see: https://github.com/coverallsapp/coverage-reporter/issues/124 https://github.com/coverallsapp/github-action/issues/205 additional refs: https://github.com/coverallsapp/github-action?tab=readme-ov-file https://github.com/coverallsapp/coverage-reporter#supported-coverage-report-formats --- errors.py | 6 +++++- read_utils.py | 7 +++++++ tools/bwa.py | 1 + tools/minimap2.py | 7 ++++++- tools/novoalign.py | 19 +++++++------------ 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/errors.py b/errors.py index f9c81aed..4d369652 100644 --- a/errors.py +++ b/errors.py @@ -4,4 +4,8 @@ class QCError(RuntimeError): '''Indicates a failure at a QC step.''' def __init__(self, reason): - super(QCError, self).__init__(reason) \ No newline at end of file + super(QCError, self).__init__(reason) + +class InvalidBamHeaderError(ValueError): + '''Indicates a malformed or otherwise unusable bam file header''' + pass \ No newline at end of file diff --git a/read_utils.py b/read_utils.py index 7fc3cd86..5d5d2f19 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1388,6 +1388,9 @@ def minimap2_idxstats(inBam, refFasta, outBam=None, outStats=None, ref_indexed = util.file.mkstempfname('.reference.fasta') shutil.copyfile(refFasta, ref_indexed) + if samtools.isEmpty(inBam): + log.warning("The input bam file appears to have zero reads: %s", inBam) + mm2.align_bam(inBam, ref_indexed, bam_aligned) if filterReadsAfterAlignment: @@ -1401,6 +1404,10 @@ def minimap2_idxstats(inBam, refFasta, outBam=None, outStats=None, shutil.move(bam_aligned, bam_filtered) if outStats is not None: + # index the final bam before calling idxstats + # but only if it is a bam or cram file (sam cannot be indexed) + if (bam_filtered.endswith(".bam") or bam_filtered.endswith(".cram")): + samtools.index(bam_filtered) samtools.idxstats(bam_filtered, outStats) if outBam is None: diff --git a/tools/bwa.py b/tools/bwa.py index 09a9d180..799ace8d 100644 --- a/tools/bwa.py +++ b/tools/bwa.py @@ -16,6 +16,7 @@ import tools.picard import util.file import util.misc +from errors import * TOOL_NAME = 'bwa' diff --git a/tools/minimap2.py b/tools/minimap2.py index 0162caa3..e20f0e63 100644 --- a/tools/minimap2.py +++ b/tools/minimap2.py @@ -14,6 +14,7 @@ import tools.picard import util.file import util.misc +from errors import * TOOL_NAME = 'minimap2' @@ -80,6 +81,7 @@ def align_bam(self, inBam, refDb, outBam, options=None, log.warning("No alignment output for RG %s in file %s against %s", rg, inBam, refDb) if len(align_bams)==0: + log.warning("All read groups in file %s appear to be empty.", inBam) with util.file.tempfname('.empty.sam') as empty_sam: samtools.dumpHeader(inBam, empty_sam) samtools.sort(empty_sam, outBam) @@ -92,6 +94,7 @@ def align_bam(self, inBam, refDb, outBam, options=None, picardOptions=picardOptions, JVMmemory=JVMmemory ) + for bam in align_bams: os.unlink(bam) @@ -181,8 +184,10 @@ def align_one_rg(self, inBam, refDb, outBam, rgid=None, preset=None, options=Non # perform actual alignment if samtools.isEmpty(one_rg_inBam): + log.warning("Input file %s appears to lack reads for RG '%s'", inBam, rgid) # minimap doesn't like empty inputs, so copy empty bam through - samtools.sort(one_rg_inBam, outBam) + # samtools.sort(one_rg_inBam, outBam) + self.align_cmd(one_rg_inBam, refDb, outBam, options=options, threads=threads) else: self.align_cmd(one_rg_inBam, refDb, outBam, options=options, threads=threads) diff --git a/tools/novoalign.py b/tools/novoalign.py index 60e91571..80d7ec9e 100644 --- a/tools/novoalign.py +++ b/tools/novoalign.py @@ -7,12 +7,6 @@ either in $PATH or $NOVOALIGN_PATH. ''' -import tools -import tools.picard -import tools.samtools -import util.file -import util.misc - import logging import os import os.path @@ -21,11 +15,16 @@ import stat import sys +import tools +import tools.picard +import tools.samtools +import util.file +import util.misc +from errors import * + _log = logging.getLogger(__name__) TOOL_NAME = "novoalign" -#TOOL_VERSION = "3.09.04" - class NovoalignTool(tools.Tool): @@ -217,7 +216,3 @@ def index_fasta(self, refFasta, k=None, s=None): os.chmod(outfname, mode) except (IOError, OSError): _log.warning('could not chmod "%s", this is likely OK', refFasta) - - -class InvalidBamHeaderError(ValueError): - pass