From ba61d2634929cf90f81ced16cd6d71053fabd31f Mon Sep 17 00:00:00 2001 From: Clint Valentine Date: Mon, 9 Sep 2024 12:10:59 -0400 Subject: [PATCH] Ensure `--min-reads` is a hard filter in CallDuplexConsensusReads (#1010) --- .../fulcrumgenomics/alignment/Alignment.scala | 2 +- .../umi/DuplexConsensusCaller.scala | 23 ++++++++++- .../umi/FilterConsensusReads.scala | 5 +-- .../umi/UmiConsensusCaller.scala | 2 +- .../scala/com/fulcrumgenomics/umi/Umis.scala | 25 ++++++----- .../umi/DuplexConsensusCallerTest.scala | 38 +++++++++++++---- .../com/fulcrumgenomics/umi/UmisTest.scala | 41 +++++++++++++++---- 7 files changed, 101 insertions(+), 35 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala b/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala index 16d24a919..201951ef4 100644 --- a/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala +++ b/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala @@ -164,7 +164,7 @@ private[alignment] object CigarStats { */ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] { // Cache whether or not the Cigar is coalesced already (i.e. has no pair of adjacent elements with the same operator) - private lazy val isCoalesced: Boolean = { + private lazy val isCoalesced: Boolean = elems.length == 1 || { var itIs = true var index = 0 while (index < elems.length-1 && itIs) { diff --git a/src/main/scala/com/fulcrumgenomics/umi/DuplexConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/DuplexConsensusCaller.scala index 8cb7195fb..cea231c98 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/DuplexConsensusCaller.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/DuplexConsensusCaller.scala @@ -194,7 +194,16 @@ class DuplexConsensusCaller(override val readNamePrefix: String, val y = groups.lift(1).getOrElse(Seq.empty) if (hasMinimumNumberOfReads(x, y)) { - callDuplexConsensusRead(x, y) + val consensus = callDuplexConsensusRead(x, y) + if (consensus.forall(duplexHasMinimumNumberOfReads)) { + consensus + } else { + // Even though we had enough reads going into consensus calling, we can + // lose some in the process (e.g. to CIGAR filtering), and end up with + // consensus reads that will than fail the min-reads check + rejectRecords(groups.flatten, FilterMinReads) + Nil + } } else { rejectRecords(groups.flatten, FilterMinReads) @@ -204,7 +213,7 @@ class DuplexConsensusCaller(override val readNamePrefix: String, } - /** Returns true if there enough reads according to the minReads option. */ + /** Returns true if there are enough reads according to the minReads option. */ private def hasMinimumNumberOfReads(x: Seq[SamRecord], y: Seq[SamRecord]): Boolean = { // Get the number of reads per strand, in decreasing (more stringent) order. val (numXy: Int, numYx: Int) = { @@ -215,6 +224,16 @@ class DuplexConsensusCaller(override val readNamePrefix: String, this.minTotalReads <= numXy + numYx && this.minXyReads <= numXy && this.minYxReads <= numYx } + /** Returns true if there are enough reads composing the consensus according to the minReads option. */ + private def duplexHasMinimumNumberOfReads(consensus: SamRecord): Boolean = { + val (numXy: Int, numYx: Int) = { + val numAb = consensus[Int](ConsensusTags.PerRead.AbRawReadCount) + val numBa = consensus[Int](ConsensusTags.PerRead.BaRawReadCount) + if (numAb >= numBa) (numAb, numBa) else (numBa, numAb) + } + this.minTotalReads <= numXy + numYx && this.minXyReads <= numXy && this.minYxReads <= numYx + } + private def areAllSameStrand(reads: Seq[SamRecord]): Boolean = { if (reads.nonEmpty) { val ss1Flag = reads.head.negativeStrand diff --git a/src/main/scala/com/fulcrumgenomics/umi/FilterConsensusReads.scala b/src/main/scala/com/fulcrumgenomics/umi/FilterConsensusReads.scala index b147036f2..2385e6cc7 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/FilterConsensusReads.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/FilterConsensusReads.scala @@ -250,7 +250,7 @@ class FilterConsensusReads FilterResult(keepRead=false, 0) } else { - val result = if (isDuplexRecord(rec)) filterDuplexConsensusRead(rec) else filterVanillaConsensusRead(rec) + val result = if (Umis.isFgbioDuplexConsensus(rec)) filterDuplexConsensusRead(rec) else filterVanillaConsensusRead(rec) result.copy(keepRead = result.keepRead && fractionNoCalls(rec) <= this.maxNoCallFraction) } } @@ -329,9 +329,6 @@ class FilterConsensusReads } } - /** Quick check to see if a record is a duplex record. */ - private def isDuplexRecord(rec: SamRecord): Boolean = rec.get(ConsensusTags.PerRead.AbRawReadCount).isDefined - /** Reverses all the consensus tags if the right conditions are set. */ private def reverseConsensusTagsIfNeeded(rec: SamRecord): Unit = if (reversePerBaseTags && rec.negativeStrand) { ConsensusTags.PerBase.TagsToReverse.foreach { tag => diff --git a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala index 55e51eb24..cc2d38e93 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/UmiConsensusCaller.scala @@ -369,7 +369,7 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] { private def simplifyCigar(cigar: Cigar) = { import CigarOperator._ if (cigar.forall(e => e.operator == M || e.operator == I || e.operator == D)) { - cigar + cigar.coalesce } else { val newElems = cigar.elems.map { diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index ed5512567..4a7c27790 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -26,7 +26,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.SamRecord -import com.fulcrumgenomics.umi.ConsensusTags +import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{RawReadCount, AbRawReadCount, BaRawReadCount} import com.fulcrumgenomics.util.Sequences object Umis { @@ -129,14 +129,17 @@ object Umis { ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-' } - /** Returns True if the record appears to be a consensus read, - * typically produced by fgbio's CallMolecularConsensusReads or CallDuplexConsensusReads. - * - * @param rec the record to check - * @return boolean indicating if the record is a consensus or not - */ - def isFgbioStyleConsensus(rec: SamRecord): Boolean = { - rec.contains(ConsensusTags.PerRead.RawReadCount) || - (rec.contains(ConsensusTags.PerRead.AbRawReadCount) && rec.contains(ConsensusTags.PerRead.BaRawReadCount)) - } + /** Returns True if the record appears to be a consensus sequence typically produced by fgbio's + * CallMolecularConsensusReads or CallDuplexConsensusReads. + * + * @param rec the record to check + * @return boolean indicating if the record is a consensus or not + */ + def isFgbioStyleConsensus(rec: SamRecord): Boolean = isFgbioSimplexConsensus(rec) || isFgbioDuplexConsensus(rec) + + /** Returns true if the record appears to be a simplex consensus sequence. */ + def isFgbioSimplexConsensus(rec: SamRecord): Boolean = rec.contains(RawReadCount) && !isFgbioDuplexConsensus(rec) + + /** Returns true if the record appears to be a duplex consensus sequence. */ + def isFgbioDuplexConsensus(rec: SamRecord): Boolean = rec.contains(AbRawReadCount) && rec.contains(BaRawReadCount) } diff --git a/src/test/scala/com/fulcrumgenomics/umi/DuplexConsensusCallerTest.scala b/src/test/scala/com/fulcrumgenomics/umi/DuplexConsensusCallerTest.scala index a22ca6d24..30039eafe 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/DuplexConsensusCallerTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/DuplexConsensusCallerTest.scala @@ -160,7 +160,7 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues { } /** A builder where we have one duplex, with five total reads, three on one strand and two on the other. */ - private val builderForMinReads: SamBuilder = { + private def builderForMinReads(): SamBuilder = { val builder = new SamBuilder(readLength=10, baseQuality=20) builder.addPair(name="q1", start1=100, start2=200, strand1=Plus, strand2=Minus, bases1="AAAAAAAAAA", bases2="CCCCCCCCCC", attrs=Map(MI -> "foo/A")) builder.addPair(name="q2", start1=100, start2=200, strand1=Plus, strand2=Minus, bases1="AAAAAAAAAA", bases2="CCCCCCCCCC", attrs=Map(MI -> "foo/A")) @@ -171,18 +171,42 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues { } it should "support the --min-reads option when one value is provided" in { - caller(minReads=Seq(3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty - caller(minReads=Seq(2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2 + val builder = builderForMinReads() + caller(minReads=Seq(3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty + caller(minReads=Seq(2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2 } it should "support the --min-reads option when two values are provided" in { - caller(minReads=Seq(5, 3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty - caller(minReads=Seq(5, 2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2 + val builder = builderForMinReads() + caller(minReads=Seq(5, 3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty + caller(minReads=Seq(5, 2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2 } it should "support the --min-reads option when three values are provided" in { - caller(minReads=Seq(5, 3, 3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty - caller(minReads=Seq(5, 3, 2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2 + val builder = builderForMinReads() + caller(minReads=Seq(5, 3, 3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty + caller(minReads=Seq(5, 3, 2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2 + } + + it should "support the --min-reads option as a hard filter if some records are removed during consensus calling for not having the most common structural alignment" in { + val builder = builderForMinReads() + val records = builder.toSeq + caller(minReads=Seq(3)).consensusReadsFromSamRecords(records) shouldBe empty + caller(minReads=Seq(2)).consensusReadsFromSamRecords(records) should have size 2 + + // Test for a coalesced and simplified cigar + val similar1 = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "10M") + caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ similar1) should have size 2 // the new read pair is considered in the final consensus + caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ similar1) should have size 2 + + // Test for a non-coalesced but simplified cigar + val similar2 = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "5M5M") + caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ similar2) should have size 2 // the new read pair is considered in the final consensus + caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ similar2) should have size 2 + + val dissimilar = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "5M1D5M") + caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ dissimilar) shouldBe empty // the new read pair is *not* considered in the final consensus + caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ dissimilar) should have size 2 } it should "not saturate the qualities with deep AB and light BA coverage" in { diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index d469aa168..798a82da3 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -27,6 +27,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{RawReadCount, AbRawReadCount, BaRawReadCount} import org.scalatest.OptionValues class UmisTest extends UnitSpec with OptionValues { @@ -112,19 +113,41 @@ class UmisTest extends UnitSpec with OptionValues { an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:CCKC-ACGT")) } - "Umis.isConsensusRead" should "return false for reads without consensus tags" in { + "Umis.isFgbioStyleConsensus" should "return false for reads without consensus tags" in { val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) - builder.addFrag(start=100).exists(Umis.isFgbioStyleConsensus) shouldBe false - builder.addPair(start1=100, start2=100, unmapped2=true).exists(Umis.isFgbioStyleConsensus) shouldBe false + Umis.isFgbioStyleConsensus(builder.addFrag(start=100).value) shouldBe false + val pair = builder.addPair(start1=100, start2=100, unmapped2=true) + pair.length shouldBe 2 + pair.forall(rec => Umis.isFgbioStyleConsensus(rec)) shouldBe false } it should "return true for reads with consensus tags" in { val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) - builder.addFrag(start=10, attrs=Map(ConsensusTags.PerRead.RawReadCount -> 10)) - .exists(Umis.isFgbioStyleConsensus) shouldBe true - builder.addFrag( - start=10, - attrs=Map(ConsensusTags.PerRead.AbRawReadCount -> 10, ConsensusTags.PerRead.BaRawReadCount -> 10) - ).exists(Umis.isFgbioStyleConsensus) shouldBe true + Umis.isFgbioStyleConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe true + Umis.isFgbioStyleConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true + } + + "Umis.isFgbioSimplexConsensus" should "return true for reads with simplex only consensus tags" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) + Umis.isFgbioSimplexConsensus(builder.addFrag(start=100).value) shouldBe false + Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe true + Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe false + Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 20, AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe false + + val pair = builder.addPair(start1=100, start2=100, unmapped2=true) + pair.length shouldBe 2 + pair.forall(rec => Umis.isFgbioSimplexConsensus(rec)) shouldBe false + } + + "Umis.isFgbioDuplexConsensus" should "return true for reads with duplex only consensus tags" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) + Umis.isFgbioDuplexConsensus(builder.addFrag(start=100).value) shouldBe false + Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe false + Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true + Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 20, AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true + + val pair = builder.addPair(start1=100, start2=100, unmapped2=true) + pair.length shouldBe 2 + pair.forall(rec => Umis.isFgbioDuplexConsensus(rec)) shouldBe false } }