Skip to content

Commit

Permalink
Ensure --min-reads is a hard filter in CallDuplexConsensusReads (#1010
Browse files Browse the repository at this point in the history
)
  • Loading branch information
clintval authored Sep 9, 2024
1 parent 6ca7733 commit ba61d26
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ private[alignment] object CigarStats {
*/
case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] {
// Cache whether or not the Cigar is coalesced already (i.e. has no pair of adjacent elements with the same operator)
private lazy val isCoalesced: Boolean = {
private lazy val isCoalesced: Boolean = elems.length == 1 || {
var itIs = true
var index = 0
while (index < elems.length-1 && itIs) {
Expand Down
23 changes: 21 additions & 2 deletions src/main/scala/com/fulcrumgenomics/umi/DuplexConsensusCaller.scala
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,16 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
val y = groups.lift(1).getOrElse(Seq.empty)

if (hasMinimumNumberOfReads(x, y)) {
callDuplexConsensusRead(x, y)
val consensus = callDuplexConsensusRead(x, y)
if (consensus.forall(duplexHasMinimumNumberOfReads)) {
consensus
} else {
// Even though we had enough reads going into consensus calling, we can
// lose some in the process (e.g. to CIGAR filtering), and end up with
// consensus reads that will than fail the min-reads check
rejectRecords(groups.flatten, FilterMinReads)
Nil
}
}
else {
rejectRecords(groups.flatten, FilterMinReads)
Expand All @@ -204,7 +213,7 @@ class DuplexConsensusCaller(override val readNamePrefix: String,

}

/** Returns true if there enough reads according to the minReads option. */
/** Returns true if there are enough reads according to the minReads option. */
private def hasMinimumNumberOfReads(x: Seq[SamRecord], y: Seq[SamRecord]): Boolean = {
// Get the number of reads per strand, in decreasing (more stringent) order.
val (numXy: Int, numYx: Int) = {
Expand All @@ -215,6 +224,16 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
this.minTotalReads <= numXy + numYx && this.minXyReads <= numXy && this.minYxReads <= numYx
}

/** Returns true if there are enough reads composing the consensus according to the minReads option. */
private def duplexHasMinimumNumberOfReads(consensus: SamRecord): Boolean = {
val (numXy: Int, numYx: Int) = {
val numAb = consensus[Int](ConsensusTags.PerRead.AbRawReadCount)
val numBa = consensus[Int](ConsensusTags.PerRead.BaRawReadCount)
if (numAb >= numBa) (numAb, numBa) else (numBa, numAb)
}
this.minTotalReads <= numXy + numYx && this.minXyReads <= numXy && this.minYxReads <= numYx
}

private def areAllSameStrand(reads: Seq[SamRecord]): Boolean = {
if (reads.nonEmpty) {
val ss1Flag = reads.head.negativeStrand
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ class FilterConsensusReads
FilterResult(keepRead=false, 0)
}
else {
val result = if (isDuplexRecord(rec)) filterDuplexConsensusRead(rec) else filterVanillaConsensusRead(rec)
val result = if (Umis.isFgbioDuplexConsensus(rec)) filterDuplexConsensusRead(rec) else filterVanillaConsensusRead(rec)
result.copy(keepRead = result.keepRead && fractionNoCalls(rec) <= this.maxNoCallFraction)
}
}
Expand Down Expand Up @@ -329,9 +329,6 @@ class FilterConsensusReads
}
}

/** Quick check to see if a record is a duplex record. */
private def isDuplexRecord(rec: SamRecord): Boolean = rec.get(ConsensusTags.PerRead.AbRawReadCount).isDefined

/** Reverses all the consensus tags if the right conditions are set. */
private def reverseConsensusTagsIfNeeded(rec: SamRecord): Unit = if (reversePerBaseTags && rec.negativeStrand) {
ConsensusTags.PerBase.TagsToReverse.foreach { tag =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ trait UmiConsensusCaller[ConsensusRead <: SimpleRead] {
private def simplifyCigar(cigar: Cigar) = {
import CigarOperator._
if (cigar.forall(e => e.operator == M || e.operator == I || e.operator == D)) {
cigar
cigar.coalesce
}
else {
val newElems = cigar.elems.map {
Expand Down
25 changes: 14 additions & 11 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.umi.ConsensusTags
import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{RawReadCount, AbRawReadCount, BaRawReadCount}
import com.fulcrumgenomics.util.Sequences

object Umis {
Expand Down Expand Up @@ -129,14 +129,17 @@ object Umis {
ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-'
}

/** Returns True if the record appears to be a consensus read,
* typically produced by fgbio's CallMolecularConsensusReads or CallDuplexConsensusReads.
*
* @param rec the record to check
* @return boolean indicating if the record is a consensus or not
*/
def isFgbioStyleConsensus(rec: SamRecord): Boolean = {
rec.contains(ConsensusTags.PerRead.RawReadCount) ||
(rec.contains(ConsensusTags.PerRead.AbRawReadCount) && rec.contains(ConsensusTags.PerRead.BaRawReadCount))
}
/** Returns True if the record appears to be a consensus sequence typically produced by fgbio's
* CallMolecularConsensusReads or CallDuplexConsensusReads.
*
* @param rec the record to check
* @return boolean indicating if the record is a consensus or not
*/
def isFgbioStyleConsensus(rec: SamRecord): Boolean = isFgbioSimplexConsensus(rec) || isFgbioDuplexConsensus(rec)

/** Returns true if the record appears to be a simplex consensus sequence. */
def isFgbioSimplexConsensus(rec: SamRecord): Boolean = rec.contains(RawReadCount) && !isFgbioDuplexConsensus(rec)

/** Returns true if the record appears to be a duplex consensus sequence. */
def isFgbioDuplexConsensus(rec: SamRecord): Boolean = rec.contains(AbRawReadCount) && rec.contains(BaRawReadCount)
}
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues {
}

/** A builder where we have one duplex, with five total reads, three on one strand and two on the other. */
private val builderForMinReads: SamBuilder = {
private def builderForMinReads(): SamBuilder = {
val builder = new SamBuilder(readLength=10, baseQuality=20)
builder.addPair(name="q1", start1=100, start2=200, strand1=Plus, strand2=Minus, bases1="AAAAAAAAAA", bases2="CCCCCCCCCC", attrs=Map(MI -> "foo/A"))
builder.addPair(name="q2", start1=100, start2=200, strand1=Plus, strand2=Minus, bases1="AAAAAAAAAA", bases2="CCCCCCCCCC", attrs=Map(MI -> "foo/A"))
Expand All @@ -171,18 +171,42 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues {
}

it should "support the --min-reads option when one value is provided" in {
caller(minReads=Seq(3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty
caller(minReads=Seq(2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2
val builder = builderForMinReads()
caller(minReads=Seq(3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty
caller(minReads=Seq(2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2
}

it should "support the --min-reads option when two values are provided" in {
caller(minReads=Seq(5, 3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty
caller(minReads=Seq(5, 2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2
val builder = builderForMinReads()
caller(minReads=Seq(5, 3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty
caller(minReads=Seq(5, 2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2
}

it should "support the --min-reads option when three values are provided" in {
caller(minReads=Seq(5, 3, 3)).consensusReadsFromSamRecords(builderForMinReads.toSeq) shouldBe empty
caller(minReads=Seq(5, 3, 2)).consensusReadsFromSamRecords(builderForMinReads.toSeq) should have size 2
val builder = builderForMinReads()
caller(minReads=Seq(5, 3, 3)).consensusReadsFromSamRecords(builder.toSeq) shouldBe empty
caller(minReads=Seq(5, 3, 2)).consensusReadsFromSamRecords(builder.toSeq) should have size 2
}

it should "support the --min-reads option as a hard filter if some records are removed during consensus calling for not having the most common structural alignment" in {
val builder = builderForMinReads()
val records = builder.toSeq
caller(minReads=Seq(3)).consensusReadsFromSamRecords(records) shouldBe empty
caller(minReads=Seq(2)).consensusReadsFromSamRecords(records) should have size 2

// Test for a coalesced and simplified cigar
val similar1 = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "10M")
caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ similar1) should have size 2 // the new read pair is considered in the final consensus
caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ similar1) should have size 2

// Test for a non-coalesced but simplified cigar
val similar2 = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "5M5M")
caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ similar2) should have size 2 // the new read pair is considered in the final consensus
caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ similar2) should have size 2

val dissimilar = builder.addPair(name="q6", start1=200, start2=100, strand1=Minus, strand2=Plus, bases1="CCCCCCCCCC", bases2="AAAAAAAAAA", attrs=Map(MI -> "foo/B"), cigar1 = "5M1D5M")
caller(minReads=Seq(3)).consensusReadsFromSamRecords(records ++ dissimilar) shouldBe empty // the new read pair is *not* considered in the final consensus
caller(minReads=Seq(2)).consensusReadsFromSamRecords(records ++ dissimilar) should have size 2
}

it should "not saturate the qualities with deep AB and light BA coverage" in {
Expand Down
41 changes: 32 additions & 9 deletions src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{RawReadCount, AbRawReadCount, BaRawReadCount}
import org.scalatest.OptionValues

class UmisTest extends UnitSpec with OptionValues {
Expand Down Expand Up @@ -112,19 +113,41 @@ class UmisTest extends UnitSpec with OptionValues {
an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:CCKC-ACGT"))
}

"Umis.isConsensusRead" should "return false for reads without consensus tags" in {
"Umis.isFgbioStyleConsensus" should "return false for reads without consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
builder.addFrag(start=100).exists(Umis.isFgbioStyleConsensus) shouldBe false
builder.addPair(start1=100, start2=100, unmapped2=true).exists(Umis.isFgbioStyleConsensus) shouldBe false
Umis.isFgbioStyleConsensus(builder.addFrag(start=100).value) shouldBe false
val pair = builder.addPair(start1=100, start2=100, unmapped2=true)
pair.length shouldBe 2
pair.forall(rec => Umis.isFgbioStyleConsensus(rec)) shouldBe false
}

it should "return true for reads with consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
builder.addFrag(start=10, attrs=Map(ConsensusTags.PerRead.RawReadCount -> 10))
.exists(Umis.isFgbioStyleConsensus) shouldBe true
builder.addFrag(
start=10,
attrs=Map(ConsensusTags.PerRead.AbRawReadCount -> 10, ConsensusTags.PerRead.BaRawReadCount -> 10)
).exists(Umis.isFgbioStyleConsensus) shouldBe true
Umis.isFgbioStyleConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe true
Umis.isFgbioStyleConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true
}

"Umis.isFgbioSimplexConsensus" should "return true for reads with simplex only consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
Umis.isFgbioSimplexConsensus(builder.addFrag(start=100).value) shouldBe false
Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe true
Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe false
Umis.isFgbioSimplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 20, AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe false

val pair = builder.addPair(start1=100, start2=100, unmapped2=true)
pair.length shouldBe 2
pair.forall(rec => Umis.isFgbioSimplexConsensus(rec)) shouldBe false
}

"Umis.isFgbioDuplexConsensus" should "return true for reads with duplex only consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
Umis.isFgbioDuplexConsensus(builder.addFrag(start=100).value) shouldBe false
Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).value) shouldBe false
Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true
Umis.isFgbioDuplexConsensus(builder.addFrag(start=10, attrs=Map(RawReadCount -> 20, AbRawReadCount -> 10, BaRawReadCount -> 10)).value) shouldBe true

val pair = builder.addPair(start1=100, start2=100, unmapped2=true)
pair.length shouldBe 2
pair.forall(rec => Umis.isFgbioDuplexConsensus(rec)) shouldBe false
}
}

0 comments on commit ba61d26

Please sign in to comment.