Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: raise exception if CollectDuplexSeqMetrics run on consensus BAM #1003

Merged
merged 13 commits into from
Jul 31, 2024
1 change: 1 addition & 0 deletions src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ package com.fulcrumgenomics.bam.api

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.alignment.Cigar
import com.fulcrumgenomics.umi.ConsensusTags.PerRead.AllPerReadTags
znorgaard marked this conversation as resolved.
Show resolved Hide resolved
import htsjdk.samtools
import htsjdk.samtools.SamPairUtil.PairOrientation
import htsjdk.samtools._
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,11 @@ class CollectDuplexSeqMetrics
// Build the iterator we'll use based on whether or not we're restricting to a set of intervals
val in = SamSource(input)
val _filteredIterator = in.iterator.filter(r => r.paired && r.mapped && r.mateMapped && r.firstOfPair && !r.secondary && !r.supplementary)
.tapEach {
r =>
if (Umis.isConsensusRead(r)) throw new IllegalArgumentException("Found consensus record. Expected UMI-grouped BAM")
else r
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A couple of thoughts here:

  1. Is it necessary to check each and every read? Or could you add .bufferBetter to the construction of _filteredIterator and then just check the first element?
  2. I think this is a case where a much more comprehensive error message would be useful, something like:
Input BAM file to CollectDuplexSeqMetrics ($input) appears to contain consensus reads.
CollectDuplexSeqMetrics cannot run on consensus BAMs, and instead requires the UMI-grouped BAM prior to consensus calling.  The UMI-grouped BAM is the output of running
GroupReadsByUmi.

First record in $input has consensus tags present:
${_filteredIterator.peek()}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the first point, I think it depends on what we're trying to guard against. If we're only guarding against a well formed consensus BAM being input then I think checking the first record should work.

If we're guarding against all the chaos a user may concoct, then this will not be sufficient. But, I think guarding against all those possibilities is probably unnecessary and unrealistic.

val iterator = intervals match {
case None => _filteredIterator
case Some(path) =>
Expand Down
11 changes: 11 additions & 0 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount
import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{BaRawReadCount, RawReadCount}
znorgaard marked this conversation as resolved.
Show resolved Hide resolved
import com.fulcrumgenomics.util.Sequences

object Umis {
Expand Down Expand Up @@ -127,4 +129,13 @@ object Umis {
@inline private def isValidUmiCharacter(ch: Char): Boolean = {
ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-'
}

/** Tests if a record is a consensus or not
znorgaard marked this conversation as resolved.
Show resolved Hide resolved
*
* @param rec the record to test
znorgaard marked this conversation as resolved.
Show resolved Hide resolved
* @return boolean indicating if the record is a consensus or not
*/
def isConsensusRead(rec: SamRecord): Boolean = {
znorgaard marked this conversation as resolved.
Show resolved Hide resolved
rec.contains(RawReadCount) | (rec.contains(AbRawReadCount) & rec.contains(BaRawReadCount))
znorgaard marked this conversation as resolved.
Show resolved Hide resolved
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ package com.fulcrumgenomics.umi

import java.nio.file.{Path, Paths}
import java.util.Random

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.SamOrder
import com.fulcrumgenomics.commons.util.SimpleCounter
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount
import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{AllPerReadTags, BaRawReadCount}
import com.fulcrumgenomics.util.{Io, Metric, Rscript}
import htsjdk.samtools.util.{Interval, IntervalList}
import org.apache.commons.math3.distribution.NormalDistribution
Expand All @@ -41,6 +42,7 @@ import scala.math.{max, min}
class CollectDuplexSeqMetricsTest extends UnitSpec {
private val MI = "MI"
private val RX = "RX"
private val aD = "aD"

// Case class to hold pointers to all the outputs of CollectDuplexSeqMetrics
private case class Outputs(families: Path, duplexFamilies: Path, umis: Path, duplexUmis: Path, yields: Path, plots: Path) {
Expand Down Expand Up @@ -312,6 +314,17 @@ class CollectDuplexSeqMetricsTest extends UnitSpec {
duplexFamilies.find(f => f.ab_size == 6 && f.ba_size == 0).get.count shouldBe 1
}

it should "raise an exception if duplex consensus records are provided" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.TemplateCoordinate))
builder.addPair(
contig=1,
start1=1000,
start2=1100,
attrs=Map(RX -> "AAA-GGG", MI -> "1/A", AbRawReadCount -> 10, BaRawReadCount -> 10)
)
an[IllegalArgumentException] shouldBe thrownBy { exec(builder) }
}

"CollectDuplexSeqMetrics.updateUmiMetrics" should "not count duplex umis" in collector(duplex=false).foreach { c =>
val builder = new SamBuilder(readLength=10)
builder.addPair(start1=100, start2=200, attrs=Map(RX -> "AAA-CCC", MI -> "1/A"))
Expand Down
17 changes: 16 additions & 1 deletion src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@

package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.umi.ConsensusTags.PerBase.AbRawReadCount
import com.fulcrumgenomics.umi.ConsensusTags.PerRead.{BaRawReadCount, RawReadCount}
import org.scalatest.OptionValues

class UmisTest extends UnitSpec with OptionValues {
Expand Down Expand Up @@ -111,4 +113,17 @@ class UmisTest extends UnitSpec with OptionValues {
an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:ACGT-CCKC"))
an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:CCKC-ACGT"))
}

"Umis.isConsensusRead" should "return false for reads without consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
builder.addFrag(start=100).exists(Umis.isConsensusRead) shouldBe false
builder.addPair(start1=100, start2=100, unmapped2=true).exists(Umis.isConsensusRead) shouldBe false
}

it should "return true for reads with consensus tags" in {
val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20)
builder.addFrag(start=10, attrs=Map(RawReadCount -> 10)).exists(Umis.isConsensusRead) shouldBe true
builder.addFrag(start=10, attrs=Map(AbRawReadCount -> 10, BaRawReadCount -> 10))
.exists(Umis.isConsensusRead) shouldBe true
}
}
Loading