Allow the PairedAssigner to use UMI pairs where one is absent (#954)

* Allow the PairedAssigner to use UMI pairs where one is absent * Let the DuplexConsensusCaller support UMI pairs with one absent * Fallback to fast string splitting
fulcrumgenomics · Jan 3, 2024 · 97a9e06 · 97a9e06
1 parent 9c475fd
commit 97a9e06
Show file tree

Hide file tree

Showing 8 changed files with 128 additions and 17 deletions.
diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala
@@ -389,8 +389,8 @@ class CollectDuplexSeqMetrics
     val umi1s = IndexedSeq.newBuilder[String] // ab UMI 1 (and ba UMI 2) sequences
     val umi2s = IndexedSeq.newBuilder[String] // ab UMI 2 (and ba UMI 1) sequences
 
-    ab.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
-    ba.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }
+    ab.iterator.map(r => r[String](this.umiTag).split("-", -1)).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
+    ba.iterator.map(r => r[String](this.umiTag).split("-", -1)).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }
 
     val Seq(abConsensusUmi, baConsensusUmi) = Seq(umi1s, umi2s).map(_.result()).map{ umis =>
       val consensus = this.consensusBuilder.callConsensus(umis)
@@ -530,7 +530,7 @@ class CollectDuplexSeqMetrics
     val uniqueTotal      = metrics.map(_.unique_observations).sum.toDouble
 
     metrics.foreach { m =>
-      val Array(umi1, umi2)                   = m.umi.split('-')
+      val Array(umi1, umi2)                   = m.umi.split("-", -1)
       m.fraction_raw_observations             = m.raw_observations / rawTotal
       m.fraction_unique_observations          = m.unique_observations / uniqueTotal
       m.fraction_unique_observations_expected = singleUmiMetrics(umi1).fraction_unique_observations * singleUmiMetrics(umi2).fraction_unique_observations

diff --git a/src/main/scala/com/fulcrumgenomics/umi/CorrectUmis.scala b/src/main/scala/com/fulcrumgenomics/umi/CorrectUmis.scala
@@ -185,7 +185,7 @@ class CorrectUmis
           missingUmisRecords += 1
           rejectOut.foreach(w => w += rec)
         case Some(umi: String) =>
-          val sequences = umi.split('-')
+          val sequences = umi.split("-", -1)
           if (sequences.exists(_.length != umiLength)) {
             if (wrongLengthRecords == 0) {
               logger.warning(s"Read (${rec.name}) detected with unexpected length UMI(s): ${sequences.mkString(" ")}.")

diff --git a/src/main/scala/com/fulcrumgenomics/umi/DuplexConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/DuplexConsensusCaller.scala
@@ -25,7 +25,6 @@
 package com.fulcrumgenomics.umi
 
 import java.lang.Math.min
-
 import com.fulcrumgenomics.FgBioDef._
 import com.fulcrumgenomics.bam.api.SamRecord
 import com.fulcrumgenomics.commons.util.LazyLogging
@@ -324,7 +323,7 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
     // UMI bases are present, `None` otherwise.
     reads.flatMap(_.sam).flatMap { rec =>
       rec.get[String](ConsensusTags.UmiBases).map { umi =>
-        if (rec.firstOfPair == firstOfPair) umi else umi.split('-').reverse.mkString("-")
+        if (rec.firstOfPair == firstOfPair) umi else umi.split("-", -1).reverse.mkString("-")
       }
     }
   }

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -42,8 +42,8 @@ import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy
 import htsjdk.samtools._
 import htsjdk.samtools.util.SequenceUtil
 
+import java.util.concurrent.Executors
 import java.util.concurrent.atomic.AtomicLong
-import java.util.concurrent.{Executors, ForkJoinPool}
 import scala.collection.immutable.IndexedSeq
 import scala.collection.mutable.ListBuffer
 import scala.collection.parallel.ExecutionContextTaskSupport
@@ -529,14 +529,14 @@ object Strategy extends FgBioEnum[Strategy] {
     |                  reads per UMI, but breaks down at very high coverage of UMIs.
     |3. **adjacency**: a version of the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755)
     |                  that allows for errors between UMIs but only when there is a count gradient.
-    |4. **paired**:    similar to adjacency but for methods that produce template with a pair of UMIs
-    |                  such that a read with A-B is related to but not identical to a read with B-A.
-    |                  Expects the pair of UMIs to be stored in a single tag, separated by a hyphen
-    |                  (e.g. `ACGT-CCGG`).  The molecular IDs produced have more structure than for single
-    |                  UMI strategies, and are of the form `{base}/{AB|BA}`. E.g. two UMI pairs would be
-    |                  mapped as follows AAAA-GGGG -> 1/AB, GGGG-AAAA -> 1/BA.
+    |4. **paired**:    similar to adjacency but for methods that produce template such that a read with A-B is related
+    |                  to but not identical to a read with B-A. Expects the UMI sequences to be stored in a single SAM
+    |                  tag separated by a hyphen (e.g. `ACGT-CCGG`) and allows for one of the two UMIs to be absent
+    |                  (e.g. `ACGT-` or `-ACGT`). The molecular IDs produced have more structure than for single
+    |                  UMI strategies and are of the form `{base}/{A|B}`. E.g. two UMI pairs would be mapped as
+    |                  follows AAAA-GGGG -> 1/A, GGGG-AAAA -> 1/B.
     |
-    |`edit`, `adjacency` and `paired` make use of the `--edits` parameter to control the matching of
+    |Strategies `edit`, `adjacency`, and `paired` make use of the `--edits` parameter to control the matching of
     |non-identical UMIs.
     |
     |By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI
@@ -831,8 +831,8 @@ class GroupReadsByUmi
         val pos1 = if (r1.positiveStrand) r1.unclippedStart else r1.unclippedEnd
         val pos2 = if (r2.positiveStrand) r2.unclippedStart else r2.unclippedEnd
         val r1Lower = r1.refIndex < r2.refIndex || (r1.refIndex == r2.refIndex && (pos1 < pos2 || (pos1 == pos2 && r1.positiveStrand)))
-        val umis = umi.split('-')
-        require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments: $umi")
+        val umis = umi.split("-", -1) // Split and ensure we return empty strings for missing UMIs.
+        require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments delimited by a '-': $umi")
 
         if (r1Lower) paired.lowerReadUmiPrefix  + ":" + umis(0) + "-" + paired.higherReadUmiPrefix + ":" + umis(1)
         else         paired.higherReadUmiPrefix + ":" + umis(0) + "-" + paired.lowerReadUmiPrefix  + ":" + umis(1)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/SimpleConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/SimpleConsensusCaller.scala
@@ -60,7 +60,10 @@ private[umi] class SimpleConsensusCaller(val errorRatePreLabeling: Byte = 90.toB
     require(sequences.nonEmpty, "Can't call consensus on an empty set of sequences!")
 
     if (sequences.length == 1) sequences.head else {
-      require(sequences.forall(_.length == sequences.head.length), "Sequences must all have the same length")
+      require(
+        sequences.forall(_.length == sequences.head.length),
+        s"Sequences must all have the same length. Found ${sequences.mkString(", ")}"
+      )
       val buffer = new StringBuilder
       val firstRead  = sequences.head
       val readLength = firstRead.length

diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala
@@ -343,4 +343,41 @@ class CollectDuplexSeqMetricsTest extends UnitSpec {
     metrics.find(_.umi == "AAA-TTT") shouldBe defined
     metrics.find(_.umi == "AAA-TTT").foreach(m => m.unique_observations shouldBe 1)
   }
+
+  it should "count UMIs even if one of the UMIs on a 'molecule end' is an empty sequence" in collector(duplex = true).foreach { c =>
+    val builder = new SamBuilder(readLength = 10)
+    builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-CCC", MI -> "1/A"))
+    builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "CCC-", MI -> "1/B"))
+    builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "AAA-", MI -> "2/A"))
+    builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-AAA", MI -> "2/B"))
+    builder.addPair(start1 = 300, start2 = 400, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-GGG", MI -> "3/A"))
+    builder.addPair(start1 = 900, start2 = 800, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "TTT-", MI -> "4/A"))
+    builder.addPair(start1 = 300, start2 = 400, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-CGC", MI -> "5/A"))
+    builder.addPair(start1 = 900, start2 = 800, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "TAT-", MI -> "6/A"))
+
+    builder.toSeq.groupBy(r => r[String](MI).takeWhile(_ != '/')).values
+      .map(rs => rs.groupBy(r => r[String](MI)).values.toSeq).toSeq
+      .foreach(group => c.updateUmiMetrics(group))
+
+    val metrics = c.duplexUmiMetrics(c.umiMetrics)
+    metrics should have size 6
+
+    metrics.find(_.umi == "-CCC") shouldBe defined
+    metrics.find(_.umi == "-CCC").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "AAA-") shouldBe defined
+    metrics.find(_.umi == "AAA-").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "-GGG") shouldBe defined
+    metrics.find(_.umi == "-GGG").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "TTT-") shouldBe defined
+    metrics.find(_.umi == "TTT-").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "CGC-") shouldBe defined
+    metrics.find(_.umi == "CGC-").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "-TAT") shouldBe defined
+    metrics.find(_.umi == "-TAT").foreach(m => m.unique_observations shouldBe 1)
+  }
 }
diff --git a/src/test/scala/com/fulcrumgenomics/umi/DuplexConsensusCallerTest.scala b/src/test/scala/com/fulcrumgenomics/umi/DuplexConsensusCallerTest.scala
@@ -88,6 +88,29 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues {
     r2.quals(0) > 30 shouldBe true
   }
 
+  Seq(
+    Seq("right", "ACT-", "-ACT"),
+    Seq("left", "-ACT", "ACT-"),
+  ).foreach { case Seq(orientation, leftUmi, rightUmi) =>
+    it should s"create a simple double stranded consensus for a pair of A and a pair of B reads in which the $orientation UMI is absent" in {
+      val builder = new SamBuilder(readLength = 10, baseQuality = 20)
+      builder.addPair(name = "q1", start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, bases1 = "AAAAAAAAAA", bases2 = "CCCCCCCCCC", attrs = Map(MI -> "foo/A", RX -> leftUmi))
+      builder.addPair(name = "q2", start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, bases1 = "CCCCCCCCCC", bases2 = "AAAAAAAAAA", attrs = Map(MI -> "foo/B", RX -> rightUmi))
+
+      val recs = c.consensusReadsFromSamRecords(builder.toSeq)
+      recs should have size 2
+      val r1 = recs.find(_.firstOfPair).getOrElse(fail("No first of pair."))
+      val r2 = recs.find(_.secondOfPair).getOrElse(fail("No second of pair."))
+      r1.basesString shouldBe "AAAAAAAAAA"
+      r2.basesString shouldBe "GGGGGGGGGG" // after un-revcomping
+      r1.quals(0) > 30 shouldBe true
+      r2.quals(0) > 30 shouldBe true
+      // Both R1 and R2 inherit the RX pattern of the "A" family which is the left-oriented read pair in this test.
+      r1.get[String](RX) shouldBe Some(leftUmi)
+      r2.get[String](RX) shouldBe Some(leftUmi)
+    }
+  }
+
   Seq(true).foreach { allowSingleStrand =>
     val extraName: String = if (allowSingleStrand) "" else "not "
     Seq("A", "B").foreach { duplexStrand =>

diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -200,6 +200,24 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
   }
 
+  it should "correctly assign a/b for paired UMI prefixes when the UMI for one end of the source molecule is absent" in {
+    val tool = new GroupReadsByUmi(rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 0, allowInterContig = true)
+    val builder = new SamBuilder(readLength = 100)
+    val templates = Seq(
+      // This should be a::AAA-b:: since contig1 is lower
+      builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
+      // This should be b::AAA-a:: since contig2 is lower
+      builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
+      // This should be a::-b::TTT since contig1 is lower
+      builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
+      // This should be b::-a::TTT since contig2 is lower
+      builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
+    ).map { pair => Template(r1 = pair.headOption, r2 = pair.lastOption) }
+    val expected = n("a::AAA-b::", 1) ++ n("b::AAA-a::", 1) ++  n("a::-b::TTT", 1) ++ n("b::-a::TTT", 1)
+    val umiForRead = PrivateMethod[String](Symbol("umiForRead"))
+    templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
+  }
+
   "GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in {
     val builder = new SamBuilder(readLength=100, sort=None)
     val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S")
@@ -420,6 +438,37 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     an[Exception] should be thrownBy tool.execute()
   }
 
+  Seq(
+    Seq("right", "ACT-", "-ACT"),
+    Seq("left", "-ACT", "ACT-"),
+  ).foreach { case Seq(orientation, leftUmi, rightUmi) =>
+    it should s"succeed when run in paired mode but the $orientation end of the source molecule does not have a UMI" in {
+      val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+      builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
+      builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
+      builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
+      builder.addPair(name = "a04", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
+
+      builder.addPair(name = "b01", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
+      builder.addPair(name = "b02", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
+      builder.addPair(name = "b03", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
+      builder.addPair(name = "b04", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
+
+      val in  = builder.toTempFile()
+      val out = Files.createTempFile("umi_grouped.", ".sam")
+      new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 1).execute()
+
+      val recs = readBamRecs(out)
+      val aIds = recs.filter(_.name.startsWith("a")).map(r => r[String]("MI")).distinct
+      val bIds = recs.filter(_.name.startsWith("b")).map(r => r[String]("MI")).distinct
+
+      aIds should have size 1
+      bIds should have size 1
+      aIds.head shouldBe "0/A"
+      bIds.head shouldBe "0/B"
+    }
+  }
+
   it should "fail when the raw tag is not present" in {
     val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
     builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus)