Skip to content

Commit

Permalink
Allow the PairedAssigner to use UMI pairs where one is absent (#954)
Browse files Browse the repository at this point in the history
* Allow the PairedAssigner to use UMI pairs where one is absent

* Let the DuplexConsensusCaller support UMI pairs with one absent

* Fallback to fast string splitting
  • Loading branch information
clintval authored Jan 3, 2024
1 parent 9c475fd commit 97a9e06
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -389,8 +389,8 @@ class CollectDuplexSeqMetrics
val umi1s = IndexedSeq.newBuilder[String] // ab UMI 1 (and ba UMI 2) sequences
val umi2s = IndexedSeq.newBuilder[String] // ab UMI 2 (and ba UMI 1) sequences

ab.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
ba.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }
ab.iterator.map(r => r[String](this.umiTag).split("-", -1)).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
ba.iterator.map(r => r[String](this.umiTag).split("-", -1)).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }

val Seq(abConsensusUmi, baConsensusUmi) = Seq(umi1s, umi2s).map(_.result()).map{ umis =>
val consensus = this.consensusBuilder.callConsensus(umis)
Expand Down Expand Up @@ -530,7 +530,7 @@ class CollectDuplexSeqMetrics
val uniqueTotal = metrics.map(_.unique_observations).sum.toDouble

metrics.foreach { m =>
val Array(umi1, umi2) = m.umi.split('-')
val Array(umi1, umi2) = m.umi.split("-", -1)
m.fraction_raw_observations = m.raw_observations / rawTotal
m.fraction_unique_observations = m.unique_observations / uniqueTotal
m.fraction_unique_observations_expected = singleUmiMetrics(umi1).fraction_unique_observations * singleUmiMetrics(umi2).fraction_unique_observations
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/com/fulcrumgenomics/umi/CorrectUmis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class CorrectUmis
missingUmisRecords += 1
rejectOut.foreach(w => w += rec)
case Some(umi: String) =>
val sequences = umi.split('-')
val sequences = umi.split("-", -1)
if (sequences.exists(_.length != umiLength)) {
if (wrongLengthRecords == 0) {
logger.warning(s"Read (${rec.name}) detected with unexpected length UMI(s): ${sequences.mkString(" ")}.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
package com.fulcrumgenomics.umi

import java.lang.Math.min

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.commons.util.LazyLogging
Expand Down Expand Up @@ -324,7 +323,7 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
// UMI bases are present, `None` otherwise.
reads.flatMap(_.sam).flatMap { rec =>
rec.get[String](ConsensusTags.UmiBases).map { umi =>
if (rec.firstOfPair == firstOfPair) umi else umi.split('-').reverse.mkString("-")
if (rec.firstOfPair == firstOfPair) umi else umi.split("-", -1).reverse.mkString("-")
}
}
}
Expand Down
20 changes: 10 additions & 10 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy
import htsjdk.samtools._
import htsjdk.samtools.util.SequenceUtil

import java.util.concurrent.Executors
import java.util.concurrent.atomic.AtomicLong
import java.util.concurrent.{Executors, ForkJoinPool}
import scala.collection.immutable.IndexedSeq
import scala.collection.mutable.ListBuffer
import scala.collection.parallel.ExecutionContextTaskSupport
Expand Down Expand Up @@ -529,14 +529,14 @@ object Strategy extends FgBioEnum[Strategy] {
| reads per UMI, but breaks down at very high coverage of UMIs.
|3. **adjacency**: a version of the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755)
| that allows for errors between UMIs but only when there is a count gradient.
|4. **paired**: similar to adjacency but for methods that produce template with a pair of UMIs
| such that a read with A-B is related to but not identical to a read with B-A.
| Expects the pair of UMIs to be stored in a single tag, separated by a hyphen
| (e.g. `ACGT-CCGG`). The molecular IDs produced have more structure than for single
| UMI strategies, and are of the form `{base}/{AB|BA}`. E.g. two UMI pairs would be
| mapped as follows AAAA-GGGG -> 1/AB, GGGG-AAAA -> 1/BA.
|4. **paired**: similar to adjacency but for methods that produce template such that a read with A-B is related
| to but not identical to a read with B-A. Expects the UMI sequences to be stored in a single SAM
| tag separated by a hyphen (e.g. `ACGT-CCGG`) and allows for one of the two UMIs to be absent
| (e.g. `ACGT-` or `-ACGT`). The molecular IDs produced have more structure than for single
| UMI strategies and are of the form `{base}/{A|B}`. E.g. two UMI pairs would be mapped as
| follows AAAA-GGGG -> 1/A, GGGG-AAAA -> 1/B.
|
|`edit`, `adjacency` and `paired` make use of the `--edits` parameter to control the matching of
|Strategies `edit`, `adjacency`, and `paired` make use of the `--edits` parameter to control the matching of
|non-identical UMIs.
|
|By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI
Expand Down Expand Up @@ -831,8 +831,8 @@ class GroupReadsByUmi
val pos1 = if (r1.positiveStrand) r1.unclippedStart else r1.unclippedEnd
val pos2 = if (r2.positiveStrand) r2.unclippedStart else r2.unclippedEnd
val r1Lower = r1.refIndex < r2.refIndex || (r1.refIndex == r2.refIndex && (pos1 < pos2 || (pos1 == pos2 && r1.positiveStrand)))
val umis = umi.split('-')
require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments: $umi")
val umis = umi.split("-", -1) // Split and ensure we return empty strings for missing UMIs.
require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments delimited by a '-': $umi")

if (r1Lower) paired.lowerReadUmiPrefix + ":" + umis(0) + "-" + paired.higherReadUmiPrefix + ":" + umis(1)
else paired.higherReadUmiPrefix + ":" + umis(0) + "-" + paired.lowerReadUmiPrefix + ":" + umis(1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ private[umi] class SimpleConsensusCaller(val errorRatePreLabeling: Byte = 90.toB
require(sequences.nonEmpty, "Can't call consensus on an empty set of sequences!")

if (sequences.length == 1) sequences.head else {
require(sequences.forall(_.length == sequences.head.length), "Sequences must all have the same length")
require(
sequences.forall(_.length == sequences.head.length),
s"Sequences must all have the same length. Found ${sequences.mkString(", ")}"
)
val buffer = new StringBuilder
val firstRead = sequences.head
val readLength = firstRead.length
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,4 +343,41 @@ class CollectDuplexSeqMetricsTest extends UnitSpec {
metrics.find(_.umi == "AAA-TTT") shouldBe defined
metrics.find(_.umi == "AAA-TTT").foreach(m => m.unique_observations shouldBe 1)
}

it should "count UMIs even if one of the UMIs on a 'molecule end' is an empty sequence" in collector(duplex = true).foreach { c =>
val builder = new SamBuilder(readLength = 10)
builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-CCC", MI -> "1/A"))
builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "CCC-", MI -> "1/B"))
builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "AAA-", MI -> "2/A"))
builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-AAA", MI -> "2/B"))
builder.addPair(start1 = 300, start2 = 400, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-GGG", MI -> "3/A"))
builder.addPair(start1 = 900, start2 = 800, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "TTT-", MI -> "4/A"))
builder.addPair(start1 = 300, start2 = 400, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-CGC", MI -> "5/A"))
builder.addPair(start1 = 900, start2 = 800, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "TAT-", MI -> "6/A"))

builder.toSeq.groupBy(r => r[String](MI).takeWhile(_ != '/')).values
.map(rs => rs.groupBy(r => r[String](MI)).values.toSeq).toSeq
.foreach(group => c.updateUmiMetrics(group))

val metrics = c.duplexUmiMetrics(c.umiMetrics)
metrics should have size 6

metrics.find(_.umi == "-CCC") shouldBe defined
metrics.find(_.umi == "-CCC").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "AAA-") shouldBe defined
metrics.find(_.umi == "AAA-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "-GGG") shouldBe defined
metrics.find(_.umi == "-GGG").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "TTT-") shouldBe defined
metrics.find(_.umi == "TTT-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "CGC-") shouldBe defined
metrics.find(_.umi == "CGC-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "-TAT") shouldBe defined
metrics.find(_.umi == "-TAT").foreach(m => m.unique_observations shouldBe 1)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,29 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues {
r2.quals(0) > 30 shouldBe true
}

Seq(
Seq("right", "ACT-", "-ACT"),
Seq("left", "-ACT", "ACT-"),
).foreach { case Seq(orientation, leftUmi, rightUmi) =>
it should s"create a simple double stranded consensus for a pair of A and a pair of B reads in which the $orientation UMI is absent" in {
val builder = new SamBuilder(readLength = 10, baseQuality = 20)
builder.addPair(name = "q1", start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, bases1 = "AAAAAAAAAA", bases2 = "CCCCCCCCCC", attrs = Map(MI -> "foo/A", RX -> leftUmi))
builder.addPair(name = "q2", start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, bases1 = "CCCCCCCCCC", bases2 = "AAAAAAAAAA", attrs = Map(MI -> "foo/B", RX -> rightUmi))

val recs = c.consensusReadsFromSamRecords(builder.toSeq)
recs should have size 2
val r1 = recs.find(_.firstOfPair).getOrElse(fail("No first of pair."))
val r2 = recs.find(_.secondOfPair).getOrElse(fail("No second of pair."))
r1.basesString shouldBe "AAAAAAAAAA"
r2.basesString shouldBe "GGGGGGGGGG" // after un-revcomping
r1.quals(0) > 30 shouldBe true
r2.quals(0) > 30 shouldBe true
// Both R1 and R2 inherit the RX pattern of the "A" family which is the left-oriented read pair in this test.
r1.get[String](RX) shouldBe Some(leftUmi)
r2.get[String](RX) shouldBe Some(leftUmi)
}
}

Seq(true).foreach { allowSingleStrand =>
val extraName: String = if (allowSingleStrand) "" else "not "
Seq("A", "B").foreach { duplexStrand =>
Expand Down
49 changes: 49 additions & 0 deletions src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,24 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
}

it should "correctly assign a/b for paired UMI prefixes when the UMI for one end of the source molecule is absent" in {
val tool = new GroupReadsByUmi(rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 0, allowInterContig = true)
val builder = new SamBuilder(readLength = 100)
val templates = Seq(
// This should be a::AAA-b:: since contig1 is lower
builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
// This should be b::AAA-a:: since contig2 is lower
builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
// This should be a::-b::TTT since contig1 is lower
builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
// This should be b::-a::TTT since contig2 is lower
builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
).map { pair => Template(r1 = pair.headOption, r2 = pair.lastOption) }
val expected = n("a::AAA-b::", 1) ++ n("b::AAA-a::", 1) ++ n("a::-b::TTT", 1) ++ n("b::-a::TTT", 1)
val umiForRead = PrivateMethod[String](Symbol("umiForRead"))
templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
}

"GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in {
val builder = new SamBuilder(readLength=100, sort=None)
val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S")
Expand Down Expand Up @@ -420,6 +438,37 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
an[Exception] should be thrownBy tool.execute()
}

Seq(
Seq("right", "ACT-", "-ACT"),
Seq("left", "-ACT", "ACT-"),
).foreach { case Seq(orientation, leftUmi, rightUmi) =>
it should s"succeed when run in paired mode but the $orientation end of the source molecule does not have a UMI" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a04", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))

builder.addPair(name = "b01", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b02", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b03", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b04", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 1).execute()

val recs = readBamRecs(out)
val aIds = recs.filter(_.name.startsWith("a")).map(r => r[String]("MI")).distinct
val bIds = recs.filter(_.name.startsWith("b")).map(r => r[String]("MI")).distinct

aIds should have size 1
bIds should have size 1
aIds.head shouldBe "0/A"
bIds.head shouldBe "0/B"
}
}

it should "fail when the raw tag is not present" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus)
Expand Down

0 comments on commit 97a9e06

Please sign in to comment.