Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow the PairedAssigner to use UMI pairs where one is absent #954

Merged
merged 3 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -389,8 +389,8 @@ class CollectDuplexSeqMetrics
val umi1s = IndexedSeq.newBuilder[String] // ab UMI 1 (and ba UMI 2) sequences
val umi2s = IndexedSeq.newBuilder[String] // ab UMI 2 (and ba UMI 1) sequences

ab.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
ba.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }
ab.iterator.map(r => r[String](this.umiTag).split("-", -1)).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
ba.iterator.map(r => r[String](this.umiTag).split("-", -1)).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }

val Seq(abConsensusUmi, baConsensusUmi) = Seq(umi1s, umi2s).map(_.result()).map{ umis =>
val consensus = this.consensusBuilder.callConsensus(umis)
Expand Down Expand Up @@ -530,7 +530,7 @@ class CollectDuplexSeqMetrics
val uniqueTotal = metrics.map(_.unique_observations).sum.toDouble

metrics.foreach { m =>
val Array(umi1, umi2) = m.umi.split('-')
val Array(umi1, umi2) = m.umi.split("-", -1)
m.fraction_raw_observations = m.raw_observations / rawTotal
m.fraction_unique_observations = m.unique_observations / uniqueTotal
m.fraction_unique_observations_expected = singleUmiMetrics(umi1).fraction_unique_observations * singleUmiMetrics(umi2).fraction_unique_observations
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/com/fulcrumgenomics/umi/CorrectUmis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class CorrectUmis
missingUmisRecords += 1
rejectOut.foreach(w => w += rec)
case Some(umi: String) =>
val sequences = umi.split('-')
val sequences = umi.split("-", -1)
if (sequences.exists(_.length != umiLength)) {
if (wrongLengthRecords == 0) {
logger.warning(s"Read (${rec.name}) detected with unexpected length UMI(s): ${sequences.mkString(" ")}.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
package com.fulcrumgenomics.umi

import java.lang.Math.min

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.commons.util.LazyLogging
Expand Down Expand Up @@ -324,7 +323,7 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
// UMI bases are present, `None` otherwise.
reads.flatMap(_.sam).flatMap { rec =>
rec.get[String](ConsensusTags.UmiBases).map { umi =>
if (rec.firstOfPair == firstOfPair) umi else umi.split('-').reverse.mkString("-")
if (rec.firstOfPair == firstOfPair) umi else umi.split("-", -1).reverse.mkString("-")
}
}
}
Expand Down
20 changes: 10 additions & 10 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy
import htsjdk.samtools._
import htsjdk.samtools.util.SequenceUtil

import java.util.concurrent.Executors
import java.util.concurrent.atomic.AtomicLong
import java.util.concurrent.{Executors, ForkJoinPool}
import scala.collection.immutable.IndexedSeq
import scala.collection.mutable.ListBuffer
import scala.collection.parallel.ExecutionContextTaskSupport
Expand Down Expand Up @@ -529,14 +529,14 @@ object Strategy extends FgBioEnum[Strategy] {
| reads per UMI, but breaks down at very high coverage of UMIs.
|3. **adjacency**: a version of the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755)
| that allows for errors between UMIs but only when there is a count gradient.
|4. **paired**: similar to adjacency but for methods that produce template with a pair of UMIs
| such that a read with A-B is related to but not identical to a read with B-A.
| Expects the pair of UMIs to be stored in a single tag, separated by a hyphen
| (e.g. `ACGT-CCGG`). The molecular IDs produced have more structure than for single
| UMI strategies, and are of the form `{base}/{AB|BA}`. E.g. two UMI pairs would be
| mapped as follows AAAA-GGGG -> 1/AB, GGGG-AAAA -> 1/BA.
|4. **paired**: similar to adjacency but for methods that produce template such that a read with A-B is related
| to but not identical to a read with B-A. Expects the UMI sequences to be stored in a single SAM
| tag separated by a hyphen (e.g. `ACGT-CCGG`) and allows for one of the two UMIs to be absent
| (e.g. `ACGT-` or `-ACGT`). The molecular IDs produced have more structure than for single
| UMI strategies and are of the form `{base}/{A|B}`. E.g. two UMI pairs would be mapped as
| follows AAAA-GGGG -> 1/A, GGGG-AAAA -> 1/B.
|
|`edit`, `adjacency` and `paired` make use of the `--edits` parameter to control the matching of
|Strategies `edit`, `adjacency`, and `paired` make use of the `--edits` parameter to control the matching of
|non-identical UMIs.
|
|By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI
Expand Down Expand Up @@ -831,8 +831,8 @@ class GroupReadsByUmi
val pos1 = if (r1.positiveStrand) r1.unclippedStart else r1.unclippedEnd
val pos2 = if (r2.positiveStrand) r2.unclippedStart else r2.unclippedEnd
val r1Lower = r1.refIndex < r2.refIndex || (r1.refIndex == r2.refIndex && (pos1 < pos2 || (pos1 == pos2 && r1.positiveStrand)))
val umis = umi.split('-')
require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments: $umi")
val umis = umi.split("-", -1) // Split and ensure we return empty strings for missing UMIs.
require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments delimited by a '-': $umi")

if (r1Lower) paired.lowerReadUmiPrefix + ":" + umis(0) + "-" + paired.higherReadUmiPrefix + ":" + umis(1)
else paired.higherReadUmiPrefix + ":" + umis(0) + "-" + paired.lowerReadUmiPrefix + ":" + umis(1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ private[umi] class SimpleConsensusCaller(val errorRatePreLabeling: Byte = 90.toB
require(sequences.nonEmpty, "Can't call consensus on an empty set of sequences!")

if (sequences.length == 1) sequences.head else {
require(sequences.forall(_.length == sequences.head.length), "Sequences must all have the same length")
require(
sequences.forall(_.length == sequences.head.length),
s"Sequences must all have the same length. Found ${sequences.mkString(", ")}"
)
val buffer = new StringBuilder
val firstRead = sequences.head
val readLength = firstRead.length
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,4 +343,41 @@ class CollectDuplexSeqMetricsTest extends UnitSpec {
metrics.find(_.umi == "AAA-TTT") shouldBe defined
metrics.find(_.umi == "AAA-TTT").foreach(m => m.unique_observations shouldBe 1)
}

it should "count UMIs even if one of the UMIs on a 'molecule end' is an empty sequence" in collector(duplex = true).foreach { c =>
val builder = new SamBuilder(readLength = 10)
builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-CCC", MI -> "1/A"))
builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "CCC-", MI -> "1/B"))
builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "AAA-", MI -> "2/A"))
builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-AAA", MI -> "2/B"))
builder.addPair(start1 = 300, start2 = 400, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-GGG", MI -> "3/A"))
builder.addPair(start1 = 900, start2 = 800, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "TTT-", MI -> "4/A"))
builder.addPair(start1 = 300, start2 = 400, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-CGC", MI -> "5/A"))
builder.addPair(start1 = 900, start2 = 800, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "TAT-", MI -> "6/A"))

builder.toSeq.groupBy(r => r[String](MI).takeWhile(_ != '/')).values
.map(rs => rs.groupBy(r => r[String](MI)).values.toSeq).toSeq
.foreach(group => c.updateUmiMetrics(group))

val metrics = c.duplexUmiMetrics(c.umiMetrics)
metrics should have size 6

metrics.find(_.umi == "-CCC") shouldBe defined
metrics.find(_.umi == "-CCC").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "AAA-") shouldBe defined
metrics.find(_.umi == "AAA-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "-GGG") shouldBe defined
metrics.find(_.umi == "-GGG").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "TTT-") shouldBe defined
metrics.find(_.umi == "TTT-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "CGC-") shouldBe defined
metrics.find(_.umi == "CGC-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "-TAT") shouldBe defined
metrics.find(_.umi == "-TAT").foreach(m => m.unique_observations shouldBe 1)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,29 @@ class DuplexConsensusCallerTest extends UnitSpec with OptionValues {
r2.quals(0) > 30 shouldBe true
}

Seq(
Seq("right", "ACT-", "-ACT"),
Seq("left", "-ACT", "ACT-"),
).foreach { case Seq(orientation, leftUmi, rightUmi) =>
it should s"create a simple double stranded consensus for a pair of A and a pair of B reads in which the $orientation UMI is absent" in {
val builder = new SamBuilder(readLength = 10, baseQuality = 20)
builder.addPair(name = "q1", start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, bases1 = "AAAAAAAAAA", bases2 = "CCCCCCCCCC", attrs = Map(MI -> "foo/A", RX -> leftUmi))
builder.addPair(name = "q2", start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, bases1 = "CCCCCCCCCC", bases2 = "AAAAAAAAAA", attrs = Map(MI -> "foo/B", RX -> rightUmi))

val recs = c.consensusReadsFromSamRecords(builder.toSeq)
recs should have size 2
val r1 = recs.find(_.firstOfPair).getOrElse(fail("No first of pair."))
val r2 = recs.find(_.secondOfPair).getOrElse(fail("No second of pair."))
r1.basesString shouldBe "AAAAAAAAAA"
r2.basesString shouldBe "GGGGGGGGGG" // after un-revcomping
r1.quals(0) > 30 shouldBe true
r2.quals(0) > 30 shouldBe true
// Both R1 and R2 inherit the RX pattern of the "A" family which is the left-oriented read pair in this test.
r1.get[String](RX) shouldBe Some(leftUmi)
r2.get[String](RX) shouldBe Some(leftUmi)
}
}

Seq(true).foreach { allowSingleStrand =>
val extraName: String = if (allowSingleStrand) "" else "not "
Seq("A", "B").foreach { duplexStrand =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,24 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
}

it should "correctly assign a/b for paired UMI prefixes when the UMI for one end of the source molecule is absent" in {
val tool = new GroupReadsByUmi(rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 0, allowInterContig = true)
val builder = new SamBuilder(readLength = 100)
val templates = Seq(
// This should be a::AAA-b:: since contig1 is lower
builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
// This should be b::AAA-a:: since contig2 is lower
builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
// This should be a::-b::TTT since contig1 is lower
builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
// This should be b::-a::TTT since contig2 is lower
builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
).map { pair => Template(r1 = pair.headOption, r2 = pair.lastOption) }
val expected = n("a::AAA-b::", 1) ++ n("b::AAA-a::", 1) ++ n("a::-b::TTT", 1) ++ n("b::-a::TTT", 1)
val umiForRead = PrivateMethod[String](Symbol("umiForRead"))
templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
}

"GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in {
val builder = new SamBuilder(readLength=100, sort=None)
val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S")
Expand Down Expand Up @@ -420,6 +438,37 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
an[Exception] should be thrownBy tool.execute()
}

Seq(
Seq("right", "ACT-", "-ACT"),
Seq("left", "-ACT", "ACT-"),
).foreach { case Seq(orientation, leftUmi, rightUmi) =>
it should s"succeed when run in paired mode but the $orientation end of the source molecule does not have a UMI" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a04", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))

builder.addPair(name = "b01", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b02", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b03", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b04", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 1).execute()

val recs = readBamRecs(out)
val aIds = recs.filter(_.name.startsWith("a")).map(r => r[String]("MI")).distinct
val bIds = recs.filter(_.name.startsWith("b")).map(r => r[String]("MI")).distinct

aIds should have size 1
bIds should have size 1
aIds.head shouldBe "0/A"
bIds.head shouldBe "0/B"
}
}

it should "fail when the raw tag is not present" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus)
Expand Down
Loading