fulcrumgenomics · nh13 · Jan 29, 2024 · Jan 29, 2024 · Jan 30, 2024 · nh13
@@ -25,6 +25,7 @@
 package com.fulcrumgenomics.bam
 
 import com.fulcrumgenomics.FgBioDef._
+import com.fulcrumgenomics.alignment.Cigar
 import com.fulcrumgenomics.bam.api.SamOrder.Queryname
 import com.fulcrumgenomics.bam.api._
 import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator}
@@ -41,6 +42,36 @@
 import java.io.Closeable
 import scala.math.{max, min}
 
+
+
+case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) {
+  def negativeStrand: Boolean = !positiveStrand
+  def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex
+
+  def end: Int = start + cigar.lengthOnTarget - 1
+  def unclippedStart: Int = {
+    SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar)
+  }
+
+  def unclippedEnd: Int = {
+    SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar)
+  }
+}
+
+object Supplementary {
+  /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
+  def toString(rec: SamRecord): String = {
+    val strand = if (rec.positiveStrand) '+' else '-'
+    f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}"
+  }
+
+
+  def apply(sa: String): Supplementary = {
+    val parts = sa.split(",")
+    Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
-    Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
+    Supplementary(parts(0), parts(1).toInt - 1, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
-    Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
+    Supplementary(parts(0), parts(1).toInt - 1, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
+  }
+}
+
 /**
   * Class that represents all reads from a template within a BAM file.
   */
@@ -107,11 +138,21 @@
 
   /** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */
   def fixMateInfo(): Unit = {
-    for (primary <- r1; supp <- r2Supplementals) {
-      SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
+    // Set all mate info on BOTH secondary and supplementary records, not just supplementary records.  We also need to
+    // add the "pa" and "pm" tags with information about the primary alignments.  Finally, we need the MQ tag!
+    val r1NonPrimary = r1Supplementals ++ r1Secondaries
+    val r2NonPrimary = r2Supplementals ++ r2Secondaries
+    for (primary <- r1; nonPrimary <- r2NonPrimary) {
+      SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
+      nonPrimary(SAMTag.MQ.name()) = primary.mapq
+      nonPrimary("mp") = Supplementary.toString(primary)
+      r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
     }
-    for (primary <- r2; supp <- r1Supplementals) {
-      SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
+    for (primary <- r2; nonPrimary <- r1NonPrimary) {
+      SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
+      nonPrimary(SAMTag.MQ.name()) = primary.mapq
+      nonPrimary("mp") = Supplementary.toString(primary)
+      r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
     }
     for (first <- r1; second <- r2) {
       SamPairUtil.setMateInfo(first.asSam, second.asSam, true)

@@ -24,12 +24,15 @@
 
 package com.fulcrumgenomics.bam.api
 
+import com.fulcrumgenomics.bam.{Bams, Supplementary}
 import com.fulcrumgenomics.umi.ConsensusTags
 import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
 import htsjdk.samtools.util.Murmur3
 import htsjdk.samtools.{SAMFileHeader, SAMUtils}
 import org.apache.commons.math3.genetics.RandomKey
 
+import scala.reflect.runtime.universe.Template
+
-import scala.reflect.runtime.universe.Template
-import scala.reflect.runtime.universe.Template
 
 /** Trait for specifying BAM orderings. */
 sealed trait SamOrder extends Product {
@@ -175,24 +178,50 @@ object SamOrder {
     override val groupOrder: GroupOrder     = GroupOrder.query
     override val subSort:    Option[String] = Some("template-coordinate")
     override val sortkey: SamRecord => A = rec => {
-      val readChrom = if (rec.unmapped)     Int.MaxValue else rec.refIndex
-      val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
-      val readNeg   = rec.negativeStrand
-      val mateNeg   = if (rec.paired) rec.mateNegativeStrand else false
-      val readPos   = if (rec.unmapped)     Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
-      val matePos   = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
-      val lib       = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
-      val mid       = rec.get[String](ConsensusTags.MolecularId).map { m =>
-        val index: Int = m.lastIndexOf('/')
-        if (index >= 0) m.substring(0, index) else m
-      }.getOrElse("")
-
-      if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
-           (readChrom == mateChrom && readPos == matePos && !readNeg)) {
-        TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
-      }
-      else {
-        TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
+      // For non-secondary/non-supplementary alignments, use the info in the record.  For secondary and supplementary
+      // alignments, use the info in the pa/pm tags.
+      if (!rec.secondary && !rec.supplementary) {
+        val readChrom = if (rec.unmapped)     Int.MaxValue else rec.refIndex
+        val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
+        val readNeg   = rec.negativeStrand
+        val mateNeg   = if (rec.paired) rec.mateNegativeStrand else false
+        val readPos   = if (rec.unmapped)     Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
+        val matePos   = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
+        val lib       = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
+        val mid       = rec.get[String](ConsensusTags.MolecularId).map { m =>
+          val index: Int = m.lastIndexOf('/')
+          if (index >= 0) m.substring(0, index) else m
+        }.getOrElse("")
+
+        if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
+             (readChrom == mateChrom && readPos == matePos && !readNeg)) {
+          TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
+        }
+        else {
+          TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
+        }
+      } else {
+        val primary   = Supplementary(rec[String]("rp"))
+        val mate      = Supplementary(rec[String]("mp"))
+        val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header)
+        val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header)
+        val readNeg   = primary.negativeStrand
+        val mateNeg   = if (rec.paired) mate.negativeStrand else false
+        val readPos   = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart
+        val matePos   = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart
+        val lib       = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
+        val mid       = rec.get[String](ConsensusTags.MolecularId).map { m =>
+          val index: Int = m.lastIndexOf('/')
+          if (index >= 0) m.substring(0, index) else m
+        }.getOrElse("")
+
+        if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
+          (readChrom == mateChrom && readPos == matePos && !readNeg)) {
+          TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
+        }
+        else {
+          TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
+        }
       }
     }
   }

@@ -719,7 +719,7 @@ class GroupReadsByUmi
 
       // Then output the records in the right order (assigned tag, read name, r1, r2)
       templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => {
-        templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => {
+        templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => {
           out += rec
         })
       })

@@ -25,11 +25,13 @@
 package com.fulcrumgenomics.bam.api
 
 import java.util.Random
-
 import com.fulcrumgenomics.FgBioDef._
+import com.fulcrumgenomics.bam.Bams
+import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator}
 import com.fulcrumgenomics.commons.util.SimpleCounter
 import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
 import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
+import com.fulcrumgenomics.util.Io
 import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator}
 import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
 import htsjdk.samtools.util.Murmur3
@@ -217,4 +219,30 @@ class SamOrderTest extends UnitSpec {
 
     actual should contain theSameElementsInOrderAs expected
   }
+
+  it should "handle supplementary alignments" in {
+    val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Queryname))
+    val exp = ListBuffer[SamRecord]()
+    // primary read pairs for q1, that map to different contigs
+    exp ++= builder.addPair("q1", contig=1, contig2=Some(2), start1=66, start2=47, cigar1="60M40S", cigar2="55M45S", strand2=Plus)
+    // supplementary R2 (ignore R1), which maps to the same chromosome as the primary R1
+    val Seq(_, s2) = builder.addPair("q1", contig=1, contig2=Some(1), start1=66, start2=66, cigar1="60M40S", strand2=Minus)
+    s2.supplementary = true
+    s2.properlyPaired = true
+    exp += s2
+    // primary read pairs for q2, that map to different contigs, but earlier that q1
+    exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S")
+
+    // Fix the mate information.  Note: sorting here to get a template-iterator will write the records to disk first,
+    // so we cannot use the records in builder/exp.
+    val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template =>
+      template.fixMateInfo()
+      template.allReads
+    }.toList
+
+    val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup")
+    val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)
+
+    actual should contain theSameElementsInOrderAs expected
+  }
 }
@@ -26,13 +26,14 @@
  */
 package com.fulcrumgenomics.umi
 
-import com.fulcrumgenomics.bam.Template
-import com.fulcrumgenomics.bam.api.SamOrder
+import com.fulcrumgenomics.bam.{Bams, Template}
+import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter}
 import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate
 import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
 import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
 import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
 import com.fulcrumgenomics.umi.GroupReadsByUmi._
+import com.fulcrumgenomics.util.Io
 import org.scalatest.{OptionValues, PrivateMethodTester}
 
 import java.nio.file.Files
@@ -338,6 +339,46 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     recs.filter(_.name.equals("a04")).forall(_.duplicate == true) shouldBe true
   }
 
+  it should "mark duplicates on supplementary reads" in {
+    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+    // primary read pairs for q1, that map to different contigs
+    builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT"))
+    // supplementary R2, which maps to the same chromosome as the primary R1
+    val Seq(s1, s2) = builder.addPair("q1", contig = 1, contig2 = Some(1), start1 = 66, start2 = 66, cigar1 = "60M40S", strand2 = Minus, attrs = Map("RX" -> "ACT"))
+    s1.supplementary = true
+    s2.supplementary = true
+    s2.properlyPaired = true
+    // primary read pairs for q2, that map to different contigs, but earlier that q1
+    builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))
+
+    // primary read pairs for q3, that are duplicates of q2
+    builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))
+
+    // Fix the mate information and write the input.  Note: sorting here to get a template-iterator will write the
+    // records to disk first, so we cannot use the records in builder and therefore must write to the input file
+    // directly.
+    val in = Files.createTempFile("raw_reads", ".sam")
+    val writer = SamWriter(in, builder.header, sort = builder.sort)
+    Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template =>
+      template.fixMateInfo()
+      writer ++= template.allReads
+    }
+    writer.close()
+
+    val out  = Files.createTempFile("umi_grouped.", ".sam")
+    val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
+    val gr   = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true)
+
+    gr.markDuplicates shouldBe true
+    gr.execute()
+
+    val recs = readBamRecs(out)
+    recs.length shouldBe 8
+    recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true
+    recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true
+    recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true
+  }
+
   it should "correctly group reads with the paired assigner when the two UMIs are the same" in {
     val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
     builder.addPair(name="a01", start1=100, start2=300, strand1=Plus,  strand2=Minus, attrs=Map("RX" -> "ACT-ACT"))