-
-
Notifications
You must be signed in to change notification settings - Fork 67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GroupReadsByUmi may fail when marking duplicates including secondary/supplementary reads #964
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -25,6 +25,7 @@ | |||||
package com.fulcrumgenomics.bam | ||||||
|
||||||
import com.fulcrumgenomics.FgBioDef._ | ||||||
import com.fulcrumgenomics.alignment.Cigar | ||||||
import com.fulcrumgenomics.bam.api.SamOrder.Queryname | ||||||
import com.fulcrumgenomics.bam.api._ | ||||||
import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator} | ||||||
|
@@ -41,6 +42,36 @@ | |||||
import java.io.Closeable | ||||||
import scala.math.{max, min} | ||||||
|
||||||
|
||||||
|
||||||
case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) { | ||||||
def negativeStrand: Boolean = !positiveStrand | ||||||
def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I may invert this, and store |
||||||
|
||||||
def end: Int = start + cigar.lengthOnTarget - 1 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question Is end inclusive or exclusive? (And maybe add scaladoc to clarify) I recently added an equivalent property to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||||||
def unclippedStart: Int = { | ||||||
SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we probably could just compute these directly without having to route to htsjdk |
||||||
} | ||||||
|
||||||
def unclippedEnd: Int = { | ||||||
SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar) | ||||||
} | ||||||
} | ||||||
|
||||||
object Supplementary { | ||||||
/** Formats a single alignment like one entry of the SA tag, `rname,pos,strand,CIGAR,mapQ,NM`, without the trailing `;`; NM defaults to 0 when the tag is absent. */ | ||||||
def toString(rec: SamRecord): String = { | ||||||
val strand = if (rec.positiveStrand) '+' else '-' | ||||||
f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}" | ||||||
} | ||||||
|
||||||
|
||||||
def apply(sa: String): Supplementary = { | ||||||
Comment on lines
+61
to
+69
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I would prefer to have two |
||||||
val parts = sa.split(",") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. probably good to check we get 6 parts There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. Is validation of the type/value of each part also necessary? |
||||||
Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
suggestion We need to subtract 1 if Without scaladoc I'm not sure, but I'm assuming it's zero based; and the
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto: |
||||||
} | ||||||
} | ||||||
|
||||||
/** | ||||||
* Class that represents all reads from a template within a BAM file. | ||||||
*/ | ||||||
|
@@ -107,11 +138,21 @@ | |||||
|
||||||
/** Fixes mate information and sets mate cigar on all primary, supplementary, and secondary records. */ | ||||||
def fixMateInfo(): Unit = { | ||||||
for (primary <- r1; supp <- r2Supplementals) { | ||||||
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) | ||||||
// Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The comment on line 139 should be updated (or removed) to reflect this |
||||||
// add the "rp" and "mp" tags with information about the primary alignments. Finally, we need the MQ tag! | ||||||
val r1NonPrimary = r1Supplementals ++ r1Secondaries | ||||||
val r2NonPrimary = r2Supplementals ++ r2Secondaries | ||||||
for (primary <- r1; nonPrimary <- r2NonPrimary) { | ||||||
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) | ||||||
nonPrimary(SAMTag.MQ.name()) = primary.mapq | ||||||
nonPrimary("mp") = Supplementary.toString(primary) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TODO: store these tag definitions somewhere else |
||||||
r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) | ||||||
} | ||||||
for (primary <- r2; supp <- r1Supplementals) { | ||||||
SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) | ||||||
for (primary <- r2; nonPrimary <- r1NonPrimary) { | ||||||
SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) | ||||||
nonPrimary(SAMTag.MQ.name()) = primary.mapq | ||||||
nonPrimary("mp") = Supplementary.toString(primary) | ||||||
r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) | ||||||
Comment on lines
+145
to
+155
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question Would you find it more legible to extract these for loops into a helper so we don't repeat it twice? |
||||||
} | ||||||
for (first <- r1; second <- r2) { | ||||||
SamPairUtil.setMateInfo(first.asSam, second.asSam, true) | ||||||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -24,12 +24,15 @@ | |||
|
||||
package com.fulcrumgenomics.bam.api | ||||
|
||||
import com.fulcrumgenomics.bam.{Bams, Supplementary} | ||||
import com.fulcrumgenomics.umi.ConsensusTags | ||||
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} | ||||
import htsjdk.samtools.util.Murmur3 | ||||
import htsjdk.samtools.{SAMFileHeader, SAMUtils} | ||||
import org.apache.commons.math3.genetics.RandomKey | ||||
|
||||
import scala.reflect.runtime.universe.Template | ||||
|
||||
Comment on lines
+34
to
+35
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||
|
||||
/** Trait for specifying BAM orderings. */ | ||||
sealed trait SamOrder extends Product { | ||||
|
@@ -175,24 +178,50 @@ object SamOrder { | |||
override val groupOrder: GroupOrder = GroupOrder.query | ||||
override val subSort: Option[String] = Some("template-coordinate") | ||||
override val sortkey: SamRecord => A = rec => { | ||||
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex | ||||
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex | ||||
val readNeg = rec.negativeStrand | ||||
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false | ||||
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart | ||||
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) | ||||
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") | ||||
val mid = rec.get[String](ConsensusTags.MolecularId).map { m => | ||||
val index: Int = m.lastIndexOf('/') | ||||
if (index >= 0) m.substring(0, index) else m | ||||
}.getOrElse("") | ||||
|
||||
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || | ||||
(readChrom == mateChrom && readPos == matePos && !readNeg)) { | ||||
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) | ||||
} | ||||
else { | ||||
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) | ||||
// For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. todo: how can we simplify these two branches, since they're very similar There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if you additionally set val primary = if (!rec.secondary && !rec.supplementary) Supplementary(rec) else Supplementary(rec[String]("rp"))
val mate = Supplementary(rec[String]("mp"))
// Just the second branch, using the info from `Supplementary` instead of `SamRecord`
... |
||||
// alignments, use the info in the rp/mp tags. | ||||
if (!rec.secondary && !rec.supplementary) { | ||||
val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex | ||||
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex | ||||
val readNeg = rec.negativeStrand | ||||
val mateNeg = if (rec.paired) rec.mateNegativeStrand else false | ||||
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart | ||||
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) | ||||
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") | ||||
val mid = rec.get[String](ConsensusTags.MolecularId).map { m => | ||||
val index: Int = m.lastIndexOf('/') | ||||
if (index >= 0) m.substring(0, index) else m | ||||
}.getOrElse("") | ||||
|
||||
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || | ||||
(readChrom == mateChrom && readPos == matePos && !readNeg)) { | ||||
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) | ||||
} | ||||
else { | ||||
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) | ||||
} | ||||
} else { | ||||
val primary = Supplementary(rec[String]("rp")) | ||||
val mate = Supplementary(rec[String]("mp")) | ||||
Comment on lines
+204
to
+205
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Todo, better error message or fallback |
||||
val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header) | ||||
val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header) | ||||
val readNeg = primary.negativeStrand | ||||
val mateNeg = if (rec.paired) mate.negativeStrand else false | ||||
val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart | ||||
val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart | ||||
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") | ||||
val mid = rec.get[String](ConsensusTags.MolecularId).map { m => | ||||
val index: Int = m.lastIndexOf('/') | ||||
if (index >= 0) m.substring(0, index) else m | ||||
}.getOrElse("") | ||||
|
||||
if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || | ||||
(readChrom == mateChrom && readPos == matePos && !readNeg)) { | ||||
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) | ||||
} | ||||
else { | ||||
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) | ||||
} | ||||
} | ||||
} | ||||
} | ||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -719,7 +719,7 @@ class GroupReadsByUmi | |
|
||
// Then output the records in the right order (assigned tag, read name, r1, r2) | ||
templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => { | ||
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => { | ||
templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. question Where are There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the initial filter of the BAM file (see the |
||
out += rec | ||
}) | ||
}) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
scaladocs needed later