Skip to content

Commit

Permalink
Add minimum allele depths for downsampling
Browse files Browse the repository at this point in the history
  • Loading branch information
TedBrookings committed Jun 4, 2024
1 parent 75de344 commit d6d6594
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 14 deletions.
27 changes: 21 additions & 6 deletions src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,13 @@ object DownsampleVcf extends LazyLogging {
*/
def downsampleAndRegenotype(variant: Variant,
proportions: Map[String, Double],
random: Random, epsilon: Double=0.01): Variant = {
random: Random, epsilon: Double=0.01,
minAdHomvar: Int = 4, minAdHomref: Int = 2): Variant = {
try {
variant.copy(genotypes=variant.genotypes.map { case (sample, gt) =>
val proportion = proportions(sample)
sample -> downsampleAndRegenotype(gt=gt, proportion=proportion, random=random, epsilon=epsilon)
sample -> downsampleAndRegenotype(gt=gt, proportion=proportion, random=random, epsilon=epsilon,
minAdHomvar=minAdHomvar, minAdHomref=minAdHomref)
})
} catch {
case e: MatchError => throw new Exception(
Expand All @@ -95,13 +97,23 @@ object DownsampleVcf extends LazyLogging {
* @param epsilon the sequencing error rate for genotyping
* @return a new Genotype with updated allele depths, PLs, and genotype
*/
def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = {
def downsampleAndRegenotype(
gt: Genotype, proportion: Double, random: Random, epsilon: Double, minAdHomvar: Int, minAdHomref: Int
): Genotype = {
val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}"))
val newAds = downsampleADs(oldAds, proportion, random)
val likelihoods = Likelihoods(newAds)
val pls = likelihoods.pls
val calls = likelihoods.mostLikelyCall(gt.alleles.toSeq)
gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
val totalAd = newAds.sum
val new_gt = gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> totalAd), calls=calls)
if (new_gt.isHomVar && totalAd < minAdHomvar) {
new_gt.copy(calls = calls.map(_ => Allele.NoCallAllele))
} else if(new_gt.isHomRef && totalAd < minAdHomref) {
new_gt.copy(calls = calls.map(_ => Allele.NoCallAllele))
} else {
new_gt
}
}

object Likelihoods {
Expand Down Expand Up @@ -148,7 +160,7 @@ object DownsampleVcf extends LazyLogging {
math.log10((1 - epsilon) / 2),
math.log10(1 - epsilon)
)
// compute genotype log-likelihoods
// compute genotype log-likelihoods
(0 until numAlleles).flatMap(b =>
(0 to b).map(a =>
(0 until numAlleles).map(allele =>
Expand Down Expand Up @@ -236,6 +248,8 @@ class DownsampleVcf
@arg(flag='o', doc="Output file name.") val output: PathToVcf,
@arg(flag='w', doc="Winnowing window size.") val windowSize: Int = 0,
@arg(flag='e', doc="Sequencing Error rate for genotyping.") val epsilon: Double = 0.01,
@arg(flag='v', doc="Minimum allele depth to call HOMVAR (otherwise NO-CALL)") val minAdHomvar: Int = 4,
@arg(flag='r', doc="Minimum allele depth to call HOMREF (otherwise NO-CALL)") val minAdHomref: Int = 2,
@arg(flag='c', doc="True to write out no-calls.") val writeNoCall: Boolean = false,
@arg(flag='s', doc="Random seed value.") val seed: Int = 42,
) extends FgBioTool {
Expand Down Expand Up @@ -297,7 +311,8 @@ class DownsampleVcf
val progress = ProgressLogger(logger, noun="variants written")
val random = new Random(seed)
winnowed.foreach { v =>
val ds = downsampleAndRegenotype(v, proportions=proportions, random=random, epsilon=epsilon)
val ds = downsampleAndRegenotype(v, proportions=proportions, random=random, epsilon=epsilon,
minAdHomvar=minAdHomvar, minAdHomref=minAdHomref)
if (writeNoCall || !ds.gts.forall(g => g.isNoCall)) {
outputVcf += ds
progress.record(ds)
Expand Down
40 changes: 32 additions & 8 deletions src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -295,47 +295,69 @@ class DownsampleVcfTest extends UnitSpec {

"DownsampleVcf.downsampleAndRegneotype(Genotype)" should "return no call if all allele depths are zero" in {
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,0))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01)
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01,
minAdHomvar = 0, minAdHomref = 0)
val expected = IndexedSeq(Allele("."), Allele("."))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return two ref alleles if the ref AD is much larger than the alt AD" in {
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(100,0))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01,
minAdHomvar=0, minAdHomref=0)
val expected = IndexedSeq(Allele("A"), Allele("A"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return two alt alleles if the alt AD is greater than the ref AD" in {
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,100))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01,
minAdHomvar = 0, minAdHomref = 0)
val expected = IndexedSeq(Allele("T"), Allele("T"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return two alt alleles if ref and alt AD > 0 but ref << alt" in {
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(30,200))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01,
minAdHomvar = 0, minAdHomref = 0)
val expected = IndexedSeq(Allele("T"), Allele("T"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return het if ref and alt ADs are similar but ref < alt" in {
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(190,200))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01,
minAdHomvar = 0, minAdHomref = 0)
val expected = IndexedSeq(Allele("A"), Allele("T"))
newGeno.calls should contain theSameElementsInOrderAs expected
}


it should "return a ref and alt allele if the ref and alt ADs are the same" in {
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(100,100))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01,
minAdHomvar=0, minAdHomref=0)
val expected = IndexedSeq(Allele("A"), Allele("T"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return no call if all alleles are alt but allele depth is less than minAdHomvar" in {
val geno = makeGt(ref = "A", alt = "T", ads = IndexedSeq(0, 400))
val newGeno = downsampleAndRegenotype(gt = geno, proportion = 0.01, random = new Random(42), epsilon = 0.01,
minAdHomvar = 8, minAdHomref = 0)
val expected = IndexedSeq(Allele("."), Allele("."))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return no call if all alleles are ref but allele depth is less than minAdHomref" in {
val geno = makeGt(ref = "A", alt = "T", ads = IndexedSeq(200, 0))
val newGeno = downsampleAndRegenotype(gt = geno, proportion = 0.01, random = new Random(42), epsilon = 0.01,
minAdHomvar = 0, minAdHomref = 4)
val expected = IndexedSeq(Allele("."), Allele("."))
newGeno.calls should contain theSameElementsInOrderAs expected
}

/*
testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes
*/
Expand All @@ -348,14 +370,16 @@ class DownsampleVcfTest extends UnitSpec {

it should "return ref,alt1 for a tri-allelic genotype if those alleles have the highest depth" in {
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01,
minAdHomvar=0, minAdHomref=0)
val expected = IndexedSeq(Allele("A"), Allele("T"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return alt1,alt2 for a tri-allelic genotype if those alleles have the highest depth" in {
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01,
minAdHomvar=0, minAdHomref=0)
val expected = IndexedSeq(Allele("T"), Allele("G"))
newGeno.calls should contain theSameElementsInOrderAs expected
}
Expand Down

0 comments on commit d6d6594

Please sign in to comment.