Skip to content

Commit

Permalink
Allow haploid GTs
Browse files Browse the repository at this point in the history
  • Loading branch information
TedBrookings committed Aug 19, 2024
1 parent d6d6594 commit cb37718
Showing 1 changed file with 106 additions and 6 deletions.
112 changes: 106 additions & 6 deletions src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import com.fulcrumgenomics.util.{Metric, ProgressLogger}
import com.fulcrumgenomics.vcf.api.Allele.NoCallAllele
import com.fulcrumgenomics.vcf.api.{Allele, Genotype, Variant, VcfCount, VcfFieldType, VcfFormatHeader, VcfHeader, VcfSource, VcfWriter}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.vcf.DownsampleVcf.{downsampleAndRegenotype, winnowVariants}
import com.fulcrumgenomics.vcf.DownsampleVcf.{HasLikelihoods, downsampleAndRegenotype, winnowVariants}

import scala.math.log10
import scala.util.Random
Expand Down Expand Up @@ -102,7 +102,11 @@ object DownsampleVcf extends LazyLogging {
): Genotype = {
val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}"))
val newAds = downsampleADs(oldAds, proportion, random)
val likelihoods = Likelihoods(newAds)
val likelihoods = gt.ploidy match {
case 1 => HaploidLikelihoods(newAds)
case 2 => Likelihoods(newAds)
case _ => throw new RuntimeException(s"Got ploidy ${gt.ploidy}")
}
val pls = likelihoods.pls
val calls = likelihoods.mostLikelyCall(gt.alleles.toSeq)
val totalAd = newAds.sum
Expand All @@ -116,6 +120,12 @@ object DownsampleVcf extends LazyLogging {
}
}

trait HasLikelihoods {
def pls: IndexedSeq[Int]

def mostLikelyCall(alleles: Seq[Allele]): IndexedSeq[Allele]
}

object Likelihoods {
/**Converts a sequence of log-likelihoods to phred-scale by 1) multiplying each by -10, 2)
* subtracting from each the min value so the smallest value is 0, and 3) rounding to the
Expand Down Expand Up @@ -181,7 +191,7 @@ object DownsampleVcf extends LazyLogging {
* @param numAlleles the number of alleles the variant has
* @param genotypeLikelihoods sequence of GLs in the order specified in the VCF spec
*/
case class Likelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) {
case class Likelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) extends HasLikelihoods {
/**
* Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag).
* @return a list of phred-scaled likelihooodS for AA, AB, BB.
Expand All @@ -190,7 +200,7 @@ object DownsampleVcf extends LazyLogging {
Likelihoods.logToPhredLikelihoods(genotypeLikelihoods)
}

def mostLikelyGenotype: Option[(Int, Int)] = {
private def mostLikelyGenotype: Option[(Int, Int)] = {
val minIndexes = pls.zipWithIndex.filter(pair => pair._1 == 0)
minIndexes.length match {
case 0 => throw new RuntimeException("expected the most likely PL to have a value of 0.0")
Expand All @@ -213,6 +223,96 @@ object DownsampleVcf extends LazyLogging {
}
}


object HaploidLikelihoods {
/**Converts a sequence of log-likelihoods to phred-scale by 1) multiplying each by -10, 2)
* subtracting from each the min value so the smallest value is 0, and 3) rounding to the
* nearest integer.
*/
def logToPhredLikelihoods(logLikelihoods: IndexedSeq[Double]): IndexedSeq[Int] = {
val rawPL = logLikelihoods.map(gl => gl * -10)
val minPL = rawPL.min
rawPL.map(pl => (pl - minPL).round.toInt)
}

/** Computes the likelihoods for each possible biallelic genotype.
* @param alleleDepthA the reference allele depth
* @param alleleDepthB the alternate allele depth
* @param epsilon the error rate for genotyping
* @return a new `Likelihood` that has the likelihoods of AA, AB, and BB
*/
def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): IndexedSeq[Double] = {
val aGivenA = log10(1 - epsilon)
val aGivenB = log10(epsilon)

val rawGlA = (alleleDepthA * aGivenA) + (alleleDepthB * aGivenB)
val rawGlB = (alleleDepthA * aGivenB) + (alleleDepthB * aGivenA)

IndexedSeq(rawGlA, rawGlB)
}

/** Computes the likelihoods for each possible genotype given a sequence of read depths for any
* number of alleles.
* @param alleleDepths the sequence of allele depths in the order specified in the VCF
* @param epsilon the error rate for genotyping
* @return a new `Likelihood` that has the log likelihoods of all possible genotypes in the
* order specified in VFC spec for the GL/PL tags.
*/
def generalized(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): IndexedSeq[Double] = {
val numAlleles = alleleDepths.length
// probabilities associated with each possible genotype for a pair of alleles
val logProbs: Array[Double] = Array(
math.log10(epsilon),
math.log10(1 - epsilon)
)
// compute genotype log-likelihoods
(0 until numAlleles).map(a =>
(0 until numAlleles).map(allele =>
logProbs(if(a == allele) 1 else 0) * alleleDepths(allele)
).sum
)
}

def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): HaploidLikelihoods = {
val numAlleles = alleleDepths.length
require(numAlleles >= 2, "at least two alleles are required to calculate genotype likelihoods")
HaploidLikelihoods(numAlleles, generalized(alleleDepths, epsilon))
}
}

/** Stores the log10(likelihoods) for all possible genotypes.
* @param numAlleles the number of alleles the variant has
* @param genotypeLikelihoods sequence of GLs in the order specified in the VCF spec
*/
case class HaploidLikelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) extends HasLikelihoods {
/**
* Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag).
* @return a list of phred-scaled likelihooodS for AA, AB, BB.
*/
def pls: IndexedSeq[Int] = {
HaploidLikelihoods.logToPhredLikelihoods(genotypeLikelihoods)
}

private def mostLikelyGenotype: Option[Int] = {
val minIndexes = pls.zipWithIndex.filter(pair => pair._1 == 0)
minIndexes.length match {
case 0 => throw new RuntimeException("expected the most likely PL to have a value of 0.0")
case 1 => {
Some(minIndexes.head._2)
}
case _ => None // if multiple genotypes are most likely, don't make a call
}
}

def mostLikelyCall(alleles: Seq[Allele]): IndexedSeq[Allele] = {
mostLikelyGenotype match {
case None => IndexedSeq(NoCallAllele)
case Some(a) => IndexedSeq(alleles(a))
}
}
}


@clp(group=ClpGroups.VcfOrBcf, description =
"""
|Re-genotypes a VCF after downsampling the allele counts.
Expand Down Expand Up @@ -248,8 +348,8 @@ class DownsampleVcf
@arg(flag='o', doc="Output file name.") val output: PathToVcf,
@arg(flag='w', doc="Winnowing window size.") val windowSize: Int = 0,
@arg(flag='e', doc="Sequencing Error rate for genotyping.") val epsilon: Double = 0.01,
@arg(flag='v', doc="Minimum allele depth to call HOMVAR (otherwise NO-CALL)") val minAdHomvar: Int = 4,
@arg(flag='r', doc="Minimum allele depth to call HOMREF (otherwise NO-CALL)") val minAdHomref: Int = 2,
@arg(flag='v', doc="Minimum allele depth to call HOMVAR (otherwise NO-CALL)") val minAdHomvar: Int = 3,
@arg(flag='r', doc="Minimum allele depth to call HOMREF (otherwise NO-CALL)") val minAdHomref: Int = 1,
@arg(flag='c', doc="True to write out no-calls.") val writeNoCall: Boolean = false,
@arg(flag='s', doc="Random seed value.") val seed: Int = 42,
) extends FgBioTool {
Expand Down

0 comments on commit cb37718

Please sign in to comment.