Skip to content

Commit

Permalink
Merge pull request #759 from samtools/rhl_validate_ac_af_no_gt_757
Browse files Browse the repository at this point in the history
Validate VariantContext AC and AF without genotypes
  • Loading branch information
ronlevine authored Dec 7, 2016
2 parents 4d0070b + 7e91886 commit e69aff0
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 39 deletions.
79 changes: 42 additions & 37 deletions src/main/java/htsjdk/variant/variantcontext/VariantContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

package htsjdk.variant.variantcontext;

import htsjdk.samtools.util.Tuple;
import htsjdk.tribble.Feature;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.util.ParsingUtils;
Expand Down Expand Up @@ -263,7 +262,7 @@ public class VariantContext implements Feature, Serializable {
* @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first
*/
public List<String> calcVCFGenotypeKeys(final VCFHeader header) {
final Set<String> keys = new HashSet<String>();
final Set<String> keys = new HashSet<>();

boolean sawGoodGT = false;
boolean sawGoodQual = false;
Expand All @@ -287,11 +286,11 @@ public List<String> calcVCFGenotypeKeys(final VCFHeader header) {
if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY);
if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY);

List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
List<String> sortedList = ParsingUtils.sortList(new ArrayList<>(keys));

// make sure the GT is first
if (sawGoodGT) {
final List<String> newList = new ArrayList<String>(sortedList.size()+1);
final List<String> newList = new ArrayList<>(sortedList.size() + 1);
newList.add(VCFConstants.GENOTYPE_KEY);
newList.addAll(sortedList);
sortedList = newList;
Expand Down Expand Up @@ -434,7 +433,7 @@ public VariantContext subContextFromSamples(Set<String> sampleNames, final boole
Set<Allele> allelesFromGenotypes = allelesOfGenotypes(newGenotypes);

// ensure original order of genotypes
List<Allele> rederivedAlleles = new ArrayList<Allele>(allelesFromGenotypes.size());
List<Allele> rederivedAlleles = new ArrayList<>(allelesFromGenotypes.size());
for (Allele allele : alleles)
if (allelesFromGenotypes.contains(allele))
rederivedAlleles.add(allele);
Expand Down Expand Up @@ -469,7 +468,7 @@ public VariantContext subContextFromSample(String sampleName) {
* @return allele set
*/
private final Set<Allele> allelesOfGenotypes(Collection<Genotype> genotypes) {
final Set<Allele> alleles = new HashSet<Allele>();
final Set<Allele> alleles = new HashSet<>();

boolean addedref = false;
for ( final Genotype g : genotypes ) {
Expand Down Expand Up @@ -863,7 +862,7 @@ public List<Integer> getIndelLengths() {
return null;
}

List<Integer> lengths = new ArrayList<Integer>();
List<Integer> lengths = new ArrayList<>();
for ( Allele a : getAlternateAlleles() ) {
lengths.add(a.length() - getReference().length());
}
Expand Down Expand Up @@ -973,7 +972,7 @@ public GenotypesContext getGenotypes(String sampleName) {
* @throws IllegalArgumentException if sampleName isn't bound to a genotype
*/
protected GenotypesContext getGenotypes(Collection<String> sampleNames) {
return getGenotypes().subsetToSamples(new HashSet<String>(sampleNames));
return getGenotypes().subsetToSamples(new HashSet<>(sampleNames));
}

public GenotypesContext getGenotypes(Set<String> sampleNames) {
Expand Down Expand Up @@ -1049,7 +1048,7 @@ public int getCalledChrCount(Set<String> sampleIds) {
* @return chromosome count
*/
public int getCalledChrCount(Allele a) {
return getCalledChrCount(a,new HashSet<String>(0));
return getCalledChrCount(a, new HashSet<>(0));
}

/**
Expand Down Expand Up @@ -1162,7 +1161,7 @@ public int getMixedCount() {
* Run all extra-strict validation tests on a Variant Context object
*
* @param reportedReference the reported reference allele
* @param observedReference the actual reference allele
* @param observedReference the observed reference allele
* @param rsIDs the true dbSNP IDs
*/
public void extraStrictValidation(final Allele reportedReference, final Allele observedReference, final Set<String> rsIDs) {
Expand All @@ -1172,7 +1171,7 @@ public void extraStrictValidation(final Allele reportedReference, final Allele o
// validate the RS IDs
validateRSIDs(rsIDs);

// validate the altenate alleles
// validate the alternate alleles
validateAlternateAlleles();

// validate the AN and AC fields
Expand Down Expand Up @@ -1201,7 +1200,7 @@ public void validateAlternateAlleles() {
if ( !hasGenotypes() )
return;

// maintain a list of non-symbolic alleles reported in the REF and ALT fields of the record
// maintain a list of non-symbolic alleles expected in the REF and ALT fields of the record
// (we exclude symbolic alleles because it's commonly expected that they don't show up in the genotypes, e.g. with GATK gVCFs)
final List<Allele> reportedAlleles = new ArrayList<Allele>();
for ( final Allele allele : getAlleles() ) {
Expand All @@ -1210,7 +1209,7 @@ public void validateAlternateAlleles() {
}

// maintain a list of non-symbolic alleles observed in the genotypes
final Set<Allele> observedAlleles = new HashSet<Allele>();
final Set<Allele> observedAlleles = new HashSet<>();
observedAlleles.add(getReference());
for ( final Genotype g : getGenotypes() ) {
if ( g.isCalled() ) {
Expand All @@ -1233,24 +1232,39 @@ public void validateAlternateAlleles() {
throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getContig(), getStart()));
}

private void validateAttributeIsExpectedSize(final String attributeKey, final int numAlternateAlleles ) {
final List<Object> actualValues = getAttributeAsList(attributeKey);
if (!actualValues.isEmpty()) {
// always have at least one actual value
final int expectedValuesSize = numAlternateAlleles > 0 ? numAlternateAlleles : 1;
if (actualValues.size() != expectedValuesSize) {
throw new TribbleException.InternalCodecException(String.format("the %s tag has the incorrect number of records at position %s:%d, %d vs. %d", attributeKey, getContig(), getStart(), actualValues.size(), expectedValuesSize));
}
}
}

public void validateChromosomeCounts() {
final int numberOfAlternateAlleles = alleles.size() - 1;
validateAttributeIsExpectedSize(VCFConstants.ALLELE_COUNT_KEY, numberOfAlternateAlleles);
validateAttributeIsExpectedSize(VCFConstants.ALLELE_FREQUENCY_KEY, numberOfAlternateAlleles);

if ( !hasGenotypes() )
return;

// AN
if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) {
int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString());
int observedAN = getCalledChrCount();
final int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString());
final int observedAN = getCalledChrCount();
if ( reportedAN != observedAN )
throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedAN, observedAN));
}

// AC
if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
ArrayList<Integer> observedACs = new ArrayList<Integer>();
final ArrayList<Integer> observedACs = new ArrayList<>();

// if there are alternate alleles, record the relevant tags
if (!getAlternateAlleles().isEmpty()) {
if ( numberOfAlternateAlleles > 0 ) {
for ( Allele allele : getAlternateAlleles() ) {
observedACs.add(getCalledChrCount(allele));
}
Expand All @@ -1259,22 +1273,13 @@ public void validateChromosomeCounts() {
observedACs.add(0);
}

if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) {
final List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY);
if ( observedACs.size() != reportedACs.size() )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have the correct number of values for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedACs.size(), observedACs.size()));
for (int i = 0; i < observedACs.size(); i++) {
// need to cast to int to make sure we don't have an issue below with object equals (earlier bug) - EB
final int reportedAC = Integer.valueOf(reportedACs.get(i).toString());
if ( reportedAC != observedACs.get(i) )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getContig(), getStart(), reportedAC, observedACs.get(i)));
}
} else {
if ( observedACs.size() != 1 )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have enough values for the record at position %s:%d", getContig(), getStart()));
int reportedAC = Integer.valueOf(getAttribute(VCFConstants.ALLELE_COUNT_KEY).toString());
if ( reportedAC != observedACs.get(0) )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedAC, observedACs.get(0)));
final List<Object> reportedACs = getAttributeAsList(VCFConstants.ALLELE_COUNT_KEY);

for (int i = 0; i < observedACs.size(); i++) {
// need to cast to int to make sure we don't have an issue below with object equals (earlier bug) - EB
final int reportedAC = Integer.valueOf(reportedACs.get(i).toString());
if ( reportedAC != observedACs.get(i) )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getContig(), getStart(), reportedAC, observedACs.get(i)));
}
}
}
Expand Down Expand Up @@ -1473,7 +1478,7 @@ public String toStringWithoutGenotypes() {

// protected basic manipulation routines
private static List<Allele> makeAlleles(Collection<Allele> alleles) {
final List<Allele> alleleList = new ArrayList<Allele>(alleles.size());
final List<Allele> alleleList = new ArrayList<>(alleles.size());

boolean sawRef = false;
for ( final Allele a : alleles ) {
Expand Down Expand Up @@ -1544,7 +1549,7 @@ private final void fullyDecodeInfo(final VariantContextBuilder builder, final VC
private final Map<String, Object> fullyDecodeAttributes(final Map<String, Object> attributes,
final VCFHeader header,
final boolean lenientDecoding) {
final Map<String, Object> newAttributes = new HashMap<String, Object>(10);
final Map<String, Object> newAttributes = new HashMap<>(10);

for ( final Map.Entry<String, Object> attr : attributes.entrySet() ) {
final String field = attr.getKey();
Expand Down Expand Up @@ -1582,7 +1587,7 @@ private final Object decodeValue(final String field, final Object value, final V
final String string = (String)value;
if ( string.indexOf(',') != -1 ) {
final String[] splits = string.split(",");
final List<Object> values = new ArrayList<Object>(splits.length);
final List<Object> values = new ArrayList<>(splits.length);
for ( int i = 0; i < splits.length; i++ )
values.add(decodeOne(field, splits[i], format));
return values;
Expand All @@ -1591,7 +1596,7 @@ private final Object decodeValue(final String field, final Object value, final V
}
} else if ( value instanceof List && (((List) value).get(0)) instanceof String ) {
final List<String> asList = (List<String>)value;
final List<Object> values = new ArrayList<Object>(asList.size());
final List<Object> values = new ArrayList<>(asList.size());
for ( final String s : asList )
values.add(decodeOne(field, s, format));
return values;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1259,13 +1259,34 @@ public Object[][] testValidateChromosomeCountsDataProvider() {
final VariantContext vcACSetTwoAlts =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACTwoAlts, hetVarTC);

// with AC set, and two different ALTs (T and C), with no GT, we expect a 2 count values.
final Map<String, Object> attributesACNoGtTwoAlts = new HashMap<String, Object>();
attributesACNoGtTwoAlts.put(VCFConstants.ALLELE_COUNT_KEY, Arrays.asList("1", "1"));
final VariantContext vcACNoGtSetTwoAlts =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACNoGtTwoAlts, null);

// with AF set, and two different ALTs (T and C), with GT of 1/2, we expect two frequncy values.
// With two ALTs, a list is expected, so we set the attribute as a list of 0.5,0.5
final Map<String, Object> attributesAFTwoAlts = new HashMap<String, Object>();
attributesAFTwoAlts.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5", "0.5"));
final VariantContext vcAFSetTwoAlts =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFTwoAlts, hetVarTC);

// with AF set, and two different ALTs (T and C), with no GT, we expect two frequency values.
final Map<String, Object> attributesAFNoGtTwoAlts = new HashMap<String, Object>();
attributesAFNoGtTwoAlts.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5", "0.5"));
final VariantContext vcAFNoGtSetTwoAlts =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFNoGtTwoAlts, null);

return new Object[][]{
{vcNoGenotypes},
{vcANSet},
{vcANSetNoCall},
{vcACSet},
{vcACSetNoAlts},
{vcACSetTwoAlts}
{vcACNoGtSetTwoAlts},
{vcAFSetTwoAlts},
{vcAFNoGtSetTwoAlts}
};
}
@Test(dataProvider = "testValidateChromosomeCountsDataProvider")
Expand Down Expand Up @@ -1319,13 +1340,34 @@ public Object[][] testValidateChromosomeCountsFailureDataProvider() {
final VariantContext vcACSetTwoAltsOneAltCount =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACTwoAltsOneAltCount, hetVarTC);

// with AC set, no GT, two ALTs, but only count for one ALT (we expect two items in the list: 1,1)
final Map<String, Object> attributesACNoGtTwoAltsOneAltCount = new HashMap<String, Object>();
attributesACNoGtTwoAltsOneAltCount.put(VCFConstants.ALLELE_COUNT_KEY, Arrays.asList("1"));
final VariantContext vcACNoGtSetTwoAltsOneAltCount =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACNoGtTwoAltsOneAltCount, null);

// with AF set, two ALTs, but only frequency for one ALT (we expect two items in the list
final Map<String, Object> attributesAFTwoAltsWrongFreq = new HashMap<String, Object>();
attributesAFTwoAltsWrongFreq.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5"));
final VariantContext vcAFSetTwoAltsWrongFreq =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFTwoAltsWrongFreq, hetVarTC);

// with AF set, no GT, two ALTs, but only frequency for one ALT (we expect two items in the list
final Map<String, Object> attributesAFNoGtTwoAltsWrongCount = new HashMap<String, Object>();
attributesAFNoGtTwoAltsWrongCount.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5"));
final VariantContext vcAFNoGtSetTwoAltsWrongFreq =
createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFNoGtTwoAltsWrongCount, null);

return new Object[][]{
{vcANSet},
{vcANSetNoCall},
{vcACWrongCount},
{vcACSetTwoAlts},
{vcACSetTwoAltsWrongCount},
{vcACSetTwoAltsOneAltCount}
{vcACSetTwoAltsOneAltCount},
{vcACNoGtSetTwoAltsOneAltCount},
{vcAFSetTwoAltsWrongFreq},
{vcAFNoGtSetTwoAltsWrongFreq}
};
}
@Test(dataProvider = "testValidateChromosomeCountsFailureDataProvider", expectedExceptions = TribbleException.class)
Expand Down

0 comments on commit e69aff0

Please sign in to comment.