From 7e9188621af8c2214071716e8b5bc69895ba2530 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Sat, 26 Nov 2016 21:32:13 -0500 Subject: [PATCH] Validate VariantContext AC and AF without genotypes --- .../variantcontext/VariantContext.java | 79 ++++++++++--------- .../VariantContextUnitTest.java | 46 ++++++++++- 2 files changed, 86 insertions(+), 39 deletions(-) diff --git a/src/main/java/htsjdk/variant/variantcontext/VariantContext.java b/src/main/java/htsjdk/variant/variantcontext/VariantContext.java index a6b8a5fce5..55825fb4d7 100644 --- a/src/main/java/htsjdk/variant/variantcontext/VariantContext.java +++ b/src/main/java/htsjdk/variant/variantcontext/VariantContext.java @@ -25,7 +25,6 @@ package htsjdk.variant.variantcontext; -import htsjdk.samtools.util.Tuple; import htsjdk.tribble.Feature; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.ParsingUtils; @@ -263,7 +262,7 @@ public class VariantContext implements Feature, Serializable { * @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first */ public List calcVCFGenotypeKeys(final VCFHeader header) { - final Set keys = new HashSet(); + final Set keys = new HashSet<>(); boolean sawGoodGT = false; boolean sawGoodQual = false; @@ -287,11 +286,11 @@ public List calcVCFGenotypeKeys(final VCFHeader header) { if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY); if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY); - List sortedList = ParsingUtils.sortList(new ArrayList(keys)); + List sortedList = ParsingUtils.sortList(new ArrayList<>(keys)); // make sure the GT is first if (sawGoodGT) { - final List newList = new ArrayList(sortedList.size()+1); + final List newList = new ArrayList<>(sortedList.size() + 1); newList.add(VCFConstants.GENOTYPE_KEY); newList.addAll(sortedList); sortedList = newList; @@ -434,7 +433,7 @@ public VariantContext subContextFromSamples(Set sampleNames, final boole Set allelesFromGenotypes = allelesOfGenotypes(newGenotypes); // ensure original order of genotypes - List rederivedAlleles = new ArrayList(allelesFromGenotypes.size()); + List rederivedAlleles = new ArrayList<>(allelesFromGenotypes.size()); for (Allele allele : alleles) if (allelesFromGenotypes.contains(allele)) rederivedAlleles.add(allele); @@ -469,7 +468,7 @@ public VariantContext subContextFromSample(String sampleName) { * @return allele set */ private final Set allelesOfGenotypes(Collection genotypes) { - final Set alleles = new HashSet(); + final Set alleles = new HashSet<>(); boolean addedref = false; for ( final Genotype g : genotypes ) { @@ -863,7 +862,7 @@ public List getIndelLengths() { return null; } - List lengths = new ArrayList(); + List lengths = new ArrayList<>(); for ( Allele a : getAlternateAlleles() ) { lengths.add(a.length() - getReference().length()); } @@ -973,7 +972,7 @@ public GenotypesContext getGenotypes(String sampleName) { * @throws IllegalArgumentException if sampleName isn't bound to a genotype */ protected GenotypesContext getGenotypes(Collection sampleNames) { - return getGenotypes().subsetToSamples(new HashSet(sampleNames)); + return getGenotypes().subsetToSamples(new HashSet<>(sampleNames)); } public GenotypesContext getGenotypes(Set sampleNames) { @@ -1049,7 +1048,7 @@ public int getCalledChrCount(Set sampleIds) { * @return chromosome count */ public int getCalledChrCount(Allele a) { - return getCalledChrCount(a,new HashSet(0)); + return getCalledChrCount(a, new HashSet<>(0)); } /** @@ -1162,7 +1161,7 @@ public int getMixedCount() { * Run all extra-strict validation tests on a Variant Context object * * @param reportedReference the reported reference allele - * @param observedReference the actual reference allele + * @param observedReference the observed reference allele * @param rsIDs the true dbSNP IDs */ public void extraStrictValidation(final Allele reportedReference, final Allele observedReference, final Set rsIDs) { @@ -1172,7 +1171,7 @@ public void extraStrictValidation(final Allele reportedReference, final Allele o // validate the RS IDs validateRSIDs(rsIDs); - // validate the altenate alleles + // validate the alternate alleles validateAlternateAlleles(); // validate the AN and AC fields @@ -1201,7 +1200,7 @@ public void validateAlternateAlleles() { if ( !hasGenotypes() ) return; - // maintain a list of non-symbolic alleles reported in the REF and ALT fields of the record + // maintain a list of non-symbolic alleles expected in the REF and ALT fields of the record // (we exclude symbolic alleles because it's commonly expected that they don't show up in the genotypes, e.g. with GATK gVCFs) final List reportedAlleles = new ArrayList(); for ( final Allele allele : getAlleles() ) { @@ -1210,7 +1209,7 @@ public void validateAlternateAlleles() { } // maintain a list of non-symbolic alleles observed in the genotypes - final Set observedAlleles = new HashSet(); + final Set observedAlleles = new HashSet<>(); observedAlleles.add(getReference()); for ( final Genotype g : getGenotypes() ) { if ( g.isCalled() ) { @@ -1233,24 +1232,39 @@ public void validateAlternateAlleles() { throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getContig(), getStart())); } + private void validateAttributeIsExpectedSize(final String attributeKey, final int numAlternateAlleles ) { + final List actualValues = getAttributeAsList(attributeKey); + if (!actualValues.isEmpty()) { + // always have at least one actual value + final int expectedValuesSize = numAlternateAlleles > 0 ? numAlternateAlleles : 1; + if (actualValues.size() != expectedValuesSize) { + throw new TribbleException.InternalCodecException(String.format("the %s tag has the incorrect number of records at position %s:%d, %d vs. %d", attributeKey, getContig(), getStart(), actualValues.size(), expectedValuesSize)); + } + } + } + public void validateChromosomeCounts() { + final int numberOfAlternateAlleles = alleles.size() - 1; + validateAttributeIsExpectedSize(VCFConstants.ALLELE_COUNT_KEY, numberOfAlternateAlleles); + validateAttributeIsExpectedSize(VCFConstants.ALLELE_FREQUENCY_KEY, numberOfAlternateAlleles); + if ( !hasGenotypes() ) return; // AN if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { - int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString()); - int observedAN = getCalledChrCount(); + final int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString()); + final int observedAN = getCalledChrCount(); if ( reportedAN != observedAN ) throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedAN, observedAN)); } // AC if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { - ArrayList observedACs = new ArrayList(); + final ArrayList observedACs = new ArrayList<>(); // if there are alternate alleles, record the relevant tags - if (!getAlternateAlleles().isEmpty()) { + if ( numberOfAlternateAlleles > 0 ) { for ( Allele allele : getAlternateAlleles() ) { observedACs.add(getCalledChrCount(allele)); } @@ -1259,22 +1273,13 @@ public void validateChromosomeCounts() { observedACs.add(0); } - if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) { - final List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY); - if ( observedACs.size() != reportedACs.size() ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have the correct number of values for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedACs.size(), observedACs.size())); - for (int i = 0; i < observedACs.size(); i++) { - // need to cast to int to make sure we don't have an issue below with object equals (earlier bug) - EB - final int reportedAC = Integer.valueOf(reportedACs.get(i).toString()); - if ( reportedAC != observedACs.get(i) ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getContig(), getStart(), reportedAC, observedACs.get(i))); - } - } else { - if ( observedACs.size() != 1 ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have enough values for the record at position %s:%d", getContig(), getStart())); - int reportedAC = Integer.valueOf(getAttribute(VCFConstants.ALLELE_COUNT_KEY).toString()); - if ( reportedAC != observedACs.get(0) ) - throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getContig(), getStart(), reportedAC, observedACs.get(0))); + final List reportedACs = getAttributeAsList(VCFConstants.ALLELE_COUNT_KEY); + + for (int i = 0; i < observedACs.size(); i++) { + // need to cast to int to make sure we don't have an issue below with object equals (earlier bug) - EB + final int reportedAC = Integer.valueOf(reportedACs.get(i).toString()); + if ( reportedAC != observedACs.get(i) ) + throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getContig(), getStart(), reportedAC, observedACs.get(i))); } } } @@ -1473,7 +1478,7 @@ public String toStringWithoutGenotypes() { // protected basic manipulation routines private static List makeAlleles(Collection alleles) { - final List alleleList = new ArrayList(alleles.size()); + final List alleleList = new ArrayList<>(alleles.size()); boolean sawRef = false; for ( final Allele a : alleles ) { @@ -1544,7 +1549,7 @@ private final void fullyDecodeInfo(final VariantContextBuilder builder, final VC private final Map fullyDecodeAttributes(final Map attributes, final VCFHeader header, final boolean lenientDecoding) { - final Map newAttributes = new HashMap(10); + final Map newAttributes = new HashMap<>(10); for ( final Map.Entry attr : attributes.entrySet() ) { final String field = attr.getKey(); @@ -1582,7 +1587,7 @@ private final Object decodeValue(final String field, final Object value, final V final String string = (String)value; if ( string.indexOf(',') != -1 ) { final String[] splits = string.split(","); - final List values = new ArrayList(splits.length); + final List values = new ArrayList<>(splits.length); for ( int i = 0; i < splits.length; i++ ) values.add(decodeOne(field, splits[i], format)); return values; @@ -1591,7 +1596,7 @@ private final Object decodeValue(final String field, final Object value, final V } } else if ( value instanceof List && (((List) value).get(0)) instanceof String ) { final List asList = (List)value; - final List values = new ArrayList(asList.size()); + final List values = new ArrayList<>(asList.size()); for ( final String s : asList ) values.add(decodeOne(field, s, format)); return values; diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java index 2dc345fedb..14056f833b 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextUnitTest.java @@ -1259,13 +1259,34 @@ public Object[][] testValidateChromosomeCountsDataProvider() { final VariantContext vcACSetTwoAlts = createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACTwoAlts, hetVarTC); + // with AC set, and two different ALTs (T and C), with no GT, we expect a 2 count values. + final Map attributesACNoGtTwoAlts = new HashMap(); + attributesACNoGtTwoAlts.put(VCFConstants.ALLELE_COUNT_KEY, Arrays.asList("1", "1")); + final VariantContext vcACNoGtSetTwoAlts = + createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACNoGtTwoAlts, null); + + // with AF set, and two different ALTs (T and C), with GT of 1/2, we expect two frequncy values. + // With two ALTs, a list is expected, so we set the attribute as a list of 0.5,0.5 + final Map attributesAFTwoAlts = new HashMap(); + attributesAFTwoAlts.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5", "0.5")); + final VariantContext vcAFSetTwoAlts = + createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFTwoAlts, hetVarTC); + + // with AF set, and two different ALTs (T and C), with no GT, we expect two frequency values. + final Map attributesAFNoGtTwoAlts = new HashMap(); + attributesAFNoGtTwoAlts.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5", "0.5")); + final VariantContext vcAFNoGtSetTwoAlts = + createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFNoGtTwoAlts, null); + return new Object[][]{ {vcNoGenotypes}, {vcANSet}, {vcANSetNoCall}, {vcACSet}, {vcACSetNoAlts}, - {vcACSetTwoAlts} + {vcACNoGtSetTwoAlts}, + {vcAFSetTwoAlts}, + {vcAFNoGtSetTwoAlts} }; } @Test(dataProvider = "testValidateChromosomeCountsDataProvider") @@ -1319,13 +1340,34 @@ public Object[][] testValidateChromosomeCountsFailureDataProvider() { final VariantContext vcACSetTwoAltsOneAltCount = createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACTwoAltsOneAltCount, hetVarTC); + // with AC set, no GT, two ALTs, but only count for one ALT (we expect two items in the list: 1,1) + final Map attributesACNoGtTwoAltsOneAltCount = new HashMap(); + attributesACNoGtTwoAltsOneAltCount.put(VCFConstants.ALLELE_COUNT_KEY, Arrays.asList("1")); + final VariantContext vcACNoGtSetTwoAltsOneAltCount = + createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesACNoGtTwoAltsOneAltCount, null); + + // with AF set, two ALTs, but only frequency for one ALT (we expect two items in the list + final Map attributesAFTwoAltsWrongFreq = new HashMap(); + attributesAFTwoAltsWrongFreq.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5")); + final VariantContext vcAFSetTwoAltsWrongFreq = + createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFTwoAltsWrongFreq, hetVarTC); + + // with AF set, no GT, two ALTs, but only frequency for one ALT (we expect two items in the list + final Map attributesAFNoGtTwoAltsWrongCount = new HashMap(); + attributesAFNoGtTwoAltsWrongCount.put(VCFConstants.ALLELE_FREQUENCY_KEY, Arrays.asList("0.5")); + final VariantContext vcAFNoGtSetTwoAltsWrongFreq = + createValidateChromosomeCountsContext(Arrays.asList(Aref, T, C), attributesAFNoGtTwoAltsWrongCount, null); + return new Object[][]{ {vcANSet}, {vcANSetNoCall}, {vcACWrongCount}, {vcACSetTwoAlts}, {vcACSetTwoAltsWrongCount}, - {vcACSetTwoAltsOneAltCount} + {vcACSetTwoAltsOneAltCount}, + {vcACNoGtSetTwoAltsOneAltCount}, + {vcAFSetTwoAltsWrongFreq}, + {vcAFNoGtSetTwoAltsWrongFreq} }; } @Test(dataProvider = "testValidateChromosomeCountsFailureDataProvider", expectedExceptions = TribbleException.class)