Skip to content

Commit

Permalink
Retain all source IDs on VariantContext (#9032)
Browse files Browse the repository at this point in the history
* Retain all source IDs on VariantContext merge

In GATK3, when merging variants, the IDs of all the source VCFs were retained. This code path seems like it intended that, since the variantSources set is generated, but it doesnt get used for anything. This PR will use that set to set the source of the resulting merged VC.

* Add overloaded method to GATKVariantContextUtils.simpleMerge to preserve existing behavior when storeAllVcfSources=false


Co-authored-by: Louis Bergelson <[email protected]>
  • Loading branch information
bbimber and lbergelson authored Nov 13, 2024
1 parent 3fe011a commit 18707c6
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,14 @@
import java.nio.file.Path;
import java.util.*;
import java.util.function.BiFunction;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public final class GATKVariantContextUtils {

/** maximum number of sources to include when merging sources */
private static final int MAX_SOURCES_TO_INCLUDE = 10;
private static final Logger logger = LogManager.getLogger(GATKVariantContextUtils.class);

public static final String MERGE_FILTER_PREFIX = "filterIn";
Expand Down Expand Up @@ -1096,31 +1099,46 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
final GenotypeMergeType genotypeMergeOptions,
final boolean filteredAreUncalled) {
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled);
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, false, -1);
}

public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
final List<String> priorityListOfVCs,
final FilteredRecordMergeType filteredRecordMergeType,
final GenotypeMergeType genotypeMergeOptions,
final boolean filteredAreUncalled,
final boolean storeAllVcfSources,
final int maxSourceFieldLength) {
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, storeAllVcfSources, maxSourceFieldLength);
}

/**
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
* the sample name.
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
* SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
*
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
*
* @param unsortedVCs collection of unsorted VCs
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
* @param filteredRecordMergeType merge type for filtered records
* @param genotypeMergeOptions merge option for genotypes
* @param filteredAreUncalled are filtered records uncalled?
* @return new VariantContext representing the merge of unsortedVCs
*/
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
* the sample name.
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
* SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
*
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
*
* @param unsortedVCs collection of unsorted VCs
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
* @param filteredRecordMergeType merge type for filtered records
* @param genotypeMergeOptions merge option for genotypes
* @param filteredAreUncalled are filtered records uncalled?
* @param storeAllVcfSources if true, the sources of all VCs where isVariable()=true will be concatenated in the output VC's source field. If false, the source of the first VC will be used. This mirror's GATK3's behavior
* @param maxSourceFieldLength This can be used to enforce a maximum length for the value of the source field (primarily useful if storeAllVcfSources=true). Set to -1 for unlimited
* @return new VariantContext representing the merge of unsortedVCs
*/
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
final List<String> priorityListOfVCs,
final int originalNumOfVCs,
final FilteredRecordMergeType filteredRecordMergeType,
final GenotypeMergeType genotypeMergeOptions,
final boolean filteredAreUncalled) {
final boolean filteredAreUncalled,
final boolean storeAllVcfSources,
final int maxSourceFieldLength) {
if ( unsortedVCs == null || unsortedVCs.isEmpty() )
return null;

Expand Down Expand Up @@ -1165,7 +1183,7 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
longestVC = vc; // get the longest location

nFiltered += vc.isFiltered() ? 1 : 0;
if ( vc.isVariant() ) variantSources.add(vc.getSource());
if ( storeAllVcfSources && vc.isVariant() ) variantSources.add(vc.getSource());

AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc);

Expand Down Expand Up @@ -1236,7 +1254,19 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort

final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);

final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
// This preserves the GATK3-like behavior of reporting multiple sources, delimited with hyphen:
// NOTE: if storeAllVcfSources is false, variantSources will be empty and therefore no sorting is performed
String allSources = variantSources.isEmpty() ? name : variantSources.stream()
.sorted()
.distinct()
.limit(MAX_SOURCES_TO_INCLUDE)
.collect(Collectors.joining("-"));

if (maxSourceFieldLength != -1 && allSources.length() > maxSourceFieldLength) {
allSources = allSources.substring(0, maxSourceFieldLength);
}

final VariantContextBuilder builder = new VariantContextBuilder().source(allSources).id(ID);
builder.loc(longestVC.getContig(), longestVC.getStart(), longestVC.getEnd());
builder.alleles(alleles);
builder.genotypes(genotypes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,27 @@ public void testMergeGenotypesUniquify() {
Assert.assertEquals(merged.getSampleNames(), new LinkedHashSet<>(Arrays.asList("s1.1", "s1.2")));
}

@Test
public void testMergeSourceIDs() {
final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1));
final VariantContext vc1WithSource= new VariantContextBuilder(vc1).source("source1").make();
final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2));
final VariantContext vc2WithSource = new VariantContextBuilder(vc2).source("source2").make();
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
Arrays.asList(vc1WithSource, vc2WithSource), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, true, -1);

// test sources are merged
Assert.assertEquals(merged.getSource(), "source1-source2");

final VariantContext mergedTruncated = GATKVariantContextUtils.simpleMerge(
Arrays.asList(vc1WithSource, vc2WithSource), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, true, 2);

// test sources are merged and respects maxSourceFieldLength
Assert.assertEquals(mergedTruncated.getSource(), "so");
}

// TODO: remove after testing
// @Test(expectedExceptions = IllegalStateException.class)
// public void testMergeGenotypesRequireUnique() {
Expand Down

0 comments on commit 18707c6

Please sign in to comment.