Skip to content

Commit

Permalink
- Adding a filtering iterator on variantContexts and
Browse files Browse the repository at this point in the history
- an implementation of a filter that looks for het sites.
- and a function to find the offset into a read from genomic position
- small change in behavior of new function (optionally let the position of a deletion be the last aligned base.
- added tests for new and old function
- added more tests.
- changed functionality so that getGenotype(int) returns null (rather than throws an OOB exception)
- responded to review comments

fixed documentation

- cleaned up some documentation issues that were making the log to long for travis

- reduced verbosity in TestNG
  • Loading branch information
yfarjoun committed Aug 25, 2015
1 parent a84b5b7 commit 2db9ee2
Show file tree
Hide file tree
Showing 12 changed files with 660 additions and 35 deletions.
2 changes: 1 addition & 1 deletion build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
<property name="repository.revision" value=""/>
<property name="htsjdk-version" value="1.138"/>
<property name="htsjdk-version-file" value="htsjdk.version.properties"/>
<property name="testng.verbosity" value="2"/>
<property name="testng.verbosity" value="1"/>
<property name="test.debug.port" value="5005" /> <!-- override on the command line if desired -->

<condition property="isUnix">
Expand Down
100 changes: 91 additions & 9 deletions src/java/htsjdk/samtools/SAMRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -478,21 +478,32 @@ public int getUnclippedEnd() {


/**
* @param offset 1-based location within the unclipped sequence or 0 if there is no position.
* <p/>
* Non static version of the static function with the same name.
* @return 1-based inclusive reference position of the unclipped sequence at a given offset,
* or 0 if there is no position.
* For example, given the sequence NNNAAACCCGGG, cigar 3S9M, and an alignment start of 1,
* and a (1-based)offset 10 (start of GGG) it returns 7 (1-based offset starting after the soft clip.
* For example: given the sequence AAACCCGGGTTT, cigar 4M1D6M, an alignment start of 1,
* an offset of 4 returns reference position 4, an offset of 5 returns reference position 6.
* Another example: given the sequence AAACCCGGGTTT, cigar 4M1I6M, an alignment start of 1,
* an offset of 4 returns reference position 4, an offset of 5 returns 0.
* @offset 1-based location within the unclipped sequence
*/
public int getReferencePositionAtReadPosition(final int offset) {
return getReferencePositionAtReadPosition(this, offset);
}

/**
* @param rec record to use
* @param offset 1-based location within the unclipped sequence
* @return 1-based inclusive reference position of the unclipped sequence at a given offset,
* or 0 if there is no position.
* For example, given the sequence NNNAAACCCGGG, cigar 3S9M, and an alignment start of 1,
* and a (1-based)offset 10 (start of GGG) it returns 7 (1-based offset starting after the soft clip.
* For example: given the sequence AAACCCGGGTTT, cigar 4M1D6M, an alignment start of 1,
* an offset of 4 returns reference position 4, an offset of 5 returns reference position 6.
* Another example: given the sequence AAACCCGGGTTT, cigar 4M1I6M, an alignment start of 1,
* an offset of 4 returns reference position 4, an offset of 5 returns 0.
*/
public static int getReferencePositionAtReadPosition(final SAMRecord rec, final int offset) {

if (offset == 0) return 0;

for (final AlignmentBlock alignmentBlock : getAlignmentBlocks()) {
for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) {
if (CoordMath.getEnd(alignmentBlock.getReadStart(), alignmentBlock.getLength()) < offset) {
continue;
} else if (offset < alignmentBlock.getReadStart()) {
Expand All @@ -504,6 +515,77 @@ public int getReferencePositionAtReadPosition(final int offset) {
return 0; // offset not located in an alignment block
}


/**
* @param pos 1-based reference position
* return the offset
* @return 1-based (to match getReferencePositionAtReadPosition behavior) inclusive position into the
* unclipped sequence at a given reference position, or 0 if there is no such position.
*
* See examples in the static version below
*/
public int getReadPositionAtReferencePosition(final int pos) {
return getReadPositionAtReferencePosition(this, pos, false);
}

/**
* @param pos 1-based reference position
* @param returnLastBaseIfDeleted if positive, and reference position matches a deleted base in the read, function will
* return the offset
* @return 1-based (to match getReferencePositionAtReadPosition behavior) inclusive position into the
* unclipped sequence at a given reference position,
* or 0 if there is no such position. If returnLastBaseIfDeleted is true deletions are assumed to "live" on the last read base
* in the preceding block.
*
* Non-static version of static function with the same name. See examples below.
*/
public int getReadPositionAtReferencePosition(final int pos, final boolean returnLastBaseIfDeleted) {
return getReadPositionAtReferencePosition(this, pos, returnLastBaseIfDeleted);
}

/**
* @param rec record to use
* @param pos 1-based reference position
* @param returnLastBaseIfDeleted if positive, and reference position matches a deleted base in the read, function will
* return the offset
* @return 1-based (to match getReferencePositionAtReadPosition behavior) inclusive position into the
* unclipped sequence at a given reference position,
* or 0 if there is no such position. If returnLastBaseIfDeleted is true deletions are assumed to "live" on the last read base
* in the preceding block.
* For example, given the sequence NNNAAACCCGGG, cigar 3S9M, and an alignment start of 1,
* and a (1-based)pos of 7 (start of GGG) it returns 10 (1-based offset including the soft clip.
* For example: given the sequence AAACCCGGGT, cigar 4M1D6M, an alignment start of 1,
* a reference position of 4 returns offset of 4, a reference of 5 also returns an offset 4 (using "left aligning") if returnLastBaseIfDeleted
* and 0 otherwise.
* For example: given the sequence AAACtCGGGTT, cigar 4M1I6M, an alignment start of 1,
* a position 4 returns an offset 5, a position of 5 returns 6 (the inserted base is the 5th offset), a position of 11 returns 0 since
* that position in the reference doesn't overlap the read at all.
*
*/
public static int getReadPositionAtReferencePosition(final SAMRecord rec, final int pos, final boolean returnLastBaseIfDeleted) {

if (pos <= 0) {
return 0;
}

int lastAlignmentOffset = 0;
for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) {
if (CoordMath.getEnd(alignmentBlock.getReferenceStart(), alignmentBlock.getLength()) >= pos) {
if (pos < alignmentBlock.getReferenceStart()) {
//There must have been a deletion block that skipped
return returnLastBaseIfDeleted ? lastAlignmentOffset : 0;
} else {
return pos - alignmentBlock.getReferenceStart() + alignmentBlock.getReadStart() ;
}
} else {
// record the offset to the last base in the current block, in case the next block starts too late
lastAlignmentOffset = alignmentBlock.getReadStart() + alignmentBlock.getLength() - 1 ;
}
}
// if we are here, the reference position was not overlapping the read at all
return 0;
}

/**
* @return 1-based inclusive leftmost position of the clipped mate sequence, or 0 if there is no position.
*/
Expand Down
2 changes: 1 addition & 1 deletion src/java/htsjdk/samtools/filter/FilteringIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ private SAMRecord getNextRecord() {
}
} else if (filterReadPairs && record.getReadPairedFlag() &&
record.getSecondOfPairFlag()) {
// assume that we did a filterOut(first, second) and it passed the filter
// assume that we did a pass(first, second) and it passed the filter
return record;
} else if (!filter.filterOut(record)) {
return record;
Expand Down
8 changes: 6 additions & 2 deletions src/java/htsjdk/variant/variantcontext/VariantContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -1011,11 +1011,15 @@ public boolean hasGenotype(String sample) {
return getGenotypes().containsSample(sample);
}

/**
* @param ith the sample index
*
* @return the ith genotype in this context or null if there aren't that many genotypes
*/
public Genotype getGenotype(int ith) {
return genotypes.get(ith);
return genotypes.size() > ith ? genotypes.get(ith) : null;
}


/**
* Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS)
*
Expand Down
125 changes: 125 additions & 0 deletions src/java/htsjdk/variant/variantcontext/filter/FilteringIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package htsjdk.variant.variantcontext.filter;

import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.variant.variantcontext.VariantContext;

import java.util.Iterator;
import java.util.NoSuchElementException;

/**
* A filtering iterator for VariantContexts that takes a base iterator and a VariantContextFilter.
*
* The iterator returns all the variantcontexts for which the filter's function "pass" returns true (and only those)
*/
public class FilteringIterator implements CloseableIterator<VariantContext>, Iterable<VariantContext>{
private final Iterator<VariantContext> iterator;
private final VariantContextFilter filter;
private VariantContext next = null;

/**
* Constructor of an iterator based on the provided iterator and predicate. The resulting
* records will be all those VariantContexts from iterator for which filter.pass( . ) is true
*
* @param iterator the backing iterator
* @param filter the filter
*/
public FilteringIterator(final Iterator<VariantContext> iterator, final VariantContextFilter filter) {
this.iterator = iterator;
this.filter = filter;
next = getNextVC();
}

@Override
public void close() {
CloserUtil.close(iterator);
}

/**
* Returns true if the iteration has more elements.
*
* @return true if the iteration has more elements. Otherwise returns false.
*/
@Override
public boolean hasNext() {
return next != null;
}

/**
* Returns the next element in the iteration.
*
* @return the next element in the iteration
* @throws NoSuchElementException if there are no more elements to return
*
*/
@Override
public VariantContext next() throws NoSuchElementException {
if (next == null) {
throw new NoSuchElementException("Iterator has no more elements.");
}
final VariantContext result = next;
next = getNextVC();
return result;
}

/**
* Required method for Iterator API.
*
* @throws UnsupportedOperationException since it is unsupported here.
*/
@Override
public void remove() {
throw new UnsupportedOperationException("Remove() not supported by FilteringIterator");
}

/**
* Gets the next record from the underlying iterator that passes the filter
*
* @return VariantContext the next filter-passing record
*/
private VariantContext getNextVC() {

while (iterator.hasNext()) {
final VariantContext record = iterator.next();

if (filter.pass(record)) {
return record;
}
}
return null;
}

/**
* function to satisfy the Iterable interface
*
* @return itself since the class inherits from Iterator
*/
@Override
public Iterator<VariantContext> iterator() {
return this;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package htsjdk.variant.variantcontext.filter;

import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;

/**
* A Predicate on VariantContexts that either returns true at heterozygous sites (invertible to false).
* if optional "sample" argument to constructor is given, the genotype of that sample will be examined,
* otherwise first genotype will be used.
*
* Missing sample, or no genotype will result in an exception being thrown.
*/
public class HeterozygosityFilter implements VariantContextFilter {

final private String sample;
final private boolean keepHets;

/**
* Constructor for a filter that will keep (or remove, if keepHets is false) VC for which the
* genotype of sample is heterozygous. If sample is null, the first genotype in the
* variant context will be used.
*
* @param keepHets determine whether to keep the het sites (true) or filter them out (false)
* @param sample the name of the sample in the variant context whose genotype should be examined.
*/
public HeterozygosityFilter(final boolean keepHets, final String sample) {
this.keepHets = keepHets;
this.sample = sample;
}

/**
* Constructor as above that doesn't take a sample, instead it will look at the first genotype of the variant context.
* @param keepHets if true, the heterozygous variant contexts will pass the filter, otherwise they will fail.
*/
public HeterozygosityFilter(final boolean keepHets) {
this(keepHets, null);
}

/**
* @return true if variantContext is to be kept, otherwise false
* Assumes that this.sample is a sample in the variantContext, if not null,
* otherwise looks for the first genotype (and assumes it exists).
* @param variantContext the record to examine for heterozygosity
*/
@Override
public boolean pass(final VariantContext variantContext) {
final Genotype gt = (sample == null) ? variantContext.getGenotype(0) : variantContext.getGenotype(sample);

if (gt == null) {
throw new IllegalArgumentException((sample == null) ?
"Cannot find any genotypes in VariantContext: " + variantContext :
"Cannot find sample requested: " + sample);
}

//XOR operator to reverse behaviour if keepHets is true.
return gt.isHet() ^ !keepHets;
}
}
Loading

0 comments on commit 2db9ee2

Please sign in to comment.