Skip to content

Commit

Permalink
Merge pull request #225 from samtools/nh_dup_set
Browse files Browse the repository at this point in the history
Adding a new ordering based on identifying duplicate reads: SAMRecordDup...
  • Loading branch information
Yossi Farjoun committed May 23, 2015
2 parents 435c292 + 739d727 commit ea46734
Show file tree
Hide file tree
Showing 9 changed files with 939 additions and 1 deletion.
162 changes: 162 additions & 0 deletions src/java/htsjdk/samtools/DuplicateSet.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Holds a group of records that are duplicates of each other. Once the group has been sorted with the duplicate
 * comparator, the first record in the list is treated as the representative of the group and typically does not
 * have its duplicate flag set. By default the duplicate flag of the records is updated each time the group is
 * sorted; this behavior can be turned off.
 *
 * At this time, this set does not track optical duplicates.
 *
 * @author nhomer
 */
public class DuplicateSet {

    /** Comparator used by the constructors that are not handed one explicitly. */
    private static final SAMRecordDuplicateComparator defaultComparator = new SAMRecordDuplicateComparator();

    /** The members of this set; only guaranteed to be in comparator order when {@code needsSorting} is false. */
    private final List<SAMRecord> records = new ArrayList<SAMRecord>(10);

    private final SAMRecordDuplicateComparator comparator;

    /** True when a record has been added since the last sort. */
    private boolean needsSorting = false;

    /** When true, sorting also rewrites the duplicate flag of the records. */
    private boolean setDuplicateFlag = false;

    /** Creates a set that uses the default comparator and sets the duplicate flag. */
    public DuplicateSet() {
        this(true);
    }

    /**
     * Creates a set that uses the default comparator.
     * @param setDuplicateFlag true if the duplicate flag of the records should be updated when the set is sorted.
     */
    public DuplicateSet(final boolean setDuplicateFlag) {
        this(setDuplicateFlag, defaultComparator);
    }

    /**
     * Creates a set that sets the duplicate flag and orders its records with the given comparator.
     * @param comparator the comparator used for both set membership and ordering within the set.
     */
    public DuplicateSet(final SAMRecordDuplicateComparator comparator) {
        this(true, comparator);
    }

    /**
     * @param setDuplicateFlag true if the duplicate flag of the records should be updated when the set is sorted.
     * @param comparator the comparator used for both set membership and ordering within the set.
     */
    public DuplicateSet(final boolean setDuplicateFlag, final SAMRecordDuplicateComparator comparator) {
        this.setDuplicateFlag = setDuplicateFlag;
        this.comparator = comparator;
    }

    /**
     * Tries to add a record to this set. The record is accepted when the set is empty, or when
     * {@code comparator.duplicateSetCompare(representative, record)} is zero. Otherwise the set is left
     * untouched and the non-zero comparison result is handed back to the caller.
     *
     * @param record the record to add.
     * @return zero if the record was added; otherwise the non-zero value of
     *         {@code duplicateSetCompare(representative, record)}. {@link DuplicateSetIterator} treats a negative
     *         value as "the record starts the next set" and a positive value as out-of-order input.
     */
    public int add(final SAMRecord record) {
        int membership = 0;

        // An empty set accepts anything; otherwise membership is decided against the current representative.
        if (!records.isEmpty()) {
            membership = comparator.duplicateSetCompare(getRepresentative(), record);
        }

        if (membership == 0) {
            records.add(record);
            needsSorting = true;
        }

        return membership;
    }

    /** Sorts the records if something was added since the last sort. */
    private void ensureSorted() {
        if (needsSorting) {
            sort();
        }
    }

    /**
     * Orders the records with the comparator and, if enabled, rewrites the duplicate flags: every record that is
     * mapped, primary, and does not share the representative's read name is flagged as a duplicate, and the
     * representative itself is un-flagged.
     */
    private void sort() {
        if (records.isEmpty()) {
            needsSorting = false;
            return;
        }

        Collections.sort(records, comparator);

        if (setDuplicateFlag) {
            final SAMRecord best = records.get(0);

            for (final SAMRecord candidate : records) {
                if (candidate.getReadUnmappedFlag()) {
                    continue;
                }
                if (candidate.isSecondaryOrSupplementary()) {
                    continue;
                }
                if (candidate.getReadName().equals(best.getReadName())) {
                    continue; // same read name as the representative: flag is left as it is
                }
                candidate.setDuplicateReadFlag(true);
            }

            best.setDuplicateReadFlag(false);
        }

        needsSorting = false;
    }

    /**
     * Gets the list of records from this set, sorting them first if needed. The returned list is the
     * internal list of this set, not a copy.
     */
    public List<SAMRecord> getRecords() {
        ensureSorted();
        return records;
    }

    /**
     * Gets the representative record according to the duplicate comparator, sorting first if needed.
     * The set must not be empty.
     */
    public SAMRecord getRepresentative() {
        ensureSorted();
        return records.get(0);
    }

    /**
     * Returns the number of records in this set.
     */
    public int size() {
        return records.size();
    }

    /**
     * Returns the number of duplicates in this set, including the representative record. Does not include
     * records that are unmapped, secondary, or supplementary.
     */
    public int numDuplicates() {
        int eligible = 0;
        for (final SAMRecord record : records) {
            final boolean excluded = record.getReadUnmappedFlag() || record.isSecondaryOrSupplementary();
            if (!excluded) {
                eligible++;
            }
        }
        return eligible;
    }

    /** Returns true if no record has been added to this set. */
    public boolean isEmpty() {
        return records.isEmpty();
    }

    /**
     * Controls if we should update the duplicate flag of the records in this set. The setting is consulted
     * the next time the set is sorted.
     */
    public void setDuplicateFlag(boolean setDuplicateFlag) {
        this.setDuplicateFlag = setDuplicateFlag;
    }
}
141 changes: 141 additions & 0 deletions src/java/htsjdk/samtools/DuplicateSetIterator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;

import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.SortingCollection;

import java.io.File;
import java.util.Collections;

/**
 * An iterator of sets of duplicates. Duplicates are defined currently by the ordering in
 * SAMRecordDuplicateComparator.
 * <p/>
 * If the input records are not pre-sorted according to the duplicate ordering, the records
 * will be sorted on-the-fly. This may require extra memory or disk to buffer records, and
 * also computational time to perform the sorting.
 *
 * @author nhomer
 */
public class DuplicateSetIterator implements CloseableIterator<DuplicateSet> {

    /** Source of records in duplicate order: either the caller's iterator or the sorted copy of it. */
    private final CloseableIterator<SAMRecord> wrappedIterator;

    /** The set currently being accumulated; empty only once the wrapped iterator has been drained. */
    private DuplicateSet duplicateSet = null;

    private final SAMRecordDuplicateComparator comparator;

    /**
     * Creates an iterator that sorts the input records into duplicate order before grouping them.
     *
     * @param iterator the records to group; it is fully consumed and closed by this constructor.
     * @param header the header used to build the duplicate comparator and the record codec for sorting.
     */
    public DuplicateSetIterator(final CloseableIterator<SAMRecord> iterator, final SAMFileHeader header) {
        this(iterator, header, false);
    }

    /**
     * Allows the user of this iterator to skip the sorting of the input if the input is already sorted. If the records are said to be
     * sorted but not actually sorted in the correct order, an exception during iteration will be thrown.
     *
     * @param iterator the records to group. When {@code preSorted} is false it is fully consumed and closed here
     *                 (also if sorting fails); when true it is read lazily and closed by {@link #close()}.
     * @param header the header used to build the duplicate comparator and the record codec for sorting.
     * @param preSorted true if the input is already in duplicate order and must not be re-sorted.
     */
    public DuplicateSetIterator(final CloseableIterator<SAMRecord> iterator, final SAMFileHeader header, final boolean preSorted) {
        this.comparator = new SAMRecordDuplicateComparator(Collections.singletonList(header));

        if (preSorted) {
            this.wrappedIterator = iterator;
        } else {
            // Sort it!
            final SortingCollection<SAMRecord> alignmentSorter;
            try {
                final int maxRecordsInRam = SAMFileWriterImpl.getDefaultMaxRecordsInRam();
                final File tmpDir = new File(System.getProperty("java.io.tmpdir"));
                alignmentSorter = SortingCollection.newInstance(SAMRecord.class,
                        new BAMRecordCodec(header), comparator,
                        maxRecordsInRam, tmpDir);

                while (iterator.hasNext()) {
                    alignmentSorter.add(iterator.next());
                }
            } finally {
                // The input is never used again, so release it even when sorting fails part-way.
                iterator.close();
            }

            this.wrappedIterator = alignmentSorter.iterator();
        }

        this.duplicateSet = new DuplicateSet(this.comparator);

        // Prime the first set. The set is still empty here, so asking the wrapped iterator directly is
        // equivalent to hasNext() without calling an overridable method from the constructor.
        if (this.wrappedIterator.hasNext()) {
            this.duplicateSet.add(this.wrappedIterator.next());
        }
    }

    /**
     * Forwards the scoring strategy to the comparator used by this iterator and by the sets it creates.
     */
    public void setScoringStrategy(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy) {
        this.comparator.setScoringStrategy(scoringStrategy);
    }

    /**
     * Returns the next set of duplicates. Records are pulled from the underlying iterator until one is found that
     * does not belong to the set being accumulated (or the input runs out); that record seeds the following set.
     * A set whose representative is unmapped, secondary or supplementary is never extended.
     *
     * @return the next, non-empty set of duplicates.
     * @throws java.util.NoSuchElementException if there are no more sets, i.e. {@link #hasNext()} is false.
     * @throws SAMException if a record is found that orders before the set being accumulated, which means the
     *                      input was not in duplicate order.
     */
    public DuplicateSet next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("No more duplicate sets are available");
        }

        DuplicateSet completed = null;

        int cmp = 0;

        while (0 == cmp) {
            if (!wrappedIterator.hasNext()) { // no more!
                completed = this.duplicateSet;
                this.duplicateSet = new DuplicateSet(this.comparator);
                break;
            }

            // get another one
            final SAMRecord record = this.wrappedIterator.next();

            // the hasNext() guard above and the priming below keep at least one record in the current set
            final SAMRecord representative = this.duplicateSet.getRepresentative();

            if (representative.getReadUnmappedFlag() || representative.isSecondaryOrSupplementary()) {
                // such a representative always forms a set of its own
                completed = this.duplicateSet;
                this.duplicateSet = new DuplicateSet(this.comparator);
                this.duplicateSet.add(record);
                break; // exits the 0 == cmp loop
            }

            // compare against the representative for set membership, not ordering
            cmp = this.duplicateSet.add(record);

            if (0 < cmp) {
                throw new SAMException("The input records were not sorted in duplicate order:\n" +
                        representative.getSAMString() + record.getSAMString());
            } else if (cmp < 0) {
                // the record belongs to a later set: hand out the current one and start the next with it
                completed = this.duplicateSet;
                this.duplicateSet = new DuplicateSet(this.comparator);
                this.duplicateSet.add(record);
            } // otherwise it was already added
        }

        return completed;
    }

    /** Closes the underlying record iterator. */
    public void close() {
        wrappedIterator.close();
    }

    /** Returns true while a set is still being accumulated or the underlying iterator has more records. */
    public boolean hasNext() {
        return (!duplicateSet.isEmpty() || wrappedIterator.hasNext());
    }

    /** Removal is not supported; this deliberately does nothing. */
    public void remove() { }
}
3 changes: 2 additions & 1 deletion src/java/htsjdk/samtools/SAMFileHeader.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ public enum SortOrder {

unsorted(null),
queryname(SAMRecordQueryNameComparator.class),
coordinate(SAMRecordCoordinateComparator.class);
coordinate(SAMRecordCoordinateComparator.class),
duplicate(SAMRecordDuplicateComparator.class); // NB: this is not in the SAM spec!

private final Class<? extends SAMRecordComparator> comparator;

Expand Down
2 changes: 2 additions & 0 deletions src/java/htsjdk/samtools/SAMFileWriterImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ private SAMRecordComparator makeComparator() {
return new SAMRecordCoordinateComparator();
case queryname:
return new SAMRecordQueryNameComparator();
case duplicate:
return new SAMRecordDuplicateComparator();
case unsorted:
return null;
}
Expand Down
Loading

0 comments on commit ea46734

Please sign in to comment.