-
Notifications
You must be signed in to change notification settings - Fork 244
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #225 from samtools/nh_dup_set
Adding a new ordering based on identifying duplicate reads: SAMRecordDup...
- Loading branch information
Showing
9 changed files
with
939 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2015 The Broad Institute | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
package htsjdk.samtools; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.List; | ||
|
||
/** | ||
* Stores a set of records that are duplicates of each other. The first record in the list of records is | ||
* considered the representative of the duplicate set, and typically does not have its duplicate flag set. | ||
* The records' duplicate flag will be set appropriately as records are added. This behavior can be | ||
* turned off. | ||
* | ||
* At this time, this set does not track optical duplicates. | ||
* | ||
* @author nhomer | ||
*/ | ||
public class DuplicateSet { | ||
|
||
private final List<SAMRecord> records; | ||
|
||
private static final SAMRecordDuplicateComparator defaultComparator = new SAMRecordDuplicateComparator(); | ||
|
||
private final SAMRecordDuplicateComparator comparator; | ||
|
||
private boolean needsSorting = false; | ||
|
||
private boolean setDuplicateFlag = false; | ||
|
||
/** Sets the duplicate flag by default */ | ||
public DuplicateSet() { | ||
this(true); | ||
} | ||
|
||
public DuplicateSet(final boolean setDuplicateFlag) { | ||
this(setDuplicateFlag, defaultComparator); | ||
} | ||
|
||
public DuplicateSet(final SAMRecordDuplicateComparator comparator) { | ||
this(true, comparator); | ||
} | ||
|
||
public DuplicateSet(final boolean setDuplicateFlag, final SAMRecordDuplicateComparator comparator) { | ||
records = new ArrayList<SAMRecord>(10); | ||
this.setDuplicateFlag = setDuplicateFlag; | ||
this.comparator = comparator; | ||
} | ||
|
||
/** | ||
* Adds a record to the set and returns zero if either the set is empty, or it is a duplicate of the records already in the set. Otherwise, | ||
* it does not add the record and returns non-zero. | ||
* @param record the record to add. | ||
* @return zero if the record belongs in this set, -1 in a previous set, or 1 in a subsequent set, according to the comparison order | ||
*/ | ||
public int add(final SAMRecord record) { | ||
|
||
if (!this.records.isEmpty()) { | ||
final int cmp = this.comparator.duplicateSetCompare(this.getRepresentative(), record); | ||
if (0 != cmp) { | ||
return cmp; | ||
} | ||
} | ||
|
||
this.records.add(record); | ||
needsSorting = true; | ||
|
||
return 0; | ||
} | ||
|
||
private void sort() { | ||
if (!records.isEmpty()) { | ||
Collections.sort(records, this.comparator); | ||
|
||
final SAMRecord representative = records.get(0); | ||
|
||
if (setDuplicateFlag) { | ||
// reset duplicate flags | ||
for (final SAMRecord record : records) { | ||
if (!record.getReadUnmappedFlag() && !record.isSecondaryOrSupplementary() && !record.getReadName().equals(representative.getReadName())) { | ||
record.setDuplicateReadFlag(true); | ||
} | ||
} | ||
records.get(0).setDuplicateReadFlag(false); | ||
} | ||
} | ||
needsSorting = false; // this could be in the if above if you think hard about it | ||
} | ||
|
||
/** | ||
* Gets the list of records from this set. | ||
*/ | ||
public List<SAMRecord> getRecords() { | ||
if (needsSorting) { | ||
sort(); | ||
} | ||
|
||
return this.records; | ||
} | ||
|
||
/** | ||
* Gets the representative record according to the duplicate comparator. | ||
*/ | ||
public SAMRecord getRepresentative() { | ||
if (needsSorting) { | ||
sort(); | ||
} | ||
|
||
return records.get(0); | ||
} | ||
|
||
/** | ||
* Returns the number of records in this set. | ||
*/ | ||
public int size() { | ||
return this.records.size(); | ||
} | ||
|
||
/** | ||
* Returns the number of duplicates in this set, including the representative record. Does not include records that are unmapped, | ||
* secondary, or supplementary. | ||
*/ | ||
public int numDuplicates() { | ||
int n = 0; | ||
for (final SAMRecord record : records) { | ||
if (!record.getReadUnmappedFlag() && !record.isSecondaryOrSupplementary()) { | ||
n++; | ||
} | ||
} | ||
return n; | ||
} | ||
|
||
public boolean isEmpty() { | ||
return this.records.isEmpty(); | ||
} | ||
|
||
/** | ||
* Controls if we should update the duplicate flag of the records in this set. | ||
*/ | ||
public void setDuplicateFlag(boolean setDuplicateFlag) { this.setDuplicateFlag = setDuplicateFlag; } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2015 The Broad Institute | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
package htsjdk.samtools; | ||
|
||
import htsjdk.samtools.util.CloseableIterator; | ||
import htsjdk.samtools.util.SortingCollection; | ||
|
||
import java.io.File; | ||
import java.util.Collections; | ||
|
||
/** | ||
* An iterator of sets of duplicates. Duplicates are defined currently by the ordering in | ||
* SAMRecordDuplicateComparator. | ||
* <p/> | ||
* If the input records are not pre-sorted according to the duplicate ordering, the records | ||
* will be sorted on-the-fly. This may require extra memory or disk to buffer records, and | ||
* also computational time to perform the sorting. | ||
* | ||
* @author nhomer | ||
*/ | ||
public class DuplicateSetIterator implements CloseableIterator<DuplicateSet> { | ||
|
||
private final CloseableIterator<SAMRecord> wrappedIterator; | ||
|
||
private DuplicateSet duplicateSet = null; | ||
|
||
private final SAMRecordDuplicateComparator comparator; | ||
|
||
public DuplicateSetIterator(final CloseableIterator<SAMRecord> iterator, final SAMFileHeader header) { | ||
this(iterator, header, false); | ||
} | ||
|
||
/** | ||
* Allows the user of this iterator to skip the sorting of the input if the input is already sorted. If the records are said to be | ||
* sorted but not actually sorted in the correct order, an exception during iteration will be thrown. | ||
*/ | ||
public DuplicateSetIterator(final CloseableIterator<SAMRecord> iterator, final SAMFileHeader header, final boolean preSorted) { | ||
this.comparator = new SAMRecordDuplicateComparator(Collections.singletonList(header)); | ||
|
||
if (preSorted) { | ||
this.wrappedIterator = iterator; | ||
} else { | ||
// Sort it! | ||
final int maxRecordsInRam = SAMFileWriterImpl.getDefaultMaxRecordsInRam(); | ||
final File tmpDir = new File(System.getProperty("java.io.tmpdir")); | ||
final SortingCollection<SAMRecord> alignmentSorter = SortingCollection.newInstance(SAMRecord.class, | ||
new BAMRecordCodec(header), comparator, | ||
maxRecordsInRam, tmpDir); | ||
|
||
while (iterator.hasNext()) { | ||
final SAMRecord record = iterator.next(); | ||
alignmentSorter.add(record); | ||
} | ||
iterator.close(); | ||
|
||
this.wrappedIterator = alignmentSorter.iterator(); | ||
} | ||
|
||
this.duplicateSet = new DuplicateSet(this.comparator); | ||
|
||
if (hasNext()) { | ||
this.duplicateSet.add(this.wrappedIterator.next()); | ||
} | ||
|
||
} | ||
|
||
public void setScoringStrategy(final DuplicateScoringStrategy.ScoringStrategy scoringStrategy) { | ||
this.comparator.setScoringStrategy(scoringStrategy); | ||
} | ||
|
||
public DuplicateSet next() { | ||
DuplicateSet duplicateSet = null; | ||
|
||
int cmp = 0; | ||
|
||
while (0 == cmp) { | ||
if (!wrappedIterator.hasNext()) { // no more! | ||
duplicateSet = this.duplicateSet; | ||
this.duplicateSet = new DuplicateSet(this.comparator); | ||
break; | ||
} else { | ||
// get another one | ||
final SAMRecord record = this.wrappedIterator.next(); | ||
|
||
// assumes that the duplicate set always has at least one record inside! | ||
final SAMRecord representative = this.duplicateSet.getRepresentative(); | ||
|
||
if (representative.getReadUnmappedFlag() || representative.isSecondaryOrSupplementary()) { | ||
duplicateSet = this.duplicateSet; | ||
this.duplicateSet = new DuplicateSet(this.comparator); | ||
this.duplicateSet.add(record); | ||
break; // exits the 0 == cmp loop | ||
} else { | ||
// compare against the representative for set membership, not ordering | ||
cmp = this.duplicateSet.add(record); | ||
|
||
if (0 < cmp) { | ||
throw new SAMException("The input records were not sorted in duplicate order:\n" + | ||
representative.getSAMString() + record.getSAMString()); | ||
} else if (cmp < 0) { | ||
duplicateSet = this.duplicateSet; | ||
this.duplicateSet = new DuplicateSet(this.comparator); | ||
this.duplicateSet.add(record); | ||
} // otherwise it was already added | ||
} | ||
} | ||
} | ||
|
||
return duplicateSet; | ||
} | ||
|
||
public void close() { wrappedIterator.close(); } | ||
|
||
public boolean hasNext() { | ||
return (!duplicateSet.isEmpty() || wrappedIterator.hasNext()); | ||
} | ||
|
||
// Does nothing! | ||
public void remove() { } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.