Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consider "low" mapping quality reads to be unaligned for the purpose of Marking Duplicates. #1460

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/main/java/picard/sam/markduplicates/MarkDuplicates.java
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ protected int doWork() {

DuplicationMetrics metrics = AbstractMarkDuplicatesCommandLineProgram.addReadToLibraryMetrics(rec, header, libraryIdGenerator);


// Now try to figure out the next duplicate index (if going by coordinate; if going by query name, only do this
// when the query name has changed).
nextDuplicateIndex = nextIndexIfNeeded(sortOrder, recordInFileIndex, nextDuplicateIndex, duplicateQueryName, rec, this.duplicateIndexes);
Expand All @@ -337,7 +338,6 @@ protected int doWork() {

if (isDuplicate) {
rec.setDuplicateReadFlag(true);

AbstractMarkDuplicatesCommandLineProgram.addDuplicateReadToMetrics(rec, metrics);
} else {
rec.setDuplicateReadFlag(false);
Expand Down Expand Up @@ -552,7 +552,7 @@ private void buildSortedReadEndLists(final boolean useBarcodes) {
final ReadEndsForMarkDuplicates fragmentEnd = buildReadEnds(header, indexForRead, rec, useBarcodes);
this.fragSort.add(fragmentEnd);

if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
if (MarkDuplicatesUtil.pairedForMarkDuplicates(rec)) {
final StringBuilder key = new StringBuilder();
key.append(ReservedTagConstants.READ_GROUP_ID);
key.append(rec.getReadName());
Expand Down Expand Up @@ -650,7 +650,7 @@ private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, fina
ends.score = DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY);

// Doing this lets the ends object know that it's part of a pair
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
if (MarkDuplicatesUtil.pairedForMarkDuplicates(rec)) {
ends.read2ReferenceIndex = rec.getMateReferenceIndex();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,14 @@ protected int doWork() {
this.SKIP_PAIRS_WITH_NO_MATE_CIGAR,
this.MAX_RECORDS_IN_RAM,
this.BLOCK_SIZE,
this.TMP_DIR);
this.TMP_DIR,
MIN_INFORMATIVE_MAPPING_Q);

// progress logger!
final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");

// Go through the records
for (final SAMRecord record : new IterableAdapter<SAMRecord>(iterator)) {
for (final SAMRecord record : new IterableAdapter<>(iterator)) {
if (progress.record(record)) {
iterator.logMemoryStats(log);
}
Expand Down Expand Up @@ -192,6 +193,15 @@ protected int doWork() {
return 0;
}

/**
 * Validates tool-specific command-line arguments.
 *
 * <p>MIN_INFORMATIVE_MAPPING_Q is inherited from the shared argument collection but is not
 * honored by this tool, so we warn (rather than silently ignore) when the user supplies a
 * non-default value, then delegate to the superclass validation.
 *
 * @return validation error messages from the superclass, or null if there are none.
 */
@Override
protected String[] customCommandLineValidation() {

    // Bug fix: the original condition was (0 <= MIN_INFORMATIVE_MAPPING_Q), which is true for
    // every non-negative value — including the default 0 — so the warning fired on every run.
    // Only warn when a non-zero (positive) threshold was actually requested.
    if (0 < MIN_INFORMATIVE_MAPPING_Q) {
        LOG.warn("non-zero value for MIN_INFORMATIVE_MAPPING_Q is not supported in " + MarkDuplicatesWithMateCigar.class.getSimpleName());
    }

    return super.customCommandLineValidation();
}

/**
* Updates the program record if necessary.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,18 @@

package picard.sam.markduplicates;

import htsjdk.samtools.util.SamRecordWithOrdinal;
import htsjdk.samtools.util.SamRecordTrackingBuffer;
import picard.PicardException;
import htsjdk.samtools.util.Histogram;
import picard.sam.DuplicationMetrics;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.PeekableIterator;
import htsjdk.samtools.*;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.*;
import picard.PicardException;
import picard.sam.DuplicationMetrics;
import picard.sam.markduplicates.util.*;

import java.io.File;
import java.util.*;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

/**
* This will iterate through a coordinate sorted SAM file (iterator) and either mark or
Expand Down Expand Up @@ -77,7 +75,7 @@ public class MarkDuplicatesWithMateCigarIterator implements SAMRecordIterator {

/**
* The queue that stores the records that currently are not marked as duplicates. These need to be kept until
* they cannot proven not to be duplicates, with the latter records having greater coordinate. The queue is stored in 5' unclipped
they cannot be proven not to be duplicates, with the later records having greater coordinate. The queue is stored in 5' unclipped
* ordering, along with keeping the record with the best score, defined by the scoring strategies. If any record
* is added to this queue and can be identified as a duplicate, the outputBuffer is notified of its
* status and it can be emitted. Therefore, we limit the amount of records in this queue to only those that will NOT
Expand All @@ -99,6 +97,8 @@ public class MarkDuplicatesWithMateCigarIterator implements SAMRecordIterator {

boolean isClosed = false;

private final short minInformativeMappingQuality;

/**
* Initializes the mark duplicates iterator.
*
Expand All @@ -111,6 +111,7 @@ public class MarkDuplicatesWithMateCigarIterator implements SAMRecordIterator {
* @param skipPairsWithNoMateCigar true to not return mapped pairs with no mate cigar, false otherwise
* @param blockSize the size of the blocks in the underlying buffer/queue
* @param tmpDirs the temporary directories to use if we spill records to disk
* @param minInformativeMappingQuality the minimal mapping quality that is considered informative for dup-marking
* @throws PicardException if the inputs are not in coordinate sort order
*/
public MarkDuplicatesWithMateCigarIterator(final SAMFileHeader header,
Expand All @@ -122,19 +123,20 @@ public MarkDuplicatesWithMateCigarIterator(final SAMFileHeader header,
final boolean skipPairsWithNoMateCigar,
final int maxRecordsInRam,
final int blockSize,
final List<File> tmpDirs) throws PicardException {
final List<File> tmpDirs, final short minInformativeMappingQuality) throws PicardException {
this.minInformativeMappingQuality = minInformativeMappingQuality;
if (header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new PicardException(getClass().getName() + " expects the input to be in coordinate sort order.");
}

this.header = header;
backingIterator = new PeekableIterator<SAMRecord>(iterator);
outputBuffer = new SamRecordTrackingBuffer<SamRecordWithOrdinalAndSetDuplicateReadFlag>(maxRecordsInRam, blockSize, tmpDirs, header, SamRecordWithOrdinalAndSetDuplicateReadFlag.class);
backingIterator = new PeekableIterator<>(iterator);
outputBuffer = new SamRecordTrackingBuffer<>(maxRecordsInRam, blockSize, tmpDirs, header, SamRecordWithOrdinalAndSetDuplicateReadFlag.class);

this.removeDuplicates = removeDuplicates;
this.skipPairsWithNoMateCigar = skipPairsWithNoMateCigar;
this.opticalDuplicateFinder = opticalDuplicateFinder;
toMarkQueue = new MarkQueue(duplicateScoringStrategy);
toMarkQueue = new MarkQueue(duplicateScoringStrategy, this.minInformativeMappingQuality);
libraryIdGenerator = new LibraryIdGenerator(header);

// Check for supported scoring strategies
Expand Down Expand Up @@ -237,9 +239,9 @@ public SAMRecord next() throws PicardException {
*/
private boolean ignoreDueToMissingMateCigar(final SamRecordWithOrdinal samRecordWithOrdinal) {
final SAMRecord record = samRecordWithOrdinal.getRecord();
// ignore/except-on paired records with mapped mate and no mate cigar
if (record.getReadPairedFlag() &&
!record.getMateUnmappedFlag() && null == SAMUtils.getMateCigar(record)) { // paired with one end unmapped and no mate cigar
// ignore/throw-on paired records with mapped mate and no mate cigar
if (MarkDuplicatesUtil.pairedForMarkDuplicates(record, minInformativeMappingQuality) &&
null == SAMUtils.getMateCigar(record)) { // paired with one end unmapped and no mate cigar

// NB: we are not truly examining these records. Do we want to count them?

Expand All @@ -250,7 +252,7 @@ private boolean ignoreDueToMissingMateCigar(final SamRecordWithOrdinal samRecord
// update metrics
if (record.getReadUnmappedFlag()) {
++metrics.UNMAPPED_READS;
} else if (!record.getReadPairedFlag() || record.getMateUnmappedFlag()) {
} else if (!MarkDuplicatesUtil.pairedForMarkDuplicates(record, minInformativeMappingQuality)) {
++metrics.UNPAIRED_READS_EXAMINED;
} else {
++metrics.READ_PAIRS_EXAMINED;
Expand Down Expand Up @@ -395,6 +397,7 @@ private SAMRecord markDuplicatesAndGetTheNextAvailable() {
continue;
}


// check for an unmapped read
if (record.getReadUnmappedFlag()) {
// unmapped reads at the end of the file!
Expand All @@ -415,7 +418,7 @@ private SAMRecord markDuplicatesAndGetTheNextAvailable() {
}

// build a read end for use in the toMarkQueue
readEnds = new ReadEndsForMateCigar(header, samRecordWithOrdinal, opticalDuplicateFinder, libraryIdGenerator.getLibraryId(samRecordWithOrdinal.getRecord()));
readEnds = new ReadEndsForMateCigar(header, samRecordWithOrdinal, opticalDuplicateFinder, libraryIdGenerator.getLibraryId(samRecordWithOrdinal.getRecord()), minInformativeMappingQuality);

// check that the minimumDistance was not too small
checkForMinimumDistanceFailure(readEnds);
Expand Down Expand Up @@ -444,7 +447,7 @@ private SAMRecord markDuplicatesAndGetTheNextAvailable() {
}
} else {
// Bring the simple metrics up to date
if (!record.getReadPairedFlag() || record.getMateUnmappedFlag()) {
if (!MarkDuplicatesUtil.pairedForMarkDuplicates(record, minInformativeMappingQuality)) {
++metrics.UNPAIRED_READS_EXAMINED;
} else {
++metrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
Expand Down Expand Up @@ -596,7 +599,7 @@ private boolean tryPollingTheToMarkQueue(final boolean flush, final ReadEndsForM
final Set<ReadEnds> locations = toMarkQueue.getLocations(next);

if (!locations.isEmpty()) {
AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(new ArrayList<ReadEnds>(locations), null,
AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(new ArrayList<>(locations), null,
opticalDuplicateFinder, libraryIdGenerator);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import picard.sam.DuplicationMetrics;
import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram;
import picard.sam.markduplicates.util.LibraryIdGenerator;
import picard.sam.markduplicates.util.MarkDuplicatesUtil;
import picard.sam.markduplicates.util.ReadEnds;

import java.util.ArrayList;
Expand All @@ -65,6 +66,7 @@
*/
@DocumentedFeature
@ExperimentalFeature

@CommandLineProgramProperties(
summary = "Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. " +
"All records are then written to the output file with the duplicate records flagged.",
Expand All @@ -74,12 +76,7 @@
public class SimpleMarkDuplicatesWithMateCigar extends MarkDuplicates {
private final Log log = Log.getInstance(MarkDuplicatesWithMateCigar.class);

private class ReadEndsForSimpleMarkDuplicatesWithMateCigar extends ReadEnds {}

private static boolean isPairedAndBothMapped(final SAMRecord record) {
return record.getReadPairedFlag() &&
!record.getReadUnmappedFlag() &&
!record.getMateUnmappedFlag();
// Marker subclass of ReadEnds used locally to record read-end coordinates when tracking
// optical duplicates; it adds no state or behavior beyond what ReadEnds provides.
private class ReadEndsForSimpleMarkDuplicatesWithMateCigar extends ReadEnds {
}

/**
Expand Down Expand Up @@ -126,7 +123,7 @@ protected int doWork() {
for (final DuplicateSet duplicateSet : new IterableAdapter<>(iterator)) {
final SAMRecord representative = duplicateSet.getRepresentative();
final boolean doOpticalDuplicateTracking = (this.READ_NAME_REGEX != null) &&
isPairedAndBothMapped(representative) &&
MarkDuplicatesUtil.pairedForMarkDuplicates(representative) &&
representative.getFirstOfPairFlag();
final Set<String> duplicateReadEndsSeen = new HashSet<>();

Expand All @@ -149,15 +146,15 @@ protected int doWork() {
// First bring the simple metrics up to date
if (record.getReadUnmappedFlag()) {
++metrics.UNMAPPED_READS;
} else if (!record.getReadPairedFlag() || record.getMateUnmappedFlag()) {
} else if (!MarkDuplicatesUtil.pairedForMarkDuplicates(record)) {
++metrics.UNPAIRED_READS_EXAMINED;
} else {
++metrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
}

if (record.getDuplicateReadFlag()) {
// Update the duplication metrics
if (!record.getReadPairedFlag() || record.getMateUnmappedFlag()) {
if (!MarkDuplicatesUtil.pairedForMarkDuplicates(record)) {
++metrics.UNPAIRED_READ_DUPLICATES;
} else {
++metrics.READ_PAIR_DUPLICATES;// will need to be divided by 2 at the end
Expand All @@ -168,7 +165,7 @@ protected int doWork() {
// To track optical duplicates, store a set of locations for mapped pairs, first end only. We care about orientation relative
// to the first end of the pair for optical duplicate tracking, which is more stringent than PCR duplicate tracking.
if (doOpticalDuplicateTracking &&
isPairedAndBothMapped(record) &&
MarkDuplicatesUtil.pairedForMarkDuplicates(record) &&
!duplicateReadEndsSeen.contains(record.getReadName())) {

final ReadEndsForSimpleMarkDuplicatesWithMateCigar readEnd = new ReadEndsForSimpleMarkDuplicatesWithMateCigar();
Expand Down
1 change: 1 addition & 0 deletions src/main/java/picard/sam/markduplicates/UmiGraph.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public class UmiGraph {
private final boolean allowMissingUmis;
private final boolean duplexUmis;


/**
* Creates a UmiGraph object
* @param set Set of reads that have the same start-stop positions, these will be broken up by UMI
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/picard/sam/markduplicates/UmiUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMUtils;
import org.apache.commons.lang3.StringUtils;
import picard.sam.markduplicates.util.MarkDuplicatesUtil;

import java.util.regex.Pattern;

Expand Down Expand Up @@ -102,7 +103,7 @@ static String getTopStrandNormalizedUmi(final SAMRecord record, final String umi
* @return Top or bottom strand, unknown if it cannot be determined.
*/
static ReadStrand getStrand(final SAMRecord rec) {
if (rec.getReadUnmappedFlag() || rec.getMateUnmappedFlag()) {
if (!MarkDuplicatesUtil.pairedForMarkDuplicates(rec)) {
return ReadStrand.UNKNOWN;
}

Expand Down Expand Up @@ -185,5 +186,4 @@ public enum ReadStrand {
BOTTOM,
UNKNOWN
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,7 @@
package picard.sam.markduplicates.util;

import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMProgramRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.Histogram;
Expand All @@ -45,12 +38,7 @@
import picard.sam.util.PGTagArgumentCollection;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.*;

/**
* Abstract class that holds parameters and methods common to classes that perform duplicate
Expand Down Expand Up @@ -84,6 +72,13 @@ public abstract class AbstractMarkDuplicatesCommandLineProgram extends AbstractO
"Deprecated, used ASSUME_SORT_ORDER=coordinate instead.", mutex = {"ASSUME_SORT_ORDER"})
public boolean ASSUME_SORTED = false;

@Argument(shortName = "MIN_MQ",
doc = "The minimal value of the mapping quality that is considered informative for the purpose of creating " +
"mapped-pairs. If a read has mapping quality smaller than this, the tool will consider it as unaligned " +
"for the purpose of duplicate marking (though it will remain mapped).",
optional = true)
public short MIN_INFORMATIVE_MAPPING_Q = 0;

@Argument(shortName = StandardOptionDefinitions.ASSUME_SORT_ORDER_SHORT_NAME,
doc = "If not null, assume that the input file has this order even if the header says otherwise.",
optional = true, mutex = {"ASSUME_SORTED"})
Expand Down Expand Up @@ -218,6 +213,8 @@ public static DuplicationMetrics addReadToLibraryMetrics(final SAMRecord rec, fi
++metrics.SECONDARY_OR_SUPPLEMENTARY_RDS;
} else if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) {
++metrics.UNPAIRED_READS_EXAMINED;
} else if (!MarkDuplicatesUtil.pairedForMarkDuplicates(rec)) {
++metrics.UNPAIRED_READS_EXAMINED;
} else {
++metrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
}
Expand All @@ -228,7 +225,7 @@ public static void addDuplicateReadToMetrics(final SAMRecord rec, final Duplicat
// only update duplicate counts for "decider" reads, not tag-a-long reads
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
// Update the duplication metrics
if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) {
if (!MarkDuplicatesUtil.pairedForMarkDuplicates(rec)) {
++metrics.UNPAIRED_READ_DUPLICATES;
} else {
++metrics.READ_PAIR_DUPLICATES;// will need to be divided by 2 at the end
Expand Down Expand Up @@ -377,6 +374,12 @@ private static int trackOpticalDuplicates(final List<? extends ReadEnds> list,
return opticalDuplicates;
}

/**
 * Propagates the MIN_INFORMATIVE_MAPPING_Q argument into MarkDuplicatesUtil's static state
 * before delegating to the superclass validation, so that static helper methods (e.g.
 * pairedForMarkDuplicates overloads that take no quality argument) use the configured threshold.
 *
 * NOTE(review): stashing a per-invocation argument in static (JVM-global) state is fragile —
 * confirm that two tool instances with different thresholds can never run in the same JVM,
 * and that no caller reads the util before validation has run.
 *
 * @return validation error messages from the superclass, or null if there are none.
 */
@Override
protected String[] customCommandLineValidation() {
MarkDuplicatesUtil.setMinInformativeMappingQuality(MIN_INFORMATIVE_MAPPING_Q);
return super.customCommandLineValidation();
}

private static void trackDuplicateCounts(final int listSize,
final int optDupCnt,
final LibraryIdGenerator libraryIdGenerator) {
Expand Down
Loading