Skip to content

Commit

Permalink
- tests added showing that the two codes for calculating NM do not ag…
Browse files Browse the repository at this point in the history
…ree with each other (on lowercase bases in the reference and reads)
  • Loading branch information
yfarjoun authored and Yossi Farjoun committed Oct 24, 2016
1 parent b497630 commit 812d752
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 89 deletions.
84 changes: 1 addition & 83 deletions src/main/java/htsjdk/samtools/util/SequenceUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -441,33 +441,6 @@ public static int countMismatches(final SAMRecord read, final byte[] referenceBa
return countMismatches(read, referenceBases, 0, bisulfiteSequence);
}

/**
* Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather
* than byte[]. This is because GATK needs it this way.
* <p/>
* TODO: Remove this method when GATK map method is changed to take refseq as byte[].
* TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version.
*/
@Deprecated
private static int countMismatches(final SAMRecord read, final char[] referenceBases, final int referenceOffset) {
int mismatches = 0;

final byte[] readBases = read.getReadBases();

for (final AlignmentBlock block : read.getAlignmentBlocks()) {
final int readBlockStart = block.getReadStart() - 1;
final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset;
final int length = block.getLength();

for (int i = 0; i < length; ++i) {
if (!basesEqual(readBases[readBlockStart + i], StringUtil.charToByte(referenceBases[referenceBlockStart + i]))) {
++mismatches;
}
}
}
return mismatches;
}

/**
* Calculates the sum of qualities for mismatched bases in the read.
*
Expand Down Expand Up @@ -537,41 +510,6 @@ public static int sumQualitiesOfMismatches(final SAMRecord read, final byte[] re
return qualities;
}

/**
* Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather
* than byte[]. This is because GATK needs it this way.
* <p/>
* TODO: Remove this method when GATK map method is changed to take refseq as byte[].
* TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version.
*/
@Deprecated
public static int sumQualitiesOfMismatches(final SAMRecord read, final char[] referenceBases,
final int referenceOffset) {
int qualities = 0;

final byte[] readBases = read.getReadBases();
final byte[] readQualities = read.getBaseQualities();

if (read.getAlignmentStart() <= referenceOffset) {
throw new IllegalArgumentException("read.getAlignmentStart(" + read.getAlignmentStart() +
") <= referenceOffset(" + referenceOffset + ")");
}

for (final AlignmentBlock block : read.getAlignmentBlocks()) {
final int readBlockStart = block.getReadStart() - 1;
final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset;
final int length = block.getLength();

for (int i = 0; i < length; ++i) {
if (!basesEqual(readBases[readBlockStart + i], StringUtil.charToByte(referenceBases[referenceBlockStart + i]))) {
qualities += readQualities[readBlockStart + i];
}
}
}

return qualities;
}

public static int countInsertedBases(final Cigar cigar) {
int ret = 0;
for (final CigarElement element : cigar.getCigarElements()) {
Expand Down Expand Up @@ -658,26 +596,6 @@ public static int calculateSamNmTagFromCigar(final SAMRecord record) {
return samNm;
}


/**
* Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather
* than byte[]. This is because GATK needs it this way.
* <p/>
* TODO: Remove this method when GATK map method is changed to take refseq as byte[].
* TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version.
*/
@Deprecated
public static int calculateSamNmTag(final SAMRecord read, final char[] referenceBases,
final int referenceOffset) {
int samNm = countMismatches(read, referenceBases, referenceOffset);
for (final CigarElement el : read.getCigar().getCigarElements()) {
if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) {
samNm += el.getLength();
}
}
return samNm;
}

/** Returns the complement of a single byte. */
public static byte complement(final byte b) {
switch (b) {
Expand Down Expand Up @@ -1059,7 +977,7 @@ public static byte[] upperCase(final byte[] bases) {

/** Generates all possible unambiguous kmers (upper-case) of length and returns them as byte[]s. */
public static List<byte[]> generateAllKmers(final int length) {
final List<byte[]> sofar = new LinkedList<byte[]>();
final List<byte[]> sofar = new LinkedList<>();

if (sofar.isEmpty()) {
sofar.add(new byte[length]);
Expand Down
32 changes: 26 additions & 6 deletions src/test/java/htsjdk/samtools/util/SequenceUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,15 @@
*/
package htsjdk.samtools.util;

import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.SAMTextHeaderCodec;
import htsjdk.samtools.TextCigarCodec;
import htsjdk.samtools.*;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
Expand Down Expand Up @@ -429,4 +428,25 @@ public Object[][] testGetSamReadNameFromFastqHeaderTestCases() {
{"Name/1/2", "Name"}
};
}

@Test
public void testCalculateNmTag() {
final File TEST_DIR = new File("src/test/resources/htsjdk/samtools/SequenceUtil");
final File referenceFile = new File(TEST_DIR, "reference_with_lower_and_uppercase.fasta");
final File samFile = new File(TEST_DIR, "upper_and_lowercase_read.sam");

SamReader reader = SamReaderFactory.makeDefault().open(samFile);
ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile);

reader.iterator().stream().forEach(r -> {
Integer nm = SequenceUtil.calculateSamNmTag(r, ref.getSequence(r.getContig()).getBases());
String md = r.getStringAttribute(SAMTag.MD.name());
Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with read \'" + r.getReadName() + "\':");
SequenceUtil.calculateMdAndNmTags(r, ref.getSequence(r.getContig()).getBases(), true, true);

Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with read \'" + r.getReadName() + "\':");
Assert.assertEquals(r.getStringAttribute(SAMTag.MD.name()), md, "problem with read \'" + r.getReadName() + "\':");

});
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
@HD VN:1.5 SO:unsorted
@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>chr1
ACGTACGTacgtacgt
>chr2
TCGATCGAtcgatcga
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr1 16 6 16 17
chr2 16 29 16 16
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
@HD VN:1.5 SO:coordinate
@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
@CO chr1 value is ACGTACGTacgtacgt
@CO chr1 value is TCGATCGAtcgatcga
read1 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0
read2 0 chr2 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:8 MD:Z:3t0A2T0a2G0t0A2t
read3 0 chr2 1 0 8M * 0 0 TCGATCGA AAAAAAAA NM:i:0
read4 0 chr2 1 0 4M1D2M1S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2

0 comments on commit 812d752

Please sign in to comment.