From 812d75225c7c6abc7f155d8e91e10486b217e872 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 21 Oct 2016 14:16:14 -0400 Subject: [PATCH] - tests added showing that the two codes for calculating NM do not agree with each other (on lowercase bases in the reference and reads) --- .../htsjdk/samtools/util/SequenceUtil.java | 84 +------------------ .../samtools/util/SequenceUtilTest.java | 32 +++++-- .../reference_with_lower_and_uppercase.dict | 3 + .../reference_with_lower_and_uppercase.fasta | 4 + ...ference_with_lower_and_uppercase.fasta.fai | 2 + .../SequenceUtil/upper_and_lowercase_read.sam | 10 +++ 6 files changed, 46 insertions(+), 89 deletions(-) create mode 100644 src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict create mode 100644 src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta create mode 100644 src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai create mode 100644 src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam diff --git a/src/main/java/htsjdk/samtools/util/SequenceUtil.java b/src/main/java/htsjdk/samtools/util/SequenceUtil.java index d2fb861a70..f6fb089491 100644 --- a/src/main/java/htsjdk/samtools/util/SequenceUtil.java +++ b/src/main/java/htsjdk/samtools/util/SequenceUtil.java @@ -441,33 +441,6 @@ public static int countMismatches(final SAMRecord read, final byte[] referenceBa return countMismatches(read, referenceBases, 0, bisulfiteSequence); } - /** - * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather - * than byte[]. This is because GATK needs it this way. - *

- * TODO: Remove this method when GATK map method is changed to take refseq as byte[]. - * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version. - */ - @Deprecated - private static int countMismatches(final SAMRecord read, final char[] referenceBases, final int referenceOffset) { - int mismatches = 0; - - final byte[] readBases = read.getReadBases(); - - for (final AlignmentBlock block : read.getAlignmentBlocks()) { - final int readBlockStart = block.getReadStart() - 1; - final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset; - final int length = block.getLength(); - - for (int i = 0; i < length; ++i) { - if (!basesEqual(readBases[readBlockStart + i], StringUtil.charToByte(referenceBases[referenceBlockStart + i]))) { - ++mismatches; - } - } - } - return mismatches; - } - /** * Calculates the sum of qualities for mismatched bases in the read. * @@ -537,41 +510,6 @@ public static int sumQualitiesOfMismatches(final SAMRecord read, final byte[] re return qualities; } - /** - * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather - * than byte[]. This is because GATK needs it this way. - *

- * TODO: Remove this method when GATK map method is changed to take refseq as byte[]. - * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version. - */ - @Deprecated - public static int sumQualitiesOfMismatches(final SAMRecord read, final char[] referenceBases, - final int referenceOffset) { - int qualities = 0; - - final byte[] readBases = read.getReadBases(); - final byte[] readQualities = read.getBaseQualities(); - - if (read.getAlignmentStart() <= referenceOffset) { - throw new IllegalArgumentException("read.getAlignmentStart(" + read.getAlignmentStart() + - ") <= referenceOffset(" + referenceOffset + ")"); - } - - for (final AlignmentBlock block : read.getAlignmentBlocks()) { - final int readBlockStart = block.getReadStart() - 1; - final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset; - final int length = block.getLength(); - - for (int i = 0; i < length; ++i) { - if (!basesEqual(readBases[readBlockStart + i], StringUtil.charToByte(referenceBases[referenceBlockStart + i]))) { - qualities += readQualities[readBlockStart + i]; - } - } - } - - return qualities; - } - public static int countInsertedBases(final Cigar cigar) { int ret = 0; for (final CigarElement element : cigar.getCigarElements()) { @@ -658,26 +596,6 @@ public static int calculateSamNmTagFromCigar(final SAMRecord record) { return samNm; } - - /** - * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather - * than byte[]. This is because GATK needs it this way. - *

- * TODO: Remove this method when GATK map method is changed to take refseq as byte[]. - * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version. - */ - @Deprecated - public static int calculateSamNmTag(final SAMRecord read, final char[] referenceBases, - final int referenceOffset) { - int samNm = countMismatches(read, referenceBases, referenceOffset); - for (final CigarElement el : read.getCigar().getCigarElements()) { - if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) { - samNm += el.getLength(); - } - } - return samNm; - } - /** Returns the complement of a single byte. */ public static byte complement(final byte b) { switch (b) { @@ -1059,7 +977,7 @@ public static byte[] upperCase(final byte[] bases) { /** Generates all possible unambiguous kmers (upper-case) of length and returns them as byte[]s. */ public static List generateAllKmers(final int length) { - final List sofar = new LinkedList(); + final List sofar = new LinkedList<>(); if (sofar.isEmpty()) { sofar.add(new byte[length]); diff --git a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java index c5c797e7b4..1f5e12115f 100644 --- a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java @@ -23,16 +23,15 @@ */ package htsjdk.samtools.util; -import htsjdk.samtools.Cigar; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMTag; -import htsjdk.samtools.SAMTextHeaderCodec; -import htsjdk.samtools.TextCigarCodec; +import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileFactory; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; import java.util.HashSet; import java.util.Set; @@ -429,4 +428,25 @@ public Object[][] testGetSamReadNameFromFastqHeaderTestCases() { {"Name/1/2", "Name"} }; } + + @Test + public void testCalculateNmTag() { + final File TEST_DIR = new File("src/test/resources/htsjdk/samtools/SequenceUtil"); + final File referenceFile = new File(TEST_DIR, "reference_with_lower_and_uppercase.fasta"); + final File samFile = new File(TEST_DIR, "upper_and_lowercase_read.sam"); + + SamReader reader = SamReaderFactory.makeDefault().open(samFile); + ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile); + + reader.iterator().stream().forEach(r -> { + Integer nm = SequenceUtil.calculateSamNmTag(r, ref.getSequence(r.getContig()).getBases()); + String md = r.getStringAttribute(SAMTag.MD.name()); + Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with read \'" + r.getReadName() + "\':"); + SequenceUtil.calculateMdAndNmTags(r, ref.getSequence(r.getContig()).getBases(), true, true); + + Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with read \'" + r.getReadName() + "\':"); + Assert.assertEquals(r.getStringAttribute(SAMTag.MD.name()), md, "problem with read \'" + r.getReadName() + "\':"); + + }); + } } diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict new file mode 100644 index 0000000000..db5b251d0b --- /dev/null +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict @@ -0,0 +1,3 @@ +@HD VN:1.5 SO:unsorted +@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta +@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta new file mode 100644 index 0000000000..0b446caa8e --- /dev/null +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta @@ -0,0 +1,4 @@ +>chr1 +ACGTACGTacgtacgt +>chr2 +TCGATCGAtcgatcga \ No newline at end of file diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai new file mode 100644 index 0000000000..9314c8fe55 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai @@ -0,0 +1,2 @@ +chr1 16 6 16 17 +chr2 16 29 16 16 diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam new file mode 100644 index 0000000000..fbaec81247 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam @@ -0,0 +1,10 @@ +@HD VN:1.5 SO:coordinate +@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta +@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta +@CO chr1 value is ACGTACGTacgtacgt +@CO chr1 value is TCGATCGAtcgatcga +read1 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0 +read2 0 chr2 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:8 MD:Z:3t0A2T0a2G0t0A2t +read3 0 chr2 1 0 8M * 0 0 TCGATCGA AAAAAAAA NM:i:0 +read4 0 chr2 1 0 4M1D2M1S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2 +