Skip to content

Commit

Permalink
Merge pull request #671 from cmnbroad/cn_validate_cram_md5
Browse files Browse the repository at this point in the history
Normalize CRAM reference bases and throw on MD5 validation failure.
  • Loading branch information
cmnbroad authored Aug 23, 2016
2 parents 0d0da87 + a781afa commit a31f6c8
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 15 deletions.
12 changes: 8 additions & 4 deletions src/main/java/htsjdk/samtools/CRAMIterator.java
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,14 @@ void nextContainer() throws IOException, IllegalArgumentException,
final Slice slice = container.slices[i];
if (slice.sequenceId < 0)
continue;
if (validationStringency != ValidationStringency.SILENT && !slice.validateRefMD5(refs)) {
log.error(String
.format("Reference sequence MD5 mismatch for slice: seq id %d, start %d, span %d, expected MD5 %s", slice.sequenceId,
slice.alignmentStart, slice.alignmentSpan, String.format("%032x", new BigInteger(1, slice.refMD5))));
if (!slice.validateRefMD5(refs)) {
final String msg = String.format(
"Reference sequence MD5 mismatch for slice: sequence id %d, start %d, span %d, expected MD5 %s",
slice.sequenceId,
slice.alignmentStart,
slice.alignmentSpan,
String.format("%032x", new BigInteger(1, slice.refMD5)));
throw new CRAMException(msg);
}
}

Expand Down
4 changes: 3 additions & 1 deletion src/main/java/htsjdk/samtools/cram/build/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
*/
package htsjdk.samtools.cram.build;

class Utils {
final public class Utils {

private Utils() {};

/**
* CRAM operates with upper case bases, so both read and ref bases should be
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ public interface CRAMReferenceSource {
* against the reference by using common name variations,
* such as adding or removing a leading "chr" prefix
* from the requested name. if false, use exact match
* @return the bases representing the requested sequence. or null if the sequence
* cannot be found
* @return the upper cased, normalized (see {@link htsjdk.samtools.cram.build.Utils#normalizeBase})
* bases representing the requested sequence, or null if the sequence cannot be found
*/
byte[] getReferenceBases(final SAMSequenceRecord sequenceRecord, final boolean tryNameVariants);
}
22 changes: 14 additions & 8 deletions src/main/java/htsjdk/samtools/cram/ref/ReferenceSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.cram.build.Utils;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
Expand Down Expand Up @@ -123,13 +124,23 @@ private byte[] findInCache(final String name) {
return null;
}

// Upper case and normalize (-> ACGTN) in-place, and add to the cache
private byte[] addToCache(final String sequenceName, final byte[] bases) {
for (int i = 0; i < bases.length; i++) {
bases[i] = Utils.normalizeBase(bases[i]);
}
cacheW.put(sequenceName, new WeakReference<byte[]>(bases));
return bases;
}

public synchronized byte[] getReferenceBases(final SAMSequenceRecord record,
final boolean tryNameVariants) {
{ // check cache by sequence name:
final String name = record.getSequenceName();
final byte[] bases = findInCache(name);
if (bases != null)
if (bases != null) {
return bases;
}
}

final String md5 = record.getAttribute(SAMSequenceRecord.MD5_TAG);
Expand All @@ -152,10 +163,7 @@ public synchronized byte[] getReferenceBases(final SAMSequenceRecord record,
{ // try to fetch sequence by name:
bases = findBasesByName(record.getSequenceName(), tryNameVariants);
if (bases != null) {
SequenceUtil.upperCase(bases);
cacheW.put(record.getSequenceName(), new WeakReference<byte[]>(
bases));
return bases;
return addToCache(record.getSequenceName(), bases);
}
}

Expand All @@ -169,9 +177,7 @@ public synchronized byte[] getReferenceBases(final SAMSequenceRecord record,
}
}
if (bases != null) {
SequenceUtil.upperCase(bases);
cacheW.put(md5, new WeakReference<byte[]>(bases));
return bases;
return addToCache(md5, bases);
}
}
}
Expand Down
5 changes: 5 additions & 0 deletions src/test/java/htsjdk/samtools/CRAMCRAIIndexerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ public void testCRAIIndexerFromContainer() throws IOException {
refSource,
ValidationStringency.STRICT);
SAMFileHeader samHeader = reader.getFileHeader();
Iterator<SAMRecord> it = reader.getIterator();
while(it.hasNext()) {
SAMRecord samRec = it.next();
}

reader.close();

FileInputStream fis = new FileInputStream(CRAMFile);
Expand Down
34 changes: 34 additions & 0 deletions src/test/java/htsjdk/samtools/cram/structure/SliceTests.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
package htsjdk.samtools.cram.structure;

import htsjdk.samtools.CRAMFileReader;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.ref.ReferenceSource;
import htsjdk.samtools.util.SequenceUtil;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;

/**
* Created by vadim on 07/12/2015.
*/
Expand Down Expand Up @@ -33,4 +42,29 @@ public void test_validateRef() {
Assert.assertEquals(slice.refMD5, md5);
Assert.assertTrue(slice.validateRefMD5(ref));
}

@Test(expectedExceptions= CRAMException.class)
public void testFailsMD5Check() throws IOException {
// auxf.alteredForMD5test.fa has been altered slightly from the original reference
// to cause the CRAM md5 check to fail
final File CRAMFile = new File("src/test/resources/htsjdk/samtools/cram/auxf#values.3.0.cram");
final File refFile = new File("src/test/resources/htsjdk/samtools/cram/auxf.alteredForMD5test.fa");
ReferenceSource refSource = new ReferenceSource(refFile);
CRAMFileReader reader = null;
try {
reader = new CRAMFileReader(
CRAMFile,
null,
refSource,
ValidationStringency.STRICT);
Iterator<SAMRecord> it = reader.getIterator();
while (it.hasNext()) {
it.next();
}
} finally {
if (reader != null) {
reader.close();
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>Sheila
CTAGCTCAGAAAAAAAAAA
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Sheila 19 8 19 20

0 comments on commit a31f6c8

Please sign in to comment.