diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index d25441a96f..0000000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,187 +0,0 @@
[187 deleted lines of IntelliJ project configuration; the XML content was not preserved in this extract]
\ No newline at end of file
diff --git a/build.xml b/build.xml
index ab80a7387e..ca5df6cd94 100755
--- a/build.xml
+++ b/build.xml
@@ -39,11 +39,7 @@
[five lines replaced by one; the XML content was not preserved in this extract]
diff --git a/lib/apache-ant-1.8.2-bzip2.jar b/lib/apache-ant-1.8.2-bzip2.jar
new file mode 100644
index 0000000000..0ca2928ec6
Binary files /dev/null and b/lib/apache-ant-1.8.2-bzip2.jar differ
diff --git a/lib/commons-compress-1.4.1.jar b/lib/commons-compress-1.4.1.jar
new file mode 100644
index 0000000000..b58761e812
Binary files /dev/null and b/lib/commons-compress-1.4.1.jar differ
diff --git a/lib/xz-1.5.jar b/lib/xz-1.5.jar
new file mode 100644
index 0000000000..2e9599ecfa
Binary files /dev/null and b/lib/xz-1.5.jar differ
diff --git a/src/java/htsjdk/samtools/BinaryTagCodec.java b/src/java/htsjdk/samtools/BinaryTagCodec.java
index 5574a976e4..902e3baebf 100644
--- a/src/java/htsjdk/samtools/BinaryTagCodec.java
+++ b/src/java/htsjdk/samtools/BinaryTagCodec.java
@@ -267,7 +267,7 @@ private void writeArray(final Object value, final boolean isUnsignedArray) {
      * @param length How many bytes in binaryRep are tag storage.
      */
     public static SAMBinaryTagAndValue readTags(final byte[] binaryRep, final int offset,
-                                                final int length, final ValidationStringency validationStringency) {
+                                                final int length, final ValidationStringency validationStringency) {
         final ByteBuffer byteBuffer = ByteBuffer.wrap(binaryRep, offset, length);
         byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
diff --git a/src/java/htsjdk/samtools/CRAMFileReader.java b/src/java/htsjdk/samtools/CRAMFileReader.java
index 12ee7d5ae8..79b1f5f9bc 100644
--- a/src/java/htsjdk/samtools/CRAMFileReader.java
+++ b/src/java/htsjdk/samtools/CRAMFileReader.java
@@ -17,13 +17,14 @@
 import htsjdk.samtools.SAMFileHeader.SortOrder;
 import htsjdk.samtools.SamReader.Type;
-import htsjdk.samtools.cram.build.CramIO;
 import htsjdk.samtools.cram.ref.ReferenceSource;
 import htsjdk.samtools.cram.structure.Container;
+import htsjdk.samtools.cram.structure.ContainerIO;
 import htsjdk.samtools.seekablestream.SeekableFileStream;
 import htsjdk.samtools.seekablestream.SeekableStream;
 import htsjdk.samtools.util.CloseableIterator;
 import htsjdk.samtools.util.CloserUtil;
+import htsjdk.samtools.util.CoordMath;
 import htsjdk.samtools.util.RuntimeEOFException;
 import java.io.File;
@@ -31,6 +32,8 @@
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Iterator;
 /**
  * {@link htsjdk.samtools.BAMFileReader BAMFileReader} analogue for CRAM files.
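For orientation, a minimal sketch of how the indexed-query support added to CRAMFileReader in the hunks below might be used. It is illustrative only, not part of this patch: the file paths and the reference FASTA are hypothetical, and a coordinate-sorted CRAM with a BAI index is assumed.

import htsjdk.samtools.CRAMFileReader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.cram.ref.ReferenceSource;
import htsjdk.samtools.util.CloseableIterator;

import java.io.File;

class CramQueryExample {
    static void printReadsStartingAt(final String sequence, final int start) {
        // Hypothetical paths; any coordinate-sorted, BAI-indexed CRAM would do.
        final CRAMFileReader reader = new CRAMFileReader(
                new File("sample.cram"),
                new File("sample.cram.bai"),
                new ReferenceSource(new File("ref.fasta")));
        final CloseableIterator<SAMRecord> it = reader.queryAlignmentStart(sequence, start);
        while (it.hasNext()) {
            System.out.println(it.next().getSAMString());
        }
        it.close();
        reader.close();
    }
}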
@@ -38,11 +41,12 @@ * * @author vadim */ -public class CRAMFileReader extends SamReader.ReaderImplementation { - private File file; +@SuppressWarnings("UnusedDeclaration") +public class CRAMFileReader extends SamReader.ReaderImplementation implements SamReader.Indexing { + private File cramFile; private final ReferenceSource referenceSource; - private InputStream is; - private CRAMIterator it; + private InputStream inputStream; + private CRAMIterator iterator; private BAMIndex mIndex; private File mIndexFile; private boolean mEnableIndexCaching; @@ -56,32 +60,31 @@ public class CRAMFileReader extends SamReader.ReaderImplementation { * {@link htsjdk.samtools.Defaults#REFERENCE_FASTA default} reference fasta * file will be used. * - * @param file CRAM file to open - * @param is CRAM stream to read + * @param cramFile CRAM file to open + * @param inputStream CRAM stream to read */ - public CRAMFileReader(final File file, final InputStream is) { - this(file, is, new ReferenceSource(Defaults.REFERENCE_FASTA)); + public CRAMFileReader(final File cramFile, final InputStream inputStream) { + this(cramFile, inputStream, new ReferenceSource(Defaults.REFERENCE_FASTA)); } /** * Open CRAM data for reading using either the file or the input stream * supplied in the arguments. * - * @param file CRAM file to read - * @param is index file to be used for random access + * @param cramFile CRAM file to read + * @param inputStream index file to be used for random access * @param referenceSource a {@link htsjdk.samtools.cram.ref.ReferenceSource source} of * reference sequences */ - public CRAMFileReader(final File file, final InputStream is, + public CRAMFileReader(final File cramFile, final InputStream inputStream, final ReferenceSource referenceSource) { - if (file == null && is == null) + if (cramFile == null && inputStream == null) throw new IllegalArgumentException( "Either file or input stream is required."); - this.file = file; - this.is = is; + this.cramFile = cramFile; + this.inputStream = inputStream; this.referenceSource = referenceSource; - getIterator(); } @@ -96,29 +99,37 @@ public CRAMFileReader(final File file, final InputStream is, */ public CRAMFileReader(final File cramFile, final File indexFile, final ReferenceSource referenceSource) { - if (file == null) + if (cramFile == null) throw new IllegalArgumentException("File is required."); - this.file = cramFile; + this.cramFile = cramFile; this.mIndexFile = indexFile; this.referenceSource = referenceSource; getIterator(); } - public CRAMFileReader(final File file, final ReferenceSource referenceSource) { - if (file == null && is == null) + public CRAMFileReader(final File cramFile, final ReferenceSource referenceSource) { + if (cramFile == null && inputStream == null) throw new IllegalArgumentException( "Either file or input stream is required."); - this.file = file; + this.cramFile = cramFile; this.referenceSource = referenceSource; getIterator(); } - public SAMRecordIterator iterator() { - return getIterator(); + public CRAMFileReader(final InputStream inputStream, final SeekableStream indexInputStream, + final ReferenceSource referenceSource, final ValidationStringency validationStringency) throws IOException { + this.inputStream = inputStream; + this.referenceSource = referenceSource; + this.validationStringency = validationStringency; + + iterator = new CRAMIterator(inputStream, referenceSource); + iterator.setValidationStringency(validationStringency); + if (indexInputStream != null) + mIndex = new CachingBAMFileIndex(indexInputStream, 
iterator.getSAMFileHeader().getSequenceDictionary()); } @Override @@ -162,26 +173,50 @@ public BAMIndex getIndex() { return mIndex; } + @Override + public boolean hasBrowseableIndex() { + return false; + } + + @Override + public BrowseableBAMIndex getBrowseableIndex() { + return null; + } + + @Override + public SAMRecordIterator iterator(final SAMFileSpan fileSpan) { + // get the file coordinates for the span: + final long[] coordinateArray = ((BAMFileSpan) fileSpan).toCoordinateArray(); + if (coordinateArray == null || coordinateArray.length == 0) return emptyIterator; + try { + // create an input stream that reads the source cram stream only within the coordinate pairs: + final SeekableStream seekableStream = getSeekableStreamOrFailWithRTE(); + return new CRAMIterator(seekableStream, referenceSource, coordinateArray); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + @Override public SAMFileHeader getFileHeader() { - return it.getSAMFileHeader(); + return iterator.getSAMFileHeader(); } @Override public SAMRecordIterator getIterator() { - if (it != null && file == null) - return it; + if (iterator != null && cramFile == null) + return iterator; try { - final CRAMIterator si; - if (file != null) { - si = new CRAMIterator(new FileInputStream(file), + final CRAMIterator newIterator; + if (cramFile != null) { + newIterator = new CRAMIterator(new FileInputStream(cramFile), referenceSource); } else - si = new CRAMIterator(is, referenceSource); + newIterator = new CRAMIterator(inputStream, referenceSource); - si.setValidationStringency(validationStringency); - it = si; - return it; + newIterator.setValidationStringency(validationStringency); + iterator = newIterator; + return iterator; } catch (final Exception e) { throw new RuntimeException(e); } @@ -189,12 +224,12 @@ public SAMRecordIterator getIterator() { @Override public CloseableIterator getIterator(final SAMFileSpan fileSpan) { - throw new RuntimeException("Not implemented."); + return iterator(fileSpan); } @Override public SAMFileSpan getFilePointerSpanningReads() { - throw new RuntimeException("Not implemented."); + return new BAMFileSpan(new Chunk(iterator.firstContainerOffset << 16, Long.MAX_VALUE)); } private static final SAMRecordIterator emptyIterator = new SAMRecordIterator() { @@ -229,8 +264,7 @@ public CloseableIterator queryAlignmentStart(final String sequence, final int start) { long[] filePointers = null; - // Hit the index to determine the chunk boundaries for the required - // data. + // Hit the index to determine the chunk boundaries for the required data. 
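        // Note on the pointer format (as written by CRAMIndexer elsewhere in this patch):
        // each index "file pointer" packs the container's byte offset into the upper bits
        // and the slice index within that container into the low 16 bits, i.e.
        // (containerOffset << 16) | sliceIndex. That is why the loop below recovers the
        // container offset with ">>> 16".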
final SAMFileHeader fileHeader = getFileHeader(); final int referenceIndex = fileHeader.getSequenceIndex(sequence); if (referenceIndex != -1) { @@ -244,76 +278,100 @@ public CloseableIterator queryAlignmentStart(final String sequence, if (filePointers == null || filePointers.length == 0) return emptyIterator; - final SeekableStream s = getSeekableStreamOrFailWithRTE(); - final CRAMIterator si; - try { - s.seek(0); - si = new CRAMIterator(s, referenceSource); - si.setValidationStringency(validationStringency); - it = si; - } catch (final IOException e) { - throw new RuntimeEOFException(e); - } - - Container c; + Container container; + final SeekableStream seekableStream = getSeekableStreamOrFailWithRTE(); for (int i = 0; i < filePointers.length; i += 2) { final long containerOffset = filePointers[i] >>> 16; + try { - s.seek(containerOffset); - c = CramIO.readContainerHeader(s); - if (c.alignmentStart + c.alignmentSpan > start) { - s.seek(containerOffset); - return si; + if (seekableStream.position() != containerOffset || iterator.container == null) { + seekableStream.seek(containerOffset); + container = ContainerIO.readContainerHeader(iterator.getCramHeader().getVersion().major, seekableStream); + if (container.alignmentStart + container.alignmentSpan > start) { + seekableStream.seek(containerOffset); + iterator.jumpWithinContainerToPos(fileHeader.getSequenceIndex(sequence), start); + return new IntervalIterator(iterator, new QueryInterval(referenceIndex, start, -1)); + } + } else { + container = iterator.container; + if (container.alignmentStart + container.alignmentSpan > start) { + iterator.jumpWithinContainerToPos(fileHeader.getSequenceIndex(sequence), start); + return new IntervalIterator(iterator, new QueryInterval(referenceIndex, start, -1)); + } } } catch (final IOException e) { throw new RuntimeException(e); } } - return it; + return iterator; + } + + CloseableIterator query(final int referenceIndex, + final int start, final int end, final boolean overlap) throws IOException { + long[] filePointers = null; + + // Hit the index to determine the chunk boundaries for the required data. + if (referenceIndex != -1) { + final BAMIndex fileIndex = getIndex(); + final BAMFileSpan fileSpan = fileIndex.getSpanOverlapping( + referenceIndex, start, -1); + filePointers = fileSpan != null ? 
fileSpan.toCoordinateArray() + : null; + } + + if (filePointers == null || filePointers.length == 0) + return emptyIterator; + + final CRAMIterator newIterator = new CRAMIterator(getSeekableStreamOrFailWithRTE(), referenceSource, filePointers); + return new IntervalIterator(newIterator, new QueryInterval(referenceIndex, start, end), overlap); } @Override public CloseableIterator queryUnmapped() { final long startOfLastLinearBin = getIndex().getStartOfLastLinearBin(); - final SeekableStream s = getSeekableStreamOrFailWithRTE(); - final CRAMIterator si; + final SeekableStream seekableStream = getSeekableStreamOrFailWithRTE(); + final CRAMIterator newIterator; try { - s.seek(0); - si = new CRAMIterator(s, referenceSource); - si.setValidationStringency(validationStringency); - s.seek(startOfLastLinearBin); - it = si; + seekableStream.seek(0); + newIterator = new CRAMIterator(seekableStream, referenceSource); + newIterator.setValidationStringency(validationStringency); + seekableStream.seek(startOfLastLinearBin >>> 16); + final Container container = ContainerIO.readContainerHeader(newIterator.getCramHeader().getVersion().major, seekableStream); + seekableStream.seek(seekableStream.position() + container.containerByteSize); + iterator = newIterator; + iterator.jumpWithinContainerToPos(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START); } catch (final IOException e) { throw new RuntimeEOFException(e); } - return it; + return iterator; } private SeekableStream getSeekableStreamOrFailWithRTE() { - SeekableStream s = null; - if (file != null) { + SeekableStream seekableStream = null; + if (cramFile != null) { try { - s = new SeekableFileStream(file); + seekableStream = new SeekableFileStream(cramFile); } catch (final FileNotFoundException e) { throw new RuntimeException(e); } - } else if (is instanceof SeekableStream) - s = (SeekableStream) is; - return s; + } else if (inputStream instanceof SeekableStream) + seekableStream = (SeekableStream) inputStream; + return seekableStream; } @Override public void close() { - CloserUtil.close(it); - CloserUtil.close(is); + CloserUtil.close(iterator); + CloserUtil.close(inputStream); CloserUtil.close(mIndex); } @Override void setValidationStringency(final ValidationStringency validationStringency) { this.validationStringency = validationStringency; + if (iterator != null) iterator.setValidationStringency(validationStringency); } @Override @@ -324,17 +382,7 @@ public ValidationStringency getValidationStringency() { @Override public CloseableIterator query(final QueryInterval[] intervals, final boolean contained) { - if (is == null) { - throw new IllegalStateException("File reader is closed"); - } - if (it != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (mIndex == null && mIndexFile == null) { - throw new UnsupportedOperationException( - "Cannot query stream-based BAM file"); - } - throw new SAMException("Multiple interval queries not implemented."); + return new MultiIntervalIterator(Arrays.asList(intervals).iterator(), !contained); } @Override @@ -344,7 +392,136 @@ public Type type() { @Override void enableFileSource(final SamReader reader, final boolean enabled) { - if (it != null) - it.setFileSource(enabled ? reader : null); + if (iterator != null) + iterator.setFileSource(enabled ? 
reader : null); + } + + private class MultiIntervalIterator implements SAMRecordIterator { + private final Iterator queries; + private CloseableIterator iterator; + private final boolean overlap; + + public MultiIntervalIterator(final Iterator queries, final boolean overlap) { + this.queries = queries; + this.overlap = overlap; + } + + @Override + public SAMRecordIterator assertSorted(final SortOrder sortOrder) { + return null; + } + + @Override + public void close() { + + } + + @Override + public boolean hasNext() { + if (iterator == null || !iterator.hasNext()) { + if (!queries.hasNext()) return false; + do { + final QueryInterval query = queries.next(); + try { + iterator = query(query.referenceIndex, query.start, query.end, overlap); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } while (!iterator.hasNext() && queries.hasNext()); + } + return iterator.hasNext(); + } + + @Override + public SAMRecord next() { + return iterator.next(); + } + + @Override + public void remove() { + iterator.remove(); + } + } + + public static class IntervalIterator implements SAMRecordIterator { + private final CloseableIterator delegate; + private final QueryInterval interval; + private SAMRecord next; + private boolean noMore = false; + private final boolean overlap; + + public IntervalIterator(final CloseableIterator delegate, final QueryInterval interval) { + this(delegate, interval, true); + } + + public IntervalIterator(final CloseableIterator delegate, final QueryInterval interval, final boolean overlap) { + this.delegate = delegate; + this.interval = interval; + this.overlap = overlap; + } + + @Override + public SAMRecordIterator assertSorted(final SortOrder sortOrder) { + return null; + } + + @Override + public void close() { + delegate.close(); + } + + @Override + public boolean hasNext() { + if (next != null) return true; + if (noMore) return false; + + while (delegate.hasNext()) { + next = delegate.next(); + + if (isWithinTheInterval(next)) break; + if (isBeyondTheInterval(next)) { + next = null; + noMore = true; + return false; + } + next = null; + } + + return next != null; + } + + boolean isWithinTheInterval(final SAMRecord record) { + final boolean refMatch = record.getReferenceIndex() == interval.referenceIndex; + if (interval.start == -1) return refMatch; + + final int start = record.getAlignmentStart(); + final int end = record.getAlignmentEnd(); + if (overlap) { + return CoordMath.overlaps(start, end, interval.start, interval.end < 0 ? Integer.MAX_VALUE : interval.end); + } else { + // contained: + return CoordMath.encloses(interval.start, interval.end < 0 ? 
Integer.MAX_VALUE : interval.end, start, end); + } + + } + + boolean isBeyondTheInterval(final SAMRecord record) { + if (record.getReadUnmappedFlag()) return false; + final boolean refMatch = record.getReferenceIndex() == interval.referenceIndex; + return !refMatch || interval.end != -1 && record.getAlignmentStart() > interval.end; + + } + + @Override + public SAMRecord next() { + final SAMRecord result = next; + next = null; + return result; + } + + @Override + public void remove() { + throw new RuntimeException("Not available."); + } } } diff --git a/src/java/htsjdk/samtools/CRAMFileWriter.java b/src/java/htsjdk/samtools/CRAMFileWriter.java index 4a85509212..dc83bc30f8 100644 --- a/src/java/htsjdk/samtools/CRAMFileWriter.java +++ b/src/java/htsjdk/samtools/CRAMFileWriter.java @@ -18,6 +18,7 @@ import htsjdk.samtools.cram.build.ContainerFactory; import htsjdk.samtools.cram.build.Cram2SamRecordFactory; import htsjdk.samtools.cram.build.CramIO; +import htsjdk.samtools.cram.build.CramNormalizer; import htsjdk.samtools.cram.build.Sam2CramRecordFactory; import htsjdk.samtools.cram.common.CramVersions; import htsjdk.samtools.cram.common.Version; @@ -26,6 +27,7 @@ import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.cram.ref.ReferenceTracks; import htsjdk.samtools.cram.structure.Container; +import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; @@ -41,89 +43,83 @@ import java.util.TreeMap; import java.util.TreeSet; +@SuppressWarnings("UnusedDeclaration") public class CRAMFileWriter extends SAMFileWriterImpl { - private static final int REF_SEQ_INDEX_NOT_INITED = -2; - private static final int DEFAULT_RECORDS_PER_SLICE = 10000; + private static final int REF_SEQ_INDEX_NOT_INITIALIZED = -2; + static int DEFAULT_RECORDS_PER_SLICE = 10000; private static final int DEFAULT_SLICES_PER_CONTAINER = 1; private static final Version cramVersion = CramVersions.CRAM_v2_1; - private String fileName; - private List samRecords = new ArrayList(); + private final String fileName; + private final List samRecords = new ArrayList(); private ContainerFactory containerFactory; - protected int recordsPerSlice = DEFAULT_RECORDS_PER_SLICE; - protected int containerSize = recordsPerSlice - * DEFAULT_SLICES_PER_CONTAINER; + protected final int recordsPerSlice = DEFAULT_RECORDS_PER_SLICE; + protected final int containerSize = recordsPerSlice * DEFAULT_SLICES_PER_CONTAINER; - private Sam2CramRecordFactory sam2CramRecordFactory; - private OutputStream os; + private final OutputStream outputStream; private ReferenceSource source; - private int refSeqIndex = REF_SEQ_INDEX_NOT_INITED; + private int refSeqIndex = REF_SEQ_INDEX_NOT_INITIALIZED; - private static Log log = Log.getInstance(CRAMFileWriter.class); + private static final Log log = Log.getInstance(CRAMFileWriter.class); - private SAMFileHeader samFileHeader; + private final SAMFileHeader samFileHeader; private boolean preserveReadNames = true; private QualityScorePreservation preservation = null; private boolean captureAllTags = true; private Set captureTags = new TreeSet(); private Set ignoreTags = new TreeSet(); - public CRAMFileWriter(OutputStream os, ReferenceSource source, - SAMFileHeader samFileHeader, String fileName) { - this.os = os; + private CRAMIndexer indexer; + private long offset; + + public CRAMFileWriter(final OutputStream outputStream, final ReferenceSource source, final SAMFileHeader 
samFileHeader, final String fileName) { + this(outputStream, null, source, samFileHeader, fileName); + } + + public CRAMFileWriter(final OutputStream outputStream, final OutputStream indexOS, final ReferenceSource source, final SAMFileHeader samFileHeader, final String fileName) { + this.outputStream = outputStream; this.source = source; this.samFileHeader = samFileHeader; this.fileName = fileName; setSortOrder(samFileHeader.getSortOrder(), true); setHeader(samFileHeader); - if (this.source == null) - this.source = new ReferenceSource(Defaults.REFERENCE_FASTA); + if (this.source == null) this.source = new ReferenceSource(Defaults.REFERENCE_FASTA); containerFactory = new ContainerFactory(samFileHeader, recordsPerSlice); + if (indexOS != null) indexer = new CRAMIndexer(indexOS, samFileHeader); } /** - * Decide if the current container should be completed and flushed. The - * decision is based on a) number of records and b) if the reference - * sequence id has changed. + * Decide if the current container should be completed and flushed. The decision is based on a) number of records and b) if the + * reference sequence id has changed. * * @param nextRecord the record to be added into the current or next container - * @return true if the current container should be flushed and the following - * records should go into a new container; false otherwise. + * @return true if the current container should be flushed and the following records should go into a new container; false otherwise. */ - protected boolean shouldFlushContainer(SAMRecord nextRecord) { - if (samRecords.size() >= containerSize) - return true; + protected boolean shouldFlushContainer(final SAMRecord nextRecord) { + return samRecords.size() >= containerSize || refSeqIndex != REF_SEQ_INDEX_NOT_INITIALIZED && refSeqIndex != nextRecord.getReferenceIndex(); - if (refSeqIndex != REF_SEQ_INDEX_NOT_INITED - && refSeqIndex != nextRecord.getReferenceIndex()) - return true; - - return false; } - private static void updateTracks(List samRecords, - ReferenceTracks tracks) { - for (SAMRecord samRecord : samRecords) { + private static void updateTracks(final List samRecords, final ReferenceTracks tracks) { + for (final SAMRecord samRecord : samRecords) { if (samRecord.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { int refPos = samRecord.getAlignmentStart(); int readPos = 0; - for (CigarElement ce : samRecord.getCigar().getCigarElements()) { - if (ce.getOperator().consumesReferenceBases()) { - for (int i = 0; i < ce.getLength(); i++) - tracks.addCoverage(refPos + i, 1); + for (final CigarElement cigarElement : samRecord.getCigar().getCigarElements()) { + if (cigarElement.getOperator().consumesReferenceBases()) { + for (int elementIndex = 0; elementIndex < cigarElement.getLength(); elementIndex++) + tracks.addCoverage(refPos + elementIndex, 1); } - switch (ce.getOperator()) { + switch (cigarElement.getOperator()) { case M: case X: case EQ: - for (int i = readPos; i < ce.getLength(); i++) { - byte readBase = samRecord.getReadBases()[readPos - + i]; - byte refBase = tracks.baseAt(refPos + i); - if (readBase != refBase) - tracks.addMismatches(refPos + i, 1); + for (int pos = readPos; pos < cigarElement.getLength(); pos++) { + final byte readBase = samRecord.getReadBases()[readPos + pos]; + final byte refBase = tracks.baseAt(refPos + pos); + if (readBase != refBase) tracks.addMismatches(refPos + pos, 1); } break; @@ -131,10 +127,8 @@ private static void updateTracks(List samRecords, break; } - readPos += ce.getOperator().consumesReadBases() ? 
ce - .getLength() : 0; - refPos += ce.getOperator().consumesReferenceBases() ? ce - .getLength() : 0; + readPos += cigarElement.getOperator().consumesReadBases() ? cigarElement.getLength() : 0; + refPos += cigarElement.getOperator().consumesReferenceBases() ? cigarElement.getLength() : 0; } } } @@ -147,24 +141,23 @@ private static void updateTracks(List samRecords, * @throws IllegalAccessException * @throws IOException */ - protected void flushContainer() throws IllegalArgumentException, - IllegalAccessException, IOException { - - byte[] refs; - if (refSeqIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) - refs = new byte[0]; - else - refs = source.getReferenceBases( - samFileHeader.getSequence(refSeqIndex), true); + protected void flushContainer() throws IllegalArgumentException, IllegalAccessException, IOException { + + final byte[] refs; + String refSeqName = null; + if (refSeqIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) refs = new byte[0]; + else { + final SAMSequenceRecord sequence = samFileHeader.getSequence(refSeqIndex); + refs = source.getReferenceBases(sequence, true); + refSeqName = sequence.getSequenceName(); + } int start = SAMRecord.NO_ALIGNMENT_START; int stop = SAMRecord.NO_ALIGNMENT_START; - for (SAMRecord r : samRecords) { - if (r.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) - continue; + for (final SAMRecord r : samRecords) { + if (r.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) continue; - if (start == SAMRecord.NO_ALIGNMENT_START) - start = r.getAlignmentStart(); + if (start == SAMRecord.NO_ALIGNMENT_START) start = r.getAlignmentStart(); start = Math.min(r.getAlignmentStart(), start); stop = Math.max(r.getAlignmentEnd(), stop); @@ -172,17 +165,15 @@ protected void flushContainer() throws IllegalArgumentException, ReferenceTracks tracks = null; if (preservation != null && preservation.areReferenceTracksRequired()) { - if (tracks == null || tracks.getSequenceId() != refSeqIndex) - tracks = new ReferenceTracks(refSeqIndex, refs); + tracks = new ReferenceTracks(refSeqIndex, refSeqName, refs); + tracks.ensureRange(start, stop - start + 1); updateTracks(samRecords, tracks); } - List cramRecords = new ArrayList( - samRecords.size()); + final List cramRecords = new ArrayList(samRecords.size()); - sam2CramRecordFactory = new Sam2CramRecordFactory(refSeqIndex, refs, - samFileHeader); + final Sam2CramRecordFactory sam2CramRecordFactory = new Sam2CramRecordFactory(refs, samFileHeader, cramVersion); sam2CramRecordFactory.preserveReadNames = preserveReadNames; sam2CramRecordFactory.captureAllTags = captureAllTags; sam2CramRecordFactory.captureTags.addAll(captureTags); @@ -191,103 +182,162 @@ protected void flushContainer() throws IllegalArgumentException, int index = 0; int prevAlStart = start; - for (SAMRecord samRecord : samRecords) { - CramCompressionRecord cramRecord = sam2CramRecordFactory - .createCramRecord(samRecord); + for (final SAMRecord samRecord : samRecords) { + final CramCompressionRecord cramRecord = sam2CramRecordFactory.createCramRecord(samRecord); cramRecord.index = ++index; - cramRecord.alignmentDelta = samRecord.getAlignmentStart() - - prevAlStart; + cramRecord.alignmentDelta = samRecord.getAlignmentStart() - prevAlStart; cramRecord.alignmentStart = samRecord.getAlignmentStart(); prevAlStart = samRecord.getAlignmentStart(); cramRecords.add(cramRecord); - if (preservation != null) - preservation.addQualityScores(samRecord, cramRecord, tracks); - else - cramRecord.setForcePreserveQualityScores(true); + if (preservation != null) 
preservation.addQualityScores(samRecord, cramRecord, tracks); + else if (cramRecord.qualityScores != SAMRecord.NULL_QUALS) cramRecord.setForcePreserveQualityScores(true); } - // samRecords.clear(); - - if (sam2CramRecordFactory.getBaseCount() < 3 * sam2CramRecordFactory - .getFeatureCount()) + if (sam2CramRecordFactory.getBaseCount() < 3 * sam2CramRecordFactory.getFeatureCount()) log.warn("Abnormally high number of mismatches, possibly wrong reference."); - // mating: - Map primaryMateMap = new TreeMap(); - Map secondaryMateMap = new TreeMap(); - for (CramCompressionRecord r : cramRecords) { - if (!r.isMultiFragment()) { - r.setDetached(true); - - r.setHasMateDownStream(false); - r.recordsToNextFragment = -1; - r.next = null; - r.previous = null; - } else { - String name = r.readName; - Map mateMap = r - .isSecondaryAlignment() ? secondaryMateMap - : primaryMateMap; - CramCompressionRecord mate = mateMap.get(name); - if (mate == null) { - mateMap.put(name, r); - } else { - mate.recordsToNextFragment = r.index - mate.index - 1; - mate.next = r; - r.previous = mate; - r.previous.setHasMateDownStream(true); - r.setHasMateDownStream(false); - r.setDetached(false); - r.previous.setDetached(false); - - mateMap.remove(name); + { + if (samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate) { + // mating: + final Map primaryMateMap = new TreeMap(); + final Map secondaryMateMap = new TreeMap(); + for (final CramCompressionRecord r : cramRecords) { + if (!r.isMultiFragment()) { + r.setDetached(true); + + r.setHasMateDownStream(false); + r.recordsToNextFragment = -1; + r.next = null; + r.previous = null; + } else { + final String name = r.readName; + final Map mateMap = r.isSecondaryAlignment() ? secondaryMateMap : primaryMateMap; + final CramCompressionRecord mate = mateMap.get(name); + if (mate == null) { + mateMap.put(name, r); + } else { + CramCompressionRecord prev = mate; + while (prev.next != null) prev = prev.next; + prev.recordsToNextFragment = r.index - prev.index - 1; + prev.next = r; + r.previous = prev; + r.previous.setHasMateDownStream(true); + r.setHasMateDownStream(false); + r.setDetached(false); + r.previous.setDetached(false); + } + } } - } - } - for (CramCompressionRecord r : primaryMateMap.values()) { - r.setDetached(true); + // mark unpredictable reads as detached: + for (final CramCompressionRecord cramRecord : cramRecords) { + if (cramRecord.next == null || cramRecord.previous != null) continue; + CramCompressionRecord last = cramRecord; + while (last.next != null) last = last.next; - r.setHasMateDownStream(false); - r.recordsToNextFragment = -1; - r.next = null; - r.previous = null; - } + if (cramRecord.isFirstSegment() && last.isLastSegment()) { + + final int templateLength = CramNormalizer.computeInsertSize(cramRecord, last); + + if (cramRecord.templateSize == templateLength) { + last = cramRecord.next; + while (last.next != null) { + if (last.templateSize != -templateLength) + break; + + last = last.next; + } + if (last.templateSize != -templateLength) detach(cramRecord); + } + } else detach(cramRecord); + } + + for (final CramCompressionRecord cramRecord : primaryMateMap.values()) { + if (cramRecord.next != null) continue; + cramRecord.setDetached(true); - for (CramCompressionRecord r : secondaryMateMap.values()) { - r.setDetached(true); + cramRecord.setHasMateDownStream(false); + cramRecord.recordsToNextFragment = -1; + cramRecord.next = null; + cramRecord.previous = null; + } - r.setHasMateDownStream(false); - r.recordsToNextFragment = -1; - r.next = null; - 
r.previous = null; + for (final CramCompressionRecord cramRecord : secondaryMateMap.values()) { + if (cramRecord.next != null) continue; + cramRecord.setDetached(true); + + cramRecord.setHasMateDownStream(false); + cramRecord.recordsToNextFragment = -1; + cramRecord.next = null; + cramRecord.previous = null; + } + } + else { + for (final CramCompressionRecord cramRecord : cramRecords) { + cramRecord.setDetached(true); + } + } } - Cram2SamRecordFactory f = new Cram2SamRecordFactory(samFileHeader); - for (int i = 0; i < samRecords.size(); i++) { - String s1 = samRecords.get(i).getSAMString(); - SAMRecord r = f.create(cramRecords.get(i)); - String s2 = r.getSAMString(); - assert (s1.equals(s2)); + + { + /** + * The following passage is for paranoid mode only. When java is run with asserts on it will throw an {@link AssertionError} if + * read bases or quality scores of a restored SAM record mismatch the original. This is effectively a runtime round trip test. + */ + @SuppressWarnings("UnusedAssignment") boolean assertsEnabled = false; + //noinspection AssertWithSideEffects,ConstantConditions + assert assertsEnabled = true; + //noinspection ConstantConditions + if (assertsEnabled) { + final Cram2SamRecordFactory f = new Cram2SamRecordFactory(samFileHeader); + for (int i = 0; i < samRecords.size(); i++) { + final SAMRecord restoredSamRecord = f.create(cramRecords.get(i)); + assert (restoredSamRecord.getAlignmentStart() == samRecords.get(i).getAlignmentStart()); + assert (restoredSamRecord.getReferenceName().equals(samRecords.get(i).getReferenceName())); + assert (restoredSamRecord.getReadString().equals(samRecords.get(i).getReadString())); + assert (restoredSamRecord.getBaseQualityString().equals(samRecords.get(i).getBaseQualityString())); + } + } } - Container container = containerFactory.buildContainer(cramRecords); - for (Slice slice : container.slices) + final Container container = containerFactory.buildContainer(cramRecords); + for (final Slice slice : container.slices) slice.setRefMD5(refs); - CramIO.writeContainer(container, os); + container.offset = offset; + offset += ContainerIO.writeContainer(cramVersion, container, outputStream); + if (indexer != null) { + for (final Slice slice : container.slices) { + indexer.processAlignment(slice); + } + } samRecords.clear(); } + /** + * Traverse the graph and mark all segments as detached. + * + * @param cramRecord the starting point of the graph + */ + private static void detach(CramCompressionRecord cramRecord) { + do { + cramRecord.setDetached(true); + + cramRecord.setHasMateDownStream(false); + cramRecord.recordsToNextFragment = -1; + } + while ((cramRecord = cramRecord.next) != null); + } + @Override - protected void writeAlignment(SAMRecord alignment) { - if (shouldFlushContainer(alignment)) - try { - flushContainer(); - } catch (Exception e) { - throw new RuntimeException(e); - } + protected void writeAlignment(final SAMRecord alignment) { + if (shouldFlushContainer(alignment)) try { + flushContainer(); + } catch (final Exception e) { + throw new RuntimeException(e); + } updateReferenceContext(alignment.getReferenceIndex()); @@ -295,36 +345,28 @@ protected void writeAlignment(SAMRecord alignment) { } /** - * Check if the reference has changed and create a new record factory using - * the new reference. + * Check if the reference has changed and create a new record factory using the new reference. 
* * @param samRecordReferenceIndex index of the new reference sequence */ - private void updateReferenceContext(int samRecordReferenceIndex) { - if (refSeqIndex == REF_SEQ_INDEX_NOT_INITED) { + private void updateReferenceContext(final int samRecordReferenceIndex) { + if (refSeqIndex == REF_SEQ_INDEX_NOT_INITIALIZED) { refSeqIndex = samRecordReferenceIndex; - } else { - int newRefSeqIndex = samRecordReferenceIndex; - if (refSeqIndex != newRefSeqIndex) { - refSeqIndex = newRefSeqIndex; - } - } + } else + if (refSeqIndex != samRecordReferenceIndex) refSeqIndex = samRecordReferenceIndex; } @Override - protected void writeHeader(String textHeader) { + protected void writeHeader(final String textHeader) { // TODO: header must be written exactly once per writer life cycle. - SAMFileHeader header = new SAMTextHeaderCodec().decode( - new StringLineReader(textHeader), (fileName != null ? fileName - : null)); + final SAMFileHeader header = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader), (fileName != null ? fileName : null)); containerFactory = new ContainerFactory(header, recordsPerSlice); - CramHeader cramHeader = new CramHeader(cramVersion.major, - cramVersion.minor, fileName, header); + final CramHeader cramHeader = new CramHeader(cramVersion, fileName, header); try { - CramIO.writeCramHeader(cramHeader, os); - } catch (IOException e) { + offset = CramIO.writeCramHeader(cramHeader, outputStream); + } catch (final IOException e) { throw new RuntimeException(e); } } @@ -332,11 +374,12 @@ protected void writeHeader(String textHeader) { @Override protected void finish() { try { - if (!samRecords.isEmpty()) - flushContainer(); - CramIO.issueZeroB_EOF_marker(os); - os.flush(); - } catch (Exception e) { + if (!samRecords.isEmpty()) flushContainer(); + CramIO.issueEOF(cramVersion, outputStream); + outputStream.flush(); + if (indexer != null) + indexer.finish(); + } catch (final Exception e) { throw new RuntimeException(e); } } @@ -350,7 +393,7 @@ public boolean isPreserveReadNames() { return preserveReadNames; } - public void setPreserveReadNames(boolean preserveReadNames) { + public void setPreserveReadNames(final boolean preserveReadNames) { this.preserveReadNames = preserveReadNames; } @@ -366,7 +409,7 @@ public boolean isCaptureAllTags() { return captureAllTags; } - public void setCaptureAllTags(boolean captureAllTags) { + public void setCaptureAllTags(final boolean captureAllTags) { this.captureAllTags = captureAllTags; } @@ -374,7 +417,7 @@ public Set getCaptureTags() { return captureTags; } - public void setCaptureTags(Set captureTags) { + public void setCaptureTags(final Set captureTags) { this.captureTags = captureTags; } @@ -382,7 +425,7 @@ public Set getIgnoreTags() { return ignoreTags; } - public void setIgnoreTags(Set ignoreTags) { + public void setIgnoreTags(final Set ignoreTags) { this.ignoreTags = ignoreTags; } } diff --git a/src/java/htsjdk/samtools/CRAMIndexer.java b/src/java/htsjdk/samtools/CRAMIndexer.java index 77514fe437..338874f8c6 100755 --- a/src/java/htsjdk/samtools/CRAMIndexer.java +++ b/src/java/htsjdk/samtools/CRAMIndexer.java @@ -38,10 +38,18 @@ */ package htsjdk.samtools; +import htsjdk.samtools.cram.build.CramIO; +import htsjdk.samtools.cram.structure.Container; +import htsjdk.samtools.cram.structure.ContainerIO; +import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; +import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.BlockCompressedFilePointerUtil; +import htsjdk.samtools.util.Log; 
+import org.testng.Assert; import java.io.File; +import java.io.IOException; import java.io.OutputStream; import java.util.Arrays; import java.util.List; @@ -49,9 +57,9 @@ /** * Class for both constructing BAM index content and writing it out. * There are two usage patterns: - * 1) Building a bam index from an existing bam file - * 2) Building a bam index while building the bam file - * In both cases, processAlignment is called for each alignment record and + * 1) Building a bam index from an existing cram file + * 2) Building a bam index while building the cram file + * In both cases, processAlignment is called for each cram slice and * finish() is called at the end. */ public class CRAMIndexer { @@ -68,10 +76,12 @@ public class CRAMIndexer { private final BAMIndexBuilder indexBuilder; /** + * Create a CRAM indexer that writes BAI to a file. + * * @param output binary BAM Index (.bai) file * @param fileHeader header for the corresponding bam file */ - public CRAMIndexer(final File output, SAMFileHeader fileHeader) { + public CRAMIndexer(final File output, final SAMFileHeader fileHeader) { numReferences = fileHeader.getSequenceDictionary().size(); indexBuilder = new BAMIndexBuilder(fileHeader); @@ -79,12 +89,12 @@ public CRAMIndexer(final File output, SAMFileHeader fileHeader) { } /** - * Prepare to index a BAM. + * Create a CRAM indexer that writes BAI to a stream. * * @param output Index will be written here. output will be closed when finish() method is called. * @param fileHeader header for the corresponding bam file. */ - public CRAMIndexer(final OutputStream output, SAMFileHeader fileHeader) { + public CRAMIndexer(final OutputStream output, final SAMFileHeader fileHeader) { numReferences = fileHeader.getSequenceDictionary().size(); indexBuilder = new BAMIndexBuilder(fileHeader); @@ -92,13 +102,13 @@ public CRAMIndexer(final OutputStream output, SAMFileHeader fileHeader) { } /** - * Record any index information for a given BAM record. + * Record any index information for a given CRAM slice. * If this alignment starts a new reference, write out the old reference. * Requires a non-null value for rec.getFileSource(). * - * @param rec The BAM record + * @param slice The CRAM slice */ - public void processAlignment(Slice slice) { + public void processAlignment(final Slice slice) { try { final int reference = slice.sequenceId; if (reference != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && reference != currentReference) { @@ -106,13 +116,13 @@ public void processAlignment(Slice slice) { advanceToReference(reference); } indexBuilder.processAlignment(slice); - } catch (Exception e) { - throw new SAMException("Exception creating BAM index for record " + slice, e); + } catch (final Exception e) { + throw new SAMException("Exception creating BAM index for slice " + slice, e); } } /** - * After all the alignment records have been processed, finish is called. + * After all the slices have been processed, finish is called. * Writes any final information and closes the output file. 
*/ public void finish() { @@ -122,10 +132,12 @@ public void finish() { outputWriter.close(); } - /** write out any references between the currentReference and the nextReference */ - private void advanceToReference(int nextReference) { + /** + * write out any references between the currentReference and the nextReference + */ + private void advanceToReference(final int nextReference) { while (currentReference < nextReference) { - BAMIndexContent content = indexBuilder.processReference(currentReference); + final BAMIndexContent content = indexBuilder.processReference(currentReference); outputWriter.writeReference(content); currentReference++; indexBuilder.startNewReference(); @@ -144,23 +156,23 @@ static public void createAndWriteIndex(final File input, final File output, fina // content is from an existing bai file. final CachingBAMFileIndex existingIndex = new CachingBAMFileIndex(input, null); - final int n_ref = existingIndex.getNumberOfReferences(); + final int nRef = existingIndex.getNumberOfReferences(); final BAMIndexWriter outputWriter; if (textOutput) { - outputWriter = new TextualBAMIndexWriter(n_ref, output); + outputWriter = new TextualBAMIndexWriter(nRef, output); } else { - outputWriter = new BinaryBAMIndexWriter(n_ref, output); + outputWriter = new BinaryBAMIndexWriter(nRef, output); } // write the content one reference at a time try { - for (int i = 0; i < n_ref; i++) { + for (int i = 0; i < nRef; i++) { outputWriter.writeReference(existingIndex.getQueryResults(i)); } outputWriter.writeNoCoordinateRecordCount(existingIndex.getNoCoordinateCount()); outputWriter.close(); - } catch (Exception e) { + } catch (final Exception e) { throw new SAMException("Exception creating BAM index", e); } } @@ -184,16 +196,16 @@ private class BAMIndexBuilder { private int largestIndexSeen = -1; // information in meta data - private BAMIndexMetaData indexStats = new BAMIndexMetaData(); + private final BAMIndexMetaData indexStats = new BAMIndexMetaData(); /** - * @param header SAMFileheader used for reference name (in index stats) and for max bin number + * @param header SAMFileHeader used for reference name (in index stats) and for max bin number */ - BAMIndexBuilder(SAMFileHeader header) { + BAMIndexBuilder(final SAMFileHeader header) { this.bamHeader = header; } - private int computeIndexingBin(Slice slice) { + private int computeIndexingBin(final Slice slice) { // reg2bin has zero-based, half-open API final int alignmentStart = slice.alignmentStart - 1; int alignmentEnd = slice.alignmentStart + slice.alignmentSpan - 1; @@ -202,16 +214,16 @@ private int computeIndexingBin(Slice slice) { // then treat this as a one base alignment for indexing purposes. alignmentEnd = alignmentStart + 1; } - return SAMUtils.reg2bin(alignmentStart, alignmentEnd); + return GenomicIndexUtil.reg2bin(alignmentStart, alignmentEnd); } /** * Record any index information for a given BAM record * - * @param rec The BAM record. Requires rec.getFileSource() is non-null. + * @param slice The BAM record. Requires rec.getFileSource() is non-null. 
*/ - public void processAlignment(Slice slice) { + public void processAlignment(final Slice slice) { // metadata indexStats.recordMetaData(slice); @@ -257,7 +269,7 @@ public void processAlignment(Slice slice) { final long chunkStart = (slice.containerOffset << 16) | slice.index; final long chunkEnd = ((slice.containerOffset << 16) | slice.index) + 1; - Chunk newChunk = new Chunk(chunkStart, chunkEnd); + final Chunk newChunk = new Chunk(chunkStart, chunkEnd); final List oldChunks = bin.getChunkList(); if (!bin.containsChunks()) { @@ -308,7 +320,7 @@ public void processAlignment(Slice slice) { * Creates the BAMIndexContent for this reference. * Requires all alignments of the reference have already been processed. */ - public BAMIndexContent processReference(int reference) { + public BAMIndexContent processReference(final int reference) { if (reference != currentReference) { throw new SAMException("Unexpected reference " + reference + " when constructing index for " + currentReference); @@ -361,4 +373,46 @@ void startNewReference() { indexStats.newReference(); } } + + /** + * Generates a BAI index file from an input CRAM stream + * + * @param stream CRAM stream to index + * @param output File for output index file + * @param log optional {@link htsjdk.samtools.util.Log} to output progress + */ + public static void createIndex(final SeekableStream stream, final File output, final Log log) throws IOException { + + final CramHeader cramHeader = CramIO.readCramHeader(stream); + final CRAMIndexer indexer = new CRAMIndexer(output, cramHeader.getSamFileHeader()); + + int totalRecords = 0; + Container container = null; + do { + if (++totalRecords % 10 == 0) + if (null != log) log.info(totalRecords + " slices processed ..."); + + try { + final long offset = stream.position(); + container = ContainerIO.readContainer(cramHeader.getVersion(), stream); + if (container == null || container.isEOF()) + break; + + container.offset = offset; + + int i = 0; + for (final Slice slice : container.slices) { + slice.containerOffset = offset; + slice.index = i++; + indexer.processAlignment(slice); + } + + } catch (final IOException e) { + Assert.fail("Failed to read cram container", e); + } + + } while (!container.isEOF()); + + indexer.finish(); + } } diff --git a/src/java/htsjdk/samtools/CRAMIterator.java b/src/java/htsjdk/samtools/CRAMIterator.java index b423a16a0e..fc8915f00b 100644 --- a/src/java/htsjdk/samtools/CRAMIterator.java +++ b/src/java/htsjdk/samtools/CRAMIterator.java @@ -8,7 +8,7 @@ * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, + * distributed under the License countingInputStream distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
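A minimal usage sketch of the new CRAMIndexer.createIndex entry point added above; the paths are hypothetical and the input is assumed to be a coordinate-sorted CRAM.

import htsjdk.samtools.CRAMIndexer;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.util.Log;

import java.io.File;
import java.io.IOException;

class CramBaiExample {
    static void indexCram() throws IOException {
        final File cram = new File("sample.cram");     // hypothetical input CRAM
        final File bai = new File("sample.cram.bai");  // hypothetical output index
        final SeekableFileStream stream = new SeekableFileStream(cram);
        try {
            // Walks the containers and slices, feeding each slice to the indexer.
            CRAMIndexer.createIndex(stream, bai, Log.getInstance(CramBaiExample.class));
        } finally {
            stream.close();
        }
    }
}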
@@ -18,68 +18,90 @@ import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.cram.build.ContainerParser; import htsjdk.samtools.cram.build.Cram2SamRecordFactory; -import htsjdk.samtools.cram.build.CramIO; +import htsjdk.samtools.cram.build.CramContainerIterator; import htsjdk.samtools.cram.build.CramNormalizer; +import htsjdk.samtools.cram.build.CramSpanContainerIterator; import htsjdk.samtools.cram.io.CountingInputStream; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.cram.structure.Container; +import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; +import htsjdk.samtools.seekablestream.SeekableStream; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeEOFException; import htsjdk.samtools.util.SequenceUtil; -import java.io.BufferedInputStream; -import java.io.EOFException; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.math.BigInteger; -import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.Collections; import java.util.Iterator; import java.util.List; public class CRAMIterator implements SAMRecordIterator { - private static Log log = Log.getInstance(CRAMIterator.class); - private CountingInputStream is; + private static final Log log = Log.getInstance(CRAMIterator.class); + private final CountingInputStream countingInputStream; private CramHeader cramHeader; private ArrayList records; - private int recordCounter = 0; private SAMRecord nextRecord = null; + @SuppressWarnings({"CanBeFinal", "FieldCanBeLocal"}) private boolean restoreNMTag = true; + @SuppressWarnings({"CanBeFinal", "FieldCanBeLocal"}) private boolean restoreMDTag = false; private CramNormalizer normalizer; private byte[] refs; private int prevSeqId = SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; - private Container container; - private long containerOffset = 0; + public Container container; private SamReader mReader; + long firstContainerOffset = 0; + private Iterator containerIterator; private ContainerParser parser; - private ReferenceSource referenceSource; + private final ReferenceSource referenceSource; - private ValidationStringency validationStringency = ValidationStringency.SILENT; + private Iterator iterator = Collections.emptyList().iterator(); + + private ValidationStringency validationStringency = ValidationStringency.DEFAULT_STRINGENCY; public ValidationStringency getValidationStringency() { return validationStringency; } public void setValidationStringency( - ValidationStringency validationStringency) { + final ValidationStringency validationStringency) { this.validationStringency = validationStringency; } private long samRecordIndex; private ArrayList cramRecords; - public CRAMIterator(InputStream is, ReferenceSource referenceSource) + public CRAMIterator(final InputStream inputStream, final ReferenceSource referenceSource) throws IOException { - this.is = new CountingInputStream(is); + this.countingInputStream = new CountingInputStream(inputStream); this.referenceSource = referenceSource; - cramHeader = CramIO.readCramHeader(this.is); + final CramContainerIterator containerIterator = new CramContainerIterator(this.countingInputStream); + cramHeader = containerIterator.getCramHeader(); + this.containerIterator = containerIterator; + + firstContainerOffset = this.countingInputStream.getCount(); + 
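        // (the count at this point is the byte offset just past the CRAM file header, i.e. the
        // position of the first container; getFilePointerSpanningReads() in CRAMFileReader
        // shifts it by 16 bits to build a span covering all records)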
records = new ArrayList(10000); + normalizer = new CramNormalizer(cramHeader.getSamFileHeader(), + referenceSource); + parser = new ContainerParser(cramHeader.getSamFileHeader()); + } + + public CRAMIterator(final SeekableStream seekableStream, final ReferenceSource referenceSource, final long[] coordinates) + throws IOException { + this.countingInputStream = new CountingInputStream(seekableStream); + this.referenceSource = referenceSource; + final CramSpanContainerIterator containerIterator = CramSpanContainerIterator.fromFileSpan(seekableStream, coordinates); + cramHeader = containerIterator.getCramHeader(); + this.containerIterator = containerIterator; + + firstContainerOffset = containerIterator.getFirstContainerOffset(); records = new ArrayList(10000); normalizer = new CramNormalizer(cramHeader.getSamFileHeader(), referenceSource); @@ -92,15 +114,26 @@ public CramHeader getCramHeader() { private void nextContainer() throws IOException, IllegalArgumentException, IllegalAccessException { - recordCounter = 0; - containerOffset = is.getCount(); - container = CramIO.readContainer(is); - if (container == null || container.isEOF()) { - records.clear(); - nextRecord = null; - recordCounter = -1; - return; + if (containerIterator != null) { + if (!containerIterator.hasNext()) { + records.clear(); + nextRecord = null; + return; + } + container = containerIterator.next(); + if (container.isEOF()) { + records.clear(); + nextRecord = null; + return; + } + } else { + container = ContainerIO.readContainer(cramHeader.getVersion(), countingInputStream); + if (container.isEOF()) { + records.clear(); + nextRecord = null; + return; + } } if (records == null) @@ -112,11 +145,7 @@ private void nextContainer() throws IOException, IllegalArgumentException, else cramRecords.clear(); - try { - parser.getRecords(container, cramRecords); - } catch (EOFException e) { - throw e; - } + parser.getRecords(container, cramRecords); if (container.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { refs = new byte[]{}; @@ -124,83 +153,106 @@ private void nextContainer() throws IOException, IllegalArgumentException, refs = null; prevSeqId = -2; } else if (prevSeqId < 0 || prevSeqId != container.sequenceId) { - SAMSequenceRecord sequence = cramHeader.getSamFileHeader() + final SAMSequenceRecord sequence = cramHeader.getSamFileHeader() .getSequence(container.sequenceId); refs = referenceSource.getReferenceBases(sequence, true); prevSeqId = container.sequenceId; } - try { - for (int i = 0; i < container.slices.length; i++) { - Slice s = container.slices[i]; - if (s.sequenceId < 0) - continue; - if (!s.validateRefMD5(refs)) { - log.error(String - .format("Reference sequence MD5 mismatch for slice: seq id %d, start %d, span %d, expected MD5 %s", - s.sequenceId, s.alignmentStart, s.alignmentSpan, - String.format("%032x", new BigInteger(1, s.refMD5)))); - } + for (int i = 0; i < container.slices.length; i++) { + final Slice slice = container.slices[i]; + if (slice.sequenceId < 0) + continue; + if (validationStringency != ValidationStringency.SILENT && !slice.validateRefMD5(refs)) { + log.error(String + .format("Reference sequence MD5 mismatch for slice: seq id %d, start %d, span %d, expected MD5 %s", slice.sequenceId, + slice.alignmentStart, slice.alignmentSpan, String.format("%032x", new BigInteger(1, slice.refMD5)))); } - } catch (NoSuchAlgorithmException e1) { - throw new RuntimeException(e1); } - normalizer.normalize(cramRecords, true, refs, container.alignmentStart, - container.h.substitutionMatrix, 
container.h.AP_seriesDelta); + normalizer.normalize(cramRecords, refs, 0, + container.header.substitutionMatrix); - Cram2SamRecordFactory c2sFactory = new Cram2SamRecordFactory( + final Cram2SamRecordFactory cramToSamRecordFactory = new Cram2SamRecordFactory( cramHeader.getSamFileHeader()); - for (CramCompressionRecord r : cramRecords) { - SAMRecord s = c2sFactory.create(r); - if (!r.isSegmentUnmapped()) { - SAMSequenceRecord sequence = cramHeader.getSamFileHeader() - .getSequence(r.sequenceId); + for (final CramCompressionRecord cramRecord : cramRecords) { + final SAMRecord samRecord = cramToSamRecordFactory.create(cramRecord); + if (!cramRecord.isSegmentUnmapped()) { + final SAMSequenceRecord sequence = cramHeader.getSamFileHeader() + .getSequence(cramRecord.sequenceId); refs = referenceSource.getReferenceBases(sequence, true); - SequenceUtil.calculateMdAndNmTags(s, refs, restoreMDTag, restoreNMTag); + if (samRecord.getReadBases() != SAMRecord.NULL_SEQUENCE) + SequenceUtil.calculateMdAndNmTags(samRecord, refs, restoreMDTag, restoreNMTag); } - s.setValidationStringency(validationStringency); + samRecord.setValidationStringency(validationStringency); if (validationStringency != ValidationStringency.SILENT) { - final List validationErrors = s.isValid(); + final List validationErrors = samRecord.isValid(); SAMUtils.processValidationErrors(validationErrors, samRecordIndex, validationStringency); } if (mReader != null) { - final long chunkStart = (containerOffset << 16) | r.sliceIndex; - final long chunkEnd = ((containerOffset << 16) | r.sliceIndex) + 1; + final long chunkStart = (container.offset << 16) | cramRecord.sliceIndex; + final long chunkEnd = ((container.offset << 16) | cramRecord.sliceIndex) + 1; nextRecord.setFileSource(new SAMFileSource(mReader, new BAMFileSpan(new Chunk(chunkStart, chunkEnd)))); } - records.add(s); + records.add(samRecord); + samRecordIndex++; } cramRecords.clear(); + iterator = records.iterator(); + } + + /** + * Skip cached records until given alignment start position. 
+ * + * @param refIndex reference sequence index + * @param pos alignment start to skip to + */ + public void jumpWithinContainerToPos(final int refIndex, final int pos) { + if (!hasNext()) return; + int i = 0; + for (final SAMRecord record : records) { + if (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && record.getReferenceIndex() != refIndex) continue; + + if (pos <= 0) { + if (record.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) { + iterator = records.listIterator(i); + return; + } + } else { + if (record.getAlignmentStart() >= pos) { + iterator = records.listIterator(i); + return; + } + } + i++; + } + iterator = Collections.emptyList().iterator(); } @Override public boolean hasNext() { if (container != null && container.isEOF()) return false; - if (container == null || recordCounter >= records.size()) { + if (!iterator.hasNext()) { try { nextContainer(); - if (records.isEmpty()) - return false; - } catch (Exception e) { + } catch (final Exception e) { throw new RuntimeEOFException(e); } } - nextRecord = records.get(recordCounter++); - return true; + return !records.isEmpty(); } @Override public SAMRecord next() { - return nextRecord; + return iterator.next(); } @Override @@ -211,47 +263,16 @@ public void remove() { @Override public void close() { records.clear(); + //noinspection EmptyCatchBlock try { - is.close(); - } catch (IOException e) { - } - } - - public static class CramFileIterable implements Iterable { - private ReferenceSource referenceSource; - private File cramFile; - private ValidationStringency validationStringency; - - public CramFileIterable(File cramFile, ReferenceSource referenceSource, - ValidationStringency validationStringency) { - this.referenceSource = referenceSource; - this.cramFile = cramFile; - this.validationStringency = validationStringency; - + if (countingInputStream != null) + countingInputStream.close(); + } catch (final IOException e) { } - - public CramFileIterable(File cramFile, ReferenceSource referenceSource) { - this(cramFile, referenceSource, - ValidationStringency.DEFAULT_STRINGENCY); - } - - @Override - public Iterator iterator() { - try { - FileInputStream fis = new FileInputStream(cramFile); - BufferedInputStream bis = new BufferedInputStream(fis); - CRAMIterator iterator = new CRAMIterator(bis, referenceSource); - iterator.setValidationStringency(validationStringency); - return iterator; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - } @Override - public SAMRecordIterator assertSorted(SortOrder sortOrder) { + public SAMRecordIterator assertSorted(final SortOrder sortOrder) { throw new RuntimeException("Not implemented."); } @@ -259,7 +280,7 @@ public SamReader getFileSource() { return mReader; } - public void setFileSource(SamReader mReader) { + public void setFileSource(final SamReader mReader) { this.mReader = mReader; } diff --git a/src/java/htsjdk/samtools/SAMFileWriterFactory.java b/src/java/htsjdk/samtools/SAMFileWriterFactory.java index c2c8563f9c..c32cf97b02 100644 --- a/src/java/htsjdk/samtools/SAMFileWriterFactory.java +++ b/src/java/htsjdk/samtools/SAMFileWriterFactory.java @@ -29,6 +29,7 @@ import htsjdk.samtools.util.Md5CalculatingOutputStream; import htsjdk.samtools.util.RuntimeIOException; +import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; @@ -51,12 +52,16 @@ public class SAMFileWriterFactory { private Integer maxRecordsInRam; - /** Sets the default for whether to create md5Files for BAM files this factory. 
*/ + /** + * Sets the default for whether to create md5Files for BAM files created by this factory. + */ public static void setDefaultCreateMd5File(final boolean createMd5File) { defaultCreateMd5File = createMd5File; } - /** Sets whether to create md5Files for BAMs from this factory. */ + /** + * Sets whether to create md5Files for BAMs from this factory. + */ public SAMFileWriterFactory setCreateMd5File(final boolean createMd5File) { this.createMd5File = createMd5File; return this; @@ -299,9 +304,35 @@ public SAMFileWriter makeWriter(final SAMFileHeader header, final boolean presor } public CRAMFileWriter makeCRAMWriter(final SAMFileHeader header, final OutputStream stream, final File referenceFasta) { + final CRAMFileWriter writer = new CRAMFileWriter(stream, new ReferenceSource(referenceFasta), header, null); writer.setPreserveReadNames(true); writer.setCaptureAllTags(true); return writer; } + + public CRAMFileWriter makeCRAMWriter(final SAMFileHeader header, final File outputFile, final File referenceFasta) { + + final boolean createIndex = this.createIndex && IOUtil.isRegularPath(outputFile); + if (this.createIndex && !createIndex) { + System.err.println("Cannot create index for CRAM because output file is not a regular file: " + outputFile.getAbsolutePath()); + } + + try { + + OutputStream indexOS = null; + if (createIndex) { + final File indexFile = new File(outputFile.getAbsolutePath() + ".bai"); + indexOS = new FileOutputStream(indexFile); + } + final CRAMFileWriter writer = new CRAMFileWriter(new FileOutputStream(outputFile), indexOS, new ReferenceSource(referenceFasta), header, null); + writer.setPreserveReadNames(true); + writer.setCaptureAllTags(true); + return writer; + } catch (final IOException ioe) { + throw new RuntimeIOException("Error opening file: " + outputFile.getAbsolutePath()); + } + } + + } diff --git a/src/java/htsjdk/samtools/SAMRecordSetBuilder.java b/src/java/htsjdk/samtools/SAMRecordSetBuilder.java index af5d898e1d..c33d30fb72 100644 --- a/src/java/htsjdk/samtools/SAMRecordSetBuilder.java +++ b/src/java/htsjdk/samtools/SAMRecordSetBuilder.java @@ -69,6 +69,8 @@ public class SAMRecordSetBuilder implements Iterable { private SAMReadGroupRecord readGroup = null; private boolean useNmFlag = false; + private boolean unmappedHasBasesAndQualities = true; + public static final int DEFAULT_CHROMOSOME_LENGTH = 200000000; public static final ScoringStrategy DEFAULT_DUPLICATE_SCORING_STRATEGY = ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH; @@ -132,6 +134,10 @@ public SAMRecordSetBuilder(final boolean sortForMe, final SAMFileHeader.SortOrde } } + public void setUnmappedHasBasesAndQualities(final boolean value) { + this.unmappedHasBasesAndQualities = value; + } + public int size() { return this.records.size(); } @@ -244,7 +250,9 @@ private SAMRecord createReadNoFlag(final String name, final int contig, final in rec.setAttribute(SAMTag.RG.name(), readGroup.getReadGroupId()); } + if (!recordUnmapped || this.unmappedHasBasesAndQualities) { fillInBasesAndQualities(rec, qualityString, defaultQuality); + } return rec; } @@ -479,7 +487,9 @@ public void addUnmappedPair(final String name) { if (programRecord != null) { end1.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); } + if (this.unmappedHasBasesAndQualities) { fillInBasesAndQualities(end1); + } end2.setReadName(name); end2.setReadPairedFlag(true); @@ -493,7 +503,9 @@ public void addUnmappedPair(final String name) { if (programRecord != null) { end2.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); } + if
(this.unmappedHasBasesAndQualities) { fillInBasesAndQualities(end2); + } this.records.add(end1); this.records.add(end2); diff --git a/src/java/htsjdk/samtools/SamStreams.java b/src/java/htsjdk/samtools/SamStreams.java index 5fac1df5b5..cea099d7f2 100644 --- a/src/java/htsjdk/samtools/SamStreams.java +++ b/src/java/htsjdk/samtools/SamStreams.java @@ -32,12 +32,12 @@ private static int readBytes(final InputStream stream, final byte[] buffer, fina public static boolean isCRAMFile(final InputStream stream) throws IOException { stream.mark(4); - final int buffSize = CramHeader.magick.length; + final int buffSize = CramHeader.MAGIC.length; final byte[] buffer = new byte[buffSize]; readBytes(stream, buffer, 0, buffSize); stream.reset(); - return Arrays.equals(buffer, CramHeader.magick); + return Arrays.equals(buffer, CramHeader.MAGIC); } /** * @param stream stream.markSupported() must be true diff --git a/src/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java b/src/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java index ca9c26384a..d771f64212 100644 --- a/src/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java +++ b/src/java/htsjdk/samtools/cram/build/CompressionHeaderFactory.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.build; import htsjdk.samtools.cram.common.MutableInt; @@ -23,20 +25,22 @@ import htsjdk.samtools.cram.encoding.Encoding; import htsjdk.samtools.cram.encoding.ExternalByteArrayEncoding; import htsjdk.samtools.cram.encoding.ExternalByteEncoding; +import htsjdk.samtools.cram.encoding.ExternalCompressor; import htsjdk.samtools.cram.encoding.ExternalIntegerEncoding; import htsjdk.samtools.cram.encoding.GammaIntegerEncoding; -import htsjdk.samtools.cram.encoding.HuffmanByteEncoding; -import htsjdk.samtools.cram.encoding.HuffmanIntegerEncoding; import htsjdk.samtools.cram.encoding.NullEncoding; -import htsjdk.samtools.cram.encoding.SubexpIntegerEncoding; +import htsjdk.samtools.cram.encoding.SubexponentialIntegerEncoding; import htsjdk.samtools.cram.encoding.huffman.HuffmanCode; import htsjdk.samtools.cram.encoding.huffman.HuffmanTree; -import htsjdk.samtools.cram.encoding.read_features.Deletion; -import htsjdk.samtools.cram.encoding.read_features.HardClip; -import htsjdk.samtools.cram.encoding.read_features.Padding; -import htsjdk.samtools.cram.encoding.read_features.ReadFeature; -import htsjdk.samtools.cram.encoding.read_features.RefSkip; -import htsjdk.samtools.cram.encoding.read_features.Substitution; +import htsjdk.samtools.cram.encoding.huffman.codec.HuffmanByteEncoding; +import htsjdk.samtools.cram.encoding.huffman.codec.HuffmanIntegerEncoding; +import htsjdk.samtools.cram.encoding.rans.RANS; +import htsjdk.samtools.cram.encoding.readfeatures.Deletion; +import htsjdk.samtools.cram.encoding.readfeatures.HardClip; +import htsjdk.samtools.cram.encoding.readfeatures.Padding; +import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature; +import htsjdk.samtools.cram.encoding.readfeatures.RefSkip; +import htsjdk.samtools.cram.encoding.readfeatures.Substitution; import htsjdk.samtools.cram.structure.CompressionHeader; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.EncodingKey; @@ -50,137 +54,150 @@ import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeMap; public class CompressionHeaderFactory { private static final Charset charset = Charset.forName("US-ASCII"); - private static Log log = Log.getInstance(CompressionHeaderFactory.class); + private static final Log log = Log.getInstance(CompressionHeaderFactory.class); private static final int oqz = ReadTag.nameType3BytesToInt("OQ", 'Z'); - private static final int bqz = ReadTag.nameType3BytesToInt("OQ", 'Z'); + private static final int bqz = ReadTag.nameType3BytesToInt("BQ", 'Z'); - public CompressionHeader build(List records, SubstitutionMatrix substitutionMatrix) { - CompressionHeader h = new CompressionHeader(); - h.externalIds = new ArrayList(); + public CompressionHeader build(final List records, final SubstitutionMatrix substitutionMatrix, final boolean sorted) { + final CompressionHeader header = new CompressionHeader(); + header.externalIds = 
new ArrayList(); int exCounter = 0; - int baseID = exCounter++; - h.externalIds.add(baseID); + final int baseID = exCounter++; + header.externalIds.add(baseID); + header.externalCompressors.put(baseID, + ExternalCompressor.createRANS(RANS.ORDER.ONE)); - int qualityScoreID = exCounter++; - h.externalIds.add(qualityScoreID); + final int qualityScoreID = exCounter++; + header.externalIds.add(qualityScoreID); + header.externalCompressors.put(qualityScoreID, + ExternalCompressor.createRANS(RANS.ORDER.ONE)); - int readNameID = exCounter++; - h.externalIds.add(readNameID); + final int readNameID = exCounter++; + header.externalIds.add(readNameID); + header.externalCompressors.put(readNameID, ExternalCompressor.createGZIP()); - int mateInfoID = exCounter++; - h.externalIds.add(mateInfoID); + final int mateInfoID = exCounter++; + header.externalIds.add(mateInfoID); + header.externalCompressors.put(mateInfoID, + ExternalCompressor.createRANS(RANS.ORDER.ONE)); - int tagValueExtID = exCounter++; - h.externalIds.add(tagValueExtID); + header.encodingMap = new TreeMap(); + for (final EncodingKey key : EncodingKey.values()) + header.encodingMap.put(key, NullEncoding.toParam()); - log.debug("Assigned external id to bases: " + baseID); - log.debug("Assigned external id to quality scores: " + qualityScoreID); - log.debug("Assigned external id to read names: " + readNameID); - log.debug("Assigned external id to mate info: " + mateInfoID); - log.debug("Assigned external id to tag values: " + tagValueExtID); - - h.eMap = new TreeMap(); - for (EncodingKey key : EncodingKey.values()) - h.eMap.put(key, NullEncoding.toParam()); - - h.tMap = new TreeMap(); + header.tMap = new TreeMap(); { // bit flags encoding: - getOptimalIntegerEncoding(h, EncodingKey.BF_BitFlags, 0, records); + getOptimalIntegerEncoding(header, EncodingKey.BF_BitFlags, 0, records); } { // compression bit flags encoding: - getOptimalIntegerEncoding(h, EncodingKey.CF_CompressionBitFlags, 0, records); + getOptimalIntegerEncoding(header, EncodingKey.CF_CompressionBitFlags, 0, records); } { // ref id: - getOptimalIntegerEncoding(h, EncodingKey.RI_RefId, -2, records); + getOptimalIntegerEncoding(header, EncodingKey.RI_RefId, -2, records); } { // read length encoding: - getOptimalIntegerEncoding(h, EncodingKey.RL_ReadLength, 0, records); + getOptimalIntegerEncoding(header, EncodingKey.RL_ReadLength, 0, records); } { // alignment offset: - getOptimalIntegerEncoding(h, EncodingKey.AP_AlignmentPositionOffset, 0, records); + if (sorted) { // alignment offset: + header.APDelta = true; + getOptimalIntegerEncoding(header, EncodingKey.AP_AlignmentPositionOffset, 0, records); + } else { + final int aStartID = exCounter++; + header.APDelta = false; + header.encodingMap.put(EncodingKey.AP_AlignmentPositionOffset, + ExternalIntegerEncoding.toParam(aStartID)); + header.externalIds.add(aStartID); + header.externalCompressors.put(aStartID, + ExternalCompressor.createRANS(RANS.ORDER.ONE)); + log.debug("Assigned external id to alignment starts: " + aStartID); + } } { // read group - getOptimalIntegerEncoding(h, EncodingKey.RG_ReadGroup, -1, records); + getOptimalIntegerEncoding(header, EncodingKey.RG_ReadGroup, -1, records); } { // read name encoding: - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - calculator.add(r.readName.length()); + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) + calculator.add(record.readName.length()); 
calculator.calculate(); - h.eMap.put(EncodingKey.RN_ReadName, ByteArrayLenEncoding.toParam( - HuffmanIntegerEncoding.toParam(calculator.values(), calculator.bitLens()), - ExternalByteArrayEncoding.toParam(readNameID))); - // h.eMap.put(EncodingKey.RN_ReadName, - // ByteArrayStopEncoding.toParam((byte) 0, readNameID)); + header.encodingMap.put(EncodingKey.RN_ReadName, ByteArrayLenEncoding.toParam( + HuffmanIntegerEncoding.toParam(calculator.values(), + calculator.bitLens()), ExternalByteArrayEncoding + .toParam(readNameID))); } { // records to next fragment - IntegerEncodingCalculator calc = new IntegerEncodingCalculator(EncodingKey.NF_RecordsToNextFragment.name(), - 0); - for (CramCompressionRecord r : records) { + final IntegerEncodingCalculator calc = new IntegerEncodingCalculator( + EncodingKey.NF_RecordsToNextFragment.name(), 0); + for (final CramCompressionRecord r : records) { if (r.isHasMateDownStream()) calc.addValue(r.recordsToNextFragment); } - Encoding bestEncoding = calc.getBestEncoding(); - h.eMap.put(EncodingKey.NF_RecordsToNextFragment, - new EncodingParams(bestEncoding.id(), bestEncoding.toByteArray())); + final Encoding bestEncoding = calc.getBestEncoding(); + header.encodingMap.put( + EncodingKey.NF_RecordsToNextFragment, + new EncodingParams(bestEncoding.id(), bestEncoding + .toByteArray())); } { // tag count - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - calculator.add(r.tags == null ? 0 : r.tags.length); + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) + calculator.add(record.tags == null ? 0 : record.tags.length); calculator.calculate(); - h.eMap.put(EncodingKey.TC_TagCount, - HuffmanIntegerEncoding.toParam(calculator.values(), calculator.bitLens())); + header.encodingMap.put(EncodingKey.TC_TagCount, HuffmanIntegerEncoding.toParam( + calculator.values(), calculator.bitLens())); } { // tag name and type - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) { - if (r.tags == null) + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) { + if (record.tags == null) continue; - for (ReadTag tag : r.tags) + for (final ReadTag tag : record.tags) calculator.add(tag.keyType3BytesAsInt); } calculator.calculate(); - h.eMap.put(EncodingKey.TN_TagNameAndType, - HuffmanIntegerEncoding.toParam(calculator.values(), calculator.bitLens())); + header.encodingMap.put(EncodingKey.TN_TagNameAndType, HuffmanIntegerEncoding + .toParam(calculator.values(), calculator.bitLens())); } { - Comparator comparator = new Comparator() { + final Comparator comparator = new Comparator() { @Override - public int compare(ReadTag o1, ReadTag o2) { + public int compare(final ReadTag o1, final ReadTag o2) { return o1.keyType3BytesAsInt - o2.keyType3BytesAsInt; } }; - Comparator baComparator = new Comparator() { + final Comparator baComparator = new Comparator() { @Override - public int compare(byte[] o1, byte[] o2) { + public int compare(final byte[] o1, final byte[] o2) { if (o1.length - o2.length != 0) return o1.length - o2.length; @@ -192,44 +209,44 @@ public int compare(byte[] o1, byte[] o2) { } }; - Map map = new TreeMap(baComparator); - MutableInt noTagCounter = new MutableInt(); + final Map map = new TreeMap(baComparator); + final MutableInt noTagCounter = new MutableInt(); map.put(new byte[0], noTagCounter); - for 
(CramCompressionRecord r : records) { - if (r.tags == null) { + for (final CramCompressionRecord record : records) { + if (record.tags == null) { noTagCounter.value++; - r.tagIdsIndex = noTagCounter; + record.tagIdsIndex = noTagCounter; continue; } - Arrays.sort(r.tags, comparator); - r.tagIds = new byte[r.tags.length * 3]; + Arrays.sort(record.tags, comparator); + record.tagIds = new byte[record.tags.length * 3]; int tagIndex = 0; - for (int i = 0; i < r.tags.length; i++) { - r.tagIds[i * 3] = (byte) r.tags[tagIndex].keyType3Bytes.charAt(0); - r.tagIds[i * 3 + 1] = (byte) r.tags[tagIndex].keyType3Bytes.charAt(1); - r.tagIds[i * 3 + 2] = (byte) r.tags[tagIndex].keyType3Bytes.charAt(2); + for (int i = 0; i < record.tags.length; i++) { + record.tagIds[i * 3] = (byte) record.tags[tagIndex].keyType3Bytes.charAt(0); + record.tagIds[i * 3 + 1] = (byte) record.tags[tagIndex].keyType3Bytes.charAt(1); + record.tagIds[i * 3 + 2] = (byte) record.tags[tagIndex].keyType3Bytes.charAt(2); tagIndex++; } - MutableInt count = map.get(r.tagIds); + MutableInt count = map.get(record.tagIds); if (count == null) { count = new MutableInt(); - map.put(r.tagIds, count); + map.put(record.tagIds, count); } count.value++; - r.tagIdsIndex = count; + record.tagIdsIndex = count; } - byte[][][] dic = new byte[map.size()][][]; + final byte[][][] dic = new byte[map.size()][][]; int i = 0; - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (byte[] idsAsBytes : map.keySet()) { - int nofIds = idsAsBytes.length / 3; + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final byte[] idsAsBytes : map.keySet()) { + final int nofIds = idsAsBytes.length / 3; dic[i] = new byte[nofIds][]; for (int j = 0; j < idsAsBytes.length; ) { - int idIndex = j / 3; + final int idIndex = j / 3; dic[i][idIndex] = new byte[3]; dic[i][idIndex][0] = idsAsBytes[j++]; dic[i][idIndex][1] = idsAsBytes[j++]; @@ -240,386 +257,311 @@ public int compare(byte[] o1, byte[] o2) { } calculator.calculate(); - h.eMap.put(EncodingKey.TL_TagIdList, + header.encodingMap.put(EncodingKey.TL_TagIdList, HuffmanIntegerEncoding.toParam(calculator.values(), calculator.bitLens())); - h.dictionary = dic; + header.dictionary = dic; } { // tag values - Map cc = new TreeMap(); - - for (CramCompressionRecord r : records) { - if (r.tags == null) + @SuppressWarnings("UnnecessaryLocalVariable") final int unsortedTagValueExternalID = exCounter; + header.externalIds.add(unsortedTagValueExternalID); + header.externalCompressors.put(unsortedTagValueExternalID, + ExternalCompressor.createRANS(RANS.ORDER.ONE)); + + final Set tagIdSet = new HashSet(); + for (final CramCompressionRecord record : records) { + if (record.tags == null) continue; - for (ReadTag tag : r.tags) { - switch (tag.keyType3BytesAsInt) { - // case ReadTag.OQZ: - // case ReadTag.BQZ: - // EncodingParams params = h.tMap - // .get(tag.keyType3BytesAsInt); - // if (params == null) { - // h.tMap.put(tag.keyType3BytesAsInt, - // ByteArrayStopEncoding.toParam((byte) 1, - // tagValueExtID)); - // } - // break; - - default: - HuffmanParamsCalculator c = cc.get(tag.keyType3BytesAsInt); - if (c == null) { - c = new HuffmanParamsCalculator(); - cc.put(tag.keyType3BytesAsInt, c); - } - c.add(tag.getValueAsByteArray().length); - break; - } - } + for (final ReadTag tag : record.tags) + tagIdSet.add(tag.keyType3BytesAsInt); } - if (!cc.isEmpty()) - for (Integer key : cc.keySet()) { - HuffmanParamsCalculator c = cc.get(key); - c.calculate(); - - h.tMap.put(key, 
ByteArrayLenEncoding.toParam( - HuffmanIntegerEncoding.toParam(c.values(), c.bitLens()), - ExternalByteArrayEncoding.toParam(tagValueExtID))); + for (final int id : tagIdSet) { + final int externalID; + final byte type = (byte) (id & 0xFF); + switch (type) { + case 'Z': + case 'B': + externalID = id; + break; + + default: + externalID = unsortedTagValueExternalID; + break; } - for (Integer key : h.tMap.keySet()) { - log.debug(String.format("TAG ENCODING: %d, %s", key, h.tMap.get(key))); + header.externalIds.add(externalID); + header.externalCompressors.put(externalID, + ExternalCompressor.createRANS(RANS.ORDER.ONE)); + header.tMap.put(id, ByteArrayLenEncoding.toParam( + ExternalIntegerEncoding.toParam(externalID), + ExternalByteEncoding.toParam(externalID))); } - - // for (CramRecord r : records) { - // if (r.tags == null || r.tags.isEmpty()) - // continue; - // for (ReadTag tag : r.tags) { - // EncodingParams params = h.tMap.get(tag.keyType3BytesAsInt); - // if (params == null) { - // h.tMap.put(tag.keyType3BytesAsInt, - // ByteArrayStopEncoding.toParam((byte) 0, - // tagValueExtID)); - // } - // } - // } } { // number of read features - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord r : records) calculator.add(r.readFeatures == null ? 0 : r.readFeatures.size()); calculator.calculate(); - h.eMap.put(EncodingKey.FN_NumberOfReadFeatures, + header.encodingMap.put(EncodingKey.FN_NumberOfReadFeatures, HuffmanIntegerEncoding.toParam(calculator.values(), calculator.bitLens())); } { // feature position - IntegerEncodingCalculator calc = new IntegerEncodingCalculator("read feature position", 0); - for (CramCompressionRecord r : records) { + final IntegerEncodingCalculator calc = new IntegerEncodingCalculator("read feature position", 0); + for (final CramCompressionRecord record : records) { int prevPos = 0; - if (r.readFeatures == null) + if (record.readFeatures == null) continue; - for (ReadFeature rf : r.readFeatures) { + for (final ReadFeature rf : record.readFeatures) { calc.addValue(rf.getPosition() - prevPos); prevPos = rf.getPosition(); } } - Encoding bestEncoding = calc.getBestEncoding(); - h.eMap.put(EncodingKey.FP_FeaturePosition, + final Encoding bestEncoding = calc.getBestEncoding(); + header.encodingMap.put(EncodingKey.FP_FeaturePosition, new EncodingParams(bestEncoding.id(), bestEncoding.toByteArray())); } { // feature code - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - if (r.readFeatures == null) - continue; - else - for (ReadFeature rf : r.readFeatures) - calculator.add(rf.getOperator()); + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) + if (record.readFeatures != null) + for (final ReadFeature readFeature : record.readFeatures) + calculator.add(readFeature.getOperator()); calculator.calculate(); - h.eMap.put(EncodingKey.FC_FeatureCode, - HuffmanByteEncoding.toParam(calculator.valuesAsBytes(), calculator.bitLens)); + header.encodingMap.put(EncodingKey.FC_FeatureCode, HuffmanByteEncoding.toParam( + calculator.valuesAsBytes(), calculator.bitLens)); } { // bases: - h.eMap.put(EncodingKey.BA_Base, ExternalByteEncoding.toParam(baseID)); + header.encodingMap.put(EncodingKey.BA_Base, ExternalByteEncoding.toParam(baseID)); } { // quality scores: - // HuffmanParamsCalculator 
calculator = new - // HuffmanParamsCalculator(); - // for (CramRecord r : records) { - // if (r.getQualityScores() == null) { - // if (r.getReadFeatures() != null) { - // for (ReadFeature f:r.getReadFeatures()) { - // switch (f.getOperator()) { - // case BaseQualityScore.operator: - // calculator.add(((BaseQualityScore)f).getQualityScore()) ; - // break; - // default: - // break; - // } - // } - // } - // } else { - // for (byte s:r.getQualityScores()) calculator.add(s) ; - // } - // } - // calculator.calculate(); - // - // h.eMap.put(EncodingKey.QS_QualityScore, - // HuffmanByteEncoding.toParam( - // calculator.valuesAsBytes(), calculator.bitLens)); - - h.eMap.put(EncodingKey.QS_QualityScore, ExternalByteEncoding.toParam(qualityScoreID)); + header.encodingMap.put(EncodingKey.QS_QualityScore, ExternalByteEncoding.toParam(qualityScoreID)); } { // base substitution code if (substitutionMatrix == null) { - long[][] freqs = new long[200][200]; - for (CramCompressionRecord r : records) { - if (r.readFeatures == null) - continue; - else - for (ReadFeature rf : r.readFeatures) - if (rf.getOperator() == Substitution.operator) { - Substitution s = ((Substitution) rf); - byte refBase = s.getRefernceBase(); - byte base = s.getBase(); - freqs[refBase][base]++; + final long[][] frequencies = new long[200][200]; + for (final CramCompressionRecord record : records) { + if (record.readFeatures != null) + for (final ReadFeature readFeature : record.readFeatures) + if (readFeature.getOperator() == Substitution.operator) { + final Substitution substitution = ((Substitution) readFeature); + final byte refBase = substitution.getReferenceBase(); + final byte base = substitution.getBase(); + frequencies[refBase][base]++; } } - h.substitutionMatrix = new SubstitutionMatrix(freqs); + header.substitutionMatrix = new SubstitutionMatrix(frequencies); } else - h.substitutionMatrix = substitutionMatrix; - - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - if (r.readFeatures == null) - continue; - else - for (ReadFeature rf : r.readFeatures) { - if (rf.getOperator() == Substitution.operator) { - Substitution s = ((Substitution) rf); - if (s.getCode() == -1) { - byte refBase = s.getRefernceBase(); - byte base = s.getBase(); - s.setCode(h.substitutionMatrix.code(refBase, base)); + header.substitutionMatrix = substitutionMatrix; + + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) + if (record.readFeatures != null) + for (final ReadFeature recordFeature : record.readFeatures) { + if (recordFeature.getOperator() == Substitution.operator) { + final Substitution substitution = ((Substitution) recordFeature); + if (substitution.getCode() == -1) { + final byte refBase = substitution.getReferenceBase(); + final byte base = substitution.getBase(); + substitution.setCode(header.substitutionMatrix.code(refBase, base)); } - calculator.add(s.getCode()); + calculator.add(substitution.getCode()); } } calculator.calculate(); - h.eMap.put(EncodingKey.BS_BaseSubstitutionCode, + header.encodingMap.put(EncodingKey.BS_BaseSubstitutionCode, HuffmanIntegerEncoding.toParam(calculator.values, calculator.bitLens)); } { // insertion bases - h.eMap.put(EncodingKey.IN_Insertion, ByteArrayStopEncoding.toParam((byte) 0, baseID)); + header.encodingMap.put(EncodingKey.IN_Insertion, ByteArrayStopEncoding.toParam((byte) 0, baseID)); } { // insertion bases - h.eMap.put(EncodingKey.SC_SoftClip, 
ByteArrayStopEncoding.toParam((byte) 0, baseID)); + header.encodingMap.put(EncodingKey.SC_SoftClip, ByteArrayStopEncoding.toParam((byte) 0, baseID)); } { // deletion length - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - if (r.readFeatures == null) - continue; - else - for (ReadFeature rf : r.readFeatures) - if (rf.getOperator() == Deletion.operator) - calculator.add(((Deletion) rf).getLength()); + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) + if (record.readFeatures != null) + for (final ReadFeature recordFeature : record.readFeatures) + if (recordFeature.getOperator() == Deletion.operator) + calculator.add(((Deletion) recordFeature).getLength()); calculator.calculate(); - h.eMap.put(EncodingKey.DL_DeletionLength, + header.encodingMap.put(EncodingKey.DL_DeletionLength, HuffmanIntegerEncoding.toParam(calculator.values, calculator.bitLens)); } { // hard clip length - IntegerEncodingCalculator calculator = new IntegerEncodingCalculator(EncodingKey.HC_HardClip.name(), 1); - for (CramCompressionRecord r : records) - if (r.readFeatures == null) - continue; - else - for (ReadFeature rf : r.readFeatures) - if (rf.getOperator() == HardClip.operator) - calculator.addValue(((HardClip) rf).getLength()); + final IntegerEncodingCalculator calculator = new IntegerEncodingCalculator(EncodingKey.HC_HardClip.name(), 0); + for (final CramCompressionRecord record : records) + if (record.readFeatures != null) + for (final ReadFeature recordFeature : record.readFeatures) + if (recordFeature.getOperator() == HardClip.operator) + calculator.addValue(((HardClip) recordFeature).getLength()); - Encoding bestEncoding = calculator.getBestEncoding(); - h.eMap.put(EncodingKey.HC_HardClip, new EncodingParams(bestEncoding.id(), bestEncoding.toByteArray())); + final Encoding bestEncoding = calculator.getBestEncoding(); + header.encodingMap.put(EncodingKey.HC_HardClip, new EncodingParams(bestEncoding.id(), bestEncoding.toByteArray())); } { // padding length - IntegerEncodingCalculator calculator = new IntegerEncodingCalculator(EncodingKey.PD_padding.name(), 1); - for (CramCompressionRecord r : records) - if (r.readFeatures == null) - continue; - else - for (ReadFeature rf : r.readFeatures) - if (rf.getOperator() == Padding.operator) - calculator.addValue(((Padding) rf).getLength()); + final IntegerEncodingCalculator calculator = new IntegerEncodingCalculator(EncodingKey.PD_padding.name(), 0); + for (final CramCompressionRecord record : records) + if (record.readFeatures != null) + for (final ReadFeature recordFeature : record.readFeatures) + if (recordFeature.getOperator() == Padding.operator) + calculator.addValue(((Padding) recordFeature).getLength()); - Encoding bestEncoding = calculator.getBestEncoding(); - h.eMap.put(EncodingKey.PD_padding, new EncodingParams(bestEncoding.id(), bestEncoding.toByteArray())); + final Encoding bestEncoding = calculator.getBestEncoding(); + header.encodingMap.put(EncodingKey.PD_padding, new EncodingParams(bestEncoding.id(), bestEncoding.toByteArray())); } { // ref skip length - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - if (r.readFeatures == null) - continue; - else - for (ReadFeature rf : r.readFeatures) - if (rf.getOperator() == RefSkip.operator) - calculator.add(((RefSkip) rf).getLength()); + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for 
(final CramCompressionRecord record : records) + if (record.readFeatures != null) + for (final ReadFeature recordFeature : record.readFeatures) + if (recordFeature.getOperator() == RefSkip.operator) + calculator.add(((RefSkip) recordFeature).getLength()); calculator.calculate(); - h.eMap.put(EncodingKey.RS_RefSkip, HuffmanIntegerEncoding.toParam(calculator.values, calculator.bitLens)); + header.encodingMap.put(EncodingKey.RS_RefSkip, HuffmanIntegerEncoding.toParam(calculator.values, calculator.bitLens)); } { // mapping quality score - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - if (!r.isSegmentUnmapped()) - calculator.add(r.mappingQuality); + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) + if (!record.isSegmentUnmapped()) + calculator.add(record.mappingQuality); calculator.calculate(); - h.eMap.put(EncodingKey.MQ_MappingQualityScore, + header.encodingMap.put(EncodingKey.MQ_MappingQualityScore, HuffmanIntegerEncoding.toParam(calculator.values(), calculator.bitLens)); } { // mate bit flags - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - calculator.add(r.getMateFlags()); + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) + calculator.add(record.getMateFlags()); calculator.calculate(); - h.eMap.put(EncodingKey.MF_MateBitFlags, + header.encodingMap.put(EncodingKey.MF_MateBitFlags, HuffmanIntegerEncoding.toParam(calculator.values, calculator.bitLens)); } { // next fragment ref id: - HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); - for (CramCompressionRecord r : records) - if (r.isDetached()) - calculator.add(r.mateSequenceID); + final HuffmanParamsCalculator calculator = new HuffmanParamsCalculator(); + for (final CramCompressionRecord record : records) + if (record.isDetached()) + calculator.add(record.mateSequenceID); calculator.calculate(); if (calculator.values.length == 0) - h.eMap.put(EncodingKey.NS_NextFragmentReferenceSequenceID, NullEncoding.toParam()); + header.encodingMap.put(EncodingKey.NS_NextFragmentReferenceSequenceID, NullEncoding.toParam()); - h.eMap.put(EncodingKey.NS_NextFragmentReferenceSequenceID, - HuffmanIntegerEncoding.toParam(calculator.values(), calculator.bitLens())); - log.debug("NS: " + h.eMap.get(EncodingKey.NS_NextFragmentReferenceSequenceID)); + header.encodingMap.put(EncodingKey.NS_NextFragmentReferenceSequenceID, + HuffmanIntegerEncoding.toParam(calculator.values(), + calculator.bitLens())); + log.debug("NS: " + + header.encodingMap.get(EncodingKey.NS_NextFragmentReferenceSequenceID)); } { // next fragment alignment start - h.eMap.put(EncodingKey.NP_NextFragmentAlignmentStart, ExternalIntegerEncoding.toParam(mateInfoID)); + header.encodingMap.put(EncodingKey.NP_NextFragmentAlignmentStart, ExternalIntegerEncoding.toParam(mateInfoID)); } { // template size - h.eMap.put(EncodingKey.TS_InsetSize, ExternalIntegerEncoding.toParam(mateInfoID)); - } - - { // test mark - // h.eMap.put(EncodingKey.TM_TestMark, - // BetaIntegerEncoding.toParam(0, 32)); + header.encodingMap.put(EncodingKey.TS_InsetSize, ExternalIntegerEncoding.toParam(mateInfoID)); } - return h; + return header; } - // private static final int getValue(EncodingKey key, ReadFeature f) { - // switch (key) { - // case BS_BaseSubstitutionCode: - // - // break; - // - // default: - // break; - // } - // 
} - - private static final int getValue(EncodingKey key, CramCompressionRecord r) { + private static int getValue(final EncodingKey key, final CramCompressionRecord record) { switch (key) { case AP_AlignmentPositionOffset: - return r.alignmentDelta; + return record.alignmentDelta; case BF_BitFlags: - return r.flags; + return record.flags; case CF_CompressionBitFlags: - return r.compressionFlags; + return record.compressionFlags; case FN_NumberOfReadFeatures: - return r.readFeatures == null ? 0 : r.readFeatures.size(); + return record.readFeatures == null ? 0 : record.readFeatures.size(); case MF_MateBitFlags: - return r.mateFlags; + return record.mateFlags; case MQ_MappingQualityScore: - return r.mappingQuality; + return record.mappingQuality; case NF_RecordsToNextFragment: - return r.recordsToNextFragment; + return record.recordsToNextFragment; case NP_NextFragmentAlignmentStart: - return r.mateAlignmentStart; + return record.mateAlignmentStart; case NS_NextFragmentReferenceSequenceID: - return r.mateSequenceID; + return record.mateSequenceID; case RG_ReadGroup: - return r.readGroupID; + return record.readGroupID; case RI_RefId: - return r.sequenceId; + return record.sequenceId; case RL_ReadLength: - return r.readLength; + return record.readLength; case TC_TagCount: - return r.tags == null ? 0 : r.tags.length; + return record.tags == null ? 0 : record.tags.length; default: throw new RuntimeException("Unexpected encoding key: " + key.name()); } } - private static final void getOptimalIntegerEncoding(CompressionHeader h, EncodingKey key, int minValue, - List records) { - IntegerEncodingCalculator calc = new IntegerEncodingCalculator(key.name(), minValue); - for (CramCompressionRecord r : records) { - int value = getValue(key, r); + private static void getOptimalIntegerEncoding(final CompressionHeader header, final EncodingKey key, final int minValue, + final List records) { + final IntegerEncodingCalculator calc = new IntegerEncodingCalculator(key.name(), minValue); + for (final CramCompressionRecord record : records) { + final int value = getValue(key, record); calc.addValue(value); } - Encoding bestEncoding = calc.getBestEncoding(); - h.eMap.put(key, new EncodingParams(bestEncoding.id(), bestEncoding.toByteArray())); + final Encoding bestEncoding = calc.getBestEncoding(); + header.encodingMap.put(key, new EncodingParams(bestEncoding.id(), bestEncoding.toByteArray())); } private static class BitCode implements Comparable { - int value; - int len; + final int value; + final int length; - public BitCode(int value, int len) { + public BitCode(final int value, final int length) { this.value = value; - this.len = len; + this.length = length; } @Override - public int compareTo(BitCode o) { - int result = value - o.value; + public int compareTo(@SuppressWarnings("NullableProblems") final BitCode o) { + final int result = value - o.value; if (result != 0) return result; - return len - o.len; + return length - o.length; } } public static class HuffmanParamsCalculator { - private HashMap countMap = new HashMap(); + private final HashMap countMap = new HashMap(); private int[] values = new int[]{}; private int[] bitLens = new int[]{}; - public void add(int value) { + public void add(final int value) { MutableInt counter = countMap.get(value); if (counter == null) { counter = new MutableInt(); @@ -628,7 +570,7 @@ public void add(int value) { counter.value++; } - public void add(Integer value, int inc) { + public void add(final Integer value, final int inc) { MutableInt counter = countMap.get(value); 
if (counter == null) { counter = new MutableInt(); @@ -646,51 +588,51 @@ public int[] values() { } public Integer[] valuesAsAutoIntegers() { - Integer[] ivalues = new Integer[values.length]; - for (int i = 0; i < ivalues.length; i++) - ivalues[i] = values[i]; + final Integer[] intValues = new Integer[values.length]; + for (int i = 0; i < intValues.length; i++) + intValues[i] = values[i]; - return ivalues; + return intValues; } public byte[] valuesAsBytes() { - byte[] bvalues = new byte[values.length]; - for (int i = 0; i < bvalues.length; i++) - bvalues[i] = (byte) (0xFF & values[i]); + final byte[] byteValues = new byte[values.length]; + for (int i = 0; i < byteValues.length; i++) + byteValues[i] = (byte) (0xFF & values[i]); - return bvalues; + return byteValues; } public Byte[] valuesAsAutoBytes() { - Byte[] bvalues = new Byte[values.length]; - for (int i = 0; i < bvalues.length; i++) - bvalues[i] = (byte) (0xFF & values[i]); + final Byte[] byteValues = new Byte[values.length]; + for (int i = 0; i < byteValues.length; i++) + byteValues[i] = (byte) (0xFF & values[i]); - return bvalues; + return byteValues; } public void calculate() { - HuffmanTree tree = null; + final HuffmanTree tree; { - int size = countMap.size(); - int[] freqs = new int[size]; - int[] values = new int[size]; + final int size = countMap.size(); + final int[] frequencies = new int[size]; + final int[] values = new int[size]; int i = 0; - for (Integer v : countMap.keySet()) { - values[i] = v; - freqs[i] = countMap.get(v).value; + for (final Integer key : countMap.keySet()) { + values[i] = key; + frequencies[i] = countMap.get(key).value; i++; } - tree = HuffmanCode.buildTree(freqs, autobox(values)); + tree = HuffmanCode.buildTree(frequencies, autobox(values)); } - List valueList = new ArrayList(); - List lens = new ArrayList(); + final List valueList = new ArrayList(); + final List lens = new ArrayList(); HuffmanCode.getValuesAndBitLengths(valueList, lens, tree); // the following sorting is not really required, but whatever: - BitCode[] codes = new BitCode[valueList.size()]; + final BitCode[] codes = new BitCode[valueList.size()]; for (int i = 0; i < valueList.size(); i++) { codes[i] = new BitCode(valueList.get(i), lens.get(i)); } @@ -700,65 +642,67 @@ public void calculate() { bitLens = new int[codes.length]; for (int i = 0; i < codes.length; i++) { - BitCode code = codes[i]; - bitLens[i] = code.len; + final BitCode code = codes[i]; + bitLens[i] = code.length; values[i] = code.value; } } } - private static Integer[] autobox(int[] array) { - Integer[] newArray = new Integer[array.length]; + private static Integer[] autobox(final int[] array) { + final Integer[] newArray = new Integer[array.length]; for (int i = 0; i < array.length; i++) newArray[i] = array[i]; return newArray; } public static class EncodingLengthCalculator { - private BitCodec codec; - private Encoding encoding; - private long len; + private final BitCodec codec; + private final Encoding encoding; + private long length; - public EncodingLengthCalculator(Encoding encoding) { + public EncodingLengthCalculator(final Encoding encoding) { this.encoding = encoding; codec = encoding.buildCodec(null, null); } - public void add(int value) { - len += codec.numberOfBits(value); + public void add(final int value) { + length += codec.numberOfBits(value); } - public void add(int value, int inc) { - len += inc * codec.numberOfBits(value); + public void add(final int value, final int inc) { + length += inc * codec.numberOfBits(value); } - public long len() { - return 
len; + public long length() { + return length; } } public static class IntegerEncodingCalculator { - public List calcs = new ArrayList(); + public final List calculators = new ArrayList(); private int max = 0; private int count = 0; - private String name; + private final String name; private HashMap dictionary = new HashMap(); - private int dictionaryThreshold = 100; + private final int dictionaryThreshold = 100; + private final int minValue; - public IntegerEncodingCalculator(String name, int dictionaryThreshold, int minValue) { + public IntegerEncodingCalculator(final String name, final int dictionaryThreshold, final int minValue) { this.name = name; + this.minValue = minValue; // for (int i = 2; i < 10; i++) - // calcs.add(new EncodingLengthCalculator( + // calculators.add(new EncodingLengthCalculator( // new GolombIntegerEncoding(i))); // // for (int i = 2; i < 20; i++) - // calcs.add(new EncodingLengthCalculator( + // calculators.add(new EncodingLengthCalculator( // new GolombRiceIntegerEncoding(i))); - calcs.add(new EncodingLengthCalculator(new GammaIntegerEncoding(1 - minValue))); + calculators.add(new EncodingLengthCalculator(new GammaIntegerEncoding(1 - minValue))); for (int i = 2; i < 5; i++) - calcs.add(new EncodingLengthCalculator(new SubexpIntegerEncoding(0 - minValue, i))); + calculators.add(new EncodingLengthCalculator(new SubexponentialIntegerEncoding(0 - minValue, i))); if (dictionaryThreshold < 1) dictionary = null; @@ -772,28 +716,28 @@ public IntegerEncodingCalculator(String name, int dictionaryThreshold, int minVa } } - public IntegerEncodingCalculator(String name, int minValue) { + public IntegerEncodingCalculator(final String name, final int minValue) { this(name, 255, minValue); } - public void addValue(int value) { + public void addValue(final int value) { count++; if (value > max) max = value; - for (EncodingLengthCalculator c : calcs) - c.add(value); + for (final EncodingLengthCalculator calculator : calculators) + calculator.add(value); if (dictionary != null) { if (dictionary.size() >= dictionaryThreshold - 1) dictionary = null; else { - MutableInt m = dictionary.get(value); - if (m == null) { - m = new MutableInt(); - dictionary.put(value, m); + MutableInt mutableInt = dictionary.get(value); + if (mutableInt == null) { + mutableInt = new MutableInt(); + dictionary.put(value, mutableInt); } - m.value++; + mutableInt.value++; } } @@ -802,57 +746,57 @@ public void addValue(int value) { public Encoding getBestEncoding() { if (dictionary != null && dictionary.size() == 1) { - int value = dictionary.keySet().iterator().next(); - EncodingParams param = HuffmanIntegerEncoding.toParam(new int[]{value}, new int[]{0}); - HuffmanIntegerEncoding he = new HuffmanIntegerEncoding(); - he.fromByteArray(param.params); - return he; + final int value = dictionary.keySet().iterator().next(); + final EncodingParams param = HuffmanIntegerEncoding.toParam(new int[]{value}, new int[]{0}); + final HuffmanIntegerEncoding huffmanEncoding = new HuffmanIntegerEncoding(); + huffmanEncoding.fromByteArray(param.params); + return huffmanEncoding; } - EncodingLengthCalculator bestC = calcs.get(0); + EncodingLengthCalculator bestCalculator = calculators.get(0); - for (EncodingLengthCalculator c : calcs) { - if (c.len() < bestC.len()) - bestC = c; + for (final EncodingLengthCalculator calculator : calculators) { + if (calculator.length() < bestCalculator.length()) + bestCalculator = calculator; } - Encoding bestEncoding = bestC.encoding; - long bits = bestC.len(); + Encoding bestEncoding = 
bestCalculator.encoding; + long bits = bestCalculator.length(); { // check if beta is better: - int betaLength = (int) Math.round(Math.log(max) / Math.log(2) + 0.5); + final int betaLength = (int) Math.round(Math.log(max - minValue) / Math.log(2) + 0.5); if (bits > betaLength * count) { - bestEncoding = new BetaIntegerEncoding(betaLength); + bestEncoding = new BetaIntegerEncoding(-minValue, betaLength); bits = betaLength * count; } } { // try huffman: if (dictionary != null) { - HuffmanParamsCalculator c = new HuffmanParamsCalculator(); - for (Integer value : dictionary.keySet()) - c.add(value, dictionary.get(value).value); - - c.calculate(); - - EncodingParams param = HuffmanIntegerEncoding.toParam(c.values(), c.bitLens()); - HuffmanIntegerEncoding he = new HuffmanIntegerEncoding(); - he.fromByteArray(param.params); - EncodingLengthCalculator lc = new EncodingLengthCalculator(he); - for (Integer value : dictionary.keySet()) - lc.add(value, dictionary.get(value).value); - - if (lc.len() < bits) { - bestEncoding = he; - bits = lc.len(); + final HuffmanParamsCalculator huffmanParamsCalculator = new HuffmanParamsCalculator(); + for (final Integer value : dictionary.keySet()) + huffmanParamsCalculator.add(value, dictionary.get(value).value); + + huffmanParamsCalculator.calculate(); + + final EncodingParams param = HuffmanIntegerEncoding.toParam(huffmanParamsCalculator.values(), huffmanParamsCalculator.bitLens()); + final HuffmanIntegerEncoding huffmanEncoding = new HuffmanIntegerEncoding(); + huffmanEncoding.fromByteArray(param.params); + final EncodingLengthCalculator calculator = new EncodingLengthCalculator(huffmanEncoding); + for (final Integer key : dictionary.keySet()) + calculator.add(key, dictionary.get(key).value); + + if (calculator.length() < bits) { + bestEncoding = huffmanEncoding; + bits = calculator.length(); } } } byte[] params = bestEncoding.toByteArray(); params = Arrays.copyOf(params, Math.min(params.length, 20)); - log.debug("Best encoding for " + name + ": " + bestEncoding.id().name() + Arrays.toString(params)); + log.debug("Best encoding for " + name + ": " + bestEncoding.id().name() + Arrays.toString(params) + ", bits=" + bits); return bestEncoding; } diff --git a/src/java/htsjdk/samtools/cram/build/ContainerFactory.java b/src/java/htsjdk/samtools/cram/build/ContainerFactory.java index e7dfecb214..b96dd137ca 100644 --- a/src/java/htsjdk/samtools/cram/build/ContainerFactory.java +++ b/src/java/htsjdk/samtools/cram/build/ContainerFactory.java @@ -1,28 +1,31 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.build; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.cram.digest.ContentDigests; +import htsjdk.samtools.cram.encoding.ExternalCompressor; import htsjdk.samtools.cram.encoding.writer.DataWriterFactory; import htsjdk.samtools.cram.encoding.writer.Writer; import htsjdk.samtools.cram.io.DefaultBitOutputStream; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; import htsjdk.samtools.cram.structure.Block; -import htsjdk.samtools.cram.structure.BlockCompressionMethod; import htsjdk.samtools.cram.structure.BlockContentType; import htsjdk.samtools.cram.structure.CompressionHeader; import htsjdk.samtools.cram.structure.Container; @@ -37,194 +40,189 @@ import java.util.Map; public class ContainerFactory { - SAMFileHeader samFileHeader; - int recordsPerSlice = 10000; - boolean preserveReadNames = true; - long globalRecordCounter = 0; - boolean AP_delta = true; - - public ContainerFactory(SAMFileHeader samFileHeader, int recordsPerSlice) { - this.samFileHeader = samFileHeader; - this.recordsPerSlice = recordsPerSlice; - } - - public Container buildContainer(List records) - throws IllegalArgumentException, IllegalAccessException, - IOException { - return buildContainer(records, null); - } - - public Container buildContainer(List records, - SubstitutionMatrix substitutionMatrix) - throws IllegalArgumentException, IllegalAccessException, - IOException { - // get stats, create compression header and slices - long time1 = System.nanoTime(); - CompressionHeader h = new CompressionHeaderFactory().build(records, - substitutionMatrix); - h.AP_seriesDelta = AP_delta; - long time2 = System.nanoTime(); - - h.readNamesIncluded = preserveReadNames; - h.AP_seriesDelta = true; - - List slices = new ArrayList(); - - Container c = new Container(); - c.h = h; - c.nofRecords = records.size(); - c.globalRecordCounter = globalRecordCounter; - c.bases = 0; - c.blockCount = 0; - - long time3 = System.nanoTime(); - long lastGlobalRecordCounter = c.globalRecordCounter; - for (int i = 0; i < records.size(); i += recordsPerSlice) { - List sliceRecords = records.subList(i, - Math.min(records.size(), i + recordsPerSlice)); - Slice slice = buildSlice(sliceRecords, h, samFileHeader); - slice.globalRecordCounter = lastGlobalRecordCounter; - lastGlobalRecordCounter += slice.nofRecords; - c.bases += slice.bases; - slices.add(slice); - - // assuming one sequence per container max: - if (c.sequenceId == -1 && slice.sequenceId != -1) - c.sequenceId = slice.sequenceId; - } - - long time4 = System.nanoTime(); - - c.slices = (Slice[]) slices.toArray(new Slice[slices.size()]); - calculateAlignmentBoundaries(c); - - c.buildHeaderTime = time2 - time1; - c.buildSlicesTime = time4 - time3; - - globalRecordCounter += records.size(); - return c; - } - - private static void calculateAlignmentBoundaries(Container c) { - int start = Integer.MAX_VALUE; - int end = Integer.MIN_VALUE; - for (Slice s : c.slices) { - if (s.sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - start = 
Math.min(start, s.alignmentStart); - end = Math.max(end, s.alignmentStart + s.alignmentSpan); - } - } - - if (start < Integer.MAX_VALUE) { - c.alignmentStart = start; - c.alignmentSpan = end - start; - } - } - - private static Slice buildSlice(List records, - CompressionHeader h, SAMFileHeader fileHeader) - throws IllegalArgumentException, IllegalAccessException, - IOException { - Map map = new HashMap(); - for (int id : h.externalIds) { - map.put(id, new ExposedByteArrayOutputStream()); - } - - DataWriterFactory f = new DataWriterFactory(); - ExposedByteArrayOutputStream bitBAOS = new ExposedByteArrayOutputStream(); - DefaultBitOutputStream bos = new DefaultBitOutputStream(bitBAOS); - - Slice slice = new Slice(); - slice.nofRecords = records.size(); - - { - - // @formatter:off - /* - * 1) Count slice bases. - * 2) Decide if the slice is single ref, unmapped or multiref. - * 3) Detect alignment boundaries for the slice if not multiref. + private final SAMFileHeader samFileHeader; + private int recordsPerSlice = 10000; + private boolean preserveReadNames = true; + private long globalRecordCounter = 0; + + public ContainerFactory(final SAMFileHeader samFileHeader, final int recordsPerSlice) { + this.samFileHeader = samFileHeader; + this.recordsPerSlice = recordsPerSlice; + } + + public Container buildContainer(final List records) + throws IllegalArgumentException, IllegalAccessException, + IOException { + return buildContainer(records, null); + } + + Container buildContainer(final List records, + final SubstitutionMatrix substitutionMatrix) + throws IllegalArgumentException, IllegalAccessException, + IOException { + // get stats, create compression header and slices + final long time1 = System.nanoTime(); + final CompressionHeader header = new CompressionHeaderFactory().build(records, + substitutionMatrix, samFileHeader.getSortOrder() == SAMFileHeader.SortOrder.coordinate); + header.APDelta = true; + final long time2 = System.nanoTime(); + + header.readNamesIncluded = preserveReadNames; + header.APDelta = true; + + final List slices = new ArrayList(); + + final Container container = new Container(); + container.header = header; + container.nofRecords = records.size(); + container.globalRecordCounter = globalRecordCounter; + container.bases = 0; + container.blockCount = 0; + + final long time3 = System.nanoTime(); + long lastGlobalRecordCounter = container.globalRecordCounter; + for (int i = 0; i < records.size(); i += recordsPerSlice) { + final List sliceRecords = records.subList(i, + Math.min(records.size(), i + recordsPerSlice)); + final Slice slice = buildSlice(sliceRecords, header); + slice.globalRecordCounter = lastGlobalRecordCounter; + lastGlobalRecordCounter += slice.nofRecords; + container.bases += slice.bases; + slices.add(slice); + + // assuming one sequence per container max: + if (container.sequenceId == -1 && slice.sequenceId != -1) + container.sequenceId = slice.sequenceId; + } + + final long time4 = System.nanoTime(); + + container.slices = slices.toArray(new Slice[slices.size()]); + calculateAlignmentBoundaries(container); + + container.buildHeaderTime = time2 - time1; + container.buildSlicesTime = time4 - time3; + + globalRecordCounter += records.size(); + return container; + } + + private static void calculateAlignmentBoundaries(final Container container) { + int start = Integer.MAX_VALUE; + int end = Integer.MIN_VALUE; + for (final Slice s : container.slices) { + if (s.sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + start = Math.min(start, s.alignmentStart); + 
end = Math.max(end, s.alignmentStart + s.alignmentSpan); + } + } + + if (start < Integer.MAX_VALUE) { + container.alignmentStart = start; + container.alignmentSpan = end - start; + } + } + + private static Slice buildSlice(final List records, + final CompressionHeader header) + throws IllegalArgumentException, IllegalAccessException, + IOException { + final Map map = new HashMap(); + for (final int id : header.externalIds) { + map.put(id, new ExposedByteArrayOutputStream()); + } + + final DataWriterFactory dataWriterFactory = new DataWriterFactory(); + final ExposedByteArrayOutputStream bitBAOS = new ExposedByteArrayOutputStream(); + final DefaultBitOutputStream bitOutputStream = new DefaultBitOutputStream(bitBAOS); + + final Slice slice = new Slice(); + slice.nofRecords = records.size(); + + int minAlStart = Integer.MAX_VALUE; + int maxAlEnd = SAMRecord.NO_ALIGNMENT_START; + { + // @formatter:off + /* + * 1) Count slice bases. + * 2) Decide if the slice is single ref, unmapped or multi reference. + * 3) Detect alignment boundaries for the slice if not multi reference. */ - // @formatter:on - slice.sequenceId = Slice.UNMAPPED_OR_NOREF; - int minAlStart = Integer.MAX_VALUE; - int maxAlEnd = SAMRecord.NO_ALIGNMENT_START; - for (CramCompressionRecord r : records) { - slice.bases += r.readLength; - - if (slice.sequenceId != Slice.MUTLIREF - && r.alignmentStart != SAMRecord.NO_ALIGNMENT_START - && r.sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - switch (slice.sequenceId) { - case Slice.UNMAPPED_OR_NOREF: - slice.sequenceId = r.sequenceId; - break; - case Slice.MUTLIREF: - break; - - default: - if (slice.sequenceId != r.sequenceId) - slice.sequenceId = Slice.UNMAPPED_OR_NOREF; - break; - } - - minAlStart = Math.min(r.alignmentStart, minAlStart); - maxAlEnd = Math.max(r.getAlignmentEnd(), maxAlEnd); - } - } - - /* - * Set the slice boundaries if the slice have records mapped to a - * single ref. - */ - if (slice.sequenceId == Slice.MUTLIREF - || minAlStart == Integer.MAX_VALUE) { - slice.alignmentStart = SAMRecord.NO_ALIGNMENT_START; - slice.alignmentSpan = 0; - } else { - slice.alignmentStart = minAlStart; - slice.alignmentSpan = maxAlEnd - minAlStart + 1; - } - } - - Writer writer = f.buildWriter(bos, map, h, slice.sequenceId); - int prevAlStart = slice.alignmentStart; - for (CramCompressionRecord r : records) { - r.alignmentDelta = r.alignmentStart - prevAlStart; - prevAlStart = r.alignmentStart; - writer.write(r); - } - - slice.contentType = slice.alignmentSpan > -1 ? 
BlockContentType.MAPPED_SLICE - : BlockContentType.RESERVED; - - bos.close(); - slice.coreBlock = new Block(); - slice.coreBlock.method = BlockCompressionMethod.RAW; - slice.coreBlock.setRawContent(bitBAOS.toByteArray()); - slice.coreBlock.contentType = BlockContentType.CORE; - - slice.external = new HashMap(); - for (Integer i : map.keySet()) { - ExposedByteArrayOutputStream os = map.get(i); - - Block externalBlock = new Block(); - externalBlock.contentType = BlockContentType.EXTERNAL; - externalBlock.method = BlockCompressionMethod.GZIP; - externalBlock.contentId = i; - - externalBlock.setRawContent(os.toByteArray()); - slice.external.put(i, externalBlock); - } - - return slice; - } - - public boolean isPreserveReadNames() { - return preserveReadNames; - } - - public void setPreserveReadNames(boolean preserveReadNames) { - this.preserveReadNames = preserveReadNames; - } + // @formatter:on + slice.sequenceId = Slice.UNMAPPED_OR_NO_REFERENCE; + final ContentDigests hasher = ContentDigests.create(ContentDigests.ALL); + for (final CramCompressionRecord record : records) { + slice.bases += record.readLength; + hasher.add(record); + + if (slice.sequenceId != Slice.MULTI_REFERENCE + && record.alignmentStart != SAMRecord.NO_ALIGNMENT_START + && record.sequenceId != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + switch (slice.sequenceId) { + case Slice.UNMAPPED_OR_NO_REFERENCE: + slice.sequenceId = record.sequenceId; + break; + case Slice.MULTI_REFERENCE: + break; + + default: + if (slice.sequenceId != record.sequenceId) + slice.sequenceId = Slice.UNMAPPED_OR_NO_REFERENCE; + break; + } + + minAlStart = Math.min(record.alignmentStart, minAlStart); + maxAlEnd = Math.max(record.getAlignmentEnd(), maxAlEnd); + } + } + + slice.sliceTags = hasher.getAsTags(); + } + + if (slice.sequenceId == Slice.MULTI_REFERENCE + || minAlStart == Integer.MAX_VALUE) { + slice.alignmentStart = SAMRecord.NO_ALIGNMENT_START; + slice.alignmentSpan = 0; + } else { + slice.alignmentStart = minAlStart; + slice.alignmentSpan = maxAlEnd - minAlStart + 1; + } + + final Writer writer = dataWriterFactory.buildWriter(bitOutputStream, map, header, slice.sequenceId); + int prevAlStart = slice.alignmentStart; + for (final CramCompressionRecord record : records) { + record.alignmentDelta = record.alignmentStart - prevAlStart; + prevAlStart = record.alignmentStart; + writer.write(record); + } + + bitOutputStream.close(); + slice.coreBlock = Block.buildNewCore(bitBAOS.toByteArray()); + + slice.external = new HashMap(); + for (final Integer key : map.keySet()) { + final ExposedByteArrayOutputStream os = map.get(key); + + final Block externalBlock = new Block(); + externalBlock.setContentId(key); + externalBlock.setContentType(BlockContentType.EXTERNAL); + + final ExternalCompressor compressor = header.externalCompressors.get(key); + final byte[] rawData = os.toByteArray(); + final byte[] compressed = compressor.compress(rawData); + externalBlock.setContent(rawData, compressed); + externalBlock.setMethod(compressor.getMethod()); + slice.external.put(key, externalBlock); + } + + return slice; + } + + public boolean isPreserveReadNames() { + return preserveReadNames; + } + + public void setPreserveReadNames(final boolean preserveReadNames) { + this.preserveReadNames = preserveReadNames; + } } diff --git a/src/java/htsjdk/samtools/cram/build/ContainerParser.java b/src/java/htsjdk/samtools/cram/build/ContainerParser.java index 38fa6ea7c6..5cef35ee56 100644 --- a/src/java/htsjdk/samtools/cram/build/ContainerParser.java +++ 
b/src/java/htsjdk/samtools/cram/build/ContainerParser.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.build; import htsjdk.samtools.SAMFileHeader; @@ -30,8 +32,6 @@ import htsjdk.samtools.util.Log.LogLevel; import java.io.ByteArrayInputStream; -import java.io.EOFException; -import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; @@ -40,121 +40,118 @@ import java.util.TreeMap; public class ContainerParser { - private static Log log = Log.getInstance(ContainerParser.class); - - private SAMFileHeader samFileHeader; - private Map nanoMap = new TreeMap(); - - public ContainerParser(SAMFileHeader samFileHeader) { - this.samFileHeader = samFileHeader; - } - - public List getRecords(Container container, - ArrayList records) throws IllegalArgumentException, - IllegalAccessException, IOException { - long time1 = System.nanoTime(); - if (records == null) - records = new ArrayList(container.nofRecords); - - for (Slice s : container.slices) - records.addAll(getRecords(s, container.h)); - - long time2 = System.nanoTime(); - - container.parseTime = time2 - time1; - - if (log.isEnabled(LogLevel.DEBUG)) { - for (String key : nanoMap.keySet()) { - log.debug(String.format("%s: %dms.", key, nanoMap.get(key) - .longValue() / 1000000)); - } - } - - return records; - } - - public List getRecords(Slice s, CompressionHeader h) - throws IllegalArgumentException, IllegalAccessException, - IOException { - String seqName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME; - switch (s.sequenceId) { - case SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX: - - break; - case -2: - - break; - - default: - SAMSequenceRecord sequence = samFileHeader - .getSequence(s.sequenceId); - seqName = sequence.getSequenceName(); - break; - } - - DataReaderFactory f = new DataReaderFactory(); - Map inputMap = new HashMap(); - for (Integer exId : s.external.keySet()) { - inputMap.put(exId, new ByteArrayInputStream(s.external.get(exId) - .getRawContent())); - } - - long time = 0; - CramRecordReader reader = new CramRecordReader(); - f.buildReader(reader, new DefaultBitInputStream( - new ByteArrayInputStream(s.coreBlock.getRawContent())), - inputMap, h, s.sequenceId); - - List records = new ArrayList(); - - long readNanos = 0; - int prevStart = s.alignmentStart; - for (int i = 0; i < s.nofRecords; i++) { - CramCompressionRecord r = new CramCompressionRecord(); - r.sliceIndex = s.index ; - r.index = i; - - try { - time = System.nanoTime(); - reader.read(r); - readNanos += System.nanoTime() - time; - } catch (EOFException e) { - e.printStackTrace(); - throw e; - } - - if (r.sequenceId == s.sequenceId) - r.sequenceName = seqName; - else { - if (r.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) - r.sequenceName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME; - else { - String name = samFileHeader.getSequence(r.sequenceId) - .getSequenceName(); - r.sequenceName = name; - } - } - - records.add(r); - - if (h.AP_seriesDelta) { - prevStart += r.alignmentDelta; - r.alignmentStart = prevStart; - } - } - log.debug("Slice records read time: " + readNanos / 1000000); - - Map statMap = f.getStats(reader); - for (String 
key : statMap.keySet()) { - long value = 0; - if (!nanoMap.containsKey(key)) { - nanoMap.put(key, 0L); - value = 0; - } else - value = nanoMap.get(key); - nanoMap.put(key, value + statMap.get(key).nanos); - } - - return records; - } + private static final Log log = Log.getInstance(ContainerParser.class); + + private final SAMFileHeader samFileHeader; + private final Map nanosecondsMap = new TreeMap(); + + public ContainerParser(final SAMFileHeader samFileHeader) { + this.samFileHeader = samFileHeader; + } + + public List getRecords(final Container container, + ArrayList records) throws IllegalArgumentException, + IllegalAccessException { + final long time1 = System.nanoTime(); + if (records == null) + records = new ArrayList(container.nofRecords); + + for (final Slice slice : container.slices) + records.addAll(getRecords(slice, container.header)); + + final long time2 = System.nanoTime(); + + container.parseTime = time2 - time1; + + if (log.isEnabled(LogLevel.DEBUG)) { + for (final String key : nanosecondsMap.keySet()) { + log.debug(String.format("%s: %dms.", key, nanosecondsMap.get(key) / 1000000)); + } + } + + return records; + } + + ArrayList getRecords(ArrayList records, + final Slice slice, final CompressionHeader header) throws IllegalArgumentException, + IllegalAccessException { + String seqName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME; + switch (slice.sequenceId) { + case SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX: + case -2: + break; + + default: + final SAMSequenceRecord sequence = samFileHeader + .getSequence(slice.sequenceId); + seqName = sequence.getSequenceName(); + break; + } + + final DataReaderFactory dataReaderFactory = new DataReaderFactory(); + final Map inputMap = new HashMap(); + for (final Integer exId : slice.external.keySet()) { + log.debug("Adding external data: " + exId); + inputMap.put(exId, new ByteArrayInputStream(slice.external.get(exId) + .getRawContent())); + } + + long time; + final CramRecordReader reader = new CramRecordReader(); + dataReaderFactory.buildReader(reader, new DefaultBitInputStream( + new ByteArrayInputStream(slice.coreBlock.getRawContent())), + inputMap, header, slice.sequenceId); + + if (records == null) + records = new ArrayList(slice.nofRecords); + + long readNanos = 0; + int prevStart = slice.alignmentStart; + for (int i = 0; i < slice.nofRecords; i++) { + final CramCompressionRecord record = new CramCompressionRecord(); + record.sliceIndex = slice.index; + record.index = i; + + time = System.nanoTime(); + reader.read(record); + readNanos += System.nanoTime() - time; + + if (record.sequenceId == slice.sequenceId) { + record.sequenceName = seqName; + record.sequenceId = slice.sequenceId; + } else { + if (record.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) + record.sequenceName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME; + else { + record.sequenceName = samFileHeader.getSequence(record.sequenceId) + .getSequenceName(); + } + } + + records.add(record); + + if (header.APDelta) { + prevStart += record.alignmentDelta; + record.alignmentStart = prevStart; + } + } + log.debug("Slice records read time: " + readNanos / 1000000); + + final Map statMap = dataReaderFactory.getStats(reader); + for (final String key : statMap.keySet()) { + final long value; + if (!nanosecondsMap.containsKey(key)) { + nanosecondsMap.put(key, 0L); + value = 0; + } else + value = nanosecondsMap.get(key); + nanosecondsMap.put(key, value + statMap.get(key).nanos); + } + return records; + } + + List getRecords(final Slice slice, final CompressionHeader header) + throws 
IllegalArgumentException, IllegalAccessException { + return getRecords(null, slice, header); + } } diff --git a/src/java/htsjdk/samtools/cram/build/Cram2SamRecordFactory.java b/src/java/htsjdk/samtools/cram/build/Cram2SamRecordFactory.java index 42da59eff7..699bd2ad27 100644 --- a/src/java/htsjdk/samtools/cram/build/Cram2SamRecordFactory.java +++ b/src/java/htsjdk/samtools/cram/build/Cram2SamRecordFactory.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
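The refactored ContainerFactory and ContainerParser are easiest to read as a round trip: buildContainer() packs CramCompressionRecord objects into slices, and getRecords() expands a Container back into records. Below is a minimal round-trip sketch based only on the signatures visible in the hunks above; the generic type parameters, the htsjdk.samtools.cram.build package for ContainerFactory, and the helper class name are assumptions.

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.cram.build.ContainerFactory;
import htsjdk.samtools.cram.build.ContainerParser;
import htsjdk.samtools.cram.structure.Container;
import htsjdk.samtools.cram.structure.CramCompressionRecord;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

class ContainerRoundTripSketch {
    // Pack records into a Container, then parse the Container back into records.
    static List<CramCompressionRecord> roundTrip(final SAMFileHeader samFileHeader,
                                                 final List<CramCompressionRecord> records)
            throws IOException, IllegalAccessException {
        // 10000 records per slice mirrors the factory default shown above.
        final ContainerFactory factory = new ContainerFactory(samFileHeader, 10000);
        final Container container = factory.buildContainer(records);

        final ContainerParser parser = new ContainerParser(samFileHeader);
        return parser.getRecords(container, new ArrayList<CramCompressionRecord>());
    }
}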

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.build; import htsjdk.samtools.Cigar; @@ -21,34 +23,34 @@ import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.cram.encoding.read_features.Deletion; -import htsjdk.samtools.cram.encoding.read_features.HardClip; -import htsjdk.samtools.cram.encoding.read_features.InsertBase; -import htsjdk.samtools.cram.encoding.read_features.Insertion; -import htsjdk.samtools.cram.encoding.read_features.Padding; -import htsjdk.samtools.cram.encoding.read_features.ReadBase; -import htsjdk.samtools.cram.encoding.read_features.ReadFeature; -import htsjdk.samtools.cram.encoding.read_features.RefSkip; -import htsjdk.samtools.cram.encoding.read_features.SoftClip; -import htsjdk.samtools.cram.encoding.read_features.Substitution; +import htsjdk.samtools.cram.encoding.readfeatures.Deletion; +import htsjdk.samtools.cram.encoding.readfeatures.HardClip; +import htsjdk.samtools.cram.encoding.readfeatures.InsertBase; +import htsjdk.samtools.cram.encoding.readfeatures.Insertion; +import htsjdk.samtools.cram.encoding.readfeatures.Padding; +import htsjdk.samtools.cram.encoding.readfeatures.ReadBase; +import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature; +import htsjdk.samtools.cram.encoding.readfeatures.RefSkip; +import htsjdk.samtools.cram.encoding.readfeatures.SoftClip; +import htsjdk.samtools.cram.encoding.readfeatures.Substitution; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.ReadTag; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; public class Cram2SamRecordFactory { - private SAMFileHeader header; + private final SAMFileHeader header; - public Cram2SamRecordFactory(SAMFileHeader header) { + public Cram2SamRecordFactory(final SAMFileHeader header) { this.header = header; } - public SAMRecord create(CramCompressionRecord cramRecord) { - SAMRecord samRecord = new SAMRecord(header); + public SAMRecord create(final CramCompressionRecord cramRecord) { + final SAMRecord samRecord = new SAMRecord(header); samRecord.setReadName(cramRecord.readName); copyFlags(cramRecord, samRecord); @@ -72,9 +74,10 @@ public SAMRecord create(CramCompressionRecord cramRecord) { if (samRecord.getReadPairedFlag()) { samRecord.setMateReferenceIndex(cramRecord.mateSequenceID); samRecord - .setMateAlignmentStart(cramRecord.mateAlignmentStart > 0 ? cramRecord.mateAlignmentStart : SAMRecord.NO_ALIGNMENT_START); + .setMateAlignmentStart(cramRecord.mateAlignmentStart > 0 ? 
cramRecord.mateAlignmentStart : SAMRecord + .NO_ALIGNMENT_START); samRecord.setMateNegativeStrandFlag(cramRecord.isMateNegativeStrand()); - samRecord.setMateUnmappedFlag(cramRecord.isMateUmapped()); + samRecord.setMateUnmappedFlag(cramRecord.isMateUnmapped()); } else { samRecord .setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); @@ -86,11 +89,11 @@ public SAMRecord create(CramCompressionRecord cramRecord) { samRecord.setBaseQualities(cramRecord.qualityScores); if (cramRecord.tags != null) - for (ReadTag tag : cramRecord.tags) + for (final ReadTag tag : cramRecord.tags) samRecord.setAttribute(tag.getKey(), tag.getValue()); if (cramRecord.readGroupID > -1) { - SAMReadGroupRecord readGroupRecord = header.getReadGroups().get( + final SAMReadGroupRecord readGroupRecord = header.getReadGroups().get( cramRecord.readGroupID); samRecord.setAttribute("RG", readGroupRecord.getId()); } @@ -98,36 +101,37 @@ public SAMRecord create(CramCompressionRecord cramRecord) { return samRecord; } - private static final void copyFlags(CramCompressionRecord cr, SAMRecord sr) { - sr.setReadPairedFlag(cr.isMultiFragment()); - sr.setProperPairFlag(cr.isProperPair()); - sr.setReadUnmappedFlag(cr.isSegmentUnmapped()); - sr.setReadNegativeStrandFlag(cr.isNegativeStrand()); - sr.setFirstOfPairFlag(cr.isFirstSegment()); - sr.setSecondOfPairFlag(cr.isLastSegment()); - sr.setNotPrimaryAlignmentFlag(cr.isSecondaryAlignment()); - sr.setReadFailsVendorQualityCheckFlag(cr.isVendorFiltered()); - sr.setDuplicateReadFlag(cr.isDuplicate()); + private static void copyFlags(final CramCompressionRecord cramRecord, final SAMRecord samRecord) { + samRecord.setReadPairedFlag(cramRecord.isMultiFragment()); + samRecord.setProperPairFlag(cramRecord.isProperPair()); + samRecord.setReadUnmappedFlag(cramRecord.isSegmentUnmapped()); + samRecord.setReadNegativeStrandFlag(cramRecord.isNegativeStrand()); + samRecord.setFirstOfPairFlag(cramRecord.isFirstSegment()); + samRecord.setSecondOfPairFlag(cramRecord.isLastSegment()); + samRecord.setNotPrimaryAlignmentFlag(cramRecord.isSecondaryAlignment()); + samRecord.setReadFailsVendorQualityCheckFlag(cramRecord.isVendorFiltered()); + samRecord.setDuplicateReadFlag(cramRecord.isDuplicate()); + samRecord.setSupplementaryAlignmentFlag(cramRecord.isSupplementary()); } - private static final Cigar getCigar2(Collection features, - int readLength) { + private static Cigar getCigar2(final Collection features, + final int readLength) { if (features == null || features.isEmpty()) { - CigarElement ce = new CigarElement(readLength, CigarOperator.M); - return new Cigar(Arrays.asList(ce)); + final CigarElement cigarElement = new CigarElement(readLength, CigarOperator.M); + return new Cigar(Collections.singletonList(cigarElement)); } - List list = new ArrayList(); + final List list = new ArrayList(); int totalOpLen = 1; - CigarElement ce; + CigarElement cigarElement; CigarOperator lastOperator = CigarOperator.MATCH_OR_MISMATCH; int lastOpLen = 0; int lastOpPos = 1; - CigarOperator co = null; - int rfLen = 0; - for (ReadFeature f : features) { + CigarOperator cigarOperator; + int readFeatureLength; + for (final ReadFeature feature : features) { - int gap = f.getPosition() - (lastOpPos + lastOpLen); + final int gap = feature.getPosition() - (lastOpPos + lastOpLen); if (gap > 0) { if (lastOperator != CigarOperator.MATCH_OR_MISMATCH) { list.add(new CigarElement(lastOpLen, lastOperator)); @@ -141,78 +145,81 @@ private static final Cigar getCigar2(Collection features, lastOperator = 
CigarOperator.MATCH_OR_MISMATCH; } - switch (f.getOperator()) { + switch (feature.getOperator()) { case Insertion.operator: - co = CigarOperator.INSERTION; - rfLen = ((Insertion) f).getSequence().length; + cigarOperator = CigarOperator.INSERTION; + readFeatureLength = ((Insertion) feature).getSequence().length; break; case SoftClip.operator: - co = CigarOperator.SOFT_CLIP; - rfLen = ((SoftClip) f).getSequence().length; + cigarOperator = CigarOperator.SOFT_CLIP; + readFeatureLength = ((SoftClip) feature).getSequence().length; break; case HardClip.operator: - co = CigarOperator.HARD_CLIP; - rfLen = ((HardClip) f).getLength(); + cigarOperator = CigarOperator.HARD_CLIP; + readFeatureLength = ((HardClip) feature).getLength(); break; case InsertBase.operator: - co = CigarOperator.INSERTION; - rfLen = 1; + cigarOperator = CigarOperator.INSERTION; + readFeatureLength = 1; break; case Deletion.operator: - co = CigarOperator.DELETION; - rfLen = ((Deletion) f).getLength(); + cigarOperator = CigarOperator.DELETION; + readFeatureLength = ((Deletion) feature).getLength(); break; case RefSkip.operator: - co = CigarOperator.SKIPPED_REGION; - rfLen = ((RefSkip) f).getLength(); + cigarOperator = CigarOperator.SKIPPED_REGION; + readFeatureLength = ((RefSkip) feature).getLength(); break; case Padding.operator: - co = CigarOperator.PADDING; - rfLen = ((Padding) f).getLength(); + cigarOperator = CigarOperator.PADDING; + readFeatureLength = ((Padding) feature).getLength(); break; case Substitution.operator: case ReadBase.operator: - co = CigarOperator.MATCH_OR_MISMATCH; - rfLen = 1; + cigarOperator = CigarOperator.MATCH_OR_MISMATCH; + readFeatureLength = 1; break; default: continue; } - if (lastOperator != co) { + if (lastOperator != cigarOperator) { // add last feature if (lastOpLen > 0) { list.add(new CigarElement(lastOpLen, lastOperator)); totalOpLen += lastOpLen; } - lastOperator = co; - lastOpLen = rfLen; - lastOpPos = f.getPosition(); + lastOperator = cigarOperator; + lastOpLen = readFeatureLength; + lastOpPos = feature.getPosition(); } else - lastOpLen += rfLen; + lastOpLen += readFeatureLength; - if (!co.consumesReadBases()) - lastOpPos -= rfLen; + if (!cigarOperator.consumesReadBases()) + lastOpPos -= readFeatureLength; } if (lastOperator != null) { if (lastOperator != CigarOperator.M) { list.add(new CigarElement(lastOpLen, lastOperator)); if (readLength >= lastOpPos + lastOpLen) { - ce = new CigarElement(readLength - (lastOpLen + lastOpPos) + cigarElement = new CigarElement(readLength - (lastOpLen + lastOpPos) + 1, CigarOperator.M); - list.add(ce); + list.add(cigarElement); } - } else if (readLength > lastOpPos - 1) { - ce = new CigarElement(readLength - lastOpPos + 1, - CigarOperator.M); - list.add(ce); + } else if (readLength == 0 || readLength > lastOpPos - 1) { + if (readLength == 0) + cigarElement = new CigarElement(lastOpLen, CigarOperator.M); + else + cigarElement = new CigarElement(readLength - lastOpPos + 1, + CigarOperator.M); + list.add(cigarElement); } } if (list.isEmpty()) { - ce = new CigarElement(readLength, CigarOperator.M); - return new Cigar(Arrays.asList(ce)); + cigarElement = new CigarElement(readLength, CigarOperator.M); + return new Cigar(Collections.singletonList(cigarElement)); } return new Cigar(list); diff --git a/src/java/htsjdk/samtools/cram/build/CramBuilder.java b/src/java/htsjdk/samtools/cram/build/CramBuilder.java deleted file mode 100644 index aa534655fa..0000000000 --- a/src/java/htsjdk/samtools/cram/build/CramBuilder.java +++ /dev/null @@ -1,17 +0,0 @@ -package 
htsjdk.samtools.cram.build; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.cram.ref.ReferenceSource; -import htsjdk.samtools.cram.ref.ReferenceTracks; - -import java.util.List; - -public class CramBuilder { - ReferenceTracks tracks; - Sam2CramRecordFactory factory; - ReferenceSource source; - - public void build(List samRecords) { - - } -} diff --git a/src/java/htsjdk/samtools/cram/build/CramContainerIterator.java b/src/java/htsjdk/samtools/cram/build/CramContainerIterator.java new file mode 100644 index 0000000000..a38e6b8dce --- /dev/null +++ b/src/java/htsjdk/samtools/cram/build/CramContainerIterator.java @@ -0,0 +1,77 @@ +package htsjdk.samtools.cram.build; + +import htsjdk.samtools.cram.io.CountingInputStream; +import htsjdk.samtools.cram.structure.Container; +import htsjdk.samtools.cram.structure.ContainerIO; +import htsjdk.samtools.cram.structure.CramHeader; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Iterator; + +/** + * An iterator of CRAM containers read from an {@link java.io.InputStream}. + */ +public class CramContainerIterator implements Iterator { + private CramHeader cramHeader; + private InputStream inputStream; + private Container nextContainer; + private boolean eof = false; + private long offset = 0; + + public CramContainerIterator(final InputStream inputStream) throws IOException { + cramHeader = CramIO.readCramHeader(inputStream); + this.inputStream = inputStream; + } + + void readNextContainer() { + try { + final CountingInputStream cis = new CountingInputStream(inputStream); + nextContainer = ContainerIO.readContainer(cramHeader.getVersion(), cis); + final long containerSizeInBytes = cis.getCount(); + + nextContainer.offset = offset; + offset += containerSizeInBytes; + } catch (final IOException e) { + throw new RuntimeException(e); + } + + if (nextContainer.isEOF()) { + eof = true; + nextContainer = null; + } + } + + @Override + public boolean hasNext() { + if (eof) return false; + if (nextContainer == null) readNextContainer(); + return !eof; + } + + @Override + public Container next() { + final Container result = nextContainer; + nextContainer = null; + return result; + } + + @Override + public void remove() { + throw new RuntimeException("Read only iterator."); + } + + public CramHeader getCramHeader() { + return cramHeader; + } + + public void close() { + nextContainer = null; + cramHeader = null; + //noinspection EmptyCatchBlock + try { + inputStream.close(); + } catch (final Exception e) { + } + } +} diff --git a/src/java/htsjdk/samtools/cram/build/CramIO.java b/src/java/htsjdk/samtools/cram/build/CramIO.java index 3b5e30abaa..6667ce57e9 100644 --- a/src/java/htsjdk/samtools/cram/build/CramIO.java +++ b/src/java/htsjdk/samtools/cram/build/CramIO.java @@ -1,45 +1,39 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
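The new CramContainerIterator exposes container-level reading without decoding individual records, which is useful for indexing or splitting a CRAM stream. A minimal sketch using only the members shown in the new file above (the class name and the command-line argument are hypothetical):

import htsjdk.samtools.cram.build.CramContainerIterator;
import htsjdk.samtools.cram.structure.Container;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;

class ContainerScanSketch {
    // Print the offset and record count of every container in a CRAM stream;
    // the iterator stops by itself once the EOF container is reached.
    public static void main(final String[] args) throws IOException {
        final CramContainerIterator containerIterator =
                new CramContainerIterator(new BufferedInputStream(new FileInputStream(args[0])));
        try {
            while (containerIterator.hasNext()) {
                final Container container = containerIterator.next();
                System.out.println(container.offset + "\t" + container.nofRecords);
            }
        } finally {
            containerIterator.close(); // also closes the underlying input stream
        }
    }
}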

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.build; -import htsjdk.samtools.SAMException; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMTextHeaderCodec; -import htsjdk.samtools.cram.io.ByteBufferUtils; +import htsjdk.samtools.cram.common.CramVersions; +import htsjdk.samtools.cram.common.Version; import htsjdk.samtools.cram.io.CountingInputStream; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.InputStreamUtils; import htsjdk.samtools.cram.structure.Block; -import htsjdk.samtools.cram.structure.BlockCompressionMethod; -import htsjdk.samtools.cram.structure.BlockContentType; -import htsjdk.samtools.cram.structure.CompressionHeaderBLock; import htsjdk.samtools.cram.structure.Container; -import htsjdk.samtools.cram.structure.ContainerHeaderIO; +import htsjdk.samtools.cram.structure.ContainerIO; import htsjdk.samtools.cram.structure.CramHeader; import htsjdk.samtools.cram.structure.Slice; -import htsjdk.samtools.cram.structure.SliceIO; -import htsjdk.samtools.seekablestream.SeekableBufferedStream; -import htsjdk.samtools.seekablestream.SeekableFTPStream; import htsjdk.samtools.seekablestream.SeekableFileStream; -import htsjdk.samtools.seekablestream.SeekableHTTPStream; import htsjdk.samtools.seekablestream.SeekableStream; -import htsjdk.samtools.seekablestream.UserPasswordInput; import htsjdk.samtools.util.BufferedLineReader; import htsjdk.samtools.util.Log; -import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; @@ -50,488 +44,278 @@ import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.RandomAccessFile; -import java.net.MalformedURLException; -import java.net.SocketException; -import java.net.URISyntaxException; -import java.net.URL; import java.nio.ByteBuffer; import java.nio.ByteOrder; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.channels.FileChannel.MapMode; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; +/** + * A collection of methods to open and close CRAM files. 
+ */ public class CramIO { - public static int DEFINITION_LENGTH = 4 + 1 + 1 + 20; - private static Log log = Log.getInstance(CramIO.class); - public static byte[] ZERO_B_EOF_MARKER = ByteBufferUtils - .bytesFromHex("0b 00 00 00 ff ff ff ff ff e0 45 4f 46 00 00 00 00 01 00 00 01 00 06 06 01 00 01 00 01 00"); - - - public static String getFileName(String urlString) { - URL url = null; - try { - url = new URL(urlString); - return new File(url.getFile()).getName(); - } catch (MalformedURLException e) { - return new File(urlString).getName(); - } - } - - public static InputStream openInputStreamFromURL(String source) throws SocketException, IOException, - URISyntaxException { - URL url = null; - try { - url = new URL(source); - } catch (MalformedURLException e) { - File file = new File(source); - return new SeekableBufferedStream(new SeekableFileStream(file)); - } - - String protocol = url.getProtocol(); - if ("ftp".equalsIgnoreCase(protocol)) - return new SeekableBufferedStream(new NamedSeekableFTPStream(url)); - - if ("http".equalsIgnoreCase(protocol)) - return new SeekableBufferedStream(new SeekableHTTPStream(url)); - - if ("file".equalsIgnoreCase(protocol)) { - File file = new File(url.toURI()); - return new SeekableBufferedStream(new SeekableFileStream(file)); - } - - throw new RuntimeException("Uknown protocol: " + protocol); - } - - private static class NamedSeekableFTPStream extends SeekableFTPStream { - /** - * This class purpose is to preserve and pass the URL string as the - * source. - */ - private URL source; - - public NamedSeekableFTPStream(URL url) throws IOException { - super(url); - source = url; - } - - public NamedSeekableFTPStream(URL url, UserPasswordInput userPasswordInput) throws IOException { - super(url, userPasswordInput); - source = url; - } - - @Override - public String getSource() { - return source.toString(); - } - - } - - /** - * A convenience method. - *

- * If a file is supplied then it will be wrapped into a SeekableStream. If - * file is null, then the fromIS argument will be used or System.in if null. - * Optionally the input can be decrypted using provided password or the - * password read from the console. - *

- * The method also checks for EOF marker and raise error if the marker is - * not found for files with version 2.1 or greater. For version below 2.1 a - * warning will be issued. - * - * @param cramFile - * CRAM file to be read - * @param fromIS - * input stream to be read - * @param decrypt - * decrypt the input stream - * @param password - * a password to use for decryption - * @return an InputStream ready to be used for reading CRAM file definition - * @throws IOException - * @throws URISyntaxException - */ - public static InputStream openCramInputStream(String cramURL, boolean decrypt, String password) throws IOException, - URISyntaxException { - - InputStream is = null; - if (cramURL == null) - is = new BufferedInputStream(System.in); - else - is = openInputStreamFromURL(cramURL); - - if (decrypt) { - //disabled due to unresolved dependency to SeekableCipherStream_256 and CipherInputStream_256 - throw new SAMException("Encryption not supported in this version.") ; -// char[] pass = null; -// if (password == null) { -// if (System.console() == null) -// throw new RuntimeException("Cannot access console."); -// pass = System.console().readPassword(); -// } else -// pass = password.toCharArray(); -// -// if (is instanceof SeekableStream) -// is = new SeekableCipherStream_256((SeekableStream) is, pass, 1, 128); -// else -// is = new CipherInputStream_256(is, pass, 128).getCipherInputStream(); - - } - - if (is instanceof SeekableStream) { - CramHeader cramHeader = CramIO.readFormatDefinition(is, new CramHeader()); - SeekableStream s = (SeekableStream) is; - if (!CramIO.hasZeroB_EOF_marker(s)) - eofNotFound(cramHeader.getMajorVersion(), cramHeader.getMinorVersion()); - s.seek(0); - } else - log.warn("CRAM file/stream completion cannot be verified."); - - return is; - } - - private static void eofNotFound(byte major, byte minor) { - if (major >= 2 && minor >= 1) { - log.error("Incomplete data: EOF marker not found."); - System.exit(1); - } else { - log.warn("EOF marker not found, possibly incomplete file/stream."); - } - } - - /** - * Reads a CRAM container from the input stream. Returns an EOF container - * when there is no more data or the EOF marker found. 
- * - * @param cramHeader - * @param is - * @return - * @throws IOException - */ - public static Container readContainer(CramHeader cramHeader, InputStream is) throws IOException { - Container c = CramIO.readContainer(is); - if (c == null) { - // this will cause System.exit(1): - eofNotFound(cramHeader.getMajorVersion(), cramHeader.getMinorVersion()); - return CramIO.readContainer(new ByteArrayInputStream(CramIO.ZERO_B_EOF_MARKER)); - } - if (c.isEOF()) - log.debug("EOF marker found, file/stream is complete."); - - return c; - } - - public static long issueZeroB_EOF_marker(OutputStream os) throws IOException { - os.write(ZERO_B_EOF_MARKER); - return ZERO_B_EOF_MARKER.length; - } - - public static boolean hasZeroB_EOF_marker(SeekableStream s) throws IOException { - byte[] tail = new byte[ZERO_B_EOF_MARKER.length]; - - s.seek(s.length() - ZERO_B_EOF_MARKER.length); - ByteBufferUtils.readFully(tail, s); - - // relaxing the ITF8 hanging bits: - tail[8] |= 0xf0; - return Arrays.equals(tail, ZERO_B_EOF_MARKER); - } - - public static boolean hasZeroB_EOF_marker(File file) throws IOException { - byte[] tail = new byte[ZERO_B_EOF_MARKER.length]; - - RandomAccessFile raf = new RandomAccessFile(file, "r"); - try { - raf.seek(file.length() - ZERO_B_EOF_MARKER.length); - raf.readFully(tail); - } catch (IOException e) { - throw e; - } finally { - raf.close(); - } - - // relaxing the ITF8 hanging bits: - tail[8] |= 0xf0; - return Arrays.equals(tail, ZERO_B_EOF_MARKER); - } - - public static long writeCramHeader(CramHeader h, OutputStream os) throws IOException { - os.write("CRAM".getBytes("US-ASCII")); - os.write(h.getMajorVersion()); - os.write(h.getMinorVersion()); - os.write(h.id); - for (int i = h.id.length; i < 20; i++) - os.write(0); - - long len = writeContainerForSamFileHeader(h.getSamFileHeader(), os); - - return DEFINITION_LENGTH + len; - } - - private static CramHeader readFormatDefinition(InputStream is, CramHeader header) throws IOException { - for (byte b : CramHeader.magick) { - if (b != is.read()) - throw new RuntimeException("Unknown file format."); - } - - header.setMajorVersion((byte) is.read()); - header.setMinorVersion((byte) is.read()); - - DataInputStream dis = new DataInputStream(is); - dis.readFully(header.id); - - return header; - } - - public static CramHeader readCramHeader(InputStream is) throws IOException { - CramHeader header = new CramHeader(); - - readFormatDefinition(is, header); - - header.setSamFileHeader(readSAMFileHeader(new String(header.id), is)); - return header; - } - - public static int writeContainer(Container c, OutputStream os) throws IOException { - - long time1 = System.nanoTime(); - ExposedByteArrayOutputStream baos = new ExposedByteArrayOutputStream(); - - Block block = new CompressionHeaderBLock(c.h); - block.write(baos); - c.blockCount = 1; - - List landmarks = new ArrayList(); - SliceIO sio = new SliceIO(); - for (int i = 0; i < c.slices.length; i++) { - Slice s = c.slices[i]; - landmarks.add(baos.size()); - sio.write(s, baos); - c.blockCount++; - c.blockCount++; - if (s.embeddedRefBlock != null) - c.blockCount++; - c.blockCount += s.external.size(); - } - c.landmarks = new int[landmarks.size()]; - for (int i = 0; i < c.landmarks.length; i++) - c.landmarks[i] = landmarks.get(i); - - c.containerByteSize = baos.size(); - calculateSliceOffsetsAndSizes(c); - - ContainerHeaderIO chio = new ContainerHeaderIO(); - int len = chio.writeContainerHeader(c, os); - os.write(baos.getBuffer(), 0, baos.size()); - len += baos.size(); - - long time2 = 
System.nanoTime(); - - log.debug("CONTAINER WRITTEN: " + c.toString()); - c.writeTime = time2 - time1; - - return len; - } - - /** - * Reads next container from the stream. - * - * @param is - * the stream to read from - * @return CRAM container or null if no more data - * @throws IOException - */ - public static Container readContainer(InputStream is) throws IOException { - return readContainer(is, 0, Integer.MAX_VALUE); - } - - public static Container readContainerHeader(InputStream is) throws IOException { - Container c = new Container(); - ContainerHeaderIO chio = new ContainerHeaderIO(); - if (!chio.readContainerHeader(c, is)) - return null; - return c; - } - - private static Container readContainer(InputStream is, int fromSlice, int howManySlices) throws IOException { - - long time1 = System.nanoTime(); - Container c = readContainerHeader(is); - if (c == null) - return null; - - CompressionHeaderBLock chb = new CompressionHeaderBLock(is); - c.h = chb.getCompressionHeader(); - howManySlices = Math.min(c.landmarks.length, howManySlices); - - if (fromSlice > 0) - is.skip(c.landmarks[fromSlice]); - - SliceIO sio = new SliceIO(); - List slices = new ArrayList(); - for (int s = fromSlice; s < howManySlices - fromSlice; s++) { - Slice slice = new Slice(); - slice.index = s ; - sio.readSliceHeadBlock(slice, is); - sio.readSliceBlocks(slice, true, is); - slices.add(slice); - } - - c.slices = slices.toArray(new Slice[slices.size()]); - - calculateSliceOffsetsAndSizes(c); - - long time2 = System.nanoTime(); - - log.debug("READ CONTAINER: " + c.toString()); - c.readTime = time2 - time1; - - return c; - } - - private static void calculateSliceOffsetsAndSizes(Container c) { - if (c.slices.length == 0) - return; - for (int i = 0; i < c.slices.length - 1; i++) { - Slice s = c.slices[i]; - s.offset = c.landmarks[i]; - s.size = c.landmarks[i + 1] - s.offset; - } - Slice lastSlice = c.slices[c.slices.length - 1]; - lastSlice.offset = c.landmarks[c.landmarks.length - 1]; - lastSlice.size = c.containerByteSize - lastSlice.offset; - } - - public static byte[] toByteArray(SAMFileHeader samFileHeader) { - ExposedByteArrayOutputStream headerBodyOS = new ExposedByteArrayOutputStream(); - OutputStreamWriter w = new OutputStreamWriter(headerBodyOS); - new SAMTextHeaderCodec().encode(w, samFileHeader); - try { - w.close(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - ByteBuffer buf = ByteBuffer.allocate(4); - buf.order(ByteOrder.LITTLE_ENDIAN); - buf.putInt(headerBodyOS.size()); - buf.flip(); - byte[] bytes = new byte[buf.limit()]; - buf.get(bytes); - - ByteArrayOutputStream headerOS = new ByteArrayOutputStream(); - try { - headerOS.write(bytes); - headerOS.write(headerBodyOS.getBuffer(), 0, headerBodyOS.size()); - } catch (IOException e) { - throw new RuntimeException(e); - } - - return headerOS.toByteArray(); - } - - private static long writeContainerForSamFileHeader(SAMFileHeader samFileHeader, OutputStream os) throws IOException { - byte[] data = toByteArray(samFileHeader); - return writeContainerForSamFileHeaderData(data, 0, Math.max(1024, data.length + data.length / 2), os); - } - - private static long writeContainerForSamFileHeaderData(byte[] data, int offset, int len, OutputStream os) - throws IOException { - Block block = new Block(); - byte[] blockContent = new byte[len]; - System.arraycopy(data, 0, blockContent, offset, Math.min(data.length - offset, len)); - block.setRawContent(blockContent); - block.method = BlockCompressionMethod.RAW; - block.contentId = 0; - 
block.contentType = BlockContentType.FILE_HEADER; - block.compress(); - - Container c = new Container(); - c.blockCount = 1; - c.blocks = new Block[] { block }; - c.landmarks = new int[0]; - c.slices = new Slice[0]; - c.alignmentSpan = 0; - c.alignmentStart = 0; - c.bases = 0; - c.globalRecordCounter = 0; - c.nofRecords = 0; - c.sequenceId = 0; - - ExposedByteArrayOutputStream baos = new ExposedByteArrayOutputStream(); - block.write(baos); - c.containerByteSize = baos.size(); - - ContainerHeaderIO chio = new ContainerHeaderIO(); - int containerHeaderByteSize = chio.writeContainerHeader(c, os); - os.write(baos.getBuffer(), 0, baos.size()); - - return containerHeaderByteSize + baos.size(); - } - - public static SAMFileHeader readSAMFileHeader(String id, InputStream is) throws IOException { - readContainerHeader(is); - Block b = new Block(is, true, true); - - is = new ByteArrayInputStream(b.getRawContent()); - - ByteBuffer buf = ByteBuffer.allocate(4); - buf.order(ByteOrder.LITTLE_ENDIAN); - for (int i = 0; i < 4; i++) - buf.put((byte) is.read()); - buf.flip(); - int size = buf.asIntBuffer().get(); - - DataInputStream dis = new DataInputStream(is); - byte[] bytes = new byte[size]; - dis.readFully(bytes); - - BufferedLineReader r = new BufferedLineReader(new ByteArrayInputStream(bytes)); - SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); - SAMFileHeader header = codec.decode(r, id); - return header; - } - - public static boolean replaceCramHeader(File file, CramHeader newHeader) throws IOException { - - int MAP_SIZE = (int) Math.min(1024 * 1024, file.length()); - FileInputStream inputStream = new FileInputStream(file); - CountingInputStream cis = new CountingInputStream(inputStream); - - CramHeader header = new CramHeader(); - readFormatDefinition(cis, header); - - if (header.getMajorVersion() != newHeader.getMajorVersion() && header.getMinorVersion() != newHeader.getMinorVersion()) { - log.error(String.format("Cannot replace CRAM header because format versions differ: ", header.getMajorVersion(), - header.getMinorVersion(), newHeader.getMajorVersion(), header.getMinorVersion(), file.getAbsolutePath())); - cis.close(); - return false; - } - - readContainerHeader(cis); - Block b = new Block(cis, false, false); - long dataStart = cis.getCount(); - cis.close(); - - byte[] data = toByteArray(newHeader.getSamFileHeader()); - - if (data.length > b.getRawContentSize()) { - log.error("Failed to replace CRAM header because the new header is bigger."); - return false; - } - - RandomAccessFile raf = new RandomAccessFile(file, "rw"); - FileChannel channelOut = raf.getChannel(); - MappedByteBuffer mapOut = channelOut.map(MapMode.READ_WRITE, dataStart, MAP_SIZE - dataStart); - mapOut.put(data); - mapOut.force(); - - channelOut.close(); - raf.close(); - - return true; - } - + /** + * The 'zero-B' EOF marker as per CRAM specs v2.1. This is basically a serialized empty CRAM container with sequence id set to some + * number to spell out 'EOF' in hex. + */ + public static final byte[] ZERO_B_EOF_MARKER = bytesFromHex("0b 00 00 00 ff ff ff ff ff e0 45 4f 46 00 00 00 00 01 00 00 01 00 06 06 01 00 " + + "" + "01 00 01 00"); + /** + * The zero-F EOF marker as per CRAM specs v3.0. This is basically a serialized empty CRAM container with sequence id set to some number + * to spell out 'EOF' in hex. 
+ */ + public static final byte[] ZERO_F_EOF_MARKER = bytesFromHex("0f 00 00 00 ff ff ff ff 0f e0 45 4f 46 00 00 00 00 01 00 05 bd d9 4f 00 01 00 " + + "" + "06 06 01 00 01 00 01 00 ee 63 01 4b"); + + + private static final int DEFINITION_LENGTH = 4 + 1 + 1 + 20; + private static final Log log = Log.getInstance(CramIO.class); + + private static byte[] bytesFromHex(final String string) { + final String clean = string.replaceAll("[^0-9a-fA-F]", ""); + if (clean.length() % 2 != 0) throw new RuntimeException("Not a hex string: " + string); + final byte[] data = new byte[clean.length() / 2]; + for (int i = 0; i < clean.length(); i += 2) { + data[i / 2] = (Integer.decode("0x" + clean.charAt(i) + clean.charAt(i + 1))).byteValue(); + } + return data; + } + + /** + * Write an end-of-file marker to the {@link OutputStream}. The specific EOF marker is chosen based on the CRAM version. + * + * @param version the CRAM version to assume + * @param outputStream the stream to write to + * @return the number of bytes written out + * @throws IOException as per java IO contract + */ + public static long issueEOF(final Version version, final OutputStream outputStream) throws IOException { + if (version.compatibleWith(CramVersions.CRAM_v3)) { + outputStream.write(ZERO_F_EOF_MARKER); + return ZERO_F_EOF_MARKER.length; + } + + if (version.compatibleWith(CramVersions.CRAM_v2_1)) { + outputStream.write(ZERO_B_EOF_MARKER); + return ZERO_B_EOF_MARKER.length; + } + return 0; + } + + private static boolean streamEndsWith(final SeekableStream seekableStream, final byte[] marker) throws IOException { + final byte[] tail = new byte[ZERO_B_EOF_MARKER.length]; + + seekableStream.seek(seekableStream.length() - marker.length); + InputStreamUtils.readFully(seekableStream, tail, 0, tail.length); + + // relaxing the ITF8 hanging bits: + tail[8] |= 0xf0; + return Arrays.equals(tail, marker); + } + + /** + * Check if the {@link SeekableStream} is properly terminated with a end-of-file marker. + * + * @param version CRAM version to assume + * @param seekableStream the stream to read from + * @return true if the stream ends with a correct EOF marker, false otherwise + * @throws IOException as per java IO contract + */ + @SuppressWarnings("SimplifiableIfStatement") + private static boolean checkEOF(final Version version, final SeekableStream seekableStream) throws IOException { + + if (version.compatibleWith(CramVersions.CRAM_v3)) return streamEndsWith(seekableStream, ZERO_B_EOF_MARKER); + if (version.compatibleWith(CramVersions.CRAM_v2_1)) return streamEndsWith(seekableStream, ZERO_F_EOF_MARKER); + + return false; + } + + /** + * Check if the file: 1) contains proper CRAM header. 2) given the version info from the header check the end of file marker. + * + * @param file the CRAM file to check + * @return true if the file is a valid CRAM file and is properly terminated with respect to the version. + * @throws IOException as per java IO contract + */ + public static boolean checkHeaderAndEOF(final File file) throws IOException { + final SeekableStream seekableStream = new SeekableFileStream(file); + final CramHeader cramHeader = readCramHeader(seekableStream); + return checkEOF(cramHeader.getVersion(), seekableStream); + } + + /** + * Writes CRAM header into the specified {@link OutputStream}. 
+ * + * @param cramHeader the {@link CramHeader} object to write + * @param outputStream the output stream to write to + * @return the number of bytes written out + * @throws IOException as per java IO contract + */ + public static long writeCramHeader(final CramHeader cramHeader, final OutputStream outputStream) throws IOException { +// if (cramHeader.getVersion().major < 3) throw new RuntimeException("Deprecated CRAM version: " + cramHeader.getVersion().major); + outputStream.write("CRAM".getBytes("US-ASCII")); + outputStream.write(cramHeader.getVersion().major); + outputStream.write(cramHeader.getVersion().minor); + outputStream.write(cramHeader.getId()); + for (int i = cramHeader.getId().length; i < 20; i++) + outputStream.write(0); + + final long length = CramIO.writeContainerForSamFileHeader(cramHeader.getVersion().major, cramHeader.getSamFileHeader(), outputStream); + + return CramIO.DEFINITION_LENGTH + length; + } + + private static CramHeader readFormatDefinition(final InputStream inputStream) throws IOException { + for (final byte magicByte : CramHeader.MAGIC) { + if (magicByte != inputStream.read()) throw new RuntimeException("Unknown file format."); + } + + final Version version = new Version(inputStream.read(), inputStream.read(), 0); + + final CramHeader header = new CramHeader(version, null, null); + + final DataInputStream dataInputStream = new DataInputStream(inputStream); + dataInputStream.readFully(header.getId()); + + return header; + } + + /** + * Read CRAM header from the given {@link InputStream}. + * + * @param inputStream input stream to read from + * @return complete {@link CramHeader} object + * @throws IOException as per java IO contract + */ + public static CramHeader readCramHeader(final InputStream inputStream) throws IOException { + final CramHeader header = readFormatDefinition(inputStream); + + final SAMFileHeader samFileHeader = readSAMFileHeader(header.getVersion(), inputStream, new String(header.getId())); + + return new CramHeader(header.getVersion(), new String(header.getId()), samFileHeader); + } + + private static byte[] toByteArray(final SAMFileHeader samFileHeader) { + final ExposedByteArrayOutputStream headerBodyOS = new ExposedByteArrayOutputStream(); + final OutputStreamWriter outStreamWriter = new OutputStreamWriter(headerBodyOS); + new SAMTextHeaderCodec().encode(outStreamWriter, samFileHeader); + try { + outStreamWriter.close(); + } catch (final IOException e) { + throw new RuntimeException(e); + } + + final ByteBuffer buf = ByteBuffer.allocate(4); + buf.order(ByteOrder.LITTLE_ENDIAN); + buf.putInt(headerBodyOS.size()); + buf.flip(); + final byte[] bytes = new byte[buf.limit()]; + buf.get(bytes); + + final ByteArrayOutputStream headerOS = new ByteArrayOutputStream(); + try { + headerOS.write(bytes); + headerOS.write(headerBodyOS.getBuffer(), 0, headerBodyOS.size()); + } catch (final IOException e) { + throw new RuntimeException(e); + } + + return headerOS.toByteArray(); + } + + private static long writeContainerForSamFileHeader(final int major, final SAMFileHeader samFileHeader, final OutputStream os) throws IOException { + final byte[] data = toByteArray(samFileHeader); + final int length = Math.max(1024, data.length + data.length / 2); + final byte[] blockContent = new byte[length]; + System.arraycopy(data, 0, blockContent, 0, Math.min(data.length, length)); + final Block block = Block.buildNewFileHeaderBlock(blockContent); + + final Container container = new Container(); + container.blockCount = 1; + container.blocks = new Block[]{block}; 
+ container.landmarks = new int[0]; + container.slices = new Slice[0]; + container.alignmentSpan = 0; + container.alignmentStart = 0; + container.bases = 0; + container.globalRecordCounter = 0; + container.nofRecords = 0; + container.sequenceId = 0; + + final ExposedByteArrayOutputStream byteArrayOutputStream = new ExposedByteArrayOutputStream(); + block.write(major, byteArrayOutputStream); + container.containerByteSize = byteArrayOutputStream.size(); + + final int containerHeaderByteSize = ContainerIO.writeContainerHeader(major, container, os); + os.write(byteArrayOutputStream.getBuffer(), 0, byteArrayOutputStream.size()); + + return containerHeaderByteSize + byteArrayOutputStream.size(); + } + + private static SAMFileHeader readSAMFileHeader(final Version version, InputStream inputStream, final String id) throws IOException { + final Container container = ContainerIO.readContainerHeader(version.major, inputStream); + final Block block; + { + if (version.compatibleWith(CramVersions.CRAM_v3)) { + final byte[] bytes = new byte[container.containerByteSize]; + InputStreamUtils.readFully(inputStream, bytes, 0, bytes.length); + block = Block.readFromInputStream(version.major, new ByteArrayInputStream(bytes)); + // ignore the rest of the container + } else { + /* + * pending issue: container.containerByteSize inputStream 2 bytes shorter + * then needed in the v21 test cram files. + */ + block = Block.readFromInputStream(version.major, inputStream); + } + } + + inputStream = new ByteArrayInputStream(block.getRawContent()); + + final ByteBuffer buffer = ByteBuffer.allocate(4); + buffer.order(ByteOrder.LITTLE_ENDIAN); + for (int i = 0; i < 4; i++) + buffer.put((byte) inputStream.read()); + buffer.flip(); + final int size = buffer.asIntBuffer().get(); + + final DataInputStream dataInputStream = new DataInputStream(inputStream); + final byte[] bytes = new byte[size]; + dataInputStream.readFully(bytes); + + final BufferedLineReader bufferedLineReader = new BufferedLineReader(new ByteArrayInputStream(bytes)); + final SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); + return codec.decode(bufferedLineReader, id); + } + + /** + * Attempt to replace the SAM file header in the CRAM file. This will succeed only if there is sufficient space reserved in the existing + * CRAM header. The implementation re-writes the first FILE_HEADER block in the first container of the CRAM file using random file + * access. 
+ * + * @param file the CRAM file + * @param newHeader the new CramHeader container a new SAM file header + * @return true if successfully replaced the header, false otherwise + * @throws IOException as per java IO contract + */ + public static boolean replaceCramHeader(final File file, final CramHeader newHeader) throws IOException { + + final CountingInputStream countingInputStream = new CountingInputStream(new FileInputStream(file)); + + final CramHeader header = readFormatDefinition(countingInputStream); + final Container c = ContainerIO.readContainerHeader(header.getVersion().major, countingInputStream); + final long pos = countingInputStream.getCount(); + countingInputStream.close(); + + final Block block = Block.buildNewFileHeaderBlock(toByteArray(newHeader.getSamFileHeader())); + final ExposedByteArrayOutputStream byteArrayOutputStream = new ExposedByteArrayOutputStream(); + block.write(newHeader.getVersion().major, byteArrayOutputStream); + if (byteArrayOutputStream.size() > c.containerByteSize) { + log.error("Failed to replace CRAM header because the new header does not fit."); + return false; + } + final RandomAccessFile randomAccessFile = new RandomAccessFile(file, "rw"); + randomAccessFile.seek(pos); + randomAccessFile.write(byteArrayOutputStream.getBuffer(), 0, byteArrayOutputStream.size()); + randomAccessFile.close(); + return true; + } } diff --git a/src/java/htsjdk/samtools/cram/build/CramNormalizer.java b/src/java/htsjdk/samtools/cram/build/CramNormalizer.java index 93fc174e91..a36a995f69 100644 --- a/src/java/htsjdk/samtools/cram/build/CramNormalizer.java +++ b/src/java/htsjdk/samtools/cram/build/CramNormalizer.java @@ -1,30 +1,33 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
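Taken together, the rewritten CramIO entry points reduce to a few static calls: readCramHeader() pulls the format version, file id and SAM header off a stream, and checkHeaderAndEOF() verifies that the file ends with the version-appropriate EOF marker. A minimal sketch, assuming only the static methods shown above; the class name and the command-line argument are hypothetical.

import htsjdk.samtools.cram.build.CramIO;
import htsjdk.samtools.cram.structure.CramHeader;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

class CramHeaderCheckSketch {
    // Report the CRAM version, the number of read groups in the embedded SAM
    // header, and whether the file terminates with the correct EOF marker.
    public static void main(final String[] args) throws IOException {
        final File cramFile = new File(args[0]);

        final FileInputStream inputStream = new FileInputStream(cramFile);
        final CramHeader cramHeader;
        try {
            cramHeader = CramIO.readCramHeader(inputStream);
        } finally {
            inputStream.close();
        }

        System.out.println("CRAM version: " + cramHeader.getVersion().major
                + "." + cramHeader.getVersion().minor);
        System.out.println("Read groups: " + cramHeader.getSamFileHeader().getReadGroups().size());
        System.out.println("Header and EOF OK: " + CramIO.checkHeaderAndEOF(cramFile));
    }
}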

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.build; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.cram.encoding.read_features.BaseQualityScore; -import htsjdk.samtools.cram.encoding.read_features.Deletion; -import htsjdk.samtools.cram.encoding.read_features.InsertBase; -import htsjdk.samtools.cram.encoding.read_features.Insertion; -import htsjdk.samtools.cram.encoding.read_features.ReadBase; -import htsjdk.samtools.cram.encoding.read_features.ReadFeature; -import htsjdk.samtools.cram.encoding.read_features.SoftClip; -import htsjdk.samtools.cram.encoding.read_features.Substitution; +import htsjdk.samtools.cram.encoding.readfeatures.BaseQualityScore; +import htsjdk.samtools.cram.encoding.readfeatures.Deletion; +import htsjdk.samtools.cram.encoding.readfeatures.InsertBase; +import htsjdk.samtools.cram.encoding.readfeatures.Insertion; +import htsjdk.samtools.cram.encoding.readfeatures.ReadBase; +import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature; +import htsjdk.samtools.cram.encoding.readfeatures.RefSkip; +import htsjdk.samtools.cram.encoding.readfeatures.SoftClip; +import htsjdk.samtools.cram.encoding.readfeatures.Substitution; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.SubstitutionMatrix; @@ -35,251 +38,263 @@ import java.util.List; public class CramNormalizer { - private SAMFileHeader header; + private final SAMFileHeader header; private int readCounter = 0; - private String readNamePrefix = ""; - private byte defaultQualityScore = '?' 
- '!'; private static Log log = Log.getInstance(CramNormalizer.class); private ReferenceSource referenceSource; - public CramNormalizer(SAMFileHeader header) { + private CramNormalizer(final SAMFileHeader header) { this.header = header; } - public CramNormalizer(SAMFileHeader header, ReferenceSource referenceSource) { + public CramNormalizer(final SAMFileHeader header, final ReferenceSource referenceSource) { this.header = header; this.referenceSource = referenceSource; } - public void normalize(ArrayList records, boolean resetPairing, - byte[] ref, int alignmentStart, - SubstitutionMatrix substitutionMatrix, boolean AP_delta) { + public void normalize(final ArrayList records, + final byte[] ref, final int refOffset_zeroBased, + final SubstitutionMatrix substitutionMatrix) { - int startCounter = readCounter; + final int startCounter = readCounter; - for (CramCompressionRecord r : records) { - r.index = ++readCounter; + for (final CramCompressionRecord record : records) { + record.index = ++readCounter; - if (r.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - r.sequenceName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME; - r.alignmentStart = SAMRecord.NO_ALIGNMENT_START; + if (record.sequenceId == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.sequenceName = SAMRecord.NO_ALIGNMENT_REFERENCE_NAME; + record.alignmentStart = SAMRecord.NO_ALIGNMENT_START; } else { - r.sequenceName = header.getSequence(r.sequenceId) + record.sequenceName = header.getSequence(record.sequenceId) .getSequenceName(); } } {// restore pairing first: - for (CramCompressionRecord r : records) { - if (!r.isMultiFragment() || r.isDetached()) { - r.recordsToNextFragment = -1; + for (final CramCompressionRecord record : records) { + if (!record.isMultiFragment() || record.isDetached()) { + record.recordsToNextFragment = -1; - r.next = null; - r.previous = null; + record.next = null; + record.previous = null; continue; } - if (r.isHasMateDownStream()) { - CramCompressionRecord downMate = records.get(r.index - + r.recordsToNextFragment - startCounter); - r.next = downMate; - downMate.previous = r; - - r.mateAlignmentStart = downMate.alignmentStart; - r.setMateUmapped(downMate.isSegmentUnmapped()); - r.setMateNegativeStrand(downMate.isNegativeStrand()); - r.mateSequenceID = downMate.sequenceId; - if (r.mateSequenceID == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) - r.mateAlignmentStart = SAMRecord.NO_ALIGNMENT_START; - - downMate.mateAlignmentStart = r.alignmentStart; - downMate.setMateUmapped(r.isSegmentUnmapped()); - downMate.setMateNegativeStrand(r.isNegativeStrand()); - downMate.mateSequenceID = r.sequenceId; - if (downMate.mateSequenceID == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) - downMate.mateAlignmentStart = SAMRecord.NO_ALIGNMENT_START; - - if (r.isFirstSegment()) { - final int tlen = computeInsertSize(r, downMate); - r.templateSize = tlen; - downMate.templateSize = -tlen; - } else { - final int tlen = computeInsertSize(downMate, r); - downMate.templateSize = tlen; - r.templateSize = -tlen; - } + if (record.isHasMateDownStream()) { + final CramCompressionRecord downMate = records.get(record.index + + record.recordsToNextFragment - startCounter); + record.next = downMate; + downMate.previous = record; } } + for (final CramCompressionRecord record : records) { + if (record.previous != null) continue; + if (record.next == null) continue; + restoreMateInfo(record); + } } // assign some read names if needed: - for (CramCompressionRecord r : records) { - if (r.readName == null) { - String name = readNamePrefix + r.index; 
- r.readName = name; - if (r.next != null) - r.next.readName = name; - if (r.previous != null) - r.previous.readName = name; + for (final CramCompressionRecord record : records) { + if (record.readName == null) { + final String readNamePrefix = ""; + final String name = readNamePrefix + record.index; + record.readName = name; + if (record.next != null) + record.next.readName = name; + if (record.previous != null) + record.previous.readName = name; } } // resolve bases: - for (CramCompressionRecord r : records) { - if (r.isSegmentUnmapped()) + for (final CramCompressionRecord record : records) { + if (record.isSegmentUnmapped()) continue; byte[] refBases = ref; - if (referenceSource != null) - refBases = referenceSource.getReferenceBases(header.getSequence(r.sequenceId), true); + { + // ref could be supplied (aka forced) already or needs looking up: + // ref.length=0 is a special case of seqId=-2 (multiref) + if ((ref == null || ref.length == 0) && referenceSource != null) + refBases = referenceSource.getReferenceBases( + header.getSequence(record.sequenceId), true); + } - byte[] bases = restoreReadBases(r, refBases, substitutionMatrix); - r.readBases = bases; + if (record.isUnknownBases()) { + record.readBases = SAMRecord.NULL_SEQUENCE; + } else + record.readBases = restoreReadBases(record, refBases, refOffset_zeroBased, + substitutionMatrix); } // restore quality scores: + final byte defaultQualityScore = '?' - '!'; restoreQualityScores(defaultQualityScore, records); } - public static void restoreQualityScores(byte defaultQualityScore, - List records) { - for (CramCompressionRecord record : records) + private static void restoreMateInfo(final CramCompressionRecord record) { + if (record.next == null) { + + return; + } + CramCompressionRecord cur; + cur = record; + while (cur.next != null) { + setNextMate(cur, cur.next); + cur = cur.next; + } + + // cur points to the last segment now: + final CramCompressionRecord last = cur; + setNextMate(last, record); +// record.setFirstSegment(true); +// last.setLastSegment(true); + + final int templateLength = computeInsertSize(record, last); + record.templateSize = templateLength; + last.templateSize = -templateLength; + } + + private static void setNextMate(final CramCompressionRecord record, final CramCompressionRecord next) { + record.mateAlignmentStart = next.alignmentStart; + record.setMateUnmapped(next.isSegmentUnmapped()); + record.setMateNegativeStrand(next.isNegativeStrand()); + record.mateSequenceID = next.sequenceId; + if (record.mateSequenceID == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) + record.mateAlignmentStart = SAMRecord.NO_ALIGNMENT_START; + } + + public static void restoreQualityScores(final byte defaultQualityScore, + final List records) { + for (final CramCompressionRecord record : records) restoreQualityScores(defaultQualityScore, record); } - public static byte[] restoreQualityScores(byte defaultQualityScore, - CramCompressionRecord record) { + private static byte[] restoreQualityScores(final byte defaultQualityScore, + final CramCompressionRecord record) { if (!record.isForcePreserveQualityScores()) { - byte[] scores = new byte[record.readLength]; + boolean star = true; + final byte[] scores = new byte[record.readLength]; Arrays.fill(scores, defaultQualityScore); if (record.readFeatures != null) - for (ReadFeature f : record.readFeatures) { - switch (f.getOperator()) { + for (final ReadFeature feature : record.readFeatures) { + switch (feature.getOperator()) { case BaseQualityScore.operator: - int pos = f.getPosition(); - byte 
q = ((BaseQualityScore) f).getQualityScore(); - - try { - scores[pos - 1] = q; - } catch (ArrayIndexOutOfBoundsException e) { - System.err.println("PROBLEM CAUSED BY:"); - System.err.println(record.toString()); - throw e; - } + int pos = feature.getPosition(); + scores[pos - 1] = ((BaseQualityScore) feature).getQualityScore(); + star = false; break; case ReadBase.operator: - pos = f.getPosition(); - q = ((ReadBase) f).getQualityScore(); - - try { - scores[pos - 1] = q; - } catch (ArrayIndexOutOfBoundsException e) { - System.err.println("PROBLEM CAUSED BY:"); - System.err.println(record.toString()); - throw e; - } + pos = feature.getPosition(); + scores[pos - 1] = ((ReadBase) feature).getQualityScore(); + star = false; break; default: break; } - } - record.qualityScores = scores; + if (star) + record.qualityScores = SAMRecord.NULL_QUALS; + else + record.qualityScores = scores; } else { - byte[] scores = record.qualityScores; + final byte[] scores = record.qualityScores; + int missingScores = 0; for (int i = 0; i < scores.length; i++) - if (scores[i] == -1) + if (scores[i] == -1) { scores[i] = defaultQualityScore; + missingScores++; + } + if (missingScores == scores.length) + record.qualityScores = SAMRecord.NULL_QUALS; } return record.qualityScores; } - private static final long calcRefLength(CramCompressionRecord record) { - if (record.readFeatures == null || record.readFeatures.isEmpty()) - return record.readLength; - long len = record.readLength; - for (ReadFeature rf : record.readFeatures) { - switch (rf.getOperator()) { - case Deletion.operator: - len += ((Deletion) rf).getLength(); - break; - case Insertion.operator: - len -= ((Insertion) rf).getSequence().length; - break; - default: - break; - } - } - - return len; - } - - private static final byte[] restoreReadBases(CramCompressionRecord record, byte[] ref, - SubstitutionMatrix substitutionMatrix) { - int readLength = record.readLength; - byte[] bases = new byte[readLength]; + private static byte[] restoreReadBases(final CramCompressionRecord record, final byte[] ref, + final int refOffsetZeroBased, final SubstitutionMatrix substitutionMatrix) { + if (record.isUnknownBases() || record.readLength == 0) return SAMRecord.NULL_SEQUENCE; + final int readLength = record.readLength; + final byte[] bases = new byte[readLength]; int posInRead = 1; - int alignmentStart = record.alignmentStart - 1; + final int alignmentStart = record.alignmentStart - 1; int posInSeq = 0; if (record.readFeatures == null || record.readFeatures.isEmpty()) { - if (ref.length < alignmentStart + bases.length) { + if (ref.length + refOffsetZeroBased < alignmentStart + + bases.length) { Arrays.fill(bases, (byte) 'N'); - System.arraycopy(ref, alignmentStart, bases, 0, - Math.min(bases.length, ref.length - alignmentStart)); + System.arraycopy( + ref, + alignmentStart - refOffsetZeroBased, + bases, + 0, + Math.min(bases.length, ref.length + refOffsetZeroBased + - alignmentStart)); } else - System.arraycopy(ref, alignmentStart, bases, 0, bases.length); + System.arraycopy(ref, alignmentStart - refOffsetZeroBased, + bases, 0, bases.length); return bases; } - List variations = record.readFeatures; - for (ReadFeature v : variations) { - for (; posInRead < v.getPosition(); posInRead++) - bases[posInRead - 1] = ref[alignmentStart + posInSeq++]; + final List variations = record.readFeatures; + for (final ReadFeature variation : variations) { + for (; posInRead < variation.getPosition(); posInRead++) { + final int rp = alignmentStart + posInSeq++ - refOffsetZeroBased; + 
bases[posInRead - 1] = getByteOrDefault(ref, rp, (byte) 'N'); + } - switch (v.getOperator()) { + switch (variation.getOperator()) { case Substitution.operator: - Substitution sv = (Substitution) v; - byte refBase = Utils.normalizeBase(ref[alignmentStart - + posInSeq]); - byte base = substitutionMatrix.base(refBase, sv.getCode()); - sv.setBase(base); - sv.setRefernceBase(refBase); + final Substitution substitution = (Substitution) variation; + byte refBase = getByteOrDefault(ref, alignmentStart + posInSeq + - refOffsetZeroBased, (byte) 'N'); + refBase = Utils.normalizeBase(refBase); + final byte base = substitutionMatrix.base(refBase, substitution.getCode()); + substitution.setBase(base); + substitution.setReferenceBase(refBase); bases[posInRead++ - 1] = base; posInSeq++; break; case Insertion.operator: - Insertion iv = (Insertion) v; - for (int i = 0; i < iv.getSequence().length; i++) - bases[posInRead++ - 1] = iv.getSequence()[i]; + final Insertion insertion = (Insertion) variation; + for (int i = 0; i < insertion.getSequence().length; i++) + bases[posInRead++ - 1] = insertion.getSequence()[i]; break; case SoftClip.operator: - SoftClip sc = (SoftClip) v; - for (int i = 0; i < sc.getSequence().length; i++) - bases[posInRead++ - 1] = sc.getSequence()[i]; + final SoftClip softClip = (SoftClip) variation; + for (int i = 0; i < softClip.getSequence().length; i++) + bases[posInRead++ - 1] = softClip.getSequence()[i]; break; case Deletion.operator: - Deletion dv = (Deletion) v; - posInSeq += dv.getLength(); + final Deletion deletion = (Deletion) variation; + posInSeq += deletion.getLength(); break; case InsertBase.operator: - InsertBase ib = (InsertBase) v; - bases[posInRead++ - 1] = ib.getBase(); + final InsertBase insert = (InsertBase) variation; + bases[posInRead++ - 1] = insert.getBase(); + break; + case RefSkip.operator: + posInSeq += ((RefSkip) variation).getLength(); break; } } - for (; posInRead <= readLength; posInRead++) - bases[posInRead - 1] = ref[alignmentStart + posInSeq++]; + for (; posInRead <= readLength + && alignmentStart + posInSeq - refOffsetZeroBased < ref.length; posInRead++, posInSeq++) { + bases[posInRead - 1] = ref[alignmentStart + posInSeq + - refOffsetZeroBased]; + } // ReadBase overwrites bases: - for (ReadFeature v : variations) { - switch (v.getOperator()) { + for (final ReadFeature variation : variations) { + switch (variation.getOperator()) { case ReadBase.operator: - ReadBase rb = (ReadBase) v; - bases[v.getPosition() - 1] = rb.getBase(); + final ReadBase readBase = (ReadBase) variation; + bases[variation.getPosition() - 1] = readBase.getBase(); break; default: break; @@ -293,18 +308,26 @@ private static final byte[] restoreReadBases(CramCompressionRecord record, byte[ return bases; } + private static byte getByteOrDefault(final byte[] array, final int pos, + final byte outOfBoundsValue) { + if (pos >= array.length) + return outOfBoundsValue; + else + return array[pos]; + } + /** * The method is similar in semantics to * {@link htsjdk.samtools.SamPairUtil#computeInsertSize(SAMRecord, SAMRecord) * computeInsertSize} but operates on CRAM native records instead of * SAMRecord objects. 
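For orientation, the template-length (TLEN) convention referred to above can be sketched over plain 1-based coordinates. This is an illustrative aside, not part of the diff; the helper name and the exact sign handling are assumptions, not the htsjdk implementation.

final class InsertSizeSketch {
    // Minimal sketch of the usual SAM TLEN convention: distance from the leftmost
    // aligned base to the rightmost aligned base of the pair, signed so that the
    // leftmost mate carries the positive value. Unmapped mates are handled by the
    // caller (the method documented above returns 0 for them).
    static int insertSize(final int firstStart, final int firstEnd,
                          final int secondStart, final int secondEnd) {
        final int leftmost = Math.min(firstStart, secondStart);
        final int rightmost = Math.max(firstEnd, secondEnd);
        final int templateLength = rightmost - leftmost + 1;
        return firstStart <= secondStart ? templateLength : -templateLength;
    }

    public static void main(final String[] args) {
        // first mate spans [100, 149], second mate spans [300, 349] -> 250 and -250
        System.out.println(insertSize(100, 149, 300, 349));  // 250
        System.out.println(insertSize(300, 349, 100, 149));  // -250
    }
}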
* - * @param firstEnd - * @param secondEnd + * @param firstEnd first mate of the pair + * @param secondEnd second mate of the pair * @return template length */ - private static int computeInsertSize(CramCompressionRecord firstEnd, - CramCompressionRecord secondEnd) { + public static int computeInsertSize(final CramCompressionRecord firstEnd, + final CramCompressionRecord secondEnd) { if (firstEnd.isSegmentUnmapped() || secondEnd.isSegmentUnmapped()) { return 0; } diff --git a/src/java/htsjdk/samtools/cram/build/CramSpanContainerIterator.java b/src/java/htsjdk/samtools/cram/build/CramSpanContainerIterator.java new file mode 100644 index 0000000000..2956cda629 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/build/CramSpanContainerIterator.java @@ -0,0 +1,100 @@ +package htsjdk.samtools.cram.build; + +import htsjdk.samtools.cram.structure.Container; +import htsjdk.samtools.cram.structure.ContainerIO; +import htsjdk.samtools.cram.structure.CramHeader; +import htsjdk.samtools.seekablestream.SeekableStream; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * An iterator of CRAM containers read from locations in {@link htsjdk.samtools.seekablestream.SeekableStream}. The locations are specified with + * pairs of coordinates, they are basically file pointers as returned for example by {@link htsjdk.samtools.SamReader.Indexing#getFilePointerSpanningReads()} + */ +public class CramSpanContainerIterator implements Iterator { + private final CramHeader cramHeader; + private final SeekableStream seekableStream; + private Iterator containerBoundaries; + private Boundary currentBoundary; + private long firstContainerOffset; + + private CramSpanContainerIterator(final SeekableStream seekableStream, final long[] coordinates) throws IOException { + this.seekableStream = seekableStream; + seekableStream.seek(0); + this.cramHeader = CramIO.readCramHeader(seekableStream); + firstContainerOffset = seekableStream.position(); + + final List boundaries = new ArrayList(); + for (int i = 0; i < coordinates.length; i += 2) { + boundaries.add(new Boundary(coordinates[i], coordinates[i + 1])); + } + + containerBoundaries = boundaries.iterator(); + currentBoundary = containerBoundaries.next(); + } + + public static CramSpanContainerIterator fromFileSpan(final SeekableStream seekableStream, final long[] coordinates) throws IOException { + return new CramSpanContainerIterator(seekableStream, coordinates); + } + + @Override + public boolean hasNext() { + try { + if (currentBoundary.hasNext()) return true; + if (!containerBoundaries.hasNext()) return false; + currentBoundary = containerBoundaries.next(); + return currentBoundary.hasNext(); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public Container next() { + try { + return currentBoundary.next(); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void remove() { + throw new RuntimeException("Not allowed."); + } + + public CramHeader getCramHeader() { + return cramHeader; + } + + private class Boundary { + final long start; + final long end; + + public Boundary(final long start, final long end) { + this.start = start; + this.end = end; + if (start >= end) throw new RuntimeException("Boundary start is greater than end."); + } + + boolean hasNext() throws IOException { + return seekableStream.position() <= (end >> 16); + } + + Container next() throws IOException { + if (seekableStream.position() < (start >> 16)) 
seekableStream.seek(start >> 16); + if (seekableStream.position() > (end >> 16)) throw new RuntimeException("No more containers in this boundary."); + final long offset = seekableStream.position(); + final Container c = ContainerIO.readContainer(cramHeader.getVersion(), seekableStream); + c.offset = offset; + return c; + } + } + + public long getFirstContainerOffset() { + return firstContainerOffset; + } +} diff --git a/src/java/htsjdk/samtools/cram/build/Sam2CramRecordFactory.java b/src/java/htsjdk/samtools/cram/build/Sam2CramRecordFactory.java index 668baa80dd..8a0b93de89 100644 --- a/src/java/htsjdk/samtools/cram/build/Sam2CramRecordFactory.java +++ b/src/java/htsjdk/samtools/cram/build/Sam2CramRecordFactory.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.build; import htsjdk.samtools.CigarElement; @@ -22,16 +24,17 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecord.SAMTagAndValue; import htsjdk.samtools.SAMTag; -import htsjdk.samtools.cram.encoding.read_features.BaseQualityScore; -import htsjdk.samtools.cram.encoding.read_features.Deletion; -import htsjdk.samtools.cram.encoding.read_features.HardClip; -import htsjdk.samtools.cram.encoding.read_features.InsertBase; -import htsjdk.samtools.cram.encoding.read_features.Padding; -import htsjdk.samtools.cram.encoding.read_features.ReadFeature; -import htsjdk.samtools.cram.encoding.read_features.RefSkip; -import htsjdk.samtools.cram.encoding.read_features.SoftClip; -import htsjdk.samtools.cram.encoding.read_features.Substitution; -import htsjdk.samtools.cram.mask.RefMaskUtils; +import htsjdk.samtools.cram.common.CramVersions; +import htsjdk.samtools.cram.common.Version; +import htsjdk.samtools.cram.encoding.readfeatures.BaseQualityScore; +import htsjdk.samtools.cram.encoding.readfeatures.Deletion; +import htsjdk.samtools.cram.encoding.readfeatures.HardClip; +import htsjdk.samtools.cram.encoding.readfeatures.InsertBase; +import htsjdk.samtools.cram.encoding.readfeatures.Padding; +import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature; +import htsjdk.samtools.cram.encoding.readfeatures.RefSkip; +import htsjdk.samtools.cram.encoding.readfeatures.SoftClip; +import htsjdk.samtools.cram.encoding.readfeatures.Substitution; import htsjdk.samtools.cram.structure.CramCompressionRecord; import htsjdk.samtools.cram.structure.ReadTag; import htsjdk.samtools.util.Log; @@ -51,26 +54,25 @@ public class Sam2CramRecordFactory { public static final String UNKNOWN_READ_GROUP_ID = "UNKNOWN"; public static final String UNKNOWN_READ_GROUP_SAMPLE = "UNKNOWN"; - public final static byte QS_asciiOffset = 33; + private final static byte QS_asciiOffset = 33; public final static byte unsetQualityScore = 32; public final static byte ignorePositionsWithQualityScore = -1; private byte[] refBases; + private final Version version; private byte[] refSNPs; - private RefMaskUtils.RefMask refPile; - private static Log log = Log.getInstance(Sam2CramRecordFactory.class); + private static final Log log = Log.getInstance(Sam2CramRecordFactory.class); - private Map readGroupMap = new HashMap(); + private final Map readGroupMap = new HashMap(); private long landedRefMaskScores = 0; - private long landedPiledScores = 0; private long landedTotalScores = 0; public boolean captureAllTags = false; public boolean preserveReadNames = false; - public Set captureTags = new TreeSet(); - public Set ignoreTags = new TreeSet(); + public final Set captureTags = new TreeSet(); + public final Set ignoreTags = new TreeSet(); { ignoreTags.add(SAMTag.NM.name()); @@ -78,52 +80,44 @@ public class Sam2CramRecordFactory { ignoreTags.add(SAMTag.RG.name()); } - public boolean losslessQS = false; - - private List readTagList = new ArrayList(); + private final List readTagList = new ArrayList(); private long baseCount = 0; private long 
featureCount = 0; - public Sam2CramRecordFactory(int samSequenceIndex, byte[] refBases, - SAMFileHeader samFileHeader) { + public Sam2CramRecordFactory(final byte[] refBases, final SAMFileHeader samFileHeader, final Version version) { this.refBases = refBases; + this.version = version; - List readGroups = samFileHeader.getReadGroups(); + final List readGroups = samFileHeader.getReadGroups(); for (int i = 0; i < readGroups.size(); i++) { - SAMReadGroupRecord readGroupRecord = readGroups.get(i); + final SAMReadGroupRecord readGroupRecord = readGroups.get(i); readGroupMap.put(readGroupRecord.getId(), i); } - } - public CramCompressionRecord createCramRecord(SAMRecord record) { - CramCompressionRecord cramRecord = new CramCompressionRecord(); + public CramCompressionRecord createCramRecord(final SAMRecord record) { + final CramCompressionRecord cramRecord = new CramCompressionRecord(); if (record.getReadPairedFlag()) { cramRecord.mateAlignmentStart = record.getMateAlignmentStart(); - cramRecord.setMateUmapped(record.getMateUnmappedFlag()); - cramRecord - .setMateNegativeStrand(record.getMateNegativeStrandFlag()); + cramRecord.setMateUnmapped(record.getMateUnmappedFlag()); + cramRecord.setMateNegativeStrand(record.getMateNegativeStrandFlag()); cramRecord.mateSequenceID = record.getMateReferenceIndex(); - } else - cramRecord.mateSequenceID = -1; + } else cramRecord.mateSequenceID = -1; cramRecord.sequenceId = record.getReferenceIndex(); cramRecord.readName = record.getReadName(); cramRecord.alignmentStart = record.getAlignmentStart(); cramRecord.setMultiFragment(record.getReadPairedFlag()); - cramRecord.setProperPair(record.getReadPairedFlag() - && record.getProperPairFlag()); + cramRecord.setProperPair(record.getReadPairedFlag() && record.getProperPairFlag()); cramRecord.setSegmentUnmapped(record.getReadUnmappedFlag()); cramRecord.setNegativeStrand(record.getReadNegativeStrandFlag()); - cramRecord.setFirstSegment(record.getReadPairedFlag() - && record.getFirstOfPairFlag()); - cramRecord.setLastSegment(record.getReadPairedFlag() - && record.getSecondOfPairFlag()); + cramRecord.setFirstSegment(record.getReadPairedFlag() && record.getFirstOfPairFlag()); + cramRecord.setLastSegment(record.getReadPairedFlag() && record.getSecondOfPairFlag()); cramRecord.setSecondaryAlignment(record.getNotPrimaryAlignmentFlag()); - cramRecord.setVendorFiltered(record - .getReadFailsVendorQualityCheckFlag()); + cramRecord.setVendorFiltered(record.getReadFailsVendorQualityCheckFlag()); cramRecord.setDuplicate(record.getDuplicateReadFlag()); + cramRecord.setSupplementary(record.getSupplementaryAlignmentFlag()); cramRecord.readLength = record.getReadLength(); cramRecord.mappingQuality = record.getMappingQuality(); @@ -131,80 +125,64 @@ public CramCompressionRecord createCramRecord(SAMRecord record) { cramRecord.templateSize = record.getInferredInsertSize(); - SAMReadGroupRecord readGroup = record.getReadGroup(); - if (readGroup != null) - cramRecord.readGroupID = readGroupMap.get(readGroup.getId()); - else - cramRecord.readGroupID = -1; + final SAMReadGroupRecord readGroup = record.getReadGroup(); + if (readGroup != null) cramRecord.readGroupID = readGroupMap.get(readGroup.getId()); + else cramRecord.readGroupID = -1; - if (!record.getReadPairedFlag()) - cramRecord.setLastSegment(false); + if (!record.getReadPairedFlag()) cramRecord.setLastSegment(false); else { - if (record.getFirstOfPairFlag()) - cramRecord.setLastSegment(false); - else if (record.getSecondOfPairFlag()) - cramRecord.setLastSegment(true); - else - 
cramRecord.setLastSegment(true); + if (record.getFirstOfPairFlag()) cramRecord.setLastSegment(false); + else if (record.getSecondOfPairFlag()) cramRecord.setLastSegment(true); } - if (!record.getReadUnmappedFlag() - && record.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { - List features = checkedCreateVariations(cramRecord, - record); - cramRecord.readFeatures = features; - } else - cramRecord.readFeatures = Collections.emptyList(); + if (!record.getReadUnmappedFlag() && record.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START) { + cramRecord.readFeatures = checkedCreateVariations(cramRecord, record); + } else cramRecord.readFeatures = Collections.emptyList(); cramRecord.readBases = record.getReadBases(); cramRecord.qualityScores = record.getBaseQualities(); landedTotalScores += cramRecord.readLength; + if (version.compatibleWith(CramVersions.CRAM_v3)) + cramRecord.setUnknownBases(record.getReadBases() == SAMRecord.NULL_SEQUENCE); readTagList.clear(); if (captureAllTags) { - List attributes = record.getAttributes(); - for (SAMTagAndValue tv : attributes) { - if (ignoreTags.contains(tv.tag)) - continue; - readTagList.add(ReadTag.deriveTypeFromValue(tv.tag, tv.value)); + final List attributes = record.getAttributes(); + for (final SAMTagAndValue tagAndValue : attributes) { + if (ignoreTags.contains(tagAndValue.tag)) continue; + readTagList.add(ReadTag.deriveTypeFromValue(tagAndValue.tag, tagAndValue.value)); } } else { if (!captureTags.isEmpty()) { - List attributes = record.getAttributes(); + final List attributes = record.getAttributes(); cramRecord.tags = new ReadTag[attributes.size()]; - for (SAMTagAndValue tv : attributes) { - if (captureTags.contains(tv.tag)) { - readTagList.add(ReadTag.deriveTypeFromValue(tv.tag, - tv.value)); + for (final SAMTagAndValue tagAndValue : attributes) { + if (captureTags.contains(tagAndValue.tag)) { + readTagList.add(ReadTag.deriveTypeFromValue(tagAndValue.tag, tagAndValue.value)); } } } } - cramRecord.tags = (ReadTag[]) readTagList - .toArray(new ReadTag[readTagList.size()]); + cramRecord.tags = readTagList.toArray(new ReadTag[readTagList.size()]); - cramRecord.setVendorFiltered(record - .getReadFailsVendorQualityCheckFlag()); + cramRecord.setVendorFiltered(record.getReadFailsVendorQualityCheckFlag()); - if (preserveReadNames) - cramRecord.readName = record.getReadName(); + if (preserveReadNames) cramRecord.readName = record.getReadName(); return cramRecord; } /** - * A wrapper method to provide better diagnostics for - * ArrayIndexOutOfBoundsException. + * A wrapper method to provide better diagnostics for ArrayIndexOutOfBoundsException. 
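The wrapping idea described above (catch, log context, rethrow) is generic; a minimal standalone sketch follows, using java.util.logging and a hypothetical helper rather than the actual htsjdk types.

import java.util.Arrays;
import java.util.logging.Logger;

final class DiagnosticWrapperSketch {
    private static final Logger log = Logger.getLogger(DiagnosticWrapperSketch.class.getName());

    // Wraps the real work so an out-of-bounds failure carries the input that caused it.
    static int checkedPick(final int[] values) {
        try {
            return pick(values);
        } catch (final ArrayIndexOutOfBoundsException e) {
            log.severe("Offensive input: " + Arrays.toString(values));
            throw e; // rethrow so the caller still sees the original failure
        }
    }

    private static int pick(final int[] values) {
        return values[3]; // throws ArrayIndexOutOfBoundsException for arrays shorter than 4
    }
}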
* - * @param cramRecord - * @param samRecord - * @return + * @param cramRecord CRAM record + * @param samRecord SAM record + * @return a list of read features created for the given {@link htsjdk.samtools.SAMRecord} */ - private List checkedCreateVariations( - CramCompressionRecord cramRecord, SAMRecord samRecord) { + private List checkedCreateVariations(final CramCompressionRecord cramRecord, final SAMRecord samRecord) { try { return createVariations(cramRecord, samRecord); - } catch (ArrayIndexOutOfBoundsException e) { + } catch (final ArrayIndexOutOfBoundsException e) { log.error("Reference bases array length=" + refBases.length); log.error("Offensive CRAM record: " + cramRecord.toString()); log.error("Offensive SAM record: " + samRecord.getSAMString()); @@ -212,65 +190,61 @@ private List checkedCreateVariations( } } - private List createVariations( - CramCompressionRecord cramRecord, SAMRecord samRecord) { - List features = new LinkedList(); + private List createVariations(final CramCompressionRecord cramRecord, final SAMRecord samRecord) { + final List features = new LinkedList(); int zeroBasedPositionInRead = 0; int alignmentStartOffset = 0; - int cigarElementLength = 0; + int cigarElementLength; + + final List cigarElements = samRecord.getCigar().getCigarElements(); - List cigarElements = samRecord.getCigar() - .getCigarElements(); + int cigarLen = 0; + for (final CigarElement cigarElement : cigarElements) + if (cigarElement.getOperator().consumesReadBases()) + cigarLen += cigarElement.getLength(); byte[] bases = samRecord.getReadBases(); - byte[] qualityScore = samRecord.getBaseQualities(); + if (bases.length == 0) { + bases = new byte[cigarLen]; + Arrays.fill(bases, (byte) 'N'); + } + final byte[] qualityScore = samRecord.getBaseQualities(); - for (CigarElement cigarElement : cigarElements) { + for (final CigarElement cigarElement : cigarElements) { cigarElementLength = cigarElement.getLength(); - CigarOperator operator = cigarElement.getOperator(); + final CigarOperator operator = cigarElement.getOperator(); switch (operator) { case D: - features.add(new Deletion(zeroBasedPositionInRead + 1, - cigarElementLength)); + features.add(new Deletion(zeroBasedPositionInRead + 1, cigarElementLength)); break; case N: - features.add(new RefSkip(zeroBasedPositionInRead + 1, - cigarElementLength)); + features.add(new RefSkip(zeroBasedPositionInRead + 1, cigarElementLength)); break; case P: - features.add(new Padding(zeroBasedPositionInRead + 1, - cigarElementLength)); + features.add(new Padding(zeroBasedPositionInRead + 1, cigarElementLength)); break; case H: - features.add(new HardClip(zeroBasedPositionInRead + 1, - cigarElementLength)); + features.add(new HardClip(zeroBasedPositionInRead + 1, cigarElementLength)); break; case S: - addSoftClip(features, zeroBasedPositionInRead, - cigarElementLength, bases, qualityScore); + addSoftClip(features, zeroBasedPositionInRead, cigarElementLength, bases); break; case I: - addInsertion(features, zeroBasedPositionInRead, - cigarElementLength, bases, qualityScore); + addInsertion(features, zeroBasedPositionInRead, cigarElementLength, bases); break; case M: case X: case EQ: - addSubstitutionsAndMaskedBases(cramRecord, features, - zeroBasedPositionInRead, alignmentStartOffset, + addSubstitutionsAndMaskedBases(cramRecord, features, zeroBasedPositionInRead, alignmentStartOffset, cigarElementLength, bases, qualityScore); break; default: - throw new IllegalArgumentException( - "Unsupported cigar operator: " - + cigarElement.getOperator()); + throw new 
IllegalArgumentException("Unsupported cigar operator: " + cigarElement.getOperator()); } - if (cigarElement.getOperator().consumesReadBases()) - zeroBasedPositionInRead += cigarElementLength; - if (cigarElement.getOperator().consumesReferenceBases()) - alignmentStartOffset += cigarElementLength; + if (cigarElement.getOperator().consumesReadBases()) zeroBasedPositionInRead += cigarElementLength; + if (cigarElement.getOperator().consumesReferenceBases()) alignmentStartOffset += cigarElementLength; } this.baseCount += bases.length; @@ -279,94 +253,73 @@ private List createVariations( return features; } - private void addSoftClip(List features, - int zeroBasedPositionInRead, int cigarElementLength, byte[] bases, - byte[] scores) { - byte[] insertedBases = Arrays.copyOfRange(bases, - zeroBasedPositionInRead, zeroBasedPositionInRead - + cigarElementLength); + private void addSoftClip(final List features, final int zeroBasedPositionInRead, final int cigarElementLength, final byte[] bases) { + final byte[] insertedBases = Arrays.copyOfRange(bases, zeroBasedPositionInRead, zeroBasedPositionInRead + cigarElementLength); + + final SoftClip softClip = new SoftClip(zeroBasedPositionInRead + 1, insertedBases); + features.add(softClip); + } + + private void addHardClip(final List features, final int zeroBasedPositionInRead, final int cigarElementLength, final byte[] bases) { + final byte[] insertedBases = Arrays.copyOfRange(bases, zeroBasedPositionInRead, zeroBasedPositionInRead + cigarElementLength); - SoftClip v = new SoftClip(zeroBasedPositionInRead + 1, insertedBases); - features.add(v); + final HardClip hardClip = new HardClip(zeroBasedPositionInRead + 1, insertedBases.length); + features.add(hardClip); } - private void addInsertion(List features, - int zeroBasedPositionInRead, int cigarElementLength, byte[] bases, - byte[] scores) { - byte[] insertedBases = Arrays.copyOfRange(bases, - zeroBasedPositionInRead, zeroBasedPositionInRead - + cigarElementLength); + private void addInsertion(final List features, final int zeroBasedPositionInRead, final int cigarElementLength, final byte[] bases) { + final byte[] insertedBases = Arrays.copyOfRange(bases, zeroBasedPositionInRead, zeroBasedPositionInRead + cigarElementLength); + for (int i = 0; i < insertedBases.length; i++) { // single base insertion: - InsertBase ib = new InsertBase(); - ib.setPosition(zeroBasedPositionInRead + 1 + i); - ib.setBase(insertedBases[i]); - features.add(ib); + final InsertBase insertBase = new InsertBase(); + insertBase.setPosition(zeroBasedPositionInRead + 1 + i); + insertBase.setBase(insertedBases[i]); + features.add(insertBase); } } - private void addSubstitutionsAndMaskedBases( - CramCompressionRecord cramRecord, List features, - int fromPosInRead, int alignmentStartOffset, int nofReadBases, - byte[] bases, byte[] qualityScore) { + private void addSubstitutionsAndMaskedBases(final CramCompressionRecord cramRecord, final List features, final int fromPosInRead, final int + alignmentStartOffset, final int nofReadBases, final byte[] bases, final byte[] qualityScore) { int oneBasedPositionInRead; - boolean noQS = (qualityScore.length == 0); + final boolean noQS = (qualityScore.length == 0); - int i = 0; - boolean qualityAdded = false; + int i; + boolean qualityAdded; byte refBase; for (i = 0; i < nofReadBases; i++) { oneBasedPositionInRead = i + fromPosInRead + 1; - int refCoord = (int) (cramRecord.alignmentStart + i + alignmentStartOffset) - 1; + final int referenceCoordinates = cramRecord.alignmentStart + i + 
alignmentStartOffset - 1; qualityAdded = false; - if (refCoord >= refBases.length) - refBase = 'N'; - else - refBase = refBases[refCoord]; + if (referenceCoordinates >= refBases.length) refBase = 'N'; + else refBase = refBases[referenceCoordinates]; refBase = Utils.normalizeBase(refBase); if (bases[i + fromPosInRead] != refBase) { - Substitution sv = new Substitution(); - sv.setPosition(oneBasedPositionInRead); - sv.setBase(bases[i + fromPosInRead]); - sv.setRefernceBase(refBase); - sv.setBaseChange(null); + final Substitution substitution = new Substitution(); + substitution.setPosition(oneBasedPositionInRead); + substitution.setBase(bases[i + fromPosInRead]); + substitution.setReferenceBase(refBase); - features.add(sv); + features.add(substitution); - if (losslessQS || noQS) - continue; + if (noQS) continue; } - if (noQS) - continue; + if (noQS) continue; - if (!qualityAdded && refSNPs != null) { - byte snpOrNot = refSNPs[refCoord]; + if (refSNPs != null) { + final byte snpOrNot = refSNPs[referenceCoordinates]; if (snpOrNot != 0) { - byte score = (byte) (QS_asciiOffset + qualityScore[i - + fromPosInRead]); - features.add(new BaseQualityScore(oneBasedPositionInRead, - score)); + final byte score = (byte) (QS_asciiOffset + qualityScore[i + fromPosInRead]); + features.add(new BaseQualityScore(oneBasedPositionInRead, score)); qualityAdded = true; landedRefMaskScores++; } } - if (!qualityAdded && refPile != null) { - if (refPile.shouldStore(refCoord, refBase)) { - byte score = (byte) (QS_asciiOffset + qualityScore[i - + fromPosInRead]); - features.add(new BaseQualityScore(oneBasedPositionInRead, - score)); - qualityAdded = true; - landedPiledScores++; - } - } - - if (qualityAdded) - landedTotalScores++; + if (qualityAdded) landedTotalScores++; } } @@ -374,10 +327,6 @@ public long getLandedRefMaskScores() { return landedRefMaskScores; } - public long getLandedPiledScores() { - return landedPiledScores; - } - public long getLandedTotalScores() { return landedTotalScores; } @@ -386,7 +335,7 @@ public byte[] getRefBases() { return refBases; } - public void setRefBases(byte[] refBases) { + public void setRefBases(final byte[] refBases) { this.refBases = refBases; } @@ -394,21 +343,14 @@ public byte[] getRefSNPs() { return refSNPs; } - public void setRefSNPs(byte[] refSNPs) { + public void setRefSNPs(final byte[] refSNPs) { this.refSNPs = refSNPs; } - public RefMaskUtils.RefMask getRefPile() { - return refPile; - } - public Map getReadGroupMap() { return readGroupMap; } - public void setRefPile(RefMaskUtils.RefMask refPile) { - this.refPile = refPile; - } public long getBaseCount() { return baseCount; diff --git a/src/java/htsjdk/samtools/cram/build/Utils.java b/src/java/htsjdk/samtools/cram/build/Utils.java index 1b13aba31f..60efc3b9d5 100644 --- a/src/java/htsjdk/samtools/cram/build/Utils.java +++ b/src/java/htsjdk/samtools/cram/build/Utils.java @@ -1,50 +1,52 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.build; class Utils { - /** - * CRAM operates with upper case bases, so both read and ref bases should be - * upper-cased and vocab controlled. This method does exactly this: upper - * case acgt and replace everything else with N. - * - * @param base - * @return - */ - public static final byte normalizeBase(byte base) { - switch (base) { - case 'a': - case 'A': - return 'A'; + /** + * CRAM operates with upper case bases, so both read and ref bases should be + * upper-cased and vocab controlled. This method does exactly this: upper + * case acgt and replace everything else with N. + * + * @param base a base to normalize + * @return a normalized base + */ + public static byte normalizeBase(final byte base) { + switch (base) { + case 'a': + case 'A': + return 'A'; - case 'c': - case 'C': - return 'C'; + case 'c': + case 'C': + return 'C'; - case 'g': - case 'G': - return 'G'; + case 'g': + case 'G': + return 'G'; - case 't': - case 'T': - return 'T'; + case 't': + case 'T': + return 'T'; - default: - return 'N'; - } - } + default: + return 'N'; + } + } } diff --git a/src/java/htsjdk/samtools/cram/common/CramVersionPolicies.java b/src/java/htsjdk/samtools/cram/common/CramVersionPolicies.java new file mode 100644 index 0000000000..fa1eafb2ef --- /dev/null +++ b/src/java/htsjdk/samtools/cram/common/CramVersionPolicies.java @@ -0,0 +1,24 @@ +package htsjdk.samtools.cram.common; + +import htsjdk.samtools.util.Log; + +/** + * The class provides version-dependant rules and policies for CRAM data. + */ +public class CramVersionPolicies { + private static final Log log = Log.getInstance(CramVersionPolicies.class); + + /** + * The method holds the behaviour for when the EOF marker is not found. Depending on the CRAM version this will be ignored, a warning + * issued or an exception produced. 
+ * + * @param version CRAM version to assume + */ + public static void eofNotFound(final Version version) { + if (version.compatibleWith(CramVersions.CRAM_v3)) { + log.error("Incomplete data: EOF marker not found."); + throw new RuntimeException("EOF not found."); + } + if (version.compatibleWith(CramVersions.CRAM_v2_1)) log.warn("EOF marker not found, possibly incomplete file/stream."); + } +} diff --git a/src/java/htsjdk/samtools/cram/common/CramVersions.java b/src/java/htsjdk/samtools/cram/common/CramVersions.java index 2cec0df9f0..913c2d40df 100644 --- a/src/java/htsjdk/samtools/cram/common/CramVersions.java +++ b/src/java/htsjdk/samtools/cram/common/CramVersions.java @@ -2,6 +2,6 @@ public class CramVersions { - public static final Version CRAM_v2_1 = new Version(2, 1, 0); - public static final Version CRAM_v3 = new Version(3, 0, 0); + public static final Version CRAM_v2_1 = new Version(2, 1, 0); + public static final Version CRAM_v3 = new Version(3, 0, 0); } diff --git a/src/java/htsjdk/samtools/cram/common/IntHashMap.java b/src/java/htsjdk/samtools/cram/common/IntHashMap.java index 3b920d54cd..f2bca063a7 100644 --- a/src/java/htsjdk/samtools/cram/common/IntHashMap.java +++ b/src/java/htsjdk/samtools/cram/common/IntHashMap.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.common; /* @@ -34,402 +36,380 @@ /* * Note: originally released under the GNU LGPL v2.1, - * but rereleased by the original author under the ASF license (above). + * but re-released by the original author under the ASF license (above). */ /** *

* A hash map that uses primitive ints for the key rather than objects. *

- * + *

*

* Note that this class is for internal optimization purposes only, and may not * be supported in future releases of Apache Commons Lang. Utilities of this * sort may be included in future releases of Apache Commons Collections. *
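As a reading aid, a hedged usage sketch of the map below; it assumes the class takes a single type parameter for its values, as the put/get signatures in this diff suggest. The snippet is not part of the change itself.

import htsjdk.samtools.cram.common.IntHashMap;

final class IntHashMapUsageSketch {
    public static void main(final String[] args) {
        // assumes IntHashMap<T> with put(int, T), get(int), containsKey(int), remove(int), size()
        final IntHashMap<String> groups = new IntHashMap<String>();
        groups.put(7, "read-group-7");
        System.out.println(groups.containsKey(7)); // true
        System.out.println(groups.get(7));         // read-group-7
        groups.remove(7);
        System.out.println(groups.size());         // 0
    }
}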

- * + * * @author Justin Couch * @author Alex Chaffee (alex@apache.org) * @author Stephen Colebourne - * @since 2.0 * @version $Revision: 561230 $ * @see java.util.HashMap + * @since 2.0 */ public class IntHashMap { - /** - * The hash table data. - */ - private transient Entry table[]; - - /** - * The total number of entries in the hash table. - */ - private transient int count; - - /** - * The table is rehashed when its size exceeds this threshold. (The value of - * this field is (int)(capacity * loadFactor).) - * - * @serial - */ - private int threshold; - - /** - * The load factor for the hashtable. - * - * @serial - */ - private float loadFactor; - - /** - *

- * Innerclass that acts as a datastructure to create a new entry in the - * table. - *

- */ - private static class Entry { - int hash; - int key; - E value; - Entry next; - - /** - *

- * Create a new entry with the given values. - *

- * - * @param hash - * The code used to hash the object with - * @param key - * The key used to enter this in the table - * @param value - * The value for this key - * @param next - * A reference to the next entry in the table - */ - protected Entry(int hash, int key, E value, Entry next) { - this.hash = hash; - this.key = key; - this.value = value; - this.next = next; - } - } - - /** - *

- * Constructs a new, empty hashtable with a default capacity and load - * factor, which is 20 and 0.75 respectively. - *

- */ - public IntHashMap() { - this(20, 0.75f); - } - - /** - *

- * Constructs a new, empty hashtable with the specified initial capacity and - * default load factor, which is 0.75. - *

- * - * @param initialCapacity - * the initial capacity of the hashtable. - * @throws IllegalArgumentException - * if the initial capacity is less than zero. - */ - public IntHashMap(int initialCapacity) { - this(initialCapacity, 0.75f); - } - - /** - *

- * Constructs a new, empty hashtable with the specified initial capacity and - * the specified load factor. - *

- * - * @param initialCapacity - * the initial capacity of the hashtable. - * @param loadFactor - * the load factor of the hashtable. - * @throws IllegalArgumentException - * if the initial capacity is less than zero, or if the load - * factor is nonpositive. - */ - public IntHashMap(int initialCapacity, float loadFactor) { - super(); - if (initialCapacity < 0) { - throw new IllegalArgumentException("Illegal Capacity: " - + initialCapacity); - } - if (loadFactor <= 0) { - throw new IllegalArgumentException("Illegal Load: " + loadFactor); - } - if (initialCapacity == 0) { - initialCapacity = 1; - } - - this.loadFactor = loadFactor; - table = new Entry[initialCapacity]; - threshold = (int) (initialCapacity * loadFactor); - } - - /** - *

- * Returns the number of keys in this hashtable. - *

- * - * @return the number of keys in this hashtable. - */ - public int size() { - return count; - } - - /** - *

- * Tests if this hashtable maps no keys to values. - *

- * - * @return true if this hashtable maps no keys to values; - * false otherwise. - */ - public boolean isEmpty() { - return count == 0; - } - - /** - *

- * Tests if some key maps into the specified value in this hashtable. This - * operation is more expensive than the containsKey method. - *

- * - *

- * Note that this method is identical in functionality to containsValue, - * (which is part of the Map interface in the collections framework). - *

- * - * @param value - * a value to search for. - * @return true if and only if some key maps to the - * value argument in this hashtable as determined by - * the equals method; false otherwise. - * @throws NullPointerException - * if the value is null. - * @see #containsKey(int) - * @see #containsValue(Object) - * @see java.util.Map - */ - public boolean contains(Object value) { - if (value == null) { - throw new NullPointerException(); - } - - Entry tab[] = table; - for (int i = tab.length; i-- > 0;) { - for (Entry e = tab[i]; e != null; e = e.next) { - if (e.value.equals(value)) { - return true; - } - } - } - return false; - } - - /** - *

- * Returns true if this HashMap maps one or more keys to this - * value. - *

- * - *

- * Note that this method is identical in functionality to contains (which - * predates the Map interface). - *

- * - * @param value - * value whose presence in this HashMap is to be tested. - * @return boolean true if the value is contained - * @see java.util.Map - * @since JDK1.2 - */ - public boolean containsValue(Object value) { - return contains(value); - } - - /** - *

- * Tests if the specified object is a key in this hashtable. - *

- * - * @param key - * possible key. - * @return true if and only if the specified object is a key in - * this hashtable, as determined by the equals method; - * false otherwise. - * @see #contains(Object) - */ - public boolean containsKey(int key) { - Entry tab[] = table; - int hash = key; - int index = (hash & 0x7FFFFFFF) % tab.length; - for (Entry e = tab[index]; e != null; e = e.next) { - if (e.hash == hash) { - return true; - } - } - return false; - } - - /** - *

- * Returns the value to which the specified key is mapped in this map. - *

- * - * @param key - * a key in the hashtable. - * @return the value to which the key is mapped in this hashtable; - * null if the key is not mapped to any value in this - * hashtable. - * @see #put(int, Object) - */ - public T get(int key) { - Entry tab[] = table; - int hash = key; - int index = (hash & 0x7FFFFFFF) % tab.length; - for (Entry e = tab[index]; e != null; e = e.next) { - if (e.hash == hash) { - return e.value; - } - } - return null; - } - - /** - *

- * Increases the capacity of and internally reorganizes this hashtable, in - * order to accommodate and access its entries more efficiently. - *

- * - *

- * This method is called automatically when the number of keys in the - * hashtable exceeds this hashtable's capacity and load factor. - *

- */ - protected void rehash() { - int oldCapacity = table.length; - Entry oldMap[] = table; - - int newCapacity = oldCapacity * 2 + 1; - Entry newMap[] = new Entry[newCapacity]; - - threshold = (int) (newCapacity * loadFactor); - table = newMap; - - for (int i = oldCapacity; i-- > 0;) { - for (Entry old = oldMap[i]; old != null;) { - Entry e = old; - old = old.next; - - int index = (e.hash & 0x7FFFFFFF) % newCapacity; - e.next = newMap[index]; - newMap[index] = e; - } - } - } - - /** - *

- * Maps the specified key to the specified value - * in this hashtable. The key cannot be null. - *

- * - *

- * The value can be retrieved by calling the get method with a - * key that is equal to the original key. - *

- * - * @param key - * the hashtable key. - * @param value - * the value. - * @return the previous value of the specified key in this hashtable, or - * null if it did not have one. - * @throws NullPointerException - * if the key is null. - * @see #get(int) - */ - public Object put(int key, T value) { - // Makes sure the key is not already in the hashtable. - Entry tab[] = table; - int hash = key; - int index = (hash & 0x7FFFFFFF) % tab.length; - for (Entry e = tab[index]; e != null; e = e.next) { - if (e.hash == hash) { - Object old = e.value; - e.value = value; - return old; - } - } - - if (count >= threshold) { - // Rehash the table if the threshold is exceeded - rehash(); - - tab = table; - index = (hash & 0x7FFFFFFF) % tab.length; - } - - // Creates the new entry. - Entry e = new Entry(hash, key, value, tab[index]); - tab[index] = e; - count++; - return null; - } - - /** - *

- * Removes the key (and its corresponding value) from this hashtable. - *

- * - *

- * This method does nothing if the key is not present in the hashtable. - *

- * - * @param key - * the key that needs to be removed. - * @return the value to which the key had been mapped in this hashtable, or - * null if the key did not have a mapping. - */ - public Object remove(int key) { - Entry tab[] = table; - int hash = key; - int index = (hash & 0x7FFFFFFF) % tab.length; - for (Entry e = tab[index], prev = null; e != null; prev = e, e = e.next) { - if (e.hash == hash) { - if (prev != null) { - prev.next = e.next; - } else { - tab[index] = e.next; - } - count--; - Object oldValue = e.value; - e.value = null; - return oldValue; - } - } - return null; - } - - /** - *

- * Clears this hashtable so that it contains no keys. - *

- */ - public synchronized void clear() { - Entry tab[] = table; - for (int index = tab.length; --index >= 0;) { - tab[index] = null; - } - count = 0; - } + /** + * The hash table data. + */ + private transient Entry table[]; + + /** + * The total number of entries in the hash table. + */ + private transient int count; + + /** + * The table is rehashed when its size exceeds this threshold. (The value of + * this field is (int)(capacity * loadFactor).) + * + * @serial + */ + private int threshold; + + /** + * The load factor for the hashtable. + * + * @serial + */ + private float loadFactor; + + /** + *

+ * Inner class that acts as a data structure to create a new entry in the + * table. + *

+ */ + private static class Entry { + final int hash; + final int key; + E value; + Entry next; + + /** + *

+ * Create a new entry with the given values. + *

+ * + * @param hash The code used to hash the object with + * @param key The key used to enter this in the table + * @param value The value for this key + * @param next A reference to the next entry in the table + */ + protected Entry(final int hash, final int key, final E value, final Entry next) { + this.hash = hash; + this.key = key; + this.value = value; + this.next = next; + } + } + + /** + *

+ * Constructs a new, empty hashtable with a default capacity and load + * factor, which are 20 and 0.75 respectively. + *

+ */ + public IntHashMap() { + this(20, 0.75f); + } + + /** + *

+ * Constructs a new, empty hashtable with the specified initial capacity and + * default load factor, which is 0.75. + *

+ * + * @param initialCapacity the initial capacity of the hashtable. + * @throws IllegalArgumentException if the initial capacity is less than zero. + */ + public IntHashMap(final int initialCapacity) { + this(initialCapacity, 0.75f); + } + + /** + *

+ * Constructs a new, empty hashtable with the specified initial capacity and + * the specified load factor. + *

+ * + * @param initialCapacity the initial capacity of the hashtable. + * @param loadFactor the load factor of the hashtable. + * @throws IllegalArgumentException if the initial capacity is less than zero, or if the load + * factor is non-positive. + */ + public IntHashMap(int initialCapacity, final float loadFactor) { + super(); + if (initialCapacity < 0) { + throw new IllegalArgumentException("Illegal Capacity: " + + initialCapacity); + } + if (loadFactor <= 0) { + throw new IllegalArgumentException("Illegal Load: " + loadFactor); + } + if (initialCapacity == 0) { + initialCapacity = 1; + } + + this.loadFactor = loadFactor; + table = new Entry[initialCapacity]; + threshold = (int) (initialCapacity * loadFactor); + } + + /** + *

+ * Returns the number of keys in this hashtable. + *

+ * + * @return the number of keys in this hashtable. + */ + public int size() { + return count; + } + + /** + *

+ * Tests if this hashtable maps no keys to values. + *

+ * + * @return true if this hashtable maps no keys to values; + * false otherwise. + */ + public boolean isEmpty() { + return count == 0; + } + + /** + *

+ * Tests if some key maps into the specified value in this hashtable. This + * operation is more expensive than the containsKey method. + *

+ *

+ *

+ * Note that this method is identical in functionality to containsValue + * (which is part of the Map interface in the collections framework). + *

+ * + * @param value a value to search for. + * @return true if and only if some key maps to the + * value argument in this hashtable as determined by + * the equals method; false otherwise. + * @throws NullPointerException if the value is null. + * @see #containsKey(int) + * @see #containsValue(Object) + * @see java.util.Map + */ + public boolean contains(final Object value) { + if (value == null) { + throw new NullPointerException(); + } + + final Entry[] tab = table; + for (int i = tab.length; i-- > 0; ) { + for (Entry e = tab[i]; e != null; e = e.next) { + if (e.value.equals(value)) { + return true; + } + } + } + return false; + } + + /** + *

+ * Returns true if this HashMap maps one or more keys to this + * value. + *

+ *

+ *

+ * Note that this method is identical in functionality to contains (which + * predates the Map interface). + *

+ * + * @param value value whose presence in this HashMap is to be tested. + * @return boolean true if the value is contained + * @see java.util.Map + * @since JDK1.2 + */ + public boolean containsValue(final Object value) { + return contains(value); + } + + /** + *

+ * Tests if the specified object is a key in this hashtable. + *

+ * + * @param key possible key. + * @return true if and only if the specified object is a key in + * this hashtable, as determined by the equals method; + * false otherwise. + * @see #contains(Object) + */ + public boolean containsKey(final int key) { + final Entry[] tab = table; + final int index = (key & 0x7FFFFFFF) % tab.length; + for (Entry entry = tab[index]; entry != null; entry = entry.next) { + if (entry.hash == key) { + return true; + } + } + return false; + } + + /** + *

+ * Returns the value to which the specified key is mapped in this map. + *

+ * + * @param key a key in the hashtable. + * @return the value to which the key is mapped in this hashtable; + * null if the key is not mapped to any value in this + * hashtable. + * @see #put(int, Object) + */ + public T get(final int key) { + final Entry[] tab = table; + final int index = (key & 0x7FFFFFFF) % tab.length; + for (Entry e = tab[index]; e != null; e = e.next) { + if (e.hash == key) { + return e.value; + } + } + return null; + } + + /** + *

+ * Increases the capacity of and internally reorganizes this hashtable, in + * order to accommodate and access its entries more efficiently. + *

+ *

+ *

+ * This method is called automatically when the number of keys in the + * hashtable exceeds this hashtable's capacity and load factor. + *

+ */ + protected void rehash() { + final int oldCapacity = table.length; + final Entry[] oldMap = table; + + final int newCapacity = oldCapacity * 2 + 1; + final Entry[] newMap = new Entry[newCapacity]; + + threshold = (int) (newCapacity * loadFactor); + table = newMap; + + for (int i = oldCapacity; i-- > 0; ) { + for (Entry old = oldMap[i]; old != null; ) { + final Entry entry = old; + old = old.next; + + final int index = (entry.hash & 0x7FFFFFFF) % newCapacity; + entry.next = newMap[index]; + newMap[index] = entry; + } + } + } + + /** + *

+ * Maps the specified key to the specified value + * in this hashtable. The key cannot be null. + *

+ *

+ *

+ * The value can be retrieved by calling the get method with a + * key that is equal to the original key. + *

+ * + * @param key the hashtable key. + * @param value the value. + * @return the previous value of the specified key in this hashtable, or + * null if it did not have one. + * @throws NullPointerException if the key is null. + * @see #get(int) + */ + public Object put(final int key, final T value) { + // Makes sure the key is not already in the hashtable. + Entry tab[] = table; + int index = (key & 0x7FFFFFFF) % tab.length; + for (Entry entry = tab[index]; entry != null; entry = entry.next) { + if (entry.hash == key) { + final Object old = entry.value; + entry.value = value; + return old; + } + } + + if (count >= threshold) { + // Rehash the table if the threshold is exceeded + rehash(); + + tab = table; + index = (key & 0x7FFFFFFF) % tab.length; + } + + // Creates the new entry. + final Entry entry = new Entry(key, key, value, tab[index]); + tab[index] = entry; + count++; + return null; + } + + /** + *

+ * Removes the key (and its corresponding value) from this hashtable. + *

+ *

+ *

+ * This method does nothing if the key is not present in the hashtable. + *

+ * + * @param key the key that needs to be removed. + * @return the value to which the key had been mapped in this hashtable, or + * null if the key did not have a mapping. + */ + public Object remove(final int key) { + final Entry[] tab = table; + final int index = (key & 0x7FFFFFFF) % tab.length; + for (Entry entry = tab[index], prev = null; entry != null; prev = entry, entry = entry.next) { + if (entry.hash == key) { + if (prev != null) { + prev.next = entry.next; + } else { + tab[index] = entry.next; + } + count--; + final Object oldValue = entry.value; + entry.value = null; + return oldValue; + } + } + return null; + } + + /** + *

+ * Clears this hashtable so that it contains no keys. + *

+ */ + public synchronized void clear() { + final Entry[] tab = table; + for (int index = tab.length; --index >= 0; ) { + tab[index] = null; + } + count = 0; + } } diff --git a/src/java/htsjdk/samtools/cram/common/MutableInt.java b/src/java/htsjdk/samtools/cram/common/MutableInt.java index 6820cfbaa0..4b3d4ff846 100644 --- a/src/java/htsjdk/samtools/cram/common/MutableInt.java +++ b/src/java/htsjdk/samtools/cram/common/MutableInt.java @@ -1,24 +1,27 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
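A minimal usage sketch of the int-keyed hash map whose methods appear above. The class name (IntHashMap) and its no-argument constructor are assumptions for illustration only, since the class declaration and fields sit outside the hunks shown here; the method signatures match the ones in this diff.

    // Sketch only: assumes the surrounding class is an int-keyed map with a String
    // value type, here called IntHashMap<String>, and a no-arg constructor.
    final IntHashMap<String> names = new IntHashMap<String>();

    names.put(5, "chr5");                 // returns null: the key had no previous mapping
    final Object old = names.put(5, "5"); // returns "chr5", the value being replaced

    if (names.containsKey(5)) {
        System.out.println(names.get(5)); // prints "5"
    }

    names.remove(5);                      // returns "5" and unlinks the entry from its bucket
    System.out.println(names.get(5));     // prints "null": the key is no longer mapped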

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.common; public class MutableInt { - public int value = 0; - @Override - public int hashCode() { - return value ; - } + public int value = 0; + + @Override + public int hashCode() { + return value; + } } diff --git a/src/java/htsjdk/samtools/cram/common/NonforgivingPrintStream.java b/src/java/htsjdk/samtools/cram/common/NonforgivingPrintStream.java deleted file mode 100644 index dab840226e..0000000000 --- a/src/java/htsjdk/samtools/cram/common/NonforgivingPrintStream.java +++ /dev/null @@ -1,78 +0,0 @@ -package htsjdk.samtools.cram.common; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.OutputStream; -import java.io.PrintStream; -import java.io.UnsupportedEncodingException; - -public class NonforgivingPrintStream extends PrintStream { - - public NonforgivingPrintStream(File file) throws FileNotFoundException { - super(file); - } - - public NonforgivingPrintStream(File file, String csn) throws FileNotFoundException, UnsupportedEncodingException { - super(file, csn); - } - - public NonforgivingPrintStream(OutputStream out, boolean autoFlush, String encoding) - throws UnsupportedEncodingException { - super(out, autoFlush, encoding); - } - - public NonforgivingPrintStream(OutputStream out, boolean autoFlush) { - super(out, autoFlush); - } - - public NonforgivingPrintStream(OutputStream out) { - super(out); - } - - public NonforgivingPrintStream(String fileName, String csn) throws FileNotFoundException, - UnsupportedEncodingException { - super(fileName, csn); - } - - public NonforgivingPrintStream(String fileName) throws FileNotFoundException { - super(fileName); - } - - @Override - public void write(byte[] b) throws IOException { - if (checkError()) - throw new PrintStreamError(this); - super.write(b); - } - - @Override - public void write(int b) { - if (checkError()) - throw new PrintStreamError(this); - super.write(b); - } - - @Override - public void write(byte[] buf, int off, int len) { - if (checkError()) - throw new PrintStreamError(this); - super.write(buf, off, len); - } - - public static class PrintStreamError extends RuntimeException { - private transient PrintStream printStream; - - public PrintStream getPrintStream() { - return printStream; - } - - public void setPrintStream(PrintStream printStream) { - this.printStream = printStream; - } - - public PrintStreamError(PrintStream ps) { - super(); - } - } -} diff --git a/src/java/htsjdk/samtools/cram/common/NullOutputStream.java b/src/java/htsjdk/samtools/cram/common/NullOutputStream.java deleted file mode 100644 index 07462eb6a5..0000000000 --- a/src/java/htsjdk/samtools/cram/common/NullOutputStream.java +++ /dev/null @@ -1,11 +0,0 @@ -package htsjdk.samtools.cram.common; - -import java.io.IOException; -import java.io.OutputStream; - -public class NullOutputStream extends OutputStream { - @Override - public void write(int b) throws IOException { - ; // - } -} \ No newline at end of file diff --git a/src/java/htsjdk/samtools/cram/common/Version.java 
b/src/java/htsjdk/samtools/cram/common/Version.java index 7507b0dd90..d20ecf9fc1 100644 --- a/src/java/htsjdk/samtools/cram/common/Version.java +++ b/src/java/htsjdk/samtools/cram/common/Version.java @@ -1,43 +1,63 @@ package htsjdk.samtools.cram.common; +/** + * A class to represent a version information, 3 number: major, minor and build number. + */ public class Version implements Comparable { - public final int major; - public final int minor; - public final int build; - - public Version(int major, int minor, int build) { - this.major = major; - this.minor = minor; - this.build = build; - } - - public Version(String version) { - String[] numbers = version.split("[\\.\\-b]"); - major = Integer.valueOf(numbers[0]); - minor = Integer.valueOf(numbers[1]); - if (numbers.length > 3) - build = Integer.valueOf(numbers[3]); - else - build = 0; - } - - @Override - public String toString() { - if (build > 0) - return String.format("%d.%d-b%d", major, minor, build); - else - return String.format("%d.%d", major, minor); - } - - @Override - public int compareTo(Version o) { - if (major - o.major != 0) - return major - o.major; - if (minor - o.minor != 0) - return minor - o.minor; - - if (build < 1 || o.build < 1) - return 0; - return build - o.build; - } + public final int major; + public final int minor; + private final int build; + + public Version(final int major, final int minor, final int build) { + this.major = major; + this.minor = minor; + this.build = build; + } + + public Version(final String version) { + final String[] numbers = version.split("[\\.\\-b]"); + major = Integer.valueOf(numbers[0]); + minor = Integer.valueOf(numbers[1]); + if (numbers.length > 3) build = Integer.valueOf(numbers[3]); + else build = 0; + } + + @Override + public String toString() { + if (build > 0) return String.format("%d.%d-b%d", major, minor, build); + else return String.format("%d.%d", major, minor); + } + + /** + * Compare with another version. + * + * @param o another version + * @return 0 if both versions are the same, a negative if the other version is higher and a positive otherwise. + */ + @Override + public int compareTo(@SuppressWarnings("NullableProblems") final Version o) { + if (o == null) return -1; + if (major - o.major != 0) return major - o.major; + if (minor - o.minor != 0) return minor - o.minor; + + if (build < 1 || o.build < 1) return 0; + return build - o.build; + } + + /** + * Check if another version is exactly the same as this one. + * + * @param obj another version object + * @return true if both versions are the same, false otherwise. 
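The comparison rules introduced here are easy to misread: compareTo() ignores the build number whenever either side has none, and equals() compares only major and minor. A short sketch of the resulting behaviour, using only the constructors and methods visible in this hunk:

    final Version v21   = new Version("2.1");     // parsed as major=2, minor=1, build=0
    final Version v21b5 = new Version("2.1-b5");  // parsed as major=2, minor=1, build=5
    final Version v30   = new Version(3, 0, 0);

    System.out.println(v30.compareTo(v21) > 0);    // true: 3.0 sorts after 2.1
    System.out.println(v21.compareTo(v21b5));      // 0: build is ignored because one side has build < 1
    System.out.println(v21.equals(v21b5));         // true: equals() checks major and minor only
    System.out.println(v21b5.compatibleWith(v30)); // false: 2.1-b5 compares below 3.0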
+ */ + @Override + public boolean equals(final Object obj) { + if (obj == null || !(obj instanceof Version)) return false; + final Version version = (Version) obj; + return major == version.major && minor == version.minor; + } + + public boolean compatibleWith(final Version version) { + return compareTo(version) >= 0; + } } \ No newline at end of file diff --git a/src/java/htsjdk/samtools/cram/digest/AbstractSerialDigest.java b/src/java/htsjdk/samtools/cram/digest/AbstractSerialDigest.java new file mode 100644 index 0000000000..f1a2f18fc8 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/digest/AbstractSerialDigest.java @@ -0,0 +1,26 @@ +package htsjdk.samtools.cram.digest; + +abstract class AbstractSerialDigest { + private final Combine combine; + T value; + + AbstractSerialDigest(final Combine combine, final T value) { + this.combine = combine; + this.value = value; + } + + protected abstract void resetAndUpdate(byte[] data); + + protected abstract T getValue(); + + protected abstract byte[] asByteArray(); + + void add(final byte[] data) { + resetAndUpdate(data); + final T updateValue = getValue(); + if (value == null) + value = updateValue; + else + value = combine.combine(value, updateValue); + } +} diff --git a/src/java/htsjdk/samtools/cram/digest/ByteSumCombine.java b/src/java/htsjdk/samtools/cram/digest/ByteSumCombine.java new file mode 100644 index 0000000000..25dd77f8e6 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/digest/ByteSumCombine.java @@ -0,0 +1,12 @@ +package htsjdk.samtools.cram.digest; + +class ByteSumCombine implements Combine { + + @Override + public byte[] combine(final byte[] state, final byte[] update) { + for (int i = 0; i < state.length; i++) + state[i] += update[i]; + return state; + } + +} diff --git a/src/java/htsjdk/samtools/cram/digest/Combine.java b/src/java/htsjdk/samtools/cram/digest/Combine.java new file mode 100644 index 0000000000..4868127318 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/digest/Combine.java @@ -0,0 +1,6 @@ +package htsjdk.samtools.cram.digest; + +interface Combine { + + T combine(T state, T update); +} diff --git a/src/java/htsjdk/samtools/cram/digest/ContentDigests.java b/src/java/htsjdk/samtools/cram/digest/ContentDigests.java new file mode 100644 index 0000000000..bbfe1ed70c --- /dev/null +++ b/src/java/htsjdk/samtools/cram/digest/ContentDigests.java @@ -0,0 +1,217 @@ +package htsjdk.samtools.cram.digest; + +import htsjdk.samtools.SAMBinaryTagAndValue; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMTagUtil; +import htsjdk.samtools.cram.structure.CramCompressionRecord; +import htsjdk.samtools.util.Log; + +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.LinkedList; +import java.util.List; + +public class ContentDigests { + public static final EnumSet ALL = EnumSet + .allOf(KNOWN_DIGESTS.class); + public static final EnumSet CRC32 = EnumSet.of( + KNOWN_DIGESTS.BD, KNOWN_DIGESTS.SD); + + private static final Log log = Log.getInstance(ContentDigests.class); + private List digesters = new LinkedList(); + + public static ContentDigests create(final EnumSet requestedDigests) { + final List digesters = new LinkedList(); + for (final KNOWN_DIGESTS digest : requestedDigests) + digesters.add(digest.createDigester()); + return new ContentDigests(digesters); + } + + public static ContentDigests create(final SAMBinaryTagAndValue binaryTags) { + final List digesters = new LinkedList(); + SAMBinaryTagAndValue binaryTag = 
binaryTags; + while (binaryTag != null) { + final String tagID = SAMTagUtil.getSingleton().makeStringTag( + binaryTag.tag); + final KNOWN_DIGESTS hash; + try { + hash = KNOWN_DIGESTS.valueOf(tagID); + digesters.add(hash.createDigester()); + } catch (final IllegalArgumentException e) { + // The tag is not one of the known content digest tags. + } + binaryTag = binaryTag.getNext(); + } + return new ContentDigests(digesters); + } + + private ContentDigests(final List hashers) { + this.digesters = hashers; + } + + void add(final SAMRecord record) { + for (final Digester digester : digesters) + digester.add(record); + } + + public void add(final CramCompressionRecord record) { + for (final Digester digester : digesters) + digester.addCramRecord(record); + } + + public void addSAMRecords(final Iterable records) { + for (final SAMRecord record : records) + add(record); + } + + public void addCramRecords(final Iterable records) { + for (final CramCompressionRecord record : records) + add(record); + } + + public SAMBinaryTagAndValue getAsTags() { + SAMBinaryTagAndValue tag = null; + for (final Digester digester : digesters) { + if (tag == null) + tag = digester.toTag(); + else + tag = tag.insert(digester.toTag()); + } + + return tag; + } + + public boolean test(final SAMBinaryTagAndValue tags) { + for (final Digester digester : digesters) { + final SAMBinaryTagAndValue foundTag = tags.find(digester.tagCode); + if (foundTag == null) + continue; + + if (!(foundTag.value instanceof byte[])) + throw new RuntimeException("Expecting a byte array but got: " + + foundTag.value.getClass().getName()); + + final byte[] expected = (byte[]) foundTag.value; + final byte[] actual = digester.digest.asByteArray(); + if (!Arrays.equals(expected, actual)) { + final String expectedString = toHexString(expected); + final String actualString = toHexString(actual); + log.error(String + .format("Content hash mismatch for tag %s, actual: %s; expected: %s", + digester.tagID, actualString, expectedString)); + return false; + } else + log.debug("Content digest ok: " + digester.tagID); + } + return true; + } + + private static String toHex(final byte[] bytes) { + final StringBuilder sb = new StringBuilder(); + for (final byte t : bytes) { + sb.append(String.format("%02x", (0xFF & t)).toUpperCase()).append( + ' '); + } + return sb.toString(); + } + + private static String toHexString(final byte[] bytes) { + return toHex(bytes).replace(" ", ""); + } + + private static class Digester { + final AbstractSerialDigest digest; + final SERIES series; + final String tagID; + final short tagCode; + + Digester(final AbstractSerialDigest digest, final SERIES series, final String tagID) { + this.digest = digest; + this.series = series; + this.tagID = tagID; + this.tagCode = SAMTagUtil.getSingleton().makeBinaryTag(tagID); + } + + void add(final SAMRecord record) { + digest.add(series.getBytes(record)); + } + + void addCramRecord(final CramCompressionRecord record) { + digest.add(series.getBytes(record)); + } + + SAMBinaryTagAndValue toTag() { + return new SAMBinaryTagAndValue(tagCode, digest.asByteArray()); + } + } + + public enum KNOWN_DIGESTS { + BD { + @Override + Digester createDigester() { + return new Digester(new Crc32Hasher(new IntegerSumCombine()), + SERIES.BASES, name()); + } + }, + SD { + @Override + Digester createDigester() { + return new Digester(new Crc32Hasher(new IntegerSumCombine()), + SERIES.SCORES, name()); + } + }, + B5 { + @Override + Digester createDigester() { + try { + return new Digester(new MessageDigestHasher( + 
MessageDigest.getInstance("SHA-512"), + new ByteSumCombine(), null), SERIES.BASES, name()); + } catch (final NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + } + }, + S5 { + @Override + Digester createDigester() { + try { + return new Digester(new MessageDigestHasher( + MessageDigest.getInstance("SHA-512"), + new ByteSumCombine(), null), SERIES.SCORES, name()); + } catch (final NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + } + }, + B1 { + @Override + Digester createDigester() { + try { + return new Digester(new MessageDigestHasher( + MessageDigest.getInstance("SHA-1"), + new ByteSumCombine(), null), SERIES.BASES, name()); + } catch (final NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + } + }, + S1 { + @Override + Digester createDigester() { + try { + return new Digester(new MessageDigestHasher( + MessageDigest.getInstance("SHA-1"), + new ByteSumCombine(), null), SERIES.SCORES, name()); + } catch (final NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + } + }; + + abstract Digester createDigester(); + + } +} diff --git a/src/java/htsjdk/samtools/cram/digest/Crc32Hasher.java b/src/java/htsjdk/samtools/cram/digest/Crc32Hasher.java new file mode 100644 index 0000000000..464e2209a1 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/digest/Crc32Hasher.java @@ -0,0 +1,42 @@ +package htsjdk.samtools.cram.digest; + +import java.nio.ByteOrder; +import java.util.zip.CRC32; + +class Crc32Hasher extends AbstractSerialDigest { + private final CRC32 crc32 = new CRC32(); + private final ByteOrder byteOrder = ByteOrder.LITTLE_ENDIAN; + + Crc32Hasher(final Combine combine) { + super(combine, null); + } + + @Override + protected void resetAndUpdate(final byte[] data) { + crc32.reset(); + crc32.update(data); + } + + @Override + protected Integer getValue() { + return (int) (crc32.getValue() & 0xFFFFFFFFL); + } + + @Override + protected byte[] asByteArray() { + final byte[] array = new byte[4]; + if (byteOrder == ByteOrder.LITTLE_ENDIAN) { + array[3] = (byte) ((value >>> 24) & 0xFF); + array[2] = (byte) ((value >>> 16) & 0xFF); + array[1] = (byte) ((value >>> 8) & 0xFF); + array[0] = (byte) ((value) & 0xFF); + } else { + array[0] = (byte) ((value >>> 24) & 0xFF); + array[1] = (byte) ((value >>> 16) & 0xFF); + array[2] = (byte) ((value >>> 8) & 0xFF); + array[3] = (byte) ((value) & 0xFF); + } + return array; + } + +} diff --git a/src/java/htsjdk/samtools/cram/digest/IntegerSumCombine.java b/src/java/htsjdk/samtools/cram/digest/IntegerSumCombine.java new file mode 100644 index 0000000000..d6b18e03d9 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/digest/IntegerSumCombine.java @@ -0,0 +1,10 @@ +package htsjdk.samtools.cram.digest; + +class IntegerSumCombine implements Combine { + + @Override + public Integer combine(final Integer state, final Integer update) { + return state + update; + } + +} diff --git a/src/java/htsjdk/samtools/cram/digest/MessageDigestHasher.java b/src/java/htsjdk/samtools/cram/digest/MessageDigestHasher.java new file mode 100644 index 0000000000..e5f8f353a5 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/digest/MessageDigestHasher.java @@ -0,0 +1,30 @@ +package htsjdk.samtools.cram.digest; + +import java.security.MessageDigest; + +class MessageDigestHasher extends AbstractSerialDigest { + private final MessageDigest messageDigest; + + MessageDigestHasher(final MessageDigest messageDigest, final Combine combine, + final byte[] value) { + super(combine, value); + this.messageDigest = messageDigest; + } + + 
@Override + protected void resetAndUpdate(final byte[] data) { + messageDigest.reset(); + messageDigest.update(data); + } + + @Override + protected byte[] getValue() { + return messageDigest.digest(); + } + + @Override + protected byte[] asByteArray() { + return messageDigest.digest(); + } + +} diff --git a/src/java/htsjdk/samtools/cram/digest/SERIES.java b/src/java/htsjdk/samtools/cram/digest/SERIES.java new file mode 100644 index 0000000000..82ce6f63bd --- /dev/null +++ b/src/java/htsjdk/samtools/cram/digest/SERIES.java @@ -0,0 +1,34 @@ +package htsjdk.samtools.cram.digest; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.cram.structure.CramCompressionRecord; + +enum SERIES { + BASES { + @Override + byte[] getBytes(final SAMRecord record) { + return record.getReadBases(); + } + + @Override + byte[] getBytes(final CramCompressionRecord record) { + return record.readBases; + } + }, + SCORES { + @Override + byte[] getBytes(final SAMRecord record) { + return record.getBaseQualities(); + } + + @Override + byte[] getBytes(final CramCompressionRecord record) { + return record.qualityScores; + } + }; + + abstract byte[] getBytes(SAMRecord record); + + abstract byte[] getBytes(CramCompressionRecord record); + +} \ No newline at end of file diff --git a/src/java/htsjdk/samtools/cram/encoding/AbstractBitCodec.java b/src/java/htsjdk/samtools/cram/encoding/AbstractBitCodec.java index b10bb20c3d..a15c893b2c 100644 --- a/src/java/htsjdk/samtools/cram/encoding/AbstractBitCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/AbstractBitCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
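Taken together, the new digest package computes order-insensitive content checks over two data series, read bases and quality scores: each record is hashed on its own (CRC32 or a MessageDigest) and the per-record results are folded into a running value by a Combine (integer or byte-wise sum). A hedged sketch of the intended round trip, using only the public ContentDigests API visible in this diff; the records variable stands for any Iterable of SAMRecord and is not constructed here.

    // Writer side: compute BD/SD (CRC32-based) digests while records are emitted.
    final ContentDigests writerDigests = ContentDigests.create(ContentDigests.CRC32);
    writerDigests.addSAMRecords(records);
    final SAMBinaryTagAndValue digestTags = writerDigests.getAsTags(); // BD/SD tag chain

    // Reader side: recompute only the digests named by the stored tags and compare.
    final ContentDigests readerDigests = ContentDigests.create(digestTags);
    readerDigests.addSAMRecords(records);
    final boolean matches = readerDigests.test(digestTags); // logs and returns false on mismatch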

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -22,33 +24,33 @@ public abstract class AbstractBitCodec implements BitCodec { - @Override - public abstract T read(BitInputStream bis) throws IOException; + @Override + public abstract T read(BitInputStream bitInputStream) throws IOException; - @Override - public abstract T read(BitInputStream bis, int valueLen) throws IOException; + @Override + public abstract T read(BitInputStream bitInputStream, int valueLen) throws IOException; - @Override - public void readInto(BitInputStream bis, byte[] array, int offset, - int valueLen) throws IOException { - throw new RuntimeException("Not implemented."); - } + @Override + public void readInto(final BitInputStream bitInputStream, final byte[] array, final int offset, + final int valueLen) throws IOException { + throw new RuntimeException("Not implemented."); + } - @Override - public void skip(BitInputStream bis) throws IOException { - read(bis); - } + @Override + public void skip(final BitInputStream bitInputStream) throws IOException { + read(bitInputStream); + } - @Override - public void skip(BitInputStream bis, int len) throws IOException { - read(bis, len); - } + @Override + public void skip(final BitInputStream bitInputStream, final int length) throws IOException { + read(bitInputStream, length); + } - @Override - public abstract long write(BitOutputStream bos, T object) - throws IOException; + @Override + public abstract long write(BitOutputStream bitOutputStream, T object) + throws IOException; - @Override - public abstract long numberOfBits(T object); + @Override + public abstract long numberOfBits(T object); } diff --git a/src/java/htsjdk/samtools/cram/encoding/ArithCodec.java b/src/java/htsjdk/samtools/cram/encoding/ArithCodec.java deleted file mode 100644 index af8a4cc501..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/ArithCodec.java +++ /dev/null @@ -1,314 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.common.NullOutputStream; -import htsjdk.samtools.cram.io.BitInputStream; -import htsjdk.samtools.cram.io.BitOutputStream; -import htsjdk.samtools.cram.io.DefaultBitOutputStream; -import htsjdk.samtools.util.Log; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; - -public class ArithCodec extends AbstractBitCodec { - private byte curBit = 0; - private int curByte = 0; - private double min = 0; - private double max = 1; - private double localMin = 0; - private double localMax = 1; - - private final int TERMINATOR = 256; - private double[] probs; - private int[] map, rev_map; - private long bitCount; - - private ByteArrayOutputStream baos; - private ArrayList fileData; - - public ArithCodec(int[] freqs, int[] map) { - // build expanded map ------------------------------ - this.map = new int[257]; // ASCII + end character - Arrays.fill(this.map, -1); - for (int i = 0; i < map.length; i++) - this.map[map[i]] = i; - this.map[this.TERMINATOR] = map.length; - - // copy collapsed map, plus end character ---------- - this.rev_map = new int[map.length + 1]; - System.arraycopy(map, 0, this.rev_map, 0, map.length); - this.rev_map[map.length] = this.TERMINATOR; - - // build probability table from frequency count ---- - this.probs = new double[freqs.length + 1]; - int total = 0, endCharCount = 0; - for (int i = 0; i < freqs.length; i++) - total += freqs[i]; - endCharCount = (total / 100) > 0 ? (total / 100) : (total / 10); - total += endCharCount; - int t = 0; - for (int i = 0; i < freqs.length; i++) { - t += freqs[i]; - this.probs[i] = (double) t / (double) total; - } - this.probs[this.probs.length - 1] = 1.0; - - // initialize byte stream -------------------------- - this.baos = new ByteArrayOutputStream(2 * 215000000); - this.fileData = new ArrayList(); - } - - /* - * Reading and expanding a bit stream based on given frequency count - */ - @Override - public byte[] read(BitInputStream bis) throws IOException { - this.baos.reset(); - this.fileData.clear(); - curBit = 0; - curByte = 0; - min = 0; - max = 1; - localMin = 0; - localMax = 1; - - int read = decodeCharacter(bis); - while (read != this.map[this.TERMINATOR]) { - this.baos.write(this.rev_map[read]); - read = decodeCharacter(bis); - } - - return this.baos.toByteArray(); - } - - public int decodeCharacter(BitInputStream bis) throws IOException { - double tempMin = min; - double tempMax = max; - byte tempBit = curBit; - int tempByte = curByte; - int val = 0; - if (this.fileData.isEmpty()) - fileData.add(bis.readBits(8)); - while (true) { - double cur = (min + max) / 2.0; - val = -1; - for (int i = 0; i < probs.length; i++) { - if (probs[i] > min) { - if (probs[i] > max) - val = i; - break; - } - } - if (val == -1) { - boolean bit = false; - if ((fileData.get(curByte) & (128 >> curBit)) != 0) - bit = true; - if (bit) - min = cur; - else - max = cur; - curBit++; - if (curBit == 8) { - curBit = 0; - curByte++; - if (curByte > fileData.size() - 1) { - try { - fileData.add(bis.readBits(8)); - } catch (Throwable t) { - fileData.add(0); - } - } - } - } else - break; - } - min = tempMin; - max = tempMax; - curBit = tempBit; - curByte = tempByte; - while (true) { - double cur = (min + max) / 2.0; - int temp = 0; - for (; temp < probs.length; temp++) - if (probs[temp] > cur) - break; - if (cur < 0 || cur > 1) - temp = -1; - if (temp 
!= val) { - boolean bit = false; - if ((fileData.get(curByte) & (128 >> curBit)) != 0) - bit = true; - if (bit) - min = cur; - else - max = cur; - curBit++; - if (curBit == 8) { - curBit = 0; - curByte++; - if (curByte > fileData.size() - 1) - try { - fileData.add(bis.readBits(8)); - } catch (Throwable t) { - fileData.add(0); - } - } - } else { - tempMin = 0; - if (val > 0) - tempMin = probs[val - 1]; - double factor = 1.0 / (probs[val] - tempMin); - min = factor * (min - tempMin); - max = factor * (max - tempMin); - break; - } - } - return val; - } - - /* - * Write compressed output to a bit stream - */ - @Override - public long write(BitOutputStream bos, byte[] object) throws IOException { - this.baos.reset(); - curBit = 0; - curByte = 0; - min = 0; - max = 1; - localMin = 0; - localMax = 1; - this.bitCount = 0; - - try { - for (int i = 0; i < object.length; i++) - encodeCharacter(bos, this.map[object[i] & 0xFF]); - encodeCharacter(bos, this.map[this.TERMINATOR]); - encodeCharacter(bos, this.map[this.TERMINATOR]); - flush(bos); - } catch (Exception ex) { - Log.getInstance(getClass()).error(ex); - } - - return this.bitCount; - } - - private void encodeCharacter(BitOutputStream bos, int character) - throws Exception { - if (probs.length < 2 || probs[probs.length - 1] != 1 || character < 0 - || character >= probs.length) - throw new Exception("Invalid input"); - if (character > 0) - localMin = probs[character - 1]; - else - localMin = 0; - localMax = probs[character]; - while (true) { - double cur = (min + max) / 2.0; - if (cur < localMin) { - curByte |= (128 >> curBit); // set bit = 1, left-to-right - curBit++; - if (curBit == 8) { - bos.write(curByte, 8); - curByte = 0; // byte containing bits to be written - curBit = 0; // bit-position, left-to-right - this.bitCount += 8; - } - min = cur; // wrote 1 (go higher) adjust min - } else if (cur >= localMax) { - curBit++; - if (curBit == 8) { - bos.write(curByte, 8); - curByte = 0; - curBit = 0; - this.bitCount += 8; - } - max = cur; // wrote 0 (go lower) adjust max - } else { - double factor = 1.0 / (localMax - localMin); - min = factor * (min - localMin); - max = factor * (max - localMin); - break; - } - } - } - - private void flush(BitOutputStream bos) throws IOException { - if (curBit != 0) { - while (true) { - while (true) { - double cur = (min + max) / 2.0; - double mid = (localMin + localMax) / 2.0; - if (cur < mid) { - curByte |= (128 >> curBit); - min = cur; - } else - max = cur; - curBit++; - if (curBit == 8) { - bos.write(curByte, 8); - curByte = 0; - curBit = 0; - this.bitCount += 8; - break; - } - } - double cur = (min + max) / 2.0; - if (cur >= localMin && cur < localMax) - break; - } - } - bos.close(); - } - - /* - * Compress and count bits in the end - */ - @Override - public long numberOfBits(byte[] object) { - NullOutputStream baos = new NullOutputStream(); - DefaultBitOutputStream nBos = new DefaultBitOutputStream(baos); - - this.baos.reset(); - curBit = 0; - curByte = 0; - min = 0; - max = 1; - localMin = 0; - localMax = 1; - this.bitCount = 0; - - try { - for (int i = 0; i < object.length; i++) - encodeCharacter(nBos, this.map[object[i] & 0xFF]); - encodeCharacter(nBos, this.map[this.TERMINATOR]); - encodeCharacter(nBos, this.map[this.TERMINATOR]); - flush(nBos); - } catch (Exception ex) { - Log.getInstance(ArithCodec.class).error(ex); - } - - return this.bitCount; - } - - @Override - public byte[] read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented."); - } -} diff --git 
a/src/java/htsjdk/samtools/cram/encoding/ArithCodec1.java b/src/java/htsjdk/samtools/cram/encoding/ArithCodec1.java deleted file mode 100644 index c15be49af2..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/ArithCodec1.java +++ /dev/null @@ -1,364 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.io.BitInputStream; -import htsjdk.samtools.cram.io.BitOutputStream; -import htsjdk.samtools.cram.io.DefaultBitOutputStream; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.util.ArrayList; - -/** - * - * @author Alexander Senf - */ -public class ArithCodec1 extends AbstractBitCodec { - private byte curBit = 0; - private int curByte = 0; - private double min = 0; - private double max = 1; - private double localMin = 0; - private double localMax = 1; - - private long bitCount; - - private ByteArrayOutputStream baos; - private ArrayList fileData; - - private int previous; - private double[][] freq; - - public ArithCodec1(int[] freqs, byte[][] map) { - // Data received as collapsed array of two characters, with assoc - // frequency distribution. Extract that into 2-dim array - long[] count = new long[257]; - freq = new double[257][256]; // x = prev, y = cur - for (int i = 0; i < 256; i++) { - freq[256][i] = 1; // Initialize counts for first-char of a sequence - count[256]++; // and count characters - } - for (int i = 0; i < freqs.length; i++) { - int x = map[i].length > 1 ? map[i][0] : 256; // coordinate of prev - // char - int y = map[i].length > 1 ? 
map[i][1] : map[i][0]; // coordinate of - // cur char - freq[x][y] = freqs[i]; // place symbol count in sparts array - count[x] += freqs[i]; // count total symbols by col (x fixed, all y) - } - - // turn into frequency/probability distribution (normalize column-wise - - // for each x) - for (int x = 0; x < freq.length; x++) { - double accum = 0; - for (int y = 0; y < freq[x].length; y++) { - if (count[x] > 0) { - accum += (freq[x][y] / (double) count[x]); - freq[x][y] = accum; - } - } - if (freq[x][freq[x].length - 1] != 1.0) - freq[x][freq[x].length - 1] = 1.0; - } - - this.baos = new ByteArrayOutputStream(); - this.fileData = new ArrayList(); - this.previous = 256; // case where no character has been seen yet - } - - /* - * Reading and expanding a bit stream based on given frequency count - */ - private Byte readByte(BitInputStream bis) throws IOException { - this.baos.reset(); - this.fileData.clear(); - curBit = 0; - curByte = 0; - min = 0; - max = 1; - localMin = 0; - localMax = 1; - previous = 256; - - int read = decodeCharacter(bis); - this.baos.write(read); - previous = read; - - return (byte) read; - } - - public byte[] read(BitInputStream bis, int length) throws IOException { - - this.baos.reset(); - this.fileData.clear(); - curBit = 0; - curByte = 0; - min = 0; - max = 1; - localMin = 0; - localMax = 1; - previous = 256; - - for (int i = 0; i < length; i++) { - int read = decodeCharacter(bis); - this.baos.write(read); - previous = read; - } - - System.out.println(fileData.size()); - System.out.println(curByte); - System.out.println(curBit); - int nofBits = 8 - curBit; -// int nofBits = (fileData.size() - curByte) * 8 + 8 - curBit; - System.out.println(nofBits); - int bits = fileData.get(fileData.size() - 1) ; -// int bits = (fileData.get(fileData.size() - 2) << 8) | fileData.get(fileData.size() - 1) ; - - bis.putBack(curBit, (bits >> curBit) & (((1 << nofBits) - 1))); - - return this.baos.toByteArray(); - } - - public int decodeCharacter(BitInputStream bis) throws IOException { - double tempMin = min; - double tempMax = max; - byte tempBit = curBit; - int tempByte = curByte; - int val = 256; - if (this.fileData.isEmpty()) - fileData.add(bis.readBits(8)); - double[] probs = freq[previous]; // get correct frequency distribution - while (true) { - double cur = (min + max) / 2.0; - val = -1; - for (int i = 0; i < probs.length; i++) { - if (probs[i] > min) { - if (probs[i] > max) - val = i; - break; - } - } - if (val == -1) { - boolean bit = false; - if ((fileData.get(curByte) & (128 >> curBit)) != 0) - bit = true; - if (bit) - min = cur; - else - max = cur; - curBit++; - if (curBit == 8) { - curBit = 0; - curByte++; - if (curByte > fileData.size() - 1) { - try { - fileData.add(bis.readBits(8)); - } catch (Throwable t) { - fileData.add(0); - } - } - } - } else - break; - } - min = tempMin; - max = tempMax; - curBit = tempBit; - curByte = tempByte; - while (true) { - double cur = (min + max) / 2.0; - int temp = 0; - for (; temp < probs.length; temp++) - if (probs[temp] > cur) - break; - if (cur < 0 || cur > 1) - temp = -1; - if (temp != val) { - boolean bit = false; - if ((fileData.get(curByte) & (128 >> curBit)) != 0) - bit = true; - if (bit) - min = cur; - else - max = cur; - curBit++; - if (curBit == 8) { - curBit = 0; - curByte++; - if (curByte > fileData.size() - 1) - try { - fileData.add(bis.readBits(8)); - } catch (Throwable t) { - fileData.add(0); - } - } - } else { - tempMin = 0; - if (val > 0) - tempMin = probs[val - 1]; - double factor = 1.0 / (probs[val] - tempMin); - min = 
factor * (min - tempMin); - max = factor * (max - tempMin); - break; - } - } - return val; - } - - /* - * Write compressed output to a bit stream - */ - public long write(BitOutputStream bos, byte[] object) throws IOException { - - this.baos.reset(); - curBit = 0; - curByte = 0; - min = 0; - max = 1; - localMin = 0; - localMax = 1; - this.bitCount = 0; - previous = 256; - - try { - for (int i = 0; i < object.length; i++) { - encodeCharacter(bos, object[i] & 0xFF); - previous = object[i] & 0xFF; - } - flush(bos); - } catch (Exception ex) { - throw new RuntimeException(ex); - } - - return this.bitCount; - } - - private void encodeCharacter(BitOutputStream bos, int character) throws Exception { - double[] prbs = freq[previous]; - - if (prbs.length < 2 || prbs[prbs.length - 1] != 1 || character < 0 || character >= prbs.length) - throw new Exception("Invalid input"); - if (character > 0) - localMin = prbs[character - 1]; - else - localMin = 0; - localMax = prbs[character]; - while (true) { - double cur = (min + max) / 2.0; - if (cur < localMin) { - curByte |= (128 >> curBit); // set bit = 1, left-to-right - curBit++; - if (curBit == 8) { - bos.write(curByte, 8); - curByte = 0; // byte containing bits to be written - curBit = 0; // bit-position, left-to-right - this.bitCount += 8; - } - min = cur; // wrote 1 (go higer) adjust min - } else if (cur >= localMax) { - curBit++; - if (curBit == 8) { - bos.write(curByte, 8); - curByte = 0; - curBit = 0; - this.bitCount += 8; - } - max = cur; // wrote 0 (go lower) adjust max - } else { - double factor = 1.0 / (localMax - localMin); - min = factor * (min - localMin); - max = factor * (max - localMin); - break; - } - } - } - - private void flush(BitOutputStream bos) throws IOException { - if (curBit != 0) { - while (true) { - while (true) { - double cur = (min + max) / 2.0; - double mid = (localMin + localMax) / 2.0; - if (cur < mid) { - curByte |= (128 >> curBit); - min = cur; - } else - max = cur; - curBit++; - if (curBit == 8) { - bos.write(curByte, 8); - curByte = 0; - curBit = 0; - this.bitCount += 8; - break; - } - } - double cur = (min + max) / 2.0; - if (cur >= localMin && cur < localMax) - break; - } - } - bos.close(); - } - - /* - * Compress and count bits in the end - */ - @Override - public long numberOfBits(byte[] object) { - NullOutputStream baos = new NullOutputStream(); - DefaultBitOutputStream nBos = new DefaultBitOutputStream(baos); - - this.baos.reset(); - curBit = 0; - curByte = 0; - min = 0; - max = 1; - localMin = 0; - localMax = 1; - this.bitCount = 0; - previous = 256; - - try { - for (int i = 0; i < object.length; i++) { - encodeCharacter(nBos, object[i] & 0xFF); - previous = object[i]; - } - flush(nBos); - } catch (Exception ex) { - ; - } - - return this.bitCount; - } - - /** Writes to nowhere */ - private class NullOutputStream extends OutputStream { - @Override - public void write(int b) throws IOException { - ; // - } - } - - @Override - public byte[] read(BitInputStream bis) throws IOException { - // TODO Auto-generated method stub - return null; - } - -} diff --git a/src/java/htsjdk/samtools/cram/encoding/BetaIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/BetaIntegerCodec.java index ae8a5cd742..496f5a63be 100644 --- a/src/java/htsjdk/samtools/cram/encoding/BetaIntegerCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/BetaIntegerCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * 
**************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -21,59 +23,40 @@ import java.io.IOException; -public class BetaIntegerCodec extends AbstractBitCodec { - private int offset = 0; - private int readNofBits; - - public BetaIntegerCodec(int offset, int readNofBits) { - this.offset = offset; - this.readNofBits = readNofBits; - } - - @Override - public final Integer read(BitInputStream bis) throws IOException { - return bis.readBits(readNofBits) - offset; - } - - @Override - public final long write(BitOutputStream bos, Integer value) throws IOException { -// if (value + offset < 0) -// throw new IllegalArgumentException("Value is less then offset: " + value); - - int nofBits = (int) numberOfBits(value); - long newValue = value + offset; - bos.write(newValue, nofBits); - return nofBits; - } - - @Override - public final long numberOfBits(Integer value) { - if (value > (1L << readNofBits)) - throw new IllegalArgumentException("Value written is bigger then allowed: value=" + value - + ", max nof bits=" + readNofBits); - - return readNofBits; - } - - public long getOffset() { - return offset; - } - - public void setOffset(int offset) { - this.offset = offset; - } - - public int getReadNofBits() { - return readNofBits; - } - - public void setReadNofBits(int readNofBits) { - this.readNofBits = readNofBits; - } - - @Override - public Integer read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented."); - } +class BetaIntegerCodec extends AbstractBitCodec { + private int offset = 0; + private final int readNofBits; + + public BetaIntegerCodec(final int offset, final int readNofBits) { + this.offset = offset; + this.readNofBits = readNofBits; + } + + @Override + public final Integer read(final BitInputStream bitInputStream) throws IOException { + return bitInputStream.readBits(readNofBits) - offset; + } + + @Override + public final long write(final BitOutputStream bitOutputStream, final Integer value) throws IOException { + final int nofBits = (int) numberOfBits(value); + final long newValue = value + offset; + bitOutputStream.write(newValue, nofBits); + return nofBits; + } + + @Override + public final long numberOfBits(final Integer value) { + if (value > (1L << readNofBits)) + throw new IllegalArgumentException("Value written is bigger then allowed: value=" + value + + ", max nof bits=" + readNofBits); + + return readNofBits; + } + + @Override + public Integer read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/BetaIntegerEncoding.java b/src/java/htsjdk/samtools/cram/encoding/BetaIntegerEncoding.java index d420d2f74f..486ae36190 100644 --- a/src/java/htsjdk/samtools/cram/encoding/BetaIntegerEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/BetaIntegerEncoding.java @@ -1,22 +1,24 @@ -/******************************************************************************* +/** + * 
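BetaIntegerCodec implements CRAM's "beta" encoding: every value occupies exactly readNofBits bits, and the configured offset shifts the value range so it starts at zero before the bits are written. A tiny arithmetic illustration of what write() and read() do with offset=1 and 4 bits; this is a sketch of the behaviour shown above, not additional library API.

    final int offset = 1;    // shifts the smallest representable value up to 0
    final int nofBits = 4;   // every value is stored in exactly 4 bits

    final int value = 7;
    final long stored = value + offset;        // 8 = binary 1000, emitted as 4 bits by write()
    final int decoded = (int) stored - offset; // read() reverses the shift: 8 - 1 = 7

    System.out.println(decoded == value);      // true
    // numberOfBits() rejects values above (1L << nofBits) with an IllegalArgumentException,
    // mirroring the guard in the class above.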
**************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; @@ -24,55 +26,53 @@ import java.nio.ByteBuffer; import java.util.Map; - - - public class BetaIntegerEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.BETA; - private int offset; - private int bitLimit; + private static final EncodingID ENCODING_ID = EncodingID.BETA; + private int offset; + private int bitLimit; - public BetaIntegerEncoding() { - } + public BetaIntegerEncoding() { + } - public BetaIntegerEncoding(int bitLimit) { - this.bitLimit = bitLimit; - } + public BetaIntegerEncoding(final int offset, final int bitLimit) { + this.offset = offset; + this.bitLimit = bitLimit; + } - @Override - public EncodingID id() { - return ENCODING_ID; - } + @Override + public EncodingID id() { + return ENCODING_ID; + } - public static EncodingParams toParam(int offset, int bitLimit) { - BetaIntegerEncoding e = new BetaIntegerEncoding(); - e.offset = offset; - e.bitLimit = bitLimit; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } + public static EncodingParams toParam(final int offset, final int bitLimit) { + final BetaIntegerEncoding encoding = new BetaIntegerEncoding(); + encoding.offset = offset; + encoding.bitLimit = bitLimit; + return new EncodingParams(ENCODING_ID, encoding.toByteArray()); + } - @Override - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(10); - ByteBufferUtils.writeUnsignedITF8(offset, buf); - ByteBufferUtils.writeUnsignedITF8(bitLimit, buf); - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } + @Override + public byte[] toByteArray() { + final ByteBuffer buffer = ByteBuffer.allocate(10); + ITF8.writeUnsignedITF8(offset, buffer); + ITF8.writeUnsignedITF8(bitLimit, buffer); + buffer.flip(); + final byte[] array = new byte[buffer.limit()]; + buffer.get(array); + return array; + } - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - offset = ByteBufferUtils.readUnsignedITF8(buf); - bitLimit = ByteBufferUtils.readUnsignedITF8(buf); - } + @Override + public void fromByteArray(final byte[] data) { + final ByteBuffer buffer = ByteBuffer.wrap(data); + offset = ITF8.readUnsignedITF8(buffer); + bitLimit = ITF8.readUnsignedITF8(buffer); + } - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new BetaIntegerCodec(offset, bitLimit); - } + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + return new BetaIntegerCodec(offset, bitLimit); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/BitCodec.java b/src/java/htsjdk/samtools/cram/encoding/BitCodec.java index 9967dc93bf..2920f11756 100644 --- a/src/java/htsjdk/samtools/cram/encoding/BitCodec.java +++ 
b/src/java/htsjdk/samtools/cram/encoding/BitCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
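BetaIntegerEncoding (above) persists its two parameters, offset and bitLimit, as ITF8 variable-length integers and restores them in fromByteArray(). A hedged round-trip sketch, assuming the EncodingParams.params field is accessible in the same way it is used elsewhere in this diff:

    // Serialize the parameters of a beta encoding: offset=1, 4 bits per value.
    final EncodingParams params = BetaIntegerEncoding.toParam(1, 4);

    // Restore them on the reading side; params.params holds the two ITF8-encoded integers.
    final BetaIntegerEncoding restored = new BetaIntegerEncoding();
    restored.fromByteArray(params.params);
    // restored.buildCodec(inputMap, outputMap) would now produce a BetaIntegerCodec(1, 4);
    // the two maps are ignored by this particular encoding.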

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -20,21 +22,76 @@ import java.io.IOException; +/** + * An interface that defines requirements for serializing/deserializing objects into and from a bit stream. + * + * @param <T> data series type to be read or written + * @noinspection UnnecessaryInterfaceModifier + */ public interface BitCodec<T> { - public T read(BitInputStream bis) throws IOException; + /** + * Read a single object from the bit stream. + * + * @param bitInputStream the bit input stream to read from + * @return an object from the stream + * @throws IOException as per java IO contract + */ + T read(BitInputStream bitInputStream) throws IOException; - public T read(BitInputStream bis, int valueLen) throws IOException; + /** + * Read an array of specified length from the bit stream. + * + * @param bitInputStream the bit input stream to read from + * @param valueLen the number of elements to read + * @return an object from the stream + * @throws IOException as per java IO contract + */ + T read(BitInputStream bitInputStream, int valueLen) throws IOException; - public void readInto(BitInputStream bis, byte[] array, int offset, - int valueLen) throws IOException; + /** + * Read an array of specified length from the bit stream into a given byte array. + * This method is a way to optimize byte array IO operations by bypassing abstraction. Leaky, I know. + * + * @param bitInputStream the bit input stream to read from + * @param array the array to read into + * @param offset offset in the array + * @param valueLen number of elements to read + * @throws IOException as per java IO contract + */ + void readInto(BitInputStream bitInputStream, byte[] array, int offset, + int valueLen) throws IOException; - public void skip(BitInputStream bis) throws IOException; + /** + * Skip the next object in the bit stream. + * @param bitInputStream the bit stream to operate on + * @throws IOException as per java IO contract + */ + void skip(BitInputStream bitInputStream) throws IOException; - public void skip(BitInputStream bis, int len) throws IOException; + /** + * Skip the next length objects in the bit stream.
+ * @param bitInputStream the bit stream to operate on + * @param length the number of objects to skip + * + * @throws IOException as per java IO contract + */ + void skip(BitInputStream bitInputStream, int length) throws IOException; - public long write(BitOutputStream bos, T object) throws IOException; + /** + * Write an object into the bit stream + * @param bitOutputStream the output bit stream to write to + * @param object the object to write + * @return the number of bits written out + * @throws IOException as per java IO contract + */ + long write(BitOutputStream bitOutputStream, T object) throws IOException; - public long numberOfBits(T object); + /** + * Calculate the number of bits that the object would take in bit serialized form. + * @param object an object + * @return the number of bits + */ + long numberOfBits(T object); } diff --git a/src/java/htsjdk/samtools/cram/encoding/ByteArrayLenEncoding.java b/src/java/htsjdk/samtools/cram/encoding/ByteArrayLenEncoding.java index 9326625bac..0c76a5b6ed 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ByteArrayLenEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/ByteArrayLenEncoding.java @@ -1,24 +1,26 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
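The contract documented above is deliberately small: a codec only needs symmetric read/write plus numberOfBits for size accounting, and AbstractBitCodec (earlier in this diff) already supplies default skip() and readInto() behaviour. As a hedged illustration only, a single-bit Boolean codec could look roughly like the following; the readBit() and write(boolean) calls on the bit streams are assumptions, not methods shown in this diff.

    // Illustration of the BitCodec contract; assumes the same imports as AbstractBitCodec
    // and that BitInputStream.readBit() / BitOutputStream.write(boolean) exist as
    // single-bit operations.
    class SingleBitCodec extends AbstractBitCodec<Boolean> {

        @Override
        public Boolean read(final BitInputStream bitInputStream) throws IOException {
            return bitInputStream.readBit();   // one bit in, one Boolean out
        }

        @Override
        public Boolean read(final BitInputStream bitInputStream, final int valueLen) throws IOException {
            throw new RuntimeException("Not implemented.");
        }

        @Override
        public long write(final BitOutputStream bitOutputStream, final Boolean object) throws IOException {
            bitOutputStream.write(object);     // emits a single bit
            return 1;                          // number of bits written
        }

        @Override
        public long numberOfBits(final Boolean object) {
            return 1;
        }
    }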

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; import htsjdk.samtools.cram.io.BitOutputStream; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; @@ -29,148 +31,116 @@ import java.util.Map; public class ByteArrayLenEncoding implements Encoding { - public final static EncodingID ID = EncodingID.BYTE_ARRAY_LEN; - Encoding lenEncoding; - Encoding byteEncoding; - - public ByteArrayLenEncoding() { - } - - @Override - public EncodingID id() { - return ID; - } - - public static EncodingParams toParam(EncodingParams lenParams, - EncodingParams byteParams) { - ByteArrayOutputStream baos = new ByteArrayOutputStream() ; - try { - baos.write((byte) lenParams.id.ordinal()); - ByteBufferUtils.writeUnsignedITF8(lenParams.params.length, baos); - baos.write(lenParams.params); - - baos.write((byte) byteParams.id.ordinal()); - ByteBufferUtils.writeUnsignedITF8(byteParams.params.length, baos); - baos.write(byteParams.params); - } catch (IOException e) { - throw new RuntimeException("It never happened. ") ; - } - return new EncodingParams(ID, baos.toByteArray()); -// ByteBuffer buf = ByteBuffer.allocate(1024); -// buf.put((byte) lenParams.id.ordinal()); -// ByteBufferUtils.writeUnsignedITF8(lenParams.params.length, buf); -// buf.put(lenParams.params); -// -// buf.put((byte) byteParams.id.ordinal()); -// ByteBufferUtils.writeUnsignedITF8(byteParams.params.length, buf); -// buf.put(byteParams.params); -// -// buf.flip(); -// byte[] data = new byte[buf.limit()]; -// buf.get(data); -// -// EncodingParams params = new EncodingParams(ID, data); -// return params; - } - - public byte[] toByteArray() { - ByteArrayOutputStream baos = new ByteArrayOutputStream() ; - try { - baos.write((byte) lenEncoding.id().ordinal()); - byte[] lenBytes = lenEncoding.toByteArray() ; - ByteBufferUtils.writeUnsignedITF8(lenBytes.length, baos); - baos.write(lenBytes); - - baos.write((byte) byteEncoding.id().ordinal()); - byte[] byteBytes = byteEncoding.toByteArray() ; - ByteBufferUtils.writeUnsignedITF8(byteBytes.length, baos); - baos.write(byteBytes); - } catch (IOException e) { - throw new RuntimeException("It never happened. 
") ; - } - return baos.toByteArray() ; - -// ByteBuffer buf = ByteBuffer.allocate(1024); -// buf.put((byte) lenEncoding.id().ordinal()); -// byte[] lenBytes = lenEncoding.toByteArray(); -// ByteBufferUtils.writeUnsignedITF8(lenBytes.length, buf); -// buf.put(lenBytes); -// -// buf.put((byte) byteEncoding.id().ordinal()); -// byte[] byteBytes = lenEncoding.toByteArray(); -// ByteBufferUtils.writeUnsignedITF8(byteBytes.length, buf); -// buf.put(byteBytes); -// -// buf.flip(); -// byte[] array = new byte[buf.limit()]; -// buf.get(array); -// -// return array; - } - - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - - EncodingFactory f = new EncodingFactory(); - - EncodingID lenID = EncodingID.values()[buf.get()]; - lenEncoding = f.createEncoding(DataSeriesType.INT, lenID); - int len = ByteBufferUtils.readUnsignedITF8(buf); - byte[] bytes = new byte[len]; - buf.get(bytes); - lenEncoding.fromByteArray(bytes); - - EncodingID byteID = EncodingID.values()[buf.get()]; - byteEncoding = f.createEncoding(DataSeriesType.BYTE_ARRAY, byteID); - len = ByteBufferUtils.readUnsignedITF8(buf); - bytes = new byte[len]; - buf.get(bytes); - byteEncoding.fromByteArray(bytes); - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new ByteArrayLenCodec( - lenEncoding.buildCodec(inputMap, outputMap), - byteEncoding.buildCodec(inputMap, outputMap)); - } - - private static class ByteArrayLenCodec extends AbstractBitCodec { - private BitCodec lenCodec; - private BitCodec byteCodec; - - public ByteArrayLenCodec(BitCodec lenCodec, - BitCodec byteCodec) { - super(); - this.lenCodec = lenCodec; - this.byteCodec = byteCodec; - } - - @Override - public byte[] read(BitInputStream bis) throws IOException { - int len = lenCodec.read(bis); - return byteCodec.read(bis, len); - } - - @Override - public byte[] read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented."); - } - - @Override - public long write(BitOutputStream bos, byte[] object) - throws IOException { - long len = lenCodec.write(bos, object.length); - len += byteCodec.write(bos, object); - return len; - } - - @Override - public long numberOfBits(byte[] object) { - return lenCodec.numberOfBits(object.length) - + byteCodec.numberOfBits(object); - } - - } + private final static EncodingID ID = EncodingID.BYTE_ARRAY_LEN; + private Encoding lenEncoding; + private Encoding byteEncoding; + + public ByteArrayLenEncoding() { + } + + @Override + public EncodingID id() { + return ID; + } + + public static EncodingParams toParam(final EncodingParams lenParams, + final EncodingParams byteParams) { + final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + try { + byteArrayOutputStream.write((byte) lenParams.id.ordinal()); + ITF8.writeUnsignedITF8(lenParams.params.length, byteArrayOutputStream); + byteArrayOutputStream.write(lenParams.params); + + byteArrayOutputStream.write((byte) byteParams.id.ordinal()); + ITF8.writeUnsignedITF8(byteParams.params.length, byteArrayOutputStream); + byteArrayOutputStream.write(byteParams.params); + } catch (final IOException e) { + throw new RuntimeException("It never happened. 
"); + } + return new EncodingParams(ID, byteArrayOutputStream.toByteArray()); + } + + public byte[] toByteArray() { + final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + try { + byteArrayOutputStream.write((byte) lenEncoding.id().ordinal()); + final byte[] lenBytes = lenEncoding.toByteArray(); + ITF8.writeUnsignedITF8(lenBytes.length, byteArrayOutputStream); + byteArrayOutputStream.write(lenBytes); + + byteArrayOutputStream.write((byte) byteEncoding.id().ordinal()); + final byte[] byteBytes = byteEncoding.toByteArray(); + ITF8.writeUnsignedITF8(byteBytes.length, byteArrayOutputStream); + byteArrayOutputStream.write(byteBytes); + } catch (final IOException e) { + throw new RuntimeException("It never happened. "); + } + return byteArrayOutputStream.toByteArray(); + } + + public void fromByteArray(final byte[] data) { + final ByteBuffer buffer = ByteBuffer.wrap(data); + + final EncodingFactory encodingFactory = new EncodingFactory(); + + final EncodingID lenID = EncodingID.values()[buffer.get()]; + lenEncoding = encodingFactory.createEncoding(DataSeriesType.INT, lenID); + int length = ITF8.readUnsignedITF8(buffer); + byte[] bytes = new byte[length]; + buffer.get(bytes); + lenEncoding.fromByteArray(bytes); + + final EncodingID byteID = EncodingID.values()[buffer.get()]; + byteEncoding = encodingFactory.createEncoding(DataSeriesType.BYTE_ARRAY, byteID); + length = ITF8.readUnsignedITF8(buffer); + bytes = new byte[length]; + buffer.get(bytes); + byteEncoding.fromByteArray(bytes); + } + + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + return new ByteArrayLenCodec( + lenEncoding.buildCodec(inputMap, outputMap), + byteEncoding.buildCodec(inputMap, outputMap)); + } + + private static class ByteArrayLenCodec extends AbstractBitCodec { + private final BitCodec lenCodec; + private final BitCodec byteCodec; + + public ByteArrayLenCodec(final BitCodec lenCodec, + final BitCodec byteCodec) { + super(); + this.lenCodec = lenCodec; + this.byteCodec = byteCodec; + } + + @Override + public byte[] read(final BitInputStream bitInputStream) throws IOException { + final int length = lenCodec.read(bitInputStream); + return byteCodec.read(bitInputStream, length); + } + + @Override + public byte[] read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } + + @Override + public long write(final BitOutputStream bitOutputStream, final byte[] object) + throws IOException { + long length = lenCodec.write(bitOutputStream, object.length); + length += byteCodec.write(bitOutputStream, object); + return length; + } + + @Override + public long numberOfBits(final byte[] object) { + return lenCodec.numberOfBits(object.length) + + byteCodec.numberOfBits(object); + } + + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/ByteArrayStopEncoding.java b/src/java/htsjdk/samtools/cram/encoding/ByteArrayStopEncoding.java index 239eb911d9..c46d96754e 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ByteArrayStopEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/ByteArrayStopEncoding.java @@ -1,24 +1,26 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, + * distributed under the License inputStream distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; import htsjdk.samtools.cram.io.BitOutputStream; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; @@ -31,99 +33,98 @@ import java.util.Map; public class ByteArrayStopEncoding implements Encoding { - public final static EncodingID ID = EncodingID.BYTE_ARRAY_STOP; - private byte stopByte = 0; - private int externalId; - - public ByteArrayStopEncoding() { - } - - @Override - public EncodingID id() { - return ID; - } - - public ByteArrayStopEncoding(byte stopByte, int externalId) { - this.stopByte = stopByte; - this.externalId = externalId; - } - - public static EncodingParams toParam(byte stopByte, int externalId) { - ByteArrayStopEncoding e = new ByteArrayStopEncoding(stopByte, - externalId); - EncodingParams params = new EncodingParams(ID, e.toByteArray()); - return params; - } - - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(1024); - buf.order(ByteOrder.LITTLE_ENDIAN); - buf.put(stopByte); - ByteBufferUtils.writeUnsignedITF8(externalId, buf) ; - - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - - return array; - } - - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - buf.order(ByteOrder.LITTLE_ENDIAN); - stopByte = buf.get(); - externalId = ByteBufferUtils.readUnsignedITF8(buf) ; - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - InputStream is = inputMap == null ? null : inputMap.get(externalId); - ExposedByteArrayOutputStream os = outputMap == null ? 
null : outputMap - .get(externalId); - return new ByteArrayStopCodec(stopByte, is, os); - } - - public static class ByteArrayStopCodec extends AbstractBitCodec { - - private int stop; - private InputStream is; - private OutputStream os; - private ByteArrayOutputStream readingBAOS = new ByteArrayOutputStream(); - private int b; - - public ByteArrayStopCodec(byte stopByte, InputStream is, OutputStream os) { - this.stop = 0xFF & stopByte; - this.is = is; - this.os = os; - } - - @Override - public byte[] read(BitInputStream bis) throws IOException { - readingBAOS.reset(); - while ((b = is.read()) != -1 && b != stop) - readingBAOS.write(b); - - return readingBAOS.toByteArray(); - } - - @Override - public byte[] read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented."); - } - - @Override - public long write(BitOutputStream bos, byte[] object) - throws IOException { - os.write(object); - os.write(stop); - return object.length + 1; - } - - @Override - public long numberOfBits(byte[] object) { - return object.length + 1; - } - - } + private final static EncodingID ID = EncodingID.BYTE_ARRAY_STOP; + private byte stopByte = 0; + private int externalId; + + public ByteArrayStopEncoding() { + } + + @Override + public EncodingID id() { + return ID; + } + + private ByteArrayStopEncoding(final byte stopByte, final int externalId) { + this.stopByte = stopByte; + this.externalId = externalId; + } + + public static EncodingParams toParam(final byte stopByte, final int externalId) { + final ByteArrayStopEncoding e = new ByteArrayStopEncoding(stopByte, + externalId); + return new EncodingParams(ID, e.toByteArray()); + } + + public byte[] toByteArray() { + final ByteBuffer buf = ByteBuffer.allocate(1024); + buf.order(ByteOrder.LITTLE_ENDIAN); + buf.put(stopByte); + ITF8.writeUnsignedITF8(externalId, buf); + + buf.flip(); + final byte[] array = new byte[buf.limit()]; + buf.get(array); + + return array; + } + + public void fromByteArray(final byte[] data) { + final ByteBuffer buf = ByteBuffer.wrap(data); + buf.order(ByteOrder.LITTLE_ENDIAN); + stopByte = buf.get(); + externalId = ITF8.readUnsignedITF8(buf); + } + + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + final InputStream is = inputMap == null ? null : inputMap.get(externalId); + final ExposedByteArrayOutputStream os = outputMap == null ? 
null : outputMap + .get(externalId); + return new ByteArrayStopCodec(stopByte, is, os); + } + + public static class ByteArrayStopCodec extends AbstractBitCodec { + + private final int stop; + private final InputStream inputStream; + private final OutputStream outputStream; + private final ByteArrayOutputStream readingBAOS = new ByteArrayOutputStream(); + private int b; + + public ByteArrayStopCodec(final byte stopByte, final InputStream inputStream, final OutputStream outputStream) { + this.stop = 0xFF & stopByte; + this.inputStream = inputStream; + this.outputStream = outputStream; + } + + @Override + public byte[] read(final BitInputStream bitInputStream) throws IOException { + readingBAOS.reset(); + while ((b = inputStream.read()) != -1 && b != stop) + readingBAOS.write(b); + + return readingBAOS.toByteArray(); + } + + @Override + public byte[] read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } + + @Override + public long write(final BitOutputStream bitOutputStream, final byte[] object) + throws IOException { + outputStream.write(object); + outputStream.write(stop); + return object.length + 1; + } + + @Override + public long numberOfBits(final byte[] object) { + return object.length + 1; + } + + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/CanonicalHuffmanByteCodec.java b/src/java/htsjdk/samtools/cram/encoding/CanonicalHuffmanByteCodec.java deleted file mode 100644 index 1d470c6c59..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/CanonicalHuffmanByteCodec.java +++ /dev/null @@ -1,159 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.io.BitInputStream; -import htsjdk.samtools.cram.io.BitOutputStream; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; -import java.util.SortedMap; -import java.util.SortedSet; -import java.util.TreeMap; -import java.util.TreeSet; - - -public class CanonicalHuffmanByteCodec extends AbstractBitCodec { - - private TreeMap codes; - private HuffmanBitCode[] bitCodes = new HuffmanBitCode[256]; - private Integer[] codeLentghSorted; - private Map> codeCache = new HashMap>(); - private Map[] codeMaps; - - /* - * values[]: the alphabet (provided as Integers) bitLengths[]: the number of - * bits of symbil's huffman code - */ - public CanonicalHuffmanByteCodec(byte[] values, int[] bitLengths) { - super(); - - // 1. 
Sort by (a) bit length and (b) by symbol value ----------- - SortedMap codebook = new TreeMap>(); - for (int i = 0; i < values.length; i++) { - if (codebook.containsKey(bitLengths[i])) - ((TreeSet) codebook.get(bitLengths[i])).add(values[i]); - else { - TreeSet entry = new TreeSet(); - entry.add(values[i]); - codebook.put(bitLengths[i], entry); - } - } - codeLentghSorted = new Integer[codebook.size()]; - int keys = 0; - - // 2. Calculate and Assign Canonical Huffman Codes ------------- - int codeLength = 0, codeValue = -1; // first Canonical is always 0 - codes = new TreeMap(); - Set keySet = codebook.keySet(); - for (Object key : keySet) { // Iterate over code lengths - int iKey = Integer.parseInt(key.toString()); - codeLentghSorted[keys++] = iKey; - - TreeSet get = (TreeSet) codebook.get(key); - for (Byte entry : get) { // Iterate over symbols - HuffmanBitCode code = new HuffmanBitCode(); - code.bitLentgh = iKey; // given: bit length - code.value = entry; // given: symbol - - codeValue++; // increment bit value by 1 - int delta = iKey - codeLength; // new length? - codeValue = codeValue << delta; // pad with 0's - code.bitCode = codeValue; // calculated: huffman code - codeLength += delta; // adjust current code len - - if (NumberOfSetBits(codeValue) > iKey) - throw new IllegalArgumentException("Symbol out of range"); - - bitCodes[entry & 0xFF] = code; // Store Bit Code - codes.put(entry, code); // Store HuffmanBitCode - - Map codeMap = codeCache.get(code.bitLentgh); - if (codeMap == null) { - codeMap = new HashMap(); - codeCache.put(code.bitLentgh, codeMap); - } - codeMap.put(new Long(code.bitCode), (byte) (0xFF & code.value)); - } - - } - - // 3. Done. Just have to populate codeMaps --------------------- - if (codeLentghSorted.length > 0) - codeMaps = new Map[codeLentghSorted[codeLentghSorted.length - 1] + 1]; - else - codeMaps = new Map[1]; - for (int len : codeLentghSorted) { // Iterate over code lengths - codeMaps[len] = codeCache.get(len); - } - } - - @Override - public Byte read(BitInputStream bis) throws IOException { - long buf = 0; // huffman code - int bitsRead = 0; - for (int len : codeLentghSorted) { - buf = buf << (len - bitsRead); - - long readLongBits = bis.readLongBits(len - bitsRead); - - buf = buf | readLongBits; - - bitsRead = len; - Map codeMap = codeMaps[len]; - Byte result = codeMap.get(buf); - if (result != null) { - return result; - } - } - throw new RuntimeException("Bit code not found. 
Current state: " - + bitsRead + " bits read, buf=" + buf); - } - - @Override - public long write(BitOutputStream bos, Byte object) throws IOException { - HuffmanBitCode bitCode = bitCodes[object]; - if (bitCode == null) - throw new RuntimeException("Huffman code not found for value: " - + object); - bos.write(bitCode.bitCode, bitCode.bitLentgh); - return bitCode.bitLentgh; - } - - @Override - public long numberOfBits(Byte object) { - throw new UnsupportedOperationException("Not supported yet."); - } - - private static class HuffmanBitCode { - int bitCode; - int bitLentgh; - int value; - } - - private int NumberOfSetBits(int i) { - i = i - ((i >> 1) & 0x55555555); - i = (i & 0x33333333) + ((i >> 2) & 0x33333333); - return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; - } - - @Override - public Byte read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented"); - } -} diff --git a/src/java/htsjdk/samtools/cram/encoding/CanonicalHuffmanIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/CanonicalHuffmanIntegerCodec.java deleted file mode 100644 index 55d01c8a2c..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/CanonicalHuffmanIntegerCodec.java +++ /dev/null @@ -1,163 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.io.BitInputStream; -import htsjdk.samtools.cram.io.BitOutputStream; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; -import java.util.SortedMap; -import java.util.SortedSet; -import java.util.TreeMap; -import java.util.TreeSet; - - -public class CanonicalHuffmanIntegerCodec extends AbstractBitCodec { - - private TreeMap codes; - private Integer[] codeLentghSorted; - private Map> codeCache = new HashMap>(); - private Map[] codeMaps; - - /* - * values[]: the alphabet (provided as Integers) bitLengths[]: the number of - * bits of symbil's huffman code - */ - public CanonicalHuffmanIntegerCodec(int[] values, int[] bitLengths) { - super(); - - // 1. Sort by (a) bit length and (b) by symbol value ----------- - SortedMap codebook = new TreeMap>(); - for (int i = 0; i < values.length; i++) { - if (codebook.containsKey(bitLengths[i])) - ((TreeSet) codebook.get(bitLengths[i])).add(values[i]); - else { - TreeSet entry = new TreeSet(); - entry.add(values[i]); - codebook.put(bitLengths[i], entry); - } - } - codeLentghSorted = new Integer[codebook.size()]; - int keys = 0; - - // 2. 
Calculate and Assign Canonical Huffman Codes ------------- - int codeLength = 0, codeValue = -1; // first Canonical is always 0 - codes = new TreeMap(); - Set keySet = codebook.keySet(); - for (Object key : keySet) { // Iterate over code lengths - int iKey = Integer.parseInt(key.toString()); - codeLentghSorted[keys++] = iKey; - - TreeSet get = (TreeSet) codebook.get(key); - for (Integer entry : get) { // Iterate over symbols - HuffmanBitCode code = new HuffmanBitCode(); - code.bitLentgh = iKey; // given: bit length - code.value = entry; // given: symbol - - codeValue++; // increment bit value by 1 - int delta = iKey - codeLength; // new length? - codeValue = codeValue << delta; // pad with 0's - code.bitCode = codeValue; // calculated: huffman code - codeLength += delta; // adjust current code len - - if (NumberOfSetBits(codeValue) > iKey) - throw new IllegalArgumentException("Symbol out of range"); - - codes.put(entry, code); // Store HuffmanBitCode - - Map codeMap = codeCache.get(code.bitLentgh); - if (codeMap == null) { - codeMap = new HashMap(); - codeCache.put(code.bitLentgh, codeMap); - } - codeMap.put(new Long(code.bitCode), code.value); - } - - } - - // 3. Done. Just have to populate codeMaps --------------------- - if (codeLentghSorted.length > 0) - codeMaps = new Map[codeLentghSorted[codeLentghSorted.length - 1] + 1]; - else - codeMaps = new Map[1]; - for (int len : codeLentghSorted) { // Iterate over code lengths - codeMaps[len] = codeCache.get(len); - } - } - - @Override - public Integer read(BitInputStream bis) throws IOException { - long buf = 0; // huffman code - int bitsRead = 0; - for (int len : codeLentghSorted) { - buf = buf << (len - bitsRead); - - long readLongBits = bis.readLongBits(len - bitsRead); - - buf = buf | readLongBits; - - bitsRead = len; - Map codeMap = codeMaps[len]; - Integer result = codeMap.get(buf); - if (result != null) { - return result; - } - } - throw new RuntimeException("Bit code not found. Current state: " - + bitsRead + " bits read, buf=" + buf); - } - - @Override - public long write(BitOutputStream bos, Integer object) throws IOException { - HuffmanBitCode bitCode = codes.get(object); - if (bitCode == null) - throw new RuntimeException("Huffman code not found for value: " - + object); - bos.write(bitCode.bitCode, bitCode.bitLentgh); - return bitCode.bitLentgh; - } - - @Override - public long numberOfBits(Integer object) { - HuffmanBitCode bitCode; - try { - bitCode = codes.get(object); - return bitCode.bitLentgh ; - } catch (NullPointerException e) { - throw new RuntimeException("Value " + object + " not found.", e) ; - } - } - - private static class HuffmanBitCode { - int bitCode; - int bitLentgh; - int value; - } - - private int NumberOfSetBits(int i) { - i = i - ((i >> 1) & 0x55555555); - i = (i & 0x33333333) + ((i >> 2) & 0x33333333); - return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; - } - - @Override - public Integer read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented"); - } -} diff --git a/src/java/htsjdk/samtools/cram/encoding/DataSeries.java b/src/java/htsjdk/samtools/cram/encoding/DataSeries.java index f2e995fcdb..de4fd5ed84 100644 --- a/src/java/htsjdk/samtools/cram/encoding/DataSeries.java +++ b/src/java/htsjdk/samtools/cram/encoding/DataSeries.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
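[Editor's note: the two canonical Huffman codecs deleted above (superseded by the htsjdk.samtools.cram.encoding.huffman.codec classes imported later in this patch) assign codes canonically; a compact sketch of that assignment rule, with made-up bit lengths.]

    // Symbols are sorted by (bit length, symbol value); each next code is (previous + 1),
    // left-shifted whenever the bit length grows, exactly as in the deleted constructors.
    final int[] sortedBitLengths = {1, 2, 2};     // e.g. symbols a, b, c
    int codeValue = -1;
    int codeLength = 0;
    for (final int bitLength : sortedBitLengths) {
        codeValue++;
        codeValue <<= bitLength - codeLength;
        codeLength = bitLength;
    }
    // yields a = 0, b = 10, c = 11 in binary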

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.structure.EncodingKey; @@ -22,9 +24,26 @@ import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; +/** + * An annotation to denote a data series field in a java class. + * Some data can be represented as a set of column (data series) where + * each column is characterized by it's intention ({@link htsjdk.samtools.cram.structure.EncodingKey} for CRAM) + * and it's data type, like {@link java.lang.Integer}or {@link java.lang.String}. + * Annotating fields in a class with this annotation allows for automated discovery of such column (data series) + * and attaching specific codec to serialise/deserialize data. + */ @Target(ElementType.FIELD) @Retention(RetentionPolicy.RUNTIME) public @interface DataSeries { - EncodingKey key(); - DataSeriesType type() ; + /** + * One of the pre-defined CRAM data series names + * @return CRAM data series name (key) + */ + EncodingKey key(); + + /** + * Data type of the series. + * @return data type of the series + */ + DataSeriesType type(); } diff --git a/src/java/htsjdk/samtools/cram/encoding/DataSeriesMap.java b/src/java/htsjdk/samtools/cram/encoding/DataSeriesMap.java index 49970ed263..b16ab8d0d4 100644 --- a/src/java/htsjdk/samtools/cram/encoding/DataSeriesMap.java +++ b/src/java/htsjdk/samtools/cram/encoding/DataSeriesMap.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
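[Editor's note: a hedged illustration of the @DataSeries usage described in the new javadoc above; the class and field names are invented, and the keys are assumed to be standard CRAM data series names.]

    class ReadFields {
        @DataSeries(key = EncodingKey.RL_ReadLength, type = DataSeriesType.INT)
        int readLength;                            // discovered by reflection and bound to an Integer codec

        @DataSeries(key = EncodingKey.RN_ReadName, type = DataSeriesType.BYTE_ARRAY)
        byte[] readName;                           // bound to a byte-array codec
    }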

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import java.lang.annotation.ElementType; @@ -23,5 +25,5 @@ @Target(ElementType.FIELD) @Retention(RetentionPolicy.RUNTIME) public @interface DataSeriesMap { - String name() default "TAG" ; + String name() default "TAG"; } diff --git a/src/java/htsjdk/samtools/cram/encoding/DataSeriesType.java b/src/java/htsjdk/samtools/cram/encoding/DataSeriesType.java index c02c59c631..6eecb7c049 100644 --- a/src/java/htsjdk/samtools/cram/encoding/DataSeriesType.java +++ b/src/java/htsjdk/samtools/cram/encoding/DataSeriesType.java @@ -1,21 +1,41 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; +/** + * Data series types known to CRAM. + */ public enum DataSeriesType { - BYTE, INT, LONG, BYTE_ARRAY ; + /** + * A single signed byte (256 distinct values) + */ + BYTE, + /** + * A signed integer ~4 billions of them. + */ + INT, + /** + * A signed long value, 64 bits, too many to count. + */ + LONG, + /** + * An array of bytes. + */ + BYTE_ARRAY } diff --git a/src/java/htsjdk/samtools/cram/encoding/Encoding.java b/src/java/htsjdk/samtools/cram/encoding/Encoding.java index 7858ced897..1fc18df601 100644 --- a/src/java/htsjdk/samtools/cram/encoding/Encoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/Encoding.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; @@ -21,18 +23,22 @@ import java.io.InputStream; import java.util.Map; +/** + * An interface to describe how a data series is encoded. + * It also has methods to serialize/deserialize to/from byte array and a method to construct + * a {@link htsjdk.samtools.cram.encoding.BitCodec} instance. + * + * @param data series type + */ +public interface Encoding { + EncodingID id(); + byte[] toByteArray(); -public interface Encoding { - - public EncodingID id() ; - - public byte[] toByteArray(); - - public void fromByteArray(byte[] data); + void fromByteArray(byte[] data); - public BitCodec buildCodec(Map inputMap, - Map outputMap); + BitCodec buildCodec(Map inputMap, + Map outputMap); } diff --git a/src/java/htsjdk/samtools/cram/encoding/EncodingFactory.java b/src/java/htsjdk/samtools/cram/encoding/EncodingFactory.java index f4a8a50d1a..258e148a73 100644 --- a/src/java/htsjdk/samtools/cram/encoding/EncodingFactory.java +++ b/src/java/htsjdk/samtools/cram/encoding/EncodingFactory.java @@ -1,130 +1,118 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
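[Editor's note: a minimal sketch of the Encoding lifecycle described in the new interface javadoc, using the EXTERNAL integer encoding that appears later in this patch; the content id is illustrative.]

    final EncodingParams params = ExternalIntegerEncoding.toParam(5);        // params hold only the content id
    final Encoding<Integer> encoding =
            new EncodingFactory().createEncoding(DataSeriesType.INT, params.id);
    encoding.fromByteArray(params.params);                                    // restores the content id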

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; +import htsjdk.samtools.cram.encoding.huffman.codec.HuffmanByteEncoding; +import htsjdk.samtools.cram.encoding.huffman.codec.HuffmanIntegerEncoding; import htsjdk.samtools.cram.structure.EncodingID; +/** + * A helper class to instantiate an appropriate {@link htsjdk.samtools.cram.encoding.Encoding} + * for a given {@link htsjdk.samtools.cram.encoding.DataSeriesType} and + * {@link htsjdk.samtools.cram.encoding.Encoding}. + * Also useful to hide encoding implementations. + */ +@SuppressWarnings("unchecked") public class EncodingFactory { - public Encoding createEncoding(DataSeriesType valueType, - EncodingID id) { - switch (valueType) { - case BYTE: - switch (id) { - case EXTERNAL: - return (Encoding) new ExternalByteEncoding(); - case HUFFMAN: - return (Encoding) new HuffmanByteEncoding(); - case NULL: - return new NullEncoding(); - - default: - break; - } - - break; - - case INT: - switch (id) { - case HUFFMAN: - return (Encoding) new HuffmanIntegerEncoding(); - case NULL: - return new NullEncoding(); - case EXTERNAL: - return (Encoding) new ExternalIntegerEncoding(); - case GOLOMB: - return (Encoding) new GolombIntegerEncoding(); - case GOLOMB_RICE: - return (Encoding) new GolombRiceIntegerEncoding(); - case BETA: - return (Encoding) new BetaIntegerEncoding(); - case GAMMA: - return (Encoding) new GammaIntegerEncoding(); - case SUBEXP: - return (Encoding) new SubexpIntegerEncoding(); - - default: - break; - } - break; - - case LONG: - switch (id) { - case NULL: - return new NullEncoding(); - case GOLOMB: - return (Encoding) new GolombLongEncoding(); - case EXTERNAL: - return (Encoding) new ExternalLongEncoding(); - - default: - break; - } - break; - - case BYTE_ARRAY: - switch (id) { - case NULL: - return new NullEncoding(); - case BYTE_ARRAY_LEN: - return (Encoding) new ByteArrayLenEncoding(); - case BYTE_ARRAY_STOP: - return (Encoding) new ByteArrayStopEncoding(); - case EXTERNAL: - return (Encoding) new ExternalByteArrayEncoding(); - - default: - break; - } - break; - - default: - break; - } - - return null; - } - - public Encoding createByteArrayEncoding(EncodingID id) { - switch (id) { - case BYTE_ARRAY_LEN: - return new ByteArrayLenEncoding(); - case BYTE_ARRAY_STOP: - return new ByteArrayLenEncoding(); - case EXTERNAL: - return new ExternalByteArrayEncoding(); - - default: - break; - } - return null; - } - - public Encoding createByteEncoding(EncodingID id) { - switch (id) { - case EXTERNAL: - return new ExternalByteEncoding(); - - default: - break; - } - return null; - } - - public Encoding createIntEncoding(EncodingID id) { - return null; - } + /** + * Create an encoding for the data series type and encoding id. + * @param valueType data type of the values to be produced/consumed by the encoding + * @param id encoding id used for data serialization + * @param encoding object type, like Integer or String. 
+ * @return a new encoding with the requested parameters + */ + public Encoding createEncoding(final DataSeriesType valueType, + final EncodingID id) { + switch (valueType) { + case BYTE: + switch (id) { + case EXTERNAL: + return (Encoding) new ExternalByteEncoding(); + case HUFFMAN: + return (Encoding) new HuffmanByteEncoding(); + case NULL: + return new NullEncoding(); + + default: + break; + } + + break; + + case INT: + switch (id) { + case HUFFMAN: + return (Encoding) new HuffmanIntegerEncoding(); + case NULL: + return new NullEncoding(); + case EXTERNAL: + return (Encoding) new ExternalIntegerEncoding(); + case GOLOMB: + return (Encoding) new GolombIntegerEncoding(); + case GOLOMB_RICE: + return (Encoding) new GolombRiceIntegerEncoding(); + case BETA: + return (Encoding) new BetaIntegerEncoding(); + case GAMMA: + return (Encoding) new GammaIntegerEncoding(); + case SUBEXPONENTIAL: + return (Encoding) new SubexponentialIntegerEncoding(); + + default: + break; + } + break; + + case LONG: + switch (id) { + case NULL: + return new NullEncoding(); + case GOLOMB: + return (Encoding) new GolombLongEncoding(); + case EXTERNAL: + return (Encoding) new ExternalLongEncoding(); + + default: + break; + } + break; + + case BYTE_ARRAY: + switch (id) { + case NULL: + return new NullEncoding(); + case BYTE_ARRAY_LEN: + return (Encoding) new ByteArrayLenEncoding(); + case BYTE_ARRAY_STOP: + return (Encoding) new ByteArrayStopEncoding(); + case EXTERNAL: + return (Encoding) new ExternalByteArrayEncoding(); + + default: + break; + } + break; + + default: + break; + } + + return null; + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/ExternalByteArrayCodec.java b/src/java/htsjdk/samtools/cram/encoding/ExternalByteArrayCodec.java index b8fdbfdeac..4ebad39971 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ExternalByteArrayCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalByteArrayCodec.java @@ -1,72 +1,76 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
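[Editor's note: a sketch of how a factory-built codec is wired to its external block through the input/output maps used by the buildCodec implementations in this patch; the content id and block bytes are invented.]

    final int contentId = 5;
    final Encoding<Integer> encoding =
            new EncodingFactory().createEncoding(DataSeriesType.INT, EncodingID.EXTERNAL);
    encoding.fromByteArray(ITF8.writeUnsignedITF8(contentId));
    final Map<Integer, InputStream> inputMap = new HashMap<>();
    inputMap.put(contentId, new ByteArrayInputStream(new byte[]{0x2A}));     // ITF8 encoding of 42
    final BitCodec<Integer> codec = encoding.buildCodec(inputMap, null);     // no output map when only reading
    final Integer value = codec.read(null);                                   // external codecs ignore the bit stream; == 42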

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, + * distributed under the License inputStream distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; import htsjdk.samtools.cram.io.BitOutputStream; -import htsjdk.samtools.cram.io.BitwiseUtils; +import htsjdk.samtools.cram.io.InputStreamUtils; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -public class ExternalByteArrayCodec extends AbstractBitCodec { - private OutputStream os; - private InputStream is; +class ExternalByteArrayCodec extends AbstractBitCodec { + private final OutputStream outputStream; + private final InputStream inputStream; - public ExternalByteArrayCodec(OutputStream os, InputStream is) { - this.os = os; - this.is = is; - } + public ExternalByteArrayCodec(final OutputStream outputStream, final InputStream inputStream) { + this.outputStream = outputStream; + this.inputStream = inputStream; + } - @Override - public byte[] read(BitInputStream bis, int len) throws IOException { - return BitwiseUtils.readFully(is, len); - } + @Override + public byte[] read(final BitInputStream bitInputStream, final int length) throws IOException { + return InputStreamUtils.readFully(inputStream, length); + } - @Override - public void readInto(BitInputStream bis, byte[] array, int offset, - int valueLen) throws IOException { - BitwiseUtils.readFully(is, array, offset, valueLen); - } + @Override + public void readInto(final BitInputStream bitInputStream, final byte[] array, final int offset, + final int valueLen) throws IOException { + InputStreamUtils.readFully(inputStream, array, offset, valueLen); + } - @Override - public void skip(BitInputStream bis) throws IOException { - is.skip(1); - } + @Override + public void skip(final BitInputStream bitInputStream) throws IOException { + //noinspection ResultOfMethodCallIgnored + inputStream.skip(1); + } - @Override - public void skip(BitInputStream bis, int len) throws IOException { - is.skip(len); - } + @Override + public void skip(final BitInputStream bitInputStream, final int length) throws IOException { + //noinspection ResultOfMethodCallIgnored + inputStream.skip(length); + } - @Override - public long write(BitOutputStream bos, byte[] object) throws IOException { - os.write(object); - return numberOfBits(object); - } + @Override + public long write(final BitOutputStream bitOutputStream, final byte[] object) throws IOException { + outputStream.write(object); + return numberOfBits(object); + } - @Override - public long numberOfBits(byte[] object) { - return object.length * 8; - } + @Override + public long numberOfBits(final byte[] object) { + return object.length * 8; + } - @Override - public byte[] read(BitInputStream bis) throws IOException { - throw new RuntimeException("Cannot read byte array of unknown length."); - } + @Override + public byte[] read(final BitInputStream bitInputStream) throws IOException { + throw new RuntimeException("Cannot read byte array of unknown length."); + } } diff --git 
a/src/java/htsjdk/samtools/cram/encoding/ExternalByteArrayEncoding.java b/src/java/htsjdk/samtools/cram/encoding/ExternalByteArrayEncoding.java index 7335614221..2fc707c5f3 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ExternalByteArrayEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalByteArrayEncoding.java @@ -1,64 +1,63 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; import java.io.InputStream; import java.util.Map; - - - public class ExternalByteArrayEncoding implements Encoding { - public static final EncodingID encodingId = EncodingID.EXTERNAL; - public int contentId = -1; - - public ExternalByteArrayEncoding() { - } - - public static EncodingParams toParam(int contentId) { - ExternalByteArrayEncoding e = new ExternalByteArrayEncoding(); - e.contentId = contentId; - return new EncodingParams(encodingId, e.toByteArray()); - } - - public byte[] toByteArray() { - return ByteBufferUtils.writeUnsignedITF8(contentId); - } - - public void fromByteArray(byte[] data) { - contentId = ByteBufferUtils.readUnsignedITF8(data); - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - InputStream is = inputMap == null ? null : inputMap.get(contentId); - ExposedByteArrayOutputStream os = outputMap == null ? null : outputMap - .get(contentId); - return (BitCodec) new ExternalByteArrayCodec(os, is); - } - - @Override - public EncodingID id() { - return encodingId; - } + private static final EncodingID encodingId = EncodingID.EXTERNAL; + private int contentId = -1; + + public ExternalByteArrayEncoding() { + } + + public static EncodingParams toParam(final int contentId) { + final ExternalByteArrayEncoding e = new ExternalByteArrayEncoding(); + e.contentId = contentId; + return new EncodingParams(encodingId, e.toByteArray()); + } + + public byte[] toByteArray() { + return ITF8.writeUnsignedITF8(contentId); + } + + public void fromByteArray(final byte[] data) { + contentId = ITF8.readUnsignedITF8(data); + } + + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + final InputStream inputStream = inputMap == null ? null : inputMap.get(contentId); + final ExposedByteArrayOutputStream outputStream = outputMap == null ? null : outputMap + .get(contentId); + return new ExternalByteArrayCodec(outputStream, inputStream); + } + + @Override + public EncodingID id() { + return encodingId; + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/ExternalByteCodec.java b/src/java/htsjdk/samtools/cram/encoding/ExternalByteCodec.java index 3ac092f4f5..b1c8529bd1 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ExternalByteCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalByteCodec.java @@ -1,62 +1,64 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
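[Editor's note: ITF8 replaces the old ByteBufferUtils read/write helpers throughout this patch; a two-line round trip for orientation.]

    final byte[] encoded = ITF8.writeUnsignedITF8(300);    // CRAM ITF8: 1 to 5 bytes per int value
    final int decoded = ITF8.readUnsignedITF8(encoded);    // == 300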

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, + * distributed under the License inputStream distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; import htsjdk.samtools.cram.io.BitOutputStream; -import htsjdk.samtools.cram.io.BitwiseUtils; +import htsjdk.samtools.cram.io.InputStreamUtils; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -public class ExternalByteCodec extends AbstractBitCodec { - private OutputStream os; - private InputStream is; - - public ExternalByteCodec(OutputStream os, InputStream is) { - this.os = os; - this.is = is; - } - - @Override - public Byte read(BitInputStream bis) throws IOException { - return (byte) is.read(); - } - - @Override - public long write(BitOutputStream bos, Byte object) throws IOException { - os.write(object); - return 8; - } - - @Override - public long numberOfBits(Byte object) { - return 8; - } - - @Override - public Byte read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented.") ; - } - - @Override - public void readInto(BitInputStream bis, byte[] array, int offset, - int valueLen) throws IOException { - BitwiseUtils.readFully(is, array, offset, valueLen); - } +class ExternalByteCodec extends AbstractBitCodec { + private final OutputStream outputStream; + private final InputStream inputStream; + + public ExternalByteCodec(final OutputStream outputStream, final InputStream inputStream) { + this.outputStream = outputStream; + this.inputStream = inputStream; + } + + @Override + public Byte read(final BitInputStream bitInputStream) throws IOException { + return (byte) inputStream.read(); + } + + @Override + public long write(final BitOutputStream bitOutputStream, final Byte object) throws IOException { + outputStream.write(object); + return 8; + } + + @Override + public long numberOfBits(final Byte object) { + return 8; + } + + @Override + public Byte read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } + + @Override + public void readInto(final BitInputStream bitInputStream, final byte[] array, final int offset, + final int valueLen) throws IOException { + InputStreamUtils.readFully(inputStream, array, offset, valueLen); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/ExternalByteEncoding.java b/src/java/htsjdk/samtools/cram/encoding/ExternalByteEncoding.java index 0dd361684b..0fed720599 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ExternalByteEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalByteEncoding.java @@ -1,63 +1,62 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; import java.io.InputStream; import java.util.Map; - - - public class ExternalByteEncoding implements Encoding { - public static final EncodingID encodingId = EncodingID.EXTERNAL ; - public int contentId = -1 ; - - public ExternalByteEncoding() { - } - - public static EncodingParams toParam(int contentId) { - ExternalByteEncoding e = new ExternalByteEncoding() ; - e.contentId = contentId ; - return new EncodingParams(encodingId, e.toByteArray()) ; - } - - public byte[] toByteArray() { - return ByteBufferUtils.writeUnsignedITF8(contentId) ; - } - - public void fromByteArray(byte[] data) { - contentId = ByteBufferUtils.readUnsignedITF8(data) ; - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - InputStream is = inputMap == null ? null : inputMap.get(contentId) ; - ExposedByteArrayOutputStream os = outputMap == null ? null : outputMap.get(contentId) ; - return (BitCodec) new ExternalByteCodec(os, is); - } - - @Override - public EncodingID id() { - return encodingId; - } + private static final EncodingID encodingId = EncodingID.EXTERNAL; + private int contentId = -1; + + public ExternalByteEncoding() { + } + + public static EncodingParams toParam(final int contentId) { + final ExternalByteEncoding externalByteEncoding = new ExternalByteEncoding(); + externalByteEncoding.contentId = contentId; + return new EncodingParams(encodingId, externalByteEncoding.toByteArray()); + } + + public byte[] toByteArray() { + return ITF8.writeUnsignedITF8(contentId); + } + + public void fromByteArray(final byte[] data) { + contentId = ITF8.readUnsignedITF8(data); + } + + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + final InputStream inputStream = inputMap == null ? null : inputMap.get(contentId); + final ExposedByteArrayOutputStream outputStream = outputMap == null ? 
null : outputMap.get(contentId); + return new ExternalByteCodec(outputStream, inputStream); + } + + @Override + public EncodingID id() { + return encodingId; + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/ExternalCompressor.java b/src/java/htsjdk/samtools/cram/encoding/ExternalCompressor.java new file mode 100644 index 0000000000..0987294c5a --- /dev/null +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalCompressor.java @@ -0,0 +1,84 @@ +package htsjdk.samtools.cram.encoding; + +import htsjdk.samtools.cram.encoding.rans.RANS.ORDER; +import htsjdk.samtools.cram.io.ExternalCompression; +import htsjdk.samtools.cram.structure.BlockCompressionMethod; + +import java.io.IOException; + +public abstract class ExternalCompressor { + private final BlockCompressionMethod method; + + private ExternalCompressor(final BlockCompressionMethod method) { + this.method = method; + } + + public BlockCompressionMethod getMethod() { + return method; + } + + public abstract byte[] compress(byte[] data); + + public static ExternalCompressor createRAW() { + return new ExternalCompressor(BlockCompressionMethod.RAW) { + + @Override + public byte[] compress(final byte[] data) { + return data; + } + }; + } + + public static ExternalCompressor createGZIP() { + return new ExternalCompressor(BlockCompressionMethod.GZIP) { + + @Override + public byte[] compress(final byte[] data) { + try { + return ExternalCompression.gzip(data); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + public static ExternalCompressor createLZMA() { + return new ExternalCompressor(BlockCompressionMethod.LZMA) { + + @Override + public byte[] compress(final byte[] data) { + try { + return ExternalCompression.xz(data); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + public static ExternalCompressor createBZIP2() { + return new ExternalCompressor(BlockCompressionMethod.BZIP2) { + + @Override + public byte[] compress(final byte[] data) { + try { + return ExternalCompression.bzip2(data); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + }; + + } + + public static ExternalCompressor createRANS(final ORDER order) { + return new ExternalCompressor(BlockCompressionMethod.RANS) { + + @Override + public byte[] compress(final byte[] data) { + return ExternalCompression.rans(data, order); + } + }; + } +} diff --git a/src/java/htsjdk/samtools/cram/encoding/ExternalIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/ExternalIntegerCodec.java index f52c83b247..d26b0db6eb 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ExternalIntegerCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalIntegerCodec.java @@ -1,74 +1,76 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
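[Editor's note: the new ExternalCompressor above pairs a compression routine with the BlockCompressionMethod recorded for the block; a minimal usage sketch, with rawBlockBytes standing in for real block content.]

    final ExternalCompressor gzip = ExternalCompressor.createGZIP();
    final byte[] compressed = gzip.compress(rawBlockBytes);
    final BlockCompressionMethod method = gzip.getMethod();   // GZIP, so readers know how to inflate the block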

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, + * distributed under the License inputStream distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; import htsjdk.samtools.cram.io.BitOutputStream; -import htsjdk.samtools.cram.io.ByteBufferUtils; +import htsjdk.samtools.cram.io.ITF8; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -public class ExternalIntegerCodec extends AbstractBitCodec { - private OutputStream os; - private InputStream is; - private OutputStream nullOS = new OutputStream() { +class ExternalIntegerCodec extends AbstractBitCodec { + private final OutputStream outputStream; + private final InputStream inputStream; + private final OutputStream nullOutputStream = new OutputStream() { - @Override - public void write(byte[] b) throws IOException { - } + @Override + public void write(@SuppressWarnings("NullableProblems") final byte[] b) throws IOException { + } - @Override - public void write(int b) throws IOException { - } + @Override + public void write(final int b) throws IOException { + } - @Override - public void write(byte[] b, int off, int len) throws IOException { - } - }; + @Override + public void write(@SuppressWarnings("NullableProblems") final byte[] b, final int off, final int length) throws IOException { + } + }; - public ExternalIntegerCodec(OutputStream os, InputStream is) { - this.os = os; - this.is = is; - } + public ExternalIntegerCodec(final OutputStream outputStream, final InputStream inputStream) { + this.outputStream = outputStream; + this.inputStream = inputStream; + } - @Override - public Integer read(BitInputStream bis) throws IOException { - return ByteBufferUtils.readUnsignedITF8(is); - } + @Override + public Integer read(final BitInputStream bitInputStream) throws IOException { + return ITF8.readUnsignedITF8(inputStream); + } - @Override - public long write(BitOutputStream bos, Integer value) throws IOException { - return ByteBufferUtils.writeUnsignedITF8(value, os); - } + @Override + public long write(final BitOutputStream bitOutputStream, final Integer value) throws IOException { + return ITF8.writeUnsignedITF8(value, outputStream); + } - @Override - public long numberOfBits(Integer value) { - try { - return ByteBufferUtils.writeUnsignedITF8(value, nullOS); - } catch (IOException e) { - // this should never happened but still: - throw new RuntimeException(e) ; - } - } + @Override + public long numberOfBits(final Integer value) { + try { + return ITF8.writeUnsignedITF8(value, nullOutputStream); + } catch (final IOException e) { + // this should never happened but still: + throw new RuntimeException(e); + } + } - @Override - public Integer read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented."); - } + @Override + public Integer read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/ExternalIntegerEncoding.java 
b/src/java/htsjdk/samtools/cram/encoding/ExternalIntegerEncoding.java index 70961dd60d..a7c5736689 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ExternalIntegerEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalIntegerEncoding.java @@ -1,63 +1,62 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; import java.io.InputStream; import java.util.Map; - - - public class ExternalIntegerEncoding implements Encoding { - public static final EncodingID encodingId = EncodingID.EXTERNAL ; - public int contentId = -1 ; - - public ExternalIntegerEncoding() { - } - - public static EncodingParams toParam(int contentId) { - ExternalIntegerEncoding e = new ExternalIntegerEncoding() ; - e.contentId = contentId ; - return new EncodingParams(encodingId, e.toByteArray()) ; - } - - public byte[] toByteArray() { - return ByteBufferUtils.writeUnsignedITF8(contentId) ; - } - - public void fromByteArray(byte[] data) { - contentId = ByteBufferUtils.readUnsignedITF8(data) ; - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - InputStream is = inputMap == null ? null : inputMap.get(contentId) ; - ExposedByteArrayOutputStream os = outputMap == null ? null : outputMap.get(contentId) ; - return (BitCodec) new ExternalIntegerCodec(os, is); - } - - @Override - public EncodingID id() { - return encodingId; - } + private static final EncodingID encodingId = EncodingID.EXTERNAL; + private int contentId = -1; + + public ExternalIntegerEncoding() { + } + + public static EncodingParams toParam(final int contentId) { + final ExternalIntegerEncoding externalIntegerEncoding = new ExternalIntegerEncoding(); + externalIntegerEncoding.contentId = contentId; + return new EncodingParams(encodingId, externalIntegerEncoding.toByteArray()); + } + + public byte[] toByteArray() { + return ITF8.writeUnsignedITF8(contentId); + } + + public void fromByteArray(final byte[] data) { + contentId = ITF8.readUnsignedITF8(data); + } + + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + final InputStream inputStream = inputMap == null ? null : inputMap.get(contentId); + final ExposedByteArrayOutputStream outputStream = outputMap == null ? null : outputMap.get(contentId); + return new ExternalIntegerCodec(outputStream, inputStream); + } + + @Override + public EncodingID id() { + return encodingId; + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/ExternalLongCodec.java b/src/java/htsjdk/samtools/cram/encoding/ExternalLongCodec.java index 2924dc48cf..328f9407a0 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ExternalLongCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalLongCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, + * distributed under the License inputStream distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -23,41 +25,41 @@ import java.io.OutputStream; -public class ExternalLongCodec extends AbstractBitCodec { - private OutputStream os; - private InputStream is; - - public ExternalLongCodec(OutputStream os, InputStream is) { - this.os = os; - this.is = is; - } - - @Override - public Long read(BitInputStream bis) throws IOException { - long result = 0; - for (int i = 0; i < 8; i++) { - result <<= 8; - result |= is.read(); - } - return result; - } - - @Override - public long write(BitOutputStream bos, Long value) throws IOException { - for (int i=0; i<8; i++) { - os.write((int) (value & 0xFF)) ; - value >>>= 8; - } - return 64; - } - - @Override - public long numberOfBits(Long object) { - return 8; - } - - @Override - public Long read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented."); - } +class ExternalLongCodec extends AbstractBitCodec { + private final OutputStream outputStream; + private final InputStream inputStream; + + public ExternalLongCodec(final OutputStream outputStream, final InputStream inputStream) { + this.outputStream = outputStream; + this.inputStream = inputStream; + } + + @Override + public Long read(final BitInputStream bitInputStream) throws IOException { + long result = 0; + for (int i = 0; i < 8; i++) { + result <<= 8; + result |= inputStream.read(); + } + return result; + } + + @Override + public long write(final BitOutputStream bitOutputStream, Long value) throws IOException { + for (int i = 0; i < 8; i++) { + outputStream.write((int) (value & 0xFF)); + value >>>= 8; + } + return 64; + } + + @Override + public long numberOfBits(final Long object) { + return 8; + } + + @Override + public Long read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/ExternalLongEncoding.java b/src/java/htsjdk/samtools/cram/encoding/ExternalLongEncoding.java index 860a9339b0..402cea888c 100644 --- a/src/java/htsjdk/samtools/cram/encoding/ExternalLongEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/ExternalLongEncoding.java @@ -1,63 +1,62 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
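For reference, a standalone sketch (plain Java, no htsjdk types, names illustrative) of the byte layout produced by the write() loop above: eight bytes per value, least significant byte first. Note that read() in the same class assembles the bytes most-significant-first, so the two loops walk the bytes in opposite orders.

    final class ExternalLongLayoutDemo {
        // Mirrors the loop in ExternalLongCodec.write(): emit the low byte, then shift.
        static byte[] toBytes(long value) {
            final byte[] bytes = new byte[8];
            for (int i = 0; i < 8; i++) {
                bytes[i] = (byte) (value & 0xFF);
                value >>>= 8;
            }
            return bytes;
        }

        public static void main(final String[] args) {
            // 0x0102030405060708L is written as 08 07 06 05 04 03 02 01.
            for (final byte b : toBytes(0x0102030405060708L)) {
                System.out.printf("%02x ", b);
            }
        }
    }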

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; import java.io.InputStream; import java.util.Map; - - - public class ExternalLongEncoding implements Encoding { - public static final EncodingID encodingId = EncodingID.EXTERNAL ; - public int contentId = -1 ; - - public ExternalLongEncoding() { - } - - public static EncodingParams toParam(int contentId) { - ExternalLongEncoding e = new ExternalLongEncoding() ; - e.contentId = contentId ; - return new EncodingParams(encodingId, e.toByteArray()) ; - } - - public byte[] toByteArray() { - return ByteBufferUtils.writeUnsignedITF8(contentId) ; - } - - public void fromByteArray(byte[] data) { - contentId = ByteBufferUtils.readUnsignedITF8(data) ; - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - InputStream is = inputMap == null ? null : inputMap.get(contentId) ; - ExposedByteArrayOutputStream os = outputMap == null ? null : outputMap.get(contentId) ; - return (BitCodec) new ExternalLongCodec(os, is); - } - - @Override - public EncodingID id() { - return encodingId; - } + private static final EncodingID encodingId = EncodingID.EXTERNAL; + private int contentId = -1; + + public ExternalLongEncoding() { + } + + public static EncodingParams toParam(final int contentId) { + final ExternalLongEncoding externalLongEncoding = new ExternalLongEncoding(); + externalLongEncoding.contentId = contentId; + return new EncodingParams(encodingId, externalLongEncoding.toByteArray()); + } + + public byte[] toByteArray() { + return ITF8.writeUnsignedITF8(contentId); + } + + public void fromByteArray(final byte[] data) { + contentId = ITF8.readUnsignedITF8(data); + } + + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + final InputStream inputStream = inputMap == null ? null : inputMap.get(contentId); + final ExposedByteArrayOutputStream outputStream = outputMap == null ? null : outputMap.get(contentId); + return new ExternalLongCodec(outputStream, inputStream); + } + + @Override + public EncodingID id() { + return encodingId; + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/GammaIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/GammaIntegerCodec.java index a4a503910e..9b6cc7f49d 100644 --- a/src/java/htsjdk/samtools/cram/encoding/GammaIntegerCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/GammaIntegerCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -20,75 +22,51 @@ import java.io.IOException; -public class GammaIntegerCodec extends AbstractBitCodec { - private int offset = 0; - private boolean lenCodingBit = false; - - private GammaIntegerCodec(int offset, boolean lenCodingBit) { - this.offset = offset; - this.lenCodingBit = lenCodingBit; - } - - public GammaIntegerCodec(int offset) { - this(offset, false); - } - - public GammaIntegerCodec() { - this(0, false); - } - - @Override - public final Integer read(BitInputStream bis) throws IOException { - int len = 1; - while (bis.readBit() == lenCodingBit) - len++; - int readBits = bis.readBits(len - 1); - int value = readBits | 1 << (len - 1); - return value - offset; - } - - @Override - public final long write(BitOutputStream bos, Integer value) throws IOException { - if (value + offset < 1) - throw new IllegalArgumentException("Gamma codec handles only positive values: " + value); - - long newValue = value + offset; - int betaCodeLength = 1 + (int) (Math.log(newValue) / Math.log(2)); - if (betaCodeLength > 1) - bos.write(0L, betaCodeLength - 1); - - bos.write(newValue, betaCodeLength); - return betaCodeLength * 2 - 1; - } - - @Override - public final long numberOfBits(Integer value) { - long newValue = value + offset; - if (newValue < 1) - throw new RuntimeException("Invalid valid: " + newValue); - int betaCodeLength = 1 + (int) (Math.log(newValue) / Math.log(2)); - return betaCodeLength * 2 - 1; - } - - public int getOffset() { - return offset; - } - - public boolean isLenCodingBit() { - return lenCodingBit; - } - - public void setOffset(int offset) { - this.offset = offset; - } - - public void setLenCodingBit(boolean lenCodingBit) { - this.lenCodingBit = lenCodingBit; - } - - @Override - public Integer read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented."); - } +class GammaIntegerCodec extends AbstractBitCodec { + private int offset = 0; + + public GammaIntegerCodec(final int offset) { + this.offset = offset; + } + + @Override + public final Integer read(final BitInputStream bitInputStream) throws IOException { + int length = 1; + final boolean lenCodingBit = false; + //noinspection ConstantConditions,PointlessBooleanExpression + while (bitInputStream.readBit() == lenCodingBit) + length++; + final int readBits = bitInputStream.readBits(length - 1); + final int value = readBits | 1 << (length - 1); + return value - offset; + } + + @Override + public final long write(final BitOutputStream bitOutputStream, final Integer value) throws IOException { + if (value + offset < 1) + throw new IllegalArgumentException("Gamma codec handles only positive values: " + value); + + final long newValue = value + offset; + final int betaCodeLength = 1 + (int) (Math.log(newValue) / Math.log(2)); + if (betaCodeLength > 1) + bitOutputStream.write(0L, betaCodeLength - 1); + + bitOutputStream.write(newValue, betaCodeLength); + return betaCodeLength * 2 - 1; + } + + @Override + public final long 
numberOfBits(final Integer value) { + final long newValue = value + offset; + if (newValue < 1) + throw new RuntimeException("Invalid valid: " + newValue); + final int betaCodeLength = 1 + (int) (Math.log(newValue) / Math.log(2)); + return betaCodeLength * 2 - 1; + } + + @Override + public Integer read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/GammaIntegerEncoding.java b/src/java/htsjdk/samtools/cram/encoding/GammaIntegerEncoding.java index 6138c3f599..00c38a2a94 100644 --- a/src/java/htsjdk/samtools/cram/encoding/GammaIntegerEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/GammaIntegerEncoding.java @@ -1,22 +1,24 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
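The codec above writes the classic Elias gamma code: for an offset-adjusted value v >= 1 it emits floor(log2 v) zero bits followed by v itself in floor(log2 v) + 1 bits, for 2 * floor(log2 v) + 1 bits in total, which is exactly what numberOfBits() returns. A small standalone sketch (plain Java, illustrative names) of the same bit pattern:

    final class GammaCodeDemo {
        // Returns the Elias gamma bit string for value >= 1.
        static String gamma(final int value) {
            final int betaCodeLength = 1 + (int) (Math.log(value) / Math.log(2));
            final StringBuilder bits = new StringBuilder();
            for (int i = 1; i < betaCodeLength; i++) bits.append('0');   // unary length prefix
            for (int i = betaCodeLength - 1; i >= 0; i--)                // the value, MSB first
                bits.append((value >> i) & 1);
            return bits.toString();
        }

        public static void main(final String[] args) {
            System.out.println(gamma(1));  // "1"
            System.out.println(gamma(5));  // "00101": two zeros, then 101
        }
    }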

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; @@ -24,51 +26,48 @@ import java.nio.ByteBuffer; import java.util.Map; - - - public class GammaIntegerEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.GAMMA; - private int offset ; + private static final EncodingID ENCODING_ID = EncodingID.GAMMA; + private int offset; - public GammaIntegerEncoding() { - this (0) ; - } + public GammaIntegerEncoding() { + this(0); + } - public GammaIntegerEncoding(int offset) { - this.offset = offset; - } + public GammaIntegerEncoding(final int offset) { + this.offset = offset; + } - @Override - public EncodingID id() { - return ENCODING_ID; - } + @Override + public EncodingID id() { + return ENCODING_ID; + } - public static EncodingParams toParam(int offset) { - GammaIntegerEncoding e = new GammaIntegerEncoding(); - e.offset = offset ; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } + public static EncodingParams toParam(final int offset) { + final GammaIntegerEncoding gammaIntegerEncoding = new GammaIntegerEncoding(); + gammaIntegerEncoding.offset = offset; + return new EncodingParams(ENCODING_ID, gammaIntegerEncoding.toByteArray()); + } - @Override - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(10); - ByteBufferUtils.writeUnsignedITF8(offset, buf); - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } + @Override + public byte[] toByteArray() { + final ByteBuffer buffer = ByteBuffer.allocate(10); + ITF8.writeUnsignedITF8(offset, buffer); + buffer.flip(); + final byte[] array = new byte[buffer.limit()]; + buffer.get(array); + return array; + } - @Override - public void fromByteArray(byte[] data) { - offset = ByteBufferUtils.readUnsignedITF8(data) ; - } + @Override + public void fromByteArray(final byte[] data) { + offset = ITF8.readUnsignedITF8(data); + } - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new GammaIntegerCodec(offset); - } + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + return new GammaIntegerCodec(offset); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/GolombIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/GolombIntegerCodec.java index 44e658d705..1a15efa86c 100644 --- a/src/java/htsjdk/samtools/cram/encoding/GolombIntegerCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/GolombIntegerCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -21,107 +23,79 @@ import java.io.IOException; -public class GolombIntegerCodec extends AbstractBitCodec { - private int m; - private boolean quotientBit = true; - private int offset = 0; - - public GolombIntegerCodec(int m) { - this(m, true, 0); - } - - public GolombIntegerCodec(int m, boolean quotientBit, Integer offset) { - if (m < 2) - throw new IllegalArgumentException( - "M parameter must be at least 2."); - this.m = m; - this.quotientBit = quotientBit; - this.offset = offset; - } - - @Override - public final Integer read(final BitInputStream bis) throws IOException { - int quotient = 0; - while (bis.readBit() == quotientBit) - quotient++; - - int ceiling = (int) (Math.log(m) / Math.log(2) + 1); - int reminder = bis.readBits((int) (ceiling - 1)); - if (reminder >= Math.pow(2, ceiling) - m) { - reminder <<= 1; - reminder |= bis.readBits(1); - reminder -= Math.pow(2, ceiling) - m; - } - - return (quotient * m + reminder) - offset; - } - - @Override - public final long write(final BitOutputStream bos, final Integer value) - throws IOException { - int newValue = value + offset; - int quotient = (int) (newValue / m); - int reminder = newValue % m; - int ceiling = (int) (Math.log(m) / Math.log(2) + 1); - - int len = quotient + 1; - bos.write(quotientBit, quotient); - bos.write(!quotientBit); - - if (reminder < Math.pow(2, ceiling) - m) { - bos.write(reminder, (int) ceiling - 1); - len += ceiling - 1; - } else { - bos.write((int) (reminder + Math.pow(2, ceiling) - m), - (int) ceiling); - len += ceiling; - } - return len; - } - - @Override - public final long numberOfBits(Integer value) { - int newValue = value + offset; - int quotient = (int) (newValue / m); - int reminder = newValue % m; - int ceiling = (int) (Math.log(m) / Math.log(2) + 1); - int l = quotient + 1; - - if (reminder < Math.pow(2, ceiling) - m) - l += ceiling - 1; - else - l += ceiling; - - return l; - } - - public long getM() { - return m; - } - - public boolean isQuotientBit() { - return quotientBit; - } - - public Integer getOffset() { - return offset; - } - - public void setM(int m) { - this.m = m; - } - - public void setQuotientBit(boolean quotientBit) { - this.quotientBit = quotientBit; - } - - public void setOffset(Integer offset) { - this.offset = offset; - } - - @Override - public Integer read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Multi-value read method not defined."); - } +class GolombIntegerCodec extends AbstractBitCodec { + private int m; + private boolean quotientBit = true; + private int offset = 0; + + public GolombIntegerCodec(final int m, final Integer offset) { + if (m < 2) + throw new IllegalArgumentException( + "M parameter must be at least 2."); + this.m = m; + this.quotientBit = true; + this.offset = offset; + } + + @Override + public final Integer read(final BitInputStream bitInputStream) throws IOException { + int quotient = 0; + while (bitInputStream.readBit() == quotientBit) + quotient++; + + 
final int ceiling = (int) (Math.log(m) / Math.log(2) + 1); + int reminder = bitInputStream.readBits(ceiling - 1); + if (reminder >= Math.pow(2, ceiling) - m) { + reminder <<= 1; + reminder |= bitInputStream.readBits(1); + reminder -= Math.pow(2, ceiling) - m; + } + + return (quotient * m + reminder) - offset; + } + + @Override + public final long write(final BitOutputStream bitOutputStream, final Integer value) + throws IOException { + final int newValue = value + offset; + final int quotient = newValue / m; + final int reminder = newValue % m; + final int ceiling = (int) (Math.log(m) / Math.log(2) + 1); + + int length = quotient + 1; + bitOutputStream.write(quotientBit, quotient); + bitOutputStream.write(!quotientBit); + + if (reminder < Math.pow(2, ceiling) - m) { + bitOutputStream.write(reminder, ceiling - 1); + length += ceiling - 1; + } else { + bitOutputStream.write((int) (reminder + Math.pow(2, ceiling) - m), + ceiling); + length += ceiling; + } + return length; + } + + @Override + public final long numberOfBits(final Integer value) { + final int newValue = value + offset; + final int quotient = newValue / m; + final int reminder = newValue % m; + final int ceiling = (int) (Math.log(m) / Math.log(2) + 1); + int l = quotient + 1; + + if (reminder < Math.pow(2, ceiling) - m) + l += ceiling - 1; + else + l += ceiling; + + return l; + } + + @Override + public Integer read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Multi-value read method not defined."); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/GolombIntegerEncoding.java b/src/java/htsjdk/samtools/cram/encoding/GolombIntegerEncoding.java index e94aff7dfa..2332b5ebfb 100644 --- a/src/java/htsjdk/samtools/cram/encoding/GolombIntegerEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/GolombIntegerEncoding.java @@ -1,22 +1,24 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
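A worked example may help with the quotient/remainder arithmetic above. With offset 0 and m = 5 the codec writes the quotient in unary (runs of quotientBit), one stop bit, and the remainder in truncated binary: remainders below 2^ceiling - m take ceiling - 1 bits, the rest take ceiling bits. A standalone sketch (plain Java, illustrative names) of the same length calculation that numberOfBits() performs:

    final class GolombLengthDemo {
        static long golombBitLength(final int value, final int m, final int offset) {
            final int v = value + offset;
            final int quotient = v / m;
            final int remainder = v % m;
            final int ceiling = (int) (Math.log(m) / Math.log(2) + 1);
            // unary quotient + stop bit + truncated-binary remainder
            return quotient + 1 + (remainder < Math.pow(2, ceiling) - m ? ceiling - 1 : ceiling);
        }

        public static void main(final String[] args) {
            // With m = 5, remainders 0..2 take 2 bits and 3..4 take 3 bits,
            // so 7 = 1 * 5 + 2 costs 1 (unary) + 1 (stop) + 2 = 4 bits.
            System.out.println(golombBitLength(7, 5, 0));  // prints 4
        }
    }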

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; @@ -24,63 +26,55 @@ import java.nio.ByteBuffer; import java.util.Map; - - - public class GolombIntegerEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.GOLOMB; - private int m; - private int offset; + private static final EncodingID ENCODING_ID = EncodingID.GOLOMB; + private int m; + private int offset; + + public GolombIntegerEncoding() { + } - public GolombIntegerEncoding() { - } - - public GolombIntegerEncoding(int m) { - this.m = m; - this.offset = 0 ; - } + @Override + public EncodingID id() { + return ENCODING_ID; + } - @Override - public EncodingID id() { - return ENCODING_ID; - } + public static EncodingParams toParam(final int m) { + final GolombIntegerEncoding golombIntegerEncoding = new GolombIntegerEncoding(); + golombIntegerEncoding.m = m; + golombIntegerEncoding.offset = 0; + return new EncodingParams(ENCODING_ID, golombIntegerEncoding.toByteArray()); + } - public static EncodingParams toParam(int m) { - GolombIntegerEncoding e = new GolombIntegerEncoding(); - e.m = m; - e.offset = 0 ; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } - - public static EncodingParams toParam(int m, int offset) { - GolombIntegerEncoding e = new GolombIntegerEncoding(); - e.m = m; - e.offset = offset ; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } + public static EncodingParams toParam(final int m, final int offset) { + final GolombIntegerEncoding e = new GolombIntegerEncoding(); + e.m = m; + e.offset = offset; + return new EncodingParams(ENCODING_ID, e.toByteArray()); + } - @Override - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(10); - ByteBufferUtils.writeUnsignedITF8(offset, buf); - ByteBufferUtils.writeUnsignedITF8(m, buf); - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } + @Override + public byte[] toByteArray() { + final ByteBuffer buffer = ByteBuffer.allocate(10); + ITF8.writeUnsignedITF8(offset, buffer); + ITF8.writeUnsignedITF8(m, buffer); + buffer.flip(); + final byte[] array = new byte[buffer.limit()]; + buffer.get(array); + return array; + } - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data) ; - offset = ByteBufferUtils.readUnsignedITF8(buf); - m = ByteBufferUtils.readUnsignedITF8(buf); - } + @Override + public void fromByteArray(final byte[] data) { + final ByteBuffer buffer = ByteBuffer.wrap(data); + offset = ITF8.readUnsignedITF8(buffer); + m = ITF8.readUnsignedITF8(buffer); + } - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new GolombIntegerCodec(m, true, offset); - } + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + return new GolombIntegerCodec(m, offset); + } } 
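The toByteArray() implementations above all share one pattern: write into an over-sized ByteBuffer, flip() it, and copy out exactly limit() bytes. A minimal standalone sketch of that pattern (plain Java; the two put() calls merely stand in for ITF8.writeUnsignedITF8() and are not the real ITF8 wire format):

    import java.nio.ByteBuffer;

    final class ToByteArrayPatternDemo {
        static byte[] pack(final int offset, final int m) {
            final ByteBuffer buffer = ByteBuffer.allocate(10);
            buffer.put((byte) offset);  // placeholder for ITF8.writeUnsignedITF8(offset, buffer)
            buffer.put((byte) m);       // placeholder for ITF8.writeUnsignedITF8(m, buffer)
            buffer.flip();              // limit = bytes written, position = 0
            final byte[] array = new byte[buffer.limit()];
            buffer.get(array);
            return array;
        }

        public static void main(final String[] args) {
            System.out.println(pack(0, 10).length);  // 2: only the bytes actually written are returned
        }
    }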
diff --git a/src/java/htsjdk/samtools/cram/encoding/GolombLongCodec.java b/src/java/htsjdk/samtools/cram/encoding/GolombLongCodec.java index 1e727ee08e..ef0c12cfe3 100644 --- a/src/java/htsjdk/samtools/cram/encoding/GolombLongCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/GolombLongCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -21,110 +23,78 @@ import java.io.IOException; -public class GolombLongCodec extends AbstractBitCodec { - private int m; - private boolean quotientBit = true; - private long offset = 0L; - - public GolombLongCodec(int m) { - this(0, m, true); - } - - public GolombLongCodec(long offset, int m) { - this(offset, m, true); - } - - public GolombLongCodec(long offset, int m, boolean quotientBit) { - if (m < 2) - throw new IllegalArgumentException( - "M parameter must be at least 2."); - this.m = m; - this.quotientBit = quotientBit; - this.offset = offset; - } - - @Override - public final Long read(final BitInputStream bis) throws IOException { - long quotient = 0L; - while (bis.readBit() == quotientBit) - quotient++; - - long ceiling = (long) (Math.log(m) / Math.log(2) + 1); - long reminder = bis.readBits((int) (ceiling - 1)); - if (reminder >= Math.pow(2, ceiling) - m) { - reminder <<= 1; - reminder |= bis.readBits(1); - reminder -= Math.pow(2, ceiling) - m; - } - - return (quotient * m + reminder) - offset; - } - - @Override - public final long write(final BitOutputStream bos, final Long value) - throws IOException { - long newValue = value + offset; - long quotient = (long) (newValue / m); - long reminder = newValue % m; - long ceiling = (long) (Math.log(m) / Math.log(2) + 1); - - long len = quotient + 1; - bos.write(quotientBit, quotient); - bos.write(!quotientBit); - - if (reminder < Math.pow(2, ceiling) - m) { - bos.write(reminder, (int) ceiling - 1); - len += ceiling - 1; - } else { - bos.write((int) (reminder + Math.pow(2, ceiling) - m), - (int) ceiling); - len += ceiling; - } - return len; - } - - @Override - public final long numberOfBits(Long value) { - long newValue = value + offset; - long quotient = (long) (newValue / m); - long reminder = newValue % m; - long ceiling = (long) (Math.log(m) / Math.log(2) + 1); - long l = quotient + 1; - - if (reminder < Math.pow(2, ceiling) - m) - l += ceiling - 1; - else - l += ceiling; - - return l; - } - - public int getM() { - return m; - } - - public boolean isQuotientBit() { - return quotientBit; - } - - public Long getOffset() { - return offset; - } - - public void setM(int m) { - this.m = m; - } - - public void setQuotientBit(boolean quotientBit) { - this.quotientBit = quotientBit; - } - - public void setOffset(Long offset) { - this.offset = offset; - } - - @Override - public Long read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Multi-value read method not defined."); - } +class GolombLongCodec extends AbstractBitCodec { + private int m; + private boolean quotientBit = true; + private long offset = 0L; + + public GolombLongCodec(final long offset, final int m) { + if (m < 2) + throw new IllegalArgumentException( + "M parameter must be at least 2."); + this.m = m; + this.quotientBit = true; + this.offset = offset; + } + + @Override + public final Long read(final BitInputStream bitInputStream) throws IOException { + long quotient = 0L; + while 
(bitInputStream.readBit() == quotientBit) + quotient++; + + final long ceiling = (long) (Math.log(m) / Math.log(2) + 1); + long reminder = bitInputStream.readBits((int) (ceiling - 1)); + if (reminder >= Math.pow(2, ceiling) - m) { + reminder <<= 1; + reminder |= bitInputStream.readBits(1); + reminder -= Math.pow(2, ceiling) - m; + } + + return (quotient * m + reminder) - offset; + } + + @Override + public final long write(final BitOutputStream bitOutputStream, final Long value) + throws IOException { + final long newValue = value + offset; + final long quotient = newValue / m; + final long reminder = newValue % m; + final long ceiling = (long) (Math.log(m) / Math.log(2) + 1); + + long length = quotient + 1; + bitOutputStream.write(quotientBit, quotient); + bitOutputStream.write(!quotientBit); + + if (reminder < Math.pow(2, ceiling) - m) { + bitOutputStream.write(reminder, (int) ceiling - 1); + length += ceiling - 1; + } else { + bitOutputStream.write((int) (reminder + Math.pow(2, ceiling) - m), + (int) ceiling); + length += ceiling; + } + return length; + } + + @Override + public final long numberOfBits(final Long value) { + final long newValue = value + offset; + final long quotient = newValue / m; + final long reminder = newValue % m; + final long ceiling = (long) (Math.log(m) / Math.log(2) + 1); + long l = quotient + 1; + + if (reminder < Math.pow(2, ceiling) - m) + l += ceiling - 1; + else + l += ceiling; + + return l; + } + + @Override + public Long read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Multi-value read method not defined."); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/GolombLongEncoding.java b/src/java/htsjdk/samtools/cram/encoding/GolombLongEncoding.java index 2b17efb1d8..eabec1bfe5 100644 --- a/src/java/htsjdk/samtools/cram/encoding/GolombLongEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/GolombLongEncoding.java @@ -1,22 +1,24 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; @@ -24,51 +26,48 @@ import java.nio.ByteBuffer; import java.util.Map; - - - public class GolombLongEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.GOLOMB; - private int m; - private int offset; + private static final EncodingID ENCODING_ID = EncodingID.GOLOMB; + private int m; + private int offset; - public GolombLongEncoding() { - } + public GolombLongEncoding() { + } - @Override - public EncodingID id() { - return ENCODING_ID; - } + @Override + public EncodingID id() { + return ENCODING_ID; + } - public static EncodingParams toParam(int offset, int m) { - GolombLongEncoding e = new GolombLongEncoding(); - e.offset = offset; - e.m = m; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } + public static EncodingParams toParam(final int offset, final int m) { + final GolombLongEncoding golombLongEncoding = new GolombLongEncoding(); + golombLongEncoding.offset = offset; + golombLongEncoding.m = m; + return new EncodingParams(ENCODING_ID, golombLongEncoding.toByteArray()); + } - @Override - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(10); - ByteBufferUtils.writeUnsignedITF8(offset, buf); - ByteBufferUtils.writeUnsignedITF8(m, buf); - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } + @Override + public byte[] toByteArray() { + final ByteBuffer buffer = ByteBuffer.allocate(10); + ITF8.writeUnsignedITF8(offset, buffer); + ITF8.writeUnsignedITF8(m, buffer); + buffer.flip(); + final byte[] array = new byte[buffer.limit()]; + buffer.get(array); + return array; + } - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - offset = ByteBufferUtils.readUnsignedITF8(buf); - m = ByteBufferUtils.readUnsignedITF8(buf); - } + @Override + public void fromByteArray(final byte[] data) { + final ByteBuffer buffer = ByteBuffer.wrap(data); + offset = ITF8.readUnsignedITF8(buffer); + m = ITF8.readUnsignedITF8(buffer); + } - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new GolombLongCodec(offset, m, true); - } + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + return new GolombLongCodec(offset, m); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/GolombRiceIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/GolombRiceIntegerCodec.java index 845ff3c17a..e5962a152a 100644 --- a/src/java/htsjdk/samtools/cram/encoding/GolombRiceIntegerCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/GolombRiceIntegerCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 
2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -21,99 +23,65 @@ import java.io.IOException; -public class GolombRiceIntegerCodec extends AbstractBitCodec { - private int m; - private int log2m; - private long mask ; - private boolean quotientBit = false; - private int offset = 0; - - public GolombRiceIntegerCodec(int log2m) { - this (0, log2m) ; - } - - public GolombRiceIntegerCodec(int offset, int log2m) { - this (offset, log2m, false) ; - } - - public GolombRiceIntegerCodec(int offset, int log2m, boolean quotientBit) { - this.log2m = log2m; - m = 1 << log2m; - this.quotientBit = quotientBit; - this.offset = offset; - mask = ~(~0 << log2m) ; - } - - public final Integer read(final BitInputStream bis) throws IOException { - - int unary = 0; - while (bis.readBit() == quotientBit) - unary++; - - int remainder = bis.readBits(log2m); - - int result = unary * m + remainder; - return result - offset; - } - - @Override - public final long write(final BitOutputStream bos, final Integer value) throws IOException { - long newValue = value + offset; - long quotient = newValue >>> log2m; - if (quotient > 0x7fffffffL) - for (long i = 0; i < quotient; i++) - bos.write(quotientBit); - - else if (quotient > 0) { - final int qi = (int) quotient; - for (int i = 0; i < qi; i++) - bos.write(quotientBit); - } - bos.write(!quotientBit); - long remainder = newValue & mask; - long reminderMask = 1 << (log2m - 1); - for (int i = log2m - 1; i >= 0; i--) { - final long b = remainder & reminderMask; - bos.write(b != 0L); - reminderMask >>>= 1; - } - long bits = quotient + 1 + log2m; - return bits; - } - - @Override - public final long numberOfBits(Integer value) { - return (value + offset) / m + 1 + log2m; - } - - public int getLog2m() { - return log2m; - } - - public void setLog2m(int log2m) { - this.log2m = log2m; - m = 1 << log2m; - } - - public boolean isQuotientBit() { - return quotientBit; - } - - public void setQuotientBit(boolean quotientBit) { - this.quotientBit = quotientBit; - } - - public int getOffset() { - return offset; - } - - public void setOffset(int offset) { - this.offset = offset; - } - - @Override - public Integer read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented.") ; - } +class GolombRiceIntegerCodec extends AbstractBitCodec { + private final int m; + private final int log2m; + private final long mask; + private boolean quotientBit = false; + private int offset = 0; + + public GolombRiceIntegerCodec(final int offset, final int log2m) { + this.log2m = log2m; + m = 1 << log2m; + this.quotientBit = true; + this.offset = offset; + mask = ~(~0 << log2m); + } + + public final Integer read(final BitInputStream bitInputStream) throws IOException { + + int unary = 0; + while (bitInputStream.readBit() == quotientBit) + unary++; + + final int remainder = bitInputStream.readBits(log2m); + + final int result = unary * m + remainder; + return result - offset; + } + + @Override + public final long write(final BitOutputStream bitOutputStream, 
final Integer value) throws IOException { + final long newValue = value + offset; + final long quotient = newValue >>> log2m; + if (quotient > 0x7fffffffL) + for (long i = 0; i < quotient; i++) + bitOutputStream.write(quotientBit); + + else if (quotient > 0) { + final int qi = (int) quotient; + for (int i = 0; i < qi; i++) + bitOutputStream.write(quotientBit); + } + bitOutputStream.write(!quotientBit); + final long remainder = newValue & mask; + long reminderMask = 1 << (log2m - 1); + for (int i = log2m - 1; i >= 0; i--) { + final long b = remainder & reminderMask; + bitOutputStream.write(b != 0L); + reminderMask >>>= 1; + } + return quotient + 1 + log2m; + } + + @Override + public final long numberOfBits(final Integer value) { + return (value + offset) / m + 1 + log2m; + } + + @Override + public Integer read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/GolombRiceIntegerEncoding.java b/src/java/htsjdk/samtools/cram/encoding/GolombRiceIntegerEncoding.java index 7a9f11ccbf..8f8d27beb1 100644 --- a/src/java/htsjdk/samtools/cram/encoding/GolombRiceIntegerEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/GolombRiceIntegerEncoding.java @@ -1,22 +1,24 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
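The Rice codec above is the power-of-two special case of Golomb coding: with m = 2^log2m the remainder is simply the low log2m bits, so a value costs (value + offset) / m unary bits, one stop bit, and log2m remainder bits, exactly as numberOfBits() returns. A standalone sketch (plain Java, illustrative names):

    final class RiceCodeDemo {
        static long riceBitLength(final int value, final int offset, final int log2m) {
            final int m = 1 << log2m;                        // Rice parameter, always a power of two
            return ((long) (value + offset)) / m + 1 + log2m;
        }

        public static void main(final String[] args) {
            // log2m = 3 (m = 8): value 21 = 2 * 8 + 5 costs 2 + 1 + 3 = 6 bits.
            System.out.println(riceBitLength(21, 0, 3));     // prints 6
        }
    }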

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; -import htsjdk.samtools.cram.io.ByteBufferUtils; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; import htsjdk.samtools.cram.structure.EncodingID; import htsjdk.samtools.cram.structure.EncodingParams; @@ -24,55 +26,48 @@ import java.nio.ByteBuffer; import java.util.Map; - - - public class GolombRiceIntegerEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.GOLOMB_RICE; - private int offset; - private int m; + private static final EncodingID ENCODING_ID = EncodingID.GOLOMB_RICE; + private int offset; + private int m; - public GolombRiceIntegerEncoding() { - } - - public GolombRiceIntegerEncoding(int m) { - this.m = m; - } + public GolombRiceIntegerEncoding() { + } - @Override - public EncodingID id() { - return ENCODING_ID; - } + @Override + public EncodingID id() { + return ENCODING_ID; + } - public static EncodingParams toParam(int offset, int m) { - GolombRiceIntegerEncoding e = new GolombRiceIntegerEncoding(); - e.offset= offset; - e.m = m; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } + public static EncodingParams toParam(final int offset, final int m) { + final GolombRiceIntegerEncoding golombRiceIntegerEncoding = new GolombRiceIntegerEncoding(); + golombRiceIntegerEncoding.offset = offset; + golombRiceIntegerEncoding.m = m; + return new EncodingParams(ENCODING_ID, golombRiceIntegerEncoding.toByteArray()); + } - @Override - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(10); - ByteBufferUtils.writeUnsignedITF8(offset, buf); - ByteBufferUtils.writeUnsignedITF8(m, buf); - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } + @Override + public byte[] toByteArray() { + final ByteBuffer buffer = ByteBuffer.allocate(10); + ITF8.writeUnsignedITF8(offset, buffer); + ITF8.writeUnsignedITF8(m, buffer); + buffer.flip(); + final byte[] array = new byte[buffer.limit()]; + buffer.get(array); + return array; + } - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data) ; - offset = ByteBufferUtils.readUnsignedITF8(buf); - m = ByteBufferUtils.readUnsignedITF8(buf); - } + @Override + public void fromByteArray(final byte[] data) { + final ByteBuffer buffer = ByteBuffer.wrap(data); + offset = ITF8.readUnsignedITF8(buffer); + m = ITF8.readUnsignedITF8(buffer); + } - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new GolombRiceIntegerCodec(offset, m, true); - } + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + return new GolombRiceIntegerCodec(offset, m); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/HuffmanByteEncoding.java b/src/java/htsjdk/samtools/cram/encoding/HuffmanByteEncoding.java deleted file mode 100644 index 15daba8796..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/HuffmanByteEncoding.java +++ /dev/null @@ -1,88 +0,0 @@ 
-/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.encoding.huffint.CanonicalHuffmanByteCodec2; -import htsjdk.samtools.cram.io.ByteBufferUtils; -import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; -import htsjdk.samtools.cram.structure.EncodingID; -import htsjdk.samtools.cram.structure.EncodingParams; - -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.util.Map; - -public class HuffmanByteEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.HUFFMAN; - private int[] bitLengths; - private byte[] values; - private ByteBuffer buf = ByteBuffer.allocate(1024); - - public HuffmanByteEncoding() { - } - - @Override - public EncodingID id() { - return ENCODING_ID; - } - - @Override - public byte[] toByteArray() { - buf.clear(); - if (buf.capacity() < values.length * 8) - buf = ByteBuffer.allocate(values.length * 8); - - ByteBufferUtils.writeUnsignedITF8(values.length, buf); - for (byte value : values) - buf.put(value); - - ByteBufferUtils.writeUnsignedITF8(bitLengths.length, buf); - for (int value : bitLengths) - ByteBufferUtils.writeUnsignedITF8(value, buf); - - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } - - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - int size = ByteBufferUtils.readUnsignedITF8(buf); - values = new byte[size]; - buf.get(values); - - size = ByteBufferUtils.readUnsignedITF8(buf); - bitLengths = new int[size]; - for (int i = 0; i < size; i++) - bitLengths[i] = ByteBufferUtils.readUnsignedITF8(buf); - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new CanonicalHuffmanByteCodec2(values, bitLengths); - } - - public static EncodingParams toParam(byte[] bfValues, int[] bfBitLens) { - HuffmanByteEncoding e = new HuffmanByteEncoding(); - e.values = bfValues; - e.bitLengths = bfBitLens; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } - -} diff --git a/src/java/htsjdk/samtools/cram/encoding/HuffmanEncoding.java b/src/java/htsjdk/samtools/cram/encoding/HuffmanEncoding.java deleted file mode 100644 index de28e0d5f0..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/HuffmanEncoding.java +++ /dev/null @@ -1,88 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.io.ByteBufferUtils; -import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; -import htsjdk.samtools.cram.structure.EncodingID; -import htsjdk.samtools.cram.structure.EncodingParams; - -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.util.Map; - - - - -public class HuffmanEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.HUFFMAN; - private int[] bitLengths; - private int[] values; - - public HuffmanEncoding() { - } - - @Override - public EncodingID id() { - return ENCODING_ID; - } - - @Override - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(1024); - ByteBufferUtils.writeUnsignedITF8(values.length, buf); - for (int value : values) - ByteBufferUtils.writeUnsignedITF8(value, buf); - - ByteBufferUtils.writeUnsignedITF8(bitLengths.length, buf); - for (int value : bitLengths) - ByteBufferUtils.writeUnsignedITF8(value, buf); - - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } - - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - int size = ByteBufferUtils.readUnsignedITF8(buf); - values = new int[size]; - - for (int i = 0; i < size; i++) - values[i] = ByteBufferUtils.readUnsignedITF8(buf); - - size = ByteBufferUtils.readUnsignedITF8(buf); - bitLengths = new int[size]; - for (int i = 0; i < size; i++) - bitLengths[i] = ByteBufferUtils.readUnsignedITF8(buf); - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new CanonicalHuffmanIntegerCodec(values, bitLengths); - } - - public static EncodingParams toParam(int[] bfValues, int[] bfBitLens) { - HuffmanEncoding e = new HuffmanEncoding(); - e.values = bfValues; - e.bitLengths = bfBitLens; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } - -} diff --git a/src/java/htsjdk/samtools/cram/encoding/HuffmanIntegerEncoding.java b/src/java/htsjdk/samtools/cram/encoding/HuffmanIntegerEncoding.java deleted file mode 100644 index 814e81e0ce..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/HuffmanIntegerEncoding.java +++ /dev/null @@ -1,101 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.encoding.huffint.CanonicalHuffmanIntegerCodec2; -import htsjdk.samtools.cram.io.ByteBufferUtils; -import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; -import htsjdk.samtools.cram.structure.EncodingID; -import htsjdk.samtools.cram.structure.EncodingParams; - -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.util.Arrays; -import java.util.Map; - -public class HuffmanIntegerEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.HUFFMAN; - int[] bitLengths; - int[] values; - ByteBuffer buf = ByteBuffer.allocate(1024 * 10); - - public HuffmanIntegerEncoding() { - } - - @Override - public EncodingID id() { - return ENCODING_ID; - } - - @Override - public byte[] toByteArray() { - buf.clear(); - ByteBufferUtils.writeUnsignedITF8(values.length, buf); - for (int value : values) - ByteBufferUtils.writeUnsignedITF8(value, buf); - - ByteBufferUtils.writeUnsignedITF8(bitLengths.length, buf); - for (int value : bitLengths) - ByteBufferUtils.writeUnsignedITF8(value, buf); - - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } - - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - int size = ByteBufferUtils.readUnsignedITF8(buf); - values = new int[size]; - - for (int i = 0; i < size; i++) - values[i] = ByteBufferUtils.readUnsignedITF8(buf); - - size = ByteBufferUtils.readUnsignedITF8(buf); - bitLengths = new int[size]; - for (int i = 0; i < size; i++) - bitLengths[i] = ByteBufferUtils.readUnsignedITF8(buf); - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new CanonicalHuffmanIntegerCodec2(values, bitLengths); - } - - public static EncodingParams toParam(int[] bfValues, int[] bfBitLens) { - HuffmanIntegerEncoding e = new HuffmanIntegerEncoding(); - e.values = bfValues; - e.bitLengths = bfBitLens; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } - - @Override - public boolean equals(Object obj) { - if (obj instanceof HuffmanIntegerEncoding) { - HuffmanIntegerEncoding foe = (HuffmanIntegerEncoding) obj; - if (!Arrays.equals(bitLengths, foe.bitLengths)) - return false; - if (!Arrays.equals(values, foe.values)) - return false; - - return true; - } - return false; - } -} diff --git a/src/java/htsjdk/samtools/cram/encoding/NullCodec.java b/src/java/htsjdk/samtools/cram/encoding/NullCodec.java index 4cecccf59e..6f1038c487 100644 --- a/src/java/htsjdk/samtools/cram/encoding/NullCodec.java +++ b/src/java/htsjdk/samtools/cram/encoding/NullCodec.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *
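The three deleted Huffman encodings above serialize nothing but the symbol values and their code bit lengths; the codec rebuilds the actual codes canonically. A standalone sketch of the standard canonical assignment (plain Java, not a copy of the htsjdk codec), assuming the symbols are listed in non-decreasing bit-length order:

    import java.util.Arrays;

    final class CanonicalHuffmanDemo {
        public static void main(final String[] args) {
            final int[] values = {10, 20, 30, 40};           // illustrative symbols
            final int[] bitLengths = {1, 2, 3, 3};           // must describe a valid prefix code
            long code = 0;
            int previousLength = bitLengths[0];
            for (int i = 0; i < values.length; i++) {
                code <<= (bitLengths[i] - previousLength);   // moving to a longer code widens it
                System.out.printf("value %d -> %s%n", values[i],
                        pad(Long.toBinaryString(code), bitLengths[i]));
                previousLength = bitLengths[i];
                code++;                                      // codes of equal length are consecutive
            }
            // Prints codes 0, 10, 110, 111 -- a prefix-free canonical code.
        }

        private static String pad(final String bits, final int length) {
            final char[] zeros = new char[length - bits.length()];
            Arrays.fill(zeros, '0');
            return new String(zeros) + bits;
        }
    }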

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.BitInputStream; @@ -21,30 +23,30 @@ import java.io.IOException; -public class NullCodec extends AbstractBitCodec { - private T defaultValue = null ; +class NullCodec extends AbstractBitCodec { + private final T defaultValue = null; - public NullCodec() { - } + public NullCodec() { + } - @Override - public T read(BitInputStream bis) throws IOException { - return defaultValue; - } + @Override + public T read(final BitInputStream bitInputStream) throws IOException { + return defaultValue; + } - @Override - public T read(BitInputStream bis, int len) throws IOException { - return defaultValue; - } + @Override + public T read(final BitInputStream bitInputStream, final int length) throws IOException { + return defaultValue; + } - @Override - public long write(BitOutputStream bos, T object) throws IOException { - return 0; - } + @Override + public long write(final BitOutputStream bitOutputStream, final T object) throws IOException { + return 0; + } - @Override - public long numberOfBits(T object) { - return 0; - } + @Override + public long numberOfBits(final T object) { + return 0; + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/NullEncoding.java b/src/java/htsjdk/samtools/cram/encoding/NullEncoding.java index 9da0a3fea5..9af54bd122 100644 --- a/src/java/htsjdk/samtools/cram/encoding/NullEncoding.java +++ b/src/java/htsjdk/samtools/cram/encoding/NullEncoding.java @@ -1,18 +1,20 @@ -/******************************************************************************* +/** + * **************************************************************************** * Copyright 2013 EMBL-EBI - * + *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - ******************************************************************************/ + * **************************************************************************** + */ package htsjdk.samtools.cram.encoding; import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; @@ -22,37 +24,34 @@ import java.io.InputStream; import java.util.Map; - - - public class NullEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.NULL; - - public NullEncoding() { - } - - @Override - public EncodingID id() { - return ENCODING_ID; - } - - public static EncodingParams toParam() { - return new EncodingParams(ENCODING_ID, new NullEncoding().toByteArray()); - } - - @Override - public byte[] toByteArray() { - return new byte[] {}; - } - - @Override - public void fromByteArray(byte[] data) { - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new NullCodec(); - } + private static final EncodingID ENCODING_ID = EncodingID.NULL; + + public NullEncoding() { + } + + @Override + public EncodingID id() { + return ENCODING_ID; + } + + public static EncodingParams toParam() { + return new EncodingParams(ENCODING_ID, new NullEncoding().toByteArray()); + } + + @Override + public byte[] toByteArray() { + return new byte[]{}; + } + + @Override + public void fromByteArray(final byte[] data) { + } + + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + return new NullCodec(); + } } diff --git a/src/java/htsjdk/samtools/cram/encoding/SubexpIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/SubexpIntegerCodec.java deleted file mode 100644 index 6fc23f6a4e..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/SubexpIntegerCodec.java +++ /dev/null @@ -1,129 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
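The NullEncoding/NullCodec pair above describes a data series that is not stored at all: the encoding serializes to an empty parameter block, and its codec consumes no bits on read and contributes none on write. The expected behaviour, sketched as comments (the variable names are illustrative, and the codec's generic type parameter is assumed since the patch view drops angle brackets):

// final NullCodec<Integer> codec = new NullCodec<>();
// codec.numberOfBits(42)            -> 0      (nothing is stored)
// codec.write(bitOutputStream, 42)  -> 0      (no bits emitted)
// codec.read(bitInputStream)        -> null   (the fixed default value)
// NullEncoding.toParam()            -> EncodingParams wrapping an empty byte[]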
- ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.io.BitInputStream; -import htsjdk.samtools.cram.io.BitOutputStream; - -import java.io.IOException; - - -public class SubexpIntegerCodec extends AbstractBitCodec { - private int offset = 0; - private int k = 2; - private boolean unaryBit = true; - - public SubexpIntegerCodec(int offset, int k) { - this (offset, k, true) ; - } - - public SubexpIntegerCodec(int offset, int k, boolean unaryBit) { - this.offset = offset; - this.k = k; - this.unaryBit = unaryBit; - } - - public SubexpIntegerCodec(int k) { - this.k = k; - } - - @Override - public final Integer read(BitInputStream bis) throws IOException { - int u = 0; - while (bis.readBit() == unaryBit) - u++; - - int b = 0; - int n = 0; - if (u == 0) { - b = k; - n = bis.readBits((int) b); - } else { - b = u + k - 1; - n = (1 << b) | bis.readBits((int) b); - } - - return n - offset; - } - - @Override - public final long write(BitOutputStream bos, Integer value) throws IOException { - if (value + offset < 0) - throw new IllegalArgumentException("Value is less then offset: " + value); - - long newValue = value + offset; - int b = 0; - int u = 0; - if (newValue < (1L << k)) { - b = k; - u = 0; - } else { - b = (int) (Math.log(newValue) / Math.log(2)); - u = b - k + 1; - } - - bos.write(unaryBit, u); - bos.write(!unaryBit); - - bos.write(newValue, b); - return u + 1 + b; - } - - @Override - public final long numberOfBits(Integer value) { - long newValue = value + offset; - long b = 0; - long u = 0; - if (newValue < (1L << k)) { - b = k; - u = 0; - } else { - b = (long) Math.floor(Math.log(newValue) / Math.log(2)); - u = b - k + 1; - } - return u + 1 + b; - } - - public int getOffset() { - return offset; - } - - public void setOffset(int offset) { - this.offset = offset; - } - - public int getK() { - return k; - } - - public void setK(int k) { - this.k = k; - } - - public boolean isUnaryBit() { - return unaryBit; - } - - public void setUnaryBit(boolean unaryBit) { - this.unaryBit = unaryBit; - } - - @Override - public Integer read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented.") ; - } - -} diff --git a/src/java/htsjdk/samtools/cram/encoding/SubexpIntegerEncoding.java b/src/java/htsjdk/samtools/cram/encoding/SubexpIntegerEncoding.java deleted file mode 100644 index 036679f66e..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/SubexpIntegerEncoding.java +++ /dev/null @@ -1,83 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.io.ByteBufferUtils; -import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; -import htsjdk.samtools.cram.structure.EncodingID; -import htsjdk.samtools.cram.structure.EncodingParams; - -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.util.Map; - - - - -public class SubexpIntegerEncoding implements Encoding { - public static final EncodingID ENCODING_ID = EncodingID.SUBEXP; - private int offset; - private int k; - - public SubexpIntegerEncoding() { - } - - public SubexpIntegerEncoding(int k) { - this(0, k); - } - - public SubexpIntegerEncoding(int offset, int k) { - this.offset = offset; - this.k = k; - } - - @Override - public EncodingID id() { - return ENCODING_ID; - } - - public static EncodingParams toParam(int offset, int k) { - SubexpIntegerEncoding e = new SubexpIntegerEncoding(); - e.offset = offset; - e.k = k; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } - - @Override - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(10) ; - ByteBufferUtils.writeUnsignedITF8(offset, buf); - ByteBufferUtils.writeUnsignedITF8(k, buf); - buf.flip() ; - byte[] bytes = new byte[buf.limit()] ; - buf.get(bytes) ; - return bytes ; - } - - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - offset = ByteBufferUtils.readUnsignedITF8(buf); - k = ByteBufferUtils.readUnsignedITF8(buf); - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new SubexpIntegerCodec(offset, k); - } - -} diff --git a/src/java/htsjdk/samtools/cram/encoding/SubexponentialIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/SubexponentialIntegerCodec.java new file mode 100644 index 0000000000..54dc8094e2 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/encoding/SubexponentialIntegerCodec.java @@ -0,0 +1,99 @@ +/** + * **************************************************************************** + * Copyright 2013 EMBL-EBI + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * **************************************************************************** + */ +package htsjdk.samtools.cram.encoding; + +import htsjdk.samtools.cram.io.BitInputStream; +import htsjdk.samtools.cram.io.BitOutputStream; + +import java.io.IOException; + + +class SubexponentialIntegerCodec extends AbstractBitCodec { + private int offset = 0; + private int k = 2; + private boolean unaryBit = true; + + SubexponentialIntegerCodec(final int offset, final int k) { + this.offset = offset; + this.k = k; + this.unaryBit = true; + } + + @Override + public final Integer read(final BitInputStream bitInputStream) throws IOException { + int u = 0; + while (bitInputStream.readBit() == unaryBit) + u++; + + final int b; + final int n; + if (u == 0) { + b = k; + n = bitInputStream.readBits(b); + } else { + b = u + k - 1; + n = (1 << b) | bitInputStream.readBits(b); + } + + return n - offset; + } + + @Override + public final long write(final BitOutputStream bitOutputStream, final Integer value) throws IOException { + if (value + offset < 0) + throw new IllegalArgumentException("Value is less then offset: " + value); + + final long newValue = value + offset; + final int b; + final int u; + if (newValue < (1L << k)) { + b = k; + u = 0; + } else { + b = (int) (Math.log(newValue) / Math.log(2)); + u = b - k + 1; + } + + bitOutputStream.write(unaryBit, u); + bitOutputStream.write(!unaryBit); + + bitOutputStream.write(newValue, b); + return u + 1 + b; + } + + @Override + public final long numberOfBits(final Integer value) { + final long newValue = value + offset; + final long b; + final long u; + if (newValue < (1L << k)) { + b = k; + u = 0; + } else { + b = (long) Math.floor(Math.log(newValue) / Math.log(2)); + u = b - k + 1; + } + return u + 1 + b; + } + + @Override + public Integer read(final BitInputStream bitInputStream, final int length) throws IOException { + throw new RuntimeException("Not implemented."); + } + +} diff --git a/src/java/htsjdk/samtools/cram/encoding/SubexponentialIntegerEncoding.java b/src/java/htsjdk/samtools/cram/encoding/SubexponentialIntegerEncoding.java new file mode 100644 index 0000000000..03911b6a54 --- /dev/null +++ b/src/java/htsjdk/samtools/cram/encoding/SubexponentialIntegerEncoding.java @@ -0,0 +1,78 @@ +/** + * **************************************************************************** + * Copyright 2013 EMBL-EBI + *
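The new SubexponentialIntegerCodec above writes value + offset as a unary prefix followed by a binary suffix: values below 2^k are written as just a stop bit plus k suffix bits, while larger values get u = b - k + 1 prefix bits, a stop bit, and b = floor(log2(value)) suffix bits, for u + 1 + b bits in total. A self-contained sketch of the same arithmetic over a plain bit string, so the layout can be checked without the htsjdk bit streams; the class and method names are illustrative, and offset = 0 with unary bit = 1 is assumed, matching the defaults above:

// Illustrative sketch of the sub-exponential bit layout (offset = 0, unary bit = 1, stop bit = 0).
final class SubexpSketch {
    static String encode(final int value, final int k) {
        final StringBuilder bits = new StringBuilder();
        final int b;
        final int u;
        if (value < (1 << k)) { b = k; u = 0; }
        else { b = (int) (Math.log(value) / Math.log(2)); u = b - k + 1; }
        for (int i = 0; i < u; i++) bits.append('1');          // unary prefix
        bits.append('0');                                      // stop bit
        for (int i = b - 1; i >= 0; i--)                       // low b bits of the value
            bits.append((value >> i) & 1);
        return bits.toString();
    }

    static int decode(final String bits, final int k) {
        int pos = 0;
        int u = 0;
        while (bits.charAt(pos) == '1') { u++; pos++; }        // count prefix bits
        pos++;                                                 // skip the stop bit
        final int b = (u == 0) ? k : u + k - 1;
        int suffix = 0;
        for (int i = 0; i < b; i++) suffix = (suffix << 1) | (bits.charAt(pos++) - '0');
        return (u == 0) ? suffix : (1 << b) | suffix;
    }

    public static void main(final String[] args) {
        // k = 2: 3 -> "011" (3 bits), 10 -> "110010" (6 bits), matching numberOfBits() above.
        for (final int v : new int[]{0, 3, 10, 255}) {
            final String s = encode(v, 2);
            System.out.println(v + " -> " + s + " -> " + decode(s, 2));
        }
    }
}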

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * **************************************************************************** + */ +package htsjdk.samtools.cram.encoding; + +import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; +import htsjdk.samtools.cram.io.ITF8; +import htsjdk.samtools.cram.structure.EncodingID; +import htsjdk.samtools.cram.structure.EncodingParams; + +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.util.Map; + +public class SubexponentialIntegerEncoding implements Encoding { + private static final EncodingID ENCODING_ID = EncodingID.SUBEXPONENTIAL; + private int offset; + private int k; + + public SubexponentialIntegerEncoding() { + } + + public SubexponentialIntegerEncoding(final int offset, final int k) { + this.offset = offset; + this.k = k; + } + + @Override + public EncodingID id() { + return ENCODING_ID; + } + + public static EncodingParams toParam(final int offset, final int k) { + final SubexponentialIntegerEncoding subexponentialIntegerEncoding = new SubexponentialIntegerEncoding(); + subexponentialIntegerEncoding.offset = offset; + subexponentialIntegerEncoding.k = k; + return new EncodingParams(ENCODING_ID, subexponentialIntegerEncoding.toByteArray()); + } + + @Override + public byte[] toByteArray() { + final ByteBuffer buffer = ByteBuffer.allocate(10); + ITF8.writeUnsignedITF8(offset, buffer); + ITF8.writeUnsignedITF8(k, buffer); + buffer.flip(); + final byte[] bytes = new byte[buffer.limit()]; + buffer.get(bytes); + return bytes; + } + + @Override + public void fromByteArray(final byte[] data) { + final ByteBuffer buffer = ByteBuffer.wrap(data); + offset = ITF8.readUnsignedITF8(buffer); + k = ITF8.readUnsignedITF8(buffer); + } + + @Override + public BitCodec buildCodec(final Map inputMap, + final Map outputMap) { + return new SubexponentialIntegerCodec(offset, k); + } + +} diff --git a/src/java/htsjdk/samtools/cram/encoding/UnaryIntegerCodec.java b/src/java/htsjdk/samtools/cram/encoding/UnaryIntegerCodec.java deleted file mode 100644 index 9484007386..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/UnaryIntegerCodec.java +++ /dev/null @@ -1,92 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
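The matching SubexponentialIntegerEncoding above carries only two parameters, offset and k, each written as an unsigned ITF8 integer. A short usage fragment using only the methods shown in the new file; the chosen parameter values are arbitrary:

// Round-trip of the parameter blob: [ITF8 offset][ITF8 k]
final SubexponentialIntegerEncoding encoding = new SubexponentialIntegerEncoding(10, 2);
final byte[] blob = encoding.toByteArray();

final SubexponentialIntegerEncoding copy = new SubexponentialIntegerEncoding();
copy.fromByteArray(blob);   // restores offset = 10, k = 2

// A CRAM compression header would typically carry the same blob wrapped via
// SubexponentialIntegerEncoding.toParam(10, 2).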
- ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.io.BitInputStream; -import htsjdk.samtools.cram.io.BitOutputStream; - -import java.io.IOException; - - -public class UnaryIntegerCodec extends AbstractBitCodec { - private boolean stopBit = false; - private int offset = 0; - - public UnaryIntegerCodec() { - this(0, false); - } - - public UnaryIntegerCodec(int offset) { - this(offset, false); - } - - public UnaryIntegerCodec(int offset, boolean stopBit) { - this.stopBit = stopBit; - this.offset = offset; - } - - @Override - public final Integer read(BitInputStream bis) throws IOException { - int bits = 0; - while (bis.readBit() != stopBit) - bits++; - - return bits - offset; - } - - @Override - public final long write(BitOutputStream bos, Integer value) - throws IOException { - int newValue = value + offset; - if (newValue < 0) - throw new IllegalArgumentException( - "Unary codec, negative values not allowed: " + newValue); - - int bits = newValue + 1; - - bos.write(!stopBit, bits - 1); - bos.write(stopBit, 1); - - return value + 1; - } - - @Override - public final long numberOfBits(Integer value) { - return value + offset + 1; - } - - public boolean isStopBit() { - return stopBit; - } - - public long getOffset() { - return offset; - } - - public void setStopBit(boolean stopBit) { - this.stopBit = stopBit; - } - - public void setOffset(int offset) { - this.offset = offset; - } - - @Override - public Integer read(BitInputStream bis, int len) throws IOException { - throw new RuntimeException("Not implemented."); - } - -} diff --git a/src/java/htsjdk/samtools/cram/encoding/UnaryIntegerEncoding.java b/src/java/htsjdk/samtools/cram/encoding/UnaryIntegerEncoding.java deleted file mode 100644 index d062681603..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/UnaryIntegerEncoding.java +++ /dev/null @@ -1,72 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
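The deleted UnaryIntegerCodec above stored value + offset as a run of !stopBit bits terminated by a single stopBit, so the cost grows linearly with the value (value + offset + 1 bits). A tiny illustration of that layout with the default stopBit = false and offset = 0; the class name is illustrative, not from the patch:

// Illustrative only: the unary layout removed above.
final class UnarySketch {
    static String unary(final int value) {
        final StringBuilder bits = new StringBuilder();
        for (int i = 0; i < value; i++) bits.append('1'); // value copies of !stopBit
        bits.append('0');                                 // terminating stopBit
        return bits.toString();                           // value + 1 bits in total
    }

    public static void main(final String[] args) {
        System.out.println(unary(0)); // "0"
        System.out.println(unary(3)); // "1110"
    }
}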
- ******************************************************************************/ -package htsjdk.samtools.cram.encoding; - -import htsjdk.samtools.cram.io.ByteBufferUtils; -import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream; -import htsjdk.samtools.cram.structure.EncodingID; -import htsjdk.samtools.cram.structure.EncodingParams; - -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.util.Map; - -@Deprecated -public class UnaryIntegerEncoding implements Encoding { - public static final EncodingID ENCODING_ID = null; - private int offset; - private boolean stopBit; - - public UnaryIntegerEncoding() { - } - - @Override - public EncodingID id() { - return ENCODING_ID; - } - - public static EncodingParams toParam(int offset, boolean stopBit) { - UnaryIntegerEncoding e = new UnaryIntegerEncoding(); - e.offset = offset ; - e.stopBit = stopBit ; - return new EncodingParams(ENCODING_ID, e.toByteArray()); - } - - @Override - public byte[] toByteArray() { - ByteBuffer buf = ByteBuffer.allocate(10); - ByteBufferUtils.writeUnsignedITF8(offset, buf); - buf.put((byte) (stopBit ? 1 : 0)); - buf.flip(); - byte[] array = new byte[buf.limit()]; - buf.get(array); - return array; - } - - @Override - public void fromByteArray(byte[] data) { - ByteBuffer buf = ByteBuffer.wrap(data); - offset = ByteBufferUtils.readUnsignedITF8(buf); - stopBit = buf.get() == 1; - } - - @Override - public BitCodec buildCodec(Map inputMap, - Map outputMap) { - return new UnaryIntegerCodec(); - } - -} diff --git a/src/java/htsjdk/samtools/cram/encoding/fastq/Template.java b/src/java/htsjdk/samtools/cram/encoding/fastq/Template.java deleted file mode 100644 index 2ebc656d2c..0000000000 --- a/src/java/htsjdk/samtools/cram/encoding/fastq/Template.java +++ /dev/null @@ -1,272 +0,0 @@ -/******************************************************************************* - * Copyright 2013 EMBL-EBI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- ******************************************************************************/ -package htsjdk.samtools.cram.encoding.fastq; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; - -public class Template { - public byte[] name; - public Segment segment; - private int hashCode; - public byte size = 0; - public long counter = 0; - - public Template(byte[] name) { - this.name = name; - calculateHashCode(); - } - - public void append(byte[] bases, byte[] scores) { - Segment appendix = new Segment(bases, scores); - - Segment lastSegment = getLastSegment(); - if (lastSegment == null) - segment = appendix; - else { - lastSegment.next = appendix; - appendix.prev = lastSegment; - } - size++; - } - - public void prepend(byte[] bases, byte[] scores) { - Segment appendix = new Segment(bases, scores); - - Segment firstSegment = getFirstSegment(); - if (firstSegment == null) - segment = appendix; - else { - firstSegment.prev = appendix; - appendix.next = firstSegment; - } - size++; - } - - public Segment getLastSegment() { - if (segment == null) - return null; - - Segment last = segment; - while (last.next != null) - last = last.next; - return last; - } - - public Segment getFirstSegment() { - if (segment == null) - return null; - - Segment first = segment; - while (first.prev != null) - first = first.prev; - return first; - } - - @Override - public boolean equals(Object obj) { - if (obj instanceof Template) - return Arrays.equals(name, ((Template) obj).name); - return false; - } - - @Override - public int hashCode() { - return hashCode; - } - - protected int calculateHashCode() { - for (int i = 0; i < 4 && i < name.length; i++) { - hashCode <<= 8; - hashCode |= name[name.length - 1 - i]; - } - return 0; - } - - public static class Segment { - public byte[] bases, scores; - public Segment prev, next; - - public Segment(byte[] bases, byte[] scores) { - this.bases = bases; - this.scores = scores; - } - - } - - public static class ByteArrayHashWrapper { - private byte[] array; - private int hashcode; - - public ByteArrayHashWrapper(byte[] array) { - setArray(array); - } - - public void setArray(byte[] array) { - this.array = array; - calculateHashCode(); - } - - protected int calculateHashCode() { - for (int i = 0; i < 4 && i < array.length; i++) { - hashcode <<= 8; - hashcode |= array[array.length - 1 - i]; - } - return 0; - } - - @Override - public int hashCode() { - return hashcode; - } - - @Override - public boolean equals(Object obj) { - return Arrays.equals(array, ((ByteArrayHashWrapper) obj).array); - } - } - - public static class TemplateHash { - private HashMap map = new HashMap(); - public long counter = 0; - public long min = 0; - - private ByteArrayHashWrapper tmpRemoveWrapper = new ByteArrayHashWrapper( - new byte[0]); - - public Template add(byte[] name, byte[] bases, byte[] scores) { - ByteArrayHashWrapper w = new ByteArrayHashWrapper(name); - Template t = map.get(w); - if (t == null) { - t = new Template(name); - t.counter = ++counter; - map.put(w, t); - } - t.append(bases, scores); - return t; - } - - public List