Skip to content

Commit

Permalink
CRAM: Refactor Encodings and Codecs (#1224)
Browse files Browse the repository at this point in the history
  • Loading branch information
jmthibault79 authored and cmnbroad committed Nov 30, 2018
1 parent 698a4c3 commit c596e6b
Show file tree
Hide file tree
Showing 125 changed files with 3,137 additions and 2,678 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
package htsjdk.samtools.cram.build;

import htsjdk.samtools.cram.common.MutableInt;
import htsjdk.samtools.cram.encoding.ByteArrayLenEncoding;
import htsjdk.samtools.cram.encoding.ByteArrayStopEncoding;
import htsjdk.samtools.cram.encoding.ExternalByteEncoding;
import htsjdk.samtools.cram.encoding.ExternalCompressor;
import htsjdk.samtools.cram.encoding.ExternalIntegerEncoding;
import htsjdk.samtools.cram.encoding.huffman.codec.CanonicalHuffmanIntegerEncoding;
import htsjdk.samtools.cram.encoding.rans.RANS;
import htsjdk.samtools.cram.compression.ExternalCompressor;
import htsjdk.samtools.cram.encoding.*;
import htsjdk.samtools.cram.encoding.core.CanonicalHuffmanIntegerEncoding;
import htsjdk.samtools.cram.encoding.external.ByteArrayStopEncoding;
import htsjdk.samtools.cram.encoding.external.ExternalByteArrayEncoding;
import htsjdk.samtools.cram.encoding.external.ExternalByteEncoding;
import htsjdk.samtools.cram.encoding.external.ExternalIntegerEncoding;
import htsjdk.samtools.cram.compression.rans.RANS;
import htsjdk.samtools.cram.encoding.readfeatures.ReadFeature;
import htsjdk.samtools.cram.encoding.readfeatures.Substitution;
import htsjdk.samtools.cram.structure.CompressionHeader;
Expand Down Expand Up @@ -52,15 +53,13 @@
* This particular version relies heavily on GZIP and RANS for better compression.
*/
public class CompressionHeaderFactory {
private static final int TAG_VALUE_BUFFER_SIZE = 1024 * 1024;
public static final int BYTE_SPACE_SIZE = 256;
public static final int ALL_BYTES_USED = -1;
private final Map<Integer, EncodingDetails> bestEncodings = new HashMap<>();
private final ByteArrayOutputStream baosForTagValues;

public CompressionHeaderFactory() {
baosForTagValues = new ByteArrayOutputStream(TAG_VALUE_BUFFER_SIZE);
}
// a constant parameter for Huffman encoding, shared so we don't have to reconstruct it on each call
private static final int[] singleZero = new int[] { 0 };
private final Map<Integer, EncodingDetails> bestEncodings = new HashMap<>();
private final ByteArrayOutputStream baosForTagValues = new ByteArrayOutputStream(1024 * 1024);

/**
* Decides on compression methods to use for the given records.
Expand Down Expand Up @@ -110,7 +109,7 @@ public CompressionHeader build(final List<CramCompressionRecord> records, Substi
builder.addExternalIntegerGzipEncoding(DataSeries.TC_TagCount);
builder.addExternalIntegerEncoding(DataSeries.TL_TagIdList, ExternalCompressor.createGZIP());
builder.addExternalIntegerGzipEncoding(DataSeries.TN_TagNameAndType);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.TS_InsetSize);
builder.addExternalIntegerRansOrderOneEncoding(DataSeries.TS_InsertSize);

builder.setTagIdDictionary(buildTagIdDictionary(records));

Expand Down Expand Up @@ -348,7 +347,7 @@ byte[] getDataForTag(final List<CramCompressionRecord> records, final int tagID)
return baosForTagValues.toByteArray();
}

static ByteSizeRange geByteSizeRangeOfTagValues(final List<CramCompressionRecord> records, final int tagID) {
static ByteSizeRange getByteSizeRangeOfTagValues(final List<CramCompressionRecord> records, final int tagID) {
final byte type = getTagType(tagID);
final ByteSizeRange stats = new ByteSizeRange();
for (final CramCompressionRecord record : records) {
Expand Down Expand Up @@ -401,13 +400,25 @@ private static class EncodingDetails {
EncodingParams params;
}

/**
 * Creates the encoding parameters for a tag whose values all have the same, known size.
 * <p>
 * The result is a ByteArrayLenEncoding composed of two sub-encodings: a
 * CanonicalHuffmanIntegerEncoding for the length, built from the single value
 * {@code tagValueSize}, and an ExternalByteArrayEncoding for the bytes, built from
 * {@code tagID}. Used by buildEncodingForTag.
 *
 * @param tagValueSize the size of the tag value, to be Huffman encoded
 * @param tagID the ID of the tag
 * @return the EncodingParams describing the combined encoding
 */
private EncodingParams buildTagEncodingForSize(final int tagValueSize, final int tagID) {
    // The length sub-encoding only ever has to represent this one size.
    final int[] sizeValues = new int[] { tagValueSize };
    final CanonicalHuffmanIntegerEncoding lengthEncoding =
            new CanonicalHuffmanIntegerEncoding(sizeValues, singleZero);

    // The tag bytes themselves go through the external encoding keyed by the tag ID.
    final ExternalByteArrayEncoding valueEncoding = new ExternalByteArrayEncoding(tagID);

    return new ByteArrayLenEncoding(lengthEncoding, valueEncoding).toParam();
}

/**
* Build an encoding for a specific tag for given records.
*
* @param records
* CRAM records holding the tags
* @param tagID
* an integer id of the tag
* @param records CRAM records holding the tags
* @param tagID an integer id of the tag
* @return an encoding for the tag
*/
private EncodingDetails buildEncodingForTag(final List<CramCompressionRecord> records, final int tagID) {
Expand All @@ -421,51 +432,46 @@ private EncodingDetails buildEncodingForTag(final List<CramCompressionRecord> re
case 'A':
case 'c':
case 'C':
details.params = ByteArrayLenEncoding.toParam(
CanonicalHuffmanIntegerEncoding.toParam(new int[] { 1 }, new int[] { 0 }),
ExternalByteEncoding.toParam(tagID));
details.params = buildTagEncodingForSize(1, tagID);
return details;

case 'I':
case 'i':
case 'f':
details.params = ByteArrayLenEncoding.toParam(
CanonicalHuffmanIntegerEncoding.toParam(new int[] { 4 }, new int[] { 0 }),
ExternalByteEncoding.toParam(tagID));
details.params = buildTagEncodingForSize(4, tagID);
return details;

case 's':
case 'S':
details.params = ByteArrayLenEncoding.toParam(
CanonicalHuffmanIntegerEncoding.toParam(new int[] { 2 }, new int[] { 0 }),
ExternalByteEncoding.toParam(tagID));
details.params = buildTagEncodingForSize(2, tagID);
return details;

case 'Z':
case 'B':
final ByteSizeRange stats = geByteSizeRangeOfTagValues(records, tagID);
final ByteSizeRange stats = getByteSizeRangeOfTagValues(records, tagID);
final boolean singleSize = stats.min == stats.max;
if (singleSize) {
details.params = ByteArrayLenEncoding.toParam(
CanonicalHuffmanIntegerEncoding.toParam(new int[] { stats.min }, new int[] { 0 }),
ExternalByteEncoding.toParam(tagID));
details.params = buildTagEncodingForSize(stats.min, tagID);
return details;
}

if (type == 'Z') {
details.params = ByteArrayStopEncoding.toParam((byte) '\t', tagID);
details.params = new ByteArrayStopEncoding((byte) '\t', tagID).toParam();
return details;
}

final int minSize_threshold_ForByteArrayStopEncoding = 100;
if (stats.min > minSize_threshold_ForByteArrayStopEncoding) {
final int unusedByte = getUnusedByte(data);
if (unusedByte > ALL_BYTES_USED) {
details.params = ByteArrayStopEncoding.toParam((byte) unusedByte, tagID);
details.params = new ByteArrayStopEncoding((byte) unusedByte, tagID).toParam();
return details;
}
}

details.params = ByteArrayLenEncoding.toParam(ExternalIntegerEncoding.toParam(tagID),
ExternalByteEncoding.toParam(tagID));
details.params = new ByteArrayLenEncoding(
new ExternalIntegerEncoding(tagID),
new ExternalByteArrayEncoding(tagID)).toParam();
return details;
default:
throw new IllegalArgumentException("Unknown tag type: " + (char) type);
Expand Down Expand Up @@ -502,31 +508,31 @@ private void addExternalEncoding(final DataSeries dataSeries,

private void addExternalByteArrayStopTabGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ByteArrayStopEncoding.toParam((byte) '\t', dataSeries.getExternalBlockContentId()),
new ByteArrayStopEncoding((byte) '\t', dataSeries.getExternalBlockContentId()).toParam(),
ExternalCompressor.createGZIP());
}

private void addExternalIntegerEncoding(final DataSeries dataSeries, final ExternalCompressor compressor) {
addExternalEncoding(dataSeries,
ExternalIntegerEncoding.toParam(dataSeries.getExternalBlockContentId()),
new ExternalIntegerEncoding(dataSeries.getExternalBlockContentId()).toParam(),
compressor);
}

private void addExternalIntegerGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalIntegerEncoding.toParam(dataSeries.getExternalBlockContentId()),
new ExternalIntegerEncoding(dataSeries.getExternalBlockContentId()).toParam(),
ExternalCompressor.createGZIP());
}

private void addExternalByteGzipEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalByteEncoding.toParam(dataSeries.getExternalBlockContentId()),
new ExternalByteEncoding(dataSeries.getExternalBlockContentId()).toParam(),
ExternalCompressor.createGZIP());
}

private void addExternalByteRansOrderOneEncoding(final DataSeries dataSeries) {
addExternalEncoding(dataSeries,
ExternalByteEncoding.toParam(dataSeries.getExternalBlockContentId()),
new ExternalByteEncoding(dataSeries.getExternalBlockContentId()).toParam(),
ExternalCompressor.createRANS(RANS.ORDER.ONE));
}

Expand Down
11 changes: 6 additions & 5 deletions src/main/java/htsjdk/samtools/cram/build/ContainerFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.cram.digest.ContentDigests;
import htsjdk.samtools.cram.encoding.ExternalCompressor;
import htsjdk.samtools.cram.compression.ExternalCompressor;
import htsjdk.samtools.cram.encoding.writer.CramRecordWriter;
import htsjdk.samtools.cram.io.DefaultBitOutputStream;
import htsjdk.samtools.cram.io.ExposedByteArrayOutputStream;
Expand All @@ -32,6 +32,7 @@
import htsjdk.samtools.cram.structure.Slice;
import htsjdk.samtools.cram.structure.SubstitutionMatrix;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
Expand Down Expand Up @@ -117,9 +118,9 @@ private static Slice buildSlice(final List<CramCompressionRecord> records,
final CompressionHeader header)
throws IllegalArgumentException, IllegalAccessException,
IOException {
final Map<Integer, ExposedByteArrayOutputStream> map = new HashMap<Integer, ExposedByteArrayOutputStream>();
final Map<Integer, ByteArrayOutputStream> map = new HashMap<>();
for (final int id : header.externalIds) {
map.put(id, new ExposedByteArrayOutputStream());
map.put(id, new ByteArrayOutputStream());
}

final ExposedByteArrayOutputStream bitBAOS = new ExposedByteArrayOutputStream();
Expand Down Expand Up @@ -171,9 +172,9 @@ private static Slice buildSlice(final List<CramCompressionRecord> records,
bitOutputStream.close();
slice.coreBlock = Block.buildNewCore(bitBAOS.toByteArray());

slice.external = new HashMap<Integer, Block>();
slice.external = new HashMap<>();
for (final Integer key : map.keySet()) {
final ExposedByteArrayOutputStream os = map.get(key);
final ByteArrayOutputStream os = map.get(key);

final Block externalBlock = new Block();
externalBlock.setContentId(key);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package htsjdk.samtools.cram.io;
package htsjdk.samtools.cram.compression;

import htsjdk.samtools.cram.encoding.rans.RANS;
import htsjdk.samtools.cram.compression.rans.RANS;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.util.IOUtil;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package htsjdk.samtools.cram.encoding;
package htsjdk.samtools.cram.compression;

import htsjdk.samtools.cram.encoding.rans.RANS.ORDER;
import htsjdk.samtools.cram.io.ExternalCompression;
import htsjdk.samtools.cram.compression.rans.RANS.ORDER;
import htsjdk.samtools.cram.structure.BlockCompressionMethod;

import java.io.IOException;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

class Constants {
static final int TF_SHIFT = 12;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import java.nio.ByteBuffer;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import java.nio.ByteBuffer;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import htsjdk.samtools.cram.encoding.rans.Encoding.RansEncSymbol;
import htsjdk.samtools.cram.compression.rans.Encoding.RansEncSymbol;

import java.nio.ByteBuffer;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import htsjdk.samtools.cram.encoding.rans.Encoding.RansEncSymbol;
import htsjdk.samtools.cram.compression.rans.Encoding.RansEncSymbol;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import java.nio.ByteBuffer;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import htsjdk.samtools.cram.encoding.rans.Encoding.RansEncSymbol;
import htsjdk.samtools.cram.compression.rans.Encoding.RansEncSymbol;

import java.nio.ByteBuffer;
import java.util.Arrays;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import htsjdk.samtools.cram.encoding.rans.Encoding.RansEncSymbol;
import htsjdk.samtools.cram.compression.rans.Encoding.RansEncSymbol;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package htsjdk.samtools.cram.encoding.rans;
package htsjdk.samtools.cram.compression.rans;

import java.nio.ByteBuffer;

Expand Down
56 changes: 0 additions & 56 deletions src/main/java/htsjdk/samtools/cram/encoding/AbstractBitCodec.java

This file was deleted.

Loading

0 comments on commit c596e6b

Please sign in to comment.