Skip to content

Commit

Permalink
Rewrite layout of BWT metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
flanglet committed Sep 10, 2024
1 parent aa71e09 commit a58bdb8
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 86 deletions.
175 changes: 95 additions & 80 deletions java/src/main/java/kanzi/transform/BWTBlockCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,38 +17,38 @@

import java.util.Map;
import kanzi.ByteTransform;
import kanzi.Global;
import kanzi.SliceByteArray;


// Utility class to en/de-code a BWT data block and its associated primary index(es)

// BWT stream format: Header (m bytes) Data (n bytes)
// Header: For each primary index,
// mode (8 bits) + primary index (8,16 or 24 bits)
// mode: bits 7-6 contain the size in bits of the primary index :
// 00: primary index size <= 6 bits (fits in mode byte)
// 01: primary index size <= 14 bits (1 extra byte)
// 10: primary index size <= 22 bits (2 extra bytes)
// 11: primary index size > 22 bits (3 extra bytes)
// bits 5-0 contain 6 most significant bits of primary index
// primary index: remaining bits (up to 3 bytes)
// BWT stream format: Header (mode + primary index(es)) | Data (n bytes)
// mode (8 bits): xxxyyyzz
// xxx (bits 7-5): ignored
// yyy (bits 4-2): log2(number of chunks)
// zz (bits 1-0): size of each primary index in bytes, minus 1
// primary indexes (chunks * (8|16|24|32 bits))

public class BWTBlockCodec implements ByteTransform
{
private static final int BWT_MAX_HEADER_SIZE = 8 * 4;

private final BWT bwt;
private final int bsVersion;


// Creates a codec with no context: the BWT is default-constructed and the
// bitstream version is fixed to 6 (inverse() uses the mode-byte header
// layout when bsVersion > 5).
public BWTBlockCodec()
{
    this.bsVersion = 6;
    this.bwt = new BWT();
}


// Creates a codec from a context map. The map is handed to the BWT as is.
// The bitstream version is read from the "bsVersion" entry and falls back
// to 6 when the map is null or has no such entry (inverse() uses the
// mode-byte header layout when bsVersion > 5).
public BWTBlockCodec(Map<String, Object> ctx)
{
    this.bwt = new BWT(ctx);

    if (ctx == null)
    {
        this.bsVersion = 6;
    }
    else
    {
        this.bsVersion = (int) ctx.getOrDefault("bsVersion", 6);
    }
}


Expand All @@ -68,71 +68,44 @@ public boolean forward(SliceByteArray input, SliceByteArray output)
if (output.length - output.index < getMaxEncodedLength(blockSize))
return false;

final int savedOIdx = output.index;
final int chunks = BWT.getBWTChunks(blockSize);
int log = 1;
int logBlockSize = Global.log2(blockSize);

while (1<<log <= blockSize)
log++;
if ((blockSize & (blockSize - 1)) != 0)
logBlockSize++;

// Estimate header size based on block size
final int headerSizeBytes1 = chunks * ((2+log+7) >>> 3);
output.index += headerSizeBytes1;
output.length -= headerSizeBytes1;
final int pIndexSize = (logBlockSize + 7) >> 3;

// Apply forward transform
if (this.bwt.forward(input, output) == false)
if ((pIndexSize <= 0) || (pIndexSize >= 5))
return false;

int headerSizeBytes2 = 0;

for (int i=0; i<chunks; i++)
{
final int primaryIndex = this.bwt.getPrimaryIndex(i);
int pIndexSizeBits = 6;

while ((1<<pIndexSizeBits) <= primaryIndex)
pIndexSizeBits++;

// Compute block size based on primary index
headerSizeBytes2 += ((2+pIndexSizeBits+7) >>> 3);
}

if (headerSizeBytes2 != headerSizeBytes1)
{
// Adjust space for header
System.arraycopy(output.array, savedOIdx+headerSizeBytes1,
output.array, savedOIdx+headerSizeBytes2, blockSize);

output.index = output.index - headerSizeBytes1 + headerSizeBytes2;
}
final int chunks = BWT.getBWTChunks(blockSize);
final int logNbChunks = Global.log2(chunks);

int idx = savedOIdx;
if (logNbChunks > 7)
return false;

for (int i=0; i<chunks; i++)
{
final int primaryIndex = this.bwt.getPrimaryIndex(i);
int pIndexSizeBits = 6;
int idx0 = output.index;
output.index += (1 + chunks * pIndexSize);

while ((1<<pIndexSizeBits) <= primaryIndex)
pIndexSizeBits++;
// Apply forward transform
if (this.bwt.forward(input, output) == false)
return false;

// Compute primary index size
final int pIndexSizeBytes = (2+pIndexSizeBits+7) >>> 3;
final byte mode = (byte) ((logNbChunks << 2) | (pIndexSize - 1));

// Write block header (mode + primary index). See top of file for format
int shift = (pIndexSizeBytes - 1) << 3;
int blockMode = (pIndexSizeBits + 1) >>> 3;
blockMode = (blockMode << 6) | ((primaryIndex >>> shift) & 0x3F);
output.array[idx++] = (byte) blockMode;
// Emit header
for (int i=0, idx=idx0+1; i<chunks; i++) {
final int primaryIndex = this.bwt.getPrimaryIndex(i) - 1;
int shift = (pIndexSize - 1) << 3;

while (shift >= 8)
{
shift -= 8;
output.array[idx++] = (byte) (primaryIndex >> shift);
}
while (shift >= 0)
{
output.array[idx++] = (byte) (primaryIndex>>shift);
shift -= 8;
}
}

output.array[idx0] = mode;
return true;
}

Expand All @@ -147,30 +120,72 @@ public boolean inverse(SliceByteArray input, SliceByteArray output)
return false;

int blockSize = input.length;
final int chunks = BWT.getBWTChunks(blockSize);

for (int i=0; i<chunks; i++)
if (this.bsVersion > 5)
{
// Read block header (mode + primary index). See top of file for format
final int blockMode = input.array[input.index++] & 0xFF;
final int pIndexSizeBytes = 1 + ((blockMode >>> 6) & 0x03);
// Number of chunks and primary index size in bitstream since bsVersion 6
byte mode = input.array[input.index++];
final int logNbChunks = (mode >> 2) & 0x07;
final int pIndexSize = (mode & 0x03) + 1;

if (pIndexSize == 0)
return false;

final int chunks = 1 << logNbChunks;

if (chunks != BWT.getBWTChunks(blockSize))
return false;

if (input.length < pIndexSizeBytes)
return false;
final int headerSize = 1 + chunks*pIndexSize;

input.length -= pIndexSizeBytes;
int shift = (pIndexSizeBytes - 1) << 3;
int primaryIndex = (blockMode & 0x3F) << shift;
if ((input.length < headerSize) || (blockSize < headerSize))
return false;

// Extract BWT primary index
for (int n=1; n<pIndexSizeBytes; n++)
// Read header
for (int i=0; i<chunks; i++)
{
int shift = (pIndexSize - 1) << 3;
int primaryIndex = 0;

// Extract BWT primary index
while (shift >= 0) {
primaryIndex = (primaryIndex << 8) | (input.array[input.index++] & 0xFF);
shift -= 8;
}

if (this.bwt.setPrimaryIndex(i, primaryIndex + 1) == false)
return false;
}

blockSize -= headerSize;
}
else
{
final int chunks = BWT.getBWTChunks(blockSize);

for (int i=0; i<chunks; i++)
{
shift -= 8;
primaryIndex |= ((input.array[input.index++] & 0xFF) << shift);
// Read block header (mode + primary index). See top of file for format
final int blockMode = input.array[input.index++] & 0xFF;
final int pIndexSizeBytes = 1 + ((blockMode >>> 6) & 0x03);

if (input.length < pIndexSizeBytes)
return false;

input.length -= pIndexSizeBytes;
int shift = (pIndexSizeBytes - 1) << 3;
int primaryIndex = (blockMode & 0x3F) << shift;

// Extract BWT primary index
for (int n=1; n<pIndexSizeBytes; n++)
{
shift -= 8;
primaryIndex |= ((input.array[input.index++] & 0xFF) << shift);
}

if (this.bwt.setPrimaryIndex(i, primaryIndex) == false)
return false;
}

if (this.bwt.setPrimaryIndex(i, primaryIndex) == false)
return false;
}

// Apply inverse Transform
Expand All @@ -183,4 +198,4 @@ public int getMaxEncodedLength(int srcLen)
{
return srcLen + BWT_MAX_HEADER_SIZE;
}
}
}
8 changes: 4 additions & 4 deletions java/src/main/java/kanzi/transform/LZCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ public LZXCodec()
this.tkBuf = new byte[0];
this.extra = false;
this.ctx = null;
this.bsVersion = 4;
this.bsVersion = 6;
}


Expand All @@ -131,7 +131,7 @@ public LZXCodec(Map<String, Object> ctx)
this.extra = (ctx == null) ? false :
(short) ctx.getOrDefault("lz", TransformFactory.LZ_TYPE) == TransformFactory.LZX_TYPE;
this.ctx = ctx;
this.bsVersion = (ctx == null) ? 4 : (int) ctx.getOrDefault("bsVersion", 4);
this.bsVersion = (ctx == null) ? 6 : (int) ctx.getOrDefault("bsVersion", 6);
}


Expand Down Expand Up @@ -1003,10 +1003,10 @@ public LZPCodec()
public LZPCodec(Map<String, Object> ctx)
{
this.hashes = new int[0];
int bsVersion = 4;
int bsVersion = 6;

if (ctx != null)
bsVersion = (Integer) ctx.getOrDefault("bsVersion", 4);
bsVersion = (Integer) ctx.getOrDefault("bsVersion", 6);

this.isBsVersion3 = bsVersion < 4;
}
Expand Down
4 changes: 2 additions & 2 deletions java/src/main/java/kanzi/transform/UTFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ public UTFCodec()
public UTFCodec(Map<String, Object> ctx)
{
this.ctx = ctx;
int bsVersion = 4;
int bsVersion = 6;

if (ctx != null)
bsVersion = (Integer) ctx.getOrDefault("bsVersion", 4);
bsVersion = (Integer) ctx.getOrDefault("bsVersion", 6);

this.isBsVersion3 = bsVersion < 4;
}
Expand Down

0 comments on commit a58bdb8

Please sign in to comment.