diff --git a/serialized.ssz_snappy b/serialized.ssz_snappy new file mode 100644 index 0000000..0de2290 Binary files /dev/null and b/serialized.ssz_snappy differ diff --git a/src/root.zig b/src/root.zig index f8f396d..33995ef 100644 --- a/src/root.zig +++ b/src/root.zig @@ -22,6 +22,7 @@ pub const committee_helper = @import("consensus/helpers/committee.zig"); pub const shuffle_helper = @import("consensus/helpers/shuffle.zig"); pub const balance_helper = @import("consensus/helpers/balance.zig"); pub const ssz = @import("./ssz/ssz.zig"); +pub const snappy = @import("./snappy/snappy.zig"); test { @import("std").testing.refAllDeclsRecursive(@This()); diff --git a/src/snappy/snappy.zig b/src/snappy/snappy.zig new file mode 100644 index 0000000..a7f5fba --- /dev/null +++ b/src/snappy/snappy.zig @@ -0,0 +1,501 @@ +const std = @import("std"); +const Allocator = std.mem.Allocator; +const crc32 = std.hash.crc.Crc32Iscsi; +const mem = std.mem; +const testing = std.testing; + +const tagLiteral = 0x00; +const tagCopy1 = 0x01; +const tagCopy2 = 0x02; +const tagCopy4 = 0x03; + +const checksumSize = 4; +const chunkHeaderSize = 4; +const magicBody = "sNaPpY"; +const magicChunk = "\xff\x06\x00\x00" ++ magicBody; + +const maxBlockSize = 65536; +const maxEncodedLenOfMaxBlockSize = 76490; + +const inputMargin = 16 - 1; +const minNonLiteralBlockSize = 1 + 1 + inputMargin; + +const obufHeaderLen = magicChunk.len + checksumSize + chunkHeaderSize; +const obufLen = obufHeaderLen + maxEncodedLenOfMaxBlockSize; + +const chunkTypeCompressedData = 0x00; +const chunkTypeUncompressedData = 0x01; +const chunkTypePadding = 0xfe; +const chunkTypeStreamIdentifier = 0xff; + +// Various errors that may occur while decoding. +const SnappyError = error{ + Corrupt, + TooLarge, + Unsupported, +}; + +// Perform the CRC hash per the snappy documentation. We must use wrapping addition since this is +// the default behavior in other languages. +fn crc(b: []const u8) u32 { + var c = crc32.init(); + c.update(b); + const hash = c.final(); + return @as(u32, hash >> 15 | hash << 17) +% 0xa282ead8; +} + +test "snappy crc" { + try testing.expect(crc("snappy") == 0x293d0c23); +} + +// Represents a variable length integer that we read from a byte stream along with how many bytes +// were read to decode it. +const Varint = struct { + value: u64, + bytesRead: usize, +}; + +// https://golang.org/pkg/encoding/binary/#Uvarint +fn uvarint(buf: []const u8) Varint { + var x: u64 = 0; + var s: u6 = 0; // We can shift a maximum of 2^6 (64) times. + + for (buf, 0..) |b, i| { + if (b < 0x80) { + if (i > 9 or i == 9 and b > 1) { + return Varint{ + .value = 0, + .bytesRead = -%i + 1, + }; + } + return Varint{ + .value = x | (@as(u64, b) << s), + .bytesRead = i + 1, + }; + } + x |= (@as(u64, b & 0x7f) << s); + s += 7; + } + + return Varint{ + .value = 0, + .bytesRead = 0, + }; +} + +// https://golang.org/pkg/encoding/binary/#PutUvarint +fn putUvarint(buf: []u8, x: u64) usize { + var i: usize = 0; + var mutX = x; + + while (mutX >= 0x80) { + buf[i] = @as(u8, @truncate(mutX)) | 0x80; + mutX >>= 7; + i += 1; + } + buf[i] = @as(u8, @truncate(mutX)); + + return i + 1; +} + +// This type represents the size of the snappy block and the header length. +const SnappyBlock = struct { + blockLen: u64, + headerLen: usize, +}; + +// Return the length of the decoded block and the number of bytes that the header occupied. 
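+// The header is the block format's uvarint preamble: every Snappy block begins with its
+// uncompressed length encoded as a varint, followed by the stream of literal/copy elements.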
+fn decodedLen(src: []const u8) !SnappyBlock { + const varint = uvarint(src); + if (varint.bytesRead <= 0 or varint.value > 0xffffffff) { + return SnappyError.Corrupt; + } + + const wordSize = 32 << (-1 >> 32 & 1); + if (wordSize == 32 and varint.value > 0x7fffffff) { + return SnappyError.TooLarge; + } + + return SnappyBlock{ + .blockLen = varint.value, + .headerLen = varint.bytesRead, + }; +} + +// The block format decoding implementation. +fn runDecode(dst: []u8, src: []const u8) u8 { + var d: usize = 0; + var s: usize = 0; + var offset: isize = 0; + var length: isize = 0; + + while (s < src.len) { + switch (src[s] & 0x03) { + tagLiteral => { + var x = @as(u32, src[s] >> 2); + switch (x) { + 0...59 => s += 1, + 60 => { + s += 2; + if (s > src.len) { + return 1; + } + x = @as(u32, src[s - 1]); + }, + 61 => { + s += 3; + if (s > src.len) { + return 1; + } + x = @as(u32, src[s - 2]) | @as(u32, src[s - 1]) << 8; + }, + 62 => { + s += 4; + if (s > src.len) { + return 1; + } + x = @as(u32, src[s - 3]) | @as(u32, src[s - 2]) << 8 | @as(u32, src[s - 1]) << 16; + }, + 63 => { + s += 5; + if (s > src.len) { + return 1; + } + x = @as(u32, src[s - 4]) | @as(u32, src[s - 3]) << 8 | @as(u32, src[s - 2]) << 16 | @as(u32, src[s - 1]) << 24; + }, + // Should be unreachable. + else => { + return 1; + }, + } + length = @as(isize, x) + 1; + if (length <= 0) { + return 1; + } + + if (length > dst.len - d or length > src.len - s) { + return 1; + } + std.mem.copyForwards(u8, dst[d..], src[s .. s + @as(usize, @intCast(length))]); + // @memcpy(dst[d..], src[s .. s + @as(usize, @intCast(length))]); + const l = @as(usize, @intCast(length)); + d += l; + s += l; + continue; + }, + tagCopy1 => { + s += 2; + if (s > src.len) { + return 1; + } + + length = 4 + (@as(isize, src[s - 2]) >> 2 & 0x7); + offset = @as(isize, (@as(u32, src[s - 2]) & 0xe0) << 3 | @as(u32, src[s - 1])); + }, + tagCopy2 => { + s += 3; + if (s > src.len) { + return 1; + } + + length = 1 + (@as(isize, src[s - 3]) >> 2); + offset = @as(isize, @as(u32, src[s - 2]) | @as(u32, src[s - 1]) << 8); + }, + tagCopy4 => { + s += 5; + if (s > src.len) { + return 1; + } + + length = 1 + (@as(isize, src[s - 5]) >> 2); + offset = @as(isize, @as(u32, src[s - 4]) | @as(u32, src[s - 3]) << 8 | @as(u32, src[s - 2]) << 16 | @as(u32, src[s - 1]) << 24); + }, + // Should be unreachable. + else => { + return 1; + }, + } + + if (offset <= 0 or d < offset or length > dst.len - d) { + return 1; + } + + if (offset >= length) { + const upper_bound = d - @as(usize, @intCast(offset)) + @as(usize, @intCast(length)); + std.mem.copyForwards(u8, dst[d .. d + @as(usize, @intCast(length))], dst[d - @as(usize, @intCast(offset)) .. upper_bound]); + // @memcpy(dst[d .. d + @as(usize, @intCast(length))], dst[d - @as(usize, @intCast(offset)) .. upper_bound]); + d += @as(usize, @intCast(length)); + continue; + } + + var a = dst[d .. d + @as(usize, @intCast(length))]; + var b = dst[d - @as(usize, @intCast(offset)) ..]; + const aLen = a.len; + b = b[0..aLen]; + for (a, 0..) |_, i| { + a[i] = b[i]; + } + d += @as(usize, @intCast(length)); + } + + if (d != dst.len) { + return 1; + } + + return 0; +} + +/// Given a chosen allocator and the source input, decode it using the snappy block format. The +/// returned slice must be freed. +pub fn decode(allocator: Allocator, src: []const u8) ![]u8 { + const block = try decodedLen(src); + + const dst = try allocator.alloc(u8, block.blockLen); + errdefer allocator.free(dst); + + // Skip past how many bytes we read to get the length. 
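+    // Everything after that varint header is the stream of literal/copy elements; runDecode
+    // expands it into dst and returns non-zero if the stream is malformed.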
+ const s = src[block.headerLen..]; + + if (runDecode(dst, s) != 0) { + return SnappyError.Corrupt; + } + + return dst; +} + +// TODO: Split up encode and decode into separate files once I better understand modules. +fn emitLiteral(dst: []u8, lit: []const u8) usize { + var i: usize = 0; + const n = @as(usize, @intCast(lit.len - 1)); + switch (n) { + 0...59 => { + dst[0] = @as(u8, @intCast(n)) << 2 | tagLiteral; + i = 1; + }, + 60...255 => { + dst[0] = 60 << 2 | tagLiteral; + dst[1] = @as(u8, @intCast(n)); + i = 2; + }, + else => { + dst[0] = 61 << 2 | tagLiteral; + dst[1] = @as(u8, @intCast(n)); + dst[2] = @as(u8, @intCast(n >> 8)); + i = 3; + }, + } + std.mem.copyForwards(u8, dst[i..], lit); + // @memcpy(dst[i..], lit); + + return i + @min(dst.len, lit.len); +} + +fn load32(b: []u8, i: isize) u32 { + const j = @as(usize, @intCast(i)); + const v = b[j .. j + 4]; + return @as(u32, @intCast(v[0])) | @as(u32, @intCast(v[1])) << 8 | @as(u32, @intCast(v[2])) << 16 | @as(u32, @intCast(v[3])) << 24; +} + +fn load64(b: []u8, i: isize) u64 { + const j = @as(usize, @intCast(i)); + const v = b[j .. j + 8]; + return @as(u64, @intCast(v[0])) | @as(u64, @intCast(v[1])) << 8 | @as(u64, @intCast(v[2])) << 16 | @as(u64, @intCast(v[3])) << 24 | @as(u64, @intCast(v[4])) << 32 | @as(u64, @intCast(v[5])) << 40 | @as(u64, @intCast(v[6])) << 48 | @as(u64, @intCast(v[7])) << 56; +} + +fn snappyHash(u: u32, shift: u32) u32 { + const s = @as(u5, @intCast(shift)); + return (u *% 0x1e35a7bd) >> s; +} + +fn emitCopy(dst: []u8, offset: isize, length: isize) usize { + var i: usize = 0; + var l: isize = length; + + while (l >= 68) { + dst[i + 0] = 63 << 2 | tagCopy2; + dst[i + 1] = @as(u8, @truncate(@as(usize, @intCast(offset)))); + dst[i + 2] = @as(u8, @truncate(@as(usize, @intCast(offset >> 8)))); + i += 3; + l -= 64; + } + + if (l > 64) { + dst[i + 0] = 59 << 2 | tagCopy2; + dst[i + 1] = @as(u8, @truncate(@as(usize, @intCast(offset)))); + dst[i + 2] = @as(u8, @truncate(@as(usize, @intCast(offset >> 8)))); + //mem.copy(u8, dst, &mem.toBytes(offset)); + i += 3; + l -= 60; + } + + if (l >= 12 or offset >= 2048) { + dst[i + 0] = (@as(u8, @intCast(l)) -% 1) << 2 | tagCopy2; + dst[i + 1] = @as(u8, @truncate(@as(usize, @intCast(offset)))); + dst[i + 2] = @as(u8, @truncate(@as(usize, @intCast(offset >> 8)))); + return i + 3; + } + + dst[i + 0] = @as(u8, @truncate(@as(usize, @intCast(offset >> 8)))) << 5 | (@as(u8, @intCast(l)) -% 4) << 2 | tagCopy1; + dst[i + 1] = @as(u8, @truncate(@as(usize, @intCast(offset)))); + return i + 2; +} + +fn encodeBlock(dst: []u8, src: []u8) usize { + const maxTableSize = 1 << 14; + const tableMask = maxTableSize - 1; + + var d: usize = 0; + var shift: u32 = 24; + var tableSize: isize = 1 << 8; + while (tableSize < maxTableSize and tableSize < src.len) { + tableSize *= 2; + shift -= 1; + } + + var table = mem.zeroes([maxTableSize]u16); + const sLimit = src.len - inputMargin; + var nextEmit: usize = 0; + var s: usize = 1; + var nextHash = snappyHash(load32(src, @as(isize, @intCast(s))), shift); + + outer: while (true) { + var skip: isize = 32; + var nextS = s; + var candidate: isize = 0; + + inner: while (true) { + s = nextS; + const bytesBetweenHashLookups = skip >> 5; + nextS = s + @as(usize, @intCast(bytesBetweenHashLookups)); + skip += bytesBetweenHashLookups; + if (nextS > sLimit) { + break :outer; + } + candidate = @as(isize, @intCast(table[nextHash & tableMask])); + table[nextHash & tableMask] = @as(u16, @intCast(s)); + nextHash = snappyHash(load32(src, @as(isize, @intCast(nextS))), shift); 
+ if (load32(src, @as(isize, @intCast(s))) == load32(src, candidate)) { + break :inner; + } + } + + d += emitLiteral(dst[d..], src[nextEmit..s]); + + while (true) { + const base = s; + s += 4; + var i = @as(usize, @intCast(candidate + 4)); + while (s < src.len and src[i] == src[s]) { + i += 1; + s += 1; + } + + d += emitCopy(dst[d..], @as(isize, @intCast(base - @as(usize, @intCast(candidate)))), @as(isize, @intCast(s - base))); + nextEmit = s; + if (s >= sLimit) { + break :outer; + } + + const x = load64(src, @as(isize, @intCast(s - 1))); + const prevHash = snappyHash(@as(u32, @truncate(x >> 0)), shift); + table[prevHash & tableMask] = @as(u16, @intCast(s - 1)); + const currHash = snappyHash(@as(u32, @truncate(x >> 8)), shift); + candidate = @as(isize, @intCast(table[currHash & tableMask])); + table[currHash & tableMask] = @as(u16, @intCast(s)); + if (@as(u32, @truncate(x >> 8)) != load32(src, candidate)) { + nextHash = snappyHash(@as(u32, @truncate(x >> 16)), shift); + s += 1; + break; + } + } + } + + if (nextEmit < src.len) { + d += emitLiteral(dst[d..], src[nextEmit..]); + } + + return d; +} + +/// Encode returns the encoded form of the source input. The returned slice must be freed. +pub fn encode(allocator: Allocator, src: []u8) ![]u8 { + var mutSrc = src; + const encodedLen = maxEncodedLen(mutSrc.len); + if (encodedLen < 0) { + return SnappyError.TooLarge; + } + + var dst = try allocator.alloc(u8, @as(usize, @intCast(encodedLen))); + errdefer allocator.free(dst); + + var d = putUvarint(dst, @as(u64, @intCast(mutSrc.len))); + + while (mutSrc.len > 0) { + var p = try allocator.alloc(u8, mutSrc.len); + @memcpy(p, mutSrc); + var empty = [_]u8{}; + mutSrc = empty[0..]; + if (p.len > maxBlockSize) { + mutSrc = p[maxBlockSize..]; + p = p[0..maxBlockSize]; + } + if (p.len < minNonLiteralBlockSize) { + d += emitLiteral(dst[d..], p); + } else { + d += encodeBlock(dst[d..], p); + } + allocator.free(p); + } + + const output = try allocator.alloc(u8, d); + @memcpy(output, dst[0..d]); + allocator.free(dst); + + return output; +} + +/// Return the maximum length of a snappy block, given the uncompressed length. +pub fn maxEncodedLen(srcLen: usize) isize { + var n = @as(u64, @intCast(srcLen)); + if (n > 0xffffffff) { + return -1; + } + + n = 32 + n + n / 6; + if (n > 0xffffffff) { + return -1; + } + + return @as(isize, @intCast(n)); +} + +test "decoding variable integers" { + // Taken from the block format description. 
+ const case1 = uvarint(&[_]u8{0x40}); + try testing.expect(case1.value == 64); + try testing.expect(case1.bytesRead == 1); + + const case2 = uvarint(&[_]u8{ 0xfe, 0xff, 0x7f }); + try testing.expect(case2.value == 2097150); + try testing.expect(case2.bytesRead == 3); +} + +test "simple encode" { + const allocator = testing.allocator; + + var input: [4]u8 = [_]u8{ 't', 'h', 'i', 's' }; + const i: []u8 = &input; + const output = try encode(allocator, i); + defer allocator.free(output); + + try testing.expectEqualSlices(u8, output, "\x04\x0cthis"); +} + +test "simple decode" { + const allocator = testing.allocator; + + const decoded = try decode(allocator, "\x19\x1coh snap,\x05\x06,py is cool!\x0a"); + defer allocator.free(decoded); + + try testing.expectEqualSlices(u8, decoded, "oh snap, snappy is cool!\n"); +} diff --git a/src/spec_tests/ssz_static/root.zig b/src/spec_tests/ssz_static/root.zig index a993895..ed6460a 100644 --- a/src/spec_tests/ssz_static/root.zig +++ b/src/spec_tests/ssz_static/root.zig @@ -2,6 +2,7 @@ const std = @import("std"); const testing = std.testing; const ssz = @import("../../ssz/ssz.zig"); const types = @import("../../consensus/types.zig"); +const snappy = @import("../../snappy/snappy.zig"); test "hash tree root" { const fork = types.Fork{ @@ -13,4 +14,16 @@ test "hash tree root" { try ssz.hashTreeRoot(fork, &out, testing.allocator); const expect: [32]u8 = [_]u8{ 0x98, 0x2a, 0x69, 0x96, 0xc9, 0x2f, 0x86, 0xf6, 0x37, 0x68, 0x3c, 0x72, 0xd9, 0x09, 0xc7, 0xa8, 0x68, 0x11, 0x0e, 0x3b, 0x05, 0xf7, 0xb4, 0x48, 0x44, 0xbc, 0x53, 0x96, 0x0d, 0x89, 0x56, 0xf5 }; try std.testing.expect(std.mem.eql(u8, out[0..], expect[0..])); + const file_path = "serialized.ssz_snappy"; + const file_contents = try std.fs.cwd().readFileAlloc(testing.allocator, file_path, std.math.maxInt(usize)); + defer testing.allocator.free(file_contents); + // std.debug.print("Hex: {any}\n", .{std.fmt.fmtSliceHexLower(file_contents)}); + + const decoded_data = try snappy.decode(testing.allocator, file_contents); + defer testing.allocator.free(decoded_data); + + const encode = try ssz.encodeSSZ(testing.allocator, fork); + defer testing.allocator.free(encode); + + try std.testing.expect(std.mem.eql(u8, encode, decoded_data)); } diff --git a/src/ssz/ssz.zig b/src/ssz/ssz.zig index 315ec33..cffbe7f 100644 --- a/src/ssz/ssz.zig +++ b/src/ssz/ssz.zig @@ -122,7 +122,7 @@ pub fn decodeSSZ(comptime T: type, serialized: []const u8) SSZDecodeErrors!T { comptime var num_fields = 0; inline for (struct_info.fields) |field| { switch (@typeInfo(field.type)) { - .bool, .int => continue, + .bool, .int, .array => continue, else => num_fields += 1, } } @@ -134,7 +134,7 @@ pub fn decodeSSZ(comptime T: type, serialized: []const u8) SSZDecodeErrors!T { comptime var field_index = 0; inline for (struct_info.fields) |field| { switch (@typeInfo(field.type)) { - .bool, .int => { + .bool, .int, .array => { @field(result, field.name) = try decodeSSZ(field.type, serialized[index .. 
index + @sizeOf(field.type)]); index += @sizeOf(field.type); }, @@ -149,7 +149,7 @@ pub fn decodeSSZ(comptime T: type, serialized: []const u8) SSZDecodeErrors!T { comptime var final_index = 0; inline for (struct_info.fields) |field| { switch (@typeInfo(field.type)) { - .bool, .int => continue, + .bool, .int, .array => continue, else => { const final = if (final_index == indices.len - 1) serialized.len else indices[final_index + 1]; @field(result, field.name) = try decodeSSZ(field.type, serialized[indices[final_index]..final]); @@ -307,7 +307,7 @@ fn encodeItem(value: anytype, list: *std.ArrayList(u8)) Allocator.Error!void { comptime var start: usize = 0; inline for (struct_info.fields) |field| { switch (@typeInfo(field.type)) { - .int, .bool => start += @sizeOf(field.type), + .int, .bool, .array => start += @sizeOf(field.type), else => start += 4, } } @@ -315,7 +315,7 @@ fn encodeItem(value: anytype, list: *std.ArrayList(u8)) Allocator.Error!void { var accumulate: usize = start; inline for (struct_info.fields) |field| { switch (@typeInfo(field.type)) { - .int, .bool => try encodeItem(@field(value, field.name), list), + .int, .bool, .array => try encodeItem(@field(value, field.name), list), else => { try encodeItem(@as(u32, @truncate(accumulate)), list); accumulate += sizeOfValue(@field(value, field.name)); @@ -363,7 +363,7 @@ pub inline fn isStaticType(comptime T: type) bool { switch (info) { .bool, .int, .null => return true, - .array => return false, + .array => return true, .@"struct" => inline for (info.@"struct".fields) |field| { if (!isStaticType(field.type)) { return false;
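A possible follow-up (a sketch, not part of the diff above): the new snappy module tests encode and decode separately, so a round-trip check could sit next to them in src/snappy/snappy.zig. It assumes only the encode/decode signatures introduced in this change.

test "snappy round trip (sketch)" {
    const allocator = testing.allocator;

    // Inputs shorter than minNonLiteralBlockSize are emitted as a single literal, so this
    // exercises the uvarint length header plus the literal tag on both the encode and
    // decode paths.
    var input = "hello snappy".*;

    const compressed = try encode(allocator, input[0..]);
    defer allocator.free(compressed);

    const restored = try decode(allocator, compressed);
    defer allocator.free(restored);

    try testing.expectEqualSlices(u8, restored, input[0..]);
}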
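Also a sketch, not part of the diff: the ssz.zig hunks above start treating fixed-size arrays as static fields, so a struct containing a [4]u8 no longer gets a 4-byte offset emitted for it. The test below uses a hypothetical Versioned struct (not a type from this repository) and assumes encodeSSZ/decodeSSZ keep the signatures shown in the diff and that a [N]u8 field serializes as N raw bytes.

test "struct with a fixed-size array field is fully static (sketch)" {
    // Hypothetical struct used only for illustration.
    const Versioned = struct {
        version: [4]u8,
        epoch: u64,
    };

    const value = Versioned{
        .version = .{ 0x01, 0x02, 0x03, 0x04 },
        .epoch = 7,
    };

    // With .array handled like .bool/.int, the serialization is 4 bytes of version
    // followed by 8 bytes of little-endian epoch, with no offset table.
    const encoded = try encodeSSZ(std.testing.allocator, value);
    defer std.testing.allocator.free(encoded);
    try std.testing.expectEqual(@as(usize, 12), encoded.len);

    const decoded = try decodeSSZ(Versioned, encoded);
    try std.testing.expectEqualSlices(u8, &decoded.version, &value.version);
    try std.testing.expectEqual(value.epoch, decoded.epoch);
}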