From df3f585d207ae615948efa4e8478ed73def84538 Mon Sep 17 00:00:00 2001 From: Arya Tabaie Date: Tue, 5 Dec 2023 11:17:48 -0600 Subject: [PATCH] refactor: use external compressor repo (#942) * refactor: use external compressor repo * build: updated to compress v0.1.0 --------- Co-authored-by: Gautam Botrel --- go.mod | 3 +- go.sum | 4 + std/compress/lzss/backref.go | 76 - std/compress/lzss/compress.go | 303 --- std/compress/lzss/compress_test.go | 206 -- std/compress/lzss/decompress.go | 121 -- std/compress/lzss/dict_naive | Bin 65535 -> 0 bytes std/compress/lzss/e2e_test.go | 20 +- .../lzss/internal/suffixarray/sais.go | 899 --------- .../lzss/internal/suffixarray/sais2.go | 1741 ----------------- .../lzss/internal/suffixarray/suffixarray.go | 152 -- std/compress/lzss/snark.go | 42 +- std/compress/lzss/snark_test.go | 108 +- std/compress/lzss/snark_testing.go | 16 +- .../testdata/{test_cases => }/3c2943/data.bin | Bin .../testdata/{test_cases => }/705b24/data.bin | Bin .../testdata/{test_cases => }/777003/data.bin | Bin .../testdata/{test_cases => }/bug/data.bin | Bin .../testdata/{test_cases => }/c9b5a2/data.bin | Bin .../testdata/{test_cases => }/e4207e/data.bin | Bin .../testdata/{test_cases => }/fa4a22/data.bin | Bin .../testdata/{test_cases => }/large/data.bin | Bin std/compress/stream.go | 191 -- std/compress/stream_test.go | 33 - 24 files changed, 75 insertions(+), 3840 deletions(-) delete mode 100644 std/compress/lzss/backref.go delete mode 100644 std/compress/lzss/compress.go delete mode 100644 std/compress/lzss/compress_test.go delete mode 100644 std/compress/lzss/decompress.go delete mode 100644 std/compress/lzss/dict_naive delete mode 100644 std/compress/lzss/internal/suffixarray/sais.go delete mode 100644 std/compress/lzss/internal/suffixarray/sais2.go delete mode 100644 std/compress/lzss/internal/suffixarray/suffixarray.go rename std/compress/lzss/testdata/{test_cases => }/3c2943/data.bin (100%) rename std/compress/lzss/testdata/{test_cases => }/705b24/data.bin (100%) rename std/compress/lzss/testdata/{test_cases => }/777003/data.bin (100%) rename std/compress/lzss/testdata/{test_cases => }/bug/data.bin (100%) rename std/compress/lzss/testdata/{test_cases => }/c9b5a2/data.bin (100%) rename std/compress/lzss/testdata/{test_cases => }/e4207e/data.bin (100%) rename std/compress/lzss/testdata/{test_cases => }/fa4a22/data.bin (100%) rename std/compress/lzss/testdata/{test_cases => }/large/data.bin (100%) delete mode 100644 std/compress/stream.go delete mode 100644 std/compress/stream_test.go diff --git a/go.mod b/go.mod index 447974ce43..1e19b391d5 100644 --- a/go.mod +++ b/go.mod @@ -6,11 +6,11 @@ require ( github.com/bits-and-blooms/bitset v1.8.0 github.com/blang/semver/v4 v4.0.0 github.com/consensys/bavard v0.1.13 + github.com/consensys/compress v0.1.0 github.com/consensys/gnark-crypto v0.12.2-0.20231117165148-e77308824822 github.com/fxamacker/cbor/v2 v2.5.0 github.com/google/go-cmp v0.5.9 github.com/google/pprof v0.0.0-20230817174616-7a8ec2ada47b - github.com/icza/bitio v1.1.0 github.com/ingonyama-zk/iciclegnark v0.1.0 github.com/leanovate/gopter v0.2.9 github.com/rs/zerolog v1.30.0 @@ -22,6 +22,7 @@ require ( require ( github.com/davecgh/go-spew v1.1.1 // indirect + github.com/icza/bitio v1.1.0 // indirect github.com/ingonyama-zk/icicle v0.0.0-20230928131117-97f0079e5c71 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.19 // indirect diff --git a/go.sum b/go.sum index 9a9d59fd1a..7da0c99404 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,10 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ= github.com/consensys/bavard v0.1.13/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= +github.com/consensys/compress v0.0.0-20231201231747-b7f0ad98d697 h1:Ar/NyBmxGYeKekc7a7sdpkKgZ6OO6P5Wc5aNH+DxlXE= +github.com/consensys/compress v0.0.0-20231201231747-b7f0ad98d697/go.mod h1:Ne8+cGKjqgjF1dlHapZx38pHzWpaBYhsKxQa+JPl0zM= +github.com/consensys/compress v0.1.0 h1:fczDaganmx2198GudPo4+5VX3eBvKy/bEJfmNotbr70= +github.com/consensys/compress v0.1.0/go.mod h1:Ne8+cGKjqgjF1dlHapZx38pHzWpaBYhsKxQa+JPl0zM= github.com/consensys/gnark-crypto v0.12.2-0.20231117165148-e77308824822 h1:PvEjRgB/U4bv0jl9w65Wy9g0nIdkkW7vkNoR8Vq/als= github.com/consensys/gnark-crypto v0.12.2-0.20231117165148-e77308824822/go.mod h1:v2Gy7L/4ZRosZ7Ivs+9SfUDr0f5UlG+EM5t7MPHiLuY= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= diff --git a/std/compress/lzss/backref.go b/std/compress/lzss/backref.go deleted file mode 100644 index 587dcd3a50..0000000000 --- a/std/compress/lzss/backref.go +++ /dev/null @@ -1,76 +0,0 @@ -package lzss - -import ( - "math" - - "github.com/icza/bitio" -) - -const ( - maxInputSize = 1 << 21 // 2Mb - maxDictSize = 1 << 22 // 4Mb -) - -type backrefType struct { - delimiter byte - nbBitsAddress uint8 - nbBitsLength uint8 - nbBitsBackRef uint8 - nbBytesBackRef int - maxAddress int - maxLength int - dictOnly bool -} - -func newBackRefType(symbol byte, nbBitsAddress, nbBitsLength uint8, dictOnly bool) backrefType { - return backrefType{ - delimiter: symbol, - nbBitsAddress: nbBitsAddress, - nbBitsLength: nbBitsLength, - nbBitsBackRef: 8 + nbBitsAddress + nbBitsLength, - nbBytesBackRef: int(8+nbBitsAddress+nbBitsLength+7) / 8, - maxAddress: 1 << nbBitsAddress, - maxLength: 1 << nbBitsLength, - dictOnly: dictOnly, - } -} - -const ( - symbolDict = 0xFF - symbolShort = 0xFE - symbolLong = 0xFD -) - -type backref struct { - address int - length int - bType backrefType -} - -func (b *backref) writeTo(w *bitio.Writer, i int) { - w.TryWriteByte(b.bType.delimiter) - w.TryWriteBits(uint64(b.length-1), b.bType.nbBitsLength) - addrToWrite := b.address - if !b.bType.dictOnly { - addrToWrite = i - b.address - 1 - } - w.TryWriteBits(uint64(addrToWrite), b.bType.nbBitsAddress) -} - -func (b *backref) readFrom(r *bitio.Reader) { - n := r.TryReadBits(b.bType.nbBitsLength) - b.length = int(n) + 1 - - n = r.TryReadBits(b.bType.nbBitsAddress) - b.address = int(n) - if !b.bType.dictOnly { - b.address++ - } -} - -func (b *backref) savings() int { - if b.length == -1 { - return math.MinInt // -1 is a special value - } - return b.length - b.bType.nbBytesBackRef -} diff --git a/std/compress/lzss/compress.go b/std/compress/lzss/compress.go deleted file mode 100644 index a60dc981d5..0000000000 --- a/std/compress/lzss/compress.go +++ /dev/null @@ -1,303 +0,0 @@ -package lzss - -import ( - "bytes" - "fmt" - "io" - "math/bits" - - "github.com/consensys/gnark/std/compress/lzss/internal/suffixarray" - "github.com/icza/bitio" -) - -type Compressor struct { - buf bytes.Buffer - bw *bitio.Writer - - inputIndex *suffixarray.Index - inputSa [maxInputSize]int32 // suffix array space. - - dictData []byte - dictIndex *suffixarray.Index - dictSa [maxDictSize]int32 // suffix array space. - - level Level -} - -type Level uint8 - -const ( - NoCompression Level = 0 - // BestCompression allows the compressor to produce a stream of bit-level granularity, - // giving the compressor this freedom helps it achieve better compression ratios but - // will impose a high number of constraints on the SNARK decompressor - BestCompression Level = 1 - - GoodCompression = 2 - GoodSnarkDecompression = 4 - - // BestSnarkDecomposition forces the compressor to produce byte-aligned output. - // It is convenient and efficient for the SNARK decompressor but can hurt the compression ratio significantly - BestSnarkDecompression = 8 -) - -// NewCompressor returns a new compressor with the given dictionary -func NewCompressor(dict []byte, level Level) (*Compressor, error) { - dict = augmentDict(dict) - if len(dict) > maxDictSize { - return nil, fmt.Errorf("dict size must be <= %d", maxDictSize) - } - c := &Compressor{ - dictData: dict, - } - c.buf.Grow(maxInputSize) - if level != NoCompression { - // if we don't compress we don't need the dict. - c.dictIndex = suffixarray.New(c.dictData, c.dictSa[:len(c.dictData)]) - } - c.level = level - return c, nil -} - -func augmentDict(dict []byte) []byte { - found := uint8(0) - const mask uint8 = 0b111 - for _, b := range dict { - if b == symbolDict { - found |= 0b001 - } else if b == symbolShort { - found |= 0b010 - } else if b == symbolLong { - found |= 0b100 - } else { - continue - } - if found == mask { - return dict - } - } - - return append(dict, symbolDict, symbolShort, symbolLong) -} - -func initBackRefTypes(dictLen int, level Level) (short, long, dict backrefType) { - wordAlign := func(a int) uint8 { - return (uint8(a) + uint8(level) - 1) / uint8(level) * uint8(level) - } - if level == NoCompression { - wordAlign = func(a int) uint8 { - return uint8(a) - } - } - short = newBackRefType(symbolShort, wordAlign(14), 8, false) - long = newBackRefType(symbolLong, wordAlign(19), 8, false) - dict = newBackRefType(symbolDict, wordAlign(bits.Len(uint(dictLen))), 8, true) - return -} - -// Compress compresses the given data -func (compressor *Compressor) Compress(d []byte) (c []byte, err error) { - // check input size - if len(d) > maxInputSize { - return nil, fmt.Errorf("input size must be <= %d", maxInputSize) - } - - // reset output buffer - compressor.buf.Reset() - settings := settings{version: 0, level: compressor.level} - if err = settings.writeTo(&compressor.buf); err != nil { - return - } - if compressor.level == NoCompression { - compressor.buf.Write(d) - return compressor.buf.Bytes(), nil - } - compressor.bw = bitio.NewWriter(&compressor.buf) - - // build the index - compressor.inputIndex = suffixarray.New(d, compressor.inputSa[:len(d)]) - - shortBackRefType, longBackRefType, dictBackRefType := initBackRefTypes(len(compressor.dictData), compressor.level) - - bDict := backref{bType: dictBackRefType, length: -1, address: -1} - bShort := backref{bType: shortBackRefType, length: -1, address: -1} - bLong := backref{bType: longBackRefType, length: -1, address: -1} - - fillBackrefs := func(i int, minLen int) bool { - bDict.address, bDict.length = compressor.findBackRef(d, i, dictBackRefType, minLen) - bShort.address, bShort.length = compressor.findBackRef(d, i, shortBackRefType, minLen) - bLong.address, bLong.length = compressor.findBackRef(d, i, longBackRefType, minLen) - return !(bDict.length == -1 && bShort.length == -1 && bLong.length == -1) - } - bestBackref := func() (backref, int) { - if bDict.length != -1 && bDict.savings() > bShort.savings() && bDict.savings() > bLong.savings() { - return bDict, bDict.savings() - } - if bShort.length != -1 && bShort.savings() > bLong.savings() { - return bShort, bShort.savings() - } - return bLong, bLong.savings() - } - - for i := 0; i < len(d); { - if !canEncodeSymbol(d[i]) { - // we must find a backref. - if !fillBackrefs(i, 1) { - // we didn't find a backref but can't write the symbol directly - return nil, fmt.Errorf("could not find a backref at index %d", i) - } - best, _ := bestBackref() - best.writeTo(compressor.bw, i) - i += best.length - continue - } - if !fillBackrefs(i, -1) { - // we didn't find a backref, let's write the symbol directly - compressor.writeByte(d[i]) - i++ - continue - } - bestAtI, bestSavings := bestBackref() - - if i+1 < len(d) { - if fillBackrefs(i+1, bestAtI.length+1) { - if newBest, newSavings := bestBackref(); newSavings > bestSavings { - // we found an even better backref - compressor.writeByte(d[i]) - i++ - - // then emit the backref at i+1 - bestSavings = newSavings - bestAtI = newBest - - // can we find an even better backref? - if canEncodeSymbol(d[i]) && i+1 < len(d) { - if fillBackrefs(i+1, bestAtI.length+1) { - // we found an even better backref - if newBest, newSavings := bestBackref(); newSavings > bestSavings { - compressor.writeByte(d[i]) - i++ - - // bestSavings = newSavings - bestAtI = newBest - } - } - } - } - } else if i+2 < len(d) && canEncodeSymbol(d[i+1]) { - // maybe at i+2 ? (we already tried i+1) - if fillBackrefs(i+2, bestAtI.length+2) { - if newBest, newSavings := bestBackref(); newSavings > bestSavings { - // we found a better backref - // write the symbol at i - compressor.writeByte(d[i]) - i++ - compressor.writeByte(d[i]) - i++ - - // then emit the backref at i+2 - bestAtI = newBest - // bestSavings = newSavings - } - } - } - } - - bestAtI.writeTo(compressor.bw, i) - i += bestAtI.length - } - - if compressor.bw.TryError != nil { - return nil, compressor.bw.TryError - } - if err = compressor.bw.Close(); err != nil { - return nil, err - } - - if compressor.buf.Len() >= len(d)+settings.bitLen()/8 { - // compression was not worth it - compressor.buf.Reset() - settings.level = NoCompression - if err = settings.writeTo(&compressor.buf); err != nil { - return - } - _, err = compressor.buf.Write(d) - } - - return compressor.buf.Bytes(), err -} - -// canEncodeSymbol returns true if the symbol can be encoded directly -func canEncodeSymbol(b byte) bool { - return b != symbolDict && b != symbolShort && b != symbolLong -} - -func (compressor *Compressor) writeByte(b byte) { - if !canEncodeSymbol(b) { - panic("cannot encode symbol") - } - compressor.bw.TryWriteByte(b) -} - -// findBackRef attempts to find a backref in the window [i-brAddressRange, i+brLengthRange] -// if no backref is found, it returns -1, -1 -// else returns the address and length of the backref -func (compressor *Compressor) findBackRef(data []byte, i int, bType backrefType, minLength int) (addr, length int) { - if minLength == -1 { - minLength = bType.nbBytesBackRef - } - - if i+minLength > len(data) { - return -1, -1 - } - - windowStart := max(0, i-bType.maxAddress) - maxRefLen := bType.maxLength - - if i+maxRefLen > len(data) { - maxRefLen = len(data) - i - } - - if minLength > maxRefLen { - return -1, -1 - } - - if bType.dictOnly { - return compressor.dictIndex.LookupLongest(data[i:i+maxRefLen], minLength, maxRefLen, 0, len(compressor.dictData)) - } - - return compressor.inputIndex.LookupLongest(data[i:i+maxRefLen], minLength, maxRefLen, windowStart, i) -} - -func max(a, b int) int { - if a > b { - return a - } - return b -} - -type settings struct { - version byte - level Level -} - -func (s *settings) writeTo(w io.Writer) error { - _, err := w.Write([]byte{s.version, byte(s.level)}) // 0 -> compressor release version - return err -} - -func (s *settings) readFrom(r io.ByteReader) (err error) { - if s.version, err = r.ReadByte(); err != nil { - return - } - if level, err := r.ReadByte(); err != nil { - return err - } else { - s.level = Level(level) - } - return -} - -func (s *settings) bitLen() int { - return 16 -} diff --git a/std/compress/lzss/compress_test.go b/std/compress/lzss/compress_test.go deleted file mode 100644 index 061af94976..0000000000 --- a/std/compress/lzss/compress_test.go +++ /dev/null @@ -1,206 +0,0 @@ -package lzss - -import ( - "bytes" - "encoding/hex" - "fmt" - "os" - "testing" - - "github.com/stretchr/testify/require" -) - -func testCompressionRoundTrip(t *testing.T, d []byte) { - compressor, err := NewCompressor(getDictionary(), BestCompression) - require.NoError(t, err) - - c, err := compressor.Compress(d) - require.NoError(t, err) - - dBack, err := DecompressGo(c, getDictionary()) - require.NoError(t, err) - - if !bytes.Equal(d, dBack) { - t.Fatal("round trip failed") - } -} - -func Test8Zeros(t *testing.T) { - testCompressionRoundTrip(t, []byte{0, 0, 0, 0, 0, 0, 0, 0}) -} - -func Test300Zeros(t *testing.T) { // probably won't happen in our calldata - testCompressionRoundTrip(t, make([]byte, 300)) -} - -func TestNoCompression(t *testing.T) { - testCompressionRoundTrip(t, []byte{'h', 'i'}) -} - -func TestNoCompressionAttempt(t *testing.T) { - - d := []byte{253, 254, 255} - - compressor, err := NewCompressor(getDictionary(), NoCompression) - require.NoError(t, err) - - c, err := compressor.Compress(d) - require.NoError(t, err) - - dBack, err := DecompressGo(c, getDictionary()) - require.NoError(t, err) - - if !bytes.Equal(d, dBack) { - t.Fatal("round trip failed") - } -} - -func Test9E(t *testing.T) { - testCompressionRoundTrip(t, []byte{1, 1, 1, 1, 2, 1, 1, 1, 1}) -} - -func Test8ZerosAfterNonzero(t *testing.T) { // probably won't happen in our calldata - testCompressionRoundTrip(t, append([]byte{1}, make([]byte, 8)...)) -} - -// Fuzz test the compression / decompression -func FuzzCompress(f *testing.F) { - - f.Fuzz(func(t *testing.T, input, dict []byte, cMode uint8) { - if len(input) > maxInputSize { - t.Skip("input too large") - } - if len(dict) > maxDictSize { - t.Skip("dict too large") - } - var level Level - if cMode&2 == 2 { - level = 2 - } else if cMode&4 == 4 { - level = 4 - } else if cMode&8 == 8 { - level = 8 - } else { - level = BestCompression - } - - compressor, err := NewCompressor(dict, level) - if err != nil { - t.Fatal(err) - } - compressedBytes, err := compressor.Compress(input) - if err != nil { - t.Fatal(err) - } - - decompressedBytes, err := DecompressGo(compressedBytes, dict) - if err != nil { - t.Fatal(err) - } - - if !bytes.Equal(input, decompressedBytes) { - t.Log("compression level:", level) - t.Log("original bytes:", hex.EncodeToString(input)) - t.Log("decompressed bytes:", hex.EncodeToString(decompressedBytes)) - t.Log("dict", hex.EncodeToString(dict)) - t.Fatal("decompressed bytes are not equal to original bytes") - } - }) -} - -func Test300ZerosAfterNonzero(t *testing.T) { // probably won't happen in our calldata - testCompressionRoundTrip(t, append([]byte{'h', 'i'}, make([]byte, 300)...)) -} - -func TestRepeatedNonzero(t *testing.T) { - testCompressionRoundTrip(t, []byte{'h', 'i', 'h', 'i', 'h', 'i'}) -} - -func TestAverageBatch(t *testing.T) { - assert := require.New(t) - - // read "average_block.hex" file - d, err := os.ReadFile("./testdata/average_block.hex") - assert.NoError(err) - - // convert to bytes - data, err := hex.DecodeString(string(d)) - assert.NoError(err) - - dict := getDictionary() - compressor, err := NewCompressor(dict, BestCompression) - assert.NoError(err) - - lzssRes, err := compresslzss_v1(compressor, data) - assert.NoError(err) - - fmt.Println("lzss compression ratio:", lzssRes.ratio) - - lzssDecompressed, err := decompresslzss_v1(lzssRes.compressed, dict) - assert.NoError(err) - assert.True(bytes.Equal(data, lzssDecompressed)) - -} - -func BenchmarkAverageBatch(b *testing.B) { - // read the file - d, err := os.ReadFile("./testdata/average_block.hex") - if err != nil { - b.Fatal(err) - } - - // convert to bytes - data, err := hex.DecodeString(string(d)) - if err != nil { - b.Fatal(err) - } - - dict := getDictionary() - - compressor, err := NewCompressor(dict, BestCompression) - if err != nil { - b.Fatal(err) - } - - // benchmark lzss - b.Run("lzss", func(b *testing.B) { - for i := 0; i < b.N; i++ { - _, err := compresslzss_v1(compressor, data) - if err != nil { - b.Fatal(err) - } - } - }) -} - -type compressResult struct { - compressed []byte - inputSize int - outputSize int - ratio float64 -} - -func decompresslzss_v1(data, dict []byte) ([]byte, error) { - return DecompressGo(data, dict) -} - -func compresslzss_v1(compressor *Compressor, data []byte) (compressResult, error) { - c, err := compressor.Compress(data) - if err != nil { - return compressResult{}, err - } - return compressResult{ - compressed: c, - inputSize: len(data), - outputSize: len(c), - ratio: float64(len(data)) / float64(len(c)), - }, nil -} - -func getDictionary() []byte { - d, err := os.ReadFile("./testdata/dict_naive") - if err != nil { - panic(err) - } - return d -} diff --git a/std/compress/lzss/decompress.go b/std/compress/lzss/decompress.go deleted file mode 100644 index a7a35e7794..0000000000 --- a/std/compress/lzss/decompress.go +++ /dev/null @@ -1,121 +0,0 @@ -package lzss - -import ( - "bytes" - "errors" - "github.com/consensys/gnark/std/compress" - "github.com/icza/bitio" - "io" -) - -func DecompressGo(data, dict []byte) (d []byte, err error) { - // d[i < 0] = Settings.BackRefSettings.Symbol by convention - var out bytes.Buffer - out.Grow(len(data)*6 + len(dict)) - in := bitio.NewReader(bytes.NewReader(data)) - - var settings settings - if err = settings.readFrom(in); err != nil { - return - } - if settings.version != 0 { - return nil, errors.New("unsupported compressor version") - } - if settings.level == NoCompression { - return data[2:], nil - } - - dict = augmentDict(dict) - shortBackRefType, longBackRefType, dictBackRefType := initBackRefTypes(len(dict), settings.level) - - bDict := backref{bType: dictBackRefType} - bShort := backref{bType: shortBackRefType} - bLong := backref{bType: longBackRefType} - - // read until startAt and write bytes as is - - s := in.TryReadByte() - for in.TryError == nil { - switch s { - case symbolShort: - // short back ref - bShort.readFrom(in) - for i := 0; i < bShort.length; i++ { - out.WriteByte(out.Bytes()[out.Len()-bShort.address]) - } - case symbolLong: - // long back ref - bLong.readFrom(in) - for i := 0; i < bLong.length; i++ { - out.WriteByte(out.Bytes()[out.Len()-bLong.address]) - } - case symbolDict: - // dict back ref - bDict.readFrom(in) - out.Write(dict[bDict.address : bDict.address+bDict.length]) - default: - out.WriteByte(s) - } - s = in.TryReadByte() - } - - return out.Bytes(), nil -} - -// ReadIntoStream reads the compressed data into a stream -// the stream is not padded with zeros as one obtained by a naive call to compress.NewStream may be -func ReadIntoStream(data, dict []byte, level Level) (compress.Stream, error) { - - out, err := compress.NewStream(data, uint8(level)) - if err != nil { - return out, err - } - - // now find out how much of the stream is padded zeros and remove them - byteReader := bytes.NewReader(data) - in := bitio.NewReader(byteReader) - dict = augmentDict(dict) - var settings settings - if err := settings.readFrom(byteReader); err != nil { - return out, err - } - shortBackRefType, longBackRefType, dictBackRefType := initBackRefTypes(len(dict), level) - - // the main job of this function is to compute the right value for outLenBits - // so we can remove the extra zeros at the end of out - outLenBits := settings.bitLen() - if settings.level == NoCompression { - return out, nil - } - if settings.level != level { - return out, errors.New("compression mode mismatch") - } - - s := in.TryReadByte() - for in.TryError == nil { - var b *backrefType - switch s { - case symbolShort: - b = &shortBackRefType - case symbolLong: - b = &longBackRefType - case symbolDict: - b = &dictBackRefType - } - if b == nil { - outLenBits += 8 - } else { - in.TryReadBits(b.nbBitsBackRef - 8) - outLenBits += int(b.nbBitsBackRef) - } - s = in.TryReadByte() - } - if in.TryError != io.EOF { - return out, in.TryError - } - - return compress.Stream{ - D: out.D[:outLenBits/int(level)], - NbSymbs: out.NbSymbs, - }, nil -} diff --git a/std/compress/lzss/dict_naive b/std/compress/lzss/dict_naive deleted file mode 100644 index a40834c844d063abd30c4d7aceb7cd8e031fb503..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 65535 zcmc$H2UrtJ*YIu79qe7~*c&R?_-B(Ko3Idl@Ao`E_c}R!W@l!1XV09OVdYTXq@2;8OEZ^ThUakg zZvU#UU)ejWzJ7Q8yB~MwIbSoOne^(lH@U6fgn+yY;BQBEie1?Dlj8l;&!05lnvhOL z+n>##q>t}Ns*>onN?nj_U|?ckK$J=4$@mdUGai7TlWrfi1Ubj<2R?VfjVb2Re;!e>380pCbi~vo~@U zpP+%>K%p~f3@Vi}gU?2&pw&eTI+ZVEF@-{!i-0fW zyU-W{j*y9f-hqgLoS>mkB!mhCF*B#dMfDX#MK~i7$gHSHCm|B=6c_JAcZ!?o6dx;e zN8 z2?=#dhzfOz2o=Oc#YTzafGwxssHos@QQwfbh;aN%`zxxOUfGYY$h(yb-%h6dAWxW> zH)?1ebIRwt;dd6;e+c;bbqXVG<`kXzc&*>lBuF#*neB-ZC348jQQZHZ5C_O9lhmtYx#9#eh2$mD>IK@ShrM|@jG(d;z=5^wNp)%hGlw{-4xIo z<2md?ck%+}N1F`qi-wOCY?`FV<5TgkB5V!3Y8~4v3>o&r@6h9A`a7%Y^XeWK98{y&Iw8)9_WH^tXXA(87;SgXOsqt1r@)(SCwf7} zaE_j#qEnNq&?H1*(YyT*T;Klg)lcE+aGOx*c~S3f_gk)3Vnw?9Gu92(KH}~N=hns* zJxUke_o@~$uF%-4w1o7tk8E+*?rrWvP^g4$db8clq z2X)oS|4=4uksxKise=!IcD0oE9p!dj)_UFs?n$M?6_uNFYn(TT zkEUFHu>I7DGh;~|CR<60VN%fCI4L#<9GHtk!;-SF2oB8U&;=}^KrD0-QaCgg#l?le z0D}q_fr!r*&}d>lhfSqZg&Z-P&7x6g6aiJtrZPlSF&KX^`Cv?eV-ri_(kU#yfWmU2 zQP?ygRY>DdT{sLX8)PM*@DVYej&N9f5nCW&xbWFjii?=V;o4U zU9B;`{2%hyHmQZfWG#X=YV{xU?5Z3oYg=u^_3b2$2p&{QsEq_IH&EU+`f(+i+>*Xd zQP}V;Qr2xM4EeO}+B-!jgMQLZkLtlouGQpi`>;l6u4%)K?{1dX&wAVA}5#oN@VbqbcJEe8xIF_x^zoxhx(I_mwvuM_MHWHZ`rknHx8=| z@wh$`9x2I(M}`yKeaCsUU>-{H@#5*FOT5#8a5^7?CuL@UAHs0IATORJkHmuz7&1y4 zpTw&tw6ji^@#?J!HtD`zVeN8~CL2|pP6W1j#3VrB5y$hucLpuC9_P;*pQO2VFK;r#RKD*mLE?*mZB0uCOYw^zIy|;Z7lx17@{1Q0&OSgUR(Nb}}B~oP` z8D3r}l0i)03@-7TUl1s6x>veikW>Q7p2UN#+k&el5UqKHbW%DG0u=~~mCox7^T_F> zBsf#b>ue4*)BGjgy!v+5G6`&x>AD0z(PSQgzy-E z9Is{GRtQlafbJ{8fi5u|fYY+1Q~(wLFxty@D===iK|wYEsRNz`1MsLnXmU_N2+TnY>OuR1-XH-DBTXVoePl^V-s*0mz8xeo z84n7QC7FY6G1Ffng}syHFz6IXaJm-)PX_G){y+x;IIpucXaIOCkRpQ40n2z~B6{Z# z+gYn^26<%yX?6em9E1=c9!bI^^0-R#=Tt2REJ-8`{JyTdL&4oCVKhL%9%7iSx(sPs z@TbmdLNq^e=0Nh6d*HEP-*@!Ur7b-GAcEIhEZ;c#!0k%M;1`b9Jnens%@q~gh*em= zHf$0a((+4L%f7Zgz_V~@SdyE#{~1ZhM8m~h_Py@?wau>snLX!kd7QGdN8W&~+(X8n zyG6J8ILF00c+h(1o|}?y(-ym$$m|sxH?)m-_tNU|7Xnq+*FE|Mmw+$)bO7kd8O!_s zP$n|HA;%TO-u4}*mWeASO|1(1Y8QJro*^kF+F!K@ zT&S`q?K|^^zKz4m34`5!u-Ku3AUkO?_ib@a`G&-US?7np-sv|pMS41E5y_g6&D;xD z+_`t|Y(?m=pV{Zgh3Uwfk&oIhU-3;mdBN;ElN{rpY;s`;zlM|;!-&^>!|FhJsdfa5GZ~Ie`8^d1d@NTGXIdtqSgNqi(3o0f< z$$it`Rw(@zg42Tym@KsfIAKycg=^*yLb53Hp@@~24FEjPip=fqb7q1 z9h)13q3=9wHBS_BuCcz%PdD?JG;=1=a*XSueKWCqZP;L8+tsRZz6>tDkI5AEv8-Fd z&D>I(Piwb3@`U5)z~ifT3P#O0w0lSWJUHDQlvRC+3$@hhea{PazO5Y^7;nu|bJ|JQjZ*Y?=3jH#yOuH`W%#iUw!=l&PVT*dGqjxh zxZuF<)wc$%+&ys9$bAvcK@v0V3u%I@S^@Q+SZFyc_Bh@Aku@~3zsbwvl;0l@yj=bH z{6(yT!(2;Vk?ykgZiWB)tkA4%!I=5%(_YJsu=1u_@~2nRInp0~8@GL#Q$E4df6tlR z(o0zR)>`t$@#MZYJRSTcr$*SlK4W!Y-a1};th{!4X5Bct33}OW9)6thwN5}j$w38Ts z+Y0To-L%?pWzgts230^6a0Ei33+ia0iuf$xb`deC6sADPq#`tp#%hXi znL-YgA_OiJ8kNBmi`nQ*5SR&|i1-3T#HS))UWmrw3n_Fu8+f8<6ejTVFj*WXU%+8g zI4of);VWoDDSTQ^IlTQTn!J}DPsZ<}q5!{VOwaQTT@?NNHkCUYT(XpNyS$P}_a`du zvS-F81_mAZ{b3;M)u0b!9LKJ>2A+|d`s^2B%WgM&=xH8rV9AuD{hoD_?C$aqI#P+{ zYj|Gub%b#~LK`t?v&m(L~<& z2*n6ZY-NO(?h2FK(CItQn30C0#gP*BuPU&DSGc9ZCuqHUeE@!c`?z7TA92dBB*CLq zoLv{MBvs)FsmDgm@d;d&QgeJLpk+Qhre!|Ctz|x}t9f4mO9f6IIV1$1gtbh<`gPA5H%K@*A0z zXNM|>6~C^Gj3a$K>fzVTt;95LdQ&M4ibMnZX9U)z!NUsRe!gJ@GTcW+W=wc2y*&QG zDcM|ysxPf}@5iw5KwY2J5a^3AI+<@I@3}o{5}E|dNDs;tJ^HqdsJ-JoAoRT3g;}d@ zCylFfC;=`Mk4cK~?syqtXk-kG2WDZ3??uy+E37wn8yE0&ydGy!hd}nZYAjzH7PXR= zY11_D)@z!b$TsWUdH?dZmI)g2O%ZqecIb&WVuYZca|iD{>^+fz3?XIqxb$o!Pv%~_ zyQxtuMg0~vZ>(}@<%_?4i|#JCdeEX|@sb_R-w2syVk?r%i5Xbq+AvIYLK~S#0gWt? z_uR(lk0wK8q-o`feisgRIz5M4cSW!usm(#7p*0oUui)F6bd60^KSZ^y{lR&EAbg+- zpWHwt@EV|Ie%)@_vudih78cy!bF_7A_l@@7Y>2;}U$9Z#9qL` zh*)pVNJsj)`voa|-@wCfU4U3!RJsubcDaL0d5_vB`A~QyDe!edJbUy9__ldj?TH!b z8P@3nhTlB9be;!{So2XAI%541vG3~_G#<hFcvI8hLLwqHuo z(K9kME))q5orUnjMg7QvsK~e&L=Z;~jU^-DAdHCT0SX<_uG4V4pf!{1CIs0{ULC;8 zY|j&-*CK}z{diDt7_r%@EGTgP17|+3J~KVj8eAcm-at$9#2=Nopq9vJtDG7@>Y;Xj zq=J$Wn>UEP5o!x=oG4M(m1?>~ji@F~5Zg{Ez-X6Uz=PfvG4Bq1zKCt#7IFYp0Olop z5)&PY2GvD&?j085+=lJ*b>_lWWuiOFGp%@+ez1pe?N^}=VYO#D8*O+mFHj~PYJDkU z(G}_72r~3e&+w8c8+y)4KDfn#0H~Zil4Ow3ibbnANm;qTjgMY9C>tv4qLyuo*d_V{4Rge9yg#^m zB(m`&g3Koa$YlT)m}EkykQ%0?g3#&#%>kN(3`=Adh#lWwnoa~W5iAOYim(M@j*!6+ zal}-F!ACeEI{4^gi})a9&qXY5*<}wwTt3r9AONy-4oxf+Q|Th`9miyI7zho-0fIn4 zA%jjw1a!K{g&}mI0_zkPiijZ;3aK0#g~Jh)zH(E3o$HuBM<37<@H)0pKB+m!es-3F zq?gQKb6Z6!_4=6ggcB(VbNzM^tBG?AsyeNjPjvSkA)EHxujr|T)kVi<9;6Lq4TnkQ zOGum(?0&w3M@ff~QDd85UdN0xBv&^6nR-9f|zVmjR@BEhm%YWDSn?_u! zlR&9vjPJ|rAgI{bql$cQQnMGM7HgKTVo0$3^c%A_?25I|-lTAfc(oCU+szw(7R%QK zOMA@{#j;7whCWGTv9A9v&qFV@-FYwtCzVqJLPTent z$FFR?Eo3ugjPPIhSXnh(g+s%TTIHm>*|}f{Q12+|NfZ{Rg7UjtTO zrUtDkqVa9n%lVYqm;>R1L{`O533pEfAkxwNrYcwpk&MNf!m!8x^^7TAlQzv)P5GNT zI3;jsRv$1##vi*h%O~=hWhoiNEFJu+miZ(umLDHFcR0a)(4BoNbN79D5O<#1zF!2E z|3APQ&5xr`#-U+J1T0c_qA)I85m_tCb>NkhbT;2|OIU|89{a5QS@uDgV)g zBV~p_L}dPwz)V=@A@_MqezfdEr_-_Avd6q}J+Q`_z)v5QHlbk%BICAdS2>2L z2}-c2G}YJU#Qr1WMn5emz0$sZ3b$a=RxDo=CXD-hZdHXQtqI0!LeU`Tsvd}}A>I^G zJDXJui}{-ZoN~S#`1AsjX`Ro z{!+a{t1WTkC*J>oqk|RMRA!{*Zj&Z4*IKol7dn9@W?lc4c_(qt(yE)EStBC7fF5gR8jLz8tff@ip*{pkLr zF87bW?h?_>xU_8#cjsa%QBov4v0BQ;DC2=-JR*js(bw=on?X|$XMm<~kq_G!EJBlT z8F6o>1UOuIO~Y1cAUG4aJaUa$lalAp|F{hGSfVaq_?W*d$bKM5lpbtQ7~mHKeo5f-;_@Ie56S@X zw&TI2HWLsowNCfSNJqV&5JE7JBJe%bNvEVB1e+uT{t29Dz>S9r@W@g$j+fv9+?KE$ zA(;6EWr7HCl!b-}6Nak9lR(_Cx4O|#UJ?UU5+or6*G2_kOHQ}OUhg?1C@G`-% ze1cT*IpDJ0p0UPJF*$9cM||`$y;XfQQy3c+EUi2?Y z<%r%pR05Qwp-}4D(0{ASTkoO9#R2uCBqGZ4K@3cjk)O4|;(d1b{2&R@B`DTpK#UB7++`UgFs4k*ZmVNYQk1)bvPE3_2|l z7a}6h62(N35gr64Arghc7*T92=25BzqF3Vs`eLIw0TLdrvK1K>r?jUut~7@Vx~WX| zBMU|0qF}I+nv80Y6(9o~M7$?f&DCL!u*;$R_oAG^!@nqgW3GLM_pnjT+L zpZkvW&du$_Pt%pNUt{@QAAe%%2hC``d0=_jv$WN(%L^x}qcE^i+Hk2hK@*3qusEr= zbivsfD||Mb`0aVfE^XcNu#^ibzR@jci^ttbFPWdZk1bfW#BDwNa=U9u?iJ|Z*;?w= zUfu!3OEanlOg;F%V7<_H1=c1G%*CN$NfIoA16v(E_Wst(S@f>~LEd(x>fMGL7p<|z zk!y~mF^p*{v@Hfp0}ZO2u|U^VO61*_3STOd|KH($46z8Tp=5$cJOP_2rm=+_h6{_$ z6flKg<)nbc5x5BX0v8%bD5Q%xR6YpBbKwYCG{l7^<}+9#o%2Ee)jk*r!mUA-!Hrd> zTu6<(nyu*RC-=b`lRNWF>BX?Cd9ItyAT!I%IdmVt0yG)&e}D@@kO*eGabhTFLPbr(a@--i)u-<SX46B zb#Bm%i)+*A%gfVe>^sye2P#D=FVwz zQMBe%=p@S<*G)=K;!r`ed_q!*1zI?m6cZ%gK*8$_FwFu(4}0w@u>%7G}KeA#mznxx9$ zyS0iD%(082=j^ga<>Uc(jhD8kT^z|NGlZz$cz*f4-r0yLQOvS)9R}^!55iL%Xwo8dHiM+Zbbkozy zx25P9+4$Qqjrj|l9LYE|EXg2ZW^9NkJRD2YfK^Mj2FL~Np#)_jBc>)OdfbbcP-;pK zo$C1D`t40wk^SOYA54XhY$JZig48=YC_^=1`Q)V(Gc>UUi^cL4T@5M=teK$?f~U&1 z-_Kq>$^YHf36dQ4m^Hrbd~UwUkKK5Gy}xNq&Pex={OchDSFgBzHs?{W{dS*oXF<(W zf-WQFP~|-Tv768&U*^y~OX0O>yVDo(y~_Lt(iJa^R?zy~@OUFB%Bk6(@JQoAG`xvl zuC%>fK5Ry<|IkW*{+-WjH^?^Kz)BOVH=YU%tQgle;n@~K**Tx`V;prfAy!HQ)-^yP zYe+XmcvzCInJ!(C1D7_gwElf&|M3pfS>8MS*S0hKy)g)D9|z`QY2h8*O-MD@1ve%v zTp8N0j!-oF*+(p216Duu(YqS`3^0}VE?e9IO&tDze+DpVG&W7drqIAfDhQ21r6UZE zh(=-2m<*N>ti55fg%k?gg)gQvz~(3dbjc2l0haIxL<|u{z!Yd)W>bVqp^4cHuslf2 z7qclK@=GM7A$$gi24k>=0*+WjV~UyJGh9evuuv+ME=HI9ARr)2$Yco+up^D~(QDLL zV=}$4zRfK#%e^~!JGKyxH`bVZiYZofn0xQf7#GXehAk2+Hh>&gfHpG2eD>0yerkM?YLVD2rA3>p!x05mniVS-g)ECCzcb%#v_ z--#mdg-E3fC=`~EBLHE0LMm9j#b+}`6p;()ICLRUp^8LI7hTU>E?vwP3urDvHrQ2% zA*P90ASV$B9t0nl6tHOzoyw-WFz5mfOTcG|T|fjO0yG#@F<3x`P*@0E-7W0pj;3@9 zn)H*wt%(Y}>1~`vR+2OKZSrk>w_c&gd1U=Oi7+=?J!rgm9H_NB7vz`#x&>J2_$Nm_ zS~PA)Of|6k<+&Jwnr>P4Zwnq(RvI1}t>lhI?{1^XlmFij4IyX&2120-SuT7!2kf~- zrGXUO*esfegHT*J94d{*2m2*~C{QL#$Z!GMF){fx8bw6spy8f4HH;;3g)}xBM4!6w zMPjND>gU3~kzkcv!hcn+(TFMN{-4P5n{EISH=GO9tf1`itgK~k- zzgl$mSYO^fxkQL2;WBtmZ`@JW{7&Q#^;4!upd$Ik0nY-l1-UhoeSf^P(qhPuJAU`7 za~HS26)+Ub*MQaI2@P6Pgx8d$Nm1V?1bKIPge#hO$lz{66+?msOnnt!GGf}3@%K;H zk6(WJaoToppJ=a*4IA+!pO8f>?>1=c;YXauzkip;ztXhd1OsbhRo^ud%`g4~m9_Q3av9~^m)M2JiA4(yp=;>jR#eA%-|2p>D_~3Iu?x^bpt%`;& zzWQWH;M?-$&5zwc#-N8c|6QamCANIpF#P1yU2U2+4%!0kAi4&T3hQsDrPkgHNb}TK*%q%!W#_ z{QnvRFO5m~bYvH4-#FnH$M}psGI*0Ce@gk$Eqw&i&5vUH^Hc{gO9ZypxSfp}EC2Pehb;@F zzKxVL?(>`$Q)+mXFB^AQOpwacAO%g4{?0_x!g+OB}i8NIF{jfW|;c`P!~M-&x+PxCbO7n))gZ zCkB6C7gP81a>ks)?;O1zG^Xi+-Lrj`N`}U)J*)mah$q%>2-|1{j091wY;8J=|+w${7nDwv!gv zx3~!@x&(nJ@A}*7BAVPrmmv7`zOiWc$xV`#MLWz3?+qjd*WE%=?~e*U-;I2jcHMZj)T`4Z`KRQqM5tqY$`;#gVTW#c(+|}oY_z=L z!HmiqX4g8l%>>Et6{j*ppHg$mPFhgY#RnZmEphrZSGy6l@KP;Y3pIiKFcjSN|C7}> zLpesU38<AH*S^P1XB`{) zd(Fm47CYrtSiUwaYJHEVP5TF2qwI%I2==1dd&en>>8$ z&m+#yrzhxrXTN!}G4Dh~N$-HM2kWtX99W}#IDB0W z6&#fY63PB~#-^POSTn!mX6WGY)}?@dU(zhwRnLF;+93xgV+C}zZDj9+72!wL)m(7S zx*1`A-M*92VU5S+zt|_5m(MV_{yN);9j>Ww?|?1b=k+ts$ke7c4h=W zG%7YU)(M$5t*L+x3ZbXv&H;H?p&gWo1FYg_@97)+K2iMJzVno+pNC=jfv^1Imv21W z-Ty-Bg2OJsA%XP3!IZ%<%o)RDLPgX4LMim%8Q#&tVKahhGkrtdVgiMt(GhVXHZ>w5 zj52j5!*htQ+t}D)UQ>Pf(Xnwc?n9#Dy{EgncAfj@>8Admv6OfHl#_`jYo+^(|JFl? z_rkyyUmGSBc~3&WEmwjUh91Z)y%r=H1ew9MIf^@@2JTyFuv_|S7B%l$?}y2)-!64a zHI5Se?luD|hK1|$dTWdgv`quN3QK$ap~BRA-=ys>rL8vgZAYLnKE1QW^0i_35lx~^ z)5Kvi7T@y#>Rs|O`F`#d!(YE7-y>$PEUv`z|22QCZZohR*eji_%hS}1E=tw|XKjd( z76d@kC{IIil6U=3{t-=n$O!de3Ow%&SQX+C_!Mq2pg?3+-mQh~P!?AQU*n06BdbFn z=HcjINl!^n12yhQ(@ydIy1%{?q8LZXuuLCrzz=$V6sw?XbdQx)!9C2I(ljW|UZc$t z!j|&nGw;>X%TFCRe<1R~@2`>7=K}RUpZNZI9oh;re_Z{lVNVO`W;UJhJUkuu>dU7?lW7v{tTlpi?u2?<}tn0nX!_mQzU@iuMbWJH@W!12T zJ`P`3`N%jrSW*@i!GT-j(6A&f7Qulv_>)iVeYy9D_bPh7l1Selp_80vD-V+TqlVXm z!Sfi6k0D$IfD{~r1omyUNrp|>iyf7|)i!HNf;SI}xc8y;p~5F!Y{!n7*!2UZlhZKO@{8Jh#hLEFTAM zKFkgk2Gfk_Pcwhj*S2Ur@Jn$1n$cB~MJc*onh?SKddBy-pP^2@c@MjO`DN5$t=~@l zo~7#s9lLJ0aC5o%P)qg!&uz=K$|7?{(7skf&CgAla6kXoIhR!(J|ElEZ)Voy^Rp60 z8fCrMRUJ#6KJKW)KGxHL&o>#aNjdhk--31JwJzDVH7Ol*WKdd1aF&t2dC48ejXWT; z*=3J=N)C90kh(k1ZmBn*`E0>v7CG|Rpvk>yAIDzi-eOm0aqL`vwdi_*^!zFLGq?4( zK%6sUpJnaxdsn?|l3%Ahv(pck8C~n2E1(qY(#6N-j{}$D&@iOGjlDN1fwtJ(E)wQR zPZtf^N;4H5d)|4VVr)HhP<_p)O)i*?=deXWL_`sStyej8^qd<6O!~7xTnJM@K^Sa6 zqtQiFge72rqxvX9I*8W*bG+dAK13{{V+w``a>4v84eTb%pfN-=1`7mwfUpx67X+NW zLvi6VIaCl>z!88*13FVALR=6&Uqly+X>`5|lh1My&_&>TVkvl{!1B(EO}~P~3@k4C z{~h?HERi{;MEWmfijaw~2xZ$3jpom+mL9nc_d4z{rvfKX(c(5^@zST;4E?BqJ)H)N zuN|MbbT_u*9D9zvkF@$QX@~Y=y}fCiPd~ZTt6^?m=u-U|_weXC^xdz;h7fh-PKD9C6_0-YSI6Ul)>!xdCww#95{$^lp^>2<=Xb0q4X(} ze`3$2@ff|NsO7IcqHPqm;CYHlmP7jLrDumftIp=^U4P_|nT8()D~rM=%Y0I`X&She zeGQ%xX`>OO(3Gz^8;iqt%Zc{evZ+7vb4DsfFxc#G$P6B_@FuS?EseE0 zaL?Iw%vON`m+l+JQD2;T{5dqtig5LiZ}|H6Pd6TDk7E=|0?i_k)DiYb#e3yFY(h$9 z?YR|->yauEV$ZcJ%GWKv^ucJZetoOiAHA=R^2RC??-I;gPm7s-Y~i<1_Y*dqPOL9_ zh~?{_>C*qwoh@A~fBp768LJ2gtIAu2^}XZLnk6)xzBxjyc0?Z>CCoZn}ra>iuElH2D;9C}>!xk9woViK0$A-Qfl|2}Oj>%^Khv=F-W z2*qTL4%L<2W~@iuZ_nfQ;x3-%gFd7Vd$1L&--D25*@Gn+w9&$;kUfjQ95Z|CTP(j< zvUe~4q#0}08|y{v=XOY?zR=TH(CgebX?+ZT$z^JoSY%jXc5G##pT>Bf(H|N$(7P<@ z^q2uiAKUqJGHUkxz}k0PcKxEPjb~)yo5F_yX)pRd?4R`x%OBD%@5Y9*)-JaRg5qlX z_hA<8$LnGF!(Qi>22Z!^Sh{d;*5a+)JrWVQ4$B{5o7ba{OJMxPk3sWeKb`IT^2^E$ zEPvFTq15v~Ha(=5A1F+}FFBAid#1(*=;*OieH|xszxjE4oc^}7i38(~T-SJ%`VA#q z%U86jE%xoRCGnZfvFTzDjn9pM?dkmkeaEeto_RWK=@jX#;aiqgVDk?+)3w#b5szzr z3_cSdXi6^Af9cD`@+VF|Q8&V2sQKzv6J#?uZNkE?Sxv+8r<|zYv){C68?vP1?Ja8x zhK^i*vOkuO6z|LyJg-_Dackk!bn4!bW9Ei!#PS6^`0j&VF9Rk#x%G}xciT%~ov)6I!O}^oX^n3K8GL%aV#xFi`m2FTEFwelbeNdR%3|R|1Uo|J?7@D!bS07E27g7X|yDwF(w?F-TDg63Kzz zg7H+Wq`;e}p9#t@CC12m56_8ICNlF=dlXYXvaaR-y6Aatv&oz<^ZXYQB6=*46zdB+ z?^Z0rvxT8?F)%ck1@-YKTyh_Ez|rpZlJ`Sv_LCIJh-}2ejgAM3*vDZfy zbjTPn@ztw>%~<}w!!WO99RjySJ~1G#MZUp29rbt0swI8rb^A5OR@$<<_z*+$ z^>~_i6Rc>@%Y;79yH{=hwpSk9x^NEb*rSS%y~OZEtfX6Fpc`etsey>Vns~{MV2C4{Cq7>l#mw8jZQCL zA_Zkc{q*Dli!@~-Gu}G0nL{l9@3%WO4BZXDPfh}(nc-0C? z6Pk)Z9w0b*fFC-jsu{kLEV6J~-4Y-00&u7K;2NX;`Dy*yX0IJx?Y!`EIiDAGV2u+5 z*WgW*fa4^26`f98f4T5fhqO_(qkmpnb-l-#hR#x28$*zj_YPVugTpKo{Pl6y0zSlK zj5T)5-5BHYoq7Dw+f>qE;VCdID)LAeerX7{yiZQ&e0E`N%D>JCY5o33e)hU2H#DY9 zu(E0xRt?+gi1`3$BPVZ_K2|0&L$i~L4hJCG*G^-PN1qjZxN`ODz)SX2F!f86{~sJy zO2R1%mL$OCE5F@lkrHZ*fa(T|9Z#)7JyALU?Vh;?Fs86uz4_c`H^L2iE1>u%uEff&&vEtuaHQ#^+6aJ*;(w z+I$?F8YGR+X${U_Av_&YSvvTz_VsC8-VCZxhdu-**Dgt2(&Y}14osF2%LglbyR8av z*lBra!o9!>(|bEs%(&Tg9y&cQztKU%HG^ZAho$}d&f5esR=+6>%c;TgFHPwUl)~ra z8VNLa!R9yR-VxKsP4L@z%f7Yw^3Aa9sG`l9Mf(zd+OBqYZ;(cRdvsklkzQ*1Jjc}$ zs;%v$|7FA)4fhaMc6h%YW$j)dhp{x=7Xe1##l(5Zzr*eu7{mqZkEon5?D?Tp(=luu zP^-5#m9TG0(x7-G3=3`c5j33R<@;1I>QA2q$BU4_)`NNJh(TL)3qa+$_vpTc$CD7E z6dYQD?qCNt095U($3qC8WinKX$5WnA!t0?tp9EPY|l?T?i0%)KFlo6c&9*^z|`8fzAzkoTK`rsnK)M+*eSotq|iZ-*wWi#Z&v?_yj z-T#@rh-p(^|W!KGj@}BIsjhZU-uR^!JR5itB)ge6y$s6p(*aqzVNlHhL zJW4b?IWtS6)wK{+yOL5tyZU0<6K{iN1I-`Zcr&mD>`CdW zs$kM*H17;AbTh+2uIOn|;0PdBsSl`d9$5|#L`bzjm6rid{K)|SA$o>VX$qpp1e-*9 zd3h~E6V$Y%nQcG&Tm0!~vbKocP;gESumLuy1UpK4gRK?y0{o@k5;-`>=2uYC-&ca( zV)UXu12YZ1`%g)fyL9U9NfYZ#`z0PDHO- zwf|wmZlQ=?9oSG3oEwGcgStZu()@y!fgLv6Bqj66DKKJi8=UnchcUZ+BKnR%L4qpK zQ3Y_dsDcMZ0nr}~3TWXMq@^h0ImrI(;W-WL;n@|vM2$T>=kv&oBFlhCG!OxMMk?!B z{I=plft4K2p7B`Kc|PufqNdp`pEiq-zAG(#U-X70&kD?$hBD~jkdHe8M-O5%eFjaG z1rL~+IC`{$hLcpQ^P#A&$CxRO4wj^gkCpuwxIY}YN0Pi~aq}PI5A)709{QmLn{biI znHDCFThRPF7J2)p^BQ9~O|^$t92eb|XwsTsaA^~g27#b~;UHG8j2w+;_w0~)-Sl~& zB9T^UbNVul*x!*BppX@?qcYE=%8TRov}=;QID=uYL+ohuefg&x?9@yFaC2uQuhzJjx>0 zP8}@ocSvL=gFCqYZBzl?`6e1eFdRhvaYj%DuJ?&4iI#(8wv135jyv-YPT$^A5@8O) z<%Itw06!fS^)>nRoOKXjbs7H6>ccUS@s~ zj&q5gwQb!xy}LgzQEWYZ_q43h=5EV;ozIvS)kT)BEq(}!EJvtEDEZYa*(oLqYA?sWWg`2i;t20A3y@P zWhDZyWxh_X+p=dR!VgKq3XXl7GH1@iwZfOy?(v6?rdR*I?NI<-u{=Gmzon?_cV3}w z$dvFN0Rt+JJ{&eNa!l_{BS-j8exo547KOs%7sU1i9mXMhWpE$skDh4|@}8lm;I` zYv>!Oa|?Nw)sK_VBu%DIy`#W8ls(n^I1P%FnUts52o;cPX&m3W_cvaWrrgc^HjUwWrL zQ$J;6`Lp03(c?y)-+URFR(kW3vG|gHZ?Bx5 zOAk(@ZnE(I@h+pZ&vUGPdiOhHl2$tu3>dLhypmo2qPS|*J1l>B8l9VFe?WZeWwmv+ zS&XN_s-qhB`?^ufU3ZG>&#|xH95uCO)Vrl79F4%0jYDIuf&!Mj*%TjL(tEN`kPk;= z!=tTNJ}wzdX1WQc6(yIht&uMX3glt)&nvD=8}2aNWXR6xteX9Y-_M*lPs8bdFl1Tc z!IK?LZxUVTo5bt;?aKYi#p)kAxsh8^xhwV3>HJ3rWc?jN1lu&~r^GYQFu`f^@%qU- zEG~UFJ3-#COJj2J@S>!ixd)D~tIRsoTR(Y&!#jh88ts3y>yNSekNYsg86E3OvbRiF zICyX`Z2l*9tsTD4->UP2Rohv`*H^s>4eX{dqAaTubY`yFQ2FxA*bTeJ9}h@-KQX&uL$? z;N7vR1z*jq2*&o4**mfPt82OL54UALpZ@Xs13%`&=cRqFP_X>#Wi>~FJdMuHJMzZq zP2TU`yX$f^CO2+ar#$ZJydq;mubyr13-^+RSvX^e+oa>vC~ovZx%>FW0x>(1;On{v60M*gqb za(?b`&RuU|7dvoW@l~1Iw9*Y&`|q!RXAZpdEa>vHV&NP5+#$wo;NDn%jS}H9!2W7% za8NIsLuFE2XeChpSLv;OwmN%BORQ6=D18s!cjAPSuXjXp9l6kEn7@U~m(i0lD! z`|V`}Wg;V3hqv_1t|LTB3Df0W={vT{^b?0GHrYf_^D6U?2VLfU|2FxgLK=Id41IJa zTO4+6lA$I+*9%40S9PhTIc*FkYg06Fn5T(rW4f}3z{zr8D?UU7Hg^;tU>`?1mBObC zVX^rxEQCV$pbudVVdy2)daxMkklm%5t0wdK14V`NeR3~Ta zVKfZPdo(RR8B0pD^frja(MXyE(vU#Oa~j${h}unl5< z{uX5-BXK$?I>PhI3KFuGi7O^etqS{U7kfCKF(1y~Qx`PutywYg=~atBjbWIEZ56`9 zpq6JBI}y|Nj%Ewqca_XMKhT!Cw-Un!pw3>8euSWb zj4tx7L7a(bB9@UtW;gQ<0zcKck3*YVUp0%+1miZLXcBbQAz4#g8-wkOCfE+rrfT72 zEnFLwUrvJ8`Vs|2K)yTKz-V#gLbwvJ<#jM}Y zKTC4%LSgaE83c6FMmxzK^v0bAERrx5iJhX<@#$cg@$_xM^rs!m*W@Q;fAFl?{C*pB zxO%?nc~Qx^Bi$UqK;t7&ZX@qI+Ic#fB%vb{$>}&3QAB{%sP}dJ>hHCgqSc3Mu_|?@ zH-M6}yw6Yf3N*PT(=)Hb-@Z*Xo+&oX`Llf+w5q+Z{=$lG%(r287p~55U)lW->KV$u&%uMh4K7h0%m zlvUa?-=q|~tKrce8w_6Dz3u$4?k(+v54~>Dtosp=Q2k6RCsyNKzsi0q+c7a&I9chd zKdoCWDmv8@kFSohGJ?C=5Ja_#@Q(l5G7d9KL>mifBqP4#0DX>+e?cbK-M9;OXb6 zh-S7Af6u_x52U&2jk9R5kzAg5j(1`09@h z^U0srl5zM14;9h;>ZG^ZarpXMR7BM+ha=EHf}cGhJ)t}GS9u2-sQ><3r9;JP^X3eP z1(#iHe>>l0Wf)%$m43d@)uDrBchxcZ2ahtuiZ_M9vk}e(Pby;L*2^-2VS8}qFubhk znZpPi3(gECW6um$9^Y&355if%A+)LkgnI)_b%1cC<*{6FJ}^anJ}?5Y5onAu0tJDF zfN&JJ{hx4@&mdw3jZEQjRS{LD%805&<*AkA2qZKzQWV#;_)wA$IO7&XLxISr#$!*L z9B`^St`s_@eOxKD6V;T1V@g3xjxs1m>wr=ay9K?KA`l2*L7;CSG6hmaroaa3$P@^i z`ia2iNeEP_6`7(6F!^(gE6e}~!9tJ!1cqUbsU065P6&N#>)MC%i%~W0=lzUf^#uR6 zg_ZR)7J6LRwZ=|)Of5L}(-NYU))AM=^rlqEr1pyTmF$t+wkS+*FQ^2c)Bw*;hDH=&cL@ZZheJVT! z3tlgxB49zT4GSvRD{`@5L9bn`e6tY3ZnDYB^?vU?{us{rotZPUvy(|XYdm^-|$Ux`N5xbbpW{!jBI8@X}1%LY$PmZh~gRa8B5D*bH9@zD9`kv_y>Cp8w z$HGB(pSHT{w?CpFvsglTv2SHibhE|vW52zwdHC#uXs7w=^)h=NgvR%$GrbHgd)%)r z6_P%+lx{VrC#$itY%6GqC{CEQKFspBaTi;n!=)EbYR0B{b%V|!<%+bx;DM$hCg<_o z+#9F%F`T%SBhRmb06*|H_u{=JgU&o`f|J)`5Oo0Cy?94_wywJu2ixDhXT}`4~+eF{B*JS1?i@6~;HyC9Zz7 z@DzK>7Uy%G(VsQdnwEi~V}LBLOcO=Q7z+8xk4Kz04}oBcs!f^sN8O9#jX#r_sbiK- zkPd=EGT7GQB-RV9(|;hE1m1umQA>CaiTTY_*-Rh#r)`L$7 ztGBqbtMY))5y!OndZa3Bg9Ky_G(CIAuvhJU8Nrd5pKtgG!?>kd!53MRfG>JrQ#ymmY5n$!BHY1Sg#>Jyu=k2j2e0Zsc)yWhqbLJJ*Ab8VNsZ?^ zoemD3czAZ@*hhmzdxqR69IeY5uX%a7DWQVwuw%f#7RWTxY(lvqTndQV#0Al|D13NE zTniNTmsq4*kAM>!v9>5ginVqEX2}K5d^J&?1o2iPyZVy?%|zV0eF+6j@2us&5gqrKV54N^o=JTz+qibpXFZofIdEsrP}&a z8tJ?Hi%jwz+(kuR_ks^2<%+bxa99z1Pp5->3gXJk?h+%kHkK-tK?7B2R*8!x!JbW_?)~m7bp+PXfG|ODdT@@$YoSFQ?~&od zO=XcKYwgogCNpO6J>gRR;09G&Z3+#BrO^aF*-!=Y;nk&W)?S zFrhJuKT_!ukpR!SK(>TVl2|w=IKhdB7;-o)#WKtWlN80kUp;mUQ+8GNB0DZkBqtwO zzs~?#Ylo}=g9hfa@Ls71Sz*FNOkXQm*^%qZz=}legPtHB{d-VK4Aoh|j)Mt{tXICM zCBSRX%mV=yajQq&uzw|K9obDv-S~NBeYPfDcgVpFA}Uy}h8>W(c6z9R{gov#qfRE> zK7V>pHLN1DGv9A*`>g`Q!3%U)rWQ*wCc0q@oa_)oyS%Us zWD5H4o$ZQoj}Z`sBiq3cAr*ETC*6FA`fyd*)1?d7H2S{mb8zqQhlQI(1v@@0UX>h^ zC_rdXBHIl#wI$XZouhDa3U`M43GxRQHBe%0)r$frEHP4>T(A;?2J+FY5@&F*x?y7dV&UlL1DhUd znOAr>d}Adjo6-5=N2)3spnoTq*%M_Xh|Wq5T$NayJ{b)varsEZ>d?!1lEIbv z8(hDU@+KQ@fBS$CijMdikJeA+7HM3a=>~kfbj=^Q}=G`t<2|6Kl*u#)u#$= zle2CuK1)x`%gAl3tL?$(iGJ36yt8kS=T^PF6A85W&!Sh26UKUsLOz;2Bm~G-t_~2| z@y!mybm^Iey@$1)76y-j3)9JK9%!1Vx3>g#LRG}Jg$+b~;7vSq+`FQAne&^a;W|aB>;`SpaT9UYc;gxW5=D!`g7(_4UQ7IrAC7sD;3Aijl2#doC2>}7gfj4VD8-ywc zhOZz_IgoRhfCV_|#n9&SQkqKzw61Xan%U&|9oApDM{6@~}wUd+SUU+Z)is zAK2cALADkv7BN!(|3!;%6!UY!2cbT1r@WqZJpZWuouqr@$771dEq@z9xQ5ih7(cjs z^@_vC9rvG|_;K#QnxK@+`Kn@}=w5$hdieBmF0%ap7%fuY9;hHM%hwE<2d{rXUd89B zB9BxD(;`JcftPJ6E1>^5R)kX(zb>>lY6%6ekM#AsPm-kMP2SBIZ)Me^GK}>7*c_ZX z@cK|?aqtu*!KN@A$$;DG5;y_J%E~rLHD|5K`{thfyLC_MrAxn7ofSL8H)pjwE&=|% zwd{w;rspaZ+p3hb384IlA+w5VxDw~66Xc^SXO4S8Us<`csiDVQzi*8#4T+_3sPYc3 zwr=xK2$C^7_C4cbF>0Btcy{lJL^qS5pl0W9dYg0VSAF^88M25X-9CjkkPMztJsnQC zxKsF+wV>Nbhyo3PjEcj}yH^|{OM*>t~?Y973pn6S*^ktv~^Kh|U*YFc*@ zn);jDLE|F7hAqc#uN=@iVD!@^iVaiXZ54PW7Ti!G+%gSB%e34fOwa#Wu%XVHtKhu$ zKj2dfZ<>YEa;QP+QcwWI&!+PIkr3V*7}lg0SzejeQU85KtYIpbK_J8rczuA#tVp)s z3PtqCdFAq{J;r|@ee_=anf(KOOC))H{)#wbq)wlpZ~8`cavT|1e@L$!b$Q@NO$iv8 z3l~WS`(JX06MwM=?V5_E`wmBMD+F_Oh%at^(QUeLd*`JCW=kelXRE6Qj(zehf4fJ@ z<$y1nZq(#D&A)a2fOy?4x6+opwcmy?p1bWhhbLF7AJ>fTM`NG#>O&~>3V!oxeymTO zL6W#^VE}F^h0F{_0r2$>{Uz2|!;_5oy`o&KlVYT5GI{a`CX?Yv?V@G#;`WT%_SwfoGsmLe zX{zQC9ap4Wi3Vm^8~%J<&M2E(@-``Yv4!+QzxzG3tAlQDn)z}^aO;Y-x;A)nC7o!X z3e75USCPonggs@WPTXTIQSyD$U5gMtCPwSRB}TT??cK?d)t+zM-bW{`B$Rden$?XM z&CI1-GVaYc;JvJDS@HPGucA8pz7@x=k_l$Cx(b@8k2XI>Y+W!x3#Z6WvmIy?KX9RHcW zFZgVN)UQmp)!4p|uOV?JKH;<_nLoadkL5B&xtcbF$jNPyx*s5Y?)dO6kvJ@Gy#*(Y zUGN`sLSW|c?sL+$aLXQnoT2CLd_1}wb=X(2cCGNmzTl}!hSw4`I%^bb7UoGwPx4L$ z2G;r*CO;}9o;>G$<`~DMV}plmK`~~TBDLY|oZuZ}Lh2$A+V_#rm;y&kJ+FW$uu!l9 zpuj>^3V;GjG*ti;S)2zJF<-F&nkQAEc{C^J+pr0ZC~>H{!VrgJrle!Higdi`tMFFxzb;2r6#GhZL*}wh}KK5}&?W9A8FiyrHOQ2U;{S9}Na5soPUN3F>kT;=f z_5FskGm*K6E}NazU9UOerD?AcPm|znWrfVGNI3>gLSlIBj|-mOy~O85?X!MK&7(%1 zJviGK9S;O+v?BF)(0Z~_Gwt?qt^5uU@n2uI{iI5+C$o@OCD)f7cTpwRJJ(*BO8IT| za$VU+imLLuPesPNDr8R)vczL&qYIq)iy_)lDc0KOKeh$-Ne5m)>nm}0{kj59Hi$Lv zcu0BPg^@*fJqHcF5Sg^D_l`N+C?0%Kh0e*9cE%{HibIk8ZOmsk?^yVKi6CT2;GKZ_ zJGqG4f<*>nft?uo_l-B#$9sO^weQnq&4s#zyz}=`4!jPuxA?V~_)_ifSGz!~wX>~g z3bK(Lt2tzxg+@C$h@l&B%TG9@6TzfV&j-IDC+=V4UCrI@(Z}3^=n(tXbkuR{P{>imHK8urt0PB=QGy$#v0Q|tSkDJ^7fwkp@JO1&jgdjv;kSX1)+lYcy1(60XU~fZdiiZSQ{%d*XTq+SiCSX%{4kS z#%%0dGdDq+YgxH>w9tk*J%AM(KyjoeGHs?i+c7<(Y#gViPGfn- zq%-|l$@VU+m;^shM;FEzcLBqW&H#Z{eUhg8@WVpl1H34)QG8#oq?E7#fxyQ(z-hWq zylWzQ zG%(tg=PTsQaIy1C3G<}JB*uh~Nq2TjOm^aU38_BZP))F`eR`7@g|I2z2GAB+z0V{Ns|t=OiV0ySc{)gxTBC0$peiS@Q~V7qw*I5VTtLn5NnLua^9xHKOp_lT&#bY2Lb?GehS`$tAagiV-WmXRSF zW$hMAB+0W8mc2&R{c z`@vm_b1Bo30UiZJ`1P#S*JuRb6n47M4?R%wtH8H+{o=8hm9j$<+Pi zp^1y@;j4}XHJWtRDAsc6g=u>UuG-;$U=d!V?wY{XoswlJp2=j_7vTxo>f4}RD(F|I z)(A>NXwalmLzP~i?E?mES1JE_fy1E_^FOLogHiu)F{`hxY`iu${_waQzo8z(ertTi zR#_ehz#yLrhFnnRkjui)gx!=?NI3?rcJ=}CtDJZ{ z3e)}|&em5wzPfFjJ&Dmv{q6~{`Y`wY9v?p&2&(|7L{EePCOkC73L;o0|G$1Hh z^on?=(nPKbn*hTZ>^{IJb@!c+j5_Zi{@IhdD34Qr=cMkEo zY><>z?!WgR@Vsx!2o>-IAxvgyD38qywPEu)3>u5g2`9aG3eI>eP+?1?o*I&%{_QG37$;Z4W0Mzj+Oup__{aB^kp<-aO%RZS;235HT z26%0PyXFS?O=uyyhP2B;iZ$g~EVIG|2di-XCmEULsBBcbz!pc2_arpMDgTiCFt8Yz z2X8ilwG4wSuS`SlkXW_f-FZgz!LB(AHH%h!A4s`q6e32-QM9ZpQJ4Zle!p~|0@Ni| zHsvKtMwGrC4JWKF>_%bTe|5D(PEq4!E7v56<@|_CaB@Qo4O;c19+&Xk5kbMo`rQxh z9qH>5JZp`S!IFc`WOCDx=ZlbX3|fSNbpgT~kAz^PGCZT~S*J>#mK>e7o<8<)wBHxu zFJtEJ|EZ}jvTm6E8~R0xI=aSpt$*--Z1bhOU0y*SEg73LeAm|(ZYEWPt2YByHZ4?Y ze*2-F9uj6onmL?|&Mbb3IXlkQEIjtXQz+^<>+?z53p^tAItz-)p?xvEG>9s+EgA>c zK2@q44VWU?zp+MzERIFzV}YF*2}iEc(3uyDBI3virdXvASQUaM;BGH>Gmllu~-cCyet_%*=eWP#I{y?E!tzZ z`T2$KsB=EO^VWM#oV&5Q27KsCO67 zV6=NI8d2oXPjW>m;3cv4eIH@qNGRI_wDHNN`wO36r!jlo54zPojpj`Y&&rqpA&jMmW ze8bJByFc|AyNWhysn^lvy*FRNu_hsikmb8hlMoZo*09fBQHKl@dx}ars;uPAHSfx8c(Hba1#N z)P_T2bHULT8#;x|;+w+-cULOC`J zE{nqCvO>5vJQ|zHVsjWAE{)A&&=?FBi%MhL&^S~!hfbq$m`sKM9ByH;XaWJ%h6Tb_ zw0r6(t6VEFWM=KH3j`F*NX_{Co_(d zG7*iyJ#&sX%R%ZQ$ZA1hip=QKN0HE@Me~-f7l{9OymR2o<1>o~_7RY_;mJFWF5lhI Iadi3r0M#$uCjbBd diff --git a/std/compress/lzss/e2e_test.go b/std/compress/lzss/e2e_test.go index 70053c5d77..2a303d3d94 100644 --- a/std/compress/lzss/e2e_test.go +++ b/std/compress/lzss/e2e_test.go @@ -1,13 +1,14 @@ package lzss import ( + goCompress "github.com/consensys/compress" + "github.com/consensys/compress/lzss" "os" "testing" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark/backend" "github.com/consensys/gnark/frontend" - "github.com/consensys/gnark/std/compress" test_vector_utils "github.com/consensys/gnark/std/utils/test_vectors_utils" "github.com/consensys/gnark/test" "github.com/stretchr/testify/assert" @@ -18,32 +19,33 @@ func TestCompression1ZeroE2E(t *testing.T) { } func BenchmarkCompression26KBE2E(b *testing.B) { - _, err := BenchCompressionE2ECompilation(nil, "./testdata/test_cases/3c2943") + _, err := BenchCompressionE2ECompilation(nil, "./testdata/3c2943") assert.NoError(b, err) } func testCompressionE2E(t *testing.T, d, dict []byte, name string) { if d == nil { var err error - d, err = os.ReadFile("./testdata/test_cases/" + name + "/data.bin") + d, err = os.ReadFile("./testdata/" + name + "/data.bin") assert.NoError(t, err) } // compress - compressor, err := NewCompressor(dict, BestCompression) + level := lzss.GoodCompression + compressor, err := lzss.NewCompressor(dict, level) assert.NoError(t, err) c, err := compressor.Compress(d) assert.NoError(t, err) - cStream, err := compress.NewStream(c, uint8(compressor.level)) + cStream, err := goCompress.NewStream(c, uint8(level)) assert.NoError(t, err) cSum, err := check(cStream, cStream.Len()) assert.NoError(t, err) - dStream, err := compress.NewStream(d, 8) + dStream, err := goCompress.NewStream(d, 8) assert.NoError(t, err) dSum, err := check(dStream, len(d)) @@ -53,7 +55,7 @@ func testCompressionE2E(t *testing.T, d, dict []byte, name string) { C: make([]frontend.Variable, cStream.Len()), D: make([]frontend.Variable, len(d)), Dict: make([]byte, len(dict)), - Level: BestCompression, + Level: level, } // solve the circuit or only compile it @@ -71,10 +73,10 @@ func testCompressionE2E(t *testing.T, d, dict []byte, name string) { } func TestChecksum0(t *testing.T) { - testChecksum(t, compress.Stream{D: []int{}, NbSymbs: 256}) + testChecksum(t, goCompress.Stream{D: []int{}, NbSymbs: 256}) } -func testChecksum(t *testing.T, d compress.Stream) { +func testChecksum(t *testing.T, d goCompress.Stream) { circuit := checksumTestCircuit{ Inputs: make([]frontend.Variable, d.Len()), InputLen: d.Len(), diff --git a/std/compress/lzss/internal/suffixarray/sais.go b/std/compress/lzss/internal/suffixarray/sais.go deleted file mode 100644 index 4be4f735c0..0000000000 --- a/std/compress/lzss/internal/suffixarray/sais.go +++ /dev/null @@ -1,899 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Suffix array construction by induced sorting (SAIS). -// See Ge Nong, Sen Zhang, and Wai Hong Chen, -// "Two Efficient Algorithms for Linear Time Suffix Array Construction", -// especially section 3 (https://ieeexplore.ieee.org/document/5582081). -// See also http://zork.net/~st/jottings/sais.html. -// -// With optimizations inspired by Yuta Mori's sais-lite -// (https://sites.google.com/site/yuta256/sais). -// -// And with other new optimizations. - -// Many of these functions are parameterized by the sizes of -// the types they operate on. The generator gen.go makes -// copies of these functions for use with other sizes. -// Specifically: -// -// - A function with a name ending in _8_32 takes []byte and []int32 arguments -// and is duplicated into _32_32, _8_64, and _64_64 forms. -// The _32_32 and _64_64_ suffixes are shortened to plain _32 and _64. -// Any lines in the function body that contain the text "byte-only" or "256" -// are stripped when creating _32_32 and _64_64 forms. -// (Those lines are typically 8-bit-specific optimizations.) -// -// - A function with a name ending only in _32 operates on []int32 -// and is duplicated into a _64 form. (Note that it may still take a []byte, -// but there is no need for a version of the function in which the []byte -// is widened to a full integer array.) - -// The overall runtime of this code is linear in the input size: -// it runs a sequence of linear passes to reduce the problem to -// a subproblem at most half as big, invokes itself recursively, -// and then runs a sequence of linear passes to turn the answer -// for the subproblem into the answer for the original problem. -// This gives T(N) = O(N) + T(N/2) = O(N) + O(N/2) + O(N/4) + ... = O(N). -// -// The outline of the code, with the forward and backward scans -// through O(N)-sized arrays called out, is: -// -// sais_I_N -// placeLMS_I_B -// bucketMax_I_B -// freq_I_B -// (1) -// (2) -// (3) -// induceSubL_I_B -// bucketMin_I_B -// freq_I_B -// (4) -// (5) -// (6) -// induceSubS_I_B -// bucketMax_I_B -// freq_I_B -// (7) -// (8) -// (9) -// assignID_I_B -// (10) -// map_B -// (11) -// recurse_B -// (recursive call to sais_B_B for a subproblem of size at most 1/2 input, often much smaller) -// unmap_I_B -// (12) -// (13) -// expand_I_B -// bucketMax_I_B -// freq_I_B -// (14) -// (15) -// (16) -// induceL_I_B -// bucketMin_I_B -// freq_I_B -// (17) -// (18) -// (19) -// induceS_I_B -// bucketMax_I_B -// freq_I_B -// (20) -// (21) -// (22) -// -// Here, _B indicates the suffix array size (_32 or _64) and _I the input size (_8 or _B). -// -// The outline shows there are in general 22 scans through -// O(N)-sized arrays for a given level of the recursion. -// In the top level, operating on 8-bit input text, -// the six freq scans are fixed size (256) instead of potentially -// input-sized. Also, the frequency is counted once and cached -// whenever there is room to do so (there is nearly always room in general, -// and always room at the top level), which eliminates all but -// the first freq_I_B text scans (that is, 5 of the 6). -// So the top level of the recursion only does 22 - 6 - 5 = 11 -// input-sized scans and a typical level does 16 scans. -// -// The linear scans do not cost anywhere near as much as -// the random accesses to the text made during a few of -// the scans (specifically #6, #9, #16, #19, #22 marked above). -// In real texts, there is not much but some locality to -// the accesses, due to the repetitive structure of the text -// (the same reason Burrows-Wheeler compression is so effective). -// For random inputs, there is no locality, which makes those -// accesses even more expensive, especially once the text -// no longer fits in cache. -// For example, running on 50 MB of Go source code, induceSubL_8_32 -// (which runs only once, at the top level of the recursion) -// takes 0.44s, while on 50 MB of random input, it takes 2.55s. -// Nearly all the relative slowdown is explained by the text access: -// -// c0, c1 := text[k-1], text[k] -// -// That line runs for 0.23s on the Go text and 2.02s on random text. - -// @gbotrel there was a go generate line here. necessary? - -package suffixarray - -// text_32 returns the suffix array for the input text. -// It requires that len(text) fit in an int32 -// and that the caller zero sa. -func text_32(text []byte, sa []int32) { - if int(int32(len(text))) != len(text) || len(text) != len(sa) { - panic("suffixarray: misuse of text_32") - } - sais_8_32(text, 256, sa, make([]int32, 2*256)) -} - -// sais_8_32 computes the suffix array of text. -// The text must contain only values in [0, textMax). -// The suffix array is stored in sa, which the caller -// must ensure is already zeroed. -// The caller must also provide temporary space tmp -// with len(tmp) ≥ textMax. If len(tmp) ≥ 2*textMax -// then the algorithm runs a little faster. -// If sais_8_32 modifies tmp, it sets tmp[0] = -1 on return. -func sais_8_32(text []byte, textMax int, sa, tmp []int32) { - if len(sa) != len(text) || len(tmp) < textMax { - panic("suffixarray: misuse of sais_8_32") - } - - // Trivial base cases. Sorting 0 or 1 things is easy. - if len(text) == 0 { - return - } - if len(text) == 1 { - sa[0] = 0 - return - } - - // Establish slices indexed by text character - // holding character frequency and bucket-sort offsets. - // If there's only enough tmp for one slice, - // we make it the bucket offsets and recompute - // the character frequency each time we need it. - var freq, bucket []int32 - if len(tmp) >= 2*textMax { - freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] - freq[0] = -1 // mark as uninitialized - } else { - freq, bucket = nil, tmp[:textMax] - } - - // The SAIS algorithm. - // Each of these calls makes one scan through sa. - // See the individual functions for documentation - // about each's role in the algorithm. - numLMS := placeLMS_8_32(text, sa, freq, bucket) - if numLMS <= 1 { - // 0 or 1 items are already sorted. Do nothing. - } else { - induceSubL_8_32(text, sa, freq, bucket) - induceSubS_8_32(text, sa, freq, bucket) - length_8_32(text, sa, numLMS) - maxID := assignID_8_32(text, sa, numLMS) - if maxID < numLMS { - map_32(sa, numLMS) - recurse_32(sa, tmp, numLMS, maxID) - unmap_8_32(text, sa, numLMS) - } else { - // If maxID == numLMS, then each LMS-substring - // is unique, so the relative ordering of two LMS-suffixes - // is determined by just the leading LMS-substring. - // That is, the LMS-suffix sort order matches the - // (simpler) LMS-substring sort order. - // Copy the original LMS-substring order into the - // suffix array destination. - copy(sa, sa[len(sa)-numLMS:]) - } - expand_8_32(text, freq, bucket, sa, numLMS) - } - induceL_8_32(text, sa, freq, bucket) - induceS_8_32(text, sa, freq, bucket) - - // Mark for caller that we overwrote tmp. - tmp[0] = -1 -} - -// freq_8_32 returns the character frequencies -// for text, as a slice indexed by character value. -// If freq is nil, freq_8_32 uses and returns bucket. -// If freq is non-nil, freq_8_32 assumes that freq[0] >= 0 -// means the frequencies are already computed. -// If the frequency data is overwritten or uninitialized, -// the caller must set freq[0] = -1 to force recomputation -// the next time it is needed. -func freq_8_32(text []byte, freq, bucket []int32) []int32 { - if freq != nil && freq[0] >= 0 { - return freq // already computed - } - if freq == nil { - freq = bucket - } - - freq = freq[:256] // eliminate bounds check for freq[c] below - for i := range freq { - freq[i] = 0 - } - for _, c := range text { - freq[c]++ - } - return freq -} - -// bucketMin_8_32 stores into bucket[c] the minimum index -// in the bucket for character c in a bucket-sort of text. -func bucketMin_8_32(text []byte, freq, bucket []int32) { - freq = freq_8_32(text, freq, bucket) - freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below - bucket = bucket[:256] // eliminate bounds check for bucket[i] below - total := int32(0) - for i, n := range freq { - bucket[i] = total - total += n - } -} - -// bucketMax_8_32 stores into bucket[c] the maximum index -// in the bucket for character c in a bucket-sort of text. -// The bucket indexes for c are [min, max). -// That is, max is one past the final index in that bucket. -func bucketMax_8_32(text []byte, freq, bucket []int32) { - freq = freq_8_32(text, freq, bucket) - freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below - bucket = bucket[:256] // eliminate bounds check for bucket[i] below - total := int32(0) - for i, n := range freq { - total += n - bucket[i] = total - } -} - -// The SAIS algorithm proceeds in a sequence of scans through sa. -// Each of the following functions implements one scan, -// and the functions appear here in the order they execute in the algorithm. - -// placeLMS_8_32 places into sa the indexes of the -// final characters of the LMS substrings of text, -// sorted into the rightmost ends of their correct buckets -// in the suffix array. -// -// The imaginary sentinel character at the end of the text -// is the final character of the final LMS substring, but there -// is no bucket for the imaginary sentinel character, -// which has a smaller value than any real character. -// The caller must therefore pretend that sa[-1] == len(text). -// -// The text indexes of LMS-substring characters are always ≥ 1 -// (the first LMS-substring must be preceded by one or more L-type -// characters that are not part of any LMS-substring), -// so using 0 as a “not present” suffix array entry is safe, -// both in this function and in most later functions -// (until induceL_8_32 below). -func placeLMS_8_32(text []byte, sa, freq, bucket []int32) int { - bucketMax_8_32(text, freq, bucket) - - numLMS := 0 - lastB := int32(-1) - bucket = bucket[:256] // eliminate bounds check for bucket[c1] below - - // The next stanza of code (until the blank line) loop backward - // over text, stopping to execute a code body at each position i - // such that text[i] is an L-character and text[i+1] is an S-character. - // That is, i+1 is the position of the start of an LMS-substring. - // These could be hoisted out into a function with a callback, - // but at a significant speed cost. Instead, we just write these - // seven lines a few times in this source file. The copies below - // refer back to the pattern established by this original as the - // "LMS-substring iterator". - // - // In every scan through the text, c0, c1 are successive characters of text. - // In this backward scan, c0 == text[i] and c1 == text[i+1]. - // By scanning backward, we can keep track of whether the current - // position is type-S or type-L according to the usual definition: - // - // - position len(text) is type S with text[len(text)] == -1 (the sentinel) - // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. - // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. - // - // The backward scan lets us maintain the current type, - // update it when we see c0 != c1, and otherwise leave it alone. - // We want to identify all S positions with a preceding L. - // Position len(text) is one such position by definition, but we have - // nowhere to write it down, so we eliminate it by untruthfully - // setting isTypeS = false at the start of the loop. - c0, c1, isTypeS := byte(0), byte(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Bucket the index i+1 for the start of an LMS-substring. - b := bucket[c1] - 1 - bucket[c1] = b - sa[b] = int32(i + 1) - lastB = b - numLMS++ - } - } - - // We recorded the LMS-substring starts but really want the ends. - // Luckily, with two differences, the start indexes and the end indexes are the same. - // The first difference is that the rightmost LMS-substring's end index is len(text), - // so the caller must pretend that sa[-1] == len(text), as noted above. - // The second difference is that the first leftmost LMS-substring start index - // does not end an earlier LMS-substring, so as an optimization we can omit - // that leftmost LMS-substring start index (the last one we wrote). - // - // Exception: if numLMS <= 1, the caller is not going to bother with - // the recursion at all and will treat the result as containing LMS-substring starts. - // In that case, we don't remove the final entry. - if numLMS > 1 { - sa[lastB] = 0 - } - return numLMS -} - -// induceSubL_8_32 inserts the L-type text indexes of LMS-substrings -// into sa, assuming that the final characters of the LMS-substrings -// are already inserted into sa, sorted by final character, and at the -// right (not left) end of the corresponding character bucket. -// Each LMS-substring has the form (as a regexp) /S+L+S/: -// one or more S-type, one or more L-type, final S-type. -// induceSubL_8_32 leaves behind only the leftmost L-type text -// index for each LMS-substring. That is, it removes the final S-type -// indexes that are present on entry, and it inserts but then removes -// the interior L-type indexes too. -// (Only the leftmost L-type index is needed by induceSubS_8_32.) -func induceSubL_8_32(text []byte, sa, freq, bucket []int32) { - // Initialize positions for left side of character buckets. - bucketMin_8_32(text, freq, bucket) - bucket = bucket[:256] // eliminate bounds check for bucket[cB] below - - // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly - // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. - // Because j-1 is type L, inserting it into sa now will sort it correctly. - // But we want to distinguish a j-1 with j-2 of type L from type S. - // We can process the former but want to leave the latter for the caller. - // We record the difference by negating j-1 if it is preceded by type S. - // Either way, the insertion (into the text[j-1] bucket) is guaranteed to - // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have - // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, - // and so on, in sorted but not necessarily adjacent order, until it finds - // one preceded by an index of type S, at which point it must stop. - // - // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, - // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing - // only the indexes of the leftmost L-type indexes for each LMS-substring. - // - // The suffix array sa therefore serves simultaneously as input, output, - // and a miraculously well-tailored work queue. - - // placeLMS_8_32 left out the implicit entry sa[-1] == len(text), - // corresponding to the identified type-L index len(text)-1. - // Process it before the left-to-right scan of sa proper. - // See body in loop for commentary. - k := len(text) - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - // Cache recently used bucket index: - // we're processing suffixes in sorted order - // and accessing buckets indexed by the - // byte before the sorted order, which still - // has very good locality. - // Invariant: b is cached, possibly dirty copy of bucket[cB]. - cB := c1 - b := bucket[cB] - sa[b] = int32(k) - b++ - - for i := 0; i < len(sa); i++ { - j := int(sa[i]) - if j == 0 { - // Skip empty entry. - continue - } - if j < 0 { - // Leave discovered type-S index for caller. - sa[i] = int32(-j) - continue - } - sa[i] = 0 - - // Index j was on work queue, meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is L-type, queue k for processing later in this loop. - // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. - k := j - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - sa[b] = int32(k) - b++ - } -} - -// induceSubS_8_32 inserts the S-type text indexes of LMS-substrings -// into sa, assuming that the leftmost L-type text indexes are already -// inserted into sa, sorted by LMS-substring suffix, and at the -// left end of the corresponding character bucket. -// Each LMS-substring has the form (as a regexp) /S+L+S/: -// one or more S-type, one or more L-type, final S-type. -// induceSubS_8_32 leaves behind only the leftmost S-type text -// index for each LMS-substring, in sorted order, at the right end of sa. -// That is, it removes the L-type indexes that are present on entry, -// and it inserts but then removes the interior S-type indexes too, -// leaving the LMS-substring start indexes packed into sa[len(sa)-numLMS:]. -// (Only the LMS-substring start indexes are processed by the recursion.) -func induceSubS_8_32(text []byte, sa, freq, bucket []int32) { - // Initialize positions for right side of character buckets. - bucketMax_8_32(text, freq, bucket) - bucket = bucket[:256] // eliminate bounds check for bucket[cB] below - - // Analogous to induceSubL_8_32 above, - // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly - // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. - // Because j-1 is type S, inserting it into sa now will sort it correctly. - // But we want to distinguish a j-1 with j-2 of type S from type L. - // We can process the former but want to leave the latter for the caller. - // We record the difference by negating j-1 if it is preceded by type L. - // Either way, the insertion (into the text[j-1] bucket) is guaranteed to - // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have - // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, - // and so on, in sorted but not necessarily adjacent order, until it finds - // one preceded by an index of type L, at which point it must stop. - // That index (preceded by one of type L) is an LMS-substring start. - // - // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, - // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, - // so that the loop finishes with the top of sa containing exactly - // the LMS-substring start indexes, sorted by LMS-substring. - - // Cache recently used bucket index: - cB := byte(0) - b := bucket[cB] - - top := len(sa) - for i := len(sa) - 1; i >= 0; i-- { - j := int(sa[i]) - if j == 0 { - // Skip empty entry. - continue - } - sa[i] = 0 - if j < 0 { - // Leave discovered LMS-substring start index for caller. - top-- - sa[top] = int32(-j) - continue - } - - // Index j was on work queue, meaning k := j-1 is S-type, - // so we can now place k correctly into sa. - // If k-1 is S-type, queue k for processing later in this loop. - // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. - k := j - 1 - c1 := text[k] - c0 := text[k-1] - if c0 > c1 { - k = -k - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - b-- - sa[b] = int32(k) - } -} - -// length_8_32 computes and records the length of each LMS-substring in text. -// The length of the LMS-substring at index j is stored at sa[j/2], -// avoiding the LMS-substring indexes already stored in the top half of sa. -// (If index j is an LMS-substring start, then index j-1 is type L and cannot be.) -// There are two exceptions, made for optimizations in name_8_32 below. -// -// First, the final LMS-substring is recorded as having length 0, which is otherwise -// impossible, instead of giving it a length that includes the implicit sentinel. -// This ensures the final LMS-substring has length unequal to all others -// and therefore can be detected as different without text comparison -// (it is unequal because it is the only one that ends in the implicit sentinel, -// and the text comparison would be problematic since the implicit sentinel -// is not actually present at text[len(text)]). -// -// Second, to avoid text comparison entirely, if an LMS-substring is very short, -// sa[j/2] records its actual text instead of its length, so that if two such -// substrings have matching “length,” the text need not be read at all. -// The definition of “very short” is that the text bytes must pack into a uint32, -// and the unsigned encoding e must be ≥ len(text), so that it can be -// distinguished from a valid length. -func length_8_32(text []byte, sa []int32, numLMS int) { - end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) - - // The encoding of N text bytes into a “length” word - // adds 1 to each byte, packs them into the bottom - // N*8 bits of a word, and then bitwise inverts the result. - // That is, the text sequence A B C (hex 41 42 43) - // encodes as ^uint32(0x42_43_44). - // LMS-substrings can never start or end with 0xFF. - // Adding 1 ensures the encoded byte sequence never - // starts or ends with 0x00, so that present bytes can be - // distinguished from zero-padding in the top bits, - // so the length need not be separately encoded. - // Inverting the bytes increases the chance that a - // 4-byte encoding will still be ≥ len(text). - // In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F) - // then the high bit of the inversion will be set, - // making it clearly not a valid length (it would be a negative one). - // - // cx holds the pre-inverted encoding (the packed incremented bytes). - cx := uint32(0) // byte-only - - // This stanza (until the blank line) is the "LMS-substring iterator", - // described in placeLMS_8_32 above, with one line added to maintain cx. - c0, c1, isTypeS := byte(0), byte(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - cx = cx<<8 | uint32(c1+1) // byte-only - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Index j = i+1 is the start of an LMS-substring. - // Compute length or encoded text to store in sa[j/2]. - j := i + 1 - var code int32 - if end == 0 { - code = 0 - } else { - code = int32(end - j) - if code <= 32/8 && ^cx >= uint32(len(text)) { // byte-only - code = int32(^cx) // byte-only - } // byte-only - } - sa[j>>1] = code - end = j + 1 - cx = uint32(c1 + 1) // byte-only - } - } -} - -// assignID_8_32 assigns a dense ID numbering to the -// set of LMS-substrings respecting string ordering and equality, -// returning the maximum assigned ID. -// For example given the input "ababab", the LMS-substrings -// are "aba", "aba", and "ab", renumbered as 2 2 1. -// sa[len(sa)-numLMS:] holds the LMS-substring indexes -// sorted in string order, so to assign numbers we can -// consider each in turn, removing adjacent duplicates. -// The new ID for the LMS-substring at index j is written to sa[j/2], -// overwriting the length previously stored there (by length_8_32 above). -func assignID_8_32(text []byte, sa []int32, numLMS int) int { - id := 0 - lastLen := int32(-1) // impossible - lastPos := int32(0) - for _, j := range sa[len(sa)-numLMS:] { - // Is the LMS-substring at index j new, or is it the same as the last one we saw? - n := sa[j/2] - if n != lastLen { - goto New - } - if uint32(n) >= uint32(len(text)) { - // “Length” is really encoded full text, and they match. - goto Same - } - { - // Compare actual texts. - n := int(n) - this := text[j:][:n] - last := text[lastPos:][:n] - for i := 0; i < n; i++ { - if this[i] != last[i] { - goto New - } - } - goto Same - } - New: - id++ - lastPos = j - lastLen = n - Same: - sa[j/2] = int32(id) - } - return id -} - -// map_32 maps the LMS-substrings in text to their new IDs, -// producing the subproblem for the recursion. -// The mapping itself was mostly applied by assignID_8_32: -// sa[i] is either 0, the ID for the LMS-substring at index 2*i, -// or the ID for the LMS-substring at index 2*i+1. -// To produce the subproblem we need only remove the zeros -// and change ID into ID-1 (our IDs start at 1, but text chars start at 0). -// -// map_32 packs the result, which is the input to the recursion, -// into the top of sa, so that the recursion result can be stored -// in the bottom of sa, which sets up for expand_8_32 well. -func map_32(sa []int32, numLMS int) { - w := len(sa) - for i := len(sa) / 2; i >= 0; i-- { - j := sa[i] - if j > 0 { - w-- - sa[w] = j - 1 - } - } -} - -// recurse_32 calls sais_32 recursively to solve the subproblem we've built. -// The subproblem is at the right end of sa, the suffix array result will be -// written at the left end of sa, and the middle of sa is available for use as -// temporary frequency and bucket storage. -func recurse_32(sa, oldTmp []int32, numLMS, maxID int) { - dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:] - - // Set up temporary space for recursive call. - // We must pass sais_32 a tmp buffer with at least maxID entries. - // - // The subproblem is guaranteed to have length at most len(sa)/2, - // so that sa can hold both the subproblem and its suffix array. - // Nearly all the time, however, the subproblem has length < len(sa)/3, - // in which case there is a subproblem-sized middle of sa that - // we can reuse for temporary space (saTmp). - // When recurse_32 is called from sais_8_32, oldTmp is length 512 - // (from text_32), and saTmp will typically be much larger, so we'll use saTmp. - // When deeper recursions come back to recurse_32, now oldTmp is - // the saTmp from the top-most recursion, it is typically larger than - // the current saTmp (because the current sa gets smaller and smaller - // as the recursion gets deeper), and we keep reusing that top-most - // large saTmp instead of the offered smaller ones. - // - // Why is the subproblem length so often just under len(sa)/3? - // See Nong, Zhang, and Chen, section 3.6 for a plausible explanation. - // In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern - // in the input, perfect alternation of larger and smaller input bytes. - // Real text doesn't do that. If each L-type index is randomly followed - // by either an L-type or S-type index, then half the substrings will - // be of the form SLS, but the other half will be longer. Of that half, - // half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on. - // Not counting the final S in each (which overlaps the first S in the next), - // This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3. - // The space we need is further reduced by the fact that many of the - // short patterns like SLS will often be the same character sequences - // repeated throughout the text, reducing maxID relative to numLMS. - // - // For short inputs, the averages may not run in our favor, but then we - // can often fall back to using the length-512 tmp available in the - // top-most call. (Also a short allocation would not be a big deal.) - // - // For pathological inputs, we fall back to allocating a new tmp of length - // max(maxID, numLMS/2). This level of the recursion needs maxID, - // and all deeper levels of the recursion will need no more than numLMS/2, - // so this one allocation is guaranteed to suffice for the entire stack - // of recursive calls. - tmp := oldTmp - if len(tmp) < len(saTmp) { - tmp = saTmp - } - if len(tmp) < numLMS { - // TestSAIS/forcealloc reaches this code. - n := maxID - if n < numLMS/2 { - n = numLMS / 2 - } - tmp = make([]int32, n) - } - - // sais_32 requires that the caller arrange to clear dst, - // because in general the caller may know dst is - // freshly-allocated and already cleared. But this one is not. - for i := range dst { - dst[i] = 0 - } - sais_32(text, maxID, dst, tmp) -} - -// unmap_8_32 unmaps the subproblem back to the original. -// sa[:numLMS] is the LMS-substring numbers, which don't matter much anymore. -// sa[len(sa)-numLMS:] is the sorted list of those LMS-substring numbers. -// The key part is that if the list says K that means the K'th substring. -// We can replace sa[:numLMS] with the indexes of the LMS-substrings. -// Then if the list says K it really means sa[K]. -// Having mapped the list back to LMS-substring indexes, -// we can place those into the right buckets. -func unmap_8_32(text []byte, sa []int32, numLMS int) { - unmap := sa[len(sa)-numLMS:] - j := len(unmap) - - // "LMS-substring iterator" (see placeLMS_8_32 above). - c0, c1, isTypeS := byte(0), byte(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Populate inverse map. - j-- - unmap[j] = int32(i + 1) - } - } - - // Apply inverse map to subproblem suffix array. - sa = sa[:numLMS] - for i := 0; i < len(sa); i++ { - sa[i] = unmap[sa[i]] - } -} - -// expand_8_32 distributes the compacted, sorted LMS-suffix indexes -// from sa[:numLMS] into the tops of the appropriate buckets in sa, -// preserving the sorted order and making room for the L-type indexes -// to be slotted into the sorted sequence by induceL_8_32. -func expand_8_32(text []byte, freq, bucket, sa []int32, numLMS int) { - bucketMax_8_32(text, freq, bucket) - bucket = bucket[:256] // eliminate bound check for bucket[c] below - - // Loop backward through sa, always tracking - // the next index to populate from sa[:numLMS]. - // When we get to one, populate it. - // Zero the rest of the slots; they have dead values in them. - x := numLMS - 1 - saX := sa[x] - c := text[saX] - b := bucket[c] - 1 - bucket[c] = b - - for i := len(sa) - 1; i >= 0; i-- { - if i != int(b) { - sa[i] = 0 - continue - } - sa[i] = saX - - // Load next entry to put down (if any). - if x > 0 { - x-- - saX = sa[x] // TODO bounds check - c = text[saX] - b = bucket[c] - 1 - bucket[c] = b - } - } -} - -// induceL_8_32 inserts L-type text indexes into sa, -// assuming that the leftmost S-type indexes are inserted -// into sa, in sorted order, in the right bucket halves. -// It leaves all the L-type indexes in sa, but the -// leftmost L-type indexes are negated, to mark them -// for processing by induceS_8_32. -func induceL_8_32(text []byte, sa, freq, bucket []int32) { - // Initialize positions for left side of character buckets. - bucketMin_8_32(text, freq, bucket) - bucket = bucket[:256] // eliminate bounds check for bucket[cB] below - - // This scan is similar to the one in induceSubL_8_32 above. - // That one arranges to clear all but the leftmost L-type indexes. - // This scan leaves all the L-type indexes and the original S-type - // indexes, but it negates the positive leftmost L-type indexes - // (the ones that induceS_8_32 needs to process). - - // expand_8_32 left out the implicit entry sa[-1] == len(text), - // corresponding to the identified type-L index len(text)-1. - // Process it before the left-to-right scan of sa proper. - // See body in loop for commentary. - k := len(text) - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - // Cache recently used bucket index. - cB := c1 - b := bucket[cB] - sa[b] = int32(k) - b++ - - for i := 0; i < len(sa); i++ { - j := int(sa[i]) - if j <= 0 { - // Skip empty or negated entry (including negated zero). - continue - } - - // Index j was on work queue, meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is L-type, queue k for processing later in this loop. - // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. - // If k is zero, k-1 doesn't exist, so we only need to leave it - // for the caller. The caller can't tell the difference between - // an empty slot and a non-empty zero, but there's no need - // to distinguish them anyway: the final suffix array will end up - // with one zero somewhere, and that will be a real zero. - k := j - 1 - c1 := text[k] - if k > 0 { - if c0 := text[k-1]; c0 < c1 { - k = -k - } - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - sa[b] = int32(k) - b++ - } -} - -func induceS_8_32(text []byte, sa, freq, bucket []int32) { - // Initialize positions for right side of character buckets. - bucketMax_8_32(text, freq, bucket) - bucket = bucket[:256] // eliminate bounds check for bucket[cB] below - - cB := byte(0) - b := bucket[cB] - - for i := len(sa) - 1; i >= 0; i-- { - j := int(sa[i]) - if j >= 0 { - // Skip non-flagged entry. - // (This loop can't see an empty entry; 0 means the real zero index.) - continue - } - - // Negative j is a work queue entry; rewrite to positive j for final suffix array. - j = -j - sa[i] = int32(j) - - // Index j was on work queue (encoded as -j but now decoded), - // meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is S-type, queue -k for processing later in this loop. - // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. - // If k is zero, k-1 doesn't exist, so we only need to leave it - // for the caller. - k := j - 1 - c1 := text[k] - if k > 0 { - if c0 := text[k-1]; c0 <= c1 { - k = -k - } - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - b-- - sa[b] = int32(k) - } -} diff --git a/std/compress/lzss/internal/suffixarray/sais2.go b/std/compress/lzss/internal/suffixarray/sais2.go deleted file mode 100644 index 32b8972801..0000000000 --- a/std/compress/lzss/internal/suffixarray/sais2.go +++ /dev/null @@ -1,1741 +0,0 @@ -// Copyright 2019 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Code generated by go generate; DO NOT EDIT. - -package suffixarray - -func text_64(text []byte, sa []int64) { - if int(int64(len(text))) != len(text) || len(text) != len(sa) { - panic("suffixarray: misuse of text_64") - } - sais_8_64(text, 256, sa, make([]int64, 2*256)) -} - -func sais_8_64(text []byte, textMax int, sa, tmp []int64) { - if len(sa) != len(text) || len(tmp) < int(textMax) { - panic("suffixarray: misuse of sais_8_64") - } - - // Trivial base cases. Sorting 0 or 1 things is easy. - if len(text) == 0 { - return - } - if len(text) == 1 { - sa[0] = 0 - return - } - - // Establish slices indexed by text character - // holding character frequency and bucket-sort offsets. - // If there's only enough tmp for one slice, - // we make it the bucket offsets and recompute - // the character frequency each time we need it. - var freq, bucket []int64 - if len(tmp) >= 2*textMax { - freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] - freq[0] = -1 // mark as uninitialized - } else { - freq, bucket = nil, tmp[:textMax] - } - - // The SAIS algorithm. - // Each of these calls makes one scan through sa. - // See the individual functions for documentation - // about each's role in the algorithm. - numLMS := placeLMS_8_64(text, sa, freq, bucket) - if numLMS <= 1 { - // 0 or 1 items are already sorted. Do nothing. - } else { - induceSubL_8_64(text, sa, freq, bucket) - induceSubS_8_64(text, sa, freq, bucket) - length_8_64(text, sa, numLMS) - maxID := assignID_8_64(text, sa, numLMS) - if maxID < numLMS { - map_64(sa, numLMS) - recurse_64(sa, tmp, numLMS, maxID) - unmap_8_64(text, sa, numLMS) - } else { - // If maxID == numLMS, then each LMS-substring - // is unique, so the relative ordering of two LMS-suffixes - // is determined by just the leading LMS-substring. - // That is, the LMS-suffix sort order matches the - // (simpler) LMS-substring sort order. - // Copy the original LMS-substring order into the - // suffix array destination. - copy(sa, sa[len(sa)-numLMS:]) - } - expand_8_64(text, freq, bucket, sa, numLMS) - } - induceL_8_64(text, sa, freq, bucket) - induceS_8_64(text, sa, freq, bucket) - - // Mark for caller that we overwrote tmp. - tmp[0] = -1 -} - -func sais_32(text []int32, textMax int, sa, tmp []int32) { - if len(sa) != len(text) || len(tmp) < int(textMax) { - panic("suffixarray: misuse of sais_32") - } - - // Trivial base cases. Sorting 0 or 1 things is easy. - if len(text) == 0 { - return - } - if len(text) == 1 { - sa[0] = 0 - return - } - - // Establish slices indexed by text character - // holding character frequency and bucket-sort offsets. - // If there's only enough tmp for one slice, - // we make it the bucket offsets and recompute - // the character frequency each time we need it. - var freq, bucket []int32 - if len(tmp) >= 2*textMax { - freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] - freq[0] = -1 // mark as uninitialized - } else { - freq, bucket = nil, tmp[:textMax] - } - - // The SAIS algorithm. - // Each of these calls makes one scan through sa. - // See the individual functions for documentation - // about each's role in the algorithm. - numLMS := placeLMS_32(text, sa, freq, bucket) - if numLMS <= 1 { - // 0 or 1 items are already sorted. Do nothing. - } else { - induceSubL_32(text, sa, freq, bucket) - induceSubS_32(text, sa, freq, bucket) - length_32(text, sa, numLMS) - maxID := assignID_32(text, sa, numLMS) - if maxID < numLMS { - map_32(sa, numLMS) - recurse_32(sa, tmp, numLMS, maxID) - unmap_32(text, sa, numLMS) - } else { - // If maxID == numLMS, then each LMS-substring - // is unique, so the relative ordering of two LMS-suffixes - // is determined by just the leading LMS-substring. - // That is, the LMS-suffix sort order matches the - // (simpler) LMS-substring sort order. - // Copy the original LMS-substring order into the - // suffix array destination. - copy(sa, sa[len(sa)-numLMS:]) - } - expand_32(text, freq, bucket, sa, numLMS) - } - induceL_32(text, sa, freq, bucket) - induceS_32(text, sa, freq, bucket) - - // Mark for caller that we overwrote tmp. - tmp[0] = -1 -} - -func sais_64(text []int64, textMax int, sa, tmp []int64) { - if len(sa) != len(text) || len(tmp) < int(textMax) { - panic("suffixarray: misuse of sais_64") - } - - // Trivial base cases. Sorting 0 or 1 things is easy. - if len(text) == 0 { - return - } - if len(text) == 1 { - sa[0] = 0 - return - } - - // Establish slices indexed by text character - // holding character frequency and bucket-sort offsets. - // If there's only enough tmp for one slice, - // we make it the bucket offsets and recompute - // the character frequency each time we need it. - var freq, bucket []int64 - if len(tmp) >= 2*textMax { - freq, bucket = tmp[:textMax], tmp[textMax:2*textMax] - freq[0] = -1 // mark as uninitialized - } else { - freq, bucket = nil, tmp[:textMax] - } - - // The SAIS algorithm. - // Each of these calls makes one scan through sa. - // See the individual functions for documentation - // about each's role in the algorithm. - numLMS := placeLMS_64(text, sa, freq, bucket) - if numLMS <= 1 { - // 0 or 1 items are already sorted. Do nothing. - } else { - induceSubL_64(text, sa, freq, bucket) - induceSubS_64(text, sa, freq, bucket) - length_64(text, sa, numLMS) - maxID := assignID_64(text, sa, numLMS) - if maxID < numLMS { - map_64(sa, numLMS) - recurse_64(sa, tmp, numLMS, maxID) - unmap_64(text, sa, numLMS) - } else { - // If maxID == numLMS, then each LMS-substring - // is unique, so the relative ordering of two LMS-suffixes - // is determined by just the leading LMS-substring. - // That is, the LMS-suffix sort order matches the - // (simpler) LMS-substring sort order. - // Copy the original LMS-substring order into the - // suffix array destination. - copy(sa, sa[len(sa)-numLMS:]) - } - expand_64(text, freq, bucket, sa, numLMS) - } - induceL_64(text, sa, freq, bucket) - induceS_64(text, sa, freq, bucket) - - // Mark for caller that we overwrote tmp. - tmp[0] = -1 -} - -func freq_8_64(text []byte, freq, bucket []int64) []int64 { - if freq != nil && freq[0] >= 0 { - return freq // already computed - } - if freq == nil { - freq = bucket - } - - freq = freq[:256] // eliminate bounds check for freq[c] below - for i := range freq { - freq[i] = 0 - } - for _, c := range text { - freq[c]++ - } - return freq -} - -func freq_32(text []int32, freq, bucket []int32) []int32 { - if freq != nil && freq[0] >= 0 { - return freq // already computed - } - if freq == nil { - freq = bucket - } - - for i := range freq { - freq[i] = 0 - } - for _, c := range text { - freq[c]++ - } - return freq -} - -func freq_64(text []int64, freq, bucket []int64) []int64 { - if freq != nil && freq[0] >= 0 { - return freq // already computed - } - if freq == nil { - freq = bucket - } - - for i := range freq { - freq[i] = 0 - } - for _, c := range text { - freq[c]++ - } - return freq -} - -func bucketMin_8_64(text []byte, freq, bucket []int64) { - freq = freq_8_64(text, freq, bucket) - freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below - bucket = bucket[:256] // eliminate bounds check for bucket[i] below - total := int64(0) - for i, n := range freq { - bucket[i] = total - total += n - } -} - -func bucketMin_32(text []int32, freq, bucket []int32) { - freq = freq_32(text, freq, bucket) - total := int32(0) - for i, n := range freq { - bucket[i] = total - total += n - } -} - -func bucketMin_64(text []int64, freq, bucket []int64) { - freq = freq_64(text, freq, bucket) - total := int64(0) - for i, n := range freq { - bucket[i] = total - total += n - } -} - -func bucketMax_8_64(text []byte, freq, bucket []int64) { - freq = freq_8_64(text, freq, bucket) - freq = freq[:256] // establish len(freq) = 256, so 0 ≤ i < 256 below - bucket = bucket[:256] // eliminate bounds check for bucket[i] below - total := int64(0) - for i, n := range freq { - total += n - bucket[i] = total - } -} - -func bucketMax_32(text []int32, freq, bucket []int32) { - freq = freq_32(text, freq, bucket) - total := int32(0) - for i, n := range freq { - total += n - bucket[i] = total - } -} - -func bucketMax_64(text []int64, freq, bucket []int64) { - freq = freq_64(text, freq, bucket) - total := int64(0) - for i, n := range freq { - total += n - bucket[i] = total - } -} - -func placeLMS_8_64(text []byte, sa, freq, bucket []int64) int { - bucketMax_8_64(text, freq, bucket) - - numLMS := 0 - lastB := int64(-1) - bucket = bucket[:256] // eliminate bounds check for bucket[c1] below - - // The next stanza of code (until the blank line) loop backward - // over text, stopping to execute a code body at each position i - // such that text[i] is an L-character and text[i+1] is an S-character. - // That is, i+1 is the position of the start of an LMS-substring. - // These could be hoisted out into a function with a callback, - // but at a significant speed cost. Instead, we just write these - // seven lines a few times in this source file. The copies below - // refer back to the pattern established by this original as the - // "LMS-substring iterator". - // - // In every scan through the text, c0, c1 are successive characters of text. - // In this backward scan, c0 == text[i] and c1 == text[i+1]. - // By scanning backward, we can keep track of whether the current - // position is type-S or type-L according to the usual definition: - // - // - position len(text) is type S with text[len(text)] == -1 (the sentinel) - // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. - // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. - // - // The backward scan lets us maintain the current type, - // update it when we see c0 != c1, and otherwise leave it alone. - // We want to identify all S positions with a preceding L. - // Position len(text) is one such position by definition, but we have - // nowhere to write it down, so we eliminate it by untruthfully - // setting isTypeS = false at the start of the loop. - c0, c1, isTypeS := byte(0), byte(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Bucket the index i+1 for the start of an LMS-substring. - b := bucket[c1] - 1 - bucket[c1] = b - sa[b] = int64(i + 1) - lastB = b - numLMS++ - } - } - - // We recorded the LMS-substring starts but really want the ends. - // Luckily, with two differences, the start indexes and the end indexes are the same. - // The first difference is that the rightmost LMS-substring's end index is len(text), - // so the caller must pretend that sa[-1] == len(text), as noted above. - // The second difference is that the first leftmost LMS-substring start index - // does not end an earlier LMS-substring, so as an optimization we can omit - // that leftmost LMS-substring start index (the last one we wrote). - // - // Exception: if numLMS <= 1, the caller is not going to bother with - // the recursion at all and will treat the result as containing LMS-substring starts. - // In that case, we don't remove the final entry. - if numLMS > 1 { - sa[lastB] = 0 - } - return numLMS -} - -func placeLMS_32(text []int32, sa, freq, bucket []int32) int { - bucketMax_32(text, freq, bucket) - - numLMS := 0 - lastB := int32(-1) - - // The next stanza of code (until the blank line) loop backward - // over text, stopping to execute a code body at each position i - // such that text[i] is an L-character and text[i+1] is an S-character. - // That is, i+1 is the position of the start of an LMS-substring. - // These could be hoisted out into a function with a callback, - // but at a significant speed cost. Instead, we just write these - // seven lines a few times in this source file. The copies below - // refer back to the pattern established by this original as the - // "LMS-substring iterator". - // - // In every scan through the text, c0, c1 are successive characters of text. - // In this backward scan, c0 == text[i] and c1 == text[i+1]. - // By scanning backward, we can keep track of whether the current - // position is type-S or type-L according to the usual definition: - // - // - position len(text) is type S with text[len(text)] == -1 (the sentinel) - // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. - // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. - // - // The backward scan lets us maintain the current type, - // update it when we see c0 != c1, and otherwise leave it alone. - // We want to identify all S positions with a preceding L. - // Position len(text) is one such position by definition, but we have - // nowhere to write it down, so we eliminate it by untruthfully - // setting isTypeS = false at the start of the loop. - c0, c1, isTypeS := int32(0), int32(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Bucket the index i+1 for the start of an LMS-substring. - b := bucket[c1] - 1 - bucket[c1] = b - sa[b] = int32(i + 1) - lastB = b - numLMS++ - } - } - - // We recorded the LMS-substring starts but really want the ends. - // Luckily, with two differences, the start indexes and the end indexes are the same. - // The first difference is that the rightmost LMS-substring's end index is len(text), - // so the caller must pretend that sa[-1] == len(text), as noted above. - // The second difference is that the first leftmost LMS-substring start index - // does not end an earlier LMS-substring, so as an optimization we can omit - // that leftmost LMS-substring start index (the last one we wrote). - // - // Exception: if numLMS <= 1, the caller is not going to bother with - // the recursion at all and will treat the result as containing LMS-substring starts. - // In that case, we don't remove the final entry. - if numLMS > 1 { - sa[lastB] = 0 - } - return numLMS -} - -func placeLMS_64(text []int64, sa, freq, bucket []int64) int { - bucketMax_64(text, freq, bucket) - - numLMS := 0 - lastB := int64(-1) - - // The next stanza of code (until the blank line) loop backward - // over text, stopping to execute a code body at each position i - // such that text[i] is an L-character and text[i+1] is an S-character. - // That is, i+1 is the position of the start of an LMS-substring. - // These could be hoisted out into a function with a callback, - // but at a significant speed cost. Instead, we just write these - // seven lines a few times in this source file. The copies below - // refer back to the pattern established by this original as the - // "LMS-substring iterator". - // - // In every scan through the text, c0, c1 are successive characters of text. - // In this backward scan, c0 == text[i] and c1 == text[i+1]. - // By scanning backward, we can keep track of whether the current - // position is type-S or type-L according to the usual definition: - // - // - position len(text) is type S with text[len(text)] == -1 (the sentinel) - // - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S. - // - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L. - // - // The backward scan lets us maintain the current type, - // update it when we see c0 != c1, and otherwise leave it alone. - // We want to identify all S positions with a preceding L. - // Position len(text) is one such position by definition, but we have - // nowhere to write it down, so we eliminate it by untruthfully - // setting isTypeS = false at the start of the loop. - c0, c1, isTypeS := int64(0), int64(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Bucket the index i+1 for the start of an LMS-substring. - b := bucket[c1] - 1 - bucket[c1] = b - sa[b] = int64(i + 1) - lastB = b - numLMS++ - } - } - - // We recorded the LMS-substring starts but really want the ends. - // Luckily, with two differences, the start indexes and the end indexes are the same. - // The first difference is that the rightmost LMS-substring's end index is len(text), - // so the caller must pretend that sa[-1] == len(text), as noted above. - // The second difference is that the first leftmost LMS-substring start index - // does not end an earlier LMS-substring, so as an optimization we can omit - // that leftmost LMS-substring start index (the last one we wrote). - // - // Exception: if numLMS <= 1, the caller is not going to bother with - // the recursion at all and will treat the result as containing LMS-substring starts. - // In that case, we don't remove the final entry. - if numLMS > 1 { - sa[lastB] = 0 - } - return numLMS -} - -func induceSubL_8_64(text []byte, sa, freq, bucket []int64) { - // Initialize positions for left side of character buckets. - bucketMin_8_64(text, freq, bucket) - bucket = bucket[:256] // eliminate bounds check for bucket[cB] below - - // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly - // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. - // Because j-1 is type L, inserting it into sa now will sort it correctly. - // But we want to distinguish a j-1 with j-2 of type L from type S. - // We can process the former but want to leave the latter for the caller. - // We record the difference by negating j-1 if it is preceded by type S. - // Either way, the insertion (into the text[j-1] bucket) is guaranteed to - // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have - // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, - // and so on, in sorted but not necessarily adjacent order, until it finds - // one preceded by an index of type S, at which point it must stop. - // - // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, - // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing - // only the indexes of the leftmost L-type indexes for each LMS-substring. - // - // The suffix array sa therefore serves simultaneously as input, output, - // and a miraculously well-tailored work queue. - - // placeLMS_8_64 left out the implicit entry sa[-1] == len(text), - // corresponding to the identified type-L index len(text)-1. - // Process it before the left-to-right scan of sa proper. - // See body in loop for commentary. - k := len(text) - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - // Cache recently used bucket index: - // we're processing suffixes in sorted order - // and accessing buckets indexed by the - // byte before the sorted order, which still - // has very good locality. - // Invariant: b is cached, possibly dirty copy of bucket[cB]. - cB := c1 - b := bucket[cB] - sa[b] = int64(k) - b++ - - for i := 0; i < len(sa); i++ { - j := int(sa[i]) - if j == 0 { - // Skip empty entry. - continue - } - if j < 0 { - // Leave discovered type-S index for caller. - sa[i] = int64(-j) - continue - } - sa[i] = 0 - - // Index j was on work queue, meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is L-type, queue k for processing later in this loop. - // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. - k := j - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - sa[b] = int64(k) - b++ - } -} - -func induceSubL_32(text []int32, sa, freq, bucket []int32) { - // Initialize positions for left side of character buckets. - bucketMin_32(text, freq, bucket) - - // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly - // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. - // Because j-1 is type L, inserting it into sa now will sort it correctly. - // But we want to distinguish a j-1 with j-2 of type L from type S. - // We can process the former but want to leave the latter for the caller. - // We record the difference by negating j-1 if it is preceded by type S. - // Either way, the insertion (into the text[j-1] bucket) is guaranteed to - // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have - // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, - // and so on, in sorted but not necessarily adjacent order, until it finds - // one preceded by an index of type S, at which point it must stop. - // - // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, - // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing - // only the indexes of the leftmost L-type indexes for each LMS-substring. - // - // The suffix array sa therefore serves simultaneously as input, output, - // and a miraculously well-tailored work queue. - - // placeLMS_32 left out the implicit entry sa[-1] == len(text), - // corresponding to the identified type-L index len(text)-1. - // Process it before the left-to-right scan of sa proper. - // See body in loop for commentary. - k := len(text) - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - // Cache recently used bucket index: - // we're processing suffixes in sorted order - // and accessing buckets indexed by the - // int32 before the sorted order, which still - // has very good locality. - // Invariant: b is cached, possibly dirty copy of bucket[cB]. - cB := c1 - b := bucket[cB] - sa[b] = int32(k) - b++ - - for i := 0; i < len(sa); i++ { - j := int(sa[i]) - if j == 0 { - // Skip empty entry. - continue - } - if j < 0 { - // Leave discovered type-S index for caller. - sa[i] = int32(-j) - continue - } - sa[i] = 0 - - // Index j was on work queue, meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is L-type, queue k for processing later in this loop. - // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. - k := j - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - sa[b] = int32(k) - b++ - } -} - -func induceSubL_64(text []int64, sa, freq, bucket []int64) { - // Initialize positions for left side of character buckets. - bucketMin_64(text, freq, bucket) - - // As we scan the array left-to-right, each sa[i] = j > 0 is a correctly - // sorted suffix array entry (for text[j:]) for which we know that j-1 is type L. - // Because j-1 is type L, inserting it into sa now will sort it correctly. - // But we want to distinguish a j-1 with j-2 of type L from type S. - // We can process the former but want to leave the latter for the caller. - // We record the difference by negating j-1 if it is preceded by type S. - // Either way, the insertion (into the text[j-1] bucket) is guaranteed to - // happen at sa[i´] for some i´ > i, that is, in the portion of sa we have - // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, - // and so on, in sorted but not necessarily adjacent order, until it finds - // one preceded by an index of type S, at which point it must stop. - // - // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, - // and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing - // only the indexes of the leftmost L-type indexes for each LMS-substring. - // - // The suffix array sa therefore serves simultaneously as input, output, - // and a miraculously well-tailored work queue. - - // placeLMS_64 left out the implicit entry sa[-1] == len(text), - // corresponding to the identified type-L index len(text)-1. - // Process it before the left-to-right scan of sa proper. - // See body in loop for commentary. - k := len(text) - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - // Cache recently used bucket index: - // we're processing suffixes in sorted order - // and accessing buckets indexed by the - // int64 before the sorted order, which still - // has very good locality. - // Invariant: b is cached, possibly dirty copy of bucket[cB]. - cB := c1 - b := bucket[cB] - sa[b] = int64(k) - b++ - - for i := 0; i < len(sa); i++ { - j := int(sa[i]) - if j == 0 { - // Skip empty entry. - continue - } - if j < 0 { - // Leave discovered type-S index for caller. - sa[i] = int64(-j) - continue - } - sa[i] = 0 - - // Index j was on work queue, meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is L-type, queue k for processing later in this loop. - // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. - k := j - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - sa[b] = int64(k) - b++ - } -} - -func induceSubS_8_64(text []byte, sa, freq, bucket []int64) { - // Initialize positions for right side of character buckets. - bucketMax_8_64(text, freq, bucket) - bucket = bucket[:256] // eliminate bounds check for bucket[cB] below - - // Analogous to induceSubL_8_64 above, - // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly - // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. - // Because j-1 is type S, inserting it into sa now will sort it correctly. - // But we want to distinguish a j-1 with j-2 of type S from type L. - // We can process the former but want to leave the latter for the caller. - // We record the difference by negating j-1 if it is preceded by type L. - // Either way, the insertion (into the text[j-1] bucket) is guaranteed to - // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have - // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, - // and so on, in sorted but not necessarily adjacent order, until it finds - // one preceded by an index of type L, at which point it must stop. - // That index (preceded by one of type L) is an LMS-substring start. - // - // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, - // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, - // so that the loop finishes with the top of sa containing exactly - // the LMS-substring start indexes, sorted by LMS-substring. - - // Cache recently used bucket index: - cB := byte(0) - b := bucket[cB] - - top := len(sa) - for i := len(sa) - 1; i >= 0; i-- { - j := int(sa[i]) - if j == 0 { - // Skip empty entry. - continue - } - sa[i] = 0 - if j < 0 { - // Leave discovered LMS-substring start index for caller. - top-- - sa[top] = int64(-j) - continue - } - - // Index j was on work queue, meaning k := j-1 is S-type, - // so we can now place k correctly into sa. - // If k-1 is S-type, queue k for processing later in this loop. - // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. - k := j - 1 - c1 := text[k] - c0 := text[k-1] - if c0 > c1 { - k = -k - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - b-- - sa[b] = int64(k) - } -} - -func induceSubS_32(text []int32, sa, freq, bucket []int32) { - // Initialize positions for right side of character buckets. - bucketMax_32(text, freq, bucket) - - // Analogous to induceSubL_32 above, - // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly - // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. - // Because j-1 is type S, inserting it into sa now will sort it correctly. - // But we want to distinguish a j-1 with j-2 of type S from type L. - // We can process the former but want to leave the latter for the caller. - // We record the difference by negating j-1 if it is preceded by type L. - // Either way, the insertion (into the text[j-1] bucket) is guaranteed to - // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have - // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, - // and so on, in sorted but not necessarily adjacent order, until it finds - // one preceded by an index of type L, at which point it must stop. - // That index (preceded by one of type L) is an LMS-substring start. - // - // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, - // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, - // so that the loop finishes with the top of sa containing exactly - // the LMS-substring start indexes, sorted by LMS-substring. - - // Cache recently used bucket index: - cB := int32(0) - b := bucket[cB] - - top := len(sa) - for i := len(sa) - 1; i >= 0; i-- { - j := int(sa[i]) - if j == 0 { - // Skip empty entry. - continue - } - sa[i] = 0 - if j < 0 { - // Leave discovered LMS-substring start index for caller. - top-- - sa[top] = int32(-j) - continue - } - - // Index j was on work queue, meaning k := j-1 is S-type, - // so we can now place k correctly into sa. - // If k-1 is S-type, queue k for processing later in this loop. - // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. - k := j - 1 - c1 := text[k] - c0 := text[k-1] - if c0 > c1 { - k = -k - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - b-- - sa[b] = int32(k) - } -} - -func induceSubS_64(text []int64, sa, freq, bucket []int64) { - // Initialize positions for right side of character buckets. - bucketMax_64(text, freq, bucket) - - // Analogous to induceSubL_64 above, - // as we scan the array right-to-left, each sa[i] = j > 0 is a correctly - // sorted suffix array entry (for text[j:]) for which we know that j-1 is type S. - // Because j-1 is type S, inserting it into sa now will sort it correctly. - // But we want to distinguish a j-1 with j-2 of type S from type L. - // We can process the former but want to leave the latter for the caller. - // We record the difference by negating j-1 if it is preceded by type L. - // Either way, the insertion (into the text[j-1] bucket) is guaranteed to - // happen at sa[i´] for some i´ < i, that is, in the portion of sa we have - // yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3, - // and so on, in sorted but not necessarily adjacent order, until it finds - // one preceded by an index of type L, at which point it must stop. - // That index (preceded by one of type L) is an LMS-substring start. - // - // As we scan through the array, we clear the worked entries (sa[i] > 0) to zero, - // and we flip sa[i] < 0 to -sa[i] and compact into the top of sa, - // so that the loop finishes with the top of sa containing exactly - // the LMS-substring start indexes, sorted by LMS-substring. - - // Cache recently used bucket index: - cB := int64(0) - b := bucket[cB] - - top := len(sa) - for i := len(sa) - 1; i >= 0; i-- { - j := int(sa[i]) - if j == 0 { - // Skip empty entry. - continue - } - sa[i] = 0 - if j < 0 { - // Leave discovered LMS-substring start index for caller. - top-- - sa[top] = int64(-j) - continue - } - - // Index j was on work queue, meaning k := j-1 is S-type, - // so we can now place k correctly into sa. - // If k-1 is S-type, queue k for processing later in this loop. - // If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller. - k := j - 1 - c1 := text[k] - c0 := text[k-1] - if c0 > c1 { - k = -k - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - b-- - sa[b] = int64(k) - } -} - -func length_8_64(text []byte, sa []int64, numLMS int) { - end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) - - // The encoding of N text bytes into a “length” word - // adds 1 to each byte, packs them into the bottom - // N*8 bits of a word, and then bitwise inverts the result. - // That is, the text sequence A B C (hex 41 42 43) - // encodes as ^uint64(0x42_43_44). - // LMS-substrings can never start or end with 0xFF. - // Adding 1 ensures the encoded byte sequence never - // starts or ends with 0x00, so that present bytes can be - // distinguished from zero-padding in the top bits, - // so the length need not be separately encoded. - // Inverting the bytes increases the chance that a - // 4-byte encoding will still be ≥ len(text). - // In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F) - // then the high bit of the inversion will be set, - // making it clearly not a valid length (it would be a negative one). - // - // cx holds the pre-inverted encoding (the packed incremented bytes). - cx := uint64(0) // byte-only - - // This stanza (until the blank line) is the "LMS-substring iterator", - // described in placeLMS_8_64 above, with one line added to maintain cx. - c0, c1, isTypeS := byte(0), byte(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - cx = cx<<8 | uint64(c1+1) // byte-only - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Index j = i+1 is the start of an LMS-substring. - // Compute length or encoded text to store in sa[j/2]. - j := i + 1 - var code int64 - if end == 0 { - code = 0 - } else { - code = int64(end - j) - if code <= 64/8 && ^cx >= uint64(len(text)) { // byte-only - code = int64(^cx) // byte-only - } // byte-only - } - sa[j>>1] = code - end = j + 1 - cx = uint64(c1 + 1) // byte-only - } - } -} - -func length_32(text []int32, sa []int32, numLMS int) { - end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) - - // The encoding of N text int32s into a “length” word - // adds 1 to each int32, packs them into the bottom - // N*8 bits of a word, and then bitwise inverts the result. - // That is, the text sequence A B C (hex 41 42 43) - // encodes as ^uint32(0x42_43_44). - // LMS-substrings can never start or end with 0xFF. - // Adding 1 ensures the encoded int32 sequence never - // starts or ends with 0x00, so that present int32s can be - // distinguished from zero-padding in the top bits, - // so the length need not be separately encoded. - // Inverting the int32s increases the chance that a - // 4-int32 encoding will still be ≥ len(text). - // In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F) - // then the high bit of the inversion will be set, - // making it clearly not a valid length (it would be a negative one). - // - // cx holds the pre-inverted encoding (the packed incremented int32s). - - // This stanza (until the blank line) is the "LMS-substring iterator", - // described in placeLMS_32 above, with one line added to maintain cx. - c0, c1, isTypeS := int32(0), int32(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Index j = i+1 is the start of an LMS-substring. - // Compute length or encoded text to store in sa[j/2]. - j := i + 1 - var code int32 - if end == 0 { - code = 0 - } else { - code = int32(end - j) - } - sa[j>>1] = code - end = j + 1 - } - } -} - -func length_64(text []int64, sa []int64, numLMS int) { - end := 0 // index of current LMS-substring end (0 indicates final LMS-substring) - - // The encoding of N text int64s into a “length” word - // adds 1 to each int64, packs them into the bottom - // N*8 bits of a word, and then bitwise inverts the result. - // That is, the text sequence A B C (hex 41 42 43) - // encodes as ^uint64(0x42_43_44). - // LMS-substrings can never start or end with 0xFF. - // Adding 1 ensures the encoded int64 sequence never - // starts or ends with 0x00, so that present int64s can be - // distinguished from zero-padding in the top bits, - // so the length need not be separately encoded. - // Inverting the int64s increases the chance that a - // 4-int64 encoding will still be ≥ len(text). - // In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F) - // then the high bit of the inversion will be set, - // making it clearly not a valid length (it would be a negative one). - // - // cx holds the pre-inverted encoding (the packed incremented int64s). - - // This stanza (until the blank line) is the "LMS-substring iterator", - // described in placeLMS_64 above, with one line added to maintain cx. - c0, c1, isTypeS := int64(0), int64(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Index j = i+1 is the start of an LMS-substring. - // Compute length or encoded text to store in sa[j/2]. - j := i + 1 - var code int64 - if end == 0 { - code = 0 - } else { - code = int64(end - j) - } - sa[j>>1] = code - end = j + 1 - } - } -} - -func assignID_8_64(text []byte, sa []int64, numLMS int) int { - id := 0 - lastLen := int64(-1) // impossible - lastPos := int64(0) - for _, j := range sa[len(sa)-numLMS:] { - // Is the LMS-substring at index j new, or is it the same as the last one we saw? - n := sa[j/2] - if n != lastLen { - goto New - } - if uint64(n) >= uint64(len(text)) { - // “Length” is really encoded full text, and they match. - goto Same - } - { - // Compare actual texts. - n := int(n) - this := text[j:][:n] - last := text[lastPos:][:n] - for i := 0; i < n; i++ { - if this[i] != last[i] { - goto New - } - } - goto Same - } - New: - id++ - lastPos = j - lastLen = n - Same: - sa[j/2] = int64(id) - } - return id -} - -func assignID_32(text []int32, sa []int32, numLMS int) int { - id := 0 - lastLen := int32(-1) // impossible - lastPos := int32(0) - for _, j := range sa[len(sa)-numLMS:] { - // Is the LMS-substring at index j new, or is it the same as the last one we saw? - n := sa[j/2] - if n != lastLen { - goto New - } - if uint32(n) >= uint32(len(text)) { - // “Length” is really encoded full text, and they match. - goto Same - } - { - // Compare actual texts. - n := int(n) - this := text[j:][:n] - last := text[lastPos:][:n] - for i := 0; i < n; i++ { - if this[i] != last[i] { - goto New - } - } - goto Same - } - New: - id++ - lastPos = j - lastLen = n - Same: - sa[j/2] = int32(id) - } - return id -} - -func assignID_64(text []int64, sa []int64, numLMS int) int { - id := 0 - lastLen := int64(-1) // impossible - lastPos := int64(0) - for _, j := range sa[len(sa)-numLMS:] { - // Is the LMS-substring at index j new, or is it the same as the last one we saw? - n := sa[j/2] - if n != lastLen { - goto New - } - if uint64(n) >= uint64(len(text)) { - // “Length” is really encoded full text, and they match. - goto Same - } - { - // Compare actual texts. - n := int(n) - this := text[j:][:n] - last := text[lastPos:][:n] - for i := 0; i < n; i++ { - if this[i] != last[i] { - goto New - } - } - goto Same - } - New: - id++ - lastPos = j - lastLen = n - Same: - sa[j/2] = int64(id) - } - return id -} - -func map_64(sa []int64, numLMS int) { - w := len(sa) - for i := len(sa) / 2; i >= 0; i-- { - j := sa[i] - if j > 0 { - w-- - sa[w] = j - 1 - } - } -} - -func recurse_64(sa, oldTmp []int64, numLMS, maxID int) { - dst, saTmp, text := sa[:numLMS], sa[numLMS:len(sa)-numLMS], sa[len(sa)-numLMS:] - - // Set up temporary space for recursive call. - // We must pass sais_64 a tmp buffer with at least maxID entries. - // - // The subproblem is guaranteed to have length at most len(sa)/2, - // so that sa can hold both the subproblem and its suffix array. - // Nearly all the time, however, the subproblem has length < len(sa)/3, - // in which case there is a subproblem-sized middle of sa that - // we can reuse for temporary space (saTmp). - // When recurse_64 is called from sais_8_64, oldTmp is length 512 - // (from text_64), and saTmp will typically be much larger, so we'll use saTmp. - // When deeper recursions come back to recurse_64, now oldTmp is - // the saTmp from the top-most recursion, it is typically larger than - // the current saTmp (because the current sa gets smaller and smaller - // as the recursion gets deeper), and we keep reusing that top-most - // large saTmp instead of the offered smaller ones. - // - // Why is the subproblem length so often just under len(sa)/3? - // See Nong, Zhang, and Chen, section 3.6 for a plausible explanation. - // In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern - // in the input, perfect alternation of larger and smaller input bytes. - // Real text doesn't do that. If each L-type index is randomly followed - // by either an L-type or S-type index, then half the substrings will - // be of the form SLS, but the other half will be longer. Of that half, - // half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on. - // Not counting the final S in each (which overlaps the first S in the next), - // This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3. - // The space we need is further reduced by the fact that many of the - // short patterns like SLS will often be the same character sequences - // repeated throughout the text, reducing maxID relative to numLMS. - // - // For short inputs, the averages may not run in our favor, but then we - // can often fall back to using the length-512 tmp available in the - // top-most call. (Also a short allocation would not be a big deal.) - // - // For pathological inputs, we fall back to allocating a new tmp of length - // max(maxID, numLMS/2). This level of the recursion needs maxID, - // and all deeper levels of the recursion will need no more than numLMS/2, - // so this one allocation is guaranteed to suffice for the entire stack - // of recursive calls. - tmp := oldTmp - if len(tmp) < len(saTmp) { - tmp = saTmp - } - if len(tmp) < numLMS { - // TestSAIS/forcealloc reaches this code. - n := maxID - if n < numLMS/2 { - n = numLMS / 2 - } - tmp = make([]int64, n) - } - - // sais_64 requires that the caller arrange to clear dst, - // because in general the caller may know dst is - // freshly-allocated and already cleared. But this one is not. - for i := range dst { - dst[i] = 0 - } - sais_64(text, maxID, dst, tmp) -} - -func unmap_8_64(text []byte, sa []int64, numLMS int) { - unmap := sa[len(sa)-numLMS:] - j := len(unmap) - - // "LMS-substring iterator" (see placeLMS_8_64 above). - c0, c1, isTypeS := byte(0), byte(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Populate inverse map. - j-- - unmap[j] = int64(i + 1) - } - } - - // Apply inverse map to subproblem suffix array. - sa = sa[:numLMS] - for i := 0; i < len(sa); i++ { - sa[i] = unmap[sa[i]] - } -} - -func unmap_32(text []int32, sa []int32, numLMS int) { - unmap := sa[len(sa)-numLMS:] - j := len(unmap) - - // "LMS-substring iterator" (see placeLMS_32 above). - c0, c1, isTypeS := int32(0), int32(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Populate inverse map. - j-- - unmap[j] = int32(i + 1) - } - } - - // Apply inverse map to subproblem suffix array. - sa = sa[:numLMS] - for i := 0; i < len(sa); i++ { - sa[i] = unmap[sa[i]] - } -} - -func unmap_64(text []int64, sa []int64, numLMS int) { - unmap := sa[len(sa)-numLMS:] - j := len(unmap) - - // "LMS-substring iterator" (see placeLMS_64 above). - c0, c1, isTypeS := int64(0), int64(0), false - for i := len(text) - 1; i >= 0; i-- { - c0, c1 = text[i], c0 - if c0 < c1 { - isTypeS = true - } else if c0 > c1 && isTypeS { - isTypeS = false - - // Populate inverse map. - j-- - unmap[j] = int64(i + 1) - } - } - - // Apply inverse map to subproblem suffix array. - sa = sa[:numLMS] - for i := 0; i < len(sa); i++ { - sa[i] = unmap[sa[i]] - } -} - -func expand_8_64(text []byte, freq, bucket, sa []int64, numLMS int) { - bucketMax_8_64(text, freq, bucket) - bucket = bucket[:256] // eliminate bound check for bucket[c] below - - // Loop backward through sa, always tracking - // the next index to populate from sa[:numLMS]. - // When we get to one, populate it. - // Zero the rest of the slots; they have dead values in them. - x := numLMS - 1 - saX := sa[x] - c := text[saX] - b := bucket[c] - 1 - bucket[c] = b - - for i := len(sa) - 1; i >= 0; i-- { - if i != int(b) { - sa[i] = 0 - continue - } - sa[i] = saX - - // Load next entry to put down (if any). - if x > 0 { - x-- - saX = sa[x] // TODO bounds check - c = text[saX] - b = bucket[c] - 1 - bucket[c] = b - } - } -} - -func expand_32(text []int32, freq, bucket, sa []int32, numLMS int) { - bucketMax_32(text, freq, bucket) - - // Loop backward through sa, always tracking - // the next index to populate from sa[:numLMS]. - // When we get to one, populate it. - // Zero the rest of the slots; they have dead values in them. - x := numLMS - 1 - saX := sa[x] - c := text[saX] - b := bucket[c] - 1 - bucket[c] = b - - for i := len(sa) - 1; i >= 0; i-- { - if i != int(b) { - sa[i] = 0 - continue - } - sa[i] = saX - - // Load next entry to put down (if any). - if x > 0 { - x-- - saX = sa[x] // TODO bounds check - c = text[saX] - b = bucket[c] - 1 - bucket[c] = b - } - } -} - -func expand_64(text []int64, freq, bucket, sa []int64, numLMS int) { - bucketMax_64(text, freq, bucket) - - // Loop backward through sa, always tracking - // the next index to populate from sa[:numLMS]. - // When we get to one, populate it. - // Zero the rest of the slots; they have dead values in them. - x := numLMS - 1 - saX := sa[x] - c := text[saX] - b := bucket[c] - 1 - bucket[c] = b - - for i := len(sa) - 1; i >= 0; i-- { - if i != int(b) { - sa[i] = 0 - continue - } - sa[i] = saX - - // Load next entry to put down (if any). - if x > 0 { - x-- - saX = sa[x] // TODO bounds check - c = text[saX] - b = bucket[c] - 1 - bucket[c] = b - } - } -} - -func induceL_8_64(text []byte, sa, freq, bucket []int64) { - // Initialize positions for left side of character buckets. - bucketMin_8_64(text, freq, bucket) - bucket = bucket[:256] // eliminate bounds check for bucket[cB] below - - // This scan is similar to the one in induceSubL_8_64 above. - // That one arranges to clear all but the leftmost L-type indexes. - // This scan leaves all the L-type indexes and the original S-type - // indexes, but it negates the positive leftmost L-type indexes - // (the ones that induceS_8_64 needs to process). - - // expand_8_64 left out the implicit entry sa[-1] == len(text), - // corresponding to the identified type-L index len(text)-1. - // Process it before the left-to-right scan of sa proper. - // See body in loop for commentary. - k := len(text) - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - // Cache recently used bucket index. - cB := c1 - b := bucket[cB] - sa[b] = int64(k) - b++ - - for i := 0; i < len(sa); i++ { - j := int(sa[i]) - if j <= 0 { - // Skip empty or negated entry (including negated zero). - continue - } - - // Index j was on work queue, meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is L-type, queue k for processing later in this loop. - // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. - // If k is zero, k-1 doesn't exist, so we only need to leave it - // for the caller. The caller can't tell the difference between - // an empty slot and a non-empty zero, but there's no need - // to distinguish them anyway: the final suffix array will end up - // with one zero somewhere, and that will be a real zero. - k := j - 1 - c1 := text[k] - if k > 0 { - if c0 := text[k-1]; c0 < c1 { - k = -k - } - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - sa[b] = int64(k) - b++ - } -} - -func induceL_32(text []int32, sa, freq, bucket []int32) { - // Initialize positions for left side of character buckets. - bucketMin_32(text, freq, bucket) - - // This scan is similar to the one in induceSubL_32 above. - // That one arranges to clear all but the leftmost L-type indexes. - // This scan leaves all the L-type indexes and the original S-type - // indexes, but it negates the positive leftmost L-type indexes - // (the ones that induceS_32 needs to process). - - // expand_32 left out the implicit entry sa[-1] == len(text), - // corresponding to the identified type-L index len(text)-1. - // Process it before the left-to-right scan of sa proper. - // See body in loop for commentary. - k := len(text) - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - // Cache recently used bucket index. - cB := c1 - b := bucket[cB] - sa[b] = int32(k) - b++ - - for i := 0; i < len(sa); i++ { - j := int(sa[i]) - if j <= 0 { - // Skip empty or negated entry (including negated zero). - continue - } - - // Index j was on work queue, meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is L-type, queue k for processing later in this loop. - // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. - // If k is zero, k-1 doesn't exist, so we only need to leave it - // for the caller. The caller can't tell the difference between - // an empty slot and a non-empty zero, but there's no need - // to distinguish them anyway: the final suffix array will end up - // with one zero somewhere, and that will be a real zero. - k := j - 1 - c1 := text[k] - if k > 0 { - if c0 := text[k-1]; c0 < c1 { - k = -k - } - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - sa[b] = int32(k) - b++ - } -} - -func induceL_64(text []int64, sa, freq, bucket []int64) { - // Initialize positions for left side of character buckets. - bucketMin_64(text, freq, bucket) - - // This scan is similar to the one in induceSubL_64 above. - // That one arranges to clear all but the leftmost L-type indexes. - // This scan leaves all the L-type indexes and the original S-type - // indexes, but it negates the positive leftmost L-type indexes - // (the ones that induceS_64 needs to process). - - // expand_64 left out the implicit entry sa[-1] == len(text), - // corresponding to the identified type-L index len(text)-1. - // Process it before the left-to-right scan of sa proper. - // See body in loop for commentary. - k := len(text) - 1 - c0, c1 := text[k-1], text[k] - if c0 < c1 { - k = -k - } - - // Cache recently used bucket index. - cB := c1 - b := bucket[cB] - sa[b] = int64(k) - b++ - - for i := 0; i < len(sa); i++ { - j := int(sa[i]) - if j <= 0 { - // Skip empty or negated entry (including negated zero). - continue - } - - // Index j was on work queue, meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is L-type, queue k for processing later in this loop. - // If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller. - // If k is zero, k-1 doesn't exist, so we only need to leave it - // for the caller. The caller can't tell the difference between - // an empty slot and a non-empty zero, but there's no need - // to distinguish them anyway: the final suffix array will end up - // with one zero somewhere, and that will be a real zero. - k := j - 1 - c1 := text[k] - if k > 0 { - if c0 := text[k-1]; c0 < c1 { - k = -k - } - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - sa[b] = int64(k) - b++ - } -} - -func induceS_8_64(text []byte, sa, freq, bucket []int64) { - // Initialize positions for right side of character buckets. - bucketMax_8_64(text, freq, bucket) - bucket = bucket[:256] // eliminate bounds check for bucket[cB] below - - cB := byte(0) - b := bucket[cB] - - for i := len(sa) - 1; i >= 0; i-- { - j := int(sa[i]) - if j >= 0 { - // Skip non-flagged entry. - // (This loop can't see an empty entry; 0 means the real zero index.) - continue - } - - // Negative j is a work queue entry; rewrite to positive j for final suffix array. - j = -j - sa[i] = int64(j) - - // Index j was on work queue (encoded as -j but now decoded), - // meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is S-type, queue -k for processing later in this loop. - // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. - // If k is zero, k-1 doesn't exist, so we only need to leave it - // for the caller. - k := j - 1 - c1 := text[k] - if k > 0 { - if c0 := text[k-1]; c0 <= c1 { - k = -k - } - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - b-- - sa[b] = int64(k) - } -} - -func induceS_32(text []int32, sa, freq, bucket []int32) { - // Initialize positions for right side of character buckets. - bucketMax_32(text, freq, bucket) - - cB := int32(0) - b := bucket[cB] - - for i := len(sa) - 1; i >= 0; i-- { - j := int(sa[i]) - if j >= 0 { - // Skip non-flagged entry. - // (This loop can't see an empty entry; 0 means the real zero index.) - continue - } - - // Negative j is a work queue entry; rewrite to positive j for final suffix array. - j = -j - sa[i] = int32(j) - - // Index j was on work queue (encoded as -j but now decoded), - // meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is S-type, queue -k for processing later in this loop. - // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. - // If k is zero, k-1 doesn't exist, so we only need to leave it - // for the caller. - k := j - 1 - c1 := text[k] - if k > 0 { - if c0 := text[k-1]; c0 <= c1 { - k = -k - } - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - b-- - sa[b] = int32(k) - } -} - -func induceS_64(text []int64, sa, freq, bucket []int64) { - // Initialize positions for right side of character buckets. - bucketMax_64(text, freq, bucket) - - cB := int64(0) - b := bucket[cB] - - for i := len(sa) - 1; i >= 0; i-- { - j := int(sa[i]) - if j >= 0 { - // Skip non-flagged entry. - // (This loop can't see an empty entry; 0 means the real zero index.) - continue - } - - // Negative j is a work queue entry; rewrite to positive j for final suffix array. - j = -j - sa[i] = int64(j) - - // Index j was on work queue (encoded as -j but now decoded), - // meaning k := j-1 is L-type, - // so we can now place k correctly into sa. - // If k-1 is S-type, queue -k for processing later in this loop. - // If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller. - // If k is zero, k-1 doesn't exist, so we only need to leave it - // for the caller. - k := j - 1 - c1 := text[k] - if k > 0 { - if c0 := text[k-1]; c0 <= c1 { - k = -k - } - } - - if cB != c1 { - bucket[cB] = b - cB = c1 - b = bucket[cB] - } - b-- - sa[b] = int64(k) - } -} diff --git a/std/compress/lzss/internal/suffixarray/suffixarray.go b/std/compress/lzss/internal/suffixarray/suffixarray.go deleted file mode 100644 index a62402a7f9..0000000000 --- a/std/compress/lzss/internal/suffixarray/suffixarray.go +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2010 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package suffixarray implements substring search in logarithmic time using -// an in-memory suffix array. -// -// It is derived from index/suffixarray in go std; the only difference is that -// it forces use of int32 for the index and exposes a single method LookupLongest -// that returns the longest match in a given range. -package suffixarray - -import ( - "bytes" - "math" - "sort" -) - -// Can change for testing -var maxData32 int = realMaxData32 - -const realMaxData32 = math.MaxInt32 - -// Index implements a suffix array for fast substring search. -type Index struct { - data []byte - sa []int32 // suffix array for data; sa.len() == len(data) -} - -// New creates a new [Index] for data. -// [Index] creation time is O(N) for N = len(data). -func New(data []byte, sa []int32) *Index { - ix := &Index{data: data} - if len(data) > maxData32 { - panic("suffixarray: data too large") - } - // reset the suffix array - for i := range sa { - sa[i] = 0 - } - ix.sa = sa[:len(data)] - text_32(data, ix.sa) - - return ix -} - -// Bytes returns the data over which the index was created. -// It must not be modified. -func (x *Index) Bytes() []byte { - return x.data -} - -func (x *Index) at(i int) []byte { - return x.data[x.sa[i]:] -} - -// LookupLongest returns an index and length of the longest -// substring of s[:minEnd] / s[:maxEnd] that occurs in the indexed data. -func (x *Index) LookupLongest(s []byte, minEnd, maxEnd, rangeStart, rangeEnd int) (index, length int) { - index, length = -1, -1 - - // first search at min end to reduce the search space for next searches - sStart, sEnd := x.lookupLongestInitial(s[:minEnd]) - - if sStart == -1 { - // no match - return - } - - if sStart == sEnd { - // only one match - offset := int(x.sa[sStart]) - if offset >= rangeStart && offset < rangeEnd { - // valid index, we can use it. - index = offset - length = minEnd - } - return - } - - // filter the results to be in the range [rangeStart, rangeEnd) - for i := sStart; i < sEnd; i++ { - offset := int(x.sa[i]) - if offset >= rangeStart && offset < rangeEnd { - // valid index, we can use it. - index = offset - length = minEnd - break - } - } - - if length == -1 { - // no match - return - } - - // binary search between maxEnd - minEnd - low := minEnd - high := maxEnd - - for low <= high { - mid := low + (high-low)/2 - - if newStart, offset := x.lookupLongest(s[:mid], rangeStart, rangeEnd, sStart, sEnd); offset != -1 { - // we found a match of length mid - // try the next part of the binary search - sStart = newStart - index = offset - length = mid - low = mid + 1 - continue - } - // we didn't find a match in this half; try the lower one. - high = mid - 1 - } - return -} - -// lookupLongest is similar to lookupAll but filters out indices that are not -// in the range [rangeStart, rangeEnd). -func (x *Index) lookupLongest(s []byte, rangeStart, rangeEnd, sStart, sEnd int) (rStart, offset int) { - rStart = sStart - // use sort.Search - // find the first index where s would be the prefix - i := sort.Search(sEnd-sStart, func(i int) bool { return bytes.Compare(x.at(i+sStart), s) >= 0 }) + sStart - - if i == sEnd || !bytes.HasPrefix(x.at(i), s) { - return rStart, -1 - } - - rStart = i - - for i < sEnd && bytes.HasPrefix(x.at(i), s) { - offset := int(x.sa[i]) - if offset >= rangeStart && offset < rangeEnd { - // valid index, we can use it. - return rStart, offset - } - i++ - } - return rStart, -1 -} - -func (x *Index) lookupLongestInitial(s []byte) (rStart, rEnd int) { - i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 }) - if i == len(x.sa) || !bytes.HasPrefix(x.at(i), s) { - return -1, -1 - } - - j := i + sort.Search(len(x.sa)-i, func(k int) bool { return !bytes.HasPrefix(x.at(k+i), s) }) - return i, j -} diff --git a/std/compress/lzss/snark.go b/std/compress/lzss/snark.go index e12e6b2954..198cb739df 100644 --- a/std/compress/lzss/snark.go +++ b/std/compress/lzss/snark.go @@ -1,34 +1,38 @@ package lzss import ( + "github.com/consensys/compress/lzss" "github.com/consensys/gnark/frontend" "github.com/consensys/gnark/std/compress" "github.com/consensys/gnark/std/lookup/logderivlookup" ) -// bite size of c needs to be the greatest common denominator of all backref types and 8 -// d consists of bytes -func Decompress(api frontend.API, c []frontend.Variable, cLength frontend.Variable, d []frontend.Variable, dict []byte, level Level) (dLength frontend.Variable, err error) { +// Decompress decompresses c into d using dict as the dictionary +// It returns the length of d as a frontend.Variable +func Decompress(api frontend.API, c []frontend.Variable, cLength frontend.Variable, d []frontend.Variable, dict []byte, level lzss.Level) (dLength frontend.Variable, err error) { wordNbBits := int(level) + // ensure input is in range checkInputRange(api, c, wordNbBits) - dict = augmentDict(dict) - shortBackRefType, longBackRefType, dictBackRefType := initBackRefTypes(len(dict), level) + // init the dictionary and backref types + dict = lzss.AugmentDict(dict) + shortBackRefType, longBackRefType, dictBackRefType := lzss.InitBackRefTypes(len(dict), level) - shortBrNbWords := int(shortBackRefType.nbBitsBackRef) / wordNbBits - longBrNbWords := int(longBackRefType.nbBitsBackRef) / wordNbBits - dictBrNbWords := int(dictBackRefType.nbBitsBackRef) / wordNbBits + shortBrNbWords := int(shortBackRefType.NbBitsBackRef) / wordNbBits + longBrNbWords := int(longBackRefType.NbBitsBackRef) / wordNbBits + dictBrNbWords := int(dictBackRefType.NbBitsBackRef) / wordNbBits byteNbWords := 8 / wordNbBits api.AssertIsEqual(compress.ReadNum(api, c, byteNbWords, wordNbBits), 0) // compressor version TODO @tabaie @gbotrel Handle this outside the circuit instead? fileCompressionMode := compress.ReadNum(api, c[byteNbWords:], byteNbWords, wordNbBits) - c = c[2*byteNbWords:] - cLength = api.Sub(cLength, 2*byteNbWords) api.AssertIsEqual(api.Mul(fileCompressionMode, fileCompressionMode), api.Mul(fileCompressionMode, wordNbBits)) // if fcm!=0, then fcm=wordNbBits decompressionNotBypassed := api.Sub(1, api.IsZero(fileCompressionMode)) + c = c[2*byteNbWords:] + cLength = api.Sub(cLength, 2*byteNbWords) + outTable := logderivlookup.New(api) for i := range dict { outTable.Insert(dict[i]) @@ -38,7 +42,7 @@ func Decompress(api frontend.API, c []frontend.Variable, cLength frontend.Variab bytes := combineIntoBytes(api, c, wordNbBits) bytesTable := sliceToTable(api, bytes) bytesTable.Insert(0) // just because we use this table for looking up backref lengths as well - addrTable := initAddrTable(api, bytes, c, wordNbBits, []backrefType{shortBackRefType, longBackRefType, dictBackRefType}) + addrTable := initAddrTable(api, bytes, c, wordNbBits, []lzss.BackrefType{shortBackRefType, longBackRefType, dictBackRefType}) // state variables inI := frontend.Variable(0) @@ -51,10 +55,10 @@ func Decompress(api frontend.API, c []frontend.Variable, cLength frontend.Variab curr := bytesTable.Lookup(inI)[0] - currMinusLong := api.Sub(api.Mul(curr, decompressionNotBypassed), symbolLong) // if bypassing decompression, currIndicatesXX = 0 + currMinusLong := api.Sub(api.Mul(curr, decompressionNotBypassed), lzss.SymbolLong) // if bypassing decompression, currIndicatesXX = 0 currIndicatesLongBr := api.IsZero(currMinusLong) - currIndicatesShortBr := api.IsZero(api.Sub(currMinusLong, symbolShort-symbolLong)) - currIndicatesDr := api.IsZero(api.Sub(currMinusLong, symbolDict-symbolLong)) + currIndicatesShortBr := api.IsZero(api.Sub(currMinusLong, lzss.SymbolShort-lzss.SymbolLong)) + currIndicatesDr := api.IsZero(api.Sub(currMinusLong, lzss.SymbolDict-lzss.SymbolLong)) currIndicatesBr := api.Add(currIndicatesLongBr, currIndicatesShortBr) currIndicatesCp := api.Add(currIndicatesBr, currIndicatesDr) @@ -142,21 +146,21 @@ func combineIntoBytes(api frontend.API, c []frontend.Variable, wordNbBits int) [ return res } -func initAddrTable(api frontend.API, bytes, c []frontend.Variable, wordNbBits int, backrefs []backrefType) *logderivlookup.Table { +func initAddrTable(api frontend.API, bytes, c []frontend.Variable, wordNbBits int, backrefs []lzss.BackrefType) *logderivlookup.Table { for i := range backrefs { - if backrefs[i].nbBitsLength != backrefs[0].nbBitsLength { + if backrefs[i].NbBitsLength != backrefs[0].NbBitsLength { panic("all backref types must have the same length size") } } readers := make([]*compress.NumReader, len(backrefs)) - delimAndLenNbWords := int(8+backrefs[0].nbBitsLength) / wordNbBits + delimAndLenNbWords := int(8+backrefs[0].NbBitsLength) / wordNbBits for i := range backrefs { var readerC []frontend.Variable if len(c) >= delimAndLenNbWords { readerC = c[delimAndLenNbWords:] } - readers[i] = compress.NewNumReader(api, readerC, int(backrefs[i].nbBitsAddress), wordNbBits) + readers[i] = compress.NewNumReader(api, readerC, int(backrefs[i].NbBitsAddress), wordNbBits) } res := logderivlookup.New(api) @@ -164,7 +168,7 @@ func initAddrTable(api frontend.API, bytes, c []frontend.Variable, wordNbBits in for i := range c { entry := frontend.Variable(0) for j := range backrefs { - isSymb := api.IsZero(api.Sub(bytes[i], backrefs[j].delimiter)) + isSymb := api.IsZero(api.Sub(bytes[i], backrefs[j].Delimiter)) entry = api.MulAcc(entry, isSymb, readers[j].Next()) } res.Insert(entry) diff --git a/std/compress/lzss/snark_test.go b/std/compress/lzss/snark_test.go index 82f4520313..9de006de57 100644 --- a/std/compress/lzss/snark_test.go +++ b/std/compress/lzss/snark_test.go @@ -1,14 +1,13 @@ package lzss import ( - "bytes" + goCompress "github.com/consensys/compress" + "github.com/consensys/compress/lzss" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark/backend" "github.com/consensys/gnark/frontend" - "github.com/consensys/gnark/std/compress" test_vector_utils "github.com/consensys/gnark/std/utils/test_vectors_utils" "github.com/consensys/gnark/test" - "github.com/icza/bitio" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "os" @@ -20,7 +19,7 @@ func Test1ZeroSnark(t *testing.T) { } func TestGoodCompressionSnark(t *testing.T) { - testCompressionRoundTripSnark(t, []byte{1, 2}, nil, withLevel(GoodCompression)) + testCompressionRoundTripSnark(t, []byte{1, 2}, nil, withLevel(lzss.GoodCompression)) } func Test0To10ExplicitSnark(t *testing.T) { @@ -29,19 +28,19 @@ func Test0To10ExplicitSnark(t *testing.T) { func TestNoCompressionSnark(t *testing.T) { - d, err := os.ReadFile("./testdata/test_cases/3c2943/data.bin") + d, err := os.ReadFile("./testdata/3c2943/data.bin") assert.NoError(t, err) dict := getDictionary() - compressor, err := NewCompressor(dict, NoCompression) + compressor, err := lzss.NewCompressor(dict, lzss.NoCompression) require.NoError(t, err) c, err := compressor.Compress(d) require.NoError(t, err) - decompressorLevel := BestCompression + decompressorLevel := lzss.BestCompression - cStream, err := compress.NewStream(c, uint8(decompressorLevel)) + cStream, err := goCompress.NewStream(c, uint8(decompressorLevel)) require.NoError(t, err) circuit := &DecompressionTestCircuit{ @@ -59,28 +58,12 @@ func TestNoCompressionSnark(t *testing.T) { test.NewAssert(t).CheckCircuit(circuit, test.WithValidAssignment(assignment), test.WithBackends(backend.PLONK), test.WithCurves(ecc.BN254)) } -func Test4ZerosBackrefSnark(t *testing.T) { - - shortBackRefType, longBackRefType, _ := initBackRefTypes(0, BestCompression) - - testDecompressionSnark(t, nil, BestCompression, 0, backref{ - address: 0, - length: 2, - bType: shortBackRefType, - }, backref{ - address: 1, - length: 1, - bType: longBackRefType, - }, - ) -} - func Test255_254_253Snark(t *testing.T) { testCompressionRoundTripSnark(t, []byte{255, 254, 253}, nil) } func Test3c2943Snark(t *testing.T) { - d, err := os.ReadFile("./testdata/test_cases/3c2943/data.bin") + d, err := os.ReadFile("./testdata/3c2943/data.bin") assert.NoError(t, err) dict := getDictionary() @@ -91,10 +74,10 @@ func Test3c2943Snark(t *testing.T) { // Fuzz test the decompression func FuzzSnark(f *testing.F) { // TODO This is always skipped f.Fuzz(func(t *testing.T, input, dict []byte) { - if len(input) > maxInputSize { + if len(input) > lzss.MaxInputSize { t.Skip("input too large") } - if len(dict) > maxDictSize { + if len(dict) > lzss.MaxDictSize { t.Skip("dict too large") } if len(input) == 0 { @@ -104,77 +87,28 @@ func FuzzSnark(f *testing.F) { // TODO This is always skipped }) } -type testCompressionRoundTripOption func(*Level) +type testCompressionRoundTripOption func(*lzss.Level) -func withLevel(level Level) testCompressionRoundTripOption { - return func(l *Level) { +func withLevel(level lzss.Level) testCompressionRoundTripOption { + return func(l *lzss.Level) { *l = level } } func testCompressionRoundTripSnark(t *testing.T, d, dict []byte, options ...testCompressionRoundTripOption) { - level := BestCompression + level := lzss.BestCompression for _, option := range options { option(&level) } - compressor, err := NewCompressor(dict, level) + compressor, err := lzss.NewCompressor(dict, level) require.NoError(t, err) c, err := compressor.Compress(d) require.NoError(t, err) - cStream, err := ReadIntoStream(c, dict, level) - require.NoError(t, err) - - circuit := &DecompressionTestCircuit{ - C: make([]frontend.Variable, cStream.Len()), - D: d, - Dict: dict, - CheckCorrectness: true, - Level: level, - } - assignment := &DecompressionTestCircuit{ - C: test_vector_utils.ToVariableSlice(cStream.D), - CLength: cStream.Len(), - } - - test.NewAssert(t).CheckCircuit(circuit, test.WithValidAssignment(assignment), test.WithBackends(backend.PLONK), test.WithCurves(ecc.BN254)) -} - -func testDecompressionSnark(t *testing.T, dict []byte, level Level, compressedStream ...interface{}) { - var bb bytes.Buffer - w := bitio.NewWriter(&bb) - bb.Write([]byte{0, byte(level)}) - i := 0 - for _, c := range compressedStream { - switch v := c.(type) { - case byte: - assert.NoError(t, w.WriteByte(v)) - i++ - case int: - assert.True(t, v >= 0 && v <= 255) - assert.NoError(t, w.WriteByte(byte(v))) - i++ - case []byte: - for _, b := range v { - assert.NoError(t, w.WriteByte(b)) - } - i += len(v) - case backref: - v.writeTo(w, i) - i += v.length - default: - panic("not implemented") - } - } - assert.NoError(t, w.Close()) - c := bb.Bytes() - d, err := DecompressGo(c, dict) - require.NoError(t, err) - - cStream, err := ReadIntoStream(c, dict, level) + cStream, err := lzss.ReadIntoStream(c, dict, level) require.NoError(t, err) circuit := &DecompressionTestCircuit{ @@ -199,7 +133,7 @@ func TestReadBytes(t *testing.T) { WordNbBits: 1, Expected: expected, } - words, err := compress.NewStream(expected, 8) + words, err := goCompress.NewStream(expected, 8) assert.NoError(t, err) words = words.BreakUp(2) assignment := &readBytesCircuit{ @@ -221,3 +155,11 @@ func (c *readBytesCircuit) Define(api frontend.API) error { } return nil } + +func getDictionary() []byte { + d, err := os.ReadFile("./testdata/dict_naive") + if err != nil { + panic(err) + } + return d +} diff --git a/std/compress/lzss/snark_testing.go b/std/compress/lzss/snark_testing.go index 3c8f5e5da9..b910d27940 100644 --- a/std/compress/lzss/snark_testing.go +++ b/std/compress/lzss/snark_testing.go @@ -3,6 +3,8 @@ package lzss import ( "compress/gzip" "fmt" + goCompress "github.com/consensys/compress" + "github.com/consensys/compress/lzss" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/consensys/gnark-crypto/hash" @@ -22,7 +24,7 @@ type DecompressionTestCircuit struct { Dict []byte CLength frontend.Variable CheckCorrectness bool - Level Level + Level lzss.Level } func (c *DecompressionTestCircuit) Define(api frontend.API) error { @@ -48,7 +50,9 @@ func BenchCompressionE2ECompilation(dict []byte, name string) (constraint.Constr // compress - compressor, err := NewCompressor(dict, GoodCompression) + level := lzss.GoodCompression + + compressor, err := lzss.NewCompressor(dict, level) if err != nil { return nil, err } @@ -58,7 +62,7 @@ func BenchCompressionE2ECompilation(dict []byte, name string) (constraint.Constr return nil, err } - cStream, err := compress.NewStream(c, uint8(compressor.level)) + cStream, err := goCompress.NewStream(c, uint8(level)) if err != nil { return nil, err } @@ -67,7 +71,7 @@ func BenchCompressionE2ECompilation(dict []byte, name string) (constraint.Constr C: make([]frontend.Variable, cStream.Len()), D: make([]frontend.Variable, len(d)), Dict: make([]byte, len(dict)), - Level: compressor.level, + Level: level, } var start int64 @@ -120,7 +124,7 @@ type compressionCircuit struct { D []frontend.Variable Dict []byte CLen, DLen frontend.Variable - Level Level + Level lzss.Level } func (c *compressionCircuit) Define(api frontend.API) error { @@ -151,7 +155,7 @@ func (c *compressionCircuit) Define(api frontend.API) error { return nil } -func check(s compress.Stream, padTo int) (checksum fr.Element, err error) { +func check(s goCompress.Stream, padTo int) (checksum fr.Element, err error) { s.D = append(s.D, make([]int, padTo-len(s.D))...) diff --git a/std/compress/lzss/testdata/test_cases/3c2943/data.bin b/std/compress/lzss/testdata/3c2943/data.bin similarity index 100% rename from std/compress/lzss/testdata/test_cases/3c2943/data.bin rename to std/compress/lzss/testdata/3c2943/data.bin diff --git a/std/compress/lzss/testdata/test_cases/705b24/data.bin b/std/compress/lzss/testdata/705b24/data.bin similarity index 100% rename from std/compress/lzss/testdata/test_cases/705b24/data.bin rename to std/compress/lzss/testdata/705b24/data.bin diff --git a/std/compress/lzss/testdata/test_cases/777003/data.bin b/std/compress/lzss/testdata/777003/data.bin similarity index 100% rename from std/compress/lzss/testdata/test_cases/777003/data.bin rename to std/compress/lzss/testdata/777003/data.bin diff --git a/std/compress/lzss/testdata/test_cases/bug/data.bin b/std/compress/lzss/testdata/bug/data.bin similarity index 100% rename from std/compress/lzss/testdata/test_cases/bug/data.bin rename to std/compress/lzss/testdata/bug/data.bin diff --git a/std/compress/lzss/testdata/test_cases/c9b5a2/data.bin b/std/compress/lzss/testdata/c9b5a2/data.bin similarity index 100% rename from std/compress/lzss/testdata/test_cases/c9b5a2/data.bin rename to std/compress/lzss/testdata/c9b5a2/data.bin diff --git a/std/compress/lzss/testdata/test_cases/e4207e/data.bin b/std/compress/lzss/testdata/e4207e/data.bin similarity index 100% rename from std/compress/lzss/testdata/test_cases/e4207e/data.bin rename to std/compress/lzss/testdata/e4207e/data.bin diff --git a/std/compress/lzss/testdata/test_cases/fa4a22/data.bin b/std/compress/lzss/testdata/fa4a22/data.bin similarity index 100% rename from std/compress/lzss/testdata/test_cases/fa4a22/data.bin rename to std/compress/lzss/testdata/fa4a22/data.bin diff --git a/std/compress/lzss/testdata/test_cases/large/data.bin b/std/compress/lzss/testdata/large/data.bin similarity index 100% rename from std/compress/lzss/testdata/test_cases/large/data.bin rename to std/compress/lzss/testdata/large/data.bin diff --git a/std/compress/stream.go b/std/compress/stream.go deleted file mode 100644 index c0cdbf5b9c..0000000000 --- a/std/compress/stream.go +++ /dev/null @@ -1,191 +0,0 @@ -package compress - -import ( - "bytes" - "hash" - "math/big" - - "github.com/icza/bitio" -) - -// Streams and pipelines are inefficient data structures used for easy experimentation with compression algorithms. -// They make it easy to swap modules in and out. -type Stream struct { - D []int - NbSymbs int -} - -func (s *Stream) Len() int { - return len(s.D) -} - -func (s *Stream) RunLen(i int) int { - runLen := 1 - for i+runLen < len(s.D) && s.D[i+runLen] == 0 { - runLen++ - } - return runLen -} - -func (s *Stream) At(i int) int { - return s.D[i] -} - -func NewStream(in []byte, bitsPerSymbol uint8) (Stream, error) { - d := make([]int, len(in)*8/int(bitsPerSymbol)) - r := bitio.NewReader(bytes.NewReader(in)) - for i := range d { - if n, err := r.ReadBits(bitsPerSymbol); err != nil { - return Stream{}, err - } else { - d[i] = int(n) - } - } - return Stream{d, 1 << int(bitsPerSymbol)}, nil -} - -func (s *Stream) BreakUp(nbSymbs int) Stream { - newPerOld := log(s.NbSymbs, nbSymbs) - d := make([]int, len(s.D)*newPerOld) - - for i := range s.D { - v := s.D[i] - for j := 0; j < newPerOld; j++ { - d[(i+1)*newPerOld-j-1] = v % nbSymbs - v /= nbSymbs - } - } - - return Stream{d, nbSymbs} -} - -func (s *Stream) Pack(nbBits int) []*big.Int { - wordLen := bitLen(s.NbSymbs) - wordsPerElem := (nbBits - 1) / wordLen - - var radix big.Int - radix.Lsh(big.NewInt(1), uint(wordLen)) - - packed := make([]*big.Int, (len(s.D)+wordsPerElem-1)/wordsPerElem) - for i := range packed { - packed[i] = new(big.Int) - for j := wordsPerElem - 1; j >= 0; j-- { - absJ := i*wordsPerElem + j - if absJ >= len(s.D) { - continue - } - packed[i].Mul(packed[i], &radix).Add(packed[i], big.NewInt(int64(s.D[absJ]))) - } - } - return packed -} - -func log(x, base int) int { - exp := 0 - for pow := 1; pow < x; pow *= base { - exp++ - } - return exp -} - -func (s *Stream) Checksum(hsh hash.Hash, fieldBits int) []byte { - packed := s.Pack(fieldBits) - fieldBytes := (fieldBits + 7) / 8 - byts := make([]byte, fieldBytes) - for _, w := range packed { - w.FillBytes(byts) - hsh.Write(byts) - } - - length := make([]byte, fieldBytes) - big.NewInt(int64(s.Len())).FillBytes(length) - hsh.Write(length) - - return hsh.Sum(nil) -} - -func (s *Stream) WriteNum(r int, nbWords int) *Stream { - for i := 0; i < nbWords; i++ { - s.D = append(s.D, r%s.NbSymbs) - r /= s.NbSymbs - } - if r != 0 { - panic("overflow") - } - return s -} - -func (s *Stream) ReadNum(start, nbWords int) int { - res := 0 - for j := nbWords - 1; j >= 0; j-- { - res *= s.NbSymbs - res += s.D[start+j] - } - return res -} - -func bitLen(n int) int { - bitLen := 0 - for 1<