From 7fd48ea14066837e7035d838748ad1606e0bb768 Mon Sep 17 00:00:00 2001 From: Willian Mitsuda Date: Thu, 16 Jan 2025 21:09:49 -0300 Subject: [PATCH] Unify checker tools; proper optimize implementation --- cmd/integration/commands/idx_compare.go | 194 ------------------ cmd/integration/commands/idx_optimize.go | 42 +--- cmd/integration/commands/idx_verify.go | 27 ++- .../recsplit/multiencseq/sequence_reader.go | 4 + .../multiencseq/sequence_reader_test.go | 6 +- 5 files changed, 39 insertions(+), 234 deletions(-) delete mode 100644 cmd/integration/commands/idx_compare.go diff --git a/cmd/integration/commands/idx_compare.go b/cmd/integration/commands/idx_compare.go deleted file mode 100644 index 00cbd98ce5f..00000000000 --- a/cmd/integration/commands/idx_compare.go +++ /dev/null @@ -1,194 +0,0 @@ -package commands - -import ( - "bytes" - "encoding/binary" - "io/fs" - "log" - "os" - "path/filepath" - "strings" - - "github.com/erigontech/erigon-lib/common" - "github.com/erigontech/erigon-lib/common/hexutility" - "github.com/erigontech/erigon-lib/recsplit" - "github.com/erigontech/erigon-lib/recsplit/eliasfano32" - "github.com/erigontech/erigon-lib/seg" - "github.com/spf13/cobra" -) - -func readEliasFanoOrOpt(v []byte, baseTxNum uint64) *eliasfano32.EliasFano { - if v[0]&0b10000000 == 0 { - ef, _ := eliasfano32.ReadEliasFano(v) - return ef - } - - // not eliasfano, decode - count := (len(v) - 1) / 4 - max := uint64(binary.BigEndian.Uint32(v[len(v)-4:])) + baseTxNum - ef := eliasfano32.NewEliasFano(uint64(count), max) - for i := 1; i <= len(v)-4; i += 4 { - n := uint64(binary.BigEndian.Uint32(v[i:i+4])) + baseTxNum - ef.AddOffset(n) - } - ef.Build() - return ef -} - -func compareOpt4(vOrig, vOpt []byte, baseTxNum uint64) bool { - efOrig, _ := eliasfano32.ReadEliasFano(vOrig) - efOpt := readEliasFanoOrOpt(vOpt, baseTxNum) - - if efOpt.Count() > efOrig.Count() { - log.Print("Optimized eliasfano is longer") - return false - } - if efOrig.Count() > efOpt.Count() { - log.Print("Optimized eliasfano is shorter") - return false - } - - itOrig := efOrig.Iterator() - itOpt := efOpt.Iterator() - for itOrig.HasNext() { - nOrig, err := itOrig.Next() - if err != nil { - log.Fatalf("Failed to read next: %v", err) - } - nOpt, err := itOpt.Next() - if err != nil { - log.Fatalf("Failed to read next: %v", err) - } - if nOrig != nOpt { - log.Printf("values mismatch: orig=%d new=%d", nOrig, nOpt) - log.Printf("orig=%v new=%v", hexutility.Encode(vOrig), hexutility.Encode(vOpt)) - return false - } - } - - return true -} - -var idxCompare = &cobra.Command{ - Use: "idx_compare", - Short: "After an idx_optimize execution, deep compare original and optimized .ef files", - Run: func(cmd *cobra.Command, args []string) { - ctx, _ := common.RootContext() - - idxPath := filepath.Join(datadirCli, "snapshots", "idx") - idxDir := os.DirFS(idxPath) - - files, err := fs.ReadDir(idxDir, ".") - if err != nil { - log.Fatalf("Failed to read directory contents: %v", err) - } - - log.Println("Comparing idx files:") - for _, file := range files { - if file.IsDir() || !strings.HasSuffix(file.Name(), ".ef") { - continue - } - - log.Printf("Checking file %s...", file.Name()) - - efi, err := recsplit.OpenIndex(datadirCli + "/snapshots/accessor/" + file.Name() + "i.new") - if err != nil { - log.Fatalf("Failed to open index: %v", err) - } - defer efi.Close() - - reader := efi.GetReaderFromPool() - defer reader.Close() - - // original .ef file - idxOrig, err := seg.NewDecompressor(datadirCli + "/snapshots/idx/" + file.Name()) - if err != nil { - log.Fatalf("Failed to open decompressor: %v", err) - } - defer idxOrig.Close() - - // reencoded optimized .ef.new file - idxOpt, err := seg.NewDecompressor(datadirCli + "/snapshots/idx/" + file.Name() + ".new") - if err != nil { - log.Fatalf("Failed to open decompressor: %v", err) - } - defer idxOpt.Close() - - g := idxOrig.MakeGetter() - readerOrig := seg.NewReader(g, seg.CompressNone) - readerOrig.Reset(0) - - g = idxOpt.MakeGetter() - readerOpt := seg.NewReader(g, seg.CompressNone) - readerOpt.Reset(0) - - // .ef.new MUST have a magic kv pair with baseTxNum - if !readerOpt.HasNext() { - log.Fatalf("reader doesn't have magic kv!") - } - k, _ := readerOpt.Next(nil) - if !bytes.Equal(k, MAGIC_KEY_BASE_TX_NUM) { - log.Fatalf("magic k is incorrect: %v", hexutility.Encode(k)) - } - if !readerOpt.HasNext() { - log.Fatalf("reader doesn't have magic number!") - } - v, prevKeyOffset := readerOpt.Next(nil) - if len(v) != 8 { - log.Fatalf("baseTxNum is not a uint64: %v", hexutility.Encode(v)) - } - baseTxNum := binary.BigEndian.Uint64(v) - - for readerOrig.HasNext() { - if !readerOpt.HasNext() { - log.Fatal("opt reader doesn't have next!") - } - - kOrig, _ := readerOrig.Next(nil) - kOpt, _ := readerOpt.Next(nil) - if !bytes.Equal(kOrig, kOpt) { - log.Fatalf("key mismatch!") - } - - if !readerOrig.HasNext() { - log.Fatal("orig reader doesn't have next!") - } - if !readerOpt.HasNext() { - log.Fatal("opt reader doesn't have next!") - } - - // orig/opt value comparison - vOrig, _ := readerOrig.Next(nil) - vOpt, nextKeyOffset := readerOpt.Next(nil) - if !compareOpt4(vOrig, vOpt, baseTxNum) { - log.Fatalf("value mismatch!") - } - - // checks new efi lookup points to the same value - offset, found := reader.TwoLayerLookup(kOpt) - if !found { - log.Fatalf("key %v not found in efi", hexutility.Encode(kOpt)) - } - if offset != prevKeyOffset { - log.Fatalf("offset mismatch: %d != %d", offset, prevKeyOffset) - } - prevKeyOffset = nextKeyOffset - - select { - case <-ctx.Done(): - return - default: - } - } - idxOrig.Close() - idxOpt.Close() - reader.Close() - efi.Close() - } - }, -} - -func init() { - withDataDir(idxCompare) - rootCmd.AddCommand(idxCompare) -} diff --git a/cmd/integration/commands/idx_optimize.go b/cmd/integration/commands/idx_optimize.go index 02f2b98410b..390e14ccedd 100644 --- a/cmd/integration/commands/idx_optimize.go +++ b/cmd/integration/commands/idx_optimize.go @@ -1,7 +1,6 @@ package commands import ( - "encoding/binary" "io/fs" "log" "os" @@ -10,11 +9,11 @@ import ( "strings" "github.com/erigontech/erigon-lib/common/background" - "github.com/erigontech/erigon-lib/common/hexutility" "github.com/erigontech/erigon-lib/config3" lllog "github.com/erigontech/erigon-lib/log/v3" "github.com/erigontech/erigon-lib/recsplit" "github.com/erigontech/erigon-lib/recsplit/eliasfano32" + "github.com/erigontech/erigon-lib/recsplit/multiencseq" "github.com/erigontech/erigon-lib/state" "github.com/erigontech/erigon-lib/common" @@ -50,39 +49,22 @@ type efFileInfo struct { endStep uint64 } -var MAGIC_KEY_BASE_TX_NUM = hexutility.MustDecodeHex("0x8453FFFFFFFFFFFFFFFFFFFF") +var b []byte -// Delta encoding starting from 1st elem; only for ef sequences < 16 elems -// -// Encode all elems as deltas from baseTxId; they can fit into uint32 -// because max delta is bounded by 64 * stepSize == 100M -// hence size == count * sizeof(uint32) + 1 byte for encoding type -func doOpt4(baseTxNum uint64, v []byte) ([]byte, error) { +func doConvert(baseTxNum uint64, v []byte) ([]byte, error) { ef, _ := eliasfano32.ReadEliasFano(v) - count := ef.Count() - if count < 16 { - if ef.Max()-ef.Min()+1 < uint64(0xffffffff) { - return convertEF(baseTxNum, ef) - } - } - - return v, nil // DO NOT OPTIMIZE; plain elias fano -} -func convertEF(baseTxNum uint64, ef *eliasfano32.EliasFano) ([]byte, error) { - b := make([]byte, 0, 1+ef.Count()*4) - b = append(b, 0b10000000) + seqBuilder := multiencseq.NewBuilder(baseTxNum, ef.Count(), ef.Max()) for it := ef.Iterator(); it.HasNext(); { n, err := it.Next() if err != nil { return nil, err } - n -= baseTxNum - - bn := make([]byte, 4) - binary.BigEndian.PutUint32(bn, uint32(n)) - b = append(b, bn...) + seqBuilder.AddOffset(n) } + seqBuilder.Build() + + b = seqBuilder.AppendBytes(b[:0]) return b, nil } @@ -153,10 +135,7 @@ var idxOptimize = &cobra.Command{ reader.Reset(0) writer := seg.NewWriter(idxOutput, seg.CompressNone) - writer.AddWord(MAGIC_KEY_BASE_TX_NUM) - b := make([]byte, 8) - binary.BigEndian.PutUint64(b, baseTxNum) - writer.AddWord(b) + ps := background.NewProgressSet() for reader.HasNext() { k, _ := reader.Next(nil) @@ -168,7 +147,7 @@ var idxOptimize = &cobra.Command{ } v, _ := reader.Next(nil) - v, err := doOpt4(baseTxNum, v) + v, err := doConvert(baseTxNum, v) if err != nil { log.Fatalf("error while optimizing value %v", err) } @@ -210,7 +189,6 @@ var idxOptimize = &cobra.Command{ if err != nil { log.Fatalf("Failed to build accessor: %v", err) } - ps := background.NewProgressSet() if err := state.BuildAccessor(ctx, data, seg.CompressNone, idxPath, false, cfg, ps, logger); err != nil { log.Fatalf("Failed to build accessor: %v", err) } diff --git a/cmd/integration/commands/idx_verify.go b/cmd/integration/commands/idx_verify.go index a86ee25cb7b..dc9864b9311 100644 --- a/cmd/integration/commands/idx_verify.go +++ b/cmd/integration/commands/idx_verify.go @@ -39,15 +39,17 @@ var idxVerify = &cobra.Command{ continue } - log.Printf("Deep checking file %s...", file.Name()) - efInfo, err := parseEFFilename(file.Name()) if err != nil { log.Fatalf("Failed to parse file info: %v", err) } baseTxNum := efInfo.startStep * config3.DefaultStepSize - targetEfi, err := recsplit.OpenIndex(targetDirCli + "/snapshots/accessor/" + file.Name() + "i") + targetIndexFilename := targetDirCli + "/snapshots/accessor/" + file.Name() + "i" + if manuallyOptimized { + targetIndexFilename = targetDirCli + "/snapshots/accessor/" + file.Name() + "i.new" + } + targetEfi, err := recsplit.OpenIndex(targetIndexFilename) if err != nil { log.Fatalf("Failed to open index: %v", err) } @@ -57,19 +59,26 @@ var idxVerify = &cobra.Command{ defer targetEfiReader.Close() // original .ef file - sourceIdx, err := seg.NewDecompressor(sourceDirCli + "/snapshots/idx/" + file.Name()) + sourceFilename := sourceDirCli + "/snapshots/idx/" + file.Name() + sourceIdx, err := seg.NewDecompressor(sourceFilename) if err != nil { log.Fatalf("Failed to open decompressor: %v", err) } defer sourceIdx.Close() // reencoded optimized .ef file - targetIdx, err := seg.NewDecompressor(targetDirCli + "/snapshots/idx/" + file.Name()) + targetFilename := targetDirCli + "/snapshots/idx/" + file.Name() + if manuallyOptimized { + targetFilename = targetDirCli + "/snapshots/idx/" + file.Name() + ".new" + } + targetIdx, err := seg.NewDecompressor(targetFilename) if err != nil { log.Fatalf("Failed to open decompressor: %v", err) } defer targetIdx.Close() + log.Printf("Deep checking files %s -> %s, %s...", sourceFilename, targetFilename, targetIndexFilename) + g := sourceIdx.MakeGetter() sourceReader := seg.NewReader(g, seg.CompressNone) sourceReader.Reset(0) @@ -143,9 +152,14 @@ var idxVerify = &cobra.Command{ } func compareSequences(sourceK, sourceV, targetV []byte, baseTxNum uint64) bool { + // log.Printf("k=%s sv=%s tv=%s baseTxNum=%d", hexutility.Encode(sourceK), hexutility.Encode(sourceV), hexutility.Encode(targetV), baseTxNum) sourceEf, _ := eliasfano32.ReadEliasFano(sourceV) targetSeq := multiencseq.ReadMultiEncSeq(baseTxNum, targetV) + if targetSeq.EncodingType() == multiencseq.PlainEliasFano { + log.Printf("target encoding type can't be PlainEliasFano") + return false + } if targetSeq.Count() > sourceEf.Count() { log.Print("Optimized eliasfano is longer") log.Printf("key=%s", hexutility.Encode(sourceK)) @@ -193,7 +207,10 @@ func init() { must(idxVerify.MarkFlagRequired("targetdir")) must(idxVerify.MarkFlagDirname("targetdir")) + idxVerify.Flags().BoolVar(&manuallyOptimized, "manuallyOptimized", false, "set this parameter if you have manually optimized the .ef files ith idx_optimize; set sourcedir/targetdir to the same") + rootCmd.AddCommand(idxVerify) } var sourceDirCli, targetDirCli string +var manuallyOptimized bool diff --git a/erigon-lib/recsplit/multiencseq/sequence_reader.go b/erigon-lib/recsplit/multiencseq/sequence_reader.go index 495eefcb278..9898a85447a 100644 --- a/erigon-lib/recsplit/multiencseq/sequence_reader.go +++ b/erigon-lib/recsplit/multiencseq/sequence_reader.go @@ -64,6 +64,10 @@ func Seek(baseNum uint64, data []byte, n uint64) (uint64, bool) { return seq.search(n) } +func (s *SequenceReader) EncodingType() EncodingType { + return s.currentEnc +} + func (s *SequenceReader) Get(i uint64) uint64 { if s.currentEnc == SimpleEncoding { return s.sseq.Get(i) diff --git a/erigon-lib/recsplit/multiencseq/sequence_reader_test.go b/erigon-lib/recsplit/multiencseq/sequence_reader_test.go index 15d777a16ad..634091e1727 100644 --- a/erigon-lib/recsplit/multiencseq/sequence_reader_test.go +++ b/erigon-lib/recsplit/multiencseq/sequence_reader_test.go @@ -23,7 +23,7 @@ func TestMultiEncSeq(t *testing.T) { // check deserialization s := ReadMultiEncSeq(1000, b) - require.Equal(t, PlainEliasFano, s.currentEnc) + require.Equal(t, PlainEliasFano, s.EncodingType()) requireSequenceChecks(t, s) requireRawDataChecks(t, b) }) @@ -43,7 +43,7 @@ func TestMultiEncSeq(t *testing.T) { // check deserialization s := ReadMultiEncSeq(1000, b) - require.Equal(t, SimpleEncoding, s.currentEnc) + require.Equal(t, SimpleEncoding, s.EncodingType()) requireSequenceChecks(t, s) requireRawDataChecks(t, b) }) @@ -64,7 +64,7 @@ func TestMultiEncSeq(t *testing.T) { // check deserialization s := ReadMultiEncSeq(1000, b) - require.Equal(t, RebasedEliasFano, s.currentEnc) + require.Equal(t, RebasedEliasFano, s.EncodingType()) requireSequenceChecks(t, s) requireRawDataChecks(t, b) })