Skip to content

Commit

Permalink
Unify checker tools; proper optimize implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
wmitsuda committed Jan 17, 2025
1 parent 823a100 commit 7fd48ea
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 234 deletions.
194 changes: 0 additions & 194 deletions cmd/integration/commands/idx_compare.go

This file was deleted.

42 changes: 10 additions & 32 deletions cmd/integration/commands/idx_optimize.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package commands

import (
"encoding/binary"
"io/fs"
"log"
"os"
Expand All @@ -10,11 +9,11 @@ import (
"strings"

"github.com/erigontech/erigon-lib/common/background"
"github.com/erigontech/erigon-lib/common/hexutility"
"github.com/erigontech/erigon-lib/config3"
lllog "github.com/erigontech/erigon-lib/log/v3"
"github.com/erigontech/erigon-lib/recsplit"
"github.com/erigontech/erigon-lib/recsplit/eliasfano32"
"github.com/erigontech/erigon-lib/recsplit/multiencseq"
"github.com/erigontech/erigon-lib/state"

"github.com/erigontech/erigon-lib/common"
Expand Down Expand Up @@ -50,39 +49,22 @@ type efFileInfo struct {
endStep uint64
}

var MAGIC_KEY_BASE_TX_NUM = hexutility.MustDecodeHex("0x8453FFFFFFFFFFFFFFFFFFFF")
var b []byte

// Delta encoding starting from 1st elem; only for ef sequences < 16 elems
//
// Encode all elems as deltas from baseTxNum; they can fit into uint32
// because max delta is bounded by 64 * stepSize == 100M
// hence size == count * sizeof(uint32) + 1 byte for encoding type
func doOpt4(baseTxNum uint64, v []byte) ([]byte, error) {
func doConvert(baseTxNum uint64, v []byte) ([]byte, error) {
ef, _ := eliasfano32.ReadEliasFano(v)
count := ef.Count()
if count < 16 {
if ef.Max()-ef.Min()+1 < uint64(0xffffffff) {
return convertEF(baseTxNum, ef)
}
}

return v, nil // DO NOT OPTIMIZE; plain elias fano
}

func convertEF(baseTxNum uint64, ef *eliasfano32.EliasFano) ([]byte, error) {
b := make([]byte, 0, 1+ef.Count()*4)
b = append(b, 0b10000000)
seqBuilder := multiencseq.NewBuilder(baseTxNum, ef.Count(), ef.Max())
for it := ef.Iterator(); it.HasNext(); {
n, err := it.Next()
if err != nil {
return nil, err
}
n -= baseTxNum

bn := make([]byte, 4)
binary.BigEndian.PutUint32(bn, uint32(n))
b = append(b, bn...)
seqBuilder.AddOffset(n)
}
seqBuilder.Build()

b = seqBuilder.AppendBytes(b[:0])
return b, nil
}

Expand Down Expand Up @@ -153,10 +135,7 @@ var idxOptimize = &cobra.Command{
reader.Reset(0)

writer := seg.NewWriter(idxOutput, seg.CompressNone)
writer.AddWord(MAGIC_KEY_BASE_TX_NUM)
b := make([]byte, 8)
binary.BigEndian.PutUint64(b, baseTxNum)
writer.AddWord(b)
ps := background.NewProgressSet()

for reader.HasNext() {
k, _ := reader.Next(nil)
Expand All @@ -168,7 +147,7 @@ var idxOptimize = &cobra.Command{
}

v, _ := reader.Next(nil)
v, err := doOpt4(baseTxNum, v)
v, err := doConvert(baseTxNum, v)
if err != nil {
log.Fatalf("error while optimizing value %v", err)
}
Expand Down Expand Up @@ -210,7 +189,6 @@ var idxOptimize = &cobra.Command{
if err != nil {
log.Fatalf("Failed to build accessor: %v", err)
}
ps := background.NewProgressSet()
if err := state.BuildAccessor(ctx, data, seg.CompressNone, idxPath, false, cfg, ps, logger); err != nil {
log.Fatalf("Failed to build accessor: %v", err)
}
Expand Down
27 changes: 22 additions & 5 deletions cmd/integration/commands/idx_verify.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,17 @@ var idxVerify = &cobra.Command{
continue
}

log.Printf("Deep checking file %s...", file.Name())

efInfo, err := parseEFFilename(file.Name())
if err != nil {
log.Fatalf("Failed to parse file info: %v", err)
}
baseTxNum := efInfo.startStep * config3.DefaultStepSize

targetEfi, err := recsplit.OpenIndex(targetDirCli + "/snapshots/accessor/" + file.Name() + "i")
targetIndexFilename := targetDirCli + "/snapshots/accessor/" + file.Name() + "i"
if manuallyOptimized {
targetIndexFilename = targetDirCli + "/snapshots/accessor/" + file.Name() + "i.new"
}
targetEfi, err := recsplit.OpenIndex(targetIndexFilename)
if err != nil {
log.Fatalf("Failed to open index: %v", err)
}
Expand All @@ -57,19 +59,26 @@ var idxVerify = &cobra.Command{
defer targetEfiReader.Close()

// original .ef file
sourceIdx, err := seg.NewDecompressor(sourceDirCli + "/snapshots/idx/" + file.Name())
sourceFilename := sourceDirCli + "/snapshots/idx/" + file.Name()
sourceIdx, err := seg.NewDecompressor(sourceFilename)
if err != nil {
log.Fatalf("Failed to open decompressor: %v", err)
}
defer sourceIdx.Close()

// reencoded optimized .ef file
targetIdx, err := seg.NewDecompressor(targetDirCli + "/snapshots/idx/" + file.Name())
targetFilename := targetDirCli + "/snapshots/idx/" + file.Name()
if manuallyOptimized {
targetFilename = targetDirCli + "/snapshots/idx/" + file.Name() + ".new"
}
targetIdx, err := seg.NewDecompressor(targetFilename)
if err != nil {
log.Fatalf("Failed to open decompressor: %v", err)
}
defer targetIdx.Close()

log.Printf("Deep checking files %s -> %s, %s...", sourceFilename, targetFilename, targetIndexFilename)

g := sourceIdx.MakeGetter()
sourceReader := seg.NewReader(g, seg.CompressNone)
sourceReader.Reset(0)
Expand Down Expand Up @@ -143,9 +152,14 @@ var idxVerify = &cobra.Command{
}

func compareSequences(sourceK, sourceV, targetV []byte, baseTxNum uint64) bool {
// log.Printf("k=%s sv=%s tv=%s baseTxNum=%d", hexutility.Encode(sourceK), hexutility.Encode(sourceV), hexutility.Encode(targetV), baseTxNum)
sourceEf, _ := eliasfano32.ReadEliasFano(sourceV)
targetSeq := multiencseq.ReadMultiEncSeq(baseTxNum, targetV)

if targetSeq.EncodingType() == multiencseq.PlainEliasFano {
log.Printf("target encoding type can't be PlainEliasFano")
return false
}
if targetSeq.Count() > sourceEf.Count() {
log.Print("Optimized eliasfano is longer")
log.Printf("key=%s", hexutility.Encode(sourceK))
Expand Down Expand Up @@ -193,7 +207,10 @@ func init() {
must(idxVerify.MarkFlagRequired("targetdir"))
must(idxVerify.MarkFlagDirname("targetdir"))

idxVerify.Flags().BoolVar(&manuallyOptimized, "manuallyOptimized", false, "set this parameter if you have manually optimized the .ef files ith idx_optimize; set sourcedir/targetdir to the same")

rootCmd.AddCommand(idxVerify)
}

var sourceDirCli, targetDirCli string
var manuallyOptimized bool
4 changes: 4 additions & 0 deletions erigon-lib/recsplit/multiencseq/sequence_reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ func Seek(baseNum uint64, data []byte, n uint64) (uint64, bool) {
return seq.search(n)
}

// EncodingType returns the encoding currently recorded on the reader
// (its currentEnc field), letting callers distinguish, for example,
// PlainEliasFano from SimpleEncoding sequences.
func (s *SequenceReader) EncodingType() EncodingType {
return s.currentEnc
}

func (s *SequenceReader) Get(i uint64) uint64 {
if s.currentEnc == SimpleEncoding {
return s.sseq.Get(i)
Expand Down
Loading

0 comments on commit 7fd48ea

Please sign in to comment.