Skip to content

Commit

Permalink
delta compression experiment
Browse files Browse the repository at this point in the history
  • Loading branch information
macneale4 committed Mar 1, 2024
1 parent 6c249f1 commit 560bd50
Show file tree
Hide file tree
Showing 4 changed files with 236 additions and 0 deletions.
75 changes: 75 additions & 0 deletions go/cmd/dolt/commands/admin/delta.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package admin

import (
"context"

"github.com/dolthub/dolt/go/cmd/dolt/cli"
"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
"github.com/dolthub/dolt/go/libraries/doltcore/env"
"github.com/dolthub/dolt/go/libraries/utils/argparser"
"github.com/dolthub/dolt/go/store/datas"
"github.com/dolthub/dolt/go/store/nbs"
)

// DeltaCmd is the hidden admin command named "delta". It carries no state;
// its Exec method runs the delta-encoding experiment from the nbs package.
type DeltaCmd struct {
}

// Name returns the name of the command, "delta".
func (cmd DeltaCmd) Name() string {
return "delta"
}

// Description returns a one-line, human-readable summary of what the command
// does.
func (cmd DeltaCmd) Description() string {
return "walks history and prints possible space savings with delta encoding"
}
// RequiresRepo always returns true.
// NOTE(review): presumably this tells the CLI framework the command must be
// run inside a dolt repository — confirm against the command interface.
func (cmd DeltaCmd) RequiresRepo() bool {
return true
}
// Docs returns nil: this command supplies no CommandDocumentation.
func (cmd DeltaCmd) Docs() *cli.CommandDocumentation {
return nil
}

// ArgParser returns the argument parser for the command, built with
// NewArgParserWithMaxArgs using the command's name and a maximum of 0
// arguments.
func (cmd DeltaCmd) ArgParser() *argparser.ArgParser {
	return argparser.NewArgParserWithMaxArgs(cmd.Name(), 0)
}
// Hidden always returns true.
// NOTE(review): presumably this keeps the command out of help listings —
// confirm against the command interface.
func (cmd DeltaCmd) Hidden() bool {
return true
}

// Exec runs the delta-encoding experiment against the chunk store backing
// dEnv.DoltDB and prints the results through the cli package.
//
// It returns 0 on success. It returns 1 if the chunk store is not a
// *nbs.GenerationalNBS or if the experiment reports an error; in both cases
// the problem is printed with cli.PrintErrln.
func (cmd DeltaCmd) Exec(ctx context.Context, commandStr string, args []string, dEnv *env.DoltEnv, cliCtx cli.CliContext) int {
	// Argument parsing is currently disabled; the lines are kept for reference.
	// ap := cmd.ArgParser()
	// usage, _ := cli.HelpAndUsagePrinters(cli.CommandDocsForCommandString(commandStr, cli.CommandDocumentationContent{}, ap))
	// cli.ParseArgsOrDie(ap, args, usage)

	store := datas.ChunkStoreFromDatabase(doltdb.HackDatasDatabaseFromDoltDB(dEnv.DoltDB))
	if _, isGenerational := store.(*nbs.GenerationalNBS); !isGenerational {
		cli.PrintErrln("Delta command requires a GenerationalNBS")
		return 1
	}

	// Adapt cli.Printf to the callback shape RunExperiment expects.
	report := func(format string, a ...interface{}) {
		cli.Printf(format, a...)
	}

	if err := nbs.RunExperiment(store, report); err != nil {
		cli.PrintErrln(err)
		return 1
	}

	return 0
}
2 changes: 2 additions & 0 deletions go/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ require (
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect
github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d // indirect
github.com/apache/thrift v0.13.1-0.20201008052519-daf620915714 // indirect
github.com/balacode/go-delta v0.1.0 // indirect
github.com/balacode/zr v1.0.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32
github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM=
github.com/aws/aws-sdk-go-v2/service/sts v1.6.0/go.mod h1:q7o0j7d7HrJk/vr9uUt3BVRASvcU7gYZB9PUgPiByXg=
github.com/aws/smithy-go v1.6.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E=
github.com/balacode/go-delta v0.1.0 h1:pwz4CMn06P2bIaIfAx3GSabMPwJp/Ww4if+7SgPYa3I=
github.com/balacode/go-delta v0.1.0/go.mod h1:wLNrwTI3lHbPBvnLzqbHmA7HVVlm1u22XLvhbeA6t3o=
github.com/balacode/zr v1.0.0 h1:MCupkEoXvrnCljc4KddiDOhR04ZLUAACgtKuo3o+9vc=
github.com/balacode/zr v1.0.0/go.mod h1:pLeSAL3DhZ9L0JuiRkUtIX3mLOCtzBLnDhfmykbSmkE=
github.com/bcicen/jstream v1.0.0 h1:gOi+Sn9mHrpePlENynPKA6Dra/PjLaIpqrTevhfvLAA=
github.com/bcicen/jstream v1.0.0/go.mod h1:9ielPxqFry7Y4Tg3j4BfjPocfJ3TbsRtXOAYXYmRuAQ=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
Expand Down
155 changes: 155 additions & 0 deletions go/store/nbs/delta_calculate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
"context"
"fmt"

"github.com/balacode/go-delta"
"github.com/dolthub/dolt/go/store/chunks"
)

// deltaPrefixLen is the number of chunk bytes used as the key when grouping
// chunks into delta-encoding candidate sets.
const deltaPrefixLen = 16

// bprefix is a fixed-size slice of chunk content, deltaPrefixLen bytes long,
// used as a map key so that chunks sharing the same leading bytes land in the
// same group.
type bprefix [deltaPrefixLen]byte

// addrBytes pairs a chunk address with the bytes read for that address.
type addrBytes struct {
// a is the address of the chunk.
a addr
// b holds the bytes returned by the chunk source for a.
// NOTE(review): presumably the uncompressed chunk contents — confirm.
b []byte
}

// PrintfFunc is a printf-style callback. RunExperiment uses it to emit
// progress and result lines without depending on a particular output sink.
type PrintfFunc func(format string, args ...interface{})

// RunExperiment estimates how much space delta encoding could save in the old
// generation of a GenerationalNBS and reports its findings through p.
//
// Every chunk of every old-generation table file is read. Chunks of 512 bytes
// or fewer only contribute to the uncompressed total. Larger chunks are
// bucketed by the deltaPrefixLen bytes that follow their first two bytes, and
// each bucket is handed to biteSizeEvaluate with a 0.75 delta-ratio threshold.
//
// Buckets are evaluated once, after all table files have been scanned, so
// that every chunk is counted exactly once in the totals and the per-group
// progress counter never exceeds the number of large chunks.
//
// It returns an error if cs is not a *GenerationalNBS or if a table file
// index or chunk cannot be read.
func RunExperiment(cs chunks.ChunkStore, p PrintfFunc) error {
	gs, ok := cs.(*GenerationalNBS)
	if !ok {
		return fmt.Errorf("delta experiment requires a *GenerationalNBS, got %T", cs)
	}

	// NM4 - try other values? Smaller? Bigger?
	const minCandidateLen = 512

	ctx := context.TODO()
	prefixMap := make(map[bprefix][]addrBytes)
	uncompressedTotal := uint64(0)
	largeChunks := 0

	for tf, src := range gs.oldGen.tables.upstream {
		p("table file: %s\n", tf.String())

		idx, err := src.index()
		if err != nil {
			return fmt.Errorf("reading index of table file %s: %w", tf.String(), err)
		}

		for i := uint32(0); i < idx.chunkCount(); i++ {
			var a addr
			if _, err := idx.indexEntry(i, &a); err != nil {
				return fmt.Errorf("reading index entry %d of table file %s: %w", i, tf.String(), err)
			}

			var stat Stats
			data, err := src.get(ctx, a, &stat)
			if err != nil {
				return fmt.Errorf("reading chunk %d of table file %s: %w", i, tf.String(), err)
			}

			if len(data) <= minCandidateLen {
				// Too small to be worth delta encoding; it only counts toward the total.
				uncompressedTotal += uint64(len(data))
				continue
			}

			// The key skips the first two bytes of the chunk.
			// NOTE(review): presumably those bytes are a header shared by most chunks — confirm.
			prefix := bprefix(data[2 : deltaPrefixLen+2])
			prefixMap[prefix] = append(prefixMap[prefix], addrBytes{a, data})
			largeChunks++
		}
	}

	progress := 0
	sumSaved := uint64(0)
	for pfx, group := range prefixMap {
		total, saved := biteSizeEvaluate(group, .75)
		progress += len(group)
		p("Group: %v (%d/%d chunks) saved: %d of %d\n", pfx, progress, largeChunks, saved, total)
		sumSaved += saved
		uncompressedTotal += total
	}

	// Guard the division so an empty store reports 0% rather than NaN.
	reduction := 0.0
	if uncompressedTotal > 0 {
		reduction = 100.0 * float64(sumSaved) / float64(uncompressedTotal)
	}
	p("Total saved: %d of %d (%.2f%% reduction)\n", sumSaved, uncompressedTotal, reduction)

	return nil
}

// biteSizeEvaluate bounds the cost of the pairwise comparison done by
// bruteForceGroup. Groups of fewer than 100 chunks are evaluated directly;
// larger groups are split in half and each half is evaluated recursively, so
// chunks in different halves are never compared with one another.
//
// It returns the summed size of all chunks in group and the summed estimated
// bytes saved, as reported by bruteForceGroup for each piece.
func biteSizeEvaluate(group []addrBytes, threshold float64) (uncompressedTotal, saved uint64) {
	if len(group) < 100 {
		return bruteForceGroup(group, threshold)
	}

	half := len(group) / 2
	leftTotal, leftSaved := biteSizeEvaluate(group[:half], threshold)
	rightTotal, rightSaved := biteSizeEvaluate(group[half:], threshold)

	return leftTotal + rightTotal, leftSaved + rightSaved
}

// savings records the estimated benefit of storing one chunk as a delta of
// another.
type savings struct {
// src is the address of the chunk the delta is computed from.
src addr
// dst is the address of the chunk that would be stored as a delta.
dst addr
// bytesSaved is the estimated number of bytes saved for dst, after
// subtracting the delta size and a fixed overhead allowance.
bytesSaved uint64
// ratio is the delta size divided by the size of dst.
ratio float64
}

// bruteForceGroup computes a delta for every ordered pair (src, dst) in group
// where src precedes dst, and estimates the bytes saved if each chunk were
// stored as a delta of the best earlier chunk.
//
// A pair is considered only when the delta is more than 64 bytes smaller than
// dst and the delta-to-dst size ratio is at most threshold. Each chunk is
// credited at most once, with the largest saving found for it.
//
// It returns the summed size of all chunks in group and the summed estimated
// savings. The work is quadratic in len(group); callers are expected to keep
// groups small.
func bruteForceGroup(group []addrBytes, threshold float64) (uncompressedTotal, saved uint64) {
	// bestSavings maps a chunk's index in group to the best saving found for
	// storing that chunk as a delta of some earlier chunk.
	bestSavings := make(map[int]savings)

	for srcIndx, src := range group {
		uncompressedTotal += uint64(len(src.b))

		for offset, dst := range group[srcIndx+1:] {
			// offset is relative to the sub-slice; convert it to an index into group.
			dstIndx := srcIndx + 1 + offset

			dif := delta.Make(src.b, dst.b)
			difBytes := dif.Bytes()

			// NM4 - 64 is arbitrary. Based on the diff, it's probably not worth the compute time to store as a delta.
			if len(difBytes)+64 >= len(dst.b) {
				continue
			}

			deltaRatio := float64(len(difBytes)) / float64(len(dst.b))
			if deltaRatio > threshold {
				continue
			}

			// NM4 - 32 is a guess of the bytes for the address + other serialization overhead.
			// The 64-byte margin checked above keeps this subtraction positive.
			s := savings{src.a, dst.a, uint64(len(dst.b) - len(difBytes) - 32), deltaRatio}

			if best, ok := bestSavings[dstIndx]; !ok || s.bytesSaved > best.bytesSaved {
				bestSavings[dstIndx] = s
			}
		}
	}

	for _, best := range bestSavings {
		saved += best.bytesSaved
	}

	return uncompressedTotal, saved
}

0 comments on commit 560bd50

Please sign in to comment.