Skip to content

Commit

Permalink
Merge pull request #8723 from dolthub/nicktobey/json-big
Browse files Browse the repository at this point in the history
If a JSON document contains strings that can't fit in a single chunk, use the naive Blob chunker instead of the smart JSON chunker.
  • Loading branch information
nicktobey authored Jan 9, 2025
2 parents 071cd77 + 5429156 commit b28cb54
Show file tree
Hide file tree
Showing 12 changed files with 249 additions and 62 deletions.
1 change: 1 addition & 0 deletions go/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ require (
github.com/google/btree v1.1.2
github.com/google/go-github/v57 v57.0.0
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
github.com/hashicorp/go-uuid v1.0.1
github.com/hashicorp/golang-lru/v2 v2.0.2
github.com/jmoiron/sqlx v1.3.4
github.com/kch42/buzhash v0.0.0-20160816060738-9bdec3dec7c6
Expand Down
1 change: 1 addition & 0 deletions go/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ github.com/hashicorp/go-sockaddr v1.0.0/go.mod h1:7Xibr9yA9JjQq1JpNB2Vw7kxv8xerX
github.com/hashicorp/go-syslog v1.0.0/go.mod h1:qPfqrKkXGihmCqbJM2mZgkZGvKG1dFdvsLplgctolz4=
github.com/hashicorp/go-uuid v0.0.0-20180228145832-27454136f036/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-uuid v1.0.1 h1:fv1ep09latC32wFoVwnqcnKJGnMSdBanPczbHAYm1BE=
github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro=
github.com/hashicorp/go-version v1.2.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA=
github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90=
Expand Down
7 changes: 2 additions & 5 deletions go/libraries/doltcore/sqle/enginetest/dolt_queries.go
Original file line number Diff line number Diff line change
Expand Up @@ -4786,11 +4786,8 @@ var LargeJsonObjectScriptTests = []queries.ScriptTest{
Expected: []sql.Row{{types.OkResult{RowsAffected: 1}}},
},
{
Skip: true,
// TODO: The JSON is coming back truncated for some reason and failing this test.
// When that's fixed, unskip this test, and fix the length value below.
Query: `SELECT pk, length(j1) from t;`,
Expected: []sql.Row{{1, 123}},
Query: `SELECT pk, length(j1->>"$.large_value") from t;`,
Expected: []sql.Row{{1, 1024 * 1024 * 3}},
},
},
},
Expand Down
19 changes: 15 additions & 4 deletions go/store/prolly/tree/indexed_json_diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (
)

type IndexedJsonDiffer struct {
differ Differ[jsonLocationKey, jsonLocationOrdering]
differ Differ[jsonLocationKey, *jsonLocationOrdering]
currentFromCursor, currentToCursor *JsonCursor
from, to IndexedJsonDocument
started bool
Expand All @@ -32,10 +32,14 @@ type IndexedJsonDiffer struct {
var _ IJsonDiffer = &IndexedJsonDiffer{}

func NewIndexedJsonDiffer(ctx context.Context, from, to IndexedJsonDocument) (*IndexedJsonDiffer, error) {
differ, err := DifferFromRoots[jsonLocationKey, jsonLocationOrdering](ctx, from.m.NodeStore, to.m.NodeStore, from.m.Root, to.m.Root, jsonLocationOrdering{}, false)
ordering := jsonLocationOrdering{}
differ, err := DifferFromRoots[jsonLocationKey, *jsonLocationOrdering](ctx, from.m.NodeStore, to.m.NodeStore, from.m.Root, to.m.Root, &ordering, false)
if err != nil {
return nil, err
}
if ordering.err != nil {
return nil, err
}
// We want to diff the prolly tree as if it was an address map pointing to the individual blob fragments, rather
// than diffing the blob fragments themselves. We can accomplish this by just replacing the cursors in the differ
// with their parents.
Expand Down Expand Up @@ -225,7 +229,11 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error
// Neither cursor points to the start of a value.
// This should only be possible if they're at the same location.
// Do a sanity check, then continue.
if compareJsonLocations(fromCurrentLocation, toCurrentLocation) != 0 {
cmp, err := compareJsonLocations(fromCurrentLocation, toCurrentLocation)
if err != nil {
return JsonDiff{}, err
}
if cmp != 0 {
return JsonDiff{}, jsonParseError
}
err = advanceCursor(ctx, &jd.currentFromCursor)
Expand All @@ -240,7 +248,10 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error
}

if fromScannerAtStartOfValue && toScannerAtStartOfValue {
cmp := compareJsonLocations(fromCurrentLocation, toCurrentLocation)
cmp, err := compareJsonLocations(fromCurrentLocation, toCurrentLocation)
if err != nil {
return JsonDiff{}, err
}
switch cmp {
case 0:
key := fromCurrentLocation.Clone().key
Expand Down
7 changes: 7 additions & 0 deletions go/store/prolly/tree/json_chunker.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ func SerializeJsonToAddr(ctx context.Context, ns NodeStore, j sql.JSONWrapper) (
}
jsonChunker.appendJsonToBuffer(jsonBytes)
err = jsonChunker.processBuffer(ctx)
if largeJsonStringError.Is(err) {
// Due to current limits on chunk sizes, and an inability of older clients to read
// string keys and values split across multiple chunks, we can't use the JSON chunker for documents
// with extremely long strings.
node, _, err := serializeJsonToBlob(ctx, ns, j)
return node, err
}
if err != nil {
return Node{}, err
}
Expand Down
45 changes: 36 additions & 9 deletions go/store/prolly/tree/json_cursor.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,14 @@ func newJsonCursor(ctx context.Context, ns NodeStore, root Node, startKey jsonLo
}

// newJsonCursorAtStartOfChunk creates a JsonCursor positioned at the start of the
// chunk containing startKey.
//
// jsonLocationOrdering appears to record comparison failures in its err field
// (the ordering interface has no error return), so that field must be inspected
// after any operation that performs key comparisons.
func newJsonCursorAtStartOfChunk(ctx context.Context, ns NodeStore, root Node, startKey []byte) (jCur *JsonCursor, err error) {
	ordering := jsonLocationOrdering{}
	cur, err := newCursorAtKey(ctx, ns, root, startKey, &ordering)
	if err != nil {
		return nil, err
	}
	if ordering.err != nil {
		// BUG FIX: previously returned `err`, which is always nil at this point
		// (it was checked and returned just above), so a comparison error inside
		// the ordering was silently swallowed. Return the recorded error instead.
		return nil, ordering.err
	}
	return newJsonCursorFromCursor(ctx, cur)
}

Expand Down Expand Up @@ -125,7 +129,15 @@ func (j *JsonCursor) NextValue(ctx context.Context) (result []byte, err error) {
return
}

for compareJsonLocations(j.jsonScanner.currentPath, path) < 0 {
for {
var cmp int
cmp, err = compareJsonLocations(j.jsonScanner.currentPath, path)
if err != nil {
return
}
if cmp >= 0 {
break
}
parseChunk()
if err != nil {
return
Expand All @@ -135,13 +147,14 @@ func (j *JsonCursor) NextValue(ctx context.Context) (result []byte, err error) {
return
}

func (j *JsonCursor) isKeyInChunk(path jsonLocation) bool {
func (j *JsonCursor) isKeyInChunk(path jsonLocation) (bool, error) {
if j.cur.parent == nil {
// This is the only chunk, so the path must refer to this chunk.
return true
return true, nil
}
nodeEndPosition := jsonPathFromKey(j.cur.parent.CurrentKey())
return compareJsonLocations(path, nodeEndPosition) <= 0
cmp, err := compareJsonLocations(path, nodeEndPosition)
return cmp <= 0, err
}

// AdvanceToLocation causes the cursor to advance to the specified position. This function returns a boolean indicating
Expand All @@ -151,12 +164,20 @@ func (j *JsonCursor) isKeyInChunk(path jsonLocation) bool {
// the cursor advances to the end of the previous value, prior to the object key. This allows the key to be removed along
// with the value.
func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, forRemoval bool) (found bool, err error) {
if !j.isKeyInChunk(path) {
isInChunk, err := j.isKeyInChunk(path)
if err != nil {
return false, err
}
if !isInChunk {
// Our destination is in another chunk, load it.
err := Seek(ctx, j.cur.parent, path.key, jsonLocationOrdering{})
ordering := jsonLocationOrdering{}
err := Seek(ctx, j.cur.parent, path.key, &ordering)
if err != nil {
return false, err
}
if ordering.err != nil {
return false, err
}
j.cur.nd, err = fetchChild(ctx, j.cur.nrw, j.cur.parent.currentRef())
if err != nil {
return false, err
Expand All @@ -169,7 +190,10 @@ func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, f
}

previousScanner := j.jsonScanner
cmp := compareJsonLocations(j.jsonScanner.currentPath, path)
cmp, err := compareJsonLocations(j.jsonScanner.currentPath, path)
if err != nil {
return false, err
}
for cmp < 0 {
previousScanner = j.jsonScanner.Clone()
err := j.jsonScanner.AdvanceToNextLocation()
Expand All @@ -180,7 +204,10 @@ func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, f
} else if err != nil {
return false, err
}
cmp = compareJsonLocations(j.jsonScanner.currentPath, path)
cmp, err = compareJsonLocations(j.jsonScanner.currentPath, path)
if err != nil {
return false, err
}
}
// If the supplied path doesn't exist in the document, then we want to stop the cursor at the start of the point
// where it would appear. This may mean that we've gone too far and need to rewind one location.
Expand Down
27 changes: 19 additions & 8 deletions go/store/prolly/tree/json_indexed_document.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type jsonLocationKey = []byte

type address = []byte

type StaticJsonMap = StaticMap[jsonLocationKey, address, jsonLocationOrdering]
type StaticJsonMap = StaticMap[jsonLocationKey, address, *jsonLocationOrdering]

// IndexedJsonDocument is an implementation of sql.JSONWrapper that stores the document in a prolly tree.
// Every leaf node in the tree is a blob containing a substring of the original document. This allows the document
Expand All @@ -51,10 +51,10 @@ var _ fmt.Stringer = IndexedJsonDocument{}
var _ driver.Valuer = IndexedJsonDocument{}

func NewIndexedJsonDocument(ctx context.Context, root Node, ns NodeStore) IndexedJsonDocument {
m := StaticMap[jsonLocationKey, address, jsonLocationOrdering]{
m := StaticMap[jsonLocationKey, address, *jsonLocationOrdering]{
Root: root,
NodeStore: ns,
Order: jsonLocationOrdering{},
Order: &jsonLocationOrdering{},
}
return IndexedJsonDocument{
m: m,
Expand Down Expand Up @@ -114,8 +114,8 @@ func tryWithFallback(
tryFunc func() error,
fallbackFunc func(document types.JSONDocument) error) error {
err := tryFunc()
if err == unknownLocationKeyError || err == unsupportedPathError {
if err == unknownLocationKeyError {
if err == unknownLocationKeyError || err == unsupportedPathError || err == jsonParseError || largeJsonStringError.Is(err) {
if err != unsupportedPathError {
if sqlCtx, ok := ctx.(*sql.Context); ok {
sqlCtx.GetLogger().Warn(err)
}
Expand Down Expand Up @@ -282,7 +282,10 @@ func (i IndexedJsonDocument) insertIntoCursor(ctx context.Context, keyPath jsonL
// For example, attempting to insert into the path "$.a.b" in the document {"a": 1}
// We can detect this by checking to see if the insertion point in the original document comes before the inserted path.
// (For example, the insertion point occurs at $.a.START, which is before $.a.b)
cmp := compareJsonLocations(cursorPath, keyPath)
cmp, err := compareJsonLocations(cursorPath, keyPath)
if err != nil {
return IndexedJsonDocument{}, false, err
}
if cmp < 0 && cursorPath.getScannerState() == startOfValue {
// We just attempted to insert into a scalar.
return i, false, nil
Expand Down Expand Up @@ -444,7 +447,11 @@ func (i IndexedJsonDocument) setWithLocation(ctx context.Context, keyPath jsonLo
}

keyPath.pop()
found = compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath) == 0
cmp, err := compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath)
if err != nil {
return IndexedJsonDocument{}, false, err
}
found = cmp == 0
}

if found {
Expand Down Expand Up @@ -491,7 +498,11 @@ func (i IndexedJsonDocument) tryReplace(ctx context.Context, path string, val sq
}

keyPath.pop()
found = compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath) == 0
cmp, err := compareJsonLocations(keyPath, jsonCursor.jsonScanner.currentPath)
if err != nil {
return nil, false, err
}
found = cmp == 0
}

if !found {
Expand Down
19 changes: 19 additions & 0 deletions go/store/prolly/tree/json_indexed_document_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,22 @@ func TestJsonCompare(t *testing.T) {
})
})
}

// Test that we can write a JSON document with a multi-MB string value into storage and read it back.
// TestIndexedJsonDocument_CreateLargeStringValues verifies that a JSON document
// containing a multi-MB string value can be written into storage and read back
// intact (such documents fall back to the naive blob chunker because the string
// cannot fit in a single chunk).
func TestIndexedJsonDocument_CreateLargeStringValues(t *testing.T) {
	ctx := sql.NewEmptyContext()
	ns := NewTestNodeStore()
	docMap := make(map[string]interface{})
	// 2 MiB string: far larger than any single chunk, forcing the blob-chunker fallback.
	value := strings.Repeat("x", 2*1024*1024)
	docMap["key"] = value
	doc, _, err := types.JSON.Convert(docMap)
	require.NoError(t, err)
	root, err := SerializeJsonToAddr(ctx, ns, doc.(sql.JSONWrapper))
	require.NoError(t, err)
	indexedDoc, err := NewJSONDoc(root.HashOf(), ns).ToIndexedJSONDocument(ctx)
	// BUG FIX: this error was previously assigned but never checked before being
	// overwritten by the next statement.
	require.NoError(t, err)
	lookup, err := types.LookupJSONValue(indexedDoc, "$.key")
	require.NoError(t, err)
	extractedValue, _, err := types.LongText.Convert(lookup)
	require.NoError(t, err)
	require.Equal(t, value, extractedValue)
}
Loading

0 comments on commit b28cb54

Please sign in to comment.