Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vecindex: support deleting vectors from C-SPANN index #135230

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 137 additions & 1 deletion pkg/sql/vecindex/testdata/delete.ddt
Original file line number Diff line number Diff line change
@@ -1,5 +1,141 @@
# ----------
# Test deleting vectors from primary index, but not from secondary index.
# Construct new index with one vector in the root.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=2
vec1: (1, 2)
----
• 1 (0, 0)
└───• vec1 (1, 2)

# Delete remaining vector in the root.
delete
vec1
----
• 1 (0, 0)

# ----------
# Construct new index with only duplicate vectors.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=2
vec1: (1, 2)
vec2: (1, 2)
vec3: (1, 2)
vec4: (1, 2)
vec5: (1, 2)
vec6: (1, 2)
----
• 1 (1, 2)
├───• 2 (1, 2)
│ │
│ ├───• vec1 (1, 2)
│ └───• vec2 (1, 2)
└───• 3 (1, 2)
├───• vec3 (1, 2)
├───• vec4 (1, 2)
├───• vec5 (1, 2)
└───• vec6 (1, 2)

# Ensure the correct duplicates are deleted (i.e. with matching keys).
delete
vec1
vec5
----
• 1 (1, 2)
├───• 2 (1, 2)
│ │
│ └───• vec2 (1, 2)
└───• 3 (1, 2)
├───• vec3 (1, 2)
├───• vec4 (1, 2)
└───• vec6 (1, 2)

# ----------
# Construct new index with multiple levels.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=1
vec1: (1, 2)
vec2: (7, 4)
vec3: (4, 3)
vec4: (-4, 5)
vec5: (1, 11)
vec6: (1, -6)
vec7: (0, 4)
vec8: (-2, 8)
vec9: (2, 8)
----
• 1 (1.5, 1.875)
├───• 2 (1, -2)
│ │
│ ├───• vec1 (1, 2)
│ └───• vec6 (1, -6)
├───• 4 (1.75, 4)
│ │
│ ├───• vec3 (4, 3)
│ ├───• vec4 (-4, 5)
│ ├───• vec7 (0, 4)
│ └───• vec2 (7, 4)
└───• 5 (0.3333, 9)
├───• vec5 (1, 11)
├───• vec8 (-2, 8)
└───• vec9 (2, 8)

# Test case where initial search fails to find vector to delete and it must be
# retried.
delete
vec1: (0, 8)
----
• 1 (1.5, 1.875)
├───• 2 (1, -2)
│ │
│ └───• vec6 (1, -6)
├───• 4 (1.75, 4)
│ │
│ ├───• vec3 (4, 3)
│ ├───• vec4 (-4, 5)
│ ├───• vec7 (0, 4)
│ └───• vec2 (7, 4)
└───• 5 (0.3333, 9)
├───• vec5 (1, 11)
├───• vec8 (-2, 8)
└───• vec9 (2, 8)

# Delete multiple vectors.
delete
vec4
vec5
vec6
----
• 1 (1.5, 1.875)
├───• 2 (1, -2)
├───• 4 (1.75, 4)
│ │
│ ├───• vec3 (4, 3)
│ ├───• vec2 (7, 4)
│ └───• vec7 (0, 4)
└───• 5 (0.3333, 9)
├───• vec9 (2, 8)
└───• vec8 (-2, 8)

# ----------
# Construct new index with multiple levels.
# ----------
new-index min-partition-size=1 max-partition-size=3 beam-size=2
vec1: (1, 2)
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/vecindex/testdata/insert.ddt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# ----------
# Construct empty index.
# ----------
new-index min-partition-size=1 max-partition-size=4 beam-size=2
----
• 1 (0, 0)
Expand Down
9 changes: 9 additions & 0 deletions pkg/sql/vecindex/vecstore/in_memory_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,15 @@ func (s *InMemoryStore) DeleteVector(txn Txn, key PrimaryKey) {
delete(s.mu.vectors, string(key))
}

// GetVector looks up the vector stored under the given primary key and returns
// it. If no vector is stored under that key, the zero value of vector.T is
// returned. The store's mutex is held for the duration of the lookup. This is
// used for testing.
func (s *InMemoryStore) GetVector(key PrimaryKey) vector.T {
	s.mu.Lock()
	vec := s.mu.vectors[string(key)]
	s.mu.Unlock()

	return vec
}

// GetAllVectors returns all vectors that have been added to the store as key
// and vector pairs. This is used for testing.
func (s *InMemoryStore) GetAllVectors() []VectorWithKey {
Expand Down
10 changes: 10 additions & 0 deletions pkg/sql/vecindex/vecstore/search_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package vecstore

import (
"bytes"
"sort"

"github.com/cockroachdb/cockroach/pkg/util/container/heap"
Expand Down Expand Up @@ -168,6 +169,10 @@ type SearchSet struct {
// among the best results.
MaxExtraResults int

// MatchKey, if non-nil, filters out all search candidates that do not have
// a matching primary key.
MatchKey PrimaryKey

// Stats tracks useful information about the search, such as how many vectors
// and partitions were scanned.
Stats SearchStats
Expand All @@ -180,6 +185,11 @@ type SearchSet struct {
// Add includes a new candidate in the search set. If set limits have been
// reached, then the candidate with the farthest distance will be discarded.
func (ss *SearchSet) Add(candidate *SearchResult) {
if ss.MatchKey != nil && !bytes.Equal(ss.MatchKey, candidate.ChildKey.PrimaryKey) {
// Filter out candidates without a matching primary key.
return
}

// Fast path where no pruning is necessary.
if len(ss.results) < ss.MaxResults {
heap.Push[*SearchResult](&ss.results, candidate)
Expand Down
5 changes: 5 additions & 0 deletions pkg/sql/vecindex/vecstore/search_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,9 @@ func TestSearchSet(t *testing.T) {
otherSet.MaxExtraResults = 1
otherSet.AddAll(SearchResults{result1, result2, result3, result4, result5, result6, result7})
require.Equal(t, SearchResults{result3, result1, result4, result7}, otherSet.PopResults())

// Ignore results without a matching primary key.
otherSet = SearchSet{MaxResults: 2, MatchKey: []byte{60}}
otherSet.AddAll(SearchResults{result1, result2, result3, result4, result5, result6, result7})
require.Equal(t, SearchResults{result6}, otherSet.PopResults())
}
57 changes: 57 additions & 0 deletions pkg/sql/vecindex/vector_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,63 @@ func (vi *VectorIndex) Insert(
return vi.insertHelper(&parentSearchCtx, childKey, true /* allowRetry */)
}

// Delete attempts to remove a vector from the index, given its value and
// primary key. This is called within the scope of a transaction so that the
// index does not appear to change during the delete.
//
// The vector is located by searching the leaf level of the index with a search
// set that only accepts candidates having a matching primary key. If the first
// search (at the base beam size) does not find it, the search is retried once
// with a beam size that is 8x larger. If the vector is found, it is removed
// from its parent partition; any error from the search or the removal is
// returned. If it is still not found after the retry, Delete returns nil
// without modifying the index.
//
// NOTE: Delete may not be able to locate the vector in the index, meaning a
// "dangling vector" reference will be left in the tree. Vector index methods
// handle this rare case by checking for duplicates when returning search
// results.
func (vi *VectorIndex) Delete(
	ctx context.Context, txn vecstore.Txn, vector vector.T, key vecstore.PrimaryKey,
) error {
	// Search for the vector in the index.
	searchCtx := searchContext{
		Txn:      txn,
		Original: vector,
		Level:    vecstore.LeafLevel,
		Options: SearchOptions{
			SkipRerank: vi.options.DisableErrorBounds,
		},
	}
	searchCtx.Ctx = internal.WithWorkspace(ctx, &searchCtx.Workspace)

	// Randomize the vector if required by the quantizer.
	tempRandomized := searchCtx.Workspace.AllocVector(vi.quantizer.GetRandomDims())
	defer searchCtx.Workspace.FreeVector(tempRandomized)
	vi.quantizer.RandomizeVector(ctx, vector, tempRandomized, false /* invert */)
	searchCtx.Randomized = tempRandomized

	// Only candidates with exactly this primary key are admitted to the set, so
	// a single result slot is sufficient.
	searchSet := vecstore.SearchSet{MaxResults: 1, MatchKey: key}

	// Search with the base beam size. If that fails to find the vector, try again
	// with a larger beam size, in order to minimize the chance of dangling
	// vector references in the index.
	//
	// The initial beam size is clamped to at least 1. The retry check below must
	// compare against this clamped value rather than the raw option; otherwise a
	// configured beam size below 1 would never match and the retry would be
	// silently skipped.
	initialBeamSize := max(vi.options.BaseBeamSize, 1)
	baseBeamSize := initialBeamSize
	for {
		searchCtx.Options.BaseBeamSize = baseBeamSize

		err := vi.searchHelper(&searchCtx, &searchSet, true /* allowRetry */)
		if err != nil {
			return err
		}
		results := searchSet.PopResults()
		if len(results) == 0 {
			// Retry search with significantly higher beam size, but only once.
			if baseBeamSize == initialBeamSize {
				baseBeamSize *= 8
				continue
			}
			// Not found even with the larger beam size; give up and leave a
			// possible dangling reference (see NOTE above).
			return nil
		}

		// Remove the vector from its partition in the store.
		_, err = vi.removeFromPartition(ctx, txn, results[0].ParentPartitionKey, results[0].ChildKey)
		return err
	}
}

// Search finds vectors in the index that are closest to the given query vector
// and returns them in the search set. Set searchSet.MaxResults to limit the
// number of results. This is called within the scope of a transaction so that
Expand Down
57 changes: 39 additions & 18 deletions pkg/sql/vecindex/vector_index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -269,31 +269,52 @@ func (s *testState) Delete(d *datadriven.TestData) string {
}
}

txn := beginTransaction(s.Ctx, s.T, s.InMemStore)
defer commitTransaction(s.Ctx, s.T, s.InMemStore, txn)
for i, line := range strings.Split(d.Input, "\n") {
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
}

// Get root in order to acquire partition lock.
_, err := s.InMemStore.GetPartition(s.Ctx, txn, vecstore.RootKey)
require.NoError(s.T, err)
// If vector to delete has a colon, then its value is specified as well
// as its name. This is useful for forcing a certain value to delete.
var key vecstore.PrimaryKey
var vec vector.T
parts := strings.Split(line, ":")
if len(parts) == 1 {
// Get the value from the store.
key = vecstore.PrimaryKey(line)
vec = s.InMemStore.GetVector(key)
} else {
require.Len(s.T, parts, 2)
// Parse the value after the colon.
key = vecstore.PrimaryKey(parts[0])
vec = s.parseVector(parts[1])
}

if notFound {
for _, line := range strings.Split(d.Input, "\n") {
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
}
// Delete within the scope of a transaction.
txn := beginTransaction(s.Ctx, s.T, s.InMemStore)

// Simulate case where the vector is deleted in the primary index, but
// it cannot be found in the secondary index.
s.InMemStore.DeleteVector(txn, []byte(line))
// If notFound=true, then simulate case where the vector is deleted in
// the primary index, but it cannot be found in the secondary index.
if !notFound {
err := s.Index.Delete(s.Ctx, txn, vec, key)
require.NoError(s.T, err)
}
s.InMemStore.DeleteVector(txn, key)

commitTransaction(s.Ctx, s.T, s.InMemStore, txn)

if (i+1)%s.Options.MaxPartitionSize == 0 {
// Periodically, run synchronous fixups so that test results are
// deterministic.
require.NoError(s.T, s.Index.fixups.runAll(s.Ctx))
}
}

// TODO(andyk): Add code to delete vector from index.
// Handle any remaining fixups.
require.NoError(s.T, s.Index.fixups.runAll(s.Ctx))

tree, err := s.Index.Format(s.Ctx, txn, FormatOptions{PrimaryKeyStrings: true})
require.NoError(s.T, err)
return tree
return s.FormatTree(d)
}

func (s *testState) Recall(d *datadriven.TestData) string {
Expand Down