vectorstores: add mongovector (#1005)

* mongovector: Add mongo vectorstore implementation --------- Co-authored-by: Travis Cline <[email protected]>
tmc · Sep 13, 2024 · 66d7710 · 66d7710
1 parent 2124f7f
commit 66d7710
Show file tree

Hide file tree

Showing 8 changed files with 1,171 additions and 0 deletions.
diff --git a/go.mod b/go.mod
@@ -158,6 +158,7 @@ require (
 	gitlab.com/golang-commonmark/linkify v0.0.0-20191026162114-a0c2df6c8f82 // indirect
 	gitlab.com/golang-commonmark/mdurl v0.0.0-20191124015652-932350d1cb84 // indirect
 	gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f // indirect
+	go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 // indirect
 	go.opencensus.io v0.24.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect

diff --git a/go.sum b/go.sum
@@ -783,6 +783,8 @@ go.mongodb.org/mongo-driver v1.7.5/go.mod h1:VXEWRZ6URJIkUq2SCAyapmhH0ZLRBP+FT4x
 go.mongodb.org/mongo-driver v1.10.0/go.mod h1:wsihk0Kdgv8Kqu1Anit4sfK+22vSFbUrAVEYRhCXrA8=
 go.mongodb.org/mongo-driver v1.14.0 h1:P98w8egYRjYe3XDjxhYJagTokP/H6HzlsnojRgZRd80=
 go.mongodb.org/mongo-driver v1.14.0/go.mod h1:Vzb0Mk/pa7e6cWw85R4F/endUC3u0U9jGcNU603k65c=
+go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 h1:vwKMYa9FCX1OW7efPaH0FUaD6o+WC0kiC7VtHtNX7UU=
+go.mongodb.org/mongo-driver/v2 v2.0.0-beta1/go.mod h1:pfndQmffp38kKjbwVfoavadsdC0Nsg/qb+INK01PNaM=
 go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
 go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 h1:A3SayB3rNyt+1S6qpI9mHPkeHTZbD7XILEqWnYZb2l0=

diff --git a/vectorstores/mongovector/doc.go b/vectorstores/mongovector/doc.go
@@ -0,0 +1,46 @@
+// Package mongovector implements a vector store using MongoDB as the backend.
+//
+// The mongovector package provides a way to store and retrieve document embeddings
+// using MongoDB's vector search capabilities. It implements the VectorStore
+// interface from the vectorstores package, allowing it to be used interchangeably
+// with other vector store implementations.
+//
+// Key features:
+//   - Store document embeddings in MongoDB
+//   - Perform similarity searches on stored embeddings
+//   - Configurable index and path settings
+//   - Support for custom embedding functions
+//
+// Main types:
+//   - Store: The main type that implements the VectorStore interface
+//   - Option: A function type for configuring the Store
+//
+// Usage:
+//
+//	import (
+//	    "github.com/tmc/langchaingo/vectorstores/mongovector"
+//	    "go.mongodb.org/mongo-driver/v2/mongo"
+//	)
+//
+//	// Create a new Store
+//	coll := // ... obtain a *mongo.Collection
+//	embedder := // ... obtain an embeddings.Embedder
+//	store := mongovector.New(coll, embedder)
+//
+//	// Add documents
+//	docs := []schema.Document{
+//	    {PageContent: "Document 1"},
+//	    {PageContent: "Document 2"},
+//	}
+//	ids, err := store.AddDocuments(context.Background(), docs)
+//
+//	// Perform similarity search
+//	results, err := store.SimilaritySearch(context.Background(), "query", 5)
+//
+// The package also provides options for customizing the Store:
+//   - WithIndex: Set a custom index name
+//   - WithPath: Set a custom path for the vector field
+//   - WithNumCandidates: Set the number of candidates for similarity search
+//
+// For more detailed information, see the documentation for individual types and functions.
+package mongovector
diff --git a/vectorstores/mongovector/mock_embedder.go b/vectorstores/mongovector/mock_embedder.go
@@ -0,0 +1,207 @@
+package mongovector
+
+import (
+	"context"
+	"crypto/rand"
+	"fmt"
+	"math/big"
+	"time"
+
+	"github.com/tmc/langchaingo/embeddings"
+	"github.com/tmc/langchaingo/schema"
+	"github.com/tmc/langchaingo/vectorstores"
+)
+
+// mockEmbedder is an embeddings.Embedder test double. It returns one fixed
+// query vector and lazily assigns each registered document a vector, caching
+// it so that repeated embeddings of the same text agree.
+type mockEmbedder struct {
+	// queryVector is what EmbedQuery returns; its length is used as the
+	// embedding dimension for every generated vector.
+	queryVector []float32
+	// docs holds the registered documents, keyed by their PageContent.
+	docs        map[string]schema.Document
+	// docVectors caches the vectors generated so far, keyed by document text.
+	docVectors  map[string][]float32
+}
+
+// Compile-time check that *mockEmbedder satisfies embeddings.Embedder.
+var _ embeddings.Embedder = &mockEmbedder{}
+
+// newMockEmbedder builds a mockEmbedder whose query vector has dim random
+// components and whose document and vector tables start out empty.
+func newMockEmbedder(dim int) *mockEmbedder {
+	return &mockEmbedder{
+		queryVector: newNormalizedVector(dim),
+		docs:        map[string]schema.Document{},
+		docVectors:  map[string][]float32{},
+	}
+}
+
+// mockDocuments registers the given documents with the embedder, keyed by
+// page content; a later document with the same content replaces the earlier
+// one. No vector is created here: EmbedDocuments generates one on first use,
+// chosen so that 0.5 * (1 + vector * queryVector) equals the document's Score.
+func (emb *mockEmbedder) mockDocuments(doc ...schema.Document) {
+	for i := range doc {
+		emb.docs[doc[i].PageContent] = doc[i]
+	}
+}
+
+// existingVectors lists every document vector generated so far, in map
+// iteration order, followed by the query vector. The query vector is included
+// so that newly generated vectors are also projected away from it.
+func (emb *mockEmbedder) existingVectors() [][]float32 {
+	out := make([][]float32, 0, len(emb.docs)+1)
+	for text := range emb.docVectors {
+		out = append(out, emb.docVectors[text])
+	}
+
+	out = append(out, emb.queryVector)
+
+	return out
+}
+
+// EmbedDocuments returns one vector per entry of texts, in the same order.
+//
+// A text that was never registered through mockDocuments gets a zero vector
+// with the query vector's dimension. A registered text gets a vector whose dot
+// product with the query vector is 2*Score - 1; it is generated on first use
+// and cached so later calls return the same values. The returned error is
+// always nil.
+func (emb *mockEmbedder) EmbedDocuments(_ context.Context, texts []string) ([][]float32, error) {
+	dim := len(emb.queryVector)
+	vectors := make([][]float32, len(texts))
+
+	for i, text := range texts {
+		doc, ok := emb.docs[text]
+		if !ok {
+			// Unknown text: hand back a zero vector and stop here. Without
+			// the continue, the code below would replace it with a vector
+			// built from the zero-valued Score of an empty document and
+			// cache that vector under this text.
+			vectors[i] = make([]float32, dim)
+
+			continue
+		}
+
+		// If the vector exists, use it.
+		if existing, ok := emb.docVectors[text]; ok {
+			vectors[i] = existing
+
+			continue
+		}
+
+		// First embedding of this document: start from a random vector
+		// projected away from every vector handed out so far.
+		newVectorBasis := newOrthogonalVector(dim, emb.existingVectors()...)
+
+		// Fix the last component so the dot product with the query vector
+		// encodes the document's score.
+		newVector := dotProductNormFn(doc.Score, emb.queryVector, newVectorBasis)
+
+		vectors[i] = newVector
+		emb.docVectors[text] = newVector
+	}
+
+	return vectors, nil
+}
+
+// EmbedQuery ignores both arguments and always yields the embedder's fixed
+// query vector together with a nil error.
+func (emb *mockEmbedder) EmbedQuery(_ context.Context, _ string) ([]float32, error) {
+	return emb.queryVector, nil
+}
+
+// flushMockDocuments inserts every document registered with emb into store,
+// using emb as the embedder, and then waits ten seconds before returning
+// because consistency on indexes is not synchronous.
+//
+// It returns the error from AddDocuments, or ctx.Err() if ctx is cancelled
+// while waiting, so a cancelled caller is not held for the full delay.
+func flushMockDocuments(ctx context.Context, store Store, emb *mockEmbedder) error {
+	docs := make([]schema.Document, 0, len(emb.docs))
+	for _, doc := range emb.docs {
+		docs = append(docs, doc)
+	}
+
+	if _, err := store.AddDocuments(ctx, docs, vectorstores.WithEmbedder(emb)); err != nil {
+		return err
+	}
+
+	// Consistency on indexes is not synchronous.
+	wait := time.NewTimer(10 * time.Second) // nolint:mnd
+	defer wait.Stop()
+
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-wait.C:
+		return nil
+	}
+}
+
+// newNormalizedFloat32 returns a random float32 in [-1, 1). It draws an
+// integer n uniformly from [0, 2^24) using crypto/rand and maps it to
+// 2*(n/2^24) - 1, so the upper bound 1 is never reached.
+//
+// On failure it returns 0 and an error wrapping the one from rand.Int.
+// nolint:mnd
+func newNormalizedFloat32() (float32, error) {
+	// 2^24 distinct steps: every integer below this bound is exactly
+	// representable as a float32.
+	const steps = 1 << 24
+
+	n, err := rand.Int(rand.Reader, big.NewInt(steps))
+	if err != nil {
+		return 0, fmt.Errorf("generating random float32: %w", err)
+	}
+
+	return 2.0*(float32(n.Int64())/float32(steps)) - 1.0, nil
+}
+
+// dotProduct sums the element-wise products of v1 and v2 over the indices of
+// v1. v2 must be at least as long as v1, otherwise indexing it panics.
+func dotProduct(v1, v2 []float32) float32 {
+	var acc float32
+	for i, x := range v1 {
+		acc += x * v2[i]
+	}
+
+	return acc
+}
+
+// linearlyIndependent reports whether v2 fails to be a scalar multiple of v1,
+// comparing the component ratios v2[i]/v1[i] for exact equality.
+//
+// Whether a ratio has been recorded is tracked with a separate flag rather
+// than by testing the ratio against 0, so a genuine ratio of 0 (v2[i] == 0
+// where v1[i] != 0) is compared like any other value instead of being
+// overwritten by the next one.
+//
+// A position where v1 is 0 but v2 is not makes the function return true, so
+// an all-zero v1 paired with a non-zero v2 is reported as independent. v2
+// must be at least as long as v1.
+func linearlyIndependent(v1, v2 []float32) bool {
+	var (
+		ratio    float32
+		hasRatio bool
+	)
+
+	for i := range v1 {
+		if v1[i] == 0 {
+			// No ratio can be formed here; a non-zero v2 component cannot
+			// be a multiple of zero.
+			if v2[i] != 0 {
+				return true
+			}
+
+			continue
+		}
+
+		r := v2[i] / v1[i]
+		if !hasRatio {
+			ratio, hasRatio = r, true
+
+			continue
+		}
+
+		if r != ratio {
+			return true
+		}
+	}
+
+	return false
+}
+
+// newNormalizedVector returns a slice of dim components, each one produced by
+// a separate call to newNormalizedFloat32.
+func newNormalizedVector(dim int) []float32 {
+	out := make([]float32, dim)
+	for i := 0; i < dim; i++ {
+		// The error is dropped on purpose: this helper has no error
+		// return, and the component keeps whatever value came back.
+		out[i], _ = newNormalizedFloat32()
+	}
+
+	return out
+}
+
+// newOrthogonalVector draws a random vector of length dim and removes from
+// it, one basis vector at a time, its projection onto that vector (a
+// Gram-Schmidt step).
+//
+// The result is orthogonal to every basis vector only when the basis vectors
+// are mutually orthogonal; otherwise a later projection can reintroduce a
+// component along an earlier vector. All-zero basis vectors are skipped,
+// because projecting onto them would divide by zero and fill the candidate
+// with NaN. Each basis vector must have at least dim components.
+func newOrthogonalVector(dim int, basis ...[]float32) []float32 {
+	candidate := newNormalizedVector(dim)
+
+	for _, b := range basis {
+		basisNorm := dotProduct(b, b)
+		if basisNorm == 0 {
+			continue
+		}
+
+		// The projection coefficient is the same for every component, so
+		// compute it once per basis vector.
+		scale := dotProduct(candidate, b) / basisNorm
+		for i := range candidate {
+			candidate[i] -= scale * b[i]
+		}
+	}
+
+	return candidate
+}
+
+// dotProductNormFn overwrites the last component of basis so that
+// qvector * basis = 2*score - 1, and returns basis (modified in place).
+//
+// If the result is a scalar multiple of qvector, the leading components of
+// basis are replaced with fresh random values and the last component is
+// recomputed. Retries are bounded, and a one-component vector is returned as
+// is, because any two one-component vectors are multiples of each other and
+// retrying could never succeed. An empty qvector or basis is returned
+// unchanged.
+//
+// qvector and basis are expected to have the same length, and the last
+// component of qvector must be non-zero for the division to be finite.
+func dotProductNormFn(score float32, qvector, basis []float32) []float32 {
+	if len(qvector) == 0 || len(basis) == 0 {
+		return basis
+	}
+
+	// Upper bound on regeneration attempts; random leading components are
+	// expected to break the dependency on the first retry.
+	const maxAttempts = 16
+
+	last := len(qvector) - 1
+
+	for attempt := 0; ; attempt++ {
+		var sum float32
+
+		// Partial dot product over every component except the last.
+		for i := 0; i < last; i++ {
+			sum += qvector[i] * basis[i]
+		}
+
+		// Calculate v_{2, dim} such that v1 * v2 = 2S - 1:
+		basis[len(basis)-1] = (2*score - 1 - sum) / qvector[last]
+
+		if last == 0 || attempt >= maxAttempts || linearlyIndependent(qvector, basis) {
+			return basis
+		}
+
+		// The vectors are not linearly independent: regenerate the first
+		// dim-1 elements of v2 and try again.
+		copy(basis[:last], newNormalizedVector(last))
+	}
+}
diff --git a/vectorstores/mongovector/mock_llm.go b/vectorstores/mongovector/mock_llm.go
@@ -0,0 +1,38 @@
+package mongovector
+
+import (
+	"context"
+
+	"github.com/tmc/langchaingo/embeddings"
+)
+
+// mockLLM is an embeddings.EmbedderClient test double that hands out a random
+// vector for each distinct text and repeats that same vector whenever the
+// text is seen again.
+//
+// NOTE(review): the original comment describes this as mocking the OpenAI
+// text-embedding-3-small algorithm; nothing in this type besides the
+// caller-chosen dim relates to that model, so confirm against the callers.
+type mockLLM struct {
+	// seen caches the vector handed out for each text; it is created lazily
+	// by CreateEmbedding.
+	seen map[string][]float32
+	// dim is the length of every newly generated vector.
+	dim  int
+}
+
+// Compile-time check that *mockLLM satisfies embeddings.EmbedderClient.
+var _ embeddings.EmbedderClient = &mockLLM{}
+
+// CreateEmbedding returns one vector per entry of texts, in the same order.
+// A text that already has a non-empty cached vector gets that vector back;
+// any other text gets a new random vector of length dim, which is then
+// cached. The returned error is always nil.
+func (emb *mockLLM) CreateEmbedding(_ context.Context, texts []string) ([][]float32, error) {
+	if emb.seen == nil {
+		emb.seen = make(map[string][]float32)
+	}
+
+	out := make([][]float32, 0, len(texts))
+	for _, text := range texts {
+		vec := emb.seen[text]
+		if len(vec) == 0 {
+			vec = newNormalizedVector(emb.dim)
+			emb.seen[text] = vec // ensure consistency
+		}
+
+		out = append(out, vec)
+	}
+
+	return out, nil
+}