Skip to content

Commit

Permalink
vectorstores: add mongovector (#1005)
Browse files Browse the repository at this point in the history
* mongovector: Add mongo vectorstore implementation

---------

Co-authored-by: Travis Cline <[email protected]>
  • Loading branch information
prestonvasquez and tmc authored Sep 13, 2024
1 parent 2124f7f commit 66d7710
Show file tree
Hide file tree
Showing 8 changed files with 1,171 additions and 0 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ require (
gitlab.com/golang-commonmark/linkify v0.0.0-20191026162114-a0c2df6c8f82 // indirect
gitlab.com/golang-commonmark/mdurl v0.0.0-20191124015652-932350d1cb84 // indirect
gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f // indirect
go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,8 @@ go.mongodb.org/mongo-driver v1.7.5/go.mod h1:VXEWRZ6URJIkUq2SCAyapmhH0ZLRBP+FT4x
go.mongodb.org/mongo-driver v1.10.0/go.mod h1:wsihk0Kdgv8Kqu1Anit4sfK+22vSFbUrAVEYRhCXrA8=
go.mongodb.org/mongo-driver v1.14.0 h1:P98w8egYRjYe3XDjxhYJagTokP/H6HzlsnojRgZRd80=
go.mongodb.org/mongo-driver v1.14.0/go.mod h1:Vzb0Mk/pa7e6cWw85R4F/endUC3u0U9jGcNU603k65c=
go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 h1:vwKMYa9FCX1OW7efPaH0FUaD6o+WC0kiC7VtHtNX7UU=
go.mongodb.org/mongo-driver/v2 v2.0.0-beta1/go.mod h1:pfndQmffp38kKjbwVfoavadsdC0Nsg/qb+INK01PNaM=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 h1:A3SayB3rNyt+1S6qpI9mHPkeHTZbD7XILEqWnYZb2l0=
Expand Down
46 changes: 46 additions & 0 deletions vectorstores/mongovector/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Package mongovector implements a vector store using MongoDB as the backend.
//
// The mongovector package provides a way to store and retrieve document embeddings
// using MongoDB's vector search capabilities. It implements the VectorStore
// interface from the vectorstores package, allowing it to be used interchangeably
// with other vector store implementations.
//
// Key features:
// - Store document embeddings in MongoDB
// - Perform similarity searches on stored embeddings
// - Configurable index and path settings
// - Support for custom embedding functions
//
// Main types:
// - Store: The main type that implements the VectorStore interface
// - Option: A function type for configuring the Store
//
// Usage:
//
// import (
// "github.com/tmc/langchaingo/vectorstores/mongovector"
// "go.mongodb.org/mongo-driver/v2/mongo"
// )
//
// // Create a new Store
// coll := // ... obtain a *mongo.Collection
// embedder := // ... obtain an embeddings.Embedder
// store := mongovector.New(coll, embedder)
//
// // Add documents
// docs := []schema.Document{
// {PageContent: "Document 1"},
// {PageContent: "Document 2"},
// }
// ids, err := store.AddDocuments(context.Background(), docs)
//
// // Perform similarity search
// results, err := store.SimilaritySearch(context.Background(), "query", 5)
//
// The package also provides options for customizing the Store:
// - WithIndex: Set a custom index name
// - WithPath: Set a custom path for the vector field
// - WithNumCandidates: Set the number of candidates for similarity search
//
// For more detailed information, see the documentation for individual types and functions.
package mongovector
207 changes: 207 additions & 0 deletions vectorstores/mongovector/mock_embedder.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
package mongovector

import (
"context"
"crypto/rand"
"fmt"
"math/big"
"time"

"github.com/tmc/langchaingo/embeddings"
"github.com/tmc/langchaingo/schema"
"github.com/tmc/langchaingo/vectorstores"
)

// mockEmbedder is a test double for embeddings.Embedder. It hands out a fixed
// query vector and lazily builds one vector per registered document, derived
// from that document's Score.
type mockEmbedder struct {
// queryVector is the vector returned by EmbedQuery; document vectors are
// built relative to it.
queryVector []float32
// docs holds the registered documents keyed by their PageContent.
docs map[string]schema.Document
// docVectors caches the vector generated for each text so repeated
// EmbedDocuments calls return the same slice.
docVectors map[string][]float32
}

// Compile-time check that *mockEmbedder satisfies embeddings.Embedder.
var _ embeddings.Embedder = &mockEmbedder{}

// newMockEmbedder builds a mockEmbedder whose query vector has dim random
// components and whose document and vector maps start out empty.
func newMockEmbedder(dim int) *mockEmbedder {
	return &mockEmbedder{
		queryVector: newNormalizedVector(dim),
		docs:        map[string]schema.Document{},
		docVectors:  map[string][]float32{},
	}
}

// mockDocuments registers the given documents with the embedder, keyed by
// their PageContent. A later document with the same PageContent replaces the
// earlier one. No vector is created here: EmbedDocuments builds it on first
// use from the document's Score.
func (emb *mockEmbedder) mockDocuments(doc ...schema.Document) {
	for i := range doc {
		emb.docs[doc[i].PageContent] = doc[i]
	}
}

// existingVectors collects every document vector generated so far, followed
// by the query vector as the final element. The document vectors come out in
// map iteration order, which is unspecified.
func (emb *mockEmbedder) existingVectors() [][]float32 {
	all := make([][]float32, 0, len(emb.docs)+1)

	for text := range emb.docVectors {
		all = append(all, emb.docVectors[text])
	}

	all = append(all, emb.queryVector)

	return all
}

// EmbedDocuments returns one vector per entry of texts, in the same order.
//
// For a text that was never registered through mockDocuments the result is a
// zero vector with the same length as the query vector; nothing is cached for
// it. For a registered text the vector is generated on first use from the
// document's Score, cached in docVectors, and returned unchanged on every
// later call. The returned error is always nil.
func (emb *mockEmbedder) EmbedDocuments(_ context.Context, texts []string) ([][]float32, error) {
	vectors := make([][]float32, len(texts))

	for i, text := range texts {
		doc, known := emb.docs[text]
		if !known {
			// Unknown text: hand back the documented zero vector and stop
			// here, so no scored vector is generated or cached for it.
			vectors[i] = make([]float32, len(emb.queryVector))

			continue
		}

		// Reuse the vector generated by an earlier call, if any.
		if existing, ok := emb.docVectors[text]; ok {
			vectors[i] = existing

			continue
		}

		// Start from a vector projected away from everything generated so far.
		newVectorBasis := newOrthogonalVector(len(emb.queryVector), emb.existingVectors()...)

		// Adjust it so its dot product with the query vector encodes the score.
		newVector := dotProductNormFn(doc.Score, emb.queryVector, newVectorBasis)

		vectors[i] = newVector
		emb.docVectors[text] = newVector
	}

	return vectors, nil
}

// EmbedQuery ignores its arguments and always yields the embedder's fixed
// query vector together with a nil error. The slice is returned as-is, not
// copied.
func (emb *mockEmbedder) EmbedQuery(_ context.Context, _ string) ([]float32, error) {
	query := emb.queryVector

	return query, nil
}

// flushMockDocuments inserts every document registered on emb into store,
// using emb as the embedder, and then waits ten seconds to give the index
// time to catch up with the inserted documents.
//
// It returns the error from AddDocuments unchanged. If ctx is cancelled while
// waiting, the wait ends early and ctx.Err() is returned.
func flushMockDocuments(ctx context.Context, store Store, emb *mockEmbedder) error {
	docs := make([]schema.Document, 0, len(emb.docs))
	for _, doc := range emb.docs {
		docs = append(docs, doc)
	}

	if _, err := store.AddDocuments(ctx, docs, vectorstores.WithEmbedder(emb)); err != nil {
		return err
	}

	// Consistency on indexes is not synchronous, so pause before returning.
	// A timer is used instead of time.Sleep so cancellation is honoured.
	// nolint:mnd
	timer := time.NewTimer(10 * time.Second)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-timer.C:
		return nil
	}
}

// newNormalizedFloat32 generates a random float32 in the half-open interval
// [-1, 1) from crypto/rand.
//
// A 24-bit integer is drawn so that it fits a float32 mantissa exactly, then
// mapped linearly onto the interval. On failure of the random source the
// result is 0 and the returned error wraps the underlying cause.
// nolint:mnd
func newNormalizedFloat32() (float32, error) {
	const scale = 1 << 24

	n, err := rand.Int(rand.Reader, big.NewInt(scale))
	if err != nil {
		return 0, fmt.Errorf("failed to normalize float32: %w", err)
	}

	return 2.0*(float32(n.Int64())/float32(scale)) - 1.0, nil
}

// dotProduct computes the sum of the element-wise products of v1 and v2.
// Iteration is driven by v1, so v2 must be at least as long as v1; extra
// elements of v2 are ignored. Empty input yields 0.
func dotProduct(v1, v2 []float32) float32 {
	var acc float32

	for idx, x := range v1 {
		acc += x * v2[idx]
	}

	return acc
}

// linearlyIndependent reports whether v1 and v2 are linearly independent,
// i.e. whether neither is a scalar multiple of the other.
//
// The first non-zero component of v1 fixes the reference ratio v2[i]/v1[i];
// the vectors are independent as soon as another component disagrees with
// that ratio, or v2 is non-zero where v1 is zero. A zero v1 (including empty
// input) is dependent on every vector, so false is returned. Comparisons are
// exact float32 equality. v2 must be at least as long as v1.
func linearlyIndependent(v1, v2 []float32) bool {
	// Locate the reference component explicitly rather than treating a ratio
	// of 0 as "not set yet": 0 is a legitimate ratio.
	pivot := -1

	for i, x := range v1 {
		if x != 0 {
			pivot = i

			break
		}
	}

	if pivot < 0 {
		return false
	}

	ratio := v2[pivot] / v1[pivot]

	for i := range v1 {
		if v1[i] == 0 {
			// v1 has a non-zero component elsewhere, so no multiple of v1 can
			// be non-zero here.
			if v2[i] != 0 {
				return true
			}

			continue
		}

		if v2[i]/v1[i] != ratio {
			return true
		}
	}

	return false
}

// newNormalizedVector builds a slice of dim components, each produced by an
// independent call to newNormalizedFloat32. The vector is not rescaled to
// unit length.
func newNormalizedVector(dim int) []float32 {
	out := make([]float32, dim)

	for idx := 0; idx < dim; idx++ {
		// The error is discarded; the value returned alongside it is stored
		// as-is.
		component, _ := newNormalizedFloat32()
		out[idx] = component
	}

	return out
}

// newOrthogonalVector draws a random vector of length dim and removes from
// it, one basis vector at a time, its projection onto that vector
// (Gram-Schmidt style).
//
// The result is orthogonal to the last non-zero basis vector processed. It is
// orthogonal to every basis vector only when the basis vectors are mutually
// orthogonal, because each subtraction can reintroduce a component along an
// earlier, non-orthogonal one. Zero-norm basis vectors are skipped, since
// projecting onto them is undefined. Each basis vector must have at least dim
// elements.
func newOrthogonalVector(dim int, basis ...[]float32) []float32 {
	candidate := newNormalizedVector(dim)

	for _, b := range basis {
		basisNorm := dotProduct(b, b)
		if basisNorm == 0 {
			// Dividing by a zero norm would turn the whole candidate into NaN.
			continue
		}

		// The projection coefficient is the same for every component.
		coeff := dotProduct(candidate, b) / basisNorm

		for i := range candidate {
			candidate[i] -= coeff * b[i]
		}
	}

	return candidate
}

// dotProductNormFn adjusts basis in place so that qvector * basis = 2S - 1,
// where S is score, and returns the same slice.
//
// The leading components of basis are kept and its last component is solved
// for. If the result is a scalar multiple of qvector, the first component is
// shifted by one and the last component is solved again. The shift adds a
// non-zero part orthogonal to qvector, so the multiple is broken while the
// dot product is preserved. For a single-element qvector no independent
// vector exists, so the solved value is returned directly. An empty qvector
// returns basis untouched.
//
// qvector and basis are expected to have the same length. The last component
// of qvector is used as a divisor, so it should be non-zero.
func dotProductNormFn(score float32, qvector, basis []float32) []float32 {
	if len(qvector) == 0 {
		return basis
	}

	head := qvector[:len(qvector)-1]

	// solveLast sets the final component so the dot product equals 2S - 1.
	solveLast := func() {
		var sum float32

		for i := range head {
			sum += head[i] * basis[i]
		}

		basis[len(basis)-1] = (2*score - 1 - sum) / qvector[len(qvector)-1]
	}

	solveLast()

	if len(head) == 0 || linearlyIndependent(qvector, basis) {
		return basis
	}

	// Retrying with unchanged inputs would give the same dependent vector, so
	// change one leading component before solving again.
	basis[0]++

	solveLast()

	return basis
}
38 changes: 38 additions & 0 deletions vectorstores/mongovector/mock_llm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package mongovector

import (
"context"

"github.com/tmc/langchaingo/embeddings"
)

// mockLLM will create consistent text embeddings mocking the OpenAI
// text-embedding-3-small algorithm. "Consistent" means the same input text
// always maps to the same vector for the lifetime of the value.
// NOTE(review): the vectors are random, so presumably only the output shape
// mimics the real model — confirm.
type mockLLM struct {
// seen caches the vector handed out for each text; CreateEmbedding
// allocates it on first use when nil.
seen map[string][]float32
// dim is the length passed to newNormalizedVector for new vectors.
dim int
}

// Compile-time check that *mockLLM satisfies embeddings.EmbedderClient.
var _ embeddings.EmbedderClient = &mockLLM{}

// CreateEmbedding produces one vector per entry of texts, in order. A text
// that already has a non-empty cached vector gets that same vector back;
// otherwise a fresh random vector of length dim is generated and cached. The
// cache is allocated on first use and the returned error is always nil.
func (emb *mockLLM) CreateEmbedding(_ context.Context, texts []string) ([][]float32, error) {
	if emb.seen == nil {
		emb.seen = map[string][]float32{}
	}

	out := make([][]float32, 0, len(texts))

	for _, text := range texts {
		vec := emb.seen[text]
		if len(vec) == 0 {
			vec = newNormalizedVector(emb.dim)
			emb.seen[text] = vec // ensure consistency
		}

		out = append(out, vec)
	}

	return out, nil
}
Loading

0 comments on commit 66d7710

Please sign in to comment.