diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b2c58574..1e37d6c6 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -105,8 +105,6 @@ jobs: strategy: matrix: platform: - - runner: macos-12 - target: x86_64 - runner: macos-14 target: aarch64 steps: diff --git a/README.md b/README.md index 625e7fda..fb89f98f 100644 --- a/README.md +++ b/README.md @@ -21,14 +21,14 @@

- 🦀 Rust-powered Framework for Lightning-Fast Ingestion, Inference, and Indexing + Inference, ingestion, and indexing – supercharged by Rust 🦀
Explore the docs »

View Demo · - Examples + Benches · Vector Streaming Adapters . @@ -83,7 +83,7 @@ EmbedAnything is a minimalist, highly performant, lightning-fast, lightweight, m ## 💡What is Vector Streaming -Vector Streaming enables you to process and generate embeddings for files and stream them, so if you have 10 GB of file, it can continuously generate embeddings Chunk by Chunk, that you can segment semantically, and store them in the vector database of your choice, Thus it eliminates bulk embeddings storage on RAM at once. +Vector Streaming enables you to process and generate embeddings for files and stream them, so if you have 10 GB of files, it can continuously generate embeddings chunk by chunk, which you can segment semantically and store in the vector database of your choice, thus eliminating the need to hold all the embeddings in RAM at once. The embedding process happens separately from the main process, so as to maintain high performance, enabled by Rust's MPSC channels. [![EmbedAnythingXWeaviate](https://res.cloudinary.com/dltwftrgc/image/upload/v1731166897/demo_o8auu4.gif)](https://www.youtube.com/watch?v=OJRWPLQ44Dw) @@ -107,7 +107,7 @@ model = EmbeddingModel.from_pretrained_hf( WhichModel.Bert, model_id="model link from huggingface" ) config = TextEmbedConfig(chunk_size=200, batch_size=32) -data = embed_anything.embed_file("file_address", embeder=model, config=config) +data = embed_anything.embed_file("file_address", embedder=model, config=config) ``` @@ -190,7 +190,7 @@ pip install embed-anything-gpu model = EmbeddingModel.from_pretrained_local( WhichModel.Bert, model_id="Hugging_face_link" ) -data = embed_anything.embed_file("test_files/test.pdf", embeder=model) +data = embed_anything.embed_file("test_files/test.pdf", embedder=model) ``` @@ -206,11 +206,11 @@ model = embed_anything.EmbeddingModel.from_pretrained_local( model_id="openai/clip-vit-base-patch16", # revision="refs/pr/15", ) -data: list[EmbedData] = embed_anything.embed_directory("test_files", embeder=model) +data: list[EmbedData] = embed_anything.embed_directory("test_files", embedder=model) embeddings = np.array([data.embedding for data in data]) query = ["Photo of a monkey?"] query_embedding = np.array( - embed_anything.embed_query(query, embeder=model)[0].embedding + embed_anything.embed_query(query, embedder=model)[0].embedding ) similarities = np.dot(embeddings, query_embedding) max_index = np.argmax(similarities) @@ -233,7 +233,7 @@ from embed_anything import ( audio_decoder = AudioDecoderModel.from_pretrained_hf( "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False ) -embeder = EmbeddingModel.from_pretrained_hf( +embedder = EmbeddingModel.from_pretrained_hf( embed_anything.WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", @@ -242,7 +242,7 @@ config = TextEmbedConfig(chunk_size=200, batch_size=32) data = embed_anything.embed_audio_file( "test_files/audio/samples_hp0.wav", audio_decoder=audio_decoder, - embeder=embeder, + embedder=embedder, text_embed_config=config, ) print(data[0].metadata) diff --git a/docs/blog/posts/Journey.md b/docs/blog/posts/Journey.md new file mode 100644 index 00000000..6dd1336c --- /dev/null +++ b/docs/blog/posts/Journey.md @@ -0,0 +1,77 @@ +--- +draft: false +date: 2024-12-15 +authors: + - akshay + - sonam +slug: embed-anything +title: The path ahead of EmbedAnything +--- +In March, we set out to build a local file search app. We aimed to create a tool that would make file searching faster, smarter, and more efficient.
However, we quickly hit a roadblock: no high-performance backend fit our needs. + +![image.png](https://royal-hygienic-522.notion.site/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2Ff1bf59bf-2c3f-4b4d-a5f9-109d041ef45a%2Faa8abe48-4210-494c-af98-458b6694b09a%2Fimage.png?table=block&id=15d81b6a-6bbe-80cc-883e-fcafd65e619d&spaceId=f1bf59bf-2c3f-4b4d-a5f9-109d041ef45a&width=1420&userId=&cache=v2) + +### Coming Up Short on a Backend + +Initially, we experimented with LlamaIndex, hoping it would provide the required speed and reliability. Unfortunately, it fell short. Its performance didn’t meet our expectations, and its heavy dependencies added unnecessary complexity to our stack. We realized we needed a better solution. + +Around the same time, we discovered **Candle**, a Rust-based framework for transformer model inference. Candle stood out with its remarkable speed and minimal dependency footprint. It was exactly what we were looking for: a high-performing, lightweight backend that aligned with our vision for a seamless file search experience. + +### Experimentation and Breakthroughs + +Excited by Candle’s potential, we experimented to see how well it could handle our use case. The results were outstanding. Candle’s blazing-fast inference speeds and low resource demands enabled us to build a prototype that surpassed our initial performance goals. + +With a working prototype, we decided to share it with the world. We knew a compelling demonstration could capture attention and validate our efforts. The next step was to make a splash with our launch. + +### Demo Released + +On **April 2nd**, we unveiled our demo online, carefully choosing the date to avoid confusion with April Fool’s Day. We created an engaging demo video to highlight the app’s capabilities and shared it on Twitter. What happened next exceeded all our expectations. + +The demo received an overwhelming response. What began as a simple showcase of our prototype transformed into a pivotal moment for our project. Seeing the demand and people’s interest, we released it as an open-source project within the next 30 days. + +[Demo](https://www.youtube.com/watch?v=HLXIuznnXcI) +### 0.2 Released + +Since then, we have never looked back. We kept making EmbedAnything better and better. In the next three months, we released a more stable version, 0.2, with support for all the Python versions. It ran amazingly on AWS and could support multimodality. + +At the same time, we realized that people wanted an end-to-end solution, not just an embedding generation platform. So we tried to integrate a vector database, but we realized that it would just make our library heavier and not deliver the value we were looking for, which was confirmed by this discussion opened on our GitHub. + +[—GitHub discussion](https://github.com/StarlightSearch/EmbedAnything/discussions/44#discussion-6953627) + +Akshay started looking for ways to index embeddings without requiring a vector database as a dependency, and he came up with a brilliant method that enhanced performance and made indexing extremely memory efficient. + +And thus, vector streaming was born. + +— [vector streaming blog](https://starlight-search.com/blog/2024/01/31/vector-streaming/) + +### 0.3 Release + +It was time to release 0.3 because we had undergone a major code refactoring. All the major functions were refactored, making model calls more intuitive and optimized. Check out our docs and usage. We also added an audio modality and more types of ingestion.
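+ +To give a flavour of the refactored surface, here is a minimal sketch of the post-0.3 Python usage (our illustration, adapted from the README example shipped in this release; the model id and file path are placeholders):
+
+```python
+import embed_anything
+from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel
+
+# Load a BERT-style model from the Hugging Face Hub
+model = EmbeddingModel.from_pretrained_hf(
+    WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2"
+)
+
+# Chunking and batching are configured in one place
+config = TextEmbedConfig(chunk_size=200, batch_size=32)
+
+# Embed a file; note the `embedder` keyword (previously spelled `embeder`)
+data = embed_anything.embed_file("test_files/test.pdf", embedder=model, config=config)
+```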
+ +We had only supported dense embeddings, so we expanded the types of embeddings we could generate: we added sparse embeddings and started supporting ColPali, ONNX, and Candle. + +## What We Got Right + +We actively listened to our community and prioritized their needs in the library's development. When users requested support for sparse matrices in hybrid models, we delivered. When they wanted advanced indexing, we made it happen. During the critical three-month period between versions 0.2 and 0.4, our efforts were laser-focused on enhancing the product to meet and exceed expectations. + +We also released benches comparing it with other inference libraries, and to our surprise it's faster than libraries like Sentence Transformers and FastEmbed. Check out [Benches](https://colab.research.google.com/drive/1nXvd25hDYO-j7QGOIIC0M7MDpovuPCaD?usp=sharing). + + +We presented EmbedAnything at many conferences and meetups, like PyData Global, Elastic, Voxel51, and AI Builders. Additionally, we forged collaborations with major brands like Weaviate and Elastic, a strategy we’re excited to continue expanding in 2025. + +[Weaviate Collab](https://www.youtube.com/watch?v=OJRWPLQ44Dw) + + +## What We Initially Got Wrong + +In hindsight, one significant mistake was prematurely releasing the library before it was ready for production. As the saying goes, “You never get a second chance to make a first impression,” and this holds true even for open-source projects. + +The library was unusable on macOS for the first three months, and we only released compatibility with Python 3.10. We didn’t focus enough on how we were rolling out updates, partly because we never anticipated the overwhelming rate of experimentation and interest it would receive right from the start. + +I intended to foster a “build in public” project, encouraging collaboration and rapid iteration. I wanted to showcase how quickly we could improve and refine this amazing library. + +### In the Year 2025 + +We are committed to applying everything we’ve learned from this journey and doubling down on what truly matters: our hero, the product. In the grand scheme of things, nothing else is as important. Moving forward, we’re also excited to announce even more collaborations with amazing brands, further expanding the impact and reach of our work. + +Heartfelt thanks to all our amazing contributors and stargazers for your unwavering support and dedication to *EmbedAnything*. Your continuous experimentation and feedback inspire us to keep refining and enhancing the library with every iteration. We deeply appreciate your efforts in making this journey truly collaborative. Let’s go from 100k+ to a million downloads this year!
\ No newline at end of file diff --git a/docs/blog/posts/embed-anything.md b/docs/blog/posts/embed-anything.md index f4266507..9c545325 100644 --- a/docs/blog/posts/embed-anything.md +++ b/docs/blog/posts/embed-anything.md @@ -1,6 +1,6 @@ --- draft: false -date: 2024-01-31 +date: 2024-03-31 authors: - akshay - sonam @@ -115,7 +115,7 @@ model = EmbeddingModel.from_pretrained_hf( WhichModel.Bert, model_id="model link from huggingface" ) config = TextEmbedConfig(chunk_size=200, batch_size=32) -data = embed_anything.embed_file("file_address", embeder=model, config=config) +data = embed_anything.embed_file("file_address", embedder=model, config=config) ``` You can check out the documentation at https://starlight-search.com/references/ diff --git a/docs/blog/posts/vector-streaming.md b/docs/blog/posts/vector-streaming.md index c6b8a2be..abc2c35e 100644 --- a/docs/blog/posts/vector-streaming.md +++ b/docs/blog/posts/vector-streaming.md @@ -1,6 +1,6 @@ --- draft: false -date: 2024-01-31 +date: 2024-03-31 authors: - akshay - sonam @@ -114,7 +114,7 @@ model = embed_anything.EmbeddingModel.from_pretrained_cloud( data = embed_anything.embed_image_directory( "\image_directory", - embeder=model, + embedder=model, adapter=weaviate_adapter, config=embed_anything.ImageEmbedConfig(buffer_size=100), ) @@ -124,7 +124,7 @@ data = embed_anything.embed_image_directory( #### Step 4: Query the Vector Database ```python -query_vector = embed_anything.embed_query(["image of a cat"], embeder=model)[0].embedding +query_vector = embed_anything.embed_query(["image of a cat"], embedder=model)[0].embedding ``` #### Step 5: Query the Vector Database diff --git a/docs/index.md b/docs/index.md index 02643ed6..e45cf78e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -119,7 +119,7 @@ pip install embed-anything-gpu model = EmbeddingModel.from_pretrained_local( WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2" ) -data = embed_anything.embed_file("test_files/test.pdf", embeder=model) +data = embed_anything.embed_file("test_files/test.pdf", embedder=model) ``` @@ -162,11 +162,11 @@ model = embed_anything.EmbeddingModel.from_pretrained_local( model_id="openai/clip-vit-base-patch16", # revision="refs/pr/15", ) -data: list[EmbedData] = embed_anything.embed_directory("test_files", embeder=model) +data: list[EmbedData] = embed_anything.embed_directory("test_files", embedder=model) embeddings = np.array([data.embedding for data in data]) query = ["Photo of a monkey?"] query_embedding = np.array( - embed_anything.embed_query(query, embeder=model)[0].embedding + embed_anything.embed_query(query, embedder=model)[0].embedding ) similarities = np.dot(embeddings, query_embedding) max_index = np.argmax(similarities) @@ -199,7 +199,7 @@ jina_config = JinaConfig( config = EmbedConfig(jina=jina_config, audio_decoder=audio_decoder_config) data = embed_anything.embed_file( - "test_files/audio/samples_hp0.wav", embeder="Audio", config=config + "test_files/audio/samples_hp0.wav", embedder="Audio", config=config ) print(data[0].metadata) end_time = time.time() diff --git a/examples/adapters/elastic.py b/examples/adapters/elastic.py index 0600090f..4b70f606 100644 --- a/examples/adapters/elastic.py +++ b/examples/adapters/elastic.py @@ -77,7 +77,7 @@ def upsert(self, data: List[Dict]): data = embed_anything.embed_file( "/path/to/my-file.pdf", - embeder="Bert", + embedder="Bert", adapter=elasticsearch_adapter, config=embed_config, ) diff --git a/examples/adapters/pinecone_db.py b/examples/adapters/pinecone_db.py index 
de054f7e..7545bc7e 100644 --- a/examples/adapters/pinecone_db.py +++ b/examples/adapters/pinecone_db.py @@ -123,7 +123,7 @@ def upsert(self, data: List[Dict]): data = embed_anything.embed_image_directory( "test_files", - embeder=clip_model, + embedder=clip_model, adapter=pinecone_adapter, config=embed_config, ) diff --git a/examples/adapters/weaviate_db.py b/examples/adapters/weaviate_db.py index e0dbf456..1fc54780 100644 --- a/examples/adapters/weaviate_db.py +++ b/examples/adapters/weaviate_db.py @@ -65,10 +65,10 @@ def delete_index(self, index_name: str): data = embed_anything.embed_directory( - "test_files", embeder=model, adapter=weaviate_adapter + "test_files", embedder=model, adapter=weaviate_adapter ) -query_vector = embed_anything.embed_query(["What is attention"], embeder=model)[ +query_vector = embed_anything.embed_query(["What is attention"], embedder=model)[ 0 ].embedding diff --git a/examples/audio.py b/examples/audio.py index 5277d0da..ba000aee 100644 --- a/examples/audio.py +++ b/examples/audio.py @@ -14,7 +14,7 @@ "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False ) -embeder = EmbeddingModel.from_pretrained_hf( +embedder = EmbeddingModel.from_pretrained_hf( embed_anything.WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", @@ -24,7 +24,7 @@ data = embed_anything.embed_audio_file( "test_files/audio/samples_hp0.wav", audio_decoder=audio_decoder, - embeder=embeder, + embedder=embedder, text_embed_config=config, ) print(data[0].metadata) diff --git a/examples/clip.py b/examples/clip.py index ca4b14cd..61dcd80c 100644 --- a/examples/clip.py +++ b/examples/clip.py @@ -11,7 +11,7 @@ model_id="openai/clip-vit-base-patch16", ) data: list[EmbedData] = embed_anything.embed_image_directory( - "test_files", embeder=model + "test_files", embedder=model ) # Convert the embeddings to a numpy array @@ -22,7 +22,7 @@ # Embed a query query = ["Photo of a monkey?"] query_embedding = np.array( - embed_anything.embed_query(query, embeder=model)[0].embedding + embed_anything.embed_query(query, embedder=model)[0].embedding ) # Calculate the similarities between the query embedding and all the embeddings diff --git a/examples/hybridsearch.py b/examples/hybridsearch.py index c3d635d4..855cabc8 100644 --- a/examples/hybridsearch.py +++ b/examples/hybridsearch.py @@ -50,16 +50,16 @@ WhichModel.Jina, model_id="jinaai/jina-embeddings-v2-small-en" ) -jina_embedddings = embed_anything.embed_query(sentences, embeder=jina_model) -jina_query = embed_anything.embed_query(query_text, embeder=jina_model)[0] +jina_embedddings = embed_anything.embed_query(sentences, embedder=jina_model) +jina_query = embed_anything.embed_query(query_text, embedder=jina_model)[0] splade_model = EmbeddingModel.from_pretrained_hf( WhichModel.SparseBert, "prithivida/Splade_PP_en_v1" ) -jina_embedddings = embed_anything.embed_query(sentences, embeder=jina_model) +jina_embedddings = embed_anything.embed_query(sentences, embedder=jina_model) -splade_query = embed_anything.embed_query(query_text, embeder=splade_model) +splade_query = embed_anything.embed_query(query_text, embedder=splade_model) client.query_points( collection_name="my-hybrid-collection", diff --git a/examples/onnx_models.py b/examples/onnx_models.py index 312556cc..a6752f8f 100644 --- a/examples/onnx_models.py +++ b/examples/onnx_models.py @@ -25,7 +25,7 @@ "The dog is sitting in the park", ] -embedddings = embed_query(sentences, embeder=model) +embedddings = embed_query(sentences, embedder=model) 
embed_vector = np.array([e.embedding for e in embedddings]) diff --git a/examples/semantic_chunking.py b/examples/semantic_chunking.py index 7575f60a..20e59a5c 100644 --- a/examples/semantic_chunking.py +++ b/examples/semantic_chunking.py @@ -16,7 +16,7 @@ semantic_encoder=semantic_encoder, ) -data = embed_anything.embed_file("test_files/bank.txt", embeder=model, config=config) +data = embed_anything.embed_file("test_files/bank.txt", embedder=model, config=config) for d in data: print(d.text) diff --git a/examples/splade.py b/examples/splade.py index 4f806614..40b7738f 100644 --- a/examples/splade.py +++ b/examples/splade.py @@ -22,7 +22,7 @@ "Do you like pizza?", ] -embedddings = embed_query(sentences, embeder=model) +embedddings = embed_query(sentences, embedder=model) embed_vector = np.array([e.embedding for e in embedddings]) diff --git a/examples/text.py b/examples/text.py index 15225727..296bff83 100644 --- a/examples/text.py +++ b/examples/text.py @@ -21,7 +21,7 @@ def embed_directory_example(): # Embed all files in a directory data: list[EmbedData] = embed_anything.embed_directory( - "bench", embeder=model, config=config + "bench", embedder=model, config=config ) # End timing @@ -39,7 +39,7 @@ def embed_query_example(): # Embed a query embeddings: EmbedData = embed_anything.embed_query( - ["Hello world my"], embeder=model, config=config + ["Hello world my"], embedder=model, config=config )[0] # Print the shape of the embedding @@ -48,7 +48,7 @@ def embed_query_example(): # Embed another query and print the result print( embed_anything.embed_query( - ["What is the capital of India?"], embeder=model, config=config + ["What is the capital of India?"], embedder=model, config=config ) ) @@ -62,7 +62,7 @@ def embed_file_example(): # Embed a single file data: list[EmbedData] = embed_anything.embed_file( - "test_files/bank.txt", embeder=model, config=config + "test_files/bank.txt", embedder=model, config=config ) # Print the embedded data diff --git a/examples/text_ocr.py b/examples/text_ocr.py index a0db094d..f01a68bf 100644 --- a/examples/text_ocr.py +++ b/examples/text_ocr.py @@ -22,7 +22,7 @@ data: list[EmbedData] = embed_anything.embed_file( "/home/akshay/projects/starlaw/src-server/test_files/court.pdf", # Replace with your file path - embeder=model, + embedder=model, config=config, ) end = time() diff --git a/examples/web.py b/examples/web.py index e877e084..dcb55d88 100644 --- a/examples/web.py +++ b/examples/web.py @@ -1,3 +1,3 @@ import embed_anything -data = embed_anything.embed_webpage("https://www.akshaymakes.com/", embeder="Bert") +data = embed_anything.embed_webpage("https://www.akshaymakes.com/", embedder="Bert") diff --git a/python/python/embed_anything/__init__.py b/python/python/embed_anything/__init__.py index 95f0f29f..4c8506b5 100644 --- a/python/python/embed_anything/__init__.py +++ b/python/python/embed_anything/__init__.py @@ -21,7 +21,7 @@ model = EmbeddingModel.from_pretrained_local( WhichModel.Bert, model_id="Hugging_face_link" ) -data = embed_anything.embed_file("test_files/test.pdf", embeder=model) +data = embed_anything.embed_file("test_files/test.pdf", embedder=model) #For images @@ -30,11 +30,11 @@ model_id="openai/clip-vit-base-patch16", # revision="refs/pr/15", ) -data: list[EmbedData] = embed_anything.embed_directory("test_files", embeder=model) +data: list[EmbedData] = embed_anything.embed_directory("test_files", embedder=model) embeddings = np.array([data.embedding for data in data]) query = ["Photo of a monkey?"] query_embedding = np.array( - 
embed_anything.embed_query(query, embeder=model)[0].embedding + embed_anything.embed_query(query, embedder=model)[0].embedding ) # For audio files from embed_anything import ( @@ -47,7 +47,7 @@ audio_decoder = AudioDecoderModel.from_pretrained_hf( "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False ) -embeder = EmbeddingModel.from_pretrained_hf( +embedder = EmbeddingModel.from_pretrained_hf( embed_anything.WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", @@ -56,7 +56,7 @@ data = embed_anything.embed_audio_file( "test_files/audio/samples_hp0.wav", audio_decoder=audio_decoder, - embeder=embeder, + embedder=embedder, text_embed_config=config, ) @@ -98,7 +98,7 @@ data = embed_anything.embed_image_directory( "test_files", - embeder=clip_model, + embedder=clip_model, adapter=pinecone_adapter, # config=embed_config, ``` diff --git a/python/python/embed_anything/_embed_anything.pyi b/python/python/embed_anything/_embed_anything.pyi index 50b32783..00290dd8 100644 --- a/python/python/embed_anything/_embed_anything.pyi +++ b/python/python/embed_anything/_embed_anything.pyi @@ -53,14 +53,14 @@ class Adapter(ABC): """ def embed_query( - query: list[str], embeder: EmbeddingModel, config: TextEmbedConfig | None = None + query: list[str], embedder: EmbeddingModel, config: TextEmbedConfig | None = None ) -> list[EmbedData]: """ Embeds the given query and returns a list of EmbedData objects. Args: query: The query to embed. - embeder: The embedding model to use. + embedder: The embedding model to use. config: The configuration for the embedding model. Returns: @@ -80,7 +80,7 @@ def embed_query( def embed_file( file_path: str, - embeder: EmbeddingModel, + embedder: EmbeddingModel, config: TextEmbedConfig | None = None, adapter: Adapter | None = None, ) -> list[EmbedData]: @@ -89,7 +89,7 @@ def embed_file( Args: file_path: The path to the file to embed. - embeder: The embedding model to use. + embedder: The embedding model to use. config: The configuration for the embedding model. adapter: The adapter to use for storing the embeddings in a vector database. @@ -104,13 +104,13 @@ def embed_file( model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", ) - data = embed_anything.embed_file("test_files/test.pdf", embeder=model) + data = embed_anything.embed_file("test_files/test.pdf", embedder=model) ``` """ def embed_directory( file_path: str, - embeder: EmbeddingModel, + embedder: EmbeddingModel, extensions: list[str], config: TextEmbedConfig | None = None, adapter: Adapter | None = None, @@ -120,7 +120,7 @@ def embed_directory( Args: file_path: The path to the directory containing the files to embed. - embeder: The embedding model to use. + embedder: The embedding model to use. extensions: The list of file extensions to consider for embedding. config: The configuration for the embedding model. adapter: The adapter to use for storing the embeddings in a vector database. 
@@ -136,13 +136,13 @@ def embed_directory( model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", ) - data = embed_anything.embed_directory("test_files", embeder=model, extensions=[".pdf"]) + data = embed_anything.embed_directory("test_files", embedder=model, extensions=[".pdf"]) ``` """ def embed_image_directory( file_path: str, - embeder: EmbeddingModel, + embedder: EmbeddingModel, config: ImageEmbedConfig | None = None, adapter: Adapter | None = None, ) -> list[EmbedData]: @@ -151,7 +151,7 @@ def embed_image_directory( Args: file_path: The path to the directory containing the images to embed. - embeder: The embedding model to use. + embedder: The embedding model to use. config: The configuration for the embedding model. adapter: The adapter to use for storing the embeddings in a vector database. @@ -161,7 +161,7 @@ def embed_image_directory( def embed_webpage( url: str, - embeder: EmbeddingModel, + embedder: EmbeddingModel, config: TextEmbedConfig | None, adapter: Adapter | None, ) -> list[EmbedData] | None: @@ -170,7 +170,7 @@ def embed_webpage( Args: url: The URL of the webpage to embed. - embeder: The name of the embedding model to use. Choose between "OpenAI", "Jina", "Bert" + embedder: The name of the embedding model to use. Choose between "OpenAI", "Jina", "Bert" config: The configuration for the embedding model. adapter: The adapter to use for storing the embeddings. @@ -185,7 +185,7 @@ def embed_webpage( openai_config=embed_anything.OpenAIConfig(model="text-embedding-3-small") ) data = embed_anything.embed_webpage( - "https://www.akshaymakes.com/", embeder="OpenAI", config=config + "https://www.akshaymakes.com/", embedder="OpenAI", config=config ) ``` """ @@ -193,7 +193,7 @@ def embed_webpage( def embed_audio_file( file_path: str, audio_decoder: AudioDecoderModel, - embeder: EmbeddingModel, + embedder: EmbeddingModel, text_embed_config: TextEmbedConfig | None = TextEmbedConfig( chunk_size=200, batch_size=32 ), @@ -204,7 +204,7 @@ def embed_audio_file( Args: file_path: The path to the audio file to embed. audio_decoder: The audio decoder model to use. - embeder: The embedding model to use. + embedder: The embedding model to use. text_embed_config: The configuration for the embedding model. 
Returns: @@ -218,7 +218,7 @@ def embed_audio_file( "openai/whisper-tiny.en", revision="main", model_type="tiny-en", quantized=False ) - embeder = embed_anything.EmbeddingModel.from_pretrained_hf( + embedder = embed_anything.EmbeddingModel.from_pretrained_hf( embed_anything.WhichModel.Bert, model_id="sentence-transformers/all-MiniLM-L6-v2", revision="main", @@ -228,7 +228,7 @@ def embed_audio_file( data = embed_anything.embed_audio_file( "test_files/audio/samples_hp0.wav", audio_decoder=audio_decoder, - embeder=embeder, + embedder=embedder, text_embed_config=config, ) ``` diff --git a/python/src/lib.rs b/python/src/lib.rs index 3fd6b1e2..d5afba28 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -353,14 +353,14 @@ impl AudioDecoderModel { } #[pyfunction] -#[pyo3(signature = (query, embeder, config=None))] +#[pyo3(signature = (query, embedder, config=None))] pub fn embed_query( query: Vec, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, config: Option<&config::TextEmbedConfig>, ) -> PyResult> { let config = config.map(|c| &c.inner); - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let rt = Builder::new_multi_thread().enable_all().build().unwrap(); Ok(rt.block_on(async { embed_anything::embed_query( @@ -378,15 +378,15 @@ pub fn embed_query( } #[pyfunction] -#[pyo3(signature = (file_name, embeder, config=None, adapter=None))] +#[pyo3(signature = (file_name, embedder, config=None, adapter=None))] pub fn embed_file( file_name: &str, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, config: Option<&config::TextEmbedConfig>, adapter: Option, ) -> PyResult>> { let config = config.map(|c| &c.inner); - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let rt = Builder::new_multi_thread().enable_all().build().unwrap(); if !Path::new(file_name).exists() { // check if the file exists other wise return a "File not found" error with PyValueError @@ -437,15 +437,15 @@ pub fn embed_file( } #[pyfunction] -#[pyo3(signature = (audio_file, audio_decoder, embeder, text_embed_config=None))] +#[pyo3(signature = (audio_file, audio_decoder, embedder, text_embed_config=None))] pub fn embed_audio_file( audio_file: String, audio_decoder: &mut AudioDecoderModel, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, text_embed_config: Option<&config::TextEmbedConfig>, ) -> PyResult>> { let config = text_embed_config.map(|c| &c.inner); - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let audio_decoder = &mut audio_decoder.inner; let rt = Builder::new_multi_thread().enable_all().build().unwrap(); let data = rt.block_on(async { @@ -463,16 +463,16 @@ pub fn embed_audio_file( } #[pyfunction] -#[pyo3(signature = (directory, embeder, extensions=None, config=None, adapter = None))] +#[pyo3(signature = (directory, embedder, extensions=None, config=None, adapter = None))] pub fn embed_directory( directory: PathBuf, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, extensions: Option>, config: Option<&config::TextEmbedConfig>, adapter: Option, ) -> PyResult>> { let config = config.map(|c| &c.inner); - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let rt = Builder::new_multi_thread().enable_all().build().unwrap(); println!("Runtime created"); @@ -517,14 +517,14 @@ pub fn embed_directory( } #[pyfunction] -#[pyo3(signature = (directory, embeder, config=None, adapter = None))] +#[pyo3(signature = (directory, embedder, config=None, adapter = None))] pub fn embed_image_directory( 
directory: PathBuf, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, config: Option<&config::ImageEmbedConfig>, adapter: Option, ) -> PyResult>> { - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let config = config.map(|c| &c.inner); let rt = Builder::new_multi_thread().enable_all().build().unwrap(); println!("Runtime created"); @@ -563,14 +563,14 @@ pub fn embed_image_directory( Ok(data) } #[pyfunction] -#[pyo3(signature = (url, embeder, config=None, adapter = None))] +#[pyo3(signature = (url, embedder, config=None, adapter = None))] pub fn embed_webpage( url: String, - embeder: &EmbeddingModel, + embedder: &EmbeddingModel, config: Option<&config::TextEmbedConfig>, adapter: Option, ) -> PyResult>> { - let embedding_model = &embeder.inner; + let embedding_model = &embedder.inner; let config = config.map(|c| &c.inner); let rt = Builder::new_multi_thread().enable_all().build().unwrap(); let adapter = match adapter { diff --git a/rust/examples/web_embed.rs b/rust/examples/web_embed.rs index 3bdfa668..4adf5eea 100644 --- a/rust/examples/web_embed.rs +++ b/rust/examples/web_embed.rs @@ -13,7 +13,7 @@ async fn main() { let start_time = std::time::Instant::now(); let url = "https://www.scrapingbee.com/blog/web-scraping-rust/".to_string(); - let embeder = Arc::new( + let embedder = Arc::new( Embedder::from_pretrained_hf("bert", "sentence-transformers/all-MiniLM-L6-v2", None) .unwrap(), ); @@ -23,11 +23,11 @@ async fn main() { .with_batch_size(32) .with_buffer_size(100) .with_splitting_strategy(SplittingStrategy::Sentence) - .with_semantic_encoder(Arc::clone(&embeder)); + .with_semantic_encoder(Arc::clone(&embedder)); let embed_data = embed_webpage( url, - &embeder, + &embedder, Some(&embed_config), None::)>, ) @@ -48,7 +48,7 @@ async fn main() { .unwrap(); let query = vec!["Rust for web scraping".to_string()]; - let query_embedding: Vec = embed_query(query, &embeder, Some(&embed_config)) + let query_embedding: Vec = embed_query(query, &embedder, Some(&embed_config)) .await .unwrap() .iter() diff --git a/rust/src/embeddings/embed.rs b/rust/src/embeddings/embed.rs index 8dcf1759..72762fa3 100644 --- a/rust/src/embeddings/embed.rs +++ b/rust/src/embeddings/embed.rs @@ -99,10 +99,10 @@ impl TextEmbedder { batch_size: Option, ) -> Result, anyhow::Error> { match self { - TextEmbedder::OpenAI(embeder) => embeder.embed(text_batch).await, - TextEmbedder::Cohere(embeder) => embeder.embed(text_batch).await, - TextEmbedder::Jina(embeder) => embeder.embed(text_batch, batch_size), - TextEmbedder::Bert(embeder) => embeder.embed(text_batch, batch_size), + TextEmbedder::OpenAI(embedder) => embedder.embed(text_batch).await, + TextEmbedder::Cohere(embedder) => embedder.embed(text_batch).await, + TextEmbedder::Jina(embedder) => embedder.embed(text_batch, batch_size), + TextEmbedder::Bert(embedder) => embedder.embed(text_batch, batch_size), } } @@ -142,7 +142,7 @@ impl TextEmbedder { } } - /// Creates a new instance of a cloud api based `Embeder` with the specified model and API key. + /// Creates a new instance of a cloud api based `Embedder` with the specified model and API key. /// /// # Arguments /// @@ -159,7 +159,7 @@ impl TextEmbedder { /// /// # Returns /// - /// A new instance of `Embeder`. + /// A new instance of `Embedder`. 
pub fn from_pretrained_cloud( model: &str, model_id: &str, @@ -239,8 +239,8 @@ impl Embedder { batch_size: Option, ) -> Result, anyhow::Error> { match self { - Self::Text(embeder) => embeder.embed(text_batch, batch_size).await, - Self::Vision(embeder) => embeder.embed(text_batch, batch_size), + Self::Text(embedder) => embedder.embed(text_batch, batch_size).await, + Self::Vision(embedder) => embedder.embed(text_batch, batch_size), } } @@ -302,7 +302,7 @@ impl EmbedImage for Embedder { metadata: Option>, ) -> anyhow::Result { match self { - Self::Vision(embeder) => embeder.embed_image(image_path, metadata), + Self::Vision(embedder) => embedder.embed_image(image_path, metadata), _ => Err(anyhow::anyhow!("Model not supported for vision embedding")), } } @@ -312,7 +312,7 @@ impl EmbedImage for Embedder { image_paths: &[T], ) -> anyhow::Result> { match self { - Self::Vision(embeder) => embeder.embed_image_batch(image_paths), + Self::Vision(embedder) => embedder.embed_image_batch(image_paths), _ => Err(anyhow::anyhow!("Model not supported for vision embedding")), } } @@ -333,8 +333,8 @@ impl TextEmbed for VisionEmbedder { batch_size: Option, ) -> Result, anyhow::Error> { match self { - Self::Clip(embeder) => embeder.embed(text_batch, batch_size), - Self::ColPali(embeder) => embeder.embed(text_batch, batch_size), + Self::Clip(embedder) => embedder.embed(text_batch, batch_size), + Self::ColPali(embedder) => embedder.embed(text_batch, batch_size), } } } @@ -358,9 +358,9 @@ impl EmbedImage for VisionEmbedder { metadata: Option>, ) -> anyhow::Result { match self { - Self::Clip(embeder) => embeder.embed_image(image_path, metadata), - Self::ColPali(embeder) => { - embeder.embed_image(PathBuf::from(image_path.as_ref()), metadata) + Self::Clip(embedder) => embedder.embed_image(image_path, metadata), + Self::ColPali(embedder) => { + embedder.embed_image(PathBuf::from(image_path.as_ref()), metadata) } } } @@ -370,8 +370,8 @@ impl EmbedImage for VisionEmbedder { image_paths: &[T], ) -> anyhow::Result> { match self { - Self::Clip(embeder) => embeder.embed_image_batch(image_paths), - Self::ColPali(embeder) => embeder.embed_image_batch( + Self::Clip(embedder) => embedder.embed_image_batch(image_paths), + Self::ColPali(embedder) => embedder.embed_image_batch( &image_paths .iter() .map(|p| PathBuf::from(p.as_ref())) diff --git a/rust/src/embeddings/local/clip.rs b/rust/src/embeddings/local/clip.rs index 2f84ca43..db73f001 100644 --- a/rust/src/embeddings/local/clip.rs +++ b/rust/src/embeddings/local/clip.rs @@ -288,13 +288,13 @@ mod tests { // Tests the tokenize_sequences method. #[test] fn test_tokenize_sequences() { - let clip_embeder = ClipEmbedder::default(); + let clip_embedder = ClipEmbedder::default(); let sequences = Some(vec![ "Hey there how are you?".to_string(), "EmbedAnything is the best!".to_string(), ]); - let (input_ids, vec_seq) = clip_embeder - .tokenize_sequences(sequences, &clip_embeder.tokenizer) + let (input_ids, vec_seq) = clip_embedder + .tokenize_sequences(sequences, &clip_embedder.tokenizer) .unwrap(); assert_eq!( vec_seq, @@ -309,8 +309,8 @@ mod tests { // Tests the load_image method. #[test] fn test_load_image() { - let clip_embeder = ClipEmbedder::default(); - let image = clip_embeder + let clip_embedder = ClipEmbedder::default(); + let image = clip_embedder .load_image("test_files/clip/cat1.jpg", 224) .unwrap(); assert_eq!(image.shape().clone().into_dims(), &[3, 224, 224]); @@ -319,8 +319,8 @@ mod tests { // Tests the load_images method. 
#[test] fn test_load_images() { - let clip_embeder = ClipEmbedder::default(); - let images = clip_embeder + let clip_embedder = ClipEmbedder::default(); + let images = clip_embedder .load_images( &["test_files/clip/cat1.jpg", "test_files/clip/cat2.jpeg"], 224, @@ -332,8 +332,8 @@ mod tests { // Tests the embed_image_batch method. #[test] fn test_embed_image_batch() { - let clip_embeder = ClipEmbedder::default(); - let embeddings = clip_embeder + let clip_embedder = ClipEmbedder::default(); + let embeddings = clip_embedder .embed_image_batch(&["test_files/clip/cat1.jpg", "test_files/clip/cat2.jpeg"]) .unwrap(); assert_eq!(embeddings.len(), 2); diff --git a/rust/src/embeddings/local/jina.rs b/rust/src/embeddings/local/jina.rs index ee4c252b..98c3a4b5 100644 --- a/rust/src/embeddings/local/jina.rs +++ b/rust/src/embeddings/local/jina.rs @@ -332,10 +332,10 @@ mod tests { #[test] fn test_embed() { - let embeder = JinaEmbedder::new("jinaai/jina-embeddings-v2-small-en", None).unwrap(); + let embedder = JinaEmbedder::new("jinaai/jina-embeddings-v2-small-en", None).unwrap(); let text_batch = vec!["Hello, world!".to_string()]; - let encodings = embeder.embed(&text_batch, None).unwrap(); + let encodings = embedder.embed(&text_batch, None).unwrap(); println!("{:?}", encodings); } } diff --git a/rust/src/embeddings/mod.rs b/rust/src/embeddings/mod.rs index 73e9f639..4215ac57 100644 --- a/rust/src/embeddings/mod.rs +++ b/rust/src/embeddings/mod.rs @@ -64,13 +64,13 @@ pub fn text_batch_from_audio(segments: &[Segment]) -> Vec { } pub async fn embed_audio>( - embeder: &Embedder, + embedder: &Embedder, segments: Vec, audio_file: T, batch_size: Option, ) -> Result, anyhow::Error> { let text_batch = text_batch_from_audio(&segments); - let encodings = embeder.embed(&text_batch, batch_size).await?; + let encodings = embedder.embed(&text_batch, batch_size).await?; get_audio_metadata(encodings, segments, audio_file) } diff --git a/rust/src/file_processor/html_processor.rs b/rust/src/file_processor/html_processor.rs index e5ba21a1..fa0db810 100644 --- a/rust/src/file_processor/html_processor.rs +++ b/rust/src/file_processor/html_processor.rs @@ -21,7 +21,7 @@ pub struct HtmlDocument { impl HtmlDocument { pub async fn embed_webpage( &self, - embeder: &Embedder, + embedder: &Embedder, chunk_size: usize, overlap_ratio: f32, batch_size: Option, @@ -33,7 +33,7 @@ impl HtmlDocument { self.embed_tag( "p", paragraphs, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -47,7 +47,7 @@ impl HtmlDocument { self.embed_tag( "h1", headers, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -61,7 +61,7 @@ impl HtmlDocument { self.embed_tag( "code", codes, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -77,7 +77,7 @@ impl HtmlDocument { &self, tag: &str, tag_content: &[String], - embeder: &Embedder, + embedder: &Embedder, chunk_size: usize, overlap_ratio: f32, batch_size: Option, @@ -113,7 +113,7 @@ impl HtmlDocument { let metadata_hashmap: HashMap = serde_json::from_value(metadata)?; - let encodings = embeder.embed(&chunks, batch_size).await?; + let encodings = embedder.embed(&chunks, batch_size).await?; let embeddings = get_text_metadata(&Rc::new(encodings), &chunks, &Some(metadata_hashmap))?; embed_data.extend(embeddings); diff --git a/rust/src/file_processor/website_processor.rs b/rust/src/file_processor/website_processor.rs index 2a53f6a6..e704b7af 100644 --- a/rust/src/file_processor/website_processor.rs +++ b/rust/src/file_processor/website_processor.rs @@ -28,7 +28,7 @@ pub 
struct WebPage { impl WebPage { pub async fn embed_webpage( &self, - embeder: &Embedder, + embedder: &Embedder, chunk_size: usize, overlap_ratio: f32, batch_size: Option, @@ -40,7 +40,7 @@ impl WebPage { self.embed_tag( "p", paragraphs, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -54,7 +54,7 @@ impl WebPage { self.embed_tag( "h1", headers, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -68,7 +68,7 @@ impl WebPage { self.embed_tag( "code", codes, - embeder, + embedder, chunk_size, overlap_ratio, batch_size, @@ -84,7 +84,7 @@ impl WebPage { &self, tag: &str, tag_content: &[String], - embeder: &Embedder, + embedder: &Embedder, chunk_size: usize, overlap_ratio: f32, batch_size: Option, @@ -120,7 +120,7 @@ impl WebPage { let metadata_hashmap: HashMap = serde_json::from_value(metadata)?; - let encodings = embeder.embed(&chunks, batch_size).await?; + let encodings = embedder.embed(&chunks, batch_size).await?; let embeddings = get_text_metadata(&Rc::new(encodings), &chunks, &Some(metadata_hashmap))?; embed_data.extend(embeddings); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 991187de..589a8177 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -29,7 +29,7 @@ use tokio::sync::mpsc; // Add this at the top of your file /// # Arguments /// /// * `query` - A vector of strings representing the queries to embed. -/// * `embeder` - A string specifying the embedding model to use. Valid options are "OpenAI", "Jina", "Clip", and "Bert". +/// * `embedder` - A string specifying the embedding model to use. Valid options are "OpenAI", "Jina", "Clip", and "Bert". /// * `config` - An optional `EmbedConfig` object specifying the configuration for the embedding model. /// * 'adapter' - An optional `Adapter` object to send the embeddings to a vector database. /// @@ -47,16 +47,16 @@ use tokio::sync::mpsc; // Add this at the top of your file /// use embed_anything::embed_query; /// /// let query = vec!["Hello".to_string(), "World".to_string()]; -/// let embeder = "OpenAI"; +/// let embedder = "OpenAI"; /// let openai_config = OpenAIConfig{ model: Some("text-embedding-3-small".to_string()), api_key: None, chunk_size: Some(256) }; /// let config = EmbedConfig{ openai: Some(openai_config), ..Default::default() }; -/// let embeddings = embed_query(query, embeder).unwrap(); +/// let embeddings = embed_query(query, embedder).unwrap(); /// println!("{:?}", embeddings); /// ``` /// This will output the embeddings of the queries using the OpenAI embedding model. pub async fn embed_query( query: Vec, - embeder: &Embedder, + embedder: &Embedder, config: Option<&TextEmbedConfig>, ) -> Result> { let binding = TextEmbedConfig::default(); @@ -64,7 +64,7 @@ pub async fn embed_query( let _chunk_size = config.chunk_size.unwrap_or(256); let batch_size = config.batch_size; - let encodings = embeder.embed(&query, batch_size).await.unwrap(); + let encodings = embedder.embed(&query, batch_size).await.unwrap(); let embeddings = get_text_metadata(&Rc::new(encodings), &query, &None)?; Ok(embeddings) @@ -75,7 +75,7 @@ pub async fn embed_query( /// # Arguments /// /// * `file_name` - A string specifying the name of the file to embed. -/// * `embeder` - A string specifying the embedding model to use. Valid options are "OpenAI", "Jina", "Clip", and "Bert". +/// * `embedder` - A string specifying the embedding model to use. Valid options are "OpenAI", "Jina", "Clip", and "Bert". /// * `config` - An optional `EmbedConfig` object specifying the configuration for the embedding model. 
/// * 'adapter' - An optional `Adapter` object to send the embeddings to a vector database. /// @@ -93,14 +93,14 @@ pub async fn embed_query( /// use embed_anything::embed_file; /// /// let file_name = "test_files/test.pdf"; -/// let embeder = "Bert"; +/// let embedder = "Bert"; /// let bert_config = BertConfig{ model_id: Some("sentence-transformers/all-MiniLM-L12-v2".to_string()), revision: None, chunk_size: Some(256) }; -/// let embeddings = embed_file(file_name, embeder, config).unwrap(); +/// let embeddings = embed_file(file_name, embedder, config).unwrap(); /// ``` /// This will output the embeddings of the file using the OpenAI embedding model. pub async fn embed_file, F>( file_name: T, - embeder: &Embedder, + embedder: &Embedder, config: Option<&TextEmbedConfig>, adapter: Option, ) -> Result>> @@ -118,11 +118,11 @@ where let semantic_encoder = config.semantic_encoder.clone(); let use_ocr = config.use_ocr.unwrap_or(false); - match embeder { - Embedder::Text(embeder) => { + match embedder { + Embedder::Text(embedder) => { emb_text( file_name, - embeder, + embedder, Some(chunk_size), Some(overlap_ratio), batch_size, @@ -133,7 +133,7 @@ where ) .await } - Embedder::Vision(embeder) => Ok(Some(vec![emb_image(file_name, embeder).unwrap()])), + Embedder::Vision(embedder) => Ok(Some(vec![emb_image(file_name, embedder).unwrap()])), } } @@ -141,7 +141,7 @@ where /// /// # Arguments /// -/// * `embeder` - The embedding model to use. Supported options are "OpenAI", "Jina", and "Bert". +/// * `embedder` - The embedding model to use. Supported options are "OpenAI", "Jina", and "Bert". /// * `webpage` - The webpage to embed. /// /// # Returns @@ -155,15 +155,15 @@ where /// # Example /// /// ``` -/// let embeddings = match embeder { +/// let embeddings = match embedder { /// "OpenAI" => webpage -/// .embed_webpage(&embedding_model::openai::OpenAIEmbeder::default()) +/// .embed_webpage(&embedding_model::openai::OpenAIEmbedder::default()) /// .unwrap(), /// "Jina" => webpage -/// .embed_webpage(&embedding_model::jina::JinaEmbeder::default()) +/// .embed_webpage(&embedding_model::jina::JinaEmbedder::default()) /// .unwrap(), /// "Bert" => webpage -/// .embed_webpage(&embedding_model::bert::BertEmbeder::default()) +/// .embed_webpage(&embedding_model::bert::BertEmbedder::default()) /// .unwrap(), /// _ => { /// return Err(PyValueError::new_err( @@ -174,7 +174,7 @@ where /// ``` pub async fn embed_webpage( url: String, - embeder: &Embedder, + embedder: &Embedder, config: Option<&TextEmbedConfig>, // Callback function adapter: Option, @@ -185,7 +185,7 @@ where let website_processor = file_processor::website_processor::WebsiteProcessor::new(); let webpage = website_processor.process_website(url.as_ref())?; - // if let Embeder::Clip(_) = embeder { + // if let Embedder::Clip(_) = embedder { // return Err(anyhow!("Clip model does not support webpage embedding")); // } @@ -196,7 +196,7 @@ where let batch_size = config.batch_size; let embeddings = webpage - .embed_webpage(embeder, chunk_size, overlap_ratio, batch_size) + .embed_webpage(embedder, chunk_size, overlap_ratio, batch_size) .await?; // Send embeddings to vector database @@ -327,12 +327,12 @@ fn emb_image>( pub async fn emb_audio>( audio_file: T, audio_decoder: &mut AudioDecoderModel, - embeder: &Arc, + embedder: &Arc, text_embed_config: Option<&TextEmbedConfig>, ) -> Result>> { let segments: Vec = audio_decoder.process_audio(&audio_file).unwrap(); let embeddings = embed_audio( - embeder, + embedder, segments, audio_file, text_embed_config @@ -349,7 
+349,7 @@ pub async fn emb_audio>( /// # Arguments /// /// * `directory` - A `PathBuf` representing the directory containing the images to embed. -/// * `embeder` - A reference to the embedding model to use. +/// * `embedder` - A reference to the embedding model to use. /// * `config` - An optional `ImageEmbedConfig` object specifying the configuration for the embedding model. Default buffer size is 100. /// * `adapter` - An optional callback function to handle the embeddings. /// @@ -367,8 +367,8 @@ pub async fn emb_audio>( /// use std::sync::Arc; /// /// let directory = PathBuf::from("/path/to/directory"); -/// let embeder = Arc::new(Embeder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap()); -/// let embeddings = embed_image_directory(directory, &embeder, None).await.unwrap(); +/// let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap()); +/// let embeddings = embed_image_directory(directory, &embedder, None).await.unwrap(); /// ``` /// This will output the embeddings of the images in the specified directory using the specified embedding model. /// @@ -392,7 +392,7 @@ where let (tx, mut rx) = mpsc::unbounded_channel(); let (collector_tx, mut collector_rx) = mpsc::unbounded_channel(); - let embeder = embedding_model.clone(); + let embedder = embedding_model.clone(); let pb = indicatif::ProgressBar::new(file_parser.files.len() as u64); pb.set_style( @@ -412,8 +412,8 @@ where image_buffer.push(image); if image_buffer.len() == buffer_size { - // Ensure embeder is mutable and not wrapped in Arc - match process_images(&image_buffer, embeder.clone()).await { + // Ensure embedder is mutable and not wrapped in Arc + match process_images(&image_buffer, embedder.clone()).await { Ok(embeddings) => { let files = embeddings .iter() @@ -441,7 +441,7 @@ where // Process any remaining images if !image_buffer.is_empty() { - match process_images(&image_buffer, embeder).await { + match process_images(&image_buffer, embedder).await { Ok(embeddings) => { let files = embeddings .iter() @@ -494,9 +494,9 @@ where async fn process_images( image_buffer: &[String], - embeder: Arc, + embedder: Arc, ) -> Result>> { - let embeddings = embeder.embed_image_batch(image_buffer)?; + let embeddings = embedder.embed_image_batch(image_buffer)?; Ok(Arc::new(embeddings)) } @@ -505,7 +505,7 @@ async fn process_images( /// # Arguments /// /// * `directory` - A `PathBuf` representing the directory containing the files to embed. -/// * `embeder` - A reference to the embedding model to use. +/// * `embedder` - A reference to the embedding model to use. /// * `extensions` - An optional vector of strings representing the file extensions to consider for embedding. If `None`, all files in the directory will be considered. /// * `config` - An optional `TextEmbedConfig` object specifying the configuration for the embedding model. /// * `adapter` - An optional callback function to handle the embeddings. 
@@ -524,15 +524,15 @@ async fn process_images( /// use std::sync::Arc; /// /// let directory = PathBuf::from("/path/to/directory"); -/// let embeder = Arc::new(Embeder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap()); +/// let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap()); /// let config = Some(TextEmbedConfig::default()); /// let extensions = Some(vec!["txt".to_string(), "pdf".to_string()]); -/// let embeddings = embed_directory_stream(directory, &embeder, extensions, config, None).await.unwrap(); +/// let embeddings = embed_directory_stream(directory, &embedder, extensions, config, None).await.unwrap(); /// ``` /// This will output the embeddings of the files in the specified directory using the specified embedding model. pub async fn embed_directory_stream( directory: PathBuf, - embeder: &Arc, + embedder: &Arc, extensions: Option>, config: Option<&TextEmbedConfig>, adapter: Option, @@ -555,7 +555,7 @@ where let (tx, mut rx) = mpsc::unbounded_channel(); let (collector_tx, mut collector_rx) = mpsc::unbounded_channel(); - let embeder = embeder.clone(); + let embedder = embedder.clone(); let pb = indicatif::ProgressBar::new(files.len() as u64); pb.set_style( indicatif::ProgressStyle::with_template( @@ -576,7 +576,7 @@ where metadata_buffer.push(metadata); if chunk_buffer.len() == buffer_size { - match process_chunks(&chunk_buffer, &metadata_buffer, &embeder, batch_size) + match process_chunks(&chunk_buffer, &metadata_buffer, &embedder, batch_size) .await { Ok(embeddings) => { @@ -607,7 +607,7 @@ where // Process any remaining chunks if !chunk_buffer.is_empty() { - match process_chunks(&chunk_buffer, &metadata_buffer, &embeder, batch_size).await { + match process_chunks(&chunk_buffer, &metadata_buffer, &embedder, batch_size).await { Ok(embeddings) => { let files = embeddings .iter() diff --git a/rust/src/text_loader.rs b/rust/src/text_loader.rs index 142ec5c6..e8a64dbe 100644 --- a/rust/src/text_loader.rs +++ b/rust/src/text_loader.rs @@ -104,11 +104,11 @@ impl TextLoader { .map(|chunk| chunk.to_string()) .collect(), SplittingStrategy::Semantic => { - let embeder = semantic_encoder.unwrap_or(Arc::new(Embedder::Text( + let embedder = semantic_encoder.unwrap_or(Arc::new(Embedder::Text( TextEmbedder::Jina(Box::new(JinaEmbedder::default())), ))); let chunker = StatisticalChunker { - encoder: embeder, + encoder: embedder, ..Default::default() }; @@ -212,10 +212,10 @@ mod tests { } #[test] - fn test_image_embeder() { + fn test_image_embedder() { let file_path = PathBuf::from("test_files/clip/cat1.jpg"); - let embeder = ClipEmbedder::default(); - let emb_data = embeder.embed_image(file_path, None).unwrap(); + let embedder = ClipEmbedder::default(); + let emb_data = embedder.embed_image(file_path, None).unwrap(); assert_eq!(emb_data.embedding.to_dense().unwrap().len(), 512); } } diff --git a/tests/model_tests/test_adapter.py b/tests/model_tests/test_adapter.py index cf96b09e..6d7225dd 100644 --- a/tests/model_tests/test_adapter.py +++ b/tests/model_tests/test_adapter.py @@ -7,13 +7,13 @@ def test_adapter_upsert_call_file( ): assert ( embed_anything.embed_file( - test_pdf_file, embeder=bert_model, adapter=dummy_adapter + test_pdf_file, embedder=bert_model, adapter=dummy_adapter ) is None ) assert ( embed_anything.embed_file( - test_txt_file, embeder=bert_model, adapter=dummy_adapter + test_txt_file, embedder=bert_model, adapter=dummy_adapter ) is None ) @@ -22,7 +22,7 @@ def test_adapter_upsert_call_file( def 
test_adapter_upsert_call_directory(bert_model, dummy_adapter, test_files_directory): assert ( embed_anything.embed_directory( - test_files_directory, embeder=bert_model, adapter=dummy_adapter + test_files_directory, embedder=bert_model, adapter=dummy_adapter ) is None )