jin v3
akshayballal95 committed Dec 1, 2024
1 parent a35bf95 commit fe16a80
Showing 31 changed files with 499 additions and 123 deletions.
51 changes: 51 additions & 0 deletions docs/guides/ocr.md
@@ -0,0 +1,51 @@
# Use PDFs that need OCR

Embed Anything can embed scanned documents using OCR, which is useful for tasks such as document search and retrieval over scanned PDFs. Set `use_ocr=True` in the `TextEmbedConfig` to enable OCR. Note that this requires `tesseract` and `poppler` to be installed.
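
Once those dependencies are installed (see below), enabling OCR is just a matter of passing `use_ocr=True`. The following is a minimal sketch; the `from_pretrained_hf` constructor name and the file path are assumptions based on the other examples in this repository, and the maintained example is included under Example Usage below.

``` python
import embed_anything
from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel

# Any text embedding model works; this Jina model is only illustrative,
# and `from_pretrained_hf` is assumed from the repository's other examples.
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Jina, model_id="jinaai/jina-embeddings-v2-small-en"
)

# use_ocr=True routes scanned pages through tesseract/poppler before
# chunking and embedding.
config = TextEmbedConfig(
    chunk_size=256,
    batch_size=32,
    splitting_strategy="sentence",
    use_ocr=True,
)

data: list[EmbedData] = embed_anything.embed_file(
    "path/to/scanned.pdf",  # placeholder path
    embeder=model,          # note: the parameter is spelled `embeder`
    config=config,
)

for d in data:
    print(d.text)
```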

You can install `tesseract` and `poppler` as follows:

## Install Tesseract and Poppler

### Windows

For Tesseract, download and run the installer from the [UB Mannheim Tesseract page](https://github.com/UB-Mannheim/tesseract/wiki).

For Poppler, download the latest release from [poppler-windows](https://github.com/oschwartz10612/poppler-windows?tab=readme-ov-file) and follow its installation instructions.

### macOS

Install Tesseract with Homebrew:

``` bash
brew install tesseract
```

Install Poppler with Homebrew:

``` bash
brew install poppler
```

### Linux

Install Tesseract with your distribution's package manager. On Ubuntu, for example:

``` bash
sudo apt install tesseract-ocr
sudo apt install libtesseract-dev
```

Install Poppler with your distribution's package manager. On Ubuntu, for example:

``` bash
sudo apt install poppler-utils
```

For more information, refer to the [Tesseract installation guide](https://tesseract-ocr.github.io/tessdoc/Installation.html).

## Example Usage

``` python
--8<-- "examples/text_ocr.py"
```
2 changes: 1 addition & 1 deletion docs/guides/onnx_models.md
@@ -32,7 +32,7 @@
| `GTELargeENV15Q` | Quantized Alibaba-NLP/gte-large-en-v1.5 |
| `JINAV2SMALLEN` | jinaai/jina-embeddings-v2-small-en |
| `JINAV2BASEEN` | jinaai/jina-embeddings-v2-base-en |
| `JINAV2LARGEEN` | jinaai/jina-embeddings-v2-large-en |
| `JINAV3` | jinaai/jina-embeddings-v3 |
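
For reference, loading the newly added `JINAV3` variant might look like the sketch below; the constructor arguments and the `embed_query` call are assumptions based on `examples/onnx_models.py` in this commit, whose full contents are not shown in this diff.

``` python
from embed_anything import EmbeddingModel, ONNXModel, WhichModel, embed_query

# Load jinaai/jina-embeddings-v3 through the ONNX backend.
# `from_pretrained_onnx` and its argument order are assumed, not verified.
model = EmbeddingModel.from_pretrained_onnx(WhichModel.Jina, ONNXModel.JINAV3)

embeddings = embed_query(["What is the capital of France?"], embeder=model)
print(embeddings[0].embedding[:8])
```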

## Example Usage

1 change: 0 additions & 1 deletion examples/adapters/weaviate_db.py
@@ -83,6 +83,5 @@ def delete_index(self, index_name: str):
print(response.objects[i].properties["text"])



for res in response.objects:
print(textwrap.fill(res.properties["text"], width=120), end="\n\n")
8 changes: 7 additions & 1 deletion examples/onnx_models.py
@@ -1,5 +1,11 @@
import heapq
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel, embed_query, ONNXModel
from embed_anything import (
EmbeddingModel,
TextEmbedConfig,
WhichModel,
embed_query,
ONNXModel,
)
import os
from time import time
import numpy as np
8 changes: 7 additions & 1 deletion examples/text.py
@@ -8,6 +8,7 @@
WhichModel.Jina, model_id="jinaai/jina-embeddings-v2-small-en"
)


# Example 1: Embedding a Directory
def embed_directory_example():
# Configure the embedding process
@@ -28,10 +29,13 @@ def embed_directory_example():

print(f"Time taken to embed directory: {end - start} seconds")


# Example 2: Embedding a Query
def embed_query_example():
# Configure the embedding process
config = TextEmbedConfig(chunk_size=256, batch_size=32, splitting_strategy="sentence")
config = TextEmbedConfig(
chunk_size=256, batch_size=32, splitting_strategy="sentence"
)

# Embed a query
embeddings: EmbedData = embed_anything.embed_query(
@@ -48,6 +52,7 @@
)
)


# Example 3: Embedding a File
def embed_file_example():
# Configure the embedding process
@@ -65,6 +70,7 @@
print(d.text)
print("---" * 20)


# Call the examples
embed_directory_example()
embed_query_example()
8 changes: 4 additions & 4 deletions examples/text_ocr.py
@@ -1,8 +1,8 @@
# OCR Requires `tesseract` and `poppler` to be installed.

import time
import embed_anything
from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel
from embed_anything.vectordb import Adapter
import os
from time import time


@@ -11,7 +11,7 @@
)

config = TextEmbedConfig(
chunk_size=1000,
chunk_size=256,
batch_size=32,
buffer_size=64,
splitting_strategy="sentence",
@@ -21,7 +21,7 @@
start = time()

data: list[EmbedData] = embed_anything.embed_file(
"/home/akshay/projects/starlaw/src-server/test_files/court.pdf",
"/home/akshay/projects/starlaw/src-server/test_files/court.pdf", # Replace with your file path
embeder=model,
config=config,
)
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -55,6 +55,7 @@ nav:
- guides/semantic.md
- guides/adapters.md
- guides/onnx_models.md
- guides/ocr.md
- Contribution:
- roadmap/roadmap.md
- roadmap/contribution.md
3 changes: 2 additions & 1 deletion python/python/embed_anything/__init__.py
@@ -118,6 +118,7 @@
- Audio Embedding Models:
- "Whisper"
"""

from ._embed_anything import *
from .vectordb import *
import platform
@@ -146,4 +147,4 @@

__doc__ = _embed_anything.__doc__
if hasattr(_embed_anything, "__all__"):
__all__ = _embed_anything.__all__
__all__ = _embed_anything.__all__
4 changes: 1 addition & 3 deletions python/python/embed_anything/_embed_anything.pyi
@@ -500,10 +500,9 @@ class WhichModel(Enum):
SparseBert = ("SparseBert",)

class ONNXModel(Enum):

"""
Enum representing various ONNX models.
```markdown
| Enum Variant | Description |
|----------------------------------|--------------------------------------------------|
@@ -596,4 +595,3 @@ class ONNXModel(Enum):
JINAV2BASEEN = "JINAV2BASEEN"

JINAV2LARGEEN = "JINAV2LARGEEN"

6 changes: 3 additions & 3 deletions python/src/lib.rs
@@ -124,7 +124,7 @@ pub enum ONNXModel {
GTELargeENV15Q,
JINAV2SMALLEN,
JINAV2BASEEN,
JINAV2LARGEEN,
JINAV3,
}
impl fmt::Display for ONNXModel {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -219,10 +219,10 @@ impl EmbeddingModel {
}
WhichModel::Jina => {
let model_id = model_id.unwrap_or("jinaai/jina-embeddings-v2-small-en");
let model = Embedder::Text(TextEmbedder::Jina(
let model = Embedder::Text(TextEmbedder::Jina(Box::new(
embed_anything::embeddings::local::jina::JinaEmbedder::new(model_id, revision)
.unwrap(),
));
)));
Ok(EmbeddingModel {
inner: Arc::new(model),
})
7 changes: 2 additions & 5 deletions rust/examples/audio.rs
@@ -1,11 +1,8 @@
use std::sync::Arc;

use embed_anything::{
config::TextEmbedConfig,
emb_audio,
embeddings::embed::Embedder,
file_processor::audio::audio_processor::AudioDecoderModel,
text_loader::SplittingStrategy,
config::TextEmbedConfig, emb_audio, embeddings::embed::Embedder,
file_processor::audio::audio_processor::AudioDecoderModel, text_loader::SplittingStrategy,
};

#[tokio::main]
26 changes: 17 additions & 9 deletions rust/examples/bert_ort.rs → rust/examples/ort_models.rs
@@ -10,14 +10,15 @@ use std::time::Instant;

#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
let model =
Arc::new(Embedder::from_pretrained_onnx("bert", ONNXModel::AllMiniLML6V2, None).unwrap());
let model = Arc::new(Embedder::from_pretrained_onnx("jina", ONNXModel::JINAV3, None).unwrap());
let semantic_encoder =
Arc::new(Embedder::from_pretrained_onnx("jina", ONNXModel::JINAV2SMALLEN, None).unwrap());
let config = TextEmbedConfig::default()
.with_chunk_size(1000, Some(0.3))
.with_batch_size(256)
.with_chunk_size(256, Some(0.3))
.with_batch_size(32)
.with_buffer_size(256)
.with_splitting_strategy(SplittingStrategy::Sentence)
.with_semantic_encoder(Arc::clone(&model));
.with_splitting_strategy(SplittingStrategy::Semantic)
.with_semantic_encoder(Arc::clone(&semantic_encoder));

// get files in bench
let files = std::fs::read_dir("bench")
@@ -32,7 +33,14 @@
.map(|file| embed_file(file, &model, Some(&config), None::<fn(Vec<EmbedData>)>))
.collect::<Vec<_>>();

let _data = futures.into_iter().next().unwrap().await;
let _data = futures.into_iter().next().unwrap().await?.unwrap();

for chunk in _data {
println!("--------------------------------");

println!("{:?}", chunk.text.unwrap());
println!("\n");
}

let elapsed_time = now.elapsed();
println!("Elapsed Time: {}", elapsed_time.as_secs_f32());
@@ -42,10 +50,10 @@
"The cat is sleeping on the mat",
"The dog is barking at the moon",
"I love pizza",
"I like to have pasta",
"The dog is sitting in the park",
"The window is broken",
"Der Hund sitzt im Park", // German for "The dog is sitting in the park"
"pizza is the best",
"मैं पिज्जा पसंद करता हूं", // Hindi for "I like pizza"
]
.iter()
.map(|s| s.to_string())
2 changes: 1 addition & 1 deletion rust/src/chunkers/cumulative.rs
@@ -15,7 +15,7 @@ impl Default for CumulativeChunker<Tokenizer> {
let splitter = TextSplitter::new(ChunkConfig::new(200).with_sizer(
Tokenizer::from_pretrained("BEE-spoke-data/cl100k_base-mlm", None).unwrap(),
));
let encoder = TextEmbedder::Jina(JinaEmbedder::default());
let encoder = TextEmbedder::Jina(Box::new(JinaEmbedder::default()));
let score_threshold = 0.9;
let device = candle_core::Device::cuda_if_available(0).unwrap_or(candle_core::Device::Cpu);
Self {
20 changes: 13 additions & 7 deletions rust/src/chunkers/statistical.rs
@@ -3,9 +3,11 @@ use std::{cmp::max, sync::Arc};
use crate::embeddings::{
embed::{Embedder, TextEmbedder},
local::jina::JinaEmbedder,
select_device,
};
use candle_core::Tensor;
use itertools::{enumerate, Itertools};
use text_splitter::{ChunkConfig, TextSplitter};
// use text_splitter::{ChunkConfig, TextSplitter};
use tokenizers::Tokenizer;

@@ -24,8 +26,10 @@ pub struct StatisticalChunker {
impl Default for StatisticalChunker {
fn default() -> Self {
let tokenizer = Tokenizer::from_pretrained("BEE-spoke-data/cl100k_base-mlm", None).unwrap();
let encoder = Arc::new(Embedder::Text(TextEmbedder::Jina(JinaEmbedder::default())));
let device = candle_core::Device::cuda_if_available(0).unwrap_or(candle_core::Device::Cpu);
let encoder = Arc::new(Embedder::Text(TextEmbedder::Jina(Box::new(
JinaEmbedder::default(),
))));
let device = select_device();
Self {
encoder,
device,
@@ -56,7 +60,7 @@ impl StatisticalChunker {
) -> Self {
Self {
encoder,
device: candle_core::Device::cuda_if_available(0).unwrap_or(candle_core::Device::Cpu),
device: select_device(),
threshold_adjustment,
dynamic_threshold,
window_size,
@@ -105,10 +109,12 @@ impl StatisticalChunker {
}

pub async fn chunk(&self, text: &str, batch_size: usize) -> Vec<String> {
// let splitter = TextSplitter::new(ChunkConfig::new(256)
// .with_sizer(Tokenizer::from_pretrained("bert-base-cased", None).unwrap()));
// let splits = splitter.chunks(text).collect::<Vec<_>>();
let splits = self.split_into_sentences(text, 50).unwrap();
let splitter = TextSplitter::new(
ChunkConfig::new(50)
.with_sizer(Tokenizer::from_pretrained("bert-base-cased", None).unwrap()),
);
let splits = splitter.chunks(text).collect::<Vec<_>>();
// let splits = self.split_into_sentences(text, 50).unwrap();
if self.verbose {
for split in splits.iter() {
println!("-----Split---\n{}", split);
9 changes: 6 additions & 3 deletions rust/src/embeddings/embed.rs
@@ -5,7 +5,7 @@ use super::cloud::openai::OpenAIEmbedder;
use super::local::bert::{BertEmbed, BertEmbedder, OrtBertEmbedder, SparseBertEmbedder};
use super::local::clip::ClipEmbedder;
use super::local::colpali::{ColPaliEmbed, ColPaliEmbedder};
use super::local::jina::JinaEmbedder;
use super::local::jina::{JinaEmbed, JinaEmbedder, OrtJinaEmbedder};
use super::local::text_embedding::ONNXModel;
use anyhow::anyhow;
use serde::Deserialize;
@@ -88,7 +88,7 @@ pub trait AudioDecoder {
pub enum TextEmbedder {
OpenAI(OpenAIEmbedder),
Cohere(CohereEmbedder),
Jina(JinaEmbedder),
Jina(Box<dyn JinaEmbed + Send + Sync>),
Bert(Box<dyn BertEmbed + Send + Sync>),
}

@@ -112,7 +112,7 @@ impl TextEmbedder {
revision: Option<&str>,
) -> Result<Self, anyhow::Error> {
match model {
"jina" | "Jina" => Ok(Self::Jina(JinaEmbedder::new(model_id, revision)?)),
"jina" | "Jina" => Ok(Self::Jina(Box::new(JinaEmbedder::new(model_id, revision)?))),

"Bert" | "bert" => Ok(Self::Bert(Box::new(BertEmbedder::new(
model_id.to_string(),
@@ -135,6 +135,9 @@
model_name,
revision.map(|s| s.to_string()),
)?))),
"jina" | "Jina" => Ok(Self::Jina(Box::new(OrtJinaEmbedder::new(
model_name, revision,
)?))),
_ => Err(anyhow::anyhow!("Model not supported")),
}
}
2 changes: 1 addition & 1 deletion rust/src/embeddings/local/bert.rs
@@ -114,7 +114,7 @@ impl OrtBertEmbedder {
let threads = std::thread::available_parallelism().unwrap().get();
let model = Session::builder()?
.with_execution_providers([
CUDAExecutionProvider::default().build(),
CUDAExecutionProvider::default().build().error_on_failure(),
CoreMLExecutionProvider::default().build(),
])?
.with_optimization_level(GraphOptimizationLevel::Level3)?