jin v3
akshayballal95 committed Dec 1, 2024
1 parent a35bf95 commit fe16a80
Showing 31 changed files with 499 additions and 123 deletions.
51 changes: 51 additions & 0 deletions docs/guides/ocr.md
@@ -0,0 +1,51 @@
# Use PDFs that need OCR

Embed Anything can embed scanned documents using OCR, which is useful for tasks such as document search and retrieval over scanned PDFs. Set `use_ocr=True` in the `TextEmbedConfig` to enable OCR. Note that this requires `tesseract` and `poppler` to be installed.
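
Once those dependencies are installed (see below), enabling OCR is just a matter of passing `use_ocr=True`. The following is a minimal sketch; the `from_pretrained_hf` constructor name and the file path are assumptions based on the other examples in this repository, and the maintained example is included under Example Usage below.

``` python
import embed_anything
from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel

# Any text embedding model works; this Jina model is only illustrative,
# and `from_pretrained_hf` is assumed from the repository's other examples.
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.Jina, model_id="jinaai/jina-embeddings-v2-small-en"
)

# use_ocr=True routes scanned pages through tesseract/poppler before
# chunking and embedding.
config = TextEmbedConfig(
    chunk_size=256,
    batch_size=32,
    splitting_strategy="sentence",
    use_ocr=True,
)

data: list[EmbedData] = embed_anything.embed_file(
    "path/to/scanned.pdf",  # placeholder path
    embeder=model,          # note: the parameter is spelled `embeder`
    config=config,
)

for d in data:
    print(d.text)
```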

You can install `tesseract` and `poppler` as follows:

## Install Tesseract and Poppler

### Windows

For Tesseract, download and run the installer from the [UB Mannheim Tesseract page](https://github.com/UB-Mannheim/tesseract/wiki).

For Poppler, download the latest release from [poppler-windows](https://github.com/oschwartz10612/poppler-windows?tab=readme-ov-file) and follow its installation instructions.

### macOS

Install Tesseract with Homebrew:

``` bash
brew install tesseract
```

Install Poppler with Homebrew:

``` bash
brew install poppler
```

### Linux

Install Tesseract with your distribution's package manager. On Ubuntu, for example:

``` bash
sudo apt install tesseract-ocr
sudo apt install libtesseract-dev
```

Install Poppler with your distribution's package manager. On Ubuntu, for example:

``` bash
sudo apt install poppler-utils
```

For more information, refer to the [Tesseract installation guide](https://tesseract-ocr.github.io/tessdoc/Installation.html).

## Example Usage

``` python
--8<-- "examples/text_ocr.py"
```
2 changes: 1 addition & 1 deletion docs/guides/onnx_models.md
@@ -32,7 +32,7 @@
| `GTELargeENV15Q` | Quantized Alibaba-NLP/gte-large-en-v1.5 |
| `JINAV2SMALLEN` | jinaai/jina-embeddings-v2-small-en |
| `JINAV2BASEEN` | jinaai/jina-embeddings-v2-base-en |
| `JINAV2LARGEEN` | jinaai/jina-embeddings-v2-large-en |
| `JINAV3` | jinaai/jina-embeddings-v3 |
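
For reference, loading the newly added `JINAV3` variant might look like the sketch below; the constructor arguments and the `embed_query` call are assumptions based on `examples/onnx_models.py` in this commit, whose full contents are not shown in this diff.

``` python
from embed_anything import EmbeddingModel, ONNXModel, WhichModel, embed_query

# Load jinaai/jina-embeddings-v3 through the ONNX backend.
# `from_pretrained_onnx` and its argument order are assumed, not verified.
model = EmbeddingModel.from_pretrained_onnx(WhichModel.Jina, ONNXModel.JINAV3)

embeddings = embed_query(["What is the capital of France?"], embeder=model)
print(embeddings[0].embedding[:8])
```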

## Example Usage

1 change: 0 additions & 1 deletion examples/adapters/weaviate_db.py
@@ -83,6 +83,5 @@ def delete_index(self, index_name: str):
print(response.objects[i].properties["text"])



for res in response.objects:
print(textwrap.fill(res.properties["text"], width=120), end="\n\n")
8 changes: 7 additions & 1 deletion examples/onnx_models.py
@@ -1,5 +1,11 @@
import heapq
from embed_anything import EmbeddingModel, TextEmbedConfig, WhichModel, embed_query, ONNXModel
from embed_anything import (
EmbeddingModel,
TextEmbedConfig,
WhichModel,
embed_query,
ONNXModel,
)
import os
from time import time
import numpy as np
8 changes: 7 additions & 1 deletion examples/text.py
@@ -8,6 +8,7 @@
WhichModel.Jina, model_id="jinaai/jina-embeddings-v2-small-en"
)


# Example 1: Embedding a Directory
def embed_directory_example():
# Configure the embedding process
@@ -28,10 +29,13 @@ def embed_directory_example():

print(f"Time taken to embed directory: {end - start} seconds")


# Example 2: Embedding a Query
def embed_query_example():
# Configure the embedding process
config = TextEmbedConfig(chunk_size=256, batch_size=32, splitting_strategy="sentence")
config = TextEmbedConfig(
chunk_size=256, batch_size=32, splitting_strategy="sentence"
)

# Embed a query
embeddings: EmbedData = embed_anything.embed_query(
@@ -48,6 +52,7 @@
)
)


# Example 3: Embedding a File
def embed_file_example():
# Configure the embedding process
@@ -65,6 +70,7 @@
print(d.text)
print("---" * 20)


# Call the examples
embed_directory_example()
embed_query_example()
8 changes: 4 additions & 4 deletions examples/text_ocr.py
@@ -1,8 +1,8 @@
# OCR Requires `tesseract` and `poppler` to be installed.

import time
import embed_anything
from embed_anything import EmbedData, EmbeddingModel, TextEmbedConfig, WhichModel
from embed_anything.vectordb import Adapter
import os
from time import time


@@ -11,7 +11,7 @@
)

config = TextEmbedConfig(
chunk_size=1000,
chunk_size=256,
batch_size=32,
buffer_size=64,
splitting_strategy="sentence",
@@ -21,7 +21,7 @@
start = time()

data: list[EmbedData] = embed_anything.embed_file(
"/home/akshay/projects/starlaw/src-server/test_files/court.pdf",
"/home/akshay/projects/starlaw/src-server/test_files/court.pdf", # Replace with your file path
embeder=model,
config=config,
)
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -55,6 +55,7 @@ nav:
- guides/semantic.md
- guides/adapters.md
- guides/onnx_models.md
- guides/ocr.md
- Contribution:
- roadmap/roadmap.md
- roadmap/contribution.md
3 changes: 2 additions & 1 deletion python/python/embed_anything/__init__.py
@@ -118,6 +118,7 @@
- Audio Embedding Models:
- "Whisper"
"""

from ._embed_anything import *
from .vectordb import *
import platform
@@ -146,4 +147,4 @@

__doc__ = _embed_anything.__doc__
if hasattr(_embed_anything, "__all__"):
__all__ = _embed_anything.__all__
__all__ = _embed_anything.__all__
4 changes: 1 addition & 3 deletions python/python/embed_anything/_embed_anything.pyi
@@ -500,10 +500,9 @@ class WhichModel(Enum):
SparseBert = ("SparseBert",)

class ONNXModel(Enum):

"""
Enum representing various ONNX models.
```markdown
| Enum Variant | Description |
|----------------------------------|--------------------------------------------------|
@@ -596,4 +595,3 @@ class ONNXModel(Enum):
JINAV2BASEEN = "JINAV2BASEEN"

JINAV2LARGEEN = "JINAV2LARGEEN"

6 changes: 3 additions & 3 deletions python/src/lib.rs
@@ -124,7 +124,7 @@ pub enum ONNXModel {
GTELargeENV15Q,
JINAV2SMALLEN,
JINAV2BASEEN,
JINAV2LARGEEN,
JINAV3,
}
impl fmt::Display for ONNXModel {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -219,10 +219,10 @@ impl EmbeddingModel {
}
WhichModel::Jina => {
let model_id = model_id.unwrap_or("jinaai/jina-embeddings-v2-small-en");
let model = Embedder::Text(TextEmbedder::Jina(
let model = Embedder::Text(TextEmbedder::Jina(Box::new(
embed_anything::embeddings::local::jina::JinaEmbedder::new(model_id, revision)
.unwrap(),
));
)));
Ok(EmbeddingModel {
inner: Arc::new(model),
})
7 changes: 2 additions & 5 deletions rust/examples/audio.rs
@@ -1,11 +1,8 @@
use std::sync::Arc;

use embed_anything::{
config::TextEmbedConfig,
emb_audio,
embeddings::embed::Embedder,
file_processor::audio::audio_processor::AudioDecoderModel,
text_loader::SplittingStrategy,
config::TextEmbedConfig, emb_audio, embeddings::embed::Embedder,
file_processor::audio::audio_processor::AudioDecoderModel, text_loader::SplittingStrategy,
};

#[tokio::main]
26 changes: 17 additions & 9 deletions rust/examples/bert_ort.rs → rust/examples/ort_models.rs
@@ -10,14 +10,15 @@ use std::time::Instant;

#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
let model =
Arc::new(Embedder::from_pretrained_onnx("bert", ONNXModel::AllMiniLML6V2, None).unwrap());
let model = Arc::new(Embedder::from_pretrained_onnx("jina", ONNXModel::JINAV3, None).unwrap());
let semantic_encoder =
Arc::new(Embedder::from_pretrained_onnx("jina", ONNXModel::JINAV2SMALLEN, None).unwrap());
let config = TextEmbedConfig::default()
.with_chunk_size(1000, Some(0.3))
.with_batch_size(256)
.with_chunk_size(256, Some(0.3))
.with_batch_size(32)
.with_buffer_size(256)
.with_splitting_strategy(SplittingStrategy::Sentence)
.with_semantic_encoder(Arc::clone(&model));
.with_splitting_strategy(SplittingStrategy::Semantic)
.with_semantic_encoder(Arc::clone(&semantic_encoder));

// get files in bench
let files = std::fs::read_dir("bench")
@@ -32,7 +33,14 @@
.map(|file| embed_file(file, &model, Some(&config), None::<fn(Vec<EmbedData>)>))
.collect::<Vec<_>>();

let _data = futures.into_iter().next().unwrap().await;
let _data = futures.into_iter().next().unwrap().await?.unwrap();

for chunk in _data {
println!("--------------------------------");

println!("{:?}", chunk.text.unwrap());
println!("\n");
}

let elapsed_time = now.elapsed();
println!("Elapsed Time: {}", elapsed_time.as_secs_f32());
@@ -42,10 +50,10 @@
"The cat is sleeping on the mat",
"The dog is barking at the moon",
"I love pizza",
"I like to have pasta",
"The dog is sitting in the park",
"The window is broken",
"Der Hund sitzt im Park", // German for "The dog is sitting in the park"
"pizza is the best",
"मैं पिज्जा पसंद करता हूं", // Hindi for "I like pizza"
]
.iter()
.map(|s| s.to_string())
2 changes: 1 addition & 1 deletion rust/src/chunkers/cumulative.rs
@@ -15,7 +15,7 @@ impl Default for CumulativeChunker<Tokenizer> {
let splitter = TextSplitter::new(ChunkConfig::new(200).with_sizer(
Tokenizer::from_pretrained("BEE-spoke-data/cl100k_base-mlm", None).unwrap(),
));
let encoder = TextEmbedder::Jina(JinaEmbedder::default());
let encoder = TextEmbedder::Jina(Box::new(JinaEmbedder::default()));
let score_threshold = 0.9;
let device = candle_core::Device::cuda_if_available(0).unwrap_or(candle_core::Device::Cpu);
Self {
20 changes: 13 additions & 7 deletions rust/src/chunkers/statistical.rs
@@ -3,9 +3,11 @@ use std::{cmp::max, sync::Arc};
use crate::embeddings::{
embed::{Embedder, TextEmbedder},
local::jina::JinaEmbedder,
select_device,
};
use candle_core::Tensor;
use itertools::{enumerate, Itertools};
use text_splitter::{ChunkConfig, TextSplitter};
// use text_splitter::{ChunkConfig, TextSplitter};
use tokenizers::Tokenizer;

@@ -24,8 +26,10 @@ pub struct StatisticalChunker {
impl Default for StatisticalChunker {
fn default() -> Self {
let tokenizer = Tokenizer::from_pretrained("BEE-spoke-data/cl100k_base-mlm", None).unwrap();
let encoder = Arc::new(Embedder::Text(TextEmbedder::Jina(JinaEmbedder::default())));
let device = candle_core::Device::cuda_if_available(0).unwrap_or(candle_core::Device::Cpu);
let encoder = Arc::new(Embedder::Text(TextEmbedder::Jina(Box::new(
JinaEmbedder::default(),
))));
let device = select_device();
Self {
encoder,
device,
@@ -56,7 +60,7 @@ impl StatisticalChunker {
) -> Self {
Self {
encoder,
device: candle_core::Device::cuda_if_available(0).unwrap_or(candle_core::Device::Cpu),
device: select_device(),
threshold_adjustment,
dynamic_threshold,
window_size,
@@ -105,10 +109,12 @@ impl StatisticalChunker {
}

pub async fn chunk(&self, text: &str, batch_size: usize) -> Vec<String> {
// let splitter = TextSplitter::new(ChunkConfig::new(256)
// .with_sizer(Tokenizer::from_pretrained("bert-base-cased", None).unwrap()));
// let splits = splitter.chunks(text).collect::<Vec<_>>();
let splits = self.split_into_sentences(text, 50).unwrap();
let splitter = TextSplitter::new(
ChunkConfig::new(50)
.with_sizer(Tokenizer::from_pretrained("bert-base-cased", None).unwrap()),
);
let splits = splitter.chunks(text).collect::<Vec<_>>();
// let splits = self.split_into_sentences(text, 50).unwrap();
if self.verbose {
for split in splits.iter() {
println!("-----Split---\n{}", split);
9 changes: 6 additions & 3 deletions rust/src/embeddings/embed.rs
@@ -5,7 +5,7 @@ use super::cloud::openai::OpenAIEmbedder;
use super::local::bert::{BertEmbed, BertEmbedder, OrtBertEmbedder, SparseBertEmbedder};
use super::local::clip::ClipEmbedder;
use super::local::colpali::{ColPaliEmbed, ColPaliEmbedder};
use super::local::jina::JinaEmbedder;
use super::local::jina::{JinaEmbed, JinaEmbedder, OrtJinaEmbedder};
use super::local::text_embedding::ONNXModel;
use anyhow::anyhow;
use serde::Deserialize;
@@ -88,7 +88,7 @@ pub trait AudioDecoder {
pub enum TextEmbedder {
OpenAI(OpenAIEmbedder),
Cohere(CohereEmbedder),
Jina(JinaEmbedder),
Jina(Box<dyn JinaEmbed + Send + Sync>),
Bert(Box<dyn BertEmbed + Send + Sync>),
}

@@ -112,7 +112,7 @@ impl TextEmbedder {
revision: Option<&str>,
) -> Result<Self, anyhow::Error> {
match model {
"jina" | "Jina" => Ok(Self::Jina(JinaEmbedder::new(model_id, revision)?)),
"jina" | "Jina" => Ok(Self::Jina(Box::new(JinaEmbedder::new(model_id, revision)?))),

"Bert" | "bert" => Ok(Self::Bert(Box::new(BertEmbedder::new(
model_id.to_string(),
@@ -135,6 +135,9 @@
model_name,
revision.map(|s| s.to_string()),
)?))),
"jina" | "Jina" => Ok(Self::Jina(Box::new(OrtJinaEmbedder::new(
model_name, revision,
)?))),
_ => Err(anyhow::anyhow!("Model not supported")),
}
}
2 changes: 1 addition & 1 deletion rust/src/embeddings/local/bert.rs
@@ -114,7 +114,7 @@ impl OrtBertEmbedder {
let threads = std::thread::available_parallelism().unwrap().get();
let model = Session::builder()?
.with_execution_providers([
CUDAExecutionProvider::default().build(),
CUDAExecutionProvider::default().build().error_on_failure(),
CoreMLExecutionProvider::default().build(),
])?
.with_optimization_level(GraphOptimizationLevel::Level3)?