diff --git a/rust/src/config.rs b/rust/src/config.rs
index 74af75f6..8bcd1dcf 100644
--- a/rust/src/config.rs
+++ b/rust/src/config.rs
@@ -2,14 +2,56 @@ use std::sync::Arc;
 
 use crate::{embeddings::embed::Embedder, text_loader::SplittingStrategy};
 
+/// Configuration for text embedding.
+///
+/// # Example: Creating a new instance
+///
+/// ```rust
+/// use embed_anything::config::TextEmbedConfig;
+/// use embed_anything::text_loader::SplittingStrategy;
+/// let config = TextEmbedConfig::new(
+///     Some(512),
+///     Some(128),
+///     Some(100),
+///     Some(0.0),
+///     Some(SplittingStrategy::Sentence),
+///     None,
+///     Some(true)
+/// );
+/// ```
+///
+/// # Example: Overriding a single default
+///
+/// ```rust
+/// use embed_anything::config::TextEmbedConfig;
+/// use embed_anything::text_loader::SplittingStrategy;
+/// let config = TextEmbedConfig {
+///     splitting_strategy: Some(SplittingStrategy::Semantic),
+///     ..Default::default()
+/// };
+/// ```
 #[derive(Clone)]
 pub struct TextEmbedConfig {
+    /// Controls the size of each "chunk" of data that your input text gets split into. Defaults to
+    /// 256.
     pub chunk_size: Option<usize>,
+    /// Controls the ratio of overlap between adjacent "chunks" of your input text. Defaults to 0.0,
+    /// i.e. no overlap.
     pub overlap_ratio: Option<f32>,
+    /// Controls the size of each "batch" of data sent to the embedder. The default value depends
+    /// largely on the embedder, but is set to 32 when using [TextEmbedConfig::default()].
     pub batch_size: Option<usize>,
-    pub buffer_size: Option<usize>, // Required for adapter. Default is 100.
+    /// When using an adapter, this controls the size of the buffer. Defaults to 100.
+    pub buffer_size: Option<usize>,
+    /// Controls how documents are split into segments. See [SplittingStrategy] for options.
+    /// Defaults to [SplittingStrategy::Sentence].
     pub splitting_strategy: Option<SplittingStrategy>,
+    /// Allows overriding the embedder used when the splitting strategy is
+    /// [SplittingStrategy::Semantic]. Defaults to JINA.
     pub semantic_encoder: Option<Arc<Embedder>>,
+    /// When embedding a PDF, controls whether **o**ptical **c**haracter **r**ecognition is used on
+    /// the PDF to extract text. This process involves rendering the PDF as a series of images and
+    /// extracting text from those images. Defaults to false.
     pub use_ocr: Option<bool>,
     pub tesseract_path: Option<String>,
 }
diff --git a/rust/src/embeddings/cloud/cohere.rs b/rust/src/embeddings/cloud/cohere.rs
index def8f4a8..9055695e 100644
--- a/rust/src/embeddings/cloud/cohere.rs
+++ b/rust/src/embeddings/cloud/cohere.rs
@@ -36,7 +36,7 @@ impl CohereEmbedder {
     ///
     /// # Arguments
     ///
-    /// * `model` - A string slice that holds the model to be used for embedding. Find available models at https://docs.cohere.com/docs/cohere-embed
+    /// * `model` - A string slice that holds the model to be used for embedding. Find available models at <https://docs.cohere.com/docs/cohere-embed>.
     /// * `api_key` - An optional string slice that holds the API key for authenticating requests to the Cohere API.
     ///
     /// # Returns
diff --git a/rust/src/embeddings/embed.rs b/rust/src/embeddings/embed.rs
index 142faa8a..c635a916 100644
--- a/rust/src/embeddings/embed.rs
+++ b/rust/src/embeddings/embed.rs
@@ -205,8 +205,8 @@ impl TextEmbedder {
     /// - "cohere"
     ///
     /// * `model_id` - A string holds the model ID for the model to be used for embedding.
-    /// - For OpenAI, find available models at https://platform.openai.com/docs/guides/embeddings/embedding-models
-    /// - For Cohere, find available models at https://docs.cohere.com/docs/cohere-embed
+    /// - For OpenAI, find available models at <https://platform.openai.com/docs/guides/embeddings/embedding-models>
+    /// - For Cohere, find available models at <https://docs.cohere.com/docs/cohere-embed>
     /// * `api_key` - An optional string holds the API key for authenticating requests to the Cohere API. If not provided, it is taken from the environment variable
     /// - For OpenAI, create environment variable `OPENAI_API_KEY`
     /// - For Cohere, create environment variable `CO_API_KEY`
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 688e1ee0..fc4dcf30 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -1,5 +1,62 @@
-//! # Embed Anything
-//! This library provides a simple interface to embed text and images using various embedding models.
+#![doc(html_favicon_url = "https://raw.githubusercontent.com/StarlightSearch/EmbedAnything/refs/heads/main/docs/assets/icon.ico")]
+#![doc(html_logo_url = "https://raw.githubusercontent.com/StarlightSearch/EmbedAnything/refs/heads/main/docs/assets/Square310x310Logo.png")]
+#![doc(issue_tracker_base_url = "https://github.com/StarlightSearch/EmbedAnything/issues/")]
+//! embed_anything is a minimalist, highly performant, lightweight, multisource,
+//! multimodal, and local embedding pipeline.
+//!
+//! Whether you're working with text, images, audio, PDFs, websites, or other media, embed_anything
+//! streamlines the process of generating embeddings from various sources and seamlessly streaming
+//! them (with memory-efficient indexing) to a vector database.
+//!
+//! It supports dense, sparse, [ONNX](https://github.com/onnx/onnx) and late-interaction embeddings,
+//! offering flexibility for a wide range of use cases.
+//!
+//! # Usage
+//!
+//! ## Creating an [Embedder]
+//!
+//! To get started, you'll need to create an [Embedder] for the type of content you want to embed.
+//! We offer some utility functions to streamline creating embedders from various sources, such as
+//! [Embedder::from_pretrained_hf], [Embedder::from_pretrained_onnx], and
+//! [Embedder::from_pretrained_cloud]. You can use any of these to quickly create an Embedder like so:
+//!
+//! ```rust
+//! use embed_anything::embeddings::embed::Embedder;
+//!
+//! // Create a local CLIP embedder from a Hugging Face model
+//! let clip_embedder = Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None);
+//!
+//! // Create a cloud OpenAI embedder
+//! let openai_embedder = Embedder::from_pretrained_cloud("OpenAI", "text-embedding-3-small", Some("my-api-key".to_string()));
+//! ```
+//!
+//! If needed, you can also create an instance of [Embedder] manually, which lets you supply your
+//! own embedder. Here's an example of creating one by hand:
+//!
+//! ```rust
+//! use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
+//! use embed_anything::embeddings::local::jina::JinaEmbedder;
+//!
+//! let jina_embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));
+//! ```
+//!
+//! ## Generate embeddings
+//!
+//! ### Example: Embed a text file
+//!
+//! Let's see how embed_anything can help us generate embeddings from a plain text file:
+//!
+//! ```rust
+//! use embed_anything::embed_file;
+//! use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
+//! use embed_anything::embeddings::local::jina::JinaEmbedder;
+//!
+//! // Create an Embedder for text. We support a variety of models out of the box, including cloud-based models!
+//! let embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));
+//! // Generate embeddings for 'path/to/file.txt' using the embedder we just created.
+//! let embedding = embed_file("path/to/file.txt", &embedder, None, None);
+//! ```
+
 pub mod chunkers;
 pub mod config;
 pub mod embeddings;
@@ -62,15 +119,14 @@ pub enum Dtype {
 ///
 /// ```
 /// use embed_anything::embed_query;
+/// use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
+/// use embed_anything::embeddings::local::jina::JinaEmbedder;
 ///
 /// let query = vec!["Hello".to_string(), "World".to_string()];
-/// let embedder = "OpenAI";
-/// let openai_config = OpenAIConfig{ model: Some("text-embedding-3-small".to_string()), api_key: None, chunk_size: Some(256) };
-/// let config = EmbedConfig{ openai: Some(openai_config), ..Default::default() };
-/// let embeddings = embed_query(query, embedder).unwrap();
+/// let embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));
+/// let embeddings = embed_query(query, &embedder, None).unwrap();
 /// println!("{:?}", embeddings);
 /// ```
-/// This will output the embeddings of the queries using the OpenAI embedding model.
 pub async fn embed_query(
     query: Vec<String>,
     embedder: &Embedder,
@@ -81,7 +137,7 @@ pub async fn embed_query(
     let _chunk_size = config.chunk_size.unwrap_or(256);
     let batch_size = config.batch_size;
-    let encodings = embedder.embed(&query, batch_size).await.unwrap();
+    let encodings = embedder.embed(&query, batch_size).await?;
     let embeddings = get_text_metadata(&Rc::new(encodings), &query, &None)?;
 
     Ok(embeddings)
 }
@@ -108,13 +164,13 @@
 ///
 /// ```rust
 /// use embed_anything::embed_file;
+/// use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
+/// use embed_anything::embeddings::local::bert::BertEmbedder;
 ///
-/// let file_name = "test_files/test.pdf";
-/// let embedder = "Bert";
-/// let bert_config = BertConfig{ model_id: Some("sentence-transformers/all-MiniLM-L12-v2".to_string()), revision: None, chunk_size: Some(256) };
-/// let embeddings = embed_file(file_name, embedder, config).unwrap();
+/// let file_name = "path/to/file.pdf";
+/// let embedder = Embedder::Text(TextEmbedder::from(BertEmbedder::new("sentence-transformers/all-MiniLM-L12-v2".into(), None).unwrap()));
+/// let embeddings = embed_file(file_name, &embedder, None, None).unwrap();
 /// ```
-/// This will output the embeddings of the file using the OpenAI embedding model.
 pub async fn embed_file<T: AsRef<std::path::Path>, F>(
     file_name: T,
     embedder: &Embedder,
@@ -158,22 +214,12 @@ where
 /// # Example
 ///
 /// ```
-/// let embeddings = match embedder {
-///     "OpenAI" => webpage
-///         .embed_webpage(&embedding_model::openai::OpenAIEmbedder::default())
-///         .unwrap(),
-///     "Jina" => webpage
-///         .embed_webpage(&embedding_model::jina::JinaEmbedder::default())
-///         .unwrap(),
-///     "Bert" => webpage
-///         .embed_webpage(&embedding_model::bert::BertEmbedder::default())
-///         .unwrap(),
-///     _ => {
-///         return Err(PyValueError::new_err(
-///             "Invalid embedding model. Choose between OpenAI and AllMiniLmL12V2.",
-///         ))
-///     }
-/// };
+/// use embed_anything::embed_webpage;
+/// use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
+/// use embed_anything::embeddings::local::jina::JinaEmbedder;
+///
+/// let embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));
+/// let embeddings = embed_webpage("https://en.wikipedia.org/wiki/Embedding".into(), &embedder, None, None).unwrap();
 /// ```
 pub async fn embed_webpage<F>(
     url: String,
@@ -230,13 +276,19 @@ where
 /// # Example
 ///
 /// ```
-/// embed_html(
-///     "test_files/test.html",
-///     "https://example.com/",
-///     &Embedder::Text(TextEmbedder::Jina(JinaEmbedder::default())),
-///     Some(&config),
-///     None,
-/// )
+/// use embed_anything::embed_html;
+/// use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
+/// use embed_anything::embeddings::local::jina::JinaEmbedder;
+///
+/// async fn get_embeddings() {
+///     let embeddings = embed_html(
+///         "test_files/test.html",
+///         Some("https://example.com/"),
+///         &Embedder::from_pretrained_hf("JINA", "jinaai/jina-embeddings-v2-small-en", None).unwrap(),
+///         None,
+///         None,
+///     ).await.unwrap();
+/// }
 /// ```
 pub async fn embed_html(
     file_name: impl AsRef<std::path::Path>,
@@ -387,10 +439,13 @@ pub async fn emb_audio<T: AsRef<std::path::Path>>(
 /// use embed_anything::embed_image_directory;
 /// use std::path::PathBuf;
 /// use std::sync::Arc;
+/// use embed_anything::embeddings::embed::Embedder;
 ///
-/// let directory = PathBuf::from("/path/to/directory");
-/// let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap());
-/// let embeddings = embed_image_directory(directory, &embedder, None).await.unwrap();
+/// async fn embed_images() {
+///     let directory = PathBuf::from("/path/to/directory");
+///     let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap());
+///     let embeddings = embed_image_directory(directory, &embedder, None, None).await.unwrap();
+/// }
 /// ```
 /// This will output the embeddings of the images in the specified directory using the specified embedding model.
 ///
@@ -544,12 +599,16 @@ async fn process_images(
 /// use embed_anything::embed_directory_stream;
 /// use std::path::PathBuf;
 /// use std::sync::Arc;
-///
-/// let directory = PathBuf::from("/path/to/directory");
-/// let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap());
-/// let config = Some(TextEmbedConfig::default());
-/// let extensions = Some(vec!["txt".to_string(), "pdf".to_string()]);
-/// let embeddings = embed_directory_stream(directory, &embedder, extensions, config, None).await.unwrap();
+/// use embed_anything::config::TextEmbedConfig;
+/// use embed_anything::embeddings::embed::Embedder;
+///
+/// async fn generate_embeddings() {
+///     let directory = PathBuf::from("/path/to/directory");
+///     let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap());
+///     let config = Some(&TextEmbedConfig::default());
+///     let extensions = Some(vec!["txt".to_string(), "pdf".to_string()]);
+///     let embeddings = embed_directory_stream(directory, &embedder, extensions, config, None).await.unwrap();
+/// }
 /// ```
 /// This will output the embeddings of the files in the specified directory using the specified embedding model.
 pub async fn embed_directory_stream<F>(
diff --git a/rust/src/models/clip/mod.rs b/rust/src/models/clip/mod.rs
index b8cc2b56..f7f931e2 100644
--- a/rust/src/models/clip/mod.rs
+++ b/rust/src/models/clip/mod.rs
@@ -3,8 +3,8 @@
 //! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! https://github.com/openai/CLIP
-//! https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip
+//! <https://github.com/openai/CLIP>
+//! <https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip>
 use self::{
     text_model::{Activation, ClipTextTransformer},
     vision_model::ClipVisionTransformer,
diff --git a/rust/src/models/clip/text_model.rs b/rust/src/models/clip/text_model.rs
index fc9b5f23..a7fb49d0 100644
--- a/rust/src/models/clip/text_model.rs
+++ b/rust/src/models/clip/text_model.rs
@@ -3,8 +3,8 @@
 //! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! https://github.com/openai/CLIP
-//! https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip
+//! <https://github.com/openai/CLIP>
+//! <https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip>
 
 use candle_core::{DType, Device, IndexOp, Result, Tensor, D};
 use candle_nn as nn;
diff --git a/rust/src/models/clip/vision_model.rs b/rust/src/models/clip/vision_model.rs
index 6a9cb934..86684b3e 100644
--- a/rust/src/models/clip/vision_model.rs
+++ b/rust/src/models/clip/vision_model.rs
@@ -3,8 +3,8 @@
 //! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
 //! pairs of images with related texts.
 //!
-//! https://github.com/openai/CLIP
-//! https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip
+//! <https://github.com/openai/CLIP>
+//! <https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip>
 
 use candle_core::{IndexOp, Result, Shape, Tensor, D};
 use candle_nn as nn;
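
Taken together, the new docs describe one end-to-end flow: build a `TextEmbedConfig`, pick an `Embedder`, and pass both to one of the `embed_*` entry points. Below is a minimal sketch of that flow, assuming the API exactly as documented in the diff above (`TextEmbedConfig` with `..Default::default()`, `SplittingStrategy::Sentence`, `JinaEmbedder::default()`, and `embed_query` taking an `Option<&TextEmbedConfig>`). The `run` function, the sample queries, and the specific `chunk_size`/`overlap_ratio` values are illustrative only.

```rust
use embed_anything::config::TextEmbedConfig;
use embed_anything::embed_query;
use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
use embed_anything::embeddings::local::jina::JinaEmbedder;
use embed_anything::text_loader::SplittingStrategy;

// Illustrative driver: embed_query is async, so it has to be awaited from an async context.
async fn run() {
    // Override only the fields we care about; everything else keeps the defaults
    // documented on TextEmbedConfig (chunk_size 256, overlap_ratio 0.0, buffer_size 100, ...).
    let config = TextEmbedConfig {
        chunk_size: Some(512),
        overlap_ratio: Some(0.1),
        splitting_strategy: Some(SplittingStrategy::Sentence),
        ..Default::default()
    };

    // A local Jina embedder, constructed the same way as in the doc examples.
    let embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));

    // Embed a couple of ad-hoc queries with the custom config and print the result,
    // mirroring the embed_query doc example.
    let queries = vec!["Hello".to_string(), "World".to_string()];
    let embeddings = embed_query(queries, &embedder, Some(&config)).await.unwrap();
    println!("{:?}", embeddings);
}
```

Passing the config by reference (`Some(&config)`) matches the updated `embed_directory_stream` example, and any field left as `None` falls back to the defaults listed in the new field-level docs.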