Merge pull request #106 from boswelja/documentation
Rewrite Rust documentation - part 1
akshayballal95 authored Jan 28, 2025
2 parents 7d849cc + 0b83f14 commit 2488f88
Showing 7 changed files with 156 additions and 55 deletions.
44 changes: 43 additions & 1 deletion rust/src/config.rs
@@ -2,14 +2,56 @@ use std::sync::Arc;

use crate::{embeddings::embed::Embedder, text_loader::SplittingStrategy};

/// Configuration for text embedding.
///
/// # Example: Creating a new instance
///
/// ```rust
/// use embed_anything::config::TextEmbedConfig;
/// use embed_anything::text_loader::SplittingStrategy;
/// let config = TextEmbedConfig::new(
/// Some(512),
/// Some(128),
/// Some(100),
/// Some(0.0),
/// Some(SplittingStrategy::Sentence),
/// None,
/// Some(true)
/// );
/// ```
///
/// # Example: Overriding a single default
///
/// ```rust
/// use embed_anything::config::TextEmbedConfig;
/// use embed_anything::text_loader::SplittingStrategy;
/// let config = TextEmbedConfig {
/// splitting_strategy: Some(SplittingStrategy::Semantic),
/// ..Default::default()
/// };
/// ```
#[derive(Clone)]
pub struct TextEmbedConfig {
/// Controls the size of each "chunk" of data that your input text gets split into. Defaults to
/// 256.
pub chunk_size: Option<usize>,
/// Controls the ratio of overlapping data across "chunks" of your input text. For example, a
/// chunk size of 256 with an overlap ratio of 0.25 means consecutive chunks share roughly 64
/// units of data. Defaults to 0.0, or no overlap.
pub overlap_ratio: Option<f32>,
/// Controls the size of each "batch" of data sent to the embedder. The default value depends
/// largely on the embedder, but will be set to 32 when using [TextEmbedConfig::default()].
pub batch_size: Option<usize>,
/// When using an adapter, this controls the size of the buffer. Defaults to 100.
pub buffer_size: Option<usize>,
/// Controls how documents are split into segments. See [SplittingStrategy] for options.
/// Defaults to [SplittingStrategy::Sentence].
pub splitting_strategy: Option<SplittingStrategy>,
/// Allows overriding the embedder used when the splitting strategy is
/// [SplittingStrategy::Semantic]. Defaults to JINA.
pub semantic_encoder: Option<Arc<Embedder>>,
/// When embedding a PDF, controls whether **o**ptical **c**haracter **r**ecognition is used on
/// the PDF to extract text. This process involves rendering the PDF as a series of images, and
/// extracting text from the images. Defaults to false.
pub use_ocr: Option<bool>,
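/// Optional path to the Tesseract executable used for OCR when [TextEmbedConfig::use_ocr] is
/// enabled.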
pub tesseract_path: Option<String>,
}
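As a worked sketch of how the fields above combine (all values are illustrative; any field left as `None` falls back to its documented default, and the tesseract path is a placeholder):

```rust
use embed_anything::config::TextEmbedConfig;

// Chunk into 512-unit pieces with 25% overlap (consecutive chunks share
// roughly 128 units), send 64 chunks per embedder call, and OCR scanned
// PDFs via a placeholder tesseract path.
let config = TextEmbedConfig {
    chunk_size: Some(512),
    overlap_ratio: Some(0.25),
    batch_size: Some(64),
    use_ocr: Some(true),
    tesseract_path: Some("/usr/bin/tesseract".to_string()),
    ..Default::default()
};
```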
2 changes: 1 addition & 1 deletion rust/src/embeddings/cloud/cohere.rs
@@ -36,7 +36,7 @@ impl CohereEmbedder {
///
/// # Arguments
///
/// * `model` - A string slice that holds the model to be used for embedding. Find available models at <https://docs.cohere.com/docs/cohere-embed>
/// * `api_key` - An optional string slice that holds the API key for authenticating requests to the Cohere API.
///
/// # Returns
4 changes: 2 additions & 2 deletions rust/src/embeddings/embed.rs
@@ -205,8 +205,8 @@ impl TextEmbedder {
/// - "cohere"
///
/// * `model_id` - A string that holds the model ID of the model to be used for embedding.
/// - For OpenAI, find available models at <https://platform.openai.com/docs/guides/embeddings/embedding-models>
/// - For Cohere, find available models at <https://docs.cohere.com/docs/cohere-embed>
/// * `api_key` - An optional string that holds the API key for authenticating requests to the cloud API. If not provided, it is read from an environment variable:
/// - For OpenAI, create environment variable `OPENAI_API_KEY`
/// - For Cohere, create environment variable `CO_API_KEY`
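Putting the provider list and environment-variable rules together, a minimal sketch at the [Embedder] level (using the `from_pretrained_cloud` constructor shown at the crate root; the provider string and model id are illustrative, the latter taken from the Cohere docs linked above):

```rust
use embed_anything::embeddings::embed::Embedder;

// A sketch only: with api_key = None, the key is read from the CO_API_KEY
// environment variable described above.
let embedder = Embedder::from_pretrained_cloud("cohere", "embed-english-v3.0", None);
```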
149 changes: 104 additions & 45 deletions rust/src/lib.rs
@@ -1,5 +1,62 @@
#![doc(html_favicon_url = "https://raw.githubusercontent.com/StarlightSearch/EmbedAnything/refs/heads/main/docs/assets/icon.ico")]
#![doc(html_logo_url = "https://raw.githubusercontent.com/StarlightSearch/EmbedAnything/refs/heads/main/docs/assets/Square310x310Logo.png")]
#![doc(issue_tracker_base_url = "https://github.com/StarlightSearch/EmbedAnything/issues/")]
//! embed_anything is a minimalist, highly performant, lightning-fast, lightweight, multisource,
//! multimodal, and local embedding pipeline.
//!
//! Whether you're working with text, images, audio, PDFs, websites, or other media, embed_anything
//! streamlines the process of generating embeddings from a variety of sources and streaming them,
//! with memory-efficient indexing, to a vector database.
//!
//! It supports dense, sparse, [ONNX](https://github.com/onnx/onnx) and late-interaction embeddings,
//! offering flexibility for a wide range of use cases.
//!
//! # Usage
//!
//! ## Creating an [Embedder]
//!
//! To get started, you'll need to create an [Embedder] for the type of content you want to embed.
//! We offer some utility functions to streamline creating embedders from various sources, such as
//! [Embedder::from_pretrained_hf], [Embedder::from_pretrained_onnx], and
//! [Embedder::from_pretrained_cloud]. You can use any of these to quickly create an Embedder like so:
//!
//! ```rust
//! use embed_anything::embeddings::embed::Embedder;
//!
//! // Create a local CLIP embedder from a Hugging Face model
//! let clip_embedder = Embedder::from_pretrained_hf("CLIP", "jina-clip-v2", None);
//!
//! // Create a cloud OpenAI embedder
//! let openai_embedder = Embedder::from_pretrained_cloud("OpenAI", "text-embedding-3-small", Some("my-api-key".to_string()));
//! ```
//!
//! If needed, you can also construct an instance of [Embedder] manually, which also lets you plug
//! in your own embedder. Here's an example of creating one directly:
//!
//! ```rust
//! use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
//! use embed_anything::embeddings::local::jina::JinaEmbedder;
//!
//! let jina_embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));
//! ```
//!
//! ## Generate embeddings
//!
//! ### Example: Embed a text file
//!
//! Let's see how embed_anything can help us generate embeddings from a plain text file:
//!
//! ```rust
//! use embed_anything::embed_file;
//! use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
//! use embed_anything::embeddings::local::jina::JinaEmbedder;
//!
//! async fn example() {
//!     // Create an Embedder for text. We support a variety of models out-of-the-box, including cloud-based models!
//!     let embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));
//!     // Generate embeddings for 'path/to/file.txt' using the embedder we just created.
//!     let embedding = embed_file("path/to/file.txt", &embedder, None, None).await.unwrap();
//! }
//! ```
pub mod chunkers;
pub mod config;
pub mod embeddings;
@@ -62,15 +119,14 @@ pub enum Dtype {
///
/// ```
/// use embed_anything::embed_query;
/// use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
/// use embed_anything::embeddings::local::jina::JinaEmbedder;
///
/// async fn example() {
///     let query = vec!["Hello".to_string(), "World".to_string()];
///     let embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));
///     let embeddings = embed_query(query, &embedder, None).await.unwrap();
///     println!("{:?}", embeddings);
/// }
/// ```
pub async fn embed_query(
query: Vec<String>,
embedder: &Embedder,
@@ -81,7 +137,7 @@ pub async fn embed_query(
let _chunk_size = config.chunk_size.unwrap_or(256);
let batch_size = config.batch_size;

let encodings = embedder.embed(&query, batch_size).await?;
let embeddings = get_text_metadata(&Rc::new(encodings), &query, &None)?;

Ok(embeddings)
@@ -108,13 +164,13 @@ pub async fn embed_query(
///
/// ```rust
/// use embed_anything::embed_file;
/// use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
/// use embed_anything::embeddings::local::bert::BertEmbedder;
///
/// async fn example() {
///     let file_name = "path/to/file.pdf";
///     let embedder = Embedder::Text(TextEmbedder::from(BertEmbedder::new("sentence-transformers/all-MiniLM-L12-v2".into(), None).unwrap()));
///     let embeddings = embed_file(file_name, &embedder, None, None).await.unwrap();
/// }
/// ```
pub async fn embed_file<T: AsRef<std::path::Path>, F>(
file_name: T,
embedder: &Embedder,
@@ -158,22 +214,12 @@ where
/// # Example
///
/// ```
/// use embed_anything::embed_webpage;
/// use embed_anything::embeddings::embed::{Embedder, TextEmbedder};
/// use embed_anything::embeddings::local::jina::JinaEmbedder;
///
/// async fn example() {
///     let embedder = Embedder::Text(TextEmbedder::Jina(Box::new(JinaEmbedder::default())));
///     let embeddings = embed_webpage("https://en.wikipedia.org/wiki/Embedding".into(), &embedder, None, None).await.unwrap();
/// }
/// ```
pub async fn embed_webpage<F>(
url: String,
@@ -230,13 +276,19 @@ where
/// # Example
///
/// ```
/// use embed_anything::embed_html;
/// use embed_anything::embeddings::embed::Embedder;
///
/// async fn get_embeddings() {
/// let embeddings = embed_html(
/// "test_files/test.html",
/// Some("https://example.com/"),
/// &Embedder::from_pretrained_hf("JINA", "jinaai/jina-embeddings-v2-small-en", None).unwrap(),
/// None,
/// None,
/// ).await.unwrap();
/// }
/// ```
pub async fn embed_html(
file_name: impl AsRef<std::path::Path>,
@@ -387,10 +439,13 @@ pub async fn emb_audio<T: AsRef<std::path::Path>>(
/// use embed_anything::embed_image_directory;
/// use std::path::PathBuf;
/// use std::sync::Arc;
/// use embed_anything::embeddings::embed::Embedder;
///
/// async fn embed_images() {
/// let directory = PathBuf::from("/path/to/directory");
/// let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap());
/// let embeddings = embed_image_directory(directory, &embedder, None, None).await.unwrap();
/// }
/// ```
/// This will output the embeddings of the images in the specified directory using the specified embedding model.
///
Expand Down Expand Up @@ -544,12 +599,16 @@ async fn process_images<E: EmbedImage>(
/// use embed_anything::embed_directory_stream;
/// use std::path::PathBuf;
/// use std::sync::Arc;
/// use embed_anything::config::TextEmbedConfig;
/// use embed_anything::embeddings::embed::Embedder;
///
/// async fn generate_embeddings() {
/// let directory = PathBuf::from("/path/to/directory");
/// let embedder = Arc::new(Embedder::from_pretrained_hf("clip", "openai/clip-vit-base-patch16", None).unwrap());
/// let config = Some(&TextEmbedConfig::default());
/// let extensions = Some(vec!["txt".to_string(), "pdf".to_string()]);
/// let embeddings = embed_directory_stream(directory, &embedder, extensions, config, None).await.unwrap();
/// }
/// ```
/// This will output the embeddings of the files in the specified directory using the specified embedding model.
pub async fn embed_directory_stream<F>(
4 changes: 2 additions & 2 deletions rust/src/models/clip/mod.rs
@@ -3,8 +3,8 @@
//! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
//! pairs of images with related texts.
//!
//! <https://github.com/openai/CLIP>
//! <https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip>
use self::{
text_model::{Activation, ClipTextTransformer},
vision_model::ClipVisionTransformer,
4 changes: 2 additions & 2 deletions rust/src/models/clip/text_model.rs
@@ -3,8 +3,8 @@
//! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
//! pairs of images with related texts.
//!
//! <https://github.com/openai/CLIP>
//! <https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip>
use candle_core::{DType, Device, IndexOp, Result, Tensor, D};
use candle_nn as nn;
4 changes: 2 additions & 2 deletions rust/src/models/clip/vision_model.rs
@@ -3,8 +3,8 @@
//! Contrastive Language-Image Pre-Training (CLIP) is an architecture trained on
//! pairs of images with related texts.
//!
//! <https://github.com/openai/CLIP>
//! <https://github.com/huggingface/transformers/tree/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip>
use candle_core::{IndexOp, Result, Shape, Tensor, D};
use candle_nn as nn;
