diff --git a/rig-core/Cargo.toml b/rig-core/Cargo.toml index afb3c1d9..147502f8 100644 --- a/rig-core/Cargo.toml +++ b/rig-core/Cargo.toml @@ -27,6 +27,7 @@ rig-derive = { version = "0.1.0", path = "./rig-core-derive", optional = true } glob = "0.3.1" lopdf = { version = "0.34.0", optional = true } rayon = { version = "1.10.0", optional = true} +epub = { version = "2.1.2", optional = true } [dev-dependencies] anyhow = "1.0.75" @@ -39,6 +40,7 @@ tokio-test = "0.4.4" all = ["derive", "pdf", "rayon"] derive = ["dep:rig-derive"] pdf = ["dep:lopdf"] +epub = ["dep:epub"] rayon = ["dep:rayon"] [[test]] diff --git a/rig-core/src/loaders/epub.rs b/rig-core/src/loaders/epub.rs new file mode 100644 index 00000000..a8e8b5fd --- /dev/null +++ b/rig-core/src/loaders/epub.rs @@ -0,0 +1,412 @@ +use super::file::FileLoaderError; +use epub::doc::{DocError, EpubDoc}; +use thiserror::Error; + +use std::fs::File; +use std::io::BufReader; +use std::path::PathBuf; + +#[derive(Error, Debug)] +pub enum EpubLoaderError { + #[error("IO error: {0}")] + EpubError(#[from] DocError), + + #[error("File loader error: {0}")] + FileLoaderError(#[from] FileLoaderError), +} + +// ================================================================ +// Implementing Loadable trait for loading epubs +// ================================================================ + +pub(crate) trait Loadable { + fn load(self) -> Result>, EpubLoaderError>; + fn load_with_path(self) -> Result<(PathBuf, EpubDoc>), EpubLoaderError>; +} + +impl Loadable for PathBuf { + fn load(self) -> Result>, EpubLoaderError> { + EpubDoc::new(self).map_err(EpubLoaderError::EpubError) + } + + fn load_with_path(self) -> Result<(PathBuf, EpubDoc>), EpubLoaderError> { + let contents = EpubDoc::new(&self).map_err(EpubLoaderError::EpubError); + Ok((self, contents?)) + } +} + +impl Loadable for Result { + fn load(self) -> Result>, EpubLoaderError> { + self.map(|t| t.load())? 
+ } + fn load_with_path(self) -> Result<(PathBuf, EpubDoc>), EpubLoaderError> { + self.map(|t| t.load_with_path())? + } +} + +// ================================================================ +// EpubFileLoader definitions and implementations +// ================================================================ + +/// [EpubFileLoader] is a utility for loading epub files from the filesystem using glob patterns or +/// directory paths. It provides methods to read file contents and handle errors gracefully. +/// +/// # Errors +/// +/// This module defines a custom error type [EpubLoaderError] which can represent various errors +/// that might occur during file loading operations, such as any [FileLoaderError] alongside +/// specific EPUB-related errors. +/// +/// # Example Usage +/// +/// ```rust +/// use rig::loaders::EpubFileLoader; +/// +/// fn main() -> Result<(), Box<dyn std::error::Error>> { +/// // Create an EpubFileLoader using a glob pattern +/// let loader = EpubFileLoader::with_glob("tests/data/*.epub")?; +/// +/// // Load epub file contents by chapter, ignoring any errors +/// let contents = loader +/// .load_with_path() +/// .ignore_errors() +/// .by_chapter(); +/// +/// for (path, chapters) in contents { +/// println!("{}", path.display()); +/// for (idx, chapter) in chapters { +/// println!("Chapter {} begins", idx); +/// println!("{}", chapter); +/// println!("Chapter {} ends", idx); +/// } +/// } +/// +/// Ok(()) +/// } +/// ``` +/// +/// [EpubFileLoader] uses strict typing between the iterator methods to ensure that transitions +/// between different implementations of the loaders and its methods are handled properly by +/// the compiler. +pub struct EpubFileLoader<'a, T> { + iterator: Box + 'a>, +} + +type EpubLoaded = Result<(PathBuf, EpubDoc>), EpubLoaderError>; + +impl<'a> EpubFileLoader<'a, Result> { + /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob] + /// or [EpubFileLoader::with_dir]. 
Loaded EPUB documents are raw EPUB instances that can be + /// further processed (by chapter, etc). + /// + /// # Example + /// Load epub files in directory "tests/data/*.epub" and return the loaded documents + /// + /// ```rust + /// use rig::loaders::EpubFileLoader; + /// + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.load().into_iter(); + /// for result in content { + /// match result { + /// Ok(doc) => println!("{:?}", doc), + /// Err(e) => eprintln!("Error reading epub: {}", e), + /// } + /// } + /// ``` + pub fn load(self) -> EpubFileLoader<'a, Result>, EpubLoaderError>> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|res| res.load())), + } + } + + /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob] + /// or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances with their path + /// that can be further processed. + /// + /// # Example + /// Load epub files in directory "tests/data/*.epub" and return the loaded documents + /// + /// ```rust + /// use rig::loaders::EpubFileLoader; + /// + /// let content = EpubFileLoader::with_glob("tests/data/*.epub").unwrap().load_with_path().into_iter(); + /// for result in content { + /// match result { + /// Ok((path, doc)) => println!("{:?} {:?}", path, doc), + /// Err(e) => eprintln!("Error reading epub: {}", e), + /// } + /// } + /// ``` + pub fn load_with_path(self) -> EpubFileLoader<'a, EpubLoaded> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|res| res.load_with_path())), + } + } +} + +impl<'a> EpubFileLoader<'a, Result> { + /// Directly reads the contents of the epub files within the iterator returned by + /// [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir]. + /// + /// # Example + /// Read epub files in directory "tests/data/*.epub" and return the contents of the documents. 
+ /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.read().into_iter(); + /// for result in content { + /// match result { + /// Ok(content) => println!("{}", content), + /// Err(e) => eprintln!("Error reading epub: {}", e), + /// } + /// } + /// ``` + pub fn read(self) -> EpubFileLoader<'a, Result> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|res| { + let doc = res.load().map(EpubChapterIterator::from)?; + + Ok(doc.into_iter().collect::()) + })), + } + } + + /// Directly reads the contents of the epub files within the iterator returned by + /// [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir] and returns the path along with + /// the content. + /// + /// # Example + /// Read epub files in directory "tests/data/*.epub" and return the content and paths of the documents. + /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.read_with_path().into_iter(); + /// for result in content { + /// match result { + /// Ok((path, content)) => println!("{:?} {}", path, content), + /// Err(e) => eprintln!("Error reading epub: {}", e), + /// } + /// } + /// ``` + pub fn read_with_path(self) -> EpubFileLoader<'a, Result<(PathBuf, String), EpubLoaderError>> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|res| { + let (path, doc) = res.load_with_path()?; + Ok((path, EpubChapterIterator::from(doc).collect::())) + })), + } + } +} + +impl<'a> EpubFileLoader<'a, EpubDoc>> { + /// Chunks the chapters of a loaded document by chapter, flattened as a single vector. + /// + /// # Example + /// Load epub files in directory "tests/data/*.epub" and chunk all documents into their chapters. 
+ /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.load().ignore_errors().by_chapter().into_iter(); + /// for result in content { + /// println!("{}", result); + /// } + /// ``` + pub fn by_chapter(self) -> EpubFileLoader<'a, String> { + EpubFileLoader { + iterator: Box::new(self.iterator.flat_map(EpubChapterIterator::from)), + } + } +} + +type ByChapter = (PathBuf, Vec<(usize, String)>); +impl<'a> EpubFileLoader<'a, (PathBuf, EpubDoc>)> { + /// Chunks the chapters of a loaded document by chapter, processed as a vector of documents by path + /// with each document containing an inner vector of chapters by chapter number. + /// + /// # Example + /// Read epub files in directory "tests/data/*.epub" and chunk each document, keyed by path, into its chapters. + /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")? + /// .load_with_path() + /// .ignore_errors() + /// .by_chapter() + /// .into_iter(); + /// + /// for result in content { + /// println!("{:?}", result); + /// } + /// ``` + pub fn by_chapter(self) -> EpubFileLoader<'a, ByChapter> { + EpubFileLoader { + iterator: Box::new(self.iterator.map(|doc| { + let (path, doc) = doc; + + ( + path, + EpubChapterIterator::from(doc) + .enumerate() + .collect::>(), + ) + })), + } + } +} + +impl<'a, T: 'a> EpubFileLoader<'a, Result> { + /// Ignores errors in the iterator, returning only successful results. This can be used on any + /// [EpubFileLoader] state of iterator whose items are results. + /// + /// # Example + /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files. 
+ /// + /// ```rust + /// let content = EpubFileLoader::with_glob("tests/data/*.epub")?.read().ignore_errors().into_iter(); + /// for result in content { + /// println!("{}", result); + /// } + /// ``` + pub fn ignore_errors(self) -> EpubFileLoader<'a, T> { + EpubFileLoader { + iterator: Box::new(self.iterator.filter_map(|res| res.ok())), + } + } +} + +impl EpubFileLoader<'_, Result> { + /// Creates a new [EpubFileLoader] using a glob pattern to match files. + /// + /// # Example + /// Create an [EpubFileLoader] for all `.epub` files that match the glob "tests/data/*.epub". + /// + /// ```rust + /// let loader = EpubFileLoader::with_glob("tests/data/*.epub")?; + /// ``` + pub fn with_glob( + pattern: &str, + ) -> Result>, EpubLoaderError> { + let paths = glob::glob(pattern).map_err(FileLoaderError::PatternError)?; + + Ok(EpubFileLoader { + iterator: Box::new(paths.into_iter().map(|path| { + path.map_err(FileLoaderError::GlobError) + .map_err(EpubLoaderError::FileLoaderError) + })), + }) + } + + /// Creates a new [EpubFileLoader] on all files within a directory. + /// + /// # Example + /// Create an [EpubFileLoader] for all files that are in the directory "files". 
+ /// + /// ```rust + /// let loader = EpubFileLoader::with_dir("files")?; + /// ``` + pub fn with_dir( + directory: &str, + ) -> Result>, EpubLoaderError> { + let paths = std::fs::read_dir(directory).map_err(FileLoaderError::IoError)?; + + Ok(EpubFileLoader { + iterator: Box::new( + paths + .into_iter() + .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())), + ), + }) + } +} + +// ================================================================ +// EpubFileLoader iterator implementations +// ================================================================ +pub struct IntoIter<'a, T> { + iterator: Box + 'a>, +} + +impl<'a, T> IntoIterator for EpubFileLoader<'a, T> { + type Item = T; + type IntoIter = IntoIter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + IntoIter { + iterator: self.iterator, + } + } +} + +impl Iterator for IntoIter<'_, T> { + type Item = T; + + fn next(&mut self) -> Option { + self.iterator.next() + } +} + +// ================================================================ +// EpubChapterIterator definitions and implementations +// ================================================================ + +struct EpubChapterIterator { + epub: EpubDoc>, + finished: bool, +} + +impl From>> for EpubChapterIterator { + fn from(epub: EpubDoc>) -> Self { + Self::new(epub) + } +} + +impl EpubChapterIterator { + fn new(epub: EpubDoc>) -> Self { + Self { + epub, + finished: false, + } + } +} + +impl Iterator for EpubChapterIterator { + type Item = String; + + fn next(&mut self) -> Option { + if self.finished { + return None; + } + + // ignore empty chapters if they exist + while !self.finished { + let chapter = self.epub.get_current_str(); + + if !self.epub.go_next() { + self.finished = true; + } + + if let Some((text, _)) = chapter { + return Some(text); + } + } + + None + } +} + +#[cfg(test)] +mod tests { + use super::EpubFileLoader; + + #[test] + fn test_epub_loader() { + let loader = EpubFileLoader::with_glob("tests/data/*.epub").unwrap(); 
+ let actual = loader + .load_with_path() + .ignore_errors() + .by_chapter() + .into_iter() + .collect::>(); + + assert_eq!(actual.len(), 1); + + let (_, chapters) = &actual[0]; + assert_eq!(chapters.len(), 3); + } +} diff --git a/rig-core/src/loaders/mod.rs b/rig-core/src/loaders/mod.rs index 6611819e..6ae033f0 100644 --- a/rig-core/src/loaders/mod.rs +++ b/rig-core/src/loaders/mod.rs @@ -9,6 +9,12 @@ //! and keeping track of the page numbers along with their contents. //! //! Note: The [PdfFileLoader] requires the `pdf` feature to be enabled in the `Cargo.toml` file. +//! +//! The [EpubFileLoader] works similarly to the [FileLoader], but is specifically designed to load EPUB +//! files. This loader also provides EPUB-specific preprocessing methods for splitting the EPUB into chapters +//! and keeping track of the chapter numbers along with their contents. +//! +//! Note: The [EpubFileLoader] requires the `epub` feature to be enabled in the `Cargo.toml` file. pub mod file; @@ -19,3 +25,9 @@ pub mod pdf; #[cfg(feature = "pdf")] pub use pdf::PdfFileLoader; + +#[cfg(feature = "epub")] +pub mod epub; + +#[cfg(feature = "epub")] +pub use epub::EpubFileLoader; diff --git a/rig-core/tests/data/dummy.epub b/rig-core/tests/data/dummy.epub new file mode 100644 index 00000000..4f2fa72e Binary files /dev/null and b/rig-core/tests/data/dummy.epub differ