diff --git a/Cargo.lock b/Cargo.lock index b2506501..e03479fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -974,7 +974,7 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "embed_anything" -version = "0.4.13" +version = "0.4.18" dependencies = [ "accelerate-src", "anyhow", @@ -995,7 +995,6 @@ dependencies = [ "intel-mkl-src", "itertools 0.13.0", "lazy_static", - "markdown-parser", "markdown_to_text", "ndarray 0.16.1", "ndarray-linalg", @@ -1026,7 +1025,7 @@ dependencies = [ [[package]] name = "embed_anything_python" -version = "0.4.17" +version = "0.4.18" dependencies = [ "embed_anything", "pyo3", @@ -1126,26 +1125,6 @@ dependencies = [ "syn 2.0.90", ] -[[package]] -name = "enum-iterator" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c79a6321a1197d7730510c7e3f6cb80432dfefecb32426de8cea0aa19b4bb8d7" -dependencies = [ - "enum-iterator-derive", -] - -[[package]] -name = "enum-iterator-derive" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e94aa31f7c0dc764f57896dc615ddd76fc13b0d5dca7eb6cc5e018a5a09ec06" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -1614,7 +1593,7 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap 2.6.0", + "indexmap", "slab", "tokio", "tokio-util", @@ -1669,12 +1648,6 @@ dependencies = [ "ahash 0.7.8", ] -[[package]] -name = "hashbrown" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" - [[package]] name = "hashbrown" version = "0.15.2" @@ -2054,16 +2027,6 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0263a3d970d5c054ed9312c0057b4f3bde9c0b33836d3637361d4a9e6e7a408" -[[package]] -name = "indexmap" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" -dependencies = [ - "autocfg 1.4.0", - "hashbrown 0.12.3", -] - [[package]] name = "indexmap" version = "2.6.0" @@ -2390,24 +2353,6 @@ dependencies = [ "libc", ] -[[package]] -name = "markdown-parser" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf523952b36c9ad1a650d608d38187f12f4ad1b9e402a8c9df79743989289def" -dependencies = [ - "enum-iterator", - "enum-iterator-derive", - "getset", - "lazy_static", - "quick-error", - "regex", - "serde", - "serde_json", - "serde_yaml", - "toml 0.5.11", -] - [[package]] name = "markdown_to_text" version = "1.0.0" @@ -2918,7 +2863,7 @@ dependencies = [ "sha2", "tar", "thiserror", - "toml 0.8.19", + "toml", "ureq", "url", "uuid", @@ -4110,18 +4055,6 @@ dependencies = [ "serde", ] -[[package]] -name = "serde_yaml" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578a7433b776b56a35785ed5ce9a7e777ac0598aac5a6dd1b4b18a307c7fc71b" -dependencies = [ - "indexmap 1.9.3", - "ryu", - "serde", - "yaml-rust", -] - [[package]] name = "servo_arc" version = "0.3.0" @@ -4608,7 +4541,7 @@ dependencies = [ "cfg-expr", "heck", "pkg-config", - "toml 0.8.19", + "toml", "version-compare", ] @@ -4887,15 +4820,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "toml" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" -dependencies = [ - "serde", -] - [[package]] name = "toml" version = "0.8.19" @@ -4923,7 +4847,7 @@ version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ - "indexmap 2.6.0", + "indexmap", "serde", "serde_spanned", "toml_datetime", @@ -5636,15 +5560,6 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" -[[package]] -name = "yaml-rust" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" -dependencies = [ - "linked-hash-map", -] - [[package]] name = "yoke" version = "0.7.5" @@ -5750,7 +5665,7 @@ dependencies = [ "crossbeam-utils", "displaydoc", "flate2", - "indexmap 2.6.0", + "indexmap", "num_enum", "thiserror", ] diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 3c1e59b3..c3598d43 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "embed_anything" -version = "0.4.13" +version = "0.4.18" edition.workspace = true license.workspace = true description.workspace = true @@ -49,7 +49,6 @@ tokio = { version = "1.40.0", features = ["macros", "rt-multi-thread"] } # Markdown Processing -markdown-parser = "0.1.2" markdown_to_text = "1.0.0" # Web Scraping diff --git a/rust/src/file_processor/markdown_processor.rs b/rust/src/file_processor/markdown_processor.rs index b68a73dd..6b36f623 100644 --- a/rust/src/file_processor/markdown_processor.rs +++ b/rust/src/file_processor/markdown_processor.rs @@ -1,5 +1,4 @@ use anyhow::Error; -use markdown_parser::read_file; /// A struct that provides functionality to process Markdown files. pub struct MarkdownProcessor; @@ -16,9 +15,9 @@ impl MarkdownProcessor { /// Returns a `Result` containing the extracted text content as a `String` if successful, /// or an `Error` if an error occurred while reading the file or converting the Markdown. pub fn extract_text>(file_path: &T) -> Result { - let md = read_file(file_path)?; - let content = md.content(); - let content = markdown_to_text::convert(content); + let bytes = std::fs::read(file_path)?; + let out = String::from_utf8_lossy(&bytes).to_string(); + let content = markdown_to_text::convert(&out); Ok(content) } }