From 474c73b9d31ad5258117c8842c9b920875e50007 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 8 Dec 2024 13:54:14 +0800 Subject: [PATCH 01/22] feat: support japanese & korea tokenizer for fts --- Cargo.lock | 533 ++++++++++++++++-- Cargo.toml | 2 + python/Cargo.lock | 418 ++++++++++++++ python/Cargo.toml | 2 +- python/python/tests/test_scalar_index.py | 38 ++ rust/lance-index/Cargo.toml | 9 + .../src/scalar/inverted/tokenizer.rs | 26 + 7 files changed, 984 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f26e23854..42127f1390 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -148,9 +154,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.86" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" [[package]] name = "approx" @@ -397,7 +403,7 @@ dependencies = [ "memchr", "num", "regex", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", ] [[package]] @@ -951,7 +957,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -978,6 +984,15 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -1530,9 +1545,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -1549,6 +1564,41 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.89", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.89", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -1850,7 +1900,7 @@ dependencies = [ "itertools 0.13.0", "log", "paste", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", ] [[package]] @@ -2022,6 +2072,37 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.89", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.89", +] + [[package]] name = "diff" version = "0.1.13" @@ -2090,6 +2171,88 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "env_filter" version = "0.1.2" @@ -2261,12 +2424,12 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.35" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.8.0", ] [[package]] @@ -2275,6 +2438,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2761,6 +2939,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.4.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.7" @@ -2813,6 +3007,12 @@ dependencies = [ "cc", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.5.0" @@ -2991,6 +3191,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "kv-log-macro" version = "1.0.7" @@ -3338,6 +3547,8 @@ dependencies = [ "lance-table", "lance-testing", "lazy_static", + "lindera", + "lindera-tantivy", "log", "moka", "num-traits", @@ -3637,6 +3848,137 @@ dependencies = [ "libc", ] +[[package]] +name = "lindera" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fff887f4b98539fb5f879ede50e17eb7eaafa5622c252cffe8280f42cafc6b7d" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-cc-cedict", + "lindera-dictionary", + "lindera-ipadic", + "lindera-ipadic-neologd", + "lindera-ko-dic", + "lindera-unidic", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum", + "strum_macros", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-cc-cedict" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-dictionary" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec716483ceb95aa84ac262cb766eef314b24257c343ca230daa71f856a278fe4" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + 
"flate2", + "glob", + "log", + "once_cell", + "reqwest", + "serde", + "tar", + "thiserror 2.0.3", + "yada", +] + +[[package]] +name = "lindera-ipadic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ipadic-neologd" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-tantivy" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261c87882a909fd17db4dd797e4dc2aac3992bdbbb4e2900d1362a1e0746266f" +dependencies = [ + "lindera", + "tantivy", + "tantivy-tokenizer-api", +] + +[[package]] +name = "lindera-unidic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = "linux-raw-sys" version = "0.3.8" @@ -3768,6 +4110,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.1" @@ -3851,6 +4202,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nix" version = "0.26.4" @@ -4070,12 +4438,50 @@ version = "11.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" +[[package]] +name = "openssl" +version = "0.10.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4525,7 +4931,7 @@ dependencies = [ "rand", "rand_chacha", "rand_xorshift", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", "rusty-fork", "tempfile", "unarray", @@ -4878,14 +5284,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.6" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.7", - "regex-syntax 0.8.4", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", ] [[package]] @@ -4899,13 +5305,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", ] [[package]] @@ -4922,9 +5328,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "regress" @@ -4954,12 +5360,13 @@ checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" [[package]] name = "reqwest" -version = "0.12.7" +version = "0.12.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" +checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", "h2 0.4.6", @@ -4968,24 +5375,28 @@ dependencies = [ "http-body-util", "hyper 1.4.1", "hyper-rustls 0.27.3", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", "quinn", "rustls 0.23.12", - "rustls-native-certs 0.7.3", + "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", "sync_wrapper", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.0", "tokio-util", "tower-service", @@ -5164,19 +5575,6 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-native-certs" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" -dependencies = [ - "openssl-probe", - "rustls-pemfile 2.1.3", - "rustls-pki-types", - "schannel", - "security-framework", -] - [[package]] name = "rustls-native-certs" version = "0.8.0" @@ -5805,6 +6203,27 @@ dependencies = [ "futures-core", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -5907,7 +6326,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" dependencies = [ "byteorder", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", "utf8-ranges", ] @@ -5960,9 +6379,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.41" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" +checksum = "c65998313f8e17d0d553d28f91a0df93e4dbbbf770279c7bc21ca0f09ea1a1f6" dependencies = [ "filetime", "libc", @@ -6176,9 +6595,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", @@ -6202,6 +6621,16 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -6506,6 +6935,12 @@ version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.14" @@ -6514,9 +6949,9 @@ checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -6612,6 +7047,12 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -7090,6 +7531,12 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + [[package]] name = "yansi" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index 84c183579c..d0ae9e3f19 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -143,6 +143,8 @@ serde_json = { version = "1" } shellexpand = "3.0" snafu = "0.7.5" tantivy = { version = "0.22.0", features = ["stopwords"] } +lindera = { version = "0.38.1"} +lindera-tantivy = { version = "0.38.1"} tempfile = "3" test-log = { version = "0.2.15" } tokio = { version = "1.23", features = [ diff --git a/python/Cargo.lock b/python/Cargo.lock index fcd28fd2fd..7a47c2c6cf 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -880,6 +880,15 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1283,6 +1292,41 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.90", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.90", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -1747,6 +1791,37 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.90", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.90", +] + [[package]] name = "digest" version = "0.10.7" @@ -1814,6 +1889,88 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -1943,6 +2100,21 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2411,6 +2583,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.5.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.10" @@ -2580,6 +2768,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -2719,6 +2913,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "kv-log-macro" version = "1.0.7" @@ -2988,6 +3191,8 @@ dependencies = [ "lance-linalg", "lance-table", "lazy_static", + "lindera", + "lindera-tantivy", "log", "moka", "num-traits", @@ -3207,6 +3412,95 @@ dependencies = [ "redox_syscall", ] +[[package]] +name = "lindera" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fff887f4b98539fb5f879ede50e17eb7eaafa5622c252cffe8280f42cafc6b7d" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-dictionary", + "lindera-ipadic", + "lindera-ko-dic", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum", + "strum_macros", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-dictionary" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec716483ceb95aa84ac262cb766eef314b24257c343ca230daa71f856a278fe4" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + "flate2", + "glob", + "log", + "once_cell", + "reqwest", + "serde", + "tar", + "thiserror 2.0.4", + "yada", +] + +[[package]] +name = "lindera-ipadic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-tantivy" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261c87882a909fd17db4dd797e4dc2aac3992bdbbb4e2900d1362a1e0746266f" +dependencies = [ + "lindera", + "tantivy", + "tantivy-tokenizer-api", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -3394,6 +3688,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "noisy_float" version = "0.2.0" @@ -3586,12 +3897,50 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e296cf87e61c9cfc1a61c3c63a0f7f286ed4554e0e22be84e8a38e1d264a2a29" +[[package]] +name = "openssl" +version = "0.10.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4446,6 +4795,7 @@ checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", "h2 0.4.7", @@ -4454,11 +4804,13 @@ dependencies = [ "http-body-util", "hyper 1.5.1", "hyper-rustls 0.27.3", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -4471,7 +4823,9 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.0", "tokio-util", "tower-service", @@ -5042,6 +5396,12 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.3" @@ -5136,6 +5496,27 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -5511,6 +5892,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -5705,12 +6096,27 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + [[package]] name = 
"unicode-segmentation" version = "1.12.0" @@ -5814,6 +6220,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ef4c4aa54d5d05a279399bfa921ec387b7aba77caf7a682ae8d86785b8fdad2" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -6238,6 +6650,12 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + [[package]] name = "yoke" version = "0.7.5" diff --git a/python/Cargo.toml b/python/Cargo.toml index e9e9f867c4..7f0e349bbf 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index" } +lance-index = { path = "../rust/lance-index", features = ["lindera-tantivy", "lindera-tantivy-ko-dic", "lindera-tantivy-ipadic"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 3777c90d48..52ad548cb8 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -307,6 +307,44 @@ def test_fts_all_deleted(dataset): dataset.to_table(full_text_query=first_row_doc) +def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path): + data = pa.table( + { + "text": [ + "成田国際空港", + "東京国際空港", + "羽田空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera-ipadic") + + results = ds.to_table( + full_text_query="成田", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [0] + + +def test_indexed_filter_with_fts_index_with_lindera_ko_tokenizer(tmp_path): + data = pa.table( + { + "text": ["하네다공항한정토트백", "나리타공항한정토트백"], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera-ko-dic") + + results = ds.to_table( + full_text_query="나리타", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [1] + + def test_bitmap_index(tmp_path: Path): """Test create bitmap index""" tbl = pa.Table.from_arrays( diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 12d38e5678..684731ab0a 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -50,6 +50,8 @@ serde_json.workspace = true serde.workspace = true snafu.workspace = true tantivy.workspace = true +lindera = { workspace = true, optional = true } +lindera-tantivy = { workspace = true, optional = true } tokio.workspace = true tracing.workspace = true tempfile.workspace = true @@ -68,6 +70,13 @@ test-log.workspace = true datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } +[features] +lindera-tantivy-ipadic = ["lindera", "lindera-tantivy", "lindera-tantivy/ipadic"] +lindera-tantivy-ipadic-neologd = ["lindera", "lindera-tantivy", "lindera-tantivy/ipadic-neologd"] 
+lindera-tantivy-unidic = ["lindera", "lindera-tantivy", "lindera-tantivy/unidic"] +lindera-tantivy-ko-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/ko-dic"] +lindera-tantivy-cc-cedict = ["lindera", "lindera-tantivy", "lindera-tantivy/cc-cedict"] + [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 440def7a5a..0f7e0f1a9f 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -12,6 +12,8 @@ pub struct TokenizerConfig { /// - `simple`: splits tokens on whitespace and punctuation /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization + /// - `lindera-tantivy-ipadic`: Japanese tokenizer + /// - `lindera-tantivy-ko-dic`: Korea tokenizer /// /// `simple` is recommended for most cases and the default value base_tokenizer: String, @@ -141,9 +143,33 @@ fn build_base_tokenizer_builder(name: &str) -> Result build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADIC), + #[cfg(feature = "lindera-tantivy-ipadic-neologd")] + "lindera-ipadic-neologd" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADICNEologd), + #[cfg(feature = "lindera-tantivy-unidic")] + "lindera-unidic" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::UniDic), + #[cfg(feature = "lindera-tantivy-ko-dic")] + "lindera-ko-dic" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::KoDic), + #[cfg(feature = "lindera-tantivy-cc-cedict")] + "lindera-cc-cedict" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::CcCedict), _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), location!(), )), } } + +#[cfg(feature = "lindera-tantivy")] +fn build_lindera_tokenizer_builder(dic: lindera::dictionary::DictionaryKind) -> Result { + use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter}; + use lindera_tantivy::tokenizer::LinderaTokenizer; + let mode = Mode::Normal; + let dictionary = load_dictionary_from_kind(dic).unwrap(); + let user_dictionary = None; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder( + tokenizer, + ).dynamic()) +} From f36311ce9797a42b9a1be125564c8ecbd1b2883a Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 8 Dec 2024 19:43:11 +0800 Subject: [PATCH 02/22] lindera tmp --- python/Cargo.lock | 42 +++++++++++++++++++ python/Cargo.toml | 2 +- rust/lance-index/Cargo.toml | 10 +++-- .../src/scalar/inverted/tokenizer.rs | 40 ++++++++++++++---- 4 files changed, 80 insertions(+), 14 deletions(-) diff --git a/python/Cargo.lock b/python/Cargo.lock index 7a47c2c6cf..591dd89c11 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3423,9 +3423,12 @@ dependencies = [ "byteorder", "csv", "kanaria", + "lindera-cc-cedict", "lindera-dictionary", "lindera-ipadic", + "lindera-ipadic-neologd", "lindera-ko-dic", + "lindera-unidic", "once_cell", "regex", "serde", @@ -3439,6 +3442,19 @@ dependencies = [ "yada", ] +[[package]] +name = "lindera-cc-cedict" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = 
"lindera-dictionary" version = "0.38.1" @@ -3477,6 +3493,19 @@ dependencies = [ "tokio", ] +[[package]] +name = "lindera-ipadic-neologd" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = "lindera-ko-dic" version = "0.38.1" @@ -3501,6 +3530,19 @@ dependencies = [ "tantivy-tokenizer-api", ] +[[package]] +name = "lindera-unidic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" diff --git a/python/Cargo.toml b/python/Cargo.toml index 7f0e349bbf..a3e3b701b2 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index", features = ["lindera-tantivy", "lindera-tantivy-ko-dic", "lindera-tantivy-ipadic"] } +lance-index = { path = "../rust/lance-index", features = ["lindera-tantivy-custom"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 684731ab0a..c0344acf7c 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -71,11 +71,13 @@ datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } [features] -lindera-tantivy-ipadic = ["lindera", "lindera-tantivy", "lindera-tantivy/ipadic"] -lindera-tantivy-ipadic-neologd = ["lindera", "lindera-tantivy", "lindera-tantivy/ipadic-neologd"] -lindera-tantivy-unidic = ["lindera", "lindera-tantivy", "lindera-tantivy/unidic"] +lindera-tantivy-custom = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] +lindera-tantivy-builtin-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] +lindera-tantivy-ipadic = ["lindera-tantivy-builtin-dic", "lindera-tantivy/ipadic"] +lindera-tantivy-ipadic-neologd = ["lindera-tantivy-builtin-dic", "lindera-tantivy/ipadic-neologd"] +lindera-tantivy-unidic = ["lindera-tantivy-builtin-dic", "lindera-tantivy/unidic"] lindera-tantivy-ko-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/ko-dic"] -lindera-tantivy-cc-cedict = ["lindera", "lindera-tantivy", "lindera-tantivy/cc-cedict"] +lindera-tantivy-cc-cedict = ["lindera-tantivy-builtin-dic", "lindera-tantivy/cc-cedict"] [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 0f7e0f1a9f..ce5adac1b6 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -12,8 +12,8 @@ pub struct TokenizerConfig { /// - `simple`: splits tokens on whitespace and punctuation /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization - /// - `lindera-tantivy-ipadic`: Japanese tokenizer - /// - `lindera-tantivy-ko-dic`: Korea tokenizer + /// - `lindera-ipadic`: Japanese tokenizer + /// - `lindera-ko-dic`: Korea tokenizer 
/// /// `simple` is recommended for most cases and the default value base_tokenizer: String, @@ -144,15 +144,19 @@ fn build_base_tokenizer_builder(name: &str) -> Result build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADIC), + "lindera-ipadic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADIC), #[cfg(feature = "lindera-tantivy-ipadic-neologd")] - "lindera-ipadic-neologd" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADICNEologd), + "lindera-ipadic-neologd" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADICNEologd), #[cfg(feature = "lindera-tantivy-unidic")] - "lindera-unidic" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::UniDic), + "lindera-unidic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::UniDic), #[cfg(feature = "lindera-tantivy-ko-dic")] - "lindera-ko-dic" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::KoDic), + "lindera-ko-dic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::KoDic), #[cfg(feature = "lindera-tantivy-cc-cedict")] - "lindera-cc-cedict" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::CcCedict), + "lindera-cc-cedict" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::CcCedict), + #[cfg(feature = "lindera-tantivy-custom")] + s if s.starts_with("lindera-") => { + return build_custom_lindera_tokenizer_builder(s); + } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), location!(), @@ -160,8 +164,10 @@ fn build_base_tokenizer_builder(name: &str) -> Result Result { +#[cfg(feature = "lindera-tantivy-builtin-dic")] +fn build_builtin_lindera_tokenizer_builder( + dic: lindera::dictionary::DictionaryKind +) -> Result { use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter}; use lindera_tantivy::tokenizer::LinderaTokenizer; let mode = Mode::Normal; @@ -173,3 +179,19 @@ fn build_lindera_tokenizer_builder(dic: lindera::dictionary::DictionaryKind) -> tokenizer, ).dynamic()) } + +#[cfg(feature = "lindera-tantivy-custom")] +fn build_custom_lindera_tokenizer_builder(dic: &str) -> Result { + use lindera::{dictionary::load_dictionary_from_path, mode::Mode, segmenter::Segmenter}; + use lindera_tantivy::tokenizer::LinderaTokenizer; + let dic = std::path::Path::new(dic); + let mode = Mode::Normal; + let dictionary = load_dictionary_from_path(dic).unwrap(); + let user_dictionary = None; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder( + tokenizer, + ).dynamic()) +} + From 0c2f5903f1cc0824895eadc12dcc1a3d6b9a9a9b Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 8 Dec 2024 22:54:14 +0800 Subject: [PATCH 03/22] update tokenizer --- Cargo.lock | 1 + Cargo.toml | 1 + python/Cargo.lock | 1 + python/Cargo.toml | 2 +- rust/lance-core/Cargo.toml | 1 + rust/lance-core/src/lib.rs | 11 ++ rust/lance-index/Cargo.toml | 8 +- .../src/scalar/inverted/tokenizer.rs | 101 +++++++++++------- 8 files changed, 77 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 42127f1390..72afc01781 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3320,6 +3320,7 @@ dependencies = [ "datafusion-common", "datafusion-sql", "deepsize", + "dirs", "futures", "lance-arrow", "lance-testing", diff --git a/Cargo.toml b/Cargo.toml index 
d0ae9e3f19..73785e6c93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,6 +110,7 @@ datafusion-physical-expr = { version = "42.0", features = [ "regex_expressions", ] } deepsize = "0.2.0" +dirs = "5.0.0" either = "1.0" fsst = { version = "=0.21.0", path = "./rust/lance-encoding/src/compression_algo/fsst" } futures = "0.3" diff --git a/python/Cargo.lock b/python/Cargo.lock index 591dd89c11..0a0f99cccc 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3024,6 +3024,7 @@ dependencies = [ "datafusion-common", "datafusion-sql", "deepsize", + "dirs", "futures", "lance-arrow", "lazy_static", diff --git a/python/Cargo.toml b/python/Cargo.toml index a3e3b701b2..0a1bea95e4 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index", features = ["lindera-tantivy-custom"] } +lance-index = { path = "../rust/lance-index", features = ["tokenizer-lindera"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/rust/lance-core/Cargo.toml b/rust/lance-core/Cargo.toml index fe4e9a1331..9175a3657d 100644 --- a/rust/lance-core/Cargo.toml +++ b/rust/lance-core/Cargo.toml @@ -23,6 +23,7 @@ chrono.workspace = true datafusion-common = { workspace = true, optional = true } datafusion-sql = { workspace = true, optional = true } deepsize.workspace = true +dirs.workspace = true futures.workspace = true lazy_static.workspace = true mock_instant.workspace = true diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 9ab1854076..4d52608e06 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::{env, path::PathBuf}; + use arrow_schema::{DataType, Field as ArrowField}; pub mod cache; @@ -16,6 +18,9 @@ pub const ROW_ID: &str = "_rowid"; /// Column name for the meta row address. pub const ROW_ADDR: &str = "_rowaddr"; +pub const LANCE_HOME_ENV_KEY: &str = "LANCE_HOME"; +pub const LANCE_HOME_DEFAULT_DIRECTORY: &str = "lance"; + lazy_static::lazy_static! { /// Row ID field. This is nullable because its validity bitmap is sometimes used /// as a selection vector. @@ -23,4 +28,10 @@ lazy_static::lazy_static! { /// Row address field. This is nullable because its validity bitmap is sometimes used /// as a selection vector. pub static ref ROW_ADDR_FIELD: ArrowField = ArrowField::new(ROW_ADDR, DataType::UInt64, true); + + /// default directory that stores lance related files, e.g. tokenizer model. 
+ pub static ref LANCE_HOME: Option = match env::var(LANCE_HOME_ENV_KEY) { + Ok(p) => Some(PathBuf::from(p)), + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_HOME_DEFAULT_DIRECTORY)) + }; } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index c0344acf7c..78aa2bbdf3 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -71,13 +71,7 @@ datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } [features] -lindera-tantivy-custom = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] -lindera-tantivy-builtin-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] -lindera-tantivy-ipadic = ["lindera-tantivy-builtin-dic", "lindera-tantivy/ipadic"] -lindera-tantivy-ipadic-neologd = ["lindera-tantivy-builtin-dic", "lindera-tantivy/ipadic-neologd"] -lindera-tantivy-unidic = ["lindera-tantivy-builtin-dic", "lindera-tantivy/unidic"] -lindera-tantivy-ko-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/ko-dic"] -lindera-tantivy-cc-cedict = ["lindera-tantivy-builtin-dic", "lindera-tantivy/cc-cedict"] +tokenizer-lindera = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index ce5adac1b6..b860061322 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -1,7 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use lance_core::{Error, Result}; +use std::path::PathBuf; + +use lance_core::{Error, Result, LANCE_HOME}; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; @@ -143,19 +145,9 @@ fn build_base_tokenizer_builder(name: &str) -> Result build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADIC), - #[cfg(feature = "lindera-tantivy-ipadic-neologd")] - "lindera-ipadic-neologd" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADICNEologd), - #[cfg(feature = "lindera-tantivy-unidic")] - "lindera-unidic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::UniDic), - #[cfg(feature = "lindera-tantivy-ko-dic")] - "lindera-ko-dic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::KoDic), - #[cfg(feature = "lindera-tantivy-cc-cedict")] - "lindera-cc-cedict" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::CcCedict), - #[cfg(feature = "lindera-tantivy-custom")] - s if s.starts_with("lindera-") => { - return build_custom_lindera_tokenizer_builder(s); + #[cfg(feature = "tokenizer-lindera")] + s if s.starts_with("lindera/") => { + return build_lindera_tokenizer_builder(s); } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), @@ -164,34 +156,61 @@ fn build_base_tokenizer_builder(name: &str) -> Result Result { - use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter}; - use lindera_tantivy::tokenizer::LinderaTokenizer; - let mode = Mode::Normal; - let dictionary = load_dictionary_from_kind(dic).unwrap(); - let user_dictionary = None; - let segmenter = Segmenter::new(mode, dictionary, user_dictionary); - let tokenizer = LinderaTokenizer::from_segmenter(segmenter); - Ok(tantivy::tokenizer::TextAnalyzer::builder( - tokenizer, - ).dynamic()) +lazy_static::lazy_static! 
{ + pub static ref LANCE_TOKENIZER_HOME: Option = LANCE_HOME.as_ref().map(|p| p.join("tokenizers")); } -#[cfg(feature = "lindera-tantivy-custom")] -fn build_custom_lindera_tokenizer_builder(dic: &str) -> Result { - use lindera::{dictionary::load_dictionary_from_path, mode::Mode, segmenter::Segmenter}; +#[cfg(feature = "tokenizer-lindera")] +fn build_lindera_tokenizer_builder(dic: &str) -> Result { + use std::{fs::File, io::BufReader}; + + use lindera::{ + dictionary::{ + load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, + }, + mode::Mode, + segmenter::Segmenter, + }; use lindera_tantivy::tokenizer::LinderaTokenizer; - let dic = std::path::Path::new(dic); - let mode = Mode::Normal; - let dictionary = load_dictionary_from_path(dic).unwrap(); - let user_dictionary = None; - let segmenter = Segmenter::new(mode, dictionary, user_dictionary); - let tokenizer = LinderaTokenizer::from_segmenter(segmenter); - Ok(tantivy::tokenizer::TextAnalyzer::builder( - tokenizer, - ).dynamic()) + use serde_json::from_reader; + + match LANCE_TOKENIZER_HOME.as_ref() { + Some(p) => { + let dic_dir = p.join(dic); + let main_dir = dic_dir.join("main"); + let user_config_path = dic_dir.join("user_config.json"); + let user_dictionary = if user_config_path.exists() { + let file = File::open(user_config_path)?; + let reader = BufReader::new(file); + let user_dictionary_config: UserDictionaryConfig = from_reader(reader)?; + Some( + load_user_dictionary_from_config(&user_dictionary_config).map_err(|e| { + Error::io( + format!("load lindera tokenizer user dictionary err: {e}"), + location!(), + ) + })?, + ) + } else { + None + }; + let mode = Mode::Normal; + let dictionary = load_dictionary_from_path(main_dir.as_path()).map_err(|e| { + Error::io( + format!("load lindera tokenizer main dictionary err: {e}"), + location!(), + ) + })?; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + } + None => Err(Error::invalid_input( + format!( + "{} is undefined", + String::from(lance_core::LANCE_HOME_ENV_KEY) + ), + location!(), + )), + } } - From 2428567d3d2d344e92f0a25401c2b5f5c2651844 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Tue, 10 Dec 2024 20:26:26 +0800 Subject: [PATCH 04/22] lindera support --- Cargo.lock | 2 +- python/Cargo.lock | 2 +- python/python/tests/test_scalar_index.py | 4 +- rust/lance-core/Cargo.toml | 1 - rust/lance-core/src/lib.rs | 10 --- rust/lance-index/Cargo.toml | 1 + .../src/scalar/inverted/tokenizer.rs | 82 +++++++++++++------ 7 files changed, 64 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 72afc01781..ff90a5cfcf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3320,7 +3320,6 @@ dependencies = [ "datafusion-common", "datafusion-sql", "deepsize", - "dirs", "futures", "lance-arrow", "lance-testing", @@ -3534,6 +3533,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "deepsize", + "dirs", "futures", "half", "itertools 0.13.0", diff --git a/python/Cargo.lock b/python/Cargo.lock index 0a0f99cccc..6e470e5e1f 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3024,7 +3024,6 @@ dependencies = [ "datafusion-common", "datafusion-sql", "deepsize", - "dirs", "futures", "lance-arrow", "lazy_static", @@ -3180,6 +3179,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "deepsize", + "dirs", "futures", "half", "itertools 0.13.0", diff --git 
a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 52ad548cb8..76de9b22a0 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -318,7 +318,7 @@ def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera-ipadic") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic") results = ds.to_table( full_text_query="成田", @@ -335,7 +335,7 @@ def test_indexed_filter_with_fts_index_with_lindera_ko_tokenizer(tmp_path): } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera-ko-dic") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ko-dic") results = ds.to_table( full_text_query="나리타", diff --git a/rust/lance-core/Cargo.toml b/rust/lance-core/Cargo.toml index 9175a3657d..fe4e9a1331 100644 --- a/rust/lance-core/Cargo.toml +++ b/rust/lance-core/Cargo.toml @@ -23,7 +23,6 @@ chrono.workspace = true datafusion-common = { workspace = true, optional = true } datafusion-sql = { workspace = true, optional = true } deepsize.workspace = true -dirs.workspace = true futures.workspace = true lazy_static.workspace = true mock_instant.workspace = true diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 4d52608e06..91a894b355 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -1,8 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{env, path::PathBuf}; - use arrow_schema::{DataType, Field as ArrowField}; pub mod cache; @@ -18,9 +16,6 @@ pub const ROW_ID: &str = "_rowid"; /// Column name for the meta row address. pub const ROW_ADDR: &str = "_rowaddr"; -pub const LANCE_HOME_ENV_KEY: &str = "LANCE_HOME"; -pub const LANCE_HOME_DEFAULT_DIRECTORY: &str = "lance"; - lazy_static::lazy_static! { /// Row ID field. This is nullable because its validity bitmap is sometimes used /// as a selection vector. @@ -29,9 +24,4 @@ lazy_static::lazy_static! { /// as a selection vector. pub static ref ROW_ADDR_FIELD: ArrowField = ArrowField::new(ROW_ADDR, DataType::UInt64, true); - /// default directory that stores lance related files, e.g. tokenizer model. 
- pub static ref LANCE_HOME: Option = match env::var(LANCE_HOME_ENV_KEY) { - Ok(p) => Some(PathBuf::from(p)), - Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_HOME_DEFAULT_DIRECTORY)) - }; } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 78aa2bbdf3..c98388eeb9 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -26,6 +26,7 @@ datafusion-physical-expr.workspace = true datafusion-sql.workspace = true datafusion.workspace = true deepsize.workspace = true +dirs.workspace = true futures.workspace = true half.workspace = true itertools.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index b860061322..d1ac009884 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -1,9 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::path::PathBuf; +use std::{env, path::PathBuf}; -use lance_core::{Error, Result, LANCE_HOME}; +use lance_core::{Error, Result}; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; @@ -156,8 +156,24 @@ fn build_base_tokenizer_builder(name: &str) -> Result = LANCE_HOME.as_ref().map(|p| p.join("tokenizers")); + /// default directory that stores lance tokenizer related files, e.g. tokenizer model. + pub static ref LANCE_TOKENIZER_HOME: Option = match env::var(LANCE_TOKENIZERS_HOME_ENV_KEY) { + Ok(p) => Some(PathBuf::from(p)), + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_HOME_DEFAULT_DIRECTORY)) + }; +} + +#[cfg(feature = "tokenizer-lindera")] +#[derive(Serialize, Deserialize)] +struct LinderaConfig{ + main: String, + user: Option, + user_kind: Option } #[cfg(feature = "tokenizer-lindera")] @@ -172,35 +188,55 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result { let dic_dir = p.join(dic); - let main_dir = dic_dir.join("main"); - let user_config_path = dic_dir.join("user_config.json"); - let user_dictionary = if user_config_path.exists() { - let file = File::open(user_config_path)?; - let reader = BufReader::new(file); - let user_dictionary_config: UserDictionaryConfig = from_reader(reader)?; - Some( - load_user_dictionary_from_config(&user_dictionary_config).map_err(|e| { + let config_path = dic_dir.join("config.json"); + let file = File::open(config_path)?; + let reader = BufReader::new(file); + let config: LinderaConfig = serde_json::from_reader(reader)?; + let main_path = dic_dir.join(config.main); + let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { + Error::io( + format!("load lindera tokenizer main dictionary err: {e}"), + location!(), + ) + })?; + let user_dictionary = match config.user { + Some(user) => { + let mut conf = serde_json::Map::::new(); + let user_path = dic_dir.join(user); + match user_path.to_str() { + Some(p) => { + conf.insert(String::from("path"), Value::String(String::from(p))); + Ok(()) + }, + None => { + let p = user_path.display(); + Err(Error::io( + format!("invalid lindera tokenizer user dictionary path: {p}"), + location!(), + )) + } + }?; + if let Some(kind) = config.user_kind { + conf.insert(String::from("kind"), Value::String(kind)); + } + let user_dictionary_config: UserDictionaryConfig = Value::Object(conf); + let user_dictionary = load_user_dictionary_from_config(&user_dictionary_config).map_err(|e| { Error::io( format!("load lindera tokenizer user dictionary err: {e}"), location!(), ) - })?, - ) - } else { - None + 
})?; + Some(user_dictionary) + }, + None => None + }; let mode = Mode::Normal; - let dictionary = load_dictionary_from_path(main_dir.as_path()).map_err(|e| { - Error::io( - format!("load lindera tokenizer main dictionary err: {e}"), - location!(), - ) - })?; let segmenter = Segmenter::new(mode, dictionary, user_dictionary); let tokenizer = LinderaTokenizer::from_segmenter(segmenter); Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) @@ -208,7 +244,7 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result Err(Error::invalid_input( format!( "{} is undefined", - String::from(lance_core::LANCE_HOME_ENV_KEY) + String::from(LANCE_TOKENIZERS_HOME_ENV_KEY) ), location!(), )), From f1b91465b8c3f5a091c55b8d3c9c85e3d9461501 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Tue, 10 Dec 2024 20:43:45 +0800 Subject: [PATCH 05/22] format --- rust/lance-core/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 91a894b355..9ab1854076 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -23,5 +23,4 @@ lazy_static::lazy_static! { /// Row address field. This is nullable because its validity bitmap is sometimes used /// as a selection vector. pub static ref ROW_ADDR_FIELD: ArrowField = ArrowField::new(ROW_ADDR, DataType::UInt64, true); - } From b8c778ef15cac7a78d19a80f6d548cc3591e4ede Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Tue, 10 Dec 2024 20:49:56 +0800 Subject: [PATCH 06/22] update deps --- Cargo.lock | 70 ------------------------------------- python/Cargo.lock | 70 ------------------------------------- rust/lance-index/Cargo.toml | 2 +- 3 files changed, 1 insertion(+), 141 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ff90a5cfcf..372e9966bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3860,12 +3860,7 @@ dependencies = [ "byteorder", "csv", "kanaria", - "lindera-cc-cedict", "lindera-dictionary", - "lindera-ipadic", - "lindera-ipadic-neologd", - "lindera-ko-dic", - "lindera-unidic", "once_cell", "regex", "serde", @@ -3879,19 +3874,6 @@ dependencies = [ "yada", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "lindera-dictionary" version = "0.38.1" @@ -3917,45 +3899,6 @@ dependencies = [ "yada", ] -[[package]] -name = "lindera-ipadic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - -[[package]] -name = "lindera-ipadic-neologd" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "lindera-tantivy" version = "0.38.1" @@ -3967,19 +3910,6 @@ dependencies = [ 
"tantivy-tokenizer-api", ] -[[package]] -name = "lindera-unidic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "linux-raw-sys" version = "0.3.8" diff --git a/python/Cargo.lock b/python/Cargo.lock index 6e470e5e1f..8fa894d767 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3424,12 +3424,7 @@ dependencies = [ "byteorder", "csv", "kanaria", - "lindera-cc-cedict", "lindera-dictionary", - "lindera-ipadic", - "lindera-ipadic-neologd", - "lindera-ko-dic", - "lindera-unidic", "once_cell", "regex", "serde", @@ -3443,19 +3438,6 @@ dependencies = [ "yada", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "lindera-dictionary" version = "0.38.1" @@ -3481,45 +3463,6 @@ dependencies = [ "yada", ] -[[package]] -name = "lindera-ipadic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - -[[package]] -name = "lindera-ipadic-neologd" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "lindera-tantivy" version = "0.38.1" @@ -3531,19 +3474,6 @@ dependencies = [ "tantivy-tokenizer-api", ] -[[package]] -name = "lindera-unidic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "linux-raw-sys" version = "0.4.14" diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index c98388eeb9..1cff60a8b6 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -72,7 +72,7 @@ datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } [features] -tokenizer-lindera = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] +tokenizer-lindera = ["lindera", "lindera-tantivy"] [build-dependencies] prost-build.workspace = true From 83cd1dd1030c718b123b3f606d2a0c1616c70f53 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Wed, 25 Dec 2024 23:11:13 +0800 Subject: [PATCH 07/22] lm download script --- python/python/lance/lance/__init__.pyi | 2 + python/python/lance/lm.py | 88 +++++++++++++++++++ python/src/lib.rs | 4 + .../src/scalar/inverted/tokenizer.rs | 12 +-- 4 files changed, 100 insertions(+), 6 deletions(-) create mode 100644 python/python/lance/lm.py diff --git 
a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 97d2cb602d..3b4278bcff 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -16,6 +16,8 @@ from typing import Dict, List, Optional import pyarrow as pa +LANGUAGE_MODEL_HOME: Optional[str] + def infer_tfrecord_schema( uri: str, tensor_features: Optional[List[str]] = None, diff --git a/python/python/lance/lm.py b/python/python/lance/lm.py new file mode 100644 index 0000000000..d59330bb48 --- /dev/null +++ b/python/python/lance/lm.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +from io import BytesIO +import os +import shutil +import subprocess +import tarfile +import traceback +from .lance import LANGUAGE_MODEL_HOME + +if LANGUAGE_MODEL_HOME is None: + raise Exception("LANCE_LANGUAGE_MODEL_HOME is not configured") + +def check_lindera(): + if not shutil.which("lindera"): + raise Exception("lindera is not installed. Please install it by following https://github.com/lindera/lindera/tree/main/lindera-cli") + +def check_requests(): + try: + import requests + except: + raise Exception("requests is not installed, Please pip install requests") + +def download_jieba(): + dirname = os.path.join(LANGUAGE_MODEL_HOME, "jieba", "default") + os.makedirs(dirname, exist_ok=True) + try: + check_requests() + import requests + resp = requests.get("https://api.github.com/repos/messense/jieba-rs/releases/latest") + content = requests.get(resp.json()["tarball_url"]).content + with tarfile.open(fileobj=BytesIO(content)) as tar: + dir = tar.getnames()[0] + tar.extract(f'{dir}/src/data', path=dirname) + shutil.move(os.path.join(dirname, dir, "src", "data"), dirname) + except Exception as _: + traceback.print_exc() + print("Download jieba language model failed. 
Please download this folder " + f"https://github.com/messense/jieba-rs/tree/main/src/data and put it in {dirname}") + +def download_lindera(lm: str): + import requests + dirname = os.path.join(LANGUAGE_MODEL_HOME, "lindera", lm) + src_dirname = os.path.join(dirname, "src") + if lm == "ipadic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/mecab-ipadic-2.7.0-20070801.tar.gz" + elif lm == "ko-dic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/mecab-ko-dic-2.1.1-20180720.tar.gz" + elif lm == "unidic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/unidic-mecab-2.1.2.tar.gz" + else: + raise Exception(f"language model {lm} is not supported") + os.makedirs(src_dirname, exist_ok=True) + print(f"downloading language model: {url}") + data = requests.get(url).content + print(f"unzip language model: {url}") + + cwd = os.getcwd() + try: + os.chdir(src_dirname) + with tarfile.open(fileobj=BytesIO(data)) as tar: + tar.extractall() + name = tar.getnames()[0] + cmd = ["lindera", "build", "--dictionary-kind=ipadic", os.path.join(src_dirname, name), dirname] + print(f"compile language model: {' '.join(cmd)}") + subprocess.run(cmd) + finally: + os.chdir(cwd) + + +def main(): + import argparse + parser = argparse.ArgumentParser( + description='Lance tokenizer language model downloader' + ) + parser.add_argument('tokenizer', choices=['jieba', 'lindera']) + parser.add_argument("-l", "--languagemodel") + args = parser.parse_args() + print(f"LANCE_LANGUAGE_MODEL_HOME={LANGUAGE_MODEL_HOME}") + if args.tokenizer == 'jieba': + download_jieba() + elif args.tokenizer == 'lindera': + download_lindera(args.languagemodel) + +if __name__ == '__main__': + main() + diff --git a/python/src/lib.rs b/python/src/lib.rs index 9b82ff2a53..5d1aec32e4 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -44,6 +44,7 @@ use futures::StreamExt; use lance_index::DatasetIndexExt; use pyo3::exceptions::{PyIOError, PyValueError}; use pyo3::prelude::*; +use pyo3::types::{PyNone, PyString}; use session::Session; #[macro_use] @@ -151,6 +152,9 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(debug::format_fragment))?; m.add_wrapped(wrap_pyfunction!(debug::list_transactions))?; m.add("__version__", env!("CARGO_PKG_VERSION"))?; + let none = PyNone::get_bound(py).into_py(py); + let lm_home = lance_index::scalar::inverted::LANCE_LANGUAGE_MODEL_HOME.as_ref().and_then(|p| p.to_str()).map(|p| PyString::new_bound(py, p).into_py(py)).unwrap_or(none); + m.add("LANGUAGE_MODEL_HOME", lm_home)?; register_datagen(py, m)?; register_indices(py, m)?; Ok(()) diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index d1ac009884..566f8e7635 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -156,15 +156,15 @@ fn build_base_tokenizer_builder(name: &str) -> Result = match env::var(LANCE_TOKENIZERS_HOME_ENV_KEY) { + pub static ref LANCE_LANGUAGE_MODEL_HOME: Option = match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) { Ok(p) => Some(PathBuf::from(p)), - Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_HOME_DEFAULT_DIRECTORY)) + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY)) }; } @@ -190,7 +190,7 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result { let dic_dir = p.join(dic); let config_path = dic_dir.join("config.json"); @@ -244,7 +244,7 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result Err(Error::invalid_input( format!( 
"{} is undefined", - String::from(LANCE_TOKENIZERS_HOME_ENV_KEY) + String::from(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) ), location!(), )), From 73582502a028fec9556ae25ed0a6787a1ded6704 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Wed, 25 Dec 2024 23:13:29 +0800 Subject: [PATCH 08/22] update --- python/python/lance/lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/python/lance/lm.py b/python/python/lance/lm.py index d59330bb48..b8d7cb850b 100644 --- a/python/python/lance/lm.py +++ b/python/python/lance/lm.py @@ -62,7 +62,7 @@ def download_lindera(lm: str): with tarfile.open(fileobj=BytesIO(data)) as tar: tar.extractall() name = tar.getnames()[0] - cmd = ["lindera", "build", "--dictionary-kind=ipadic", os.path.join(src_dirname, name), dirname] + cmd = ["lindera", "build", f"--dictionary-kind={lm}", os.path.join(src_dirname, name), dirname] print(f"compile language model: {' '.join(cmd)}") subprocess.run(cmd) finally: From eb8e568311a45a30ee3d244616d8e9684fb27de2 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Wed, 25 Dec 2024 23:58:35 +0800 Subject: [PATCH 09/22] jieba --- Cargo.lock | 117 ++++++++++++++++++ Cargo.toml | 1 + python/Cargo.lock | 117 ++++++++++++++++++ python/Cargo.toml | 2 +- rust/lance-index/Cargo.toml | 2 + .../src/scalar/inverted/tokenizer.rs | 108 +++++++++++++++- 6 files changed, 343 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 372e9966bc..727ca31781 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,6 +23,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "ahash" version = "0.8.11" @@ -1198,6 +1204,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "census" version = "0.4.2" @@ -1404,6 +1419,15 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpp_demangle" version = "0.4.3" @@ -1599,6 +1623,12 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dashmap" version = "5.5.3" @@ -2619,6 +2649,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -3023,6 +3062,29 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "include-flate" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" +dependencies = [ + "include-flate-codegen", + "lazy_static", + "libflate", +] + +[[package]] +name = "include-flate-codegen" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" +dependencies = [ + "libflate", + "proc-macro2", + "quote", + "syn 2.0.89", +] + [[package]] name = "indexmap" version = "2.3.0" @@ -3151,6 +3213,30 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jieba-macros" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" +dependencies = [ + "phf_codegen", +] + +[[package]] +name = "jieba-rs" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a77d0ae8831f870c4f6ffce310f708b5273ea2e7a88e6af770a10d1b4876311" +dependencies = [ + "cedarwood", + "fxhash", + "include-flate", + "jieba-macros", + "lazy_static", + "phf", + "regex", +] + [[package]] name = "jni" version = "0.21.1" @@ -3537,6 +3623,7 @@ dependencies = [ "futures", "half", "itertools 0.13.0", + "jieba-rs", "lance-arrow", "lance-core", "lance-datafusion", @@ -3833,6 +3920,30 @@ version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown 0.14.5", + "rle-decode-fast", +] + [[package]] name = "libm" version = "0.2.8" @@ -5363,6 +5474,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "roaring" version = "0.10.6" diff --git a/Cargo.toml b/Cargo.toml index 73785e6c93..2d0f38fa80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -117,6 +117,7 @@ futures = "0.3" http = "1.1.0" hyperloglogplus = { version = "0.4.1", features = ["const-loop"] } itertools = "0.13" +jieba-rs = { version = "0.7", default-features = false } lazy_static = "1" log = "0.4" mockall = { version = "0.13.1" } diff --git a/python/Cargo.lock b/python/Cargo.lock index 8fa894d767..3ca3456b98 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -17,6 +17,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "ahash" version = "0.8.11" @@ -1053,6 +1059,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cedarwood" +version = 
"0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "census" version = "0.4.2" @@ -1179,6 +1194,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpufeatures" version = "0.2.16" @@ -1327,6 +1351,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dashmap" version = "5.5.3" @@ -2249,6 +2279,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2795,6 +2834,29 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "include-flate" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" +dependencies = [ + "include-flate-codegen", + "lazy_static", + "libflate", +] + +[[package]] +name = "include-flate-codegen" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" +dependencies = [ + "libflate", + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "indexmap" version = "2.7.0" @@ -2894,6 +2956,30 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "jieba-macros" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" +dependencies = [ + "phf_codegen", +] + +[[package]] +name = "jieba-rs" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a77d0ae8831f870c4f6ffce310f708b5273ea2e7a88e6af770a10d1b4876311" +dependencies = [ + "cedarwood", + "fxhash", + "include-flate", + "jieba-macros", + "lazy_static", + "phf", + "regex", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3183,6 +3269,7 @@ dependencies = [ "futures", "half", "itertools 0.13.0", + "jieba-rs", "lance-arrow", "lance-core", "lance-datafusion", @@ -3396,6 +3483,30 @@ version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown 0.14.5", + "rle-decode-fast", +] + [[package]] name = "libm" version = "0.2.11" @@ -4825,6 +4936,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "roaring" version = "0.10.7" diff --git a/python/Cargo.toml b/python/Cargo.toml index 0a1bea95e4..cb13c86963 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index", features = ["tokenizer-lindera"] } +lance-index = { path = "../rust/lance-index", features = ["tokenizer-lindera", "tokenizer-jieba"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 1cff60a8b6..ac08e8d0d5 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -30,6 +30,7 @@ dirs.workspace = true futures.workspace = true half.workspace = true itertools.workspace = true +jieba-rs = { workspace = true, optional = true } lance-arrow.workspace = true lance-core.workspace = true lance-datafusion.workspace = true @@ -73,6 +74,7 @@ random_word = { version = "0.4.3", features = ["en"] } [features] tokenizer-lindera = ["lindera", "lindera-tantivy"] +tokenizer-jieba = ["jieba-rs"] [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 566f8e7635..431cf8d421 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -149,6 +149,10 @@ fn build_base_tokenizer_builder(name: &str) -> Result { return build_lindera_tokenizer_builder(s); } + #[cfg(feature = "tokenizer-jieba")] + s if s.starts_with("jieba/") || s == "jieba" => { + return build_jieba_tokenizer_builder(s); + } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), location!(), @@ -194,9 +198,18 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result { let dic_dir = p.join(dic); let config_path = dic_dir.join("config.json"); - let file = File::open(config_path)?; - let reader = BufReader::new(file); - let config: LinderaConfig = serde_json::from_reader(reader)?; + let config: LinderaConfig = if config_path.exists() { + let file = File::open(config_path)?; + let reader = BufReader::new(file); + serde_json::from_reader(reader)? 
+ } else { + let Some(dic_dir) = dic_dir.to_str() else { + return Err(Error::invalid_input("dic dir is invalid", + location!(), + )) + }; + LinderaConfig{main: String::from(dic_dir), user: None, user_kind: None} + }; let main_path = dic_dir.join(config.main); let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { Error::io( @@ -250,3 +263,92 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result Result { + match LANCE_LANGUAGE_MODEL_HOME.as_ref() { + Some(p) => { + let dic = if dic == "jieba" { + "jieba/default" + } else { + dic + }; + let dic_file = p.join(dic).join("dict.txt"); + let file = std::fs::File::open(dic_file)?; + let mut f = std::io::BufReader::new(file); + let jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { + Error::io( + format!("load jieba tokenizer dictionary err: {e}"), + location!(), + ) + })?; + let tokenizer = JiebaTokenizer{jieba: jieba}; + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + }, + None => Err(Error::invalid_input( + format!( + "{} is undefined", + String::from(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) + ), + location!(), + )), + } + +} + +#[cfg(feature = "tokenizer-jieba")] +#[derive(Clone)] +struct JiebaTokenizer{ + jieba: jieba_rs::Jieba +} + +#[cfg(feature = "tokenizer-jieba")] +struct JiebaTokenStream { + tokens: Vec, + index: usize, +} + +#[cfg(feature = "tokenizer-jieba")] +impl tantivy::tokenizer::TokenStream for JiebaTokenStream { + fn advance(&mut self) -> bool { + if self.index < self.tokens.len() { + self.index += 1; + true + } else { + false + } + } + + fn token(&self) -> &tantivy::tokenizer::Token { + &self.tokens[self.index - 1] + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + &mut self.tokens[self.index - 1] + } +} + + +#[cfg(feature = "tokenizer-jieba")] +impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { + type TokenStream<'a> = JiebaTokenStream; + + fn token_stream(&mut self, text: &str) -> JiebaTokenStream { + let mut indices = text.char_indices().collect::>(); + indices.push((text.len(), '\0')); + let orig_tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); + let mut tokens = Vec::new(); + for token in orig_tokens { + tokens.push(tantivy::tokenizer::Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), + position_length: token.end - token.start, + }); + } + JiebaTokenStream { tokens, index: 0 } + } +} From 7962d9efa5caff215311f03f6709507965f985b1 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Fri, 27 Dec 2024 15:45:51 +0800 Subject: [PATCH 10/22] update type --- python/python/lance/lance/__init__.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index ac6b5d3582..bf535a47a4 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -433,3 +433,4 @@ class BFloat16: def bfloat16_array(values: List[str | None]) -> BFloat16Array: ... 
 __version__: str
+LANGUAGE_MODEL_HOME: Optional[str]

From 2aa4886c11e64bc31995354e5e574ac3265e3a94 Mon Sep 17 00:00:00 2001
From: Chongchen Chen
Date: Fri, 27 Dec 2024 22:18:33 +0800
Subject: [PATCH 11/22] modularize third-party tokenizers

---
 python/python/lance/lm.py                     | 16 +-
 rust/lance-index/Cargo.toml                   | 5 +-
 .../src/scalar/inverted/tokenizer.rs          | 216 ++++--------------
 .../src/scalar/inverted/tokenizer/jieba.rs    | 123 ++++++++++
 .../src/scalar/inverted/tokenizer/lindera.rs  | 97 ++++++++
 5 files changed, 270 insertions(+), 187 deletions(-)
 create mode 100644 rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs
 create mode 100644 rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs

diff --git a/python/python/lance/lm.py b/python/python/lance/lm.py
index b8d7cb850b..1338889054 100644
--- a/python/python/lance/lm.py
+++ b/python/python/lance/lm.py
@@ -28,15 +28,13 @@ def download_jieba():
     try:
         check_requests()
         import requests
-        resp = requests.get("https://api.github.com/repos/messense/jieba-rs/releases/latest")
-        content = requests.get(resp.json()["tarball_url"]).content
-        with tarfile.open(fileobj=BytesIO(content)) as tar:
-            dir = tar.getnames()[0]
-            tar.extract(f'{dir}/src/data', path=dirname)
-        shutil.move(os.path.join(dirname, dir, "src", "data"), dirname)
+        resp = requests.get("https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt")
+        content = resp.content
+        with open(os.path.join(dirname, "dict.txt"), "wb") as fo:
+            fo.write(content)
     except Exception as _:
         traceback.print_exc()
-        print("Download jieba language model failed. Please download this folder "
+        print("Download jieba language model failed. Please download dict.txt from "
              f"https://github.com/messense/jieba-rs/tree/main/src/data and put it in {dirname}")

 def download_lindera(lm: str):
@@ -62,8 +60,8 @@ def download_lindera(lm: str):
         with tarfile.open(fileobj=BytesIO(data)) as tar:
             tar.extractall()
             name = tar.getnames()[0]
-            cmd = ["lindera", "build", f"--dictionary-kind={lm}", os.path.join(src_dirname, name), dirname]
-            print(f"compile language model: {' '.join(cmd)}")
+            cmd = ["lindera", "build", f"--dictionary-kind={lm}", os.path.join(src_dirname, name), os.path.join(dirname, "main")]
+            print(f"compiling language model: {' '.join(cmd)}")
             subprocess.run(cmd)
     finally:
         os.chdir(cwd)
diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml
index ac08e8d0d5..e6cf51d2d7 100644
--- a/rust/lance-index/Cargo.toml
+++ b/rust/lance-index/Cargo.toml
@@ -73,8 +73,9 @@ datafusion-sql.workspace = true
 random_word = { version = "0.4.3", features = ["en"] }
 
 [features]
-tokenizer-lindera = ["lindera", "lindera-tantivy"]
-tokenizer-jieba = ["jieba-rs"]
+tokenizer-lindera = ["lindera", "lindera-tantivy", "tokenizer-common"]
+tokenizer-jieba = ["jieba-rs", "tokenizer-common"]
+tokenizer-common = []
 
 [build-dependencies]
 prost-build.workspace = true
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index 431cf8d421..52f263e638 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -7,6 +7,12 @@ use lance_core::{Error, Result};
 use serde::{Deserialize, Serialize};
 use snafu::{location, Location};
 
+#[cfg(feature = "tokenizer-lindera")]
+mod lindera;
+
+#[cfg(feature = "tokenizer-jieba")]
+mod jieba;
+
 /// Tokenizer configs
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct TokenizerConfig {
@@ -147,11 +153,24 @@ fn build_base_tokenizer_builder(name: &str) -> Result<tantivy::tokenizer::TextAnalyzerBuilder> {
         #[cfg(feature = "tokenizer-lindera")]
         s if s.starts_with("lindera/") => {
-
return build_lindera_tokenizer_builder(s); + let Some(home) = &*LANCE_LANGUAGE_MODEL_HOME else { + return Err(Error::invalid_input( + format!("unknown base tokenizer {}", name), + location!(), + )) + }; + lindera::LinderaBuilder::load(&home.join(s))?.build() } #[cfg(feature = "tokenizer-jieba")] s if s.starts_with("jieba/") || s == "jieba" => { - return build_jieba_tokenizer_builder(s); + let s = if s == "jieba" { "jieba/default" } else { s }; + let Some(home) = &*LANCE_LANGUAGE_MODEL_HOME else { + return Err(Error::invalid_input( + format!("unknown base tokenizer {}", name), + location!(), + )) + }; + lindera::LinderaBuilder::load(&home.join(s))?.build() } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), @@ -164,6 +183,8 @@ pub const LANCE_LANGUAGE_MODEL_HOME_ENV_KEY: &str = "LANCE_LANGUAGE_MODEL_HOME"; pub const LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY: &str = "lance/language_models"; +pub const LANCE_LANGUAGE_MODEL_CONFIG_FILE: &str = "config.json"; + lazy_static::lazy_static! { /// default directory that stores lance tokenizer related files, e.g. tokenizer model. pub static ref LANCE_LANGUAGE_MODEL_HOME: Option = match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) { @@ -172,183 +193,26 @@ lazy_static::lazy_static! { }; } -#[cfg(feature = "tokenizer-lindera")] -#[derive(Serialize, Deserialize)] -struct LinderaConfig{ - main: String, - user: Option, - user_kind: Option -} - -#[cfg(feature = "tokenizer-lindera")] -fn build_lindera_tokenizer_builder(dic: &str) -> Result { - use std::{fs::File, io::BufReader}; - - use lindera::{ - dictionary::{ - load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, - }, - mode::Mode, - segmenter::Segmenter, - }; - use lindera_tantivy::tokenizer::LinderaTokenizer; - use serde_json::Value; - - match LANCE_LANGUAGE_MODEL_HOME.as_ref() { - Some(p) => { - let dic_dir = p.join(dic); - let config_path = dic_dir.join("config.json"); - let config: LinderaConfig = if config_path.exists() { - let file = File::open(config_path)?; - let reader = BufReader::new(file); - serde_json::from_reader(reader)? 
- } else { - let Some(dic_dir) = dic_dir.to_str() else { - return Err(Error::invalid_input("dic dir is invalid", - location!(), - )) - }; - LinderaConfig{main: String::from(dic_dir), user: None, user_kind: None} - }; - let main_path = dic_dir.join(config.main); - let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { - Error::io( - format!("load lindera tokenizer main dictionary err: {e}"), - location!(), - ) - })?; - let user_dictionary = match config.user { - Some(user) => { - let mut conf = serde_json::Map::::new(); - let user_path = dic_dir.join(user); - match user_path.to_str() { - Some(p) => { - conf.insert(String::from("path"), Value::String(String::from(p))); - Ok(()) - }, - None => { - let p = user_path.display(); - Err(Error::io( - format!("invalid lindera tokenizer user dictionary path: {p}"), - location!(), - )) - } - }?; - if let Some(kind) = config.user_kind { - conf.insert(String::from("kind"), Value::String(kind)); - } - let user_dictionary_config: UserDictionaryConfig = Value::Object(conf); - let user_dictionary = load_user_dictionary_from_config(&user_dictionary_config).map_err(|e| { - Error::io( - format!("load lindera tokenizer user dictionary err: {e}"), - location!(), - ) - })?; - Some(user_dictionary) - }, - None => None - - }; - let mode = Mode::Normal; - let segmenter = Segmenter::new(mode, dictionary, user_dictionary); - let tokenizer = LinderaTokenizer::from_segmenter(segmenter); - Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) +#[cfg(feature = "tokenizer-common")] +trait TokenizerBuilder: Sized { + type Config: serde::de::DeserializeOwned + Default; + fn load(p: &PathBuf) -> Result { + if !p.is_dir() { + return Err(Error::io(format!("{} is not a valid directory", p.display()), location!())) } - None => Err(Error::invalid_input( - format!( - "{} is undefined", - String::from(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) - ), - location!(), - )), - } -} - - - -#[cfg(feature = "tokenizer-jieba")] -fn build_jieba_tokenizer_builder(dic: &str) -> Result { - match LANCE_LANGUAGE_MODEL_HOME.as_ref() { - Some(p) => { - let dic = if dic == "jieba" { - "jieba/default" - } else { - dic - }; - let dic_file = p.join(dic).join("dict.txt"); - let file = std::fs::File::open(dic_file)?; - let mut f = std::io::BufReader::new(file); - let jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { - Error::io( - format!("load jieba tokenizer dictionary err: {e}"), - location!(), - ) - })?; - let tokenizer = JiebaTokenizer{jieba: jieba}; - Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) - }, - None => Err(Error::invalid_input( - format!( - "{} is undefined", - String::from(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) - ), - location!(), - )), - } - -} - -#[cfg(feature = "tokenizer-jieba")] -#[derive(Clone)] -struct JiebaTokenizer{ - jieba: jieba_rs::Jieba -} - -#[cfg(feature = "tokenizer-jieba")] -struct JiebaTokenStream { - tokens: Vec, - index: usize, -} - -#[cfg(feature = "tokenizer-jieba")] -impl tantivy::tokenizer::TokenStream for JiebaTokenStream { - fn advance(&mut self) -> bool { - if self.index < self.tokens.len() { - self.index += 1; - true + use std::{fs::File, io::BufReader}; + let config_path = p.join(LANCE_LANGUAGE_MODEL_CONFIG_FILE); + let config= if config_path.exists() { + let file = File::open(config_path)?; + let reader = BufReader::new(file); + serde_json::from_reader::, Self::Config>(reader)? 
} else { - false - } + Self::Config::default() + }; + Self::new(config, p) } - fn token(&self) -> &tantivy::tokenizer::Token { - &self.tokens[self.index - 1] - } + fn new(config: Self::Config, root: &PathBuf) -> Result; - fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { - &mut self.tokens[self.index - 1] - } -} - - -#[cfg(feature = "tokenizer-jieba")] -impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { - type TokenStream<'a> = JiebaTokenStream; - - fn token_stream(&mut self, text: &str) -> JiebaTokenStream { - let mut indices = text.char_indices().collect::>(); - indices.push((text.len(), '\0')); - let orig_tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); - let mut tokens = Vec::new(); - for token in orig_tokens { - tokens.push(tantivy::tokenizer::Token { - offset_from: indices[token.start].0, - offset_to: indices[token.end].0, - position: token.start, - text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), - position_length: token.end - token.start, - }); - } - JiebaTokenStream { tokens, index: 0 } - } + fn build(&self) -> Result; } diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs new file mode 100644 index 0000000000..33fcb8b30a --- /dev/null +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -0,0 +1,123 @@ +use std::path::PathBuf; + +use lance_core::{Error, Result}; +use serde::{Deserialize, Serialize}; +use snafu::{location, Location}; +use super::TokenizerBuilder; + +#[derive(Serialize, Deserialize)] +pub struct JiebaConfig{ + main: Option, + users: Option> +} + +impl Default for JiebaConfig { + fn default() -> Self { + Self { main: Default::default(), users: Default::default() } + } +} + +pub struct JiebaBuilder { + root: PathBuf, + config: JiebaConfig +} + +impl JiebaBuilder { + fn main_dict_path(&self) -> PathBuf { + if let Some(p) = &self.config.main { + return self.root.join(p); + } + self.root.join("dict.txt") + } + + fn user_dict_paths(&self) -> Vec { + let Some(users) = &self.config.users else { + return vec![]; + }; + users.iter().map(|p| self.root.join(p)).collect() + } +} + +impl TokenizerBuilder for JiebaBuilder { + type Config = JiebaConfig; + + fn new(config: Self::Config, root: &PathBuf) -> Result { + Ok(JiebaBuilder{config, root: root.clone()}) + } + + fn build(&self) -> Result { + let main_dict_path = &self.main_dict_path(); + let file = std::fs::File::open(main_dict_path)?; + let mut f = std::io::BufReader::new(file); + let mut jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { + Error::io( + format!("load jieba tokenizer dictionary {}, error: {}", main_dict_path.display(), e), + location!(), + ) + })?; + for user_dict_path in &self.user_dict_paths() { + let file = std::fs::File::open(user_dict_path)?; + let mut f = std::io::BufReader::new(file); + jieba.load_dict(&mut f).map_err(|e| { + Error::io( + format!("load jieba tokenizer user dictionary {}, error: {}", user_dict_path.display(), e), + location!(), + ) + })? 
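+            // Each user dictionary is plain text in the same line format as
+            // the main jieba dict: `word [frequency [POS tag]]`, with the
+            // frequency and tag optional. An illustrative entry (matching the
+            // test fixture added later in this series):
+            //
+            //   光明的前途 1219 n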
+ } + let tokenizer = JiebaTokenizer{jieba: jieba}; + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + } +} + +#[derive(Clone)] +struct JiebaTokenizer{ + jieba: jieba_rs::Jieba +} + +struct JiebaTokenStream { + tokens: Vec, + index: usize, +} + +impl tantivy::tokenizer::TokenStream for JiebaTokenStream { + fn advance(&mut self) -> bool { + if self.index < self.tokens.len() { + self.index += 1; + true + } else { + false + } + } + + fn token(&self) -> &tantivy::tokenizer::Token { + &self.tokens[self.index - 1] + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + &mut self.tokens[self.index - 1] + } +} + + +#[cfg(feature = "tokenizer-jieba")] +impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { + type TokenStream<'a> = JiebaTokenStream; + + fn token_stream(&mut self, text: &str) -> JiebaTokenStream { + let mut indices = text.char_indices().collect::>(); + indices.push((text.len(), '\0')); + let orig_tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); + let mut tokens = Vec::new(); + for token in orig_tokens { + tokens.push(tantivy::tokenizer::Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), + position_length: token.end - token.start, + }); + } + JiebaTokenStream { tokens, index: 0 } + } +} diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs new file mode 100644 index 0000000000..e07ed4d91f --- /dev/null +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -0,0 +1,97 @@ +use std::path::PathBuf; + +use lance_core::{Error, Result}; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use snafu::{location, Location}; +use super::TokenizerBuilder; +use lindera::{ + dictionary::{ + load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, + }, + mode::Mode, + segmenter::Segmenter, +}; +use lindera_tantivy::tokenizer::LinderaTokenizer; + +#[derive(Serialize, Deserialize)] +pub struct LinderaConfig{ + main: Option, + user: Option, + user_kind: Option +} + +impl Default for LinderaConfig { + fn default() -> Self { + Self { main: Default::default(), user: Default::default(), user_kind: Default::default() } + } +} + +pub struct LinderaBuilder { + root: PathBuf, + config: LinderaConfig +} + +impl LinderaBuilder { + fn main_dict_path(&self) -> PathBuf { + if let Some(p) = &self.config.main { + return self.root.join(p); + } + self.root.join("main") + } + + fn user_dict_config(&self) -> Result> { + let Some(user_dict_path) = &self.config.user else { + return Ok(None) + }; + let mut conf = Map::::new(); + let user_path = self.root.join(user_dict_path); + let Some(p) = user_path.to_str() else { + return Err(Error::io( + format!("invalid lindera tokenizer user dictionary path: {}", user_path.display()), + location!(), + )) + }; + conf.insert(String::from("path"), Value::String(String::from(p))); + if let Some(kind) = &self.config.user_kind { + conf.insert(String::from("kind"), Value::String(kind.clone())); + } + Ok(Some(Value::Object(conf))) + } +} + +impl TokenizerBuilder for LinderaBuilder { + type Config = LinderaConfig; + + fn new(config: Self::Config, root: &PathBuf) -> Result { + Ok(LinderaBuilder{config, root: root.clone()}) + } + + fn build(&self) -> Result { + let main_path = self.main_dict_path(); + let dictionary = 
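+            // Assumed layout: `main_dict_path()` falls back to the `main/`
+            // subdirectory, which is where the download script points
+            // `lindera build` when no config.json overrides it.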
load_dictionary_from_path(main_path.as_path()).map_err(|e| { + Error::io( + format!("load lindera tokenizer main dictionary from {}, error: {}", main_path.display(), e), + location!(), + ) + })?; + let user_dictionary = match self.user_dict_config()? { + Some(conf) => { + let user_dictionary = load_user_dictionary_from_config(&conf).map_err(|e| { + Error::io( + format!("load lindera tokenizer user dictionary err: {e}"), + location!(), + ) + })?; + Some(user_dictionary) + }, + None => None + + }; + let mode = Mode::Normal; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + } +} + From 5313fe71a2df3b3fb1e1a94416c78e96e111dd61 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Fri, 27 Dec 2024 22:51:55 +0800 Subject: [PATCH 12/22] format and dict --- python/python/lance/{lm.py => download.py} | 0 .../python/tests/lms/jieba/default/dict.txt | 8 ++++ .../tests/lms/jieba/user_dict/config.json | 6 +++ .../python/tests/lms/jieba/user_dict/user.txt | 1 + .../src/scalar/inverted/tokenizer.rs | 11 +++-- .../src/scalar/inverted/tokenizer/jieba.rs | 41 +++++++++++----- .../src/scalar/inverted/tokenizer/lindera.rs | 48 ++++++++++++------- 7 files changed, 80 insertions(+), 35 deletions(-) rename python/python/lance/{lm.py => download.py} (100%) create mode 100644 python/python/tests/lms/jieba/default/dict.txt create mode 100644 python/python/tests/lms/jieba/user_dict/config.json create mode 100644 python/python/tests/lms/jieba/user_dict/user.txt diff --git a/python/python/lance/lm.py b/python/python/lance/download.py similarity index 100% rename from python/python/lance/lm.py rename to python/python/lance/download.py diff --git a/python/python/tests/lms/jieba/default/dict.txt b/python/python/tests/lms/jieba/default/dict.txt new file mode 100644 index 0000000000..237b47ca6a --- /dev/null +++ b/python/python/tests/lms/jieba/default/dict.txt @@ -0,0 +1,8 @@ +我们 98740 r +都 202780 d +有 423765 v +光明 1219 n +的 318825 uj +前途 1263 n +前 62779 f +途 857 n diff --git a/python/python/tests/lms/jieba/user_dict/config.json b/python/python/tests/lms/jieba/user_dict/config.json new file mode 100644 index 0000000000..5f0541ed4f --- /dev/null +++ b/python/python/tests/lms/jieba/user_dict/config.json @@ -0,0 +1,6 @@ +{ + "main": "../default/dict.txt", + "user": [ + "user.txt" + ] +} diff --git a/python/python/tests/lms/jieba/user_dict/user.txt b/python/python/tests/lms/jieba/user_dict/user.txt new file mode 100644 index 0000000000..be2d8a9582 --- /dev/null +++ b/python/python/tests/lms/jieba/user_dict/user.txt @@ -0,0 +1 @@ +光明的前途 1219 n diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 52f263e638..d0b38f8f3f 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -157,7 +157,7 @@ fn build_base_tokenizer_builder(name: &str) -> Result Result Result { if !p.is_dir() { - return Err(Error::io(format!("{} is not a valid directory", p.display()), location!())) + return Err(Error::io( + format!("{} is not a valid directory", p.display()), + location!(), + )); } use std::{fs::File, io::BufReader}; let config_path = p.join(LANCE_LANGUAGE_MODEL_CONFIG_FILE); - let config= if config_path.exists() { + let config = if config_path.exists() { let file = File::open(config_path)?; let reader = BufReader::new(file); serde_json::from_reader::, 
Self::Config>(reader)? diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs index 33fcb8b30a..a874ca3bd4 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -1,25 +1,28 @@ use std::path::PathBuf; +use super::TokenizerBuilder; use lance_core::{Error, Result}; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; -use super::TokenizerBuilder; #[derive(Serialize, Deserialize)] -pub struct JiebaConfig{ +pub struct JiebaConfig { main: Option, - users: Option> + users: Option>, } impl Default for JiebaConfig { fn default() -> Self { - Self { main: Default::default(), users: Default::default() } + Self { + main: Default::default(), + users: Default::default(), + } } } pub struct JiebaBuilder { root: PathBuf, - config: JiebaConfig + config: JiebaConfig, } impl JiebaBuilder { @@ -42,7 +45,10 @@ impl TokenizerBuilder for JiebaBuilder { type Config = JiebaConfig; fn new(config: Self::Config, root: &PathBuf) -> Result { - Ok(JiebaBuilder{config, root: root.clone()}) + Ok(JiebaBuilder { + config, + root: root.clone(), + }) } fn build(&self) -> Result { @@ -51,7 +57,11 @@ impl TokenizerBuilder for JiebaBuilder { let mut f = std::io::BufReader::new(file); let mut jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { Error::io( - format!("load jieba tokenizer dictionary {}, error: {}", main_dict_path.display(), e), + format!( + "load jieba tokenizer dictionary {}, error: {}", + main_dict_path.display(), + e + ), location!(), ) })?; @@ -60,19 +70,23 @@ impl TokenizerBuilder for JiebaBuilder { let mut f = std::io::BufReader::new(file); jieba.load_dict(&mut f).map_err(|e| { Error::io( - format!("load jieba tokenizer user dictionary {}, error: {}", user_dict_path.display(), e), + format!( + "load jieba tokenizer user dictionary {}, error: {}", + user_dict_path.display(), + e + ), location!(), ) })? 
} - let tokenizer = JiebaTokenizer{jieba: jieba}; + let tokenizer = JiebaTokenizer { jieba: jieba }; Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) } } #[derive(Clone)] -struct JiebaTokenizer{ - jieba: jieba_rs::Jieba +struct JiebaTokenizer { + jieba: jieba_rs::Jieba, } struct JiebaTokenStream { @@ -99,7 +113,6 @@ impl tantivy::tokenizer::TokenStream for JiebaTokenStream { } } - #[cfg(feature = "tokenizer-jieba")] impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { type TokenStream<'a> = JiebaTokenStream; @@ -107,7 +120,9 @@ impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { fn token_stream(&mut self, text: &str) -> JiebaTokenStream { let mut indices = text.char_indices().collect::>(); indices.push((text.len(), '\0')); - let orig_tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); + let orig_tokens = self + .jieba + .tokenize(text, jieba_rs::TokenizeMode::Search, true); let mut tokens = Vec::new(); for token in orig_tokens { tokens.push(tantivy::tokenizer::Token { diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs index e07ed4d91f..ab60790de6 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -1,10 +1,7 @@ use std::path::PathBuf; -use lance_core::{Error, Result}; -use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; -use snafu::{location, Location}; use super::TokenizerBuilder; +use lance_core::{Error, Result}; use lindera::{ dictionary::{ load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, @@ -13,23 +10,30 @@ use lindera::{ segmenter::Segmenter, }; use lindera_tantivy::tokenizer::LinderaTokenizer; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use snafu::{location, Location}; #[derive(Serialize, Deserialize)] -pub struct LinderaConfig{ +pub struct LinderaConfig { main: Option, user: Option, - user_kind: Option + user_kind: Option, } impl Default for LinderaConfig { fn default() -> Self { - Self { main: Default::default(), user: Default::default(), user_kind: Default::default() } + Self { + main: Default::default(), + user: Default::default(), + user_kind: Default::default(), + } } } pub struct LinderaBuilder { root: PathBuf, - config: LinderaConfig + config: LinderaConfig, } impl LinderaBuilder { @@ -42,15 +46,18 @@ impl LinderaBuilder { fn user_dict_config(&self) -> Result> { let Some(user_dict_path) = &self.config.user else { - return Ok(None) + return Ok(None); }; let mut conf = Map::::new(); let user_path = self.root.join(user_dict_path); let Some(p) = user_path.to_str() else { return Err(Error::io( - format!("invalid lindera tokenizer user dictionary path: {}", user_path.display()), + format!( + "invalid lindera tokenizer user dictionary path: {}", + user_path.display() + ), location!(), - )) + )); }; conf.insert(String::from("path"), Value::String(String::from(p))); if let Some(kind) = &self.config.user_kind { @@ -64,19 +71,26 @@ impl TokenizerBuilder for LinderaBuilder { type Config = LinderaConfig; fn new(config: Self::Config, root: &PathBuf) -> Result { - Ok(LinderaBuilder{config, root: root.clone()}) + Ok(LinderaBuilder { + config, + root: root.clone(), + }) } fn build(&self) -> Result { let main_path = self.main_dict_path(); let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { Error::io( - format!("load lindera tokenizer main dictionary from {}, error: {}", 
main_path.display(), e), + format!( + "load lindera tokenizer main dictionary from {}, error: {}", + main_path.display(), + e + ), location!(), ) })?; let user_dictionary = match self.user_dict_config()? { - Some(conf) => { + Some(conf) => { let user_dictionary = load_user_dictionary_from_config(&conf).map_err(|e| { Error::io( format!("load lindera tokenizer user dictionary err: {e}"), @@ -84,9 +98,8 @@ impl TokenizerBuilder for LinderaBuilder { ) })?; Some(user_dictionary) - }, - None => None - + } + None => None, }; let mode = Mode::Normal; let segmenter = Segmenter::new(mode, dictionary, user_dictionary); @@ -94,4 +107,3 @@ impl TokenizerBuilder for LinderaBuilder { Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) } } - From f7fcb4788ee5dc8f99803adbf7dfa57a994fad84 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 11:23:04 +0800 Subject: [PATCH 13/22] update tokenizer --- python/python/lance/download.py | 60 ++++++++++++------ python/python/lance/lance/__init__.pyi | 3 +- .../{lms => models}/jieba/default/dict.txt | 0 .../jieba/user_dict/config.json | 0 .../{lms => models}/jieba/user_dict/user.txt | 0 .../lindera/ipadic/ipadic_simple_userdic.bin | Bin 0 -> 1612 bytes .../lindera/ipadic/ipadic_simple_userdic.csv | 3 + python/src/lib.rs | 20 +++++- .../src/scalar/inverted/tokenizer.rs | 11 ++-- 9 files changed, 66 insertions(+), 31 deletions(-) rename python/python/tests/{lms => models}/jieba/default/dict.txt (100%) rename python/python/tests/{lms => models}/jieba/user_dict/config.json (100%) rename python/python/tests/{lms => models}/jieba/user_dict/user.txt (100%) create mode 100644 python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin create mode 100644 python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv diff --git a/python/python/lance/download.py b/python/python/lance/download.py index 1338889054..66f2558176 100644 --- a/python/python/lance/download.py +++ b/python/python/lance/download.py @@ -1,44 +1,55 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors -from io import BytesIO import os import shutil import subprocess import tarfile import traceback -from .lance import LANGUAGE_MODEL_HOME +from io import BytesIO + +from .lance import language_model_home + +LANGUAGE_MODEL_HOME = language_model_home() -if LANGUAGE_MODEL_HOME is None: - raise Exception("LANCE_LANGUAGE_MODEL_HOME is not configured") def check_lindera(): if not shutil.which("lindera"): - raise Exception("lindera is not installed. Please install it by following https://github.com/lindera/lindera/tree/main/lindera-cli") + raise Exception( + "lindera is not installed. 
Please install it by following https://github.com/lindera/lindera/tree/main/lindera-cli" + ) + -def check_requests(): +def import_requests(): try: - import requests - except: + import requests # type: ignore + except Exception: raise Exception("requests is not installed, Please pip install requests") + return requests + def download_jieba(): dirname = os.path.join(LANGUAGE_MODEL_HOME, "jieba", "default") os.makedirs(dirname, exist_ok=True) try: - check_requests() - import requests - resp = requests.get("https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt") + requests = import_requests() + resp = requests.get( + "https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt" + ) content = resp.content with open(os.path.join(dirname, "dict.txt"), "wb") as fo: fo.write(content) except Exception as _: traceback.print_exc() - print("Download jieba language model failed. Please download dict.txt from " - f"https://github.com/messense/jieba-rs/tree/main/src/data and put it in {dirname}") + print( + "Download jieba language model failed. Please download dict.txt from " + "https://github.com/messense/jieba-rs/tree/main/src/data " + f"and put it in {dirname}" + ) + def download_lindera(lm: str): - import requests + requests = import_requests() dirname = os.path.join(LANGUAGE_MODEL_HOME, "lindera", lm) src_dirname = os.path.join(dirname, "src") if lm == "ipadic": @@ -60,7 +71,13 @@ def download_lindera(lm: str): with tarfile.open(fileobj=BytesIO(data)) as tar: tar.extractall() name = tar.getnames()[0] - cmd = ["lindera", "build", f"--dictionary-kind={lm}", os.path.join(src_dirname, name),os.path.join(dirname, "main")] + cmd = [ + "lindera", + "build", + f"--dictionary-kind={lm}", + os.path.join(src_dirname, name), + os.path.join(dirname, "main"), + ] print(f"compiling language model: {' '.join(cmd)}") subprocess.run(cmd) finally: @@ -69,18 +86,19 @@ def download_lindera(lm: str): def main(): import argparse + parser = argparse.ArgumentParser( - description='Lance tokenizer language model downloader' + description="Lance tokenizer language model downloader" ) - parser.add_argument('tokenizer', choices=['jieba', 'lindera']) + parser.add_argument("tokenizer", choices=["jieba", "lindera"]) parser.add_argument("-l", "--languagemodel") args = parser.parse_args() print(f"LANCE_LANGUAGE_MODEL_HOME={LANGUAGE_MODEL_HOME}") - if args.tokenizer == 'jieba': + if args.tokenizer == "jieba": download_jieba() - elif args.tokenizer == 'lindera': + elif args.tokenizer == "lindera": download_lindera(args.languagemodel) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index bf535a47a4..07aefef390 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -15,6 +15,7 @@ from pathlib import Path from typing import ( Any, + Callable, Dict, Iterable, Iterator, @@ -433,4 +434,4 @@ class BFloat16: def bfloat16_array(values: List[str | None]) -> BFloat16Array: ... 
__version__: str
-LANGUAGE_MODEL_HOME: Optional[str]
+language_model_home: Callable[[], str]
diff --git a/python/python/tests/lms/jieba/default/dict.txt b/python/python/tests/models/jieba/default/dict.txt
similarity index 100%
rename from python/python/tests/lms/jieba/default/dict.txt
rename to python/python/tests/models/jieba/default/dict.txt
diff --git a/python/python/tests/lms/jieba/user_dict/config.json b/python/python/tests/models/jieba/user_dict/config.json
similarity index 100%
rename from python/python/tests/lms/jieba/user_dict/config.json
rename to python/python/tests/models/jieba/user_dict/config.json
diff --git a/python/python/tests/lms/jieba/user_dict/user.txt b/python/python/tests/models/jieba/user_dict/user.txt
similarity index 100%
rename from python/python/tests/lms/jieba/user_dict/user.txt
rename to python/python/tests/models/jieba/user_dict/user.txt
diff --git a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ada7701adb197032ce7374b3bed5269a351ca8e2
GIT binary patch
literal 1612
zcmeHGJxfDD5S;9@P_PhDun<8I6tS?du(0tz_!ImAmKGLycZu2rEJQ3+QasFu2`Fh)
z0{#U{3kzX27_dm8kix>*yq6#fVxic$&F$^Z&CD%#=tCUa0kmHbMuOsQfJR&5{kUNB
zls&Mw#M+P`N30tYY^7P>VE;KlxXNCiz}*Qhc32-|o)?@L?*jAvxF7M1r;cGW?D>kI
zyhaRb{#8)V2WVyl*#V$HtrUa`NRbMl&Qn!3D$E>eTu
zqYQOs*aI~TLtc*nljJV%6W0zz=~`8My+~+T3
zpf!@Haa|!Ol{{X4*iH;5tGW|)7wWFm=O1=*kEJ7Hw3z))G8JQ?VNt_P4Rh+YEHO$F
zN9r@`%j)aRLUy`~_1o%>)m7COI>|?@x+1$h`5(4Cme^<#?+a|tEZzW`{UO2t

literal 0
HcmV?d00001

diff --git a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv
new file mode 100644
index 0000000000..fae82a570b
--- /dev/null
+++ b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv
@@ -0,0 +1,3 @@
+東京スカイツリー,カスタム名詞,トウキョウスカイツリー
+東武スカイツリーライン,カスタム名詞,トウブスカイツリーライン
+とうきょうスカイツリー駅,カスタム名詞,トウキョウスカイツリーエキ
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 98aa8b43be..f80399dae2 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -144,15 +144,14 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(read_tfrecord))?;
     m.add_wrapped(wrap_pyfunction!(trace_to_chrome))?;
     m.add_wrapped(wrap_pyfunction!(manifest_needs_migration))?;
+    m.add_wrapped(wrap_pyfunction!(language_model_home))?;
     // Debug functions
     m.add_wrapped(wrap_pyfunction!(debug::format_schema))?;
     m.add_wrapped(wrap_pyfunction!(debug::format_manifest))?;
     m.add_wrapped(wrap_pyfunction!(debug::format_fragment))?;
     m.add_wrapped(wrap_pyfunction!(debug::list_transactions))?;
     m.add("__version__", env!("CARGO_PKG_VERSION"))?;
-    let none = PyNone::get_bound(py).into_py(py);
-    let lm_home = lance_index::scalar::inverted::LANCE_LANGUAGE_MODEL_HOME.as_ref().and_then(|p| p.to_str()).map(|p| PyString::new_bound(py, p).into_py(py)).unwrap_or(none);
-    m.add("LANGUAGE_MODEL_HOME", lm_home)?;
+
     register_datagen(py, m)?;
     register_indices(py, m)?;
     Ok(())
@@ -176,6 +175,21 @@ fn json_to_schema(json: &str) -> PyResult<PyArrowType<Schema>> {
     Ok(schema.into())
 }
 
+#[pyfunction]
+pub fn language_model_home() -> PyResult<String> {
+    let Some(p) = lance_index::scalar::inverted::language_model_home() else {
+        return Err(pyo3::exceptions::PyValueError::new_err(format!(
+            "Failed to get language model home"
+        )));
+    };
+    let Some(pstr) = p.to_str() else {
+        return Err(pyo3::exceptions::PyValueError::new_err(format!(
+            "Failed to convert language model home to str"
+        )));
+    };
+    Ok(String::from(pstr))
+}
+
 /// Infer schema from tfrecord file
 ///
 /// Parameters
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index d0b38f8f3f..cdb60ff0f8 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -153,7 +153,7 @@ fn build_base_tokenizer_builder(name: &str) -> Result<tantivy::tokenizer::TextAnalyzerBuilder> {
-            let Some(home) = &*LANCE_LANGUAGE_MODEL_HOME else {
+            let Some(home) = language_model_home() else {
                 return Err(Error::invalid_input(
                     format!("unknown base tokenizer {}", name),
                     location!(),
@@ -164,7 +164,7 @@ fn build_base_tokenizer_builder(name: &str) -> Result<tantivy::tokenizer::TextAnalyzerBuilder> {
             let s = if s == "jieba" { "jieba/default" } else { s };
-            let Some(home) = &*LANCE_LANGUAGE_MODEL_HOME else {
+            let Some(home) = language_model_home() else {
                 return Err(Error::invalid_input(
                     format!("unknown base tokenizer {}", name),
                     location!(),
@@ -185,12 +185,11 @@ pub const LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY: &str = "lance/language_models";
 
 pub const LANCE_LANGUAGE_MODEL_CONFIG_FILE: &str = "config.json";
 
-lazy_static::lazy_static! {
-    /// default directory that stores lance tokenizer related files, e.g. tokenizer model.
-    pub static ref LANCE_LANGUAGE_MODEL_HOME: Option<PathBuf> = match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) {
+pub fn language_model_home() -> Option<PathBuf> {
+    match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) {
         Ok(p) => Some(PathBuf::from(p)),
         Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY))
-    };
+    }
 }
 
 #[cfg(feature = "tokenizer-common")]
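The net effect of this patch is that the language model home is computed on demand on both sides of the FFI boundary. Below is a minimal sketch of the new Python-facing API; it assumes a wheel built from this branch, and the `/tmp/lance_models` path is illustrative only:

```python
import os

from lance.lance import language_model_home

# With the environment variable unset, this falls back to
# <system data directory>/lance/language_models.
print(language_model_home())

# LANCE_LANGUAGE_MODEL_HOME takes precedence and is re-read on every
# call, since the lazy_static global was replaced by a plain function.
os.environ["LANCE_LANGUAGE_MODEL_HOME"] = "/tmp/lance_models"
print(language_model_home())  # -> /tmp/lance_models
```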
From eab0c16160304cfca6c9ef0ce9a001091f97d9fe Mon Sep 17 00:00:00 2001
From: Chongchen Chen
Date: Sat, 28 Dec 2024 14:38:09 +0800
Subject: [PATCH 14/22] add document

---
 docs/tokenizer.rst                                 |  87 ++++++++++++++++++
 python/python/lance/download.py                    |   2 +-
 .../lindera/ipadic/ipadic_simple_userdic.bin       | Bin 1612 -> 0 bytes
 .../lindera/ipadic/ipadic_simple_userdic.csv       |   3 -
 python/src/lib.rs                                  |   1 -
 5 files changed, 88 insertions(+), 5 deletions(-)
 create mode 100644 docs/tokenizer.rst
 delete mode 100644 python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin
 delete mode 100644 python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv

diff --git a/docs/tokenizer.rst b/docs/tokenizer.rst
new file mode 100644
index 0000000000..602956b26b
--- /dev/null
+++ b/docs/tokenizer.rst
@@ -0,0 +1,87 @@
+Tokenizers
+============================
+
+Currently, Lance has built-in support for Jieba and Lindera. However, it doesn't ship with their language models.
+If tokenization is needed, you can download the language models yourself.
+You can specify where the language models are stored by setting the environment variable LANCE_LANGUAGE_MODEL_HOME.
+If it's not set, the default is
+
+.. code-block:: bash
+
+    ${system data directory}/lance/language_models
+
+Lance also supports configuring user dictionaries,
+which makes it convenient for users to extend their own dictionaries without retraining the language models.
+
+Language Models of Jieba
+------------------------
+
+Downloading the Model
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+    python -m lance.download jieba
+
+The language model is stored by default in `${LANCE_LANGUAGE_MODEL_HOME}/jieba/default`.
+
+Using the Model
+~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default")
+
+User Dictionaries
+~~~~~~~~~~~~~~~~~
+
+Create a file named config.json in the root directory of the current model.
+
+.. code-block:: json
+
+    {
+        "main": "dict.txt",
+        "users": ["path/to/user/dict.txt"]
+    }
+
+- The "main" field is optional. If not filled, the default is "dict.txt".
+- "users" is a list of user dictionary paths. For the format of a user dictionary, please refer to https://github.com/messense/jieba-rs/blob/main/src/data/dict.txt.
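+
+As an end-to-end illustration, the snippet below builds an index with the default Jieba model and runs a full text query. This is a minimal sketch: the dataset URI and sample data are placeholders, and it assumes the model was downloaded as shown above.
+
+.. code-block:: python
+
+    import lance
+    import pyarrow as pa
+
+    data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
+    ds = lance.write_dataset(data, "/tmp/jieba_demo", mode="overwrite")
+    ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default")
+    # "我们" only appears in the first row, so only that row matches.
+    print(ds.to_table(full_text_query="我们", prefilter=True, with_row_id=True))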
+
+Language Models of Lindera
+--------------------------
+
+Downloading the Model
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+    python -m lance.download lindera -l [ipadic|ko-dic|unidic]
+
+Note that the language models of Lindera need to be compiled. Please install lindera-cli first; for detailed steps, please refer to https://github.com/lindera/lindera/tree/main/lindera-cli.
+
+The language model is stored by default in `${LANCE_LANGUAGE_MODEL_HOME}/lindera/[ipadic|ko-dic|unidic]`.
+
+Using the Model
+~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic")
+
+User Dictionaries
+~~~~~~~~~~~~~~~~~
+
+Create a file named config.json in the root directory of the current model.
+
+.. code-block:: json
+
+    {
+        "main": "main",
+        "user": "path/to/user/dict.bin",
+        "user_type": "ipadic|ko-dic|unidic"
+    }
+
+- The "main" field is optional. If not filled, the default is the "main" directory.
+- "user" is the path of the user dictionary. The user dictionary can be passed as a CSV file or as a binary file compiled by lindera-cli.
+- The "user_type" field can be left blank if the user dictionary is in binary format. If it's in CSV format, you need to specify the type of the language model.
+
+
+Create your own language model
+------------------------------
+
+Put your language model into `LANCE_LANGUAGE_MODEL_HOME`.
+
diff --git a/python/python/lance/download.py b/python/python/lance/download.py
index 66f2558176..778949aaff 100644
--- a/python/python/lance/download.py
+++ b/python/python/lance/download.py
@@ -22,7 +22,7 @@ def check_lindera():
 
 def import_requests():
     try:
-        import requests  # type: ignore
+        import requests
     except Exception:
         raise Exception("requests is not installed, Please pip install requests")
     return requests
diff --git a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin
deleted file mode 100644
index ada7701adb197032ce7374b3bed5269a351ca8e2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1612
zcmeHGJxfDD5S;9@P_PhDun<8I6tS?du(0tz_!ImAmKGLycZu2rEJQ3+QasFu2`Fh)
z0{#U{3kzX27_dm8kix>*yq6#fVxic$&F$^Z&CD%#=tCUa0kmHbMuOsQfJR&5{kUNB
zls&Mw#M+P`N30tYY^7P>VE;KlxXNCiz}*Qhc32-|o)?@L?*jAvxF7M1r;cGW?D>kI
zyhaRb{#8)V2WVyl*#V$HtrUa`NRbMl&Qn!3D$E>eTu
zqYQOs*aI~TLtc*nljJV%6W0zz=~`8My+~+T3
zpf!@Haa|!Ol{{X4*iH;5tGW|)7wWFm=O1=*kEJ7Hw3z))G8JQ?VNt_P4Rh+YEHO$F
zN9r@`%j)aRLUy`~_1o%>)m7COI>|?@x+1$h`5(4Cme^<#?+a|tEZzW`{UO2t

diff --git a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv
deleted file mode 100644
index fae82a570b..0000000000
--- a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-東京スカイツリー,カスタム名詞,トウキョウスカイツリー
-東武スカイツリーライン,カスタム名詞,トウブスカイツリーライン
-とうきょうスカイツリー駅,カスタム名詞,トウキョウスカイツリーエキ
diff --git a/python/src/lib.rs b/python/src/lib.rs
index f80399dae2..f534d0c3df 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -44,7 +44,6 @@ use futures::StreamExt;
 use lance_index::DatasetIndexExt;
 use pyo3::exceptions::{PyIOError, PyValueError};
 use pyo3::prelude::*;
-use pyo3::types::{PyNone, PyString};
 use session::Session;
 
 #[macro_use]
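For the Lindera path, the flow mirrors the Japanese test added earlier in this series. A minimal sketch, assuming the ipadic model has been downloaded and compiled per docs/tokenizer.rst (the dataset path is illustrative):

```python
import lance
import pyarrow as pa

data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
ds = lance.write_dataset(data, "/tmp/lindera_demo", mode="overwrite")
ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic")
# With the stock ipadic dictionary, "成田" is split out as its own token,
# so it matches the first row.
print(ds.to_table(full_text_query="成田", prefilter=True, with_row_id=True))
```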
From d19c957d64753e06dc427566d68adb87fd5a590b Mon Sep 17 00:00:00 2001
From: Chongchen Chen
Date: Sat, 28 Dec 2024 16:37:44 +0800
Subject: [PATCH 15/22] update test

---
 docs/tokenizer.rst                            |   4 +-
 .../models/jieba/invalid_dict/config.json     |   6 +
 .../models/jieba/invalid_dict2/config.json    |   3 +
 .../tests/models/jieba/user_dict/config.json  |   2 +-
 .../tests/models/jieba/user_dict/user.txt     |   2 +-
 python/python/tests/models/lindera/README.md  |  28 ++++
 .../models/lindera/invalid_dict/config.json   |   4 +
 .../models/lindera/invalid_dict2/config.json  |   4 +
 .../tests/models/lindera/ipadic/main.zip      | Bin 0 -> 5910 bytes
 .../models/lindera/ipadic/raw/Noun.mock.csv   |   3 +
 .../models/lindera/user_dict/config.json      |   5 +
 .../models/lindera/user_dict/userdic.csv      |   1 +
 .../models/lindera/user_dict2/config.json     |   4 +
 .../models/lindera/user_dict2/userdic.bin     | Bin 0 -> 1226 bytes
 python/python/tests/test_scalar_index.py      | 147 +++++++++++++++++-
 .../src/scalar/inverted/tokenizer.rs          |   6 +-
 .../src/scalar/inverted/tokenizer/lindera.rs  |   2 +-
 17 files changed, 206 insertions(+), 15 deletions(-)
 create mode 100644 python/python/tests/models/jieba/invalid_dict/config.json
 create mode 100644 python/python/tests/models/jieba/invalid_dict2/config.json
 create mode 100644 python/python/tests/models/lindera/README.md
 create mode 100644 python/python/tests/models/lindera/invalid_dict/config.json
 create mode 100644 python/python/tests/models/lindera/invalid_dict2/config.json
 create mode 100644 python/python/tests/models/lindera/ipadic/main.zip
 create mode 100644 python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv
 create mode 100644 python/python/tests/models/lindera/user_dict/config.json
 create mode 100644 python/python/tests/models/lindera/user_dict/userdic.csv
 create mode 100644 python/python/tests/models/lindera/user_dict2/config.json
 create mode 100644 python/python/tests/models/lindera/user_dict2/userdic.bin

diff --git a/docs/tokenizer.rst b/docs/tokenizer.rst
index 602956b26b..306b7919ad 100644
--- a/docs/tokenizer.rst
+++ b/docs/tokenizer.rst
@@ -71,12 +71,12 @@ Create a file named config.json in the root directory of the current model.
 
     {
         "main": "main",
         "user": "path/to/user/dict.bin",
-        "user_type": "ipadic|ko-dic|unidic"
+        "user_kind": "ipadic|ko-dic|unidic"
     }
 
 - The "main" field is optional. If not filled, the default is the "main" directory.
 - "user" is the path of the user dictionary. The user dictionary can be passed as a CSV file or as a binary file compiled by lindera-cli.
-- The "user_type" field can be left blank if the user dictionary is in binary format. If it's in CSV format, you need to specify the type of the language model.
+- The "user_kind" field can be left blank if the user dictionary is in binary format. If it's in CSV format, you need to specify the type of the language model.
 
 
 Create your own language model
 ------------------------------
 
 Put your language model into `LANCE_LANGUAGE_MODEL_HOME`.
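The "Create your own language model" section above is terse, so here is a sketch of materializing a custom Jieba dictionary layout by hand. The `my_dict` name is hypothetical, and the fallback path assumes the Linux local data directory:

```python
import json
import os

home = os.environ.get(
    "LANCE_LANGUAGE_MODEL_HOME",
    os.path.expanduser("~/.local/share/lance/language_models"),  # Linux default
)
root = os.path.join(home, "jieba", "my_dict")
os.makedirs(root, exist_ok=True)

# config.json follows the schema documented above: an optional "main"
# dictionary plus a list of user dictionaries, both relative to this folder.
with open(os.path.join(root, "config.json"), "w") as f:
    json.dump({"main": "../default/dict.txt", "users": ["user.txt"]}, f)

# One `word frequency tag` entry per line, as in jieba's dict format.
with open(os.path.join(root, "user.txt"), "w", encoding="utf-8") as f:
    f.write("光明的前途 318825 n\n")

# The model is then addressable as base_tokenizer="jieba/my_dict".
```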
diff --git a/python/python/tests/models/jieba/invalid_dict/config.json b/python/python/tests/models/jieba/invalid_dict/config.json
new file mode 100644
index 0000000000..cf4301aa2b
--- /dev/null
+++ b/python/python/tests/models/jieba/invalid_dict/config.json
@@ -0,0 +1,6 @@
+{
+    "main": "../default/dict.txt",
+    "users": [
+        "invalid_user.txt"
+    ]
+}
diff --git a/python/python/tests/models/jieba/invalid_dict2/config.json b/python/python/tests/models/jieba/invalid_dict2/config.json
new file mode 100644
index 0000000000..d0216419a5
--- /dev/null
+++ b/python/python/tests/models/jieba/invalid_dict2/config.json
@@ -0,0 +1,3 @@
+{
+    "main": "invalid_dict.txt"
+}
diff --git a/python/python/tests/models/jieba/user_dict/config.json b/python/python/tests/models/jieba/user_dict/config.json
index 5f0541ed4f..0d65334ca2 100644
--- a/python/python/tests/models/jieba/user_dict/config.json
+++ b/python/python/tests/models/jieba/user_dict/config.json
@@ -1,6 +1,6 @@
 {
     "main": "../default/dict.txt",
-    "user": [
+    "users": [
         "user.txt"
     ]
 }
diff --git a/python/python/tests/models/jieba/user_dict/user.txt b/python/python/tests/models/jieba/user_dict/user.txt
index be2d8a9582..bb6ffa4d85 100644
--- a/python/python/tests/models/jieba/user_dict/user.txt
+++ b/python/python/tests/models/jieba/user_dict/user.txt
@@ -1 +1 @@
-光明的前途 1219 n
+光明的前途 318825 n
diff --git a/python/python/tests/models/lindera/README.md b/python/python/tests/models/lindera/README.md
new file mode 100644
index 0000000000..c4073b65d5
--- /dev/null
+++ b/python/python/tests/models/lindera/README.md
@@ -0,0 +1,28 @@
+# How to build this test language model
+
+The IPADIC model is about 45 MB, so we created a tiny IPADIC and shipped it as a zip.
+
+- Download the language model
+
+```bash
+curl -L -o mecab-ipadic-2.7.0-20070801.tar.gz "https://github.com/lindera-morphology/mecab-ipadic/archive/refs/tags/2.7.0-20070801.tar.gz"
+tar xvf mecab-ipadic-2.7.0-20070801.tar.gz
+```
+
+- Remove the CSV files from the extracted folder
+
+- Copy the files in `ipadic/raw` into the folder
+
+- Edit matrix.def and reset the last column (weight) to zero, except for the first row.
+ +- build + +```bash +lindera build --dictionary-kind=ipadic mecab-ipadic-2.7.0-20070801 main +``` + +- build user dict + +```bash +lindera build --build-user-dictionary --dictionary-kind=ipadic user_dict/userdict.csv user_dict2 +``` diff --git a/python/python/tests/models/lindera/invalid_dict/config.json b/python/python/tests/models/lindera/invalid_dict/config.json new file mode 100644 index 0000000000..b486aeba24 --- /dev/null +++ b/python/python/tests/models/lindera/invalid_dict/config.json @@ -0,0 +1,4 @@ +{ + "main": "../main", + "user": "invalid.bin" +} diff --git a/python/python/tests/models/lindera/invalid_dict2/config.json b/python/python/tests/models/lindera/invalid_dict2/config.json new file mode 100644 index 0000000000..11c22e9f1c --- /dev/null +++ b/python/python/tests/models/lindera/invalid_dict2/config.json @@ -0,0 +1,4 @@ +{ + "main": "../main", + "user": "ipadic_simple_userdic.csv" +} diff --git a/python/python/tests/models/lindera/ipadic/main.zip b/python/python/tests/models/lindera/ipadic/main.zip new file mode 100644 index 0000000000000000000000000000000000000000..25966ae2a1d06f509cc06ba46cc1555cca2cbeae GIT binary patch literal 5910 zcmWIWW@Zs#0Du4mnWrBb!pp$kwm3h%8;C)4X$3a}Bg*wPl?Wr`fX48_j7Z5$F3~GX%qa$&rMd)S4v0oGD>(rO(h``Q+!VT5J&c%c z^6;!lFk}euX6Kmu{BWH(&`^-U*sa{e&A=cCvobj&u_!(zHBB!mGmnt1Z?Eq67jYCh z@NxV1-+bKJ+)StV1DUzh)NWbmoZfivUZq@uQ-_$8H$63F0 z+BAK+ET^5}#OBhc*t~|ZdP29vBDNyV>4mKo5h?OUN}G%??OLHzbYgMC?SPCV-u54> z^#4iiRa|_AUGi}0#NJi^rbP%}I-$_d)TQx|QSK4%y+^ioFK$(z>Gzznu_e{!nDC0s ziG@369C%kd(ZAy7oZ92Rw2JEmCG8#rZCG;N{f+FjhccV5TeQo4c_b$CY-2}%#v|iz z0(ZIl{#040+uT03yw@W6WV-d4*{?gL=WqJrD>eJGnC{P@_R8CTJips!-pRDTU6uQD zzwQ11@A9wB{+@iP=GF>t$@#NO(`rS;lU_Z3b~f_U&z6n3HJ57pZ<`m-Kl9q`^YYT= zrS(DI*Ux`H*^YhN{p{kJ<^KjPPFzr>>$ySe5R>)uf!Q+8yO)dCJhlW76Ug^%64T2cRUvirTc`Fl5o2PBJr+Sv2F%PwSQfE@#z;tRs(wenRS`o~q z0qn11zlBfP37x8P zpYk_M^R;Pa#myy-W|JiwAb2o?+ABBngQD5W?vrGij~2Ng3Lm>yXvuz`z~ z(m0DJz_r0pEGf#Y(910$XwTb&hCmhP{(-AHv9OU zuW$B!|bt#NHE+cT5*jD@qlgF2Gz92Z>p{qlg01vwbIR~T53MiRhYNl7H+kK`E# z&V1O?)cH`7S@`g0aUY(|_6CM;Vh^7AVAS537{)N+{HX&6P8_&!;o+ep1`8W!KCI*r zK0L+Phex{MQJ2HVsrfS`%6t>kD!w&$Chk>XKHSIZ!_((b*cNEO<#uoZJJ{tvbPDGu z09_4oUw}6wlL#~JQ9_V15P0hdB8eO`gcu1MHAEiV1C1KOz>-F1n2|(}A;O#sGaGq0 z2Q)kg14|mW!OSLQun^sN*~D;?WI9PMxS>o+XTIIN`h$ U;LXYg@;4~4+yRa>seqF?04AJJ#{d8T literal 0 HcmV?d00001 diff --git a/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv b/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv new file mode 100644 index 0000000000..4201b57a54 --- /dev/null +++ b/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv @@ -0,0 +1,3 @@ +,1293,1293,5686,̾,ͭ̾,ϰ,,*,*,,ʥ꥿,ʥ꥿ +,1285,1285,553,̾,,*,*,*,*,,, +,1285,1285,7778,̾,,*,*,*,*,,, \ No newline at end of file diff --git a/python/python/tests/models/lindera/user_dict/config.json b/python/python/tests/models/lindera/user_dict/config.json new file mode 100644 index 0000000000..e554849af2 --- /dev/null +++ b/python/python/tests/models/lindera/user_dict/config.json @@ -0,0 +1,5 @@ +{ + "main": "../ipadic/main", + "user": "userdic.csv", + "user_kind": "ipadic" +} diff --git a/python/python/tests/models/lindera/user_dict/userdic.csv b/python/python/tests/models/lindera/user_dict/userdic.csv new file mode 100644 index 0000000000..652c3f7791 --- /dev/null +++ b/python/python/tests/models/lindera/user_dict/userdic.csv @@ -0,0 +1 @@ +成田国際空港,カスタム名詞,トウキョウスカイツリー diff --git a/python/python/tests/models/lindera/user_dict2/config.json b/python/python/tests/models/lindera/user_dict2/config.json new file mode 100644 index 
0000000000..e06bd8c71b --- /dev/null +++ b/python/python/tests/models/lindera/user_dict2/config.json @@ -0,0 +1,4 @@ +{ + "main": "../ipadic/main", + "user": "userdic.bin" +} diff --git a/python/python/tests/models/lindera/user_dict2/userdic.bin b/python/python/tests/models/lindera/user_dict2/userdic.bin new file mode 100644 index 0000000000000000000000000000000000000000..a0410fa0798689aaaea53c73af4972e8ea805aca GIT binary patch literal 1226 zcmeHHF$%&!5FGck(9$2+iqG%~KEx-4JBu{J6&8{cu@EJSn8HFu@dG}Gr%#+~t32{J3ESrIVuL;qZ zaJ&ob?R;AUDu9!3EvZbPOyCa^Xnei#c}tt(Fr?a~#iE`OnmMyvvplf8u$qN>`0%Ip x@4wOhMHFiySI46uH0Q)Kv44#A+g4$qT$T%#8&=D=ux8eB&T7DF#p?92!3)L#ORoR` literal 0 HcmV?d00001 diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 0e51ca70b5..83a45ec499 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -3,9 +3,11 @@ import os import random +import shutil import string from datetime import date, datetime, timedelta from pathlib import Path +import zipfile import lance import numpy as np @@ -33,6 +35,23 @@ def gen_str(n, split="", char_set=string.ascii_letters + string.digits): ) return tbl +def set_language_model_path(): + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join(os.path.dirname(__file__), "models") + + +@pytest.fixture() +def lindera_ipadic(): + set_language_model_path() + model_path = os.path.join(os.path.dirname(__file__), "models", "lindera", "ipadic") + cwd = os.getcwd() + try: + os.chdir(model_path) + with zipfile.ZipFile("main.zip", 'r') as zip_ref: + zip_ref.extractall() + os.chdir(cwd) + yield + finally: + shutil.rmtree(os.path.join(model_path, "main")) @pytest.fixture() def dataset(tmp_path): @@ -325,8 +344,8 @@ def test_fts_all_deleted(dataset): dataset.delete(f"doc = '{first_row_doc}'") dataset.to_table(full_text_query=first_row_doc) - -def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path): +def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path, lindera_ipadic): + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join(os.path.dirname(__file__), "models") data = pa.table( { "text": [ @@ -346,22 +365,136 @@ def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path ) assert results["_rowid"].to_pylist() == [0] +def test_lindera_ipadic_jp_tokenizer_invalid_user_dict_path(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/invalid_dict") + +def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/invalid_dict2") + +def test_lindera_ipadic_jp_tokenizer_csv_user_dict(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + "東京国際空港", + "羽田空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/user_dict") + results = ds.to_table( + full_text_query="成田", + prefilter=True, + with_row_id=True, + ) + assert len(results) == 0 + results = ds.to_table( + full_text_query="成田国際空港", + prefilter=True, + with_row_id=True, + ) + assert 
results["_rowid"].to_pylist() == [0] + +def test_lindera_ipadic_jp_tokenizer_bin_user_dict(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/user_dict2") + +def test_jieba_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": [ + "我们都有光明的前途", + "光明的前途" + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default") + results = ds.to_table( + full_text_query="我们", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [0] + +def test_jieba_invalid_user_dict_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": [ + "我们都有光明的前途", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/invalid_dict") + -def test_indexed_filter_with_fts_index_with_lindera_ko_tokenizer(tmp_path): +def test_jieba_invalid_main_dict_tokenizer(tmp_path): + set_language_model_path() data = pa.table( { - "text": ["하네다공항한정토트백", "나리타공항한정토트백"], + "text": [ + "我们都有光明的前途", + ], } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ko-dic") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/invalid_dict2") +def test_jieba_user_dict_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": [ + "我们都有光明的前途", + "光明的前途" + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/user_dict") + results = ds.to_table( + full_text_query="的前", + prefilter=True, + with_row_id=True, + ) + assert len(results) == 0 results = ds.to_table( - full_text_query="나리타", + full_text_query="光明的前途", prefilter=True, with_row_id=True, ) - assert results["_rowid"].to_pylist() == [1] + assert results["_rowid"].to_pylist() == [1, 0] def test_bitmap_index(tmp_path: Path): diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index cdb60ff0f8..11c161c2f6 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -20,8 +20,8 @@ pub struct TokenizerConfig { /// - `simple`: splits tokens on whitespace and punctuation /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization - /// - `lindera-ipadic`: Japanese tokenizer - /// - `lindera-ko-dic`: Korea tokenizer + /// - `lindera/*`: Lindera tokenizer + /// - `jieba/*`: Jieba tokenizer /// /// `simple` is recommended for most cases and the default value base_tokenizer: String, @@ -170,7 +170,7 @@ fn build_base_tokenizer_builder(name: &str) -> Result Err(Error::invalid_input( format!("unknown base tokenizer {}", name), diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs index ab60790de6..fedb2eb59e 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -93,7 +93,7 @@ impl TokenizerBuilder for LinderaBuilder { Some(conf) => { let user_dictionary = load_user_dictionary_from_config(&conf).map_err(|e| { Error::io( - format!("load lindera tokenizer 
user dictionary err: {e}"), + format!("load lindera tokenizer user dictionary, conf:{conf}, err: {e}"), location!(), ) })?; From d1d98c3b0de03cc78dcc71b03cd1e10b7d240770 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:38:22 +0800 Subject: [PATCH 16/22] format --- python/python/tests/test_scalar_index.py | 48 ++++++++++++++++-------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 83a45ec499..c24b7abb20 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -5,9 +5,9 @@ import random import shutil import string +import zipfile from datetime import date, datetime, timedelta from pathlib import Path -import zipfile import lance import numpy as np @@ -35,8 +35,11 @@ def gen_str(n, split="", char_set=string.ascii_letters + string.digits): ) return tbl + def set_language_model_path(): - os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join(os.path.dirname(__file__), "models") + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join( + os.path.dirname(__file__), "models" + ) @pytest.fixture() @@ -46,13 +49,14 @@ def lindera_ipadic(): cwd = os.getcwd() try: os.chdir(model_path) - with zipfile.ZipFile("main.zip", 'r') as zip_ref: + with zipfile.ZipFile("main.zip", "r") as zip_ref: zip_ref.extractall() os.chdir(cwd) yield finally: shutil.rmtree(os.path.join(model_path, "main")) + @pytest.fixture() def dataset(tmp_path): tbl = create_table() @@ -344,8 +348,13 @@ def test_fts_all_deleted(dataset): dataset.delete(f"doc = '{first_row_doc}'") dataset.to_table(full_text_query=first_row_doc) -def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path, lindera_ipadic): - os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join(os.path.dirname(__file__), "models") + +def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer( + tmp_path, lindera_ipadic +): + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join( + os.path.dirname(__file__), "models" + ) data = pa.table( { "text": [ @@ -365,6 +374,7 @@ def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path ) assert results["_rowid"].to_pylist() == [0] + def test_lindera_ipadic_jp_tokenizer_invalid_user_dict_path(tmp_path, lindera_ipadic): data = pa.table( { @@ -375,9 +385,14 @@ def test_lindera_ipadic_jp_tokenizer_invalid_user_dict_path(tmp_path, lindera_ip ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") with pytest.raises(OSError): - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/invalid_dict") + ds.create_scalar_index( + "text", "INVERTED", base_tokenizer="lindera/invalid_dict" + ) + -def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type(tmp_path, lindera_ipadic): +def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type( + tmp_path, lindera_ipadic +): data = pa.table( { "text": [ @@ -387,7 +402,10 @@ def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type(tmp_path, linder ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") with pytest.raises(OSError): - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/invalid_dict2") + ds.create_scalar_index( + "text", "INVERTED", base_tokenizer="lindera/invalid_dict2" + ) + def test_lindera_ipadic_jp_tokenizer_csv_user_dict(tmp_path, lindera_ipadic): data = pa.table( @@ -414,6 +432,7 @@ def test_lindera_ipadic_jp_tokenizer_csv_user_dict(tmp_path, lindera_ipadic): ) assert results["_rowid"].to_pylist() == 
[0] + def test_lindera_ipadic_jp_tokenizer_bin_user_dict(tmp_path, lindera_ipadic): data = pa.table( { @@ -425,14 +444,12 @@ def test_lindera_ipadic_jp_tokenizer_bin_user_dict(tmp_path, lindera_ipadic): ds = lance.write_dataset(data, tmp_path, mode="overwrite") ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/user_dict2") + def test_jieba_tokenizer(tmp_path): set_language_model_path() data = pa.table( { - "text": [ - "我们都有光明的前途", - "光明的前途" - ], + "text": ["我们都有光明的前途", "光明的前途"], } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") @@ -444,6 +461,7 @@ def test_jieba_tokenizer(tmp_path): ) assert results["_rowid"].to_pylist() == [0] + def test_jieba_invalid_user_dict_tokenizer(tmp_path): set_language_model_path() data = pa.table( @@ -471,14 +489,12 @@ def test_jieba_invalid_main_dict_tokenizer(tmp_path): with pytest.raises(OSError): ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/invalid_dict2") + def test_jieba_user_dict_tokenizer(tmp_path): set_language_model_path() data = pa.table( { - "text": [ - "我们都有光明的前途", - "光明的前途" - ], + "text": ["我们都有光明的前途", "光明的前途"], } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") From 987e883f1110528dfb2277e7943ae68c5aad04ed Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:40:12 +0800 Subject: [PATCH 17/22] format --- rust/lance-index/src/scalar/inverted/tokenizer.rs | 2 +- rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs | 3 +++ rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 11c161c2f6..252ece7e8f 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -188,7 +188,7 @@ pub const LANCE_LANGUAGE_MODEL_CONFIG_FILE: &str = "config.json"; pub fn language_model_home() -> Option { match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) { Ok(p) => Some(PathBuf::from(p)), - Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY)) + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY)), } } diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs index a874ca3bd4..0c04dc236a 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + use std::path::PathBuf; use super::TokenizerBuilder; diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs index fedb2eb59e..5ce6a5ab36 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + use std::path::PathBuf; use super::TokenizerBuilder; From e83a18a0e5a159aa96048c3cbf5de087c2e0ed76 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:41:53 +0800 Subject: [PATCH 18/22] format --- Cargo.lock | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7ed15c3203..bcf001e073 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,12 +17,6 @@ version = "2.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" -[[package]] -name = "adler2" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" - [[package]] name = "adler32" version = "1.2.0" From 466d26ecc2e894d7ff0e7f9310e20d1d744fd5b3 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:43:20 +0800 Subject: [PATCH 19/22] format --- python/python/lance/download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/python/lance/download.py b/python/python/lance/download.py index 778949aaff..cff42520e4 100644 --- a/python/python/lance/download.py +++ b/python/python/lance/download.py @@ -37,8 +37,8 @@ def download_jieba(): "https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt" ) content = resp.content - with open(os.path.join(dirname, "dict.txt"), "wb") as fo: - fo.write(content) + with open(os.path.join(dirname, "dict.txt"), "wb") as out: + out.write(content) except Exception as _: traceback.print_exc() print( From f01777cd6653a9938b5fcc1d75b0112dd4568255 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:46:03 +0800 Subject: [PATCH 20/22] format --- Cargo.lock | 82 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bcf001e073..751769ff6e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -936,7 +936,7 @@ dependencies = [ "addr2line", "cfg-if", "libc", - "miniz_oxide 0.7.4", + "miniz_oxide", "object", "rustc-demangle", "windows-targets 0.52.6", @@ -1600,7 +1600,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -1611,7 +1611,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -2111,7 +2111,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -2121,7 +2121,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -2452,7 +2452,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide 0.8.0", + "miniz_oxide", ] [[package]] @@ -2467,6 +2467,21 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2960,7 +2975,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" dependencies = [ "bytes", "http-body-util", - "hyper 1.4.1", + "hyper 1.5.1", "hyper-util", 
"native-tls", "tokio", @@ -3137,6 +3152,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -3178,7 +3199,7 @@ dependencies = [ "libflate", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -4093,7 +4114,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror 2.0.3", + "thiserror 2.0.4", "yada", ] @@ -4239,15 +4260,6 @@ dependencies = [ "adler2", ] -[[package]] -name = "miniz_oxide" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" -dependencies = [ - "adler2", -] - [[package]] name = "mio" version = "1.0.3" @@ -4342,7 +4354,7 @@ dependencies = [ "openssl-probe", "openssl-sys", "schannel", - "security-framework", + "security-framework 2.11.1", "security-framework-sys", "tempfile", ] @@ -4589,7 +4601,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -6323,6 +6335,27 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -7026,6 +7059,15 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-segmentation" version = "1.12.0" From a25db786a3281f2491617402d5320513e63c371f Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 18:02:40 +0800 Subject: [PATCH 21/22] format --- python/src/lib.rs | 12 +++++------ .../src/scalar/inverted/tokenizer.rs | 4 ++-- .../src/scalar/inverted/tokenizer/jieba.rs | 19 +++++------------- .../src/scalar/inverted/tokenizer/lindera.rs | 20 +++++-------------- 4 files changed, 18 insertions(+), 37 deletions(-) diff --git a/python/src/lib.rs b/python/src/lib.rs index f534d0c3df..9e46ce8883 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -177,14 +177,14 @@ fn json_to_schema(json: &str) -> PyResult> { #[pyfunction] pub fn language_model_home() -> PyResult { let Some(p) = lance_index::scalar::inverted::language_model_home() else { - return Err(pyo3::exceptions::PyValueError::new_err(format!( - "Failed to get language model home" - ))); + return Err(pyo3::exceptions::PyValueError::new_err( + "Failed to get language model home", + )); }; let Some(pstr) = p.to_str() else { - return Err(pyo3::exceptions::PyValueError::new_err(format!( - "Failed to convert language model home to str" - 
)));
+        return Err(pyo3::exceptions::PyValueError::new_err(
+            "Failed to convert language model home to str",
+        ));
     };
     Ok(String::from(pstr))
 }
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index 252ece7e8f..7d34710286 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -195,7 +195,7 @@ pub fn language_model_home() -> Option<PathBuf> {
 #[cfg(feature = "tokenizer-common")]
 trait TokenizerBuilder: Sized {
     type Config: serde::de::DeserializeOwned + Default;
-    fn load(p: &PathBuf) -> Result<Self> {
+    fn load(p: &std::path::Path) -> Result<Self> {
         if !p.is_dir() {
             return Err(Error::io(
                 format!("{} is not a valid directory", p.display()),
@@ -214,7 +214,7 @@ trait TokenizerBuilder: Sized {
         Self::new(config, p)
     }
 
-    fn new(config: Self::Config, root: &PathBuf) -> Result<Self>;
+    fn new(config: Self::Config, root: &std::path::Path) -> Result<Self>;
 
     fn build(&self) -> Result<tantivy::tokenizer::TextAnalyzerBuilder>;
 }
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs
index 0c04dc236a..063cf0b4a9 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs
@@ -1,28 +1,19 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 use super::TokenizerBuilder;
 use lance_core::{Error, Result};
 use serde::{Deserialize, Serialize};
 use snafu::{location, Location};
 
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Default)]
 pub struct JiebaConfig {
     main: Option<String>,
     users: Option<Vec<String>>,
 }
 
-impl Default for JiebaConfig {
-    fn default() -> Self {
-        Self {
-            main: Default::default(),
-            users: Default::default(),
-        }
-    }
-}
-
 pub struct JiebaBuilder {
     root: PathBuf,
     config: JiebaConfig,
@@ -38,10 +38,10 @@ impl JiebaBuilder {
 impl TokenizerBuilder for JiebaBuilder {
     type Config = JiebaConfig;
 
-    fn new(config: Self::Config, root: &PathBuf) -> Result<Self> {
-        Ok(JiebaBuilder {
+    fn new(config: Self::Config, root: &Path) -> Result<Self> {
+        Ok(Self {
             config,
-            root: root.clone(),
+            root: root.to_path_buf(),
         })
     }
 
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs
index 5ce6a5ab36..23c8042dd0 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 use super::TokenizerBuilder;
 use lance_core::{Error, Result};
@@ -17,23 +17,13 @@ use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value};
 use snafu::{location, Location};
 
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Default)]
 pub struct LinderaConfig {
     main: Option<String>,
     user: Option<String>,
     user_kind: Option<String>,
 }
 
-impl Default for LinderaConfig {
-    fn default() -> Self {
-        Self {
-            main: Default::default(),
-            user: Default::default(),
-            user_kind: Default::default(),
-        }
-    }
-}
-
 pub struct LinderaBuilder {
     root: PathBuf,
     config: LinderaConfig,
@@ -73,10 +63,10 @@ impl LinderaBuilder {
 impl TokenizerBuilder for LinderaBuilder {
     type Config = LinderaConfig;
 
-    fn new(config: Self::Config, root: &PathBuf) -> Result<Self> {
-        Ok(LinderaBuilder {
+    fn new(config: Self::Config, root: &Path) -> Result<Self> {
+        Ok(Self {
             config,
-            root: 
root.clone(), + root: root.to_path_buf(), }) } From eeebd1650ad35ed45e552f095f712019a69176b1 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 18:15:51 +0800 Subject: [PATCH 22/22] format --- rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs index 063cf0b4a9..95445fb544 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -73,7 +73,7 @@ impl TokenizerBuilder for JiebaBuilder { ) })? } - let tokenizer = JiebaTokenizer { jieba: jieba }; + let tokenizer = JiebaTokenizer { jieba }; Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) } }
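Finally, the Korean model that the series title mentions can be exercised the same way. A closing sketch, assuming ko-dic has been downloaded and compiled via `python -m lance.download lindera -l ko-dic` and with an illustrative dataset path, based on the ko-dic test that appeared earlier in the series:

```python
import lance
import pyarrow as pa

data = pa.table({"text": ["하네다공항한정토트백", "나리타공항한정토트백"]})
ds = lance.write_dataset(data, "/tmp/kodic_demo", mode="overwrite")
ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ko-dic")

# ko-dic segments the compound, so the reading "나리타" (Narita) matches row 1.
tbl = ds.to_table(full_text_query="나리타", prefilter=True, with_row_id=True)
assert tbl["_rowid"].to_pylist() == [1]
```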