From 474c73b9d31ad5258117c8842c9b920875e50007 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 8 Dec 2024 13:54:14 +0800 Subject: [PATCH 01/22] feat: support japanese & korea tokenizer for fts --- Cargo.lock | 533 ++++++++++++++++-- Cargo.toml | 2 + python/Cargo.lock | 418 ++++++++++++++ python/Cargo.toml | 2 +- python/python/tests/test_scalar_index.py | 38 ++ rust/lance-index/Cargo.toml | 9 + .../src/scalar/inverted/tokenizer.rs | 26 + 7 files changed, 984 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f26e23854..42127f1390 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -148,9 +154,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.86" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" [[package]] name = "approx" @@ -397,7 +403,7 @@ dependencies = [ "memchr", "num", "regex", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", ] [[package]] @@ -951,7 +957,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -978,6 +984,15 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bit-set" version = "0.5.3" @@ -1530,9 +1545,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -1549,6 +1564,41 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.89", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.89", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -1850,7 +1900,7 @@ dependencies = [ "itertools 0.13.0", "log", "paste", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", ] [[package]] @@ -2022,6 +2072,37 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.89", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.89", +] + [[package]] name = "diff" version = "0.1.13" @@ -2090,6 +2171,88 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "env_filter" version = "0.1.2" @@ -2261,12 +2424,12 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.35" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.8.0", ] [[package]] @@ -2275,6 +2438,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2761,6 +2939,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.4.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.7" @@ -2813,6 +3007,12 @@ dependencies = [ "cc", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.5.0" @@ -2991,6 +3191,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "kv-log-macro" version = "1.0.7" @@ -3338,6 +3547,8 @@ dependencies = [ "lance-table", "lance-testing", "lazy_static", + "lindera", + "lindera-tantivy", "log", "moka", "num-traits", @@ -3637,6 +3848,137 @@ dependencies = [ "libc", ] +[[package]] +name = "lindera" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fff887f4b98539fb5f879ede50e17eb7eaafa5622c252cffe8280f42cafc6b7d" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-cc-cedict", + "lindera-dictionary", + "lindera-ipadic", + "lindera-ipadic-neologd", + "lindera-ko-dic", + "lindera-unidic", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum", + "strum_macros", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-cc-cedict" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-dictionary" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec716483ceb95aa84ac262cb766eef314b24257c343ca230daa71f856a278fe4" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + 
"flate2", + "glob", + "log", + "once_cell", + "reqwest", + "serde", + "tar", + "thiserror 2.0.3", + "yada", +] + +[[package]] +name = "lindera-ipadic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ipadic-neologd" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-tantivy" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261c87882a909fd17db4dd797e4dc2aac3992bdbbb4e2900d1362a1e0746266f" +dependencies = [ + "lindera", + "tantivy", + "tantivy-tokenizer-api", +] + +[[package]] +name = "lindera-unidic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = "linux-raw-sys" version = "0.3.8" @@ -3768,6 +4110,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.1" @@ -3851,6 +4202,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nix" version = "0.26.4" @@ -4070,12 +4438,50 @@ version = "11.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" +[[package]] +name = "openssl" +version = "0.10.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4525,7 +4931,7 @@ dependencies = [ "rand", "rand_chacha", "rand_xorshift", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", "rusty-fork", "tempfile", "unarray", @@ -4878,14 +5284,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.6" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.7", - "regex-syntax 0.8.4", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", ] [[package]] @@ -4899,13 +5305,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", ] [[package]] @@ -4922,9 +5328,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "regress" @@ -4954,12 +5360,13 @@ checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" [[package]] name = "reqwest" -version = "0.12.7" +version = "0.12.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" +checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", "h2 0.4.6", @@ -4968,24 +5375,28 @@ dependencies = [ "http-body-util", "hyper 1.4.1", "hyper-rustls 0.27.3", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", "quinn", "rustls 0.23.12", - "rustls-native-certs 0.7.3", + "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", "sync_wrapper", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.0", "tokio-util", "tower-service", @@ -5164,19 +5575,6 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-native-certs" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" -dependencies = [ - "openssl-probe", - "rustls-pemfile 2.1.3", - "rustls-pki-types", - "schannel", - "security-framework", -] - [[package]] name = "rustls-native-certs" version = "0.8.0" @@ -5805,6 +6203,27 @@ dependencies = [ "futures-core", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -5907,7 +6326,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" dependencies = [ "byteorder", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", "utf8-ranges", ] @@ -5960,9 +6379,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.41" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" +checksum = "c65998313f8e17d0d553d28f91a0df93e4dbbbf770279c7bc21ca0f09ea1a1f6" dependencies = [ "filetime", "libc", @@ -6176,9 +6595,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", @@ -6202,6 +6621,16 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -6506,6 +6935,12 @@ version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.14" @@ -6514,9 +6949,9 @@ checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -6612,6 +7047,12 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -7090,6 +7531,12 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + [[package]] name = "yansi" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index 84c183579c..d0ae9e3f19 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -143,6 +143,8 @@ serde_json = { version = "1" } shellexpand = "3.0" snafu = "0.7.5" tantivy = { version = "0.22.0", features = ["stopwords"] } +lindera = { version = "0.38.1"} +lindera-tantivy = { version = "0.38.1"} tempfile = "3" test-log = { version = "0.2.15" } tokio = { version = "1.23", features = [ diff --git a/python/Cargo.lock b/python/Cargo.lock index fcd28fd2fd..7a47c2c6cf 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -880,6 +880,15 @@ dependencies = [ "vsimd", ] +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1283,6 +1292,41 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.90", +] + +[[package]] +name = "darling_macro" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.90", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -1747,6 +1791,37 @@ dependencies = [ "serde", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.90", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.90", +] + [[package]] name = "digest" version = "0.10.7" @@ -1814,6 +1889,88 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encoding" +version = "0.2.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" +dependencies = [ + "encoding-index-japanese", + "encoding-index-korean", + "encoding-index-simpchinese", + "encoding-index-singlebyte", + "encoding-index-tradchinese", +] + +[[package]] +name = "encoding-index-japanese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-korean" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-simpchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-singlebyte" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding-index-tradchinese" +version = "1.20141219.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" +dependencies = [ + "encoding_index_tests", +] + +[[package]] +name = "encoding_index_tests" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -1943,6 +2100,21 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2411,6 +2583,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.5.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.10" @@ -2580,6 +2768,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -2719,6 +2913,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "kv-log-macro" version = "1.0.7" @@ -2988,6 +3191,8 @@ dependencies = [ "lance-linalg", "lance-table", "lazy_static", + "lindera", + "lindera-tantivy", "log", "moka", "num-traits", @@ -3207,6 +3412,95 @@ dependencies = [ "redox_syscall", ] +[[package]] +name = "lindera" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fff887f4b98539fb5f879ede50e17eb7eaafa5622c252cffe8280f42cafc6b7d" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "kanaria", + "lindera-dictionary", + "lindera-ipadic", + "lindera-ko-dic", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_yaml", + "strum", + "strum_macros", + "unicode-blocks", + "unicode-normalization", + "unicode-segmentation", + "yada", +] + +[[package]] +name = "lindera-dictionary" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec716483ceb95aa84ac262cb766eef314b24257c343ca230daa71f856a278fe4" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder", + "encoding", + "encoding_rs", + "encoding_rs_io", + "flate2", + "glob", + "log", + "once_cell", + "reqwest", + "serde", + "tar", + "thiserror 2.0.4", + "yada", +] + +[[package]] +name = "lindera-ipadic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-ko-dic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + +[[package]] +name = "lindera-tantivy" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261c87882a909fd17db4dd797e4dc2aac3992bdbbb4e2900d1362a1e0746266f" +dependencies = [ + "lindera", + "tantivy", + "tantivy-tokenizer-api", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" @@ -3394,6 +3688,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "noisy_float" version = "0.2.0" @@ -3586,12 +3897,50 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e296cf87e61c9cfc1a61c3c63a0f7f286ed4554e0e22be84e8a38e1d264a2a29" +[[package]] +name = "openssl" +version = "0.10.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +dependencies = [ + "bitflags 2.6.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "openssl-sys" +version = "0.9.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4446,6 +4795,7 @@ checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", + "encoding_rs", "futures-core", "futures-util", "h2 0.4.7", @@ -4454,11 +4804,13 @@ dependencies = [ "http-body-util", "hyper 1.5.1", "hyper-rustls 0.27.3", + "hyper-tls", "hyper-util", "ipnet", "js-sys", "log", "mime", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -4471,7 +4823,9 @@ dependencies = [ "serde_json", "serde_urlencoded", "sync_wrapper", + "system-configuration", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.0", "tokio-util", "tower-service", @@ -5042,6 +5396,12 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.3" @@ -5136,6 +5496,27 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -5511,6 +5892,16 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -5705,12 +6096,27 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "unicode-blocks" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" + [[package]] name = "unicode-ident" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + [[package]] name = 
"unicode-segmentation" version = "1.12.0" @@ -5814,6 +6220,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ef4c4aa54d5d05a279399bfa921ec387b7aba77caf7a682ae8d86785b8fdad2" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -6238,6 +6650,12 @@ dependencies = [ "lzma-sys", ] +[[package]] +name = "yada" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" + [[package]] name = "yoke" version = "0.7.5" diff --git a/python/Cargo.toml b/python/Cargo.toml index e9e9f867c4..7f0e349bbf 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index" } +lance-index = { path = "../rust/lance-index", features = ["lindera-tantivy", "lindera-tantivy-ko-dic", "lindera-tantivy-ipadic"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 3777c90d48..52ad548cb8 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -307,6 +307,44 @@ def test_fts_all_deleted(dataset): dataset.to_table(full_text_query=first_row_doc) +def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path): + data = pa.table( + { + "text": [ + "成田国際空港", + "東京国際空港", + "羽田空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera-ipadic") + + results = ds.to_table( + full_text_query="成田", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [0] + + +def test_indexed_filter_with_fts_index_with_lindera_ko_tokenizer(tmp_path): + data = pa.table( + { + "text": ["하네다공항한정토트백", "나리타공항한정토트백"], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera-ko-dic") + + results = ds.to_table( + full_text_query="나리타", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [1] + + def test_bitmap_index(tmp_path: Path): """Test create bitmap index""" tbl = pa.Table.from_arrays( diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 12d38e5678..684731ab0a 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -50,6 +50,8 @@ serde_json.workspace = true serde.workspace = true snafu.workspace = true tantivy.workspace = true +lindera = { workspace = true, optional = true } +lindera-tantivy = { workspace = true, optional = true } tokio.workspace = true tracing.workspace = true tempfile.workspace = true @@ -68,6 +70,13 @@ test-log.workspace = true datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } +[features] +lindera-tantivy-ipadic = ["lindera", "lindera-tantivy", "lindera-tantivy/ipadic"] +lindera-tantivy-ipadic-neologd = ["lindera", "lindera-tantivy", "lindera-tantivy/ipadic-neologd"] 
+lindera-tantivy-unidic = ["lindera", "lindera-tantivy", "lindera-tantivy/unidic"] +lindera-tantivy-ko-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/ko-dic"] +lindera-tantivy-cc-cedict = ["lindera", "lindera-tantivy", "lindera-tantivy/cc-cedict"] + [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 440def7a5a..0f7e0f1a9f 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -12,6 +12,8 @@ pub struct TokenizerConfig { /// - `simple`: splits tokens on whitespace and punctuation /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization + /// - `lindera-tantivy-ipadic`: Japanese tokenizer + /// - `lindera-tantivy-ko-dic`: Korea tokenizer /// /// `simple` is recommended for most cases and the default value base_tokenizer: String, @@ -141,9 +143,33 @@ fn build_base_tokenizer_builder(name: &str) -> Result build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADIC), + #[cfg(feature = "lindera-tantivy-ipadic-neologd")] + "lindera-ipadic-neologd" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADICNEologd), + #[cfg(feature = "lindera-tantivy-unidic")] + "lindera-unidic" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::UniDic), + #[cfg(feature = "lindera-tantivy-ko-dic")] + "lindera-ko-dic" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::KoDic), + #[cfg(feature = "lindera-tantivy-cc-cedict")] + "lindera-cc-cedict" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::CcCedict), _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), location!(), )), } } + +#[cfg(feature = "lindera-tantivy")] +fn build_lindera_tokenizer_builder(dic: lindera::dictionary::DictionaryKind) -> Result { + use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter}; + use lindera_tantivy::tokenizer::LinderaTokenizer; + let mode = Mode::Normal; + let dictionary = load_dictionary_from_kind(dic).unwrap(); + let user_dictionary = None; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder( + tokenizer, + ).dynamic()) +} From f36311ce9797a42b9a1be125564c8ecbd1b2883a Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 8 Dec 2024 19:43:11 +0800 Subject: [PATCH 02/22] lindera tmp --- python/Cargo.lock | 42 +++++++++++++++++++ python/Cargo.toml | 2 +- rust/lance-index/Cargo.toml | 10 +++-- .../src/scalar/inverted/tokenizer.rs | 40 ++++++++++++++---- 4 files changed, 80 insertions(+), 14 deletions(-) diff --git a/python/Cargo.lock b/python/Cargo.lock index 7a47c2c6cf..591dd89c11 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3423,9 +3423,12 @@ dependencies = [ "byteorder", "csv", "kanaria", + "lindera-cc-cedict", "lindera-dictionary", "lindera-ipadic", + "lindera-ipadic-neologd", "lindera-ko-dic", + "lindera-unidic", "once_cell", "regex", "serde", @@ -3439,6 +3442,19 @@ dependencies = [ "yada", ] +[[package]] +name = "lindera-cc-cedict" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = 
"lindera-dictionary" version = "0.38.1" @@ -3477,6 +3493,19 @@ dependencies = [ "tokio", ] +[[package]] +name = "lindera-ipadic-neologd" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = "lindera-ko-dic" version = "0.38.1" @@ -3501,6 +3530,19 @@ dependencies = [ "tantivy-tokenizer-api", ] +[[package]] +name = "lindera-unidic" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" +dependencies = [ + "bincode", + "byteorder", + "lindera-dictionary", + "once_cell", + "tokio", +] + [[package]] name = "linux-raw-sys" version = "0.4.14" diff --git a/python/Cargo.toml b/python/Cargo.toml index 7f0e349bbf..a3e3b701b2 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index", features = ["lindera-tantivy", "lindera-tantivy-ko-dic", "lindera-tantivy-ipadic"] } +lance-index = { path = "../rust/lance-index", features = ["lindera-tantivy-custom"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 684731ab0a..c0344acf7c 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -71,11 +71,13 @@ datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } [features] -lindera-tantivy-ipadic = ["lindera", "lindera-tantivy", "lindera-tantivy/ipadic"] -lindera-tantivy-ipadic-neologd = ["lindera", "lindera-tantivy", "lindera-tantivy/ipadic-neologd"] -lindera-tantivy-unidic = ["lindera", "lindera-tantivy", "lindera-tantivy/unidic"] +lindera-tantivy-custom = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] +lindera-tantivy-builtin-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] +lindera-tantivy-ipadic = ["lindera-tantivy-builtin-dic", "lindera-tantivy/ipadic"] +lindera-tantivy-ipadic-neologd = ["lindera-tantivy-builtin-dic", "lindera-tantivy/ipadic-neologd"] +lindera-tantivy-unidic = ["lindera-tantivy-builtin-dic", "lindera-tantivy/unidic"] lindera-tantivy-ko-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/ko-dic"] -lindera-tantivy-cc-cedict = ["lindera", "lindera-tantivy", "lindera-tantivy/cc-cedict"] +lindera-tantivy-cc-cedict = ["lindera-tantivy-builtin-dic", "lindera-tantivy/cc-cedict"] [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 0f7e0f1a9f..ce5adac1b6 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -12,8 +12,8 @@ pub struct TokenizerConfig { /// - `simple`: splits tokens on whitespace and punctuation /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization - /// - `lindera-tantivy-ipadic`: Japanese tokenizer - /// - `lindera-tantivy-ko-dic`: Korea tokenizer + /// - `lindera-ipadic`: Japanese tokenizer + /// - `lindera-ko-dic`: Korea tokenizer 
/// /// `simple` is recommended for most cases and the default value base_tokenizer: String, @@ -144,15 +144,19 @@ fn build_base_tokenizer_builder(name: &str) -> Result build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADIC), + "lindera-ipadic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADIC), #[cfg(feature = "lindera-tantivy-ipadic-neologd")] - "lindera-ipadic-neologd" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADICNEologd), + "lindera-ipadic-neologd" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADICNEologd), #[cfg(feature = "lindera-tantivy-unidic")] - "lindera-unidic" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::UniDic), + "lindera-unidic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::UniDic), #[cfg(feature = "lindera-tantivy-ko-dic")] - "lindera-ko-dic" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::KoDic), + "lindera-ko-dic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::KoDic), #[cfg(feature = "lindera-tantivy-cc-cedict")] - "lindera-cc-cedict" => build_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::CcCedict), + "lindera-cc-cedict" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::CcCedict), + #[cfg(feature = "lindera-tantivy-custom")] + s if s.starts_with("lindera-") => { + return build_custom_lindera_tokenizer_builder(s); + } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), location!(), @@ -160,8 +164,10 @@ fn build_base_tokenizer_builder(name: &str) -> Result Result { +#[cfg(feature = "lindera-tantivy-builtin-dic")] +fn build_builtin_lindera_tokenizer_builder( + dic: lindera::dictionary::DictionaryKind +) -> Result { use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter}; use lindera_tantivy::tokenizer::LinderaTokenizer; let mode = Mode::Normal; @@ -173,3 +179,19 @@ fn build_lindera_tokenizer_builder(dic: lindera::dictionary::DictionaryKind) -> tokenizer, ).dynamic()) } + +#[cfg(feature = "lindera-tantivy-custom")] +fn build_custom_lindera_tokenizer_builder(dic: &str) -> Result { + use lindera::{dictionary::load_dictionary_from_path, mode::Mode, segmenter::Segmenter}; + use lindera_tantivy::tokenizer::LinderaTokenizer; + let dic = std::path::Path::new(dic); + let mode = Mode::Normal; + let dictionary = load_dictionary_from_path(dic).unwrap(); + let user_dictionary = None; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder( + tokenizer, + ).dynamic()) +} + From 0c2f5903f1cc0824895eadc12dcc1a3d6b9a9a9b Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sun, 8 Dec 2024 22:54:14 +0800 Subject: [PATCH 03/22] update tokenizer --- Cargo.lock | 1 + Cargo.toml | 1 + python/Cargo.lock | 1 + python/Cargo.toml | 2 +- rust/lance-core/Cargo.toml | 1 + rust/lance-core/src/lib.rs | 11 ++ rust/lance-index/Cargo.toml | 8 +- .../src/scalar/inverted/tokenizer.rs | 101 +++++++++++------- 8 files changed, 77 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 42127f1390..72afc01781 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3320,6 +3320,7 @@ dependencies = [ "datafusion-common", "datafusion-sql", "deepsize", + "dirs", "futures", "lance-arrow", "lance-testing", diff --git a/Cargo.toml b/Cargo.toml index 
d0ae9e3f19..73785e6c93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,6 +110,7 @@ datafusion-physical-expr = { version = "42.0", features = [ "regex_expressions", ] } deepsize = "0.2.0" +dirs = "5.0.0" either = "1.0" fsst = { version = "=0.21.0", path = "./rust/lance-encoding/src/compression_algo/fsst" } futures = "0.3" diff --git a/python/Cargo.lock b/python/Cargo.lock index 591dd89c11..0a0f99cccc 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3024,6 +3024,7 @@ dependencies = [ "datafusion-common", "datafusion-sql", "deepsize", + "dirs", "futures", "lance-arrow", "lazy_static", diff --git a/python/Cargo.toml b/python/Cargo.toml index a3e3b701b2..0a1bea95e4 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index", features = ["lindera-tantivy-custom"] } +lance-index = { path = "../rust/lance-index", features = ["tokenizer-lindera"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/rust/lance-core/Cargo.toml b/rust/lance-core/Cargo.toml index fe4e9a1331..9175a3657d 100644 --- a/rust/lance-core/Cargo.toml +++ b/rust/lance-core/Cargo.toml @@ -23,6 +23,7 @@ chrono.workspace = true datafusion-common = { workspace = true, optional = true } datafusion-sql = { workspace = true, optional = true } deepsize.workspace = true +dirs.workspace = true futures.workspace = true lazy_static.workspace = true mock_instant.workspace = true diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 9ab1854076..4d52608e06 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::{env, path::PathBuf}; + use arrow_schema::{DataType, Field as ArrowField}; pub mod cache; @@ -16,6 +18,9 @@ pub const ROW_ID: &str = "_rowid"; /// Column name for the meta row address. pub const ROW_ADDR: &str = "_rowaddr"; +pub const LANCE_HOME_ENV_KEY: &str = "LANCE_HOME"; +pub const LANCE_HOME_DEFAULT_DIRECTORY: &str = "lance"; + lazy_static::lazy_static! { /// Row ID field. This is nullable because its validity bitmap is sometimes used /// as a selection vector. @@ -23,4 +28,10 @@ lazy_static::lazy_static! { /// Row address field. This is nullable because its validity bitmap is sometimes used /// as a selection vector. pub static ref ROW_ADDR_FIELD: ArrowField = ArrowField::new(ROW_ADDR, DataType::UInt64, true); + + /// default directory that stores lance related files, e.g. tokenizer model. 
+ pub static ref LANCE_HOME: Option = match env::var(LANCE_HOME_ENV_KEY) { + Ok(p) => Some(PathBuf::from(p)), + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_HOME_DEFAULT_DIRECTORY)) + }; } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index c0344acf7c..78aa2bbdf3 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -71,13 +71,7 @@ datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } [features] -lindera-tantivy-custom = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] -lindera-tantivy-builtin-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] -lindera-tantivy-ipadic = ["lindera-tantivy-builtin-dic", "lindera-tantivy/ipadic"] -lindera-tantivy-ipadic-neologd = ["lindera-tantivy-builtin-dic", "lindera-tantivy/ipadic-neologd"] -lindera-tantivy-unidic = ["lindera-tantivy-builtin-dic", "lindera-tantivy/unidic"] -lindera-tantivy-ko-dic = ["lindera", "lindera-tantivy", "lindera-tantivy/ko-dic"] -lindera-tantivy-cc-cedict = ["lindera-tantivy-builtin-dic", "lindera-tantivy/cc-cedict"] +tokenizer-lindera = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index ce5adac1b6..b860061322 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -1,7 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use lance_core::{Error, Result}; +use std::path::PathBuf; + +use lance_core::{Error, Result, LANCE_HOME}; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; @@ -143,19 +145,9 @@ fn build_base_tokenizer_builder(name: &str) -> Result build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADIC), - #[cfg(feature = "lindera-tantivy-ipadic-neologd")] - "lindera-ipadic-neologd" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::IPADICNEologd), - #[cfg(feature = "lindera-tantivy-unidic")] - "lindera-unidic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::UniDic), - #[cfg(feature = "lindera-tantivy-ko-dic")] - "lindera-ko-dic" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::KoDic), - #[cfg(feature = "lindera-tantivy-cc-cedict")] - "lindera-cc-cedict" => build_builtin_lindera_tokenizer_builder(lindera::dictionary::DictionaryKind::CcCedict), - #[cfg(feature = "lindera-tantivy-custom")] - s if s.starts_with("lindera-") => { - return build_custom_lindera_tokenizer_builder(s); + #[cfg(feature = "tokenizer-lindera")] + s if s.starts_with("lindera/") => { + return build_lindera_tokenizer_builder(s); } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), @@ -164,34 +156,61 @@ fn build_base_tokenizer_builder(name: &str) -> Result Result { - use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter}; - use lindera_tantivy::tokenizer::LinderaTokenizer; - let mode = Mode::Normal; - let dictionary = load_dictionary_from_kind(dic).unwrap(); - let user_dictionary = None; - let segmenter = Segmenter::new(mode, dictionary, user_dictionary); - let tokenizer = LinderaTokenizer::from_segmenter(segmenter); - Ok(tantivy::tokenizer::TextAnalyzer::builder( - tokenizer, - ).dynamic()) +lazy_static::lazy_static! 
{ + pub static ref LANCE_TOKENIZER_HOME: Option = LANCE_HOME.as_ref().map(|p| p.join("tokenizers")); } -#[cfg(feature = "lindera-tantivy-custom")] -fn build_custom_lindera_tokenizer_builder(dic: &str) -> Result { - use lindera::{dictionary::load_dictionary_from_path, mode::Mode, segmenter::Segmenter}; +#[cfg(feature = "tokenizer-lindera")] +fn build_lindera_tokenizer_builder(dic: &str) -> Result { + use std::{fs::File, io::BufReader}; + + use lindera::{ + dictionary::{ + load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, + }, + mode::Mode, + segmenter::Segmenter, + }; use lindera_tantivy::tokenizer::LinderaTokenizer; - let dic = std::path::Path::new(dic); - let mode = Mode::Normal; - let dictionary = load_dictionary_from_path(dic).unwrap(); - let user_dictionary = None; - let segmenter = Segmenter::new(mode, dictionary, user_dictionary); - let tokenizer = LinderaTokenizer::from_segmenter(segmenter); - Ok(tantivy::tokenizer::TextAnalyzer::builder( - tokenizer, - ).dynamic()) + use serde_json::from_reader; + + match LANCE_TOKENIZER_HOME.as_ref() { + Some(p) => { + let dic_dir = p.join(dic); + let main_dir = dic_dir.join("main"); + let user_config_path = dic_dir.join("user_config.json"); + let user_dictionary = if user_config_path.exists() { + let file = File::open(user_config_path)?; + let reader = BufReader::new(file); + let user_dictionary_config: UserDictionaryConfig = from_reader(reader)?; + Some( + load_user_dictionary_from_config(&user_dictionary_config).map_err(|e| { + Error::io( + format!("load lindera tokenizer user dictionary err: {e}"), + location!(), + ) + })?, + ) + } else { + None + }; + let mode = Mode::Normal; + let dictionary = load_dictionary_from_path(main_dir.as_path()).map_err(|e| { + Error::io( + format!("load lindera tokenizer main dictionary err: {e}"), + location!(), + ) + })?; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + } + None => Err(Error::invalid_input( + format!( + "{} is undefined", + String::from(lance_core::LANCE_HOME_ENV_KEY) + ), + location!(), + )), + } } - From 2428567d3d2d344e92f0a25401c2b5f5c2651844 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Tue, 10 Dec 2024 20:26:26 +0800 Subject: [PATCH 04/22] lindera support --- Cargo.lock | 2 +- python/Cargo.lock | 2 +- python/python/tests/test_scalar_index.py | 4 +- rust/lance-core/Cargo.toml | 1 - rust/lance-core/src/lib.rs | 10 --- rust/lance-index/Cargo.toml | 1 + .../src/scalar/inverted/tokenizer.rs | 82 +++++++++++++------ 7 files changed, 64 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 72afc01781..ff90a5cfcf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3320,7 +3320,6 @@ dependencies = [ "datafusion-common", "datafusion-sql", "deepsize", - "dirs", "futures", "lance-arrow", "lance-testing", @@ -3534,6 +3533,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "deepsize", + "dirs", "futures", "half", "itertools 0.13.0", diff --git a/python/Cargo.lock b/python/Cargo.lock index 0a0f99cccc..6e470e5e1f 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3024,7 +3024,6 @@ dependencies = [ "datafusion-common", "datafusion-sql", "deepsize", - "dirs", "futures", "lance-arrow", "lazy_static", @@ -3180,6 +3179,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "deepsize", + "dirs", "futures", "half", "itertools 0.13.0", diff --git 
a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 52ad548cb8..76de9b22a0 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -318,7 +318,7 @@ def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera-ipadic") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic") results = ds.to_table( full_text_query="成田", @@ -335,7 +335,7 @@ def test_indexed_filter_with_fts_index_with_lindera_ko_tokenizer(tmp_path): } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera-ko-dic") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ko-dic") results = ds.to_table( full_text_query="나리타", diff --git a/rust/lance-core/Cargo.toml b/rust/lance-core/Cargo.toml index 9175a3657d..fe4e9a1331 100644 --- a/rust/lance-core/Cargo.toml +++ b/rust/lance-core/Cargo.toml @@ -23,7 +23,6 @@ chrono.workspace = true datafusion-common = { workspace = true, optional = true } datafusion-sql = { workspace = true, optional = true } deepsize.workspace = true -dirs.workspace = true futures.workspace = true lazy_static.workspace = true mock_instant.workspace = true diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 4d52608e06..91a894b355 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -1,8 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{env, path::PathBuf}; - use arrow_schema::{DataType, Field as ArrowField}; pub mod cache; @@ -18,9 +16,6 @@ pub const ROW_ID: &str = "_rowid"; /// Column name for the meta row address. pub const ROW_ADDR: &str = "_rowaddr"; -pub const LANCE_HOME_ENV_KEY: &str = "LANCE_HOME"; -pub const LANCE_HOME_DEFAULT_DIRECTORY: &str = "lance"; - lazy_static::lazy_static! { /// Row ID field. This is nullable because its validity bitmap is sometimes used /// as a selection vector. @@ -29,9 +24,4 @@ lazy_static::lazy_static! { /// as a selection vector. pub static ref ROW_ADDR_FIELD: ArrowField = ArrowField::new(ROW_ADDR, DataType::UInt64, true); - /// default directory that stores lance related files, e.g. tokenizer model. 
- pub static ref LANCE_HOME: Option = match env::var(LANCE_HOME_ENV_KEY) { - Ok(p) => Some(PathBuf::from(p)), - Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_HOME_DEFAULT_DIRECTORY)) - }; } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 78aa2bbdf3..c98388eeb9 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -26,6 +26,7 @@ datafusion-physical-expr.workspace = true datafusion-sql.workspace = true datafusion.workspace = true deepsize.workspace = true +dirs.workspace = true futures.workspace = true half.workspace = true itertools.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index b860061322..d1ac009884 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -1,9 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::path::PathBuf; +use std::{env, path::PathBuf}; -use lance_core::{Error, Result, LANCE_HOME}; +use lance_core::{Error, Result}; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; @@ -156,8 +156,24 @@ fn build_base_tokenizer_builder(name: &str) -> Result = LANCE_HOME.as_ref().map(|p| p.join("tokenizers")); + /// default directory that stores lance tokenizer related files, e.g. tokenizer model. + pub static ref LANCE_TOKENIZER_HOME: Option = match env::var(LANCE_TOKENIZERS_HOME_ENV_KEY) { + Ok(p) => Some(PathBuf::from(p)), + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_HOME_DEFAULT_DIRECTORY)) + }; +} + +#[cfg(feature = "tokenizer-lindera")] +#[derive(Serialize, Deserialize)] +struct LinderaConfig{ + main: String, + user: Option, + user_kind: Option } #[cfg(feature = "tokenizer-lindera")] @@ -172,35 +188,55 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result { let dic_dir = p.join(dic); - let main_dir = dic_dir.join("main"); - let user_config_path = dic_dir.join("user_config.json"); - let user_dictionary = if user_config_path.exists() { - let file = File::open(user_config_path)?; - let reader = BufReader::new(file); - let user_dictionary_config: UserDictionaryConfig = from_reader(reader)?; - Some( - load_user_dictionary_from_config(&user_dictionary_config).map_err(|e| { + let config_path = dic_dir.join("config.json"); + let file = File::open(config_path)?; + let reader = BufReader::new(file); + let config: LinderaConfig = serde_json::from_reader(reader)?; + let main_path = dic_dir.join(config.main); + let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { + Error::io( + format!("load lindera tokenizer main dictionary err: {e}"), + location!(), + ) + })?; + let user_dictionary = match config.user { + Some(user) => { + let mut conf = serde_json::Map::::new(); + let user_path = dic_dir.join(user); + match user_path.to_str() { + Some(p) => { + conf.insert(String::from("path"), Value::String(String::from(p))); + Ok(()) + }, + None => { + let p = user_path.display(); + Err(Error::io( + format!("invalid lindera tokenizer user dictionary path: {p}"), + location!(), + )) + } + }?; + if let Some(kind) = config.user_kind { + conf.insert(String::from("kind"), Value::String(kind)); + } + let user_dictionary_config: UserDictionaryConfig = Value::Object(conf); + let user_dictionary = load_user_dictionary_from_config(&user_dictionary_config).map_err(|e| { Error::io( format!("load lindera tokenizer user dictionary err: {e}"), location!(), ) - })?, - ) - } else { - None + 
})?; + Some(user_dictionary) + }, + None => None + }; let mode = Mode::Normal; - let dictionary = load_dictionary_from_path(main_dir.as_path()).map_err(|e| { - Error::io( - format!("load lindera tokenizer main dictionary err: {e}"), - location!(), - ) - })?; let segmenter = Segmenter::new(mode, dictionary, user_dictionary); let tokenizer = LinderaTokenizer::from_segmenter(segmenter); Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) @@ -208,7 +244,7 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result Err(Error::invalid_input( format!( "{} is undefined", - String::from(lance_core::LANCE_HOME_ENV_KEY) + String::from(LANCE_TOKENIZERS_HOME_ENV_KEY) ), location!(), )), From f1b91465b8c3f5a091c55b8d3c9c85e3d9461501 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Tue, 10 Dec 2024 20:43:45 +0800 Subject: [PATCH 05/22] format --- rust/lance-core/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/lance-core/src/lib.rs b/rust/lance-core/src/lib.rs index 91a894b355..9ab1854076 100644 --- a/rust/lance-core/src/lib.rs +++ b/rust/lance-core/src/lib.rs @@ -23,5 +23,4 @@ lazy_static::lazy_static! { /// Row address field. This is nullable because its validity bitmap is sometimes used /// as a selection vector. pub static ref ROW_ADDR_FIELD: ArrowField = ArrowField::new(ROW_ADDR, DataType::UInt64, true); - } From b8c778ef15cac7a78d19a80f6d548cc3591e4ede Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Tue, 10 Dec 2024 20:49:56 +0800 Subject: [PATCH 06/22] update deps --- Cargo.lock | 70 ------------------------------------- python/Cargo.lock | 70 ------------------------------------- rust/lance-index/Cargo.toml | 2 +- 3 files changed, 1 insertion(+), 141 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ff90a5cfcf..372e9966bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3860,12 +3860,7 @@ dependencies = [ "byteorder", "csv", "kanaria", - "lindera-cc-cedict", "lindera-dictionary", - "lindera-ipadic", - "lindera-ipadic-neologd", - "lindera-ko-dic", - "lindera-unidic", "once_cell", "regex", "serde", @@ -3879,19 +3874,6 @@ dependencies = [ "yada", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "lindera-dictionary" version = "0.38.1" @@ -3917,45 +3899,6 @@ dependencies = [ "yada", ] -[[package]] -name = "lindera-ipadic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - -[[package]] -name = "lindera-ipadic-neologd" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "lindera-tantivy" version = "0.38.1" @@ -3967,19 +3910,6 @@ dependencies = [ 
"tantivy-tokenizer-api", ] -[[package]] -name = "lindera-unidic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "linux-raw-sys" version = "0.3.8" diff --git a/python/Cargo.lock b/python/Cargo.lock index 6e470e5e1f..8fa894d767 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3424,12 +3424,7 @@ dependencies = [ "byteorder", "csv", "kanaria", - "lindera-cc-cedict", "lindera-dictionary", - "lindera-ipadic", - "lindera-ipadic-neologd", - "lindera-ko-dic", - "lindera-unidic", "once_cell", "regex", "serde", @@ -3443,19 +3438,6 @@ dependencies = [ "yada", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a891e53b4fac346b314dcd60e4337b6deec7d972f338c004cfb9e0fe9868893" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "lindera-dictionary" version = "0.38.1" @@ -3481,45 +3463,6 @@ dependencies = [ "yada", ] -[[package]] -name = "lindera-ipadic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d2b997fa2aeee8adccea2e4cb0ade771132f6e32093ed0beda8409e9a44018" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - -[[package]] -name = "lindera-ipadic-neologd" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084c4e7b63500b235af2c7cbf8ee23735ae452971ac29bbd9a7f55a10eae50c4" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d921a5c39a634316125fc1572f00bc78f4351baeacaff2cf39953b2fc8493a55" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "lindera-tantivy" version = "0.38.1" @@ -3531,19 +3474,6 @@ dependencies = [ "tantivy-tokenizer-api", ] -[[package]] -name = "lindera-unidic" -version = "0.38.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b95e25975448fa10f2bec61bfd012d8b0a1740d92ac7fc43e725edb1568ff7e" -dependencies = [ - "bincode", - "byteorder", - "lindera-dictionary", - "once_cell", - "tokio", -] - [[package]] name = "linux-raw-sys" version = "0.4.14" diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index c98388eeb9..1cff60a8b6 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -72,7 +72,7 @@ datafusion-sql.workspace = true random_word = { version = "0.4.3", features = ["en"] } [features] -tokenizer-lindera = ["lindera", "lindera-tantivy", "lindera-tantivy/compress"] +tokenizer-lindera = ["lindera", "lindera-tantivy"] [build-dependencies] prost-build.workspace = true From 83cd1dd1030c718b123b3f606d2a0c1616c70f53 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Wed, 25 Dec 2024 23:11:13 +0800 Subject: [PATCH 07/22] lm download script --- python/python/lance/lance/__init__.pyi | 2 + python/python/lance/lm.py | 88 +++++++++++++++++++ python/src/lib.rs | 4 + .../src/scalar/inverted/tokenizer.rs | 12 +-- 4 files changed, 100 insertions(+), 6 deletions(-) create mode 100644 python/python/lance/lm.py diff --git 
a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 97d2cb602d..3b4278bcff 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -16,6 +16,8 @@ from typing import Dict, List, Optional import pyarrow as pa +LANGUAGE_MODEL_HOME: Optional[str] + def infer_tfrecord_schema( uri: str, tensor_features: Optional[List[str]] = None, diff --git a/python/python/lance/lm.py b/python/python/lance/lm.py new file mode 100644 index 0000000000..d59330bb48 --- /dev/null +++ b/python/python/lance/lm.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +from io import BytesIO +import os +import shutil +import subprocess +import tarfile +import traceback +from .lance import LANGUAGE_MODEL_HOME + +if LANGUAGE_MODEL_HOME is None: + raise Exception("LANCE_LANGUAGE_MODEL_HOME is not configured") + +def check_lindera(): + if not shutil.which("lindera"): + raise Exception("lindera is not installed. Please install it by following https://github.com/lindera/lindera/tree/main/lindera-cli") + +def check_requests(): + try: + import requests + except: + raise Exception("requests is not installed, Please pip install requests") + +def download_jieba(): + dirname = os.path.join(LANGUAGE_MODEL_HOME, "jieba", "default") + os.makedirs(dirname, exist_ok=True) + try: + check_requests() + import requests + resp = requests.get("https://api.github.com/repos/messense/jieba-rs/releases/latest") + content = requests.get(resp.json()["tarball_url"]).content + with tarfile.open(fileobj=BytesIO(content)) as tar: + dir = tar.getnames()[0] + tar.extract(f'{dir}/src/data', path=dirname) + shutil.move(os.path.join(dirname, dir, "src", "data"), dirname) + except Exception as _: + traceback.print_exc() + print("Download jieba language model failed. 
Please download this folder " + f"https://github.com/messense/jieba-rs/tree/main/src/data and put it in {dirname}") + +def download_lindera(lm: str): + import requests + dirname = os.path.join(LANGUAGE_MODEL_HOME, "lindera", lm) + src_dirname = os.path.join(dirname, "src") + if lm == "ipadic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/mecab-ipadic-2.7.0-20070801.tar.gz" + elif lm == "ko-dic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/mecab-ko-dic-2.1.1-20180720.tar.gz" + elif lm == "unidic": + url = "https://dlwqk3ibdg1xh.cloudfront.net/unidic-mecab-2.1.2.tar.gz" + else: + raise Exception(f"language model {lm} is not supported") + os.makedirs(src_dirname, exist_ok=True) + print(f"downloading language model: {url}") + data = requests.get(url).content + print(f"unzip language model: {url}") + + cwd = os.getcwd() + try: + os.chdir(src_dirname) + with tarfile.open(fileobj=BytesIO(data)) as tar: + tar.extractall() + name = tar.getnames()[0] + cmd = ["lindera", "build", "--dictionary-kind=ipadic", os.path.join(src_dirname, name), dirname] + print(f"compile language model: {' '.join(cmd)}") + subprocess.run(cmd) + finally: + os.chdir(cwd) + + +def main(): + import argparse + parser = argparse.ArgumentParser( + description='Lance tokenizer language model downloader' + ) + parser.add_argument('tokenizer', choices=['jieba', 'lindera']) + parser.add_argument("-l", "--languagemodel") + args = parser.parse_args() + print(f"LANCE_LANGUAGE_MODEL_HOME={LANGUAGE_MODEL_HOME}") + if args.tokenizer == 'jieba': + download_jieba() + elif args.tokenizer == 'lindera': + download_lindera(args.languagemodel) + +if __name__ == '__main__': + main() + diff --git a/python/src/lib.rs b/python/src/lib.rs index 9b82ff2a53..5d1aec32e4 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -44,6 +44,7 @@ use futures::StreamExt; use lance_index::DatasetIndexExt; use pyo3::exceptions::{PyIOError, PyValueError}; use pyo3::prelude::*; +use pyo3::types::{PyNone, PyString}; use session::Session; #[macro_use] @@ -151,6 +152,9 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(debug::format_fragment))?; m.add_wrapped(wrap_pyfunction!(debug::list_transactions))?; m.add("__version__", env!("CARGO_PKG_VERSION"))?; + let none = PyNone::get_bound(py).into_py(py); + let lm_home = lance_index::scalar::inverted::LANCE_LANGUAGE_MODEL_HOME.as_ref().and_then(|p| p.to_str()).map(|p| PyString::new_bound(py, p).into_py(py)).unwrap_or(none); + m.add("LANGUAGE_MODEL_HOME", lm_home)?; register_datagen(py, m)?; register_indices(py, m)?; Ok(()) diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index d1ac009884..566f8e7635 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -156,15 +156,15 @@ fn build_base_tokenizer_builder(name: &str) -> Result = match env::var(LANCE_TOKENIZERS_HOME_ENV_KEY) { + pub static ref LANCE_LANGUAGE_MODEL_HOME: Option = match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) { Ok(p) => Some(PathBuf::from(p)), - Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_HOME_DEFAULT_DIRECTORY)) + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY)) }; } @@ -190,7 +190,7 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result { let dic_dir = p.join(dic); let config_path = dic_dir.join("config.json"); @@ -244,7 +244,7 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result Err(Error::invalid_input( format!( 
"{} is undefined", - String::from(LANCE_TOKENIZERS_HOME_ENV_KEY) + String::from(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) ), location!(), )), From 73582502a028fec9556ae25ed0a6787a1ded6704 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Wed, 25 Dec 2024 23:13:29 +0800 Subject: [PATCH 08/22] update --- python/python/lance/lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/python/lance/lm.py b/python/python/lance/lm.py index d59330bb48..b8d7cb850b 100644 --- a/python/python/lance/lm.py +++ b/python/python/lance/lm.py @@ -62,7 +62,7 @@ def download_lindera(lm: str): with tarfile.open(fileobj=BytesIO(data)) as tar: tar.extractall() name = tar.getnames()[0] - cmd = ["lindera", "build", "--dictionary-kind=ipadic", os.path.join(src_dirname, name), dirname] + cmd = ["lindera", "build", f"--dictionary-kind={lm}", os.path.join(src_dirname, name), dirname] print(f"compile language model: {' '.join(cmd)}") subprocess.run(cmd) finally: From eb8e568311a45a30ee3d244616d8e9684fb27de2 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Wed, 25 Dec 2024 23:58:35 +0800 Subject: [PATCH 09/22] jieba --- Cargo.lock | 117 ++++++++++++++++++ Cargo.toml | 1 + python/Cargo.lock | 117 ++++++++++++++++++ python/Cargo.toml | 2 +- rust/lance-index/Cargo.toml | 2 + .../src/scalar/inverted/tokenizer.rs | 108 +++++++++++++++- 6 files changed, 343 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 372e9966bc..727ca31781 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,6 +23,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "ahash" version = "0.8.11" @@ -1198,6 +1204,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cedarwood" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "census" version = "0.4.2" @@ -1404,6 +1419,15 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpp_demangle" version = "0.4.3" @@ -1599,6 +1623,12 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dashmap" version = "5.5.3" @@ -2619,6 +2649,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -3023,6 +3062,29 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "include-flate" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" +dependencies = [ + "include-flate-codegen", + "lazy_static", + "libflate", +] + +[[package]] +name = "include-flate-codegen" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" +dependencies = [ + "libflate", + "proc-macro2", + "quote", + "syn 2.0.89", +] + [[package]] name = "indexmap" version = "2.3.0" @@ -3151,6 +3213,30 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jieba-macros" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" +dependencies = [ + "phf_codegen", +] + +[[package]] +name = "jieba-rs" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a77d0ae8831f870c4f6ffce310f708b5273ea2e7a88e6af770a10d1b4876311" +dependencies = [ + "cedarwood", + "fxhash", + "include-flate", + "jieba-macros", + "lazy_static", + "phf", + "regex", +] + [[package]] name = "jni" version = "0.21.1" @@ -3537,6 +3623,7 @@ dependencies = [ "futures", "half", "itertools 0.13.0", + "jieba-rs", "lance-arrow", "lance-core", "lance-datafusion", @@ -3833,6 +3920,30 @@ version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown 0.14.5", + "rle-decode-fast", +] + [[package]] name = "libm" version = "0.2.8" @@ -5363,6 +5474,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "roaring" version = "0.10.6" diff --git a/Cargo.toml b/Cargo.toml index 73785e6c93..2d0f38fa80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -117,6 +117,7 @@ futures = "0.3" http = "1.1.0" hyperloglogplus = { version = "0.4.1", features = ["const-loop"] } itertools = "0.13" +jieba-rs = { version = "0.7", default-features = false } lazy_static = "1" log = "0.4" mockall = { version = "0.13.1" } diff --git a/python/Cargo.lock b/python/Cargo.lock index 8fa894d767..3ca3456b98 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -17,6 +17,12 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "ahash" version = "0.8.11" @@ -1053,6 +1059,15 @@ dependencies = [ "shlex", ] +[[package]] +name = "cedarwood" +version = 
"0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" +dependencies = [ + "smallvec", +] + [[package]] name = "census" version = "0.4.2" @@ -1179,6 +1194,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpufeatures" version = "0.2.16" @@ -1327,6 +1351,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + [[package]] name = "dashmap" version = "5.5.3" @@ -2249,6 +2279,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -2795,6 +2834,29 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "include-flate" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" +dependencies = [ + "include-flate-codegen", + "lazy_static", + "libflate", +] + +[[package]] +name = "include-flate-codegen" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" +dependencies = [ + "libflate", + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "indexmap" version = "2.7.0" @@ -2894,6 +2956,30 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "jieba-macros" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" +dependencies = [ + "phf_codegen", +] + +[[package]] +name = "jieba-rs" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a77d0ae8831f870c4f6ffce310f708b5273ea2e7a88e6af770a10d1b4876311" +dependencies = [ + "cedarwood", + "fxhash", + "include-flate", + "jieba-macros", + "lazy_static", + "phf", + "regex", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3183,6 +3269,7 @@ dependencies = [ "futures", "half", "itertools 0.13.0", + "jieba-rs", "lance-arrow", "lance-core", "lance-datafusion", @@ -3396,6 +3483,30 @@ version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" +[[package]] +name = "libflate" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" +dependencies = [ + "core2", + "hashbrown 0.14.5", + "rle-decode-fast", +] + [[package]] name = "libm" version = "0.2.11" @@ -4825,6 +4936,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "roaring" version = "0.10.7" diff --git a/python/Cargo.toml b/python/Cargo.toml index 0a1bea95e4..cb13c86963 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -36,7 +36,7 @@ lance-core = { path = "../rust/lance-core" } lance-datagen = { path = "../rust/lance-datagen", optional = true } lance-encoding = { path = "../rust/lance-encoding" } lance-file = { path = "../rust/lance-file" } -lance-index = { path = "../rust/lance-index", features = ["tokenizer-lindera"] } +lance-index = { path = "../rust/lance-index", features = ["tokenizer-lindera", "tokenizer-jieba"] } lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-table = { path = "../rust/lance-table" } diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 1cff60a8b6..ac08e8d0d5 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -30,6 +30,7 @@ dirs.workspace = true futures.workspace = true half.workspace = true itertools.workspace = true +jieba-rs = { workspace = true, optional = true } lance-arrow.workspace = true lance-core.workspace = true lance-datafusion.workspace = true @@ -73,6 +74,7 @@ random_word = { version = "0.4.3", features = ["en"] } [features] tokenizer-lindera = ["lindera", "lindera-tantivy"] +tokenizer-jieba = ["jieba-rs"] [build-dependencies] prost-build.workspace = true diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 566f8e7635..431cf8d421 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -149,6 +149,10 @@ fn build_base_tokenizer_builder(name: &str) -> Result { return build_lindera_tokenizer_builder(s); } + #[cfg(feature = "tokenizer-jieba")] + s if s.starts_with("jieba/") || s == "jieba" => { + return build_jieba_tokenizer_builder(s); + } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), location!(), @@ -194,9 +198,18 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result { let dic_dir = p.join(dic); let config_path = dic_dir.join("config.json"); - let file = File::open(config_path)?; - let reader = BufReader::new(file); - let config: LinderaConfig = serde_json::from_reader(reader)?; + let config: LinderaConfig = if config_path.exists() { + let file = File::open(config_path)?; + let reader = BufReader::new(file); + serde_json::from_reader(reader)? 
+ } else { + let Some(dic_dir) = dic_dir.to_str() else { + return Err(Error::invalid_input("dic dir is invalid", + location!(), + )) + }; + LinderaConfig{main: String::from(dic_dir), user: None, user_kind: None} + }; let main_path = dic_dir.join(config.main); let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { Error::io( @@ -250,3 +263,92 @@ fn build_lindera_tokenizer_builder(dic: &str) -> Result Result { + match LANCE_LANGUAGE_MODEL_HOME.as_ref() { + Some(p) => { + let dic = if dic == "jieba" { + "jieba/default" + } else { + dic + }; + let dic_file = p.join(dic).join("dict.txt"); + let file = std::fs::File::open(dic_file)?; + let mut f = std::io::BufReader::new(file); + let jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { + Error::io( + format!("load jieba tokenizer dictionary err: {e}"), + location!(), + ) + })?; + let tokenizer = JiebaTokenizer{jieba: jieba}; + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + }, + None => Err(Error::invalid_input( + format!( + "{} is undefined", + String::from(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) + ), + location!(), + )), + } + +} + +#[cfg(feature = "tokenizer-jieba")] +#[derive(Clone)] +struct JiebaTokenizer{ + jieba: jieba_rs::Jieba +} + +#[cfg(feature = "tokenizer-jieba")] +struct JiebaTokenStream { + tokens: Vec, + index: usize, +} + +#[cfg(feature = "tokenizer-jieba")] +impl tantivy::tokenizer::TokenStream for JiebaTokenStream { + fn advance(&mut self) -> bool { + if self.index < self.tokens.len() { + self.index += 1; + true + } else { + false + } + } + + fn token(&self) -> &tantivy::tokenizer::Token { + &self.tokens[self.index - 1] + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + &mut self.tokens[self.index - 1] + } +} + + +#[cfg(feature = "tokenizer-jieba")] +impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { + type TokenStream<'a> = JiebaTokenStream; + + fn token_stream(&mut self, text: &str) -> JiebaTokenStream { + let mut indices = text.char_indices().collect::>(); + indices.push((text.len(), '\0')); + let orig_tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); + let mut tokens = Vec::new(); + for token in orig_tokens { + tokens.push(tantivy::tokenizer::Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), + position_length: token.end - token.start, + }); + } + JiebaTokenStream { tokens, index: 0 } + } +} From 7962d9efa5caff215311f03f6709507965f985b1 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Fri, 27 Dec 2024 15:45:51 +0800 Subject: [PATCH 10/22] update type --- python/python/lance/lance/__init__.pyi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index ac6b5d3582..bf535a47a4 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -433,3 +433,4 @@ class BFloat16: def bfloat16_array(values: List[str | None]) -> BFloat16Array: ... 
 __version__: str
+LANGUAGE_MODEL_HOME: Optional[str]

From 2aa4886c11e64bc31995354e5e574ac3265e3a94 Mon Sep 17 00:00:00 2001
From: Chongchen Chen
Date: Fri, 27 Dec 2024 22:18:33 +0800
Subject: [PATCH 11/22] modularize third-party tokenizers

---
 python/python/lance/lm.py                     | 16 +-
 rust/lance-index/Cargo.toml                   | 5 +-
 .../src/scalar/inverted/tokenizer.rs          | 216 ++++--------------
 .../src/scalar/inverted/tokenizer/jieba.rs    | 123 ++++++++++
 .../src/scalar/inverted/tokenizer/lindera.rs  | 97 ++++++++
 5 files changed, 270 insertions(+), 187 deletions(-)
 create mode 100644 rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs
 create mode 100644 rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs

diff --git a/python/python/lance/lm.py b/python/python/lance/lm.py
index b8d7cb850b..1338889054 100644
--- a/python/python/lance/lm.py
+++ b/python/python/lance/lm.py
@@ -28,15 +28,13 @@ def download_jieba():
     try:
         check_requests()
         import requests
-        resp = requests.get("https://api.github.com/repos/messense/jieba-rs/releases/latest")
-        content = requests.get(resp.json()["tarball_url"]).content
-        with tarfile.open(fileobj=BytesIO(content)) as tar:
-            dir = tar.getnames()[0]
-            tar.extract(f'{dir}/src/data', path=dirname)
-        shutil.move(os.path.join(dirname, dir, "src", "data"), dirname)
+        resp = requests.get("https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt")
+        content = resp.content
+        with open(os.path.join(dirname, "dict.txt"), "wb") as fo:
+            fo.write(content)
     except Exception as _:
         traceback.print_exc()
-        print("Download jieba language model failed. Please download this folder "
+        print("Download jieba language model failed. Please download dict.txt from "
              f"https://github.com/messense/jieba-rs/tree/main/src/data and put it in {dirname}")

 def download_lindera(lm: str):
@@ -62,8 +60,8 @@ def download_lindera(lm: str):
         with tarfile.open(fileobj=BytesIO(data)) as tar:
             tar.extractall()
             name = tar.getnames()[0]
-            cmd = ["lindera", "build", f"--dictionary-kind={lm}", os.path.join(src_dirname, name), dirname]
-            print(f"compile language model: {' '.join(cmd)}")
+            cmd = ["lindera", "build", f"--dictionary-kind={lm}", os.path.join(src_dirname, name), os.path.join(dirname, "main")]
+            print(f"compiling language model: {' '.join(cmd)}")
             subprocess.run(cmd)
     finally:
         os.chdir(cwd)
diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml
index ac08e8d0d5..e6cf51d2d7 100644
--- a/rust/lance-index/Cargo.toml
+++ b/rust/lance-index/Cargo.toml
@@ -73,8 +73,9 @@ datafusion-sql.workspace = true
 random_word = { version = "0.4.3", features = ["en"] }
 
 [features]
-tokenizer-lindera = ["lindera", "lindera-tantivy"]
-tokenizer-jieba = ["jieba-rs"]
+tokenizer-lindera = ["lindera", "lindera-tantivy", "tokenizer-common"]
+tokenizer-jieba = ["jieba-rs", "tokenizer-common"]
+tokenizer-common = []
 
 [build-dependencies]
 prost-build.workspace = true
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index 431cf8d421..52f263e638 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -7,6 +7,12 @@ use lance_core::{Error, Result};
 use serde::{Deserialize, Serialize};
 use snafu::{location, Location};
 
+#[cfg(feature = "tokenizer-lindera")]
+mod lindera;
+
+#[cfg(feature = "tokenizer-jieba")]
+mod jieba;
+
 /// Tokenizer configs
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct TokenizerConfig {
@@ -147,11 +153,24 @@ fn build_base_tokenizer_builder(name: &str) -> Result<tantivy::tokenizer::TextAnalyzerBuilder> {
         #[cfg(feature = "tokenizer-lindera")]
         s if s.starts_with("lindera/") => {
-
return build_lindera_tokenizer_builder(s); + let Some(home) = &*LANCE_LANGUAGE_MODEL_HOME else { + return Err(Error::invalid_input( + format!("unknown base tokenizer {}", name), + location!(), + )) + }; + lindera::LinderaBuilder::load(&home.join(s))?.build() } #[cfg(feature = "tokenizer-jieba")] s if s.starts_with("jieba/") || s == "jieba" => { - return build_jieba_tokenizer_builder(s); + let s = if s == "jieba" { "jieba/default" } else { s }; + let Some(home) = &*LANCE_LANGUAGE_MODEL_HOME else { + return Err(Error::invalid_input( + format!("unknown base tokenizer {}", name), + location!(), + )) + }; + lindera::LinderaBuilder::load(&home.join(s))?.build() } _ => Err(Error::invalid_input( format!("unknown base tokenizer {}", name), @@ -164,6 +183,8 @@ pub const LANCE_LANGUAGE_MODEL_HOME_ENV_KEY: &str = "LANCE_LANGUAGE_MODEL_HOME"; pub const LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY: &str = "lance/language_models"; +pub const LANCE_LANGUAGE_MODEL_CONFIG_FILE: &str = "config.json"; + lazy_static::lazy_static! { /// default directory that stores lance tokenizer related files, e.g. tokenizer model. pub static ref LANCE_LANGUAGE_MODEL_HOME: Option = match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) { @@ -172,183 +193,26 @@ lazy_static::lazy_static! { }; } -#[cfg(feature = "tokenizer-lindera")] -#[derive(Serialize, Deserialize)] -struct LinderaConfig{ - main: String, - user: Option, - user_kind: Option -} - -#[cfg(feature = "tokenizer-lindera")] -fn build_lindera_tokenizer_builder(dic: &str) -> Result { - use std::{fs::File, io::BufReader}; - - use lindera::{ - dictionary::{ - load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, - }, - mode::Mode, - segmenter::Segmenter, - }; - use lindera_tantivy::tokenizer::LinderaTokenizer; - use serde_json::Value; - - match LANCE_LANGUAGE_MODEL_HOME.as_ref() { - Some(p) => { - let dic_dir = p.join(dic); - let config_path = dic_dir.join("config.json"); - let config: LinderaConfig = if config_path.exists() { - let file = File::open(config_path)?; - let reader = BufReader::new(file); - serde_json::from_reader(reader)? 
- } else { - let Some(dic_dir) = dic_dir.to_str() else { - return Err(Error::invalid_input("dic dir is invalid", - location!(), - )) - }; - LinderaConfig{main: String::from(dic_dir), user: None, user_kind: None} - }; - let main_path = dic_dir.join(config.main); - let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { - Error::io( - format!("load lindera tokenizer main dictionary err: {e}"), - location!(), - ) - })?; - let user_dictionary = match config.user { - Some(user) => { - let mut conf = serde_json::Map::::new(); - let user_path = dic_dir.join(user); - match user_path.to_str() { - Some(p) => { - conf.insert(String::from("path"), Value::String(String::from(p))); - Ok(()) - }, - None => { - let p = user_path.display(); - Err(Error::io( - format!("invalid lindera tokenizer user dictionary path: {p}"), - location!(), - )) - } - }?; - if let Some(kind) = config.user_kind { - conf.insert(String::from("kind"), Value::String(kind)); - } - let user_dictionary_config: UserDictionaryConfig = Value::Object(conf); - let user_dictionary = load_user_dictionary_from_config(&user_dictionary_config).map_err(|e| { - Error::io( - format!("load lindera tokenizer user dictionary err: {e}"), - location!(), - ) - })?; - Some(user_dictionary) - }, - None => None - - }; - let mode = Mode::Normal; - let segmenter = Segmenter::new(mode, dictionary, user_dictionary); - let tokenizer = LinderaTokenizer::from_segmenter(segmenter); - Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) +#[cfg(feature = "tokenizer-common")] +trait TokenizerBuilder: Sized { + type Config: serde::de::DeserializeOwned + Default; + fn load(p: &PathBuf) -> Result { + if !p.is_dir() { + return Err(Error::io(format!("{} is not a valid directory", p.display()), location!())) } - None => Err(Error::invalid_input( - format!( - "{} is undefined", - String::from(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) - ), - location!(), - )), - } -} - - - -#[cfg(feature = "tokenizer-jieba")] -fn build_jieba_tokenizer_builder(dic: &str) -> Result { - match LANCE_LANGUAGE_MODEL_HOME.as_ref() { - Some(p) => { - let dic = if dic == "jieba" { - "jieba/default" - } else { - dic - }; - let dic_file = p.join(dic).join("dict.txt"); - let file = std::fs::File::open(dic_file)?; - let mut f = std::io::BufReader::new(file); - let jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { - Error::io( - format!("load jieba tokenizer dictionary err: {e}"), - location!(), - ) - })?; - let tokenizer = JiebaTokenizer{jieba: jieba}; - Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) - }, - None => Err(Error::invalid_input( - format!( - "{} is undefined", - String::from(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) - ), - location!(), - )), - } - -} - -#[cfg(feature = "tokenizer-jieba")] -#[derive(Clone)] -struct JiebaTokenizer{ - jieba: jieba_rs::Jieba -} - -#[cfg(feature = "tokenizer-jieba")] -struct JiebaTokenStream { - tokens: Vec, - index: usize, -} - -#[cfg(feature = "tokenizer-jieba")] -impl tantivy::tokenizer::TokenStream for JiebaTokenStream { - fn advance(&mut self) -> bool { - if self.index < self.tokens.len() { - self.index += 1; - true + use std::{fs::File, io::BufReader}; + let config_path = p.join(LANCE_LANGUAGE_MODEL_CONFIG_FILE); + let config= if config_path.exists() { + let file = File::open(config_path)?; + let reader = BufReader::new(file); + serde_json::from_reader::, Self::Config>(reader)? 
} else { - false - } + Self::Config::default() + }; + Self::new(config, p) } - fn token(&self) -> &tantivy::tokenizer::Token { - &self.tokens[self.index - 1] - } + fn new(config: Self::Config, root: &PathBuf) -> Result; - fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { - &mut self.tokens[self.index - 1] - } -} - - -#[cfg(feature = "tokenizer-jieba")] -impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { - type TokenStream<'a> = JiebaTokenStream; - - fn token_stream(&mut self, text: &str) -> JiebaTokenStream { - let mut indices = text.char_indices().collect::>(); - indices.push((text.len(), '\0')); - let orig_tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); - let mut tokens = Vec::new(); - for token in orig_tokens { - tokens.push(tantivy::tokenizer::Token { - offset_from: indices[token.start].0, - offset_to: indices[token.end].0, - position: token.start, - text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), - position_length: token.end - token.start, - }); - } - JiebaTokenStream { tokens, index: 0 } - } + fn build(&self) -> Result; } diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs new file mode 100644 index 0000000000..33fcb8b30a --- /dev/null +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -0,0 +1,123 @@ +use std::path::PathBuf; + +use lance_core::{Error, Result}; +use serde::{Deserialize, Serialize}; +use snafu::{location, Location}; +use super::TokenizerBuilder; + +#[derive(Serialize, Deserialize)] +pub struct JiebaConfig{ + main: Option, + users: Option> +} + +impl Default for JiebaConfig { + fn default() -> Self { + Self { main: Default::default(), users: Default::default() } + } +} + +pub struct JiebaBuilder { + root: PathBuf, + config: JiebaConfig +} + +impl JiebaBuilder { + fn main_dict_path(&self) -> PathBuf { + if let Some(p) = &self.config.main { + return self.root.join(p); + } + self.root.join("dict.txt") + } + + fn user_dict_paths(&self) -> Vec { + let Some(users) = &self.config.users else { + return vec![]; + }; + users.iter().map(|p| self.root.join(p)).collect() + } +} + +impl TokenizerBuilder for JiebaBuilder { + type Config = JiebaConfig; + + fn new(config: Self::Config, root: &PathBuf) -> Result { + Ok(JiebaBuilder{config, root: root.clone()}) + } + + fn build(&self) -> Result { + let main_dict_path = &self.main_dict_path(); + let file = std::fs::File::open(main_dict_path)?; + let mut f = std::io::BufReader::new(file); + let mut jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { + Error::io( + format!("load jieba tokenizer dictionary {}, error: {}", main_dict_path.display(), e), + location!(), + ) + })?; + for user_dict_path in &self.user_dict_paths() { + let file = std::fs::File::open(user_dict_path)?; + let mut f = std::io::BufReader::new(file); + jieba.load_dict(&mut f).map_err(|e| { + Error::io( + format!("load jieba tokenizer user dictionary {}, error: {}", user_dict_path.display(), e), + location!(), + ) + })? 
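+            // Each user dictionary is plain text in the same line format as
+            // the main jieba dict: `word [frequency [POS tag]]`, with the
+            // frequency and tag optional. An illustrative entry (matching the
+            // test fixture added later in this series):
+            //
+            //   光明的前途 1219 n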
+ } + let tokenizer = JiebaTokenizer{jieba: jieba}; + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + } +} + +#[derive(Clone)] +struct JiebaTokenizer{ + jieba: jieba_rs::Jieba +} + +struct JiebaTokenStream { + tokens: Vec, + index: usize, +} + +impl tantivy::tokenizer::TokenStream for JiebaTokenStream { + fn advance(&mut self) -> bool { + if self.index < self.tokens.len() { + self.index += 1; + true + } else { + false + } + } + + fn token(&self) -> &tantivy::tokenizer::Token { + &self.tokens[self.index - 1] + } + + fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token { + &mut self.tokens[self.index - 1] + } +} + + +#[cfg(feature = "tokenizer-jieba")] +impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { + type TokenStream<'a> = JiebaTokenStream; + + fn token_stream(&mut self, text: &str) -> JiebaTokenStream { + let mut indices = text.char_indices().collect::>(); + indices.push((text.len(), '\0')); + let orig_tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); + let mut tokens = Vec::new(); + for token in orig_tokens { + tokens.push(tantivy::tokenizer::Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), + position_length: token.end - token.start, + }); + } + JiebaTokenStream { tokens, index: 0 } + } +} diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs new file mode 100644 index 0000000000..e07ed4d91f --- /dev/null +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -0,0 +1,97 @@ +use std::path::PathBuf; + +use lance_core::{Error, Result}; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use snafu::{location, Location}; +use super::TokenizerBuilder; +use lindera::{ + dictionary::{ + load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, + }, + mode::Mode, + segmenter::Segmenter, +}; +use lindera_tantivy::tokenizer::LinderaTokenizer; + +#[derive(Serialize, Deserialize)] +pub struct LinderaConfig{ + main: Option, + user: Option, + user_kind: Option +} + +impl Default for LinderaConfig { + fn default() -> Self { + Self { main: Default::default(), user: Default::default(), user_kind: Default::default() } + } +} + +pub struct LinderaBuilder { + root: PathBuf, + config: LinderaConfig +} + +impl LinderaBuilder { + fn main_dict_path(&self) -> PathBuf { + if let Some(p) = &self.config.main { + return self.root.join(p); + } + self.root.join("main") + } + + fn user_dict_config(&self) -> Result> { + let Some(user_dict_path) = &self.config.user else { + return Ok(None) + }; + let mut conf = Map::::new(); + let user_path = self.root.join(user_dict_path); + let Some(p) = user_path.to_str() else { + return Err(Error::io( + format!("invalid lindera tokenizer user dictionary path: {}", user_path.display()), + location!(), + )) + }; + conf.insert(String::from("path"), Value::String(String::from(p))); + if let Some(kind) = &self.config.user_kind { + conf.insert(String::from("kind"), Value::String(kind.clone())); + } + Ok(Some(Value::Object(conf))) + } +} + +impl TokenizerBuilder for LinderaBuilder { + type Config = LinderaConfig; + + fn new(config: Self::Config, root: &PathBuf) -> Result { + Ok(LinderaBuilder{config, root: root.clone()}) + } + + fn build(&self) -> Result { + let main_path = self.main_dict_path(); + let dictionary = 
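+            // Assumed layout: `main_dict_path()` falls back to the `main/`
+            // subdirectory, which is where the download script points
+            // `lindera build` when no config.json overrides it.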
load_dictionary_from_path(main_path.as_path()).map_err(|e| { + Error::io( + format!("load lindera tokenizer main dictionary from {}, error: {}", main_path.display(), e), + location!(), + ) + })?; + let user_dictionary = match self.user_dict_config()? { + Some(conf) => { + let user_dictionary = load_user_dictionary_from_config(&conf).map_err(|e| { + Error::io( + format!("load lindera tokenizer user dictionary err: {e}"), + location!(), + ) + })?; + Some(user_dictionary) + }, + None => None + + }; + let mode = Mode::Normal; + let segmenter = Segmenter::new(mode, dictionary, user_dictionary); + let tokenizer = LinderaTokenizer::from_segmenter(segmenter); + Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) + } +} + From 5313fe71a2df3b3fb1e1a94416c78e96e111dd61 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Fri, 27 Dec 2024 22:51:55 +0800 Subject: [PATCH 12/22] format and dict --- python/python/lance/{lm.py => download.py} | 0 .../python/tests/lms/jieba/default/dict.txt | 8 ++++ .../tests/lms/jieba/user_dict/config.json | 6 +++ .../python/tests/lms/jieba/user_dict/user.txt | 1 + .../src/scalar/inverted/tokenizer.rs | 11 +++-- .../src/scalar/inverted/tokenizer/jieba.rs | 41 +++++++++++----- .../src/scalar/inverted/tokenizer/lindera.rs | 48 ++++++++++++------- 7 files changed, 80 insertions(+), 35 deletions(-) rename python/python/lance/{lm.py => download.py} (100%) create mode 100644 python/python/tests/lms/jieba/default/dict.txt create mode 100644 python/python/tests/lms/jieba/user_dict/config.json create mode 100644 python/python/tests/lms/jieba/user_dict/user.txt diff --git a/python/python/lance/lm.py b/python/python/lance/download.py similarity index 100% rename from python/python/lance/lm.py rename to python/python/lance/download.py diff --git a/python/python/tests/lms/jieba/default/dict.txt b/python/python/tests/lms/jieba/default/dict.txt new file mode 100644 index 0000000000..237b47ca6a --- /dev/null +++ b/python/python/tests/lms/jieba/default/dict.txt @@ -0,0 +1,8 @@ +我们 98740 r +都 202780 d +有 423765 v +光明 1219 n +的 318825 uj +前途 1263 n +前 62779 f +途 857 n diff --git a/python/python/tests/lms/jieba/user_dict/config.json b/python/python/tests/lms/jieba/user_dict/config.json new file mode 100644 index 0000000000..5f0541ed4f --- /dev/null +++ b/python/python/tests/lms/jieba/user_dict/config.json @@ -0,0 +1,6 @@ +{ + "main": "../default/dict.txt", + "user": [ + "user.txt" + ] +} diff --git a/python/python/tests/lms/jieba/user_dict/user.txt b/python/python/tests/lms/jieba/user_dict/user.txt new file mode 100644 index 0000000000..be2d8a9582 --- /dev/null +++ b/python/python/tests/lms/jieba/user_dict/user.txt @@ -0,0 +1 @@ +光明的前途 1219 n diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 52f263e638..d0b38f8f3f 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -157,7 +157,7 @@ fn build_base_tokenizer_builder(name: &str) -> Result Result Result { if !p.is_dir() { - return Err(Error::io(format!("{} is not a valid directory", p.display()), location!())) + return Err(Error::io( + format!("{} is not a valid directory", p.display()), + location!(), + )); } use std::{fs::File, io::BufReader}; let config_path = p.join(LANCE_LANGUAGE_MODEL_CONFIG_FILE); - let config= if config_path.exists() { + let config = if config_path.exists() { let file = File::open(config_path)?; let reader = BufReader::new(file); serde_json::from_reader::, 
Self::Config>(reader)? diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs index 33fcb8b30a..a874ca3bd4 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -1,25 +1,28 @@ use std::path::PathBuf; +use super::TokenizerBuilder; use lance_core::{Error, Result}; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; -use super::TokenizerBuilder; #[derive(Serialize, Deserialize)] -pub struct JiebaConfig{ +pub struct JiebaConfig { main: Option, - users: Option> + users: Option>, } impl Default for JiebaConfig { fn default() -> Self { - Self { main: Default::default(), users: Default::default() } + Self { + main: Default::default(), + users: Default::default(), + } } } pub struct JiebaBuilder { root: PathBuf, - config: JiebaConfig + config: JiebaConfig, } impl JiebaBuilder { @@ -42,7 +45,10 @@ impl TokenizerBuilder for JiebaBuilder { type Config = JiebaConfig; fn new(config: Self::Config, root: &PathBuf) -> Result { - Ok(JiebaBuilder{config, root: root.clone()}) + Ok(JiebaBuilder { + config, + root: root.clone(), + }) } fn build(&self) -> Result { @@ -51,7 +57,11 @@ impl TokenizerBuilder for JiebaBuilder { let mut f = std::io::BufReader::new(file); let mut jieba = jieba_rs::Jieba::with_dict(&mut f).map_err(|e| { Error::io( - format!("load jieba tokenizer dictionary {}, error: {}", main_dict_path.display(), e), + format!( + "load jieba tokenizer dictionary {}, error: {}", + main_dict_path.display(), + e + ), location!(), ) })?; @@ -60,19 +70,23 @@ impl TokenizerBuilder for JiebaBuilder { let mut f = std::io::BufReader::new(file); jieba.load_dict(&mut f).map_err(|e| { Error::io( - format!("load jieba tokenizer user dictionary {}, error: {}", user_dict_path.display(), e), + format!( + "load jieba tokenizer user dictionary {}, error: {}", + user_dict_path.display(), + e + ), location!(), ) })? 
} - let tokenizer = JiebaTokenizer{jieba: jieba}; + let tokenizer = JiebaTokenizer { jieba: jieba }; Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) } } #[derive(Clone)] -struct JiebaTokenizer{ - jieba: jieba_rs::Jieba +struct JiebaTokenizer { + jieba: jieba_rs::Jieba, } struct JiebaTokenStream { @@ -99,7 +113,6 @@ impl tantivy::tokenizer::TokenStream for JiebaTokenStream { } } - #[cfg(feature = "tokenizer-jieba")] impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { type TokenStream<'a> = JiebaTokenStream; @@ -107,7 +120,9 @@ impl tantivy::tokenizer::Tokenizer for JiebaTokenizer { fn token_stream(&mut self, text: &str) -> JiebaTokenStream { let mut indices = text.char_indices().collect::>(); indices.push((text.len(), '\0')); - let orig_tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Search, true); + let orig_tokens = self + .jieba + .tokenize(text, jieba_rs::TokenizeMode::Search, true); let mut tokens = Vec::new(); for token in orig_tokens { tokens.push(tantivy::tokenizer::Token { diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs index e07ed4d91f..ab60790de6 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -1,10 +1,7 @@ use std::path::PathBuf; -use lance_core::{Error, Result}; -use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; -use snafu::{location, Location}; use super::TokenizerBuilder; +use lance_core::{Error, Result}; use lindera::{ dictionary::{ load_dictionary_from_path, load_user_dictionary_from_config, UserDictionaryConfig, @@ -13,23 +10,30 @@ use lindera::{ segmenter::Segmenter, }; use lindera_tantivy::tokenizer::LinderaTokenizer; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use snafu::{location, Location}; #[derive(Serialize, Deserialize)] -pub struct LinderaConfig{ +pub struct LinderaConfig { main: Option, user: Option, - user_kind: Option + user_kind: Option, } impl Default for LinderaConfig { fn default() -> Self { - Self { main: Default::default(), user: Default::default(), user_kind: Default::default() } + Self { + main: Default::default(), + user: Default::default(), + user_kind: Default::default(), + } } } pub struct LinderaBuilder { root: PathBuf, - config: LinderaConfig + config: LinderaConfig, } impl LinderaBuilder { @@ -42,15 +46,18 @@ impl LinderaBuilder { fn user_dict_config(&self) -> Result> { let Some(user_dict_path) = &self.config.user else { - return Ok(None) + return Ok(None); }; let mut conf = Map::::new(); let user_path = self.root.join(user_dict_path); let Some(p) = user_path.to_str() else { return Err(Error::io( - format!("invalid lindera tokenizer user dictionary path: {}", user_path.display()), + format!( + "invalid lindera tokenizer user dictionary path: {}", + user_path.display() + ), location!(), - )) + )); }; conf.insert(String::from("path"), Value::String(String::from(p))); if let Some(kind) = &self.config.user_kind { @@ -64,19 +71,26 @@ impl TokenizerBuilder for LinderaBuilder { type Config = LinderaConfig; fn new(config: Self::Config, root: &PathBuf) -> Result { - Ok(LinderaBuilder{config, root: root.clone()}) + Ok(LinderaBuilder { + config, + root: root.clone(), + }) } fn build(&self) -> Result { let main_path = self.main_dict_path(); let dictionary = load_dictionary_from_path(main_path.as_path()).map_err(|e| { Error::io( - format!("load lindera tokenizer main dictionary from {}, error: {}", 
main_path.display(), e), + format!( + "load lindera tokenizer main dictionary from {}, error: {}", + main_path.display(), + e + ), location!(), ) })?; let user_dictionary = match self.user_dict_config()? { - Some(conf) => { + Some(conf) => { let user_dictionary = load_user_dictionary_from_config(&conf).map_err(|e| { Error::io( format!("load lindera tokenizer user dictionary err: {e}"), @@ -84,9 +98,8 @@ impl TokenizerBuilder for LinderaBuilder { ) })?; Some(user_dictionary) - }, - None => None - + } + None => None, }; let mode = Mode::Normal; let segmenter = Segmenter::new(mode, dictionary, user_dictionary); @@ -94,4 +107,3 @@ impl TokenizerBuilder for LinderaBuilder { Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) } } - From f7fcb4788ee5dc8f99803adbf7dfa57a994fad84 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 11:23:04 +0800 Subject: [PATCH 13/22] update tokenizer --- python/python/lance/download.py | 60 ++++++++++++------ python/python/lance/lance/__init__.pyi | 3 +- .../{lms => models}/jieba/default/dict.txt | 0 .../jieba/user_dict/config.json | 0 .../{lms => models}/jieba/user_dict/user.txt | 0 .../lindera/ipadic/ipadic_simple_userdic.bin | Bin 0 -> 1612 bytes .../lindera/ipadic/ipadic_simple_userdic.csv | 3 + python/src/lib.rs | 20 +++++- .../src/scalar/inverted/tokenizer.rs | 11 ++-- 9 files changed, 66 insertions(+), 31 deletions(-) rename python/python/tests/{lms => models}/jieba/default/dict.txt (100%) rename python/python/tests/{lms => models}/jieba/user_dict/config.json (100%) rename python/python/tests/{lms => models}/jieba/user_dict/user.txt (100%) create mode 100644 python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin create mode 100644 python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv diff --git a/python/python/lance/download.py b/python/python/lance/download.py index 1338889054..66f2558176 100644 --- a/python/python/lance/download.py +++ b/python/python/lance/download.py @@ -1,44 +1,55 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors -from io import BytesIO import os import shutil import subprocess import tarfile import traceback -from .lance import LANGUAGE_MODEL_HOME +from io import BytesIO + +from .lance import language_model_home + +LANGUAGE_MODEL_HOME = language_model_home() -if LANGUAGE_MODEL_HOME is None: - raise Exception("LANCE_LANGUAGE_MODEL_HOME is not configured") def check_lindera(): if not shutil.which("lindera"): - raise Exception("lindera is not installed. Please install it by following https://github.com/lindera/lindera/tree/main/lindera-cli") + raise Exception( + "lindera is not installed. 
Please install it by following https://github.com/lindera/lindera/tree/main/lindera-cli" + ) + -def check_requests(): +def import_requests(): try: - import requests - except: + import requests # type: ignore + except Exception: raise Exception("requests is not installed, Please pip install requests") + return requests + def download_jieba(): dirname = os.path.join(LANGUAGE_MODEL_HOME, "jieba", "default") os.makedirs(dirname, exist_ok=True) try: - check_requests() - import requests - resp = requests.get("https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt") + requests = import_requests() + resp = requests.get( + "https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt" + ) content = resp.content with open(os.path.join(dirname, "dict.txt"), "wb") as fo: fo.write(content) except Exception as _: traceback.print_exc() - print("Download jieba language model failed. Please download dict.txt from " - f"https://github.com/messense/jieba-rs/tree/main/src/data and put it in {dirname}") + print( + "Download jieba language model failed. Please download dict.txt from " + "https://github.com/messense/jieba-rs/tree/main/src/data " + f"and put it in {dirname}" + ) + def download_lindera(lm: str): - import requests + requests = import_requests() dirname = os.path.join(LANGUAGE_MODEL_HOME, "lindera", lm) src_dirname = os.path.join(dirname, "src") if lm == "ipadic": @@ -60,7 +71,13 @@ def download_lindera(lm: str): with tarfile.open(fileobj=BytesIO(data)) as tar: tar.extractall() name = tar.getnames()[0] - cmd = ["lindera", "build", f"--dictionary-kind={lm}", os.path.join(src_dirname, name),os.path.join(dirname, "main")] + cmd = [ + "lindera", + "build", + f"--dictionary-kind={lm}", + os.path.join(src_dirname, name), + os.path.join(dirname, "main"), + ] print(f"compiling language model: {' '.join(cmd)}") subprocess.run(cmd) finally: @@ -69,18 +86,19 @@ def download_lindera(lm: str): def main(): import argparse + parser = argparse.ArgumentParser( - description='Lance tokenizer language model downloader' + description="Lance tokenizer language model downloader" ) - parser.add_argument('tokenizer', choices=['jieba', 'lindera']) + parser.add_argument("tokenizer", choices=["jieba", "lindera"]) parser.add_argument("-l", "--languagemodel") args = parser.parse_args() print(f"LANCE_LANGUAGE_MODEL_HOME={LANGUAGE_MODEL_HOME}") - if args.tokenizer == 'jieba': + if args.tokenizer == "jieba": download_jieba() - elif args.tokenizer == 'lindera': + elif args.tokenizer == "lindera": download_lindera(args.languagemodel) -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index bf535a47a4..07aefef390 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -15,6 +15,7 @@ from pathlib import Path from typing import ( Any, + Callable, Dict, Iterable, Iterator, @@ -433,4 +434,4 @@ class BFloat16: def bfloat16_array(values: List[str | None]) -> BFloat16Array: ... 
__version__: str
-LANGUAGE_MODEL_HOME: Optional[str]
+language_model_home: Callable[[], str]
diff --git a/python/python/tests/lms/jieba/default/dict.txt b/python/python/tests/models/jieba/default/dict.txt
similarity index 100%
rename from python/python/tests/lms/jieba/default/dict.txt
rename to python/python/tests/models/jieba/default/dict.txt
diff --git a/python/python/tests/lms/jieba/user_dict/config.json b/python/python/tests/models/jieba/user_dict/config.json
similarity index 100%
rename from python/python/tests/lms/jieba/user_dict/config.json
rename to python/python/tests/models/jieba/user_dict/config.json
diff --git a/python/python/tests/lms/jieba/user_dict/user.txt b/python/python/tests/models/jieba/user_dict/user.txt
similarity index 100%
rename from python/python/tests/lms/jieba/user_dict/user.txt
rename to python/python/tests/models/jieba/user_dict/user.txt
diff --git a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ada7701adb197032ce7374b3bed5269a351ca8e2
GIT binary patch
literal 1612
zcmeHGJxfDD5S;9@P_PhDun<8I6tS?du(0tz_!ImAmKGLycZu2rEJQ3+QasFu2`Fh)
z0{#U{3kzX27_dm8kix>*yq6#fVxic$&F$^Z&CD%#=tCUa0kmHbMuOsQfJR&5{kUNB
zls&Mw#M+P`N30tYY^7P>VE;KlxXNCiz}*Qhc32-|o)?@L?*jAvxF7M1r;cGW?D>kI
zyhaRb{#8)V2WVyl*#V$HtrUa`NRbMl&Qn!3D$E>eTu
zqYQOs*aI~TLtc*nljJV%6W0zz=~`8My+~+T3
zpf!@Haa|!Ol{{X4*iH;5tGW|)7wWFm=O1=*kEJ7Hw3z))G8JQ?VNt_P4Rh+YEHO$F
zN9r@`%j)aRLUy`~_1o%>)m7COI>|?@x+1$h`5(4Cme^<#?+a|tEZzW`{UO2t

literal 0
HcmV?d00001

diff --git a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv
new file mode 100644
index 0000000000..fae82a570b
--- /dev/null
+++ b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv
@@ -0,0 +1,3 @@
+東京スカイツリー,カスタム名詞,トウキョウスカイツリー
+東武スカイツリーライン,カスタム名詞,トウブスカイツリーライン
+とうきょうスカイツリー駅,カスタム名詞,トウキョウスカイツリーエキ
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 98aa8b43be..f80399dae2 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -144,15 +144,14 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(read_tfrecord))?;
     m.add_wrapped(wrap_pyfunction!(trace_to_chrome))?;
     m.add_wrapped(wrap_pyfunction!(manifest_needs_migration))?;
+    m.add_wrapped(wrap_pyfunction!(language_model_home))?;
     // Debug functions
     m.add_wrapped(wrap_pyfunction!(debug::format_schema))?;
     m.add_wrapped(wrap_pyfunction!(debug::format_manifest))?;
     m.add_wrapped(wrap_pyfunction!(debug::format_fragment))?;
     m.add_wrapped(wrap_pyfunction!(debug::list_transactions))?;
     m.add("__version__", env!("CARGO_PKG_VERSION"))?;
-    let none = PyNone::get_bound(py).into_py(py);
-    let lm_home = lance_index::scalar::inverted::LANCE_LANGUAGE_MODEL_HOME.as_ref().and_then(|p| p.to_str()).map(|p| PyString::new_bound(py, p).into_py(py)).unwrap_or(none);
-    m.add("LANGUAGE_MODEL_HOME", lm_home)?;
+
     register_datagen(py, m)?;
     register_indices(py, m)?;
     Ok(())
@@ -176,6 +175,21 @@ fn json_to_schema(json: &str) -> PyResult<PyArrowType<Schema>> {
     Ok(schema.into())
 }
 
+#[pyfunction]
+pub fn language_model_home() -> PyResult<String> {
+    let Some(p) = lance_index::scalar::inverted::language_model_home() else {
+        return Err(pyo3::exceptions::PyValueError::new_err(format!(
+            "Failed to get language model home"
+        )));
+    };
+    let Some(pstr) = p.to_str() else {
+        return Err(pyo3::exceptions::PyValueError::new_err(format!(
+            "Failed to convert language model home to str"
+        )));
+    };
+    Ok(String::from(pstr))
+}
+
 /// Infer schema from tfrecord file
 ///
 /// Parameters
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index d0b38f8f3f..cdb60ff0f8 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -153,7 +153,7 @@ fn build_base_tokenizer_builder(name: &str) -> Result<tantivy::tokenizer::TextAnalyzerBuilder> {
-            let Some(home) = &*LANCE_LANGUAGE_MODEL_HOME else {
+            let Some(home) = language_model_home() else {
                 return Err(Error::invalid_input(
                     format!("unknown base tokenizer {}", name),
                     location!(),
@@ -164,7 +164,7 @@ fn build_base_tokenizer_builder(name: &str) -> Result<tantivy::tokenizer::TextAnalyzerBuilder> {
             let s = if s == "jieba" { "jieba/default" } else { s };
-            let Some(home) = &*LANCE_LANGUAGE_MODEL_HOME else {
+            let Some(home) = language_model_home() else {
                 return Err(Error::invalid_input(
                     format!("unknown base tokenizer {}", name),
                     location!(),
@@ -185,12 +185,11 @@ pub const LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY: &str = "lance/language_models";
 
 pub const LANCE_LANGUAGE_MODEL_CONFIG_FILE: &str = "config.json";
 
-lazy_static::lazy_static! {
-    /// default directory that stores lance tokenizer related files, e.g. tokenizer model.
-    pub static ref LANCE_LANGUAGE_MODEL_HOME: Option<PathBuf> = match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) {
+pub fn language_model_home() -> Option<PathBuf> {
+    match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) {
         Ok(p) => Some(PathBuf::from(p)),
         Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY))
-    };
+    }
 }
 
 #[cfg(feature = "tokenizer-common")]
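The net effect of this patch is that the language model home is computed on demand on both sides of the FFI boundary. Below is a minimal sketch of the new Python-facing API; it assumes a wheel built from this branch, and the `/tmp/lance_models` path is illustrative only:

```python
import os

from lance.lance import language_model_home

# With the environment variable unset, this falls back to
# <system data directory>/lance/language_models.
print(language_model_home())

# LANCE_LANGUAGE_MODEL_HOME takes precedence and is re-read on every
# call, since the lazy_static global was replaced by a plain function.
os.environ["LANCE_LANGUAGE_MODEL_HOME"] = "/tmp/lance_models"
print(language_model_home())  # -> /tmp/lance_models
```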
From eab0c16160304cfca6c9ef0ce9a001091f97d9fe Mon Sep 17 00:00:00 2001
From: Chongchen Chen
Date: Sat, 28 Dec 2024 14:38:09 +0800
Subject: [PATCH 14/22] add document

---
 docs/tokenizer.rst                                 |  87 ++++++++++++++++++
 python/python/lance/download.py                    |   2 +-
 .../lindera/ipadic/ipadic_simple_userdic.bin       | Bin 1612 -> 0 bytes
 .../lindera/ipadic/ipadic_simple_userdic.csv       |   3 -
 python/src/lib.rs                                  |   1 -
 5 files changed, 88 insertions(+), 5 deletions(-)
 create mode 100644 docs/tokenizer.rst
 delete mode 100644 python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin
 delete mode 100644 python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv

diff --git a/docs/tokenizer.rst b/docs/tokenizer.rst
new file mode 100644
index 0000000000..602956b26b
--- /dev/null
+++ b/docs/tokenizer.rst
@@ -0,0 +1,87 @@
+Tokenizers
+============================
+
+Currently, Lance has built-in support for Jieba and Lindera. However, it doesn't ship with their language models.
+If tokenization is needed, you can download the language models yourself.
+You can specify where the language models are stored by setting the environment variable LANCE_LANGUAGE_MODEL_HOME.
+If it's not set, the default is
+
+.. code-block:: bash
+
+    ${system data directory}/lance/language_models
+
+Lance also supports configuring user dictionaries,
+which makes it convenient for users to extend their own dictionaries without retraining the language models.
+
+Language Models of Jieba
+------------------------
+
+Downloading the Model
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+    python -m lance.download jieba
+
+The language model is stored by default in `${LANCE_LANGUAGE_MODEL_HOME}/jieba/default`.
+
+Using the Model
+~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default")
+
+User Dictionaries
+~~~~~~~~~~~~~~~~~
+
+Create a file named config.json in the root directory of the current model.
+
+.. code-block:: json
+
+    {
+        "main": "dict.txt",
+        "users": ["path/to/user/dict.txt"]
+    }
+
+- The "main" field is optional. If not filled, the default is "dict.txt".
+- "users" is a list of user dictionary paths. For the format of a user dictionary, please refer to https://github.com/messense/jieba-rs/blob/main/src/data/dict.txt.
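+
+As an end-to-end illustration, the snippet below builds an index with the default Jieba model and runs a full text query. This is a minimal sketch: the dataset URI and sample data are placeholders, and it assumes the model was downloaded as shown above.
+
+.. code-block:: python
+
+    import lance
+    import pyarrow as pa
+
+    data = pa.table({"text": ["我们都有光明的前途", "光明的前途"]})
+    ds = lance.write_dataset(data, "/tmp/jieba_demo", mode="overwrite")
+    ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default")
+    # "我们" only appears in the first row, so only that row matches.
+    print(ds.to_table(full_text_query="我们", prefilter=True, with_row_id=True))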
+
+Language Models of Lindera
+--------------------------
+
+Downloading the Model
+~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+    python -m lance.download lindera -l [ipadic|ko-dic|unidic]
+
+Note that the language models of Lindera need to be compiled. Please install lindera-cli first; for detailed steps, please refer to https://github.com/lindera/lindera/tree/main/lindera-cli.
+
+The language model is stored by default in `${LANCE_LANGUAGE_MODEL_HOME}/lindera/[ipadic|ko-dic|unidic]`.
+
+Using the Model
+~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic")
+
+User Dictionaries
+~~~~~~~~~~~~~~~~~
+
+Create a file named config.json in the root directory of the current model.
+
+.. code-block:: json
+
+    {
+        "main": "main",
+        "user": "path/to/user/dict.bin",
+        "user_type": "ipadic|ko-dic|unidic"
+    }
+
+- The "main" field is optional. If not filled, the default is the "main" directory.
+- "user" is the path of the user dictionary. The user dictionary can be passed as a CSV file or as a binary file compiled by lindera-cli.
+- The "user_type" field can be left blank if the user dictionary is in binary format. If it's in CSV format, you need to specify the type of the language model.
+
+
+Create your own language model
+------------------------------
+
+Put your language model into `LANCE_LANGUAGE_MODEL_HOME`.
+
diff --git a/python/python/lance/download.py b/python/python/lance/download.py
index 66f2558176..778949aaff 100644
--- a/python/python/lance/download.py
+++ b/python/python/lance/download.py
@@ -22,7 +22,7 @@ def check_lindera():
 
 def import_requests():
     try:
-        import requests  # type: ignore
+        import requests
     except Exception:
         raise Exception("requests is not installed, Please pip install requests")
     return requests
diff --git a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.bin
deleted file mode 100644
index ada7701adb197032ce7374b3bed5269a351ca8e2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1612
zcmeHGJxfDD5S;9@P_PhDun<8I6tS?du(0tz_!ImAmKGLycZu2rEJQ3+QasFu2`Fh)
z0{#U{3kzX27_dm8kix>*yq6#fVxic$&F$^Z&CD%#=tCUa0kmHbMuOsQfJR&5{kUNB
zls&Mw#M+P`N30tYY^7P>VE;KlxXNCiz}*Qhc32-|o)?@L?*jAvxF7M1r;cGW?D>kI
zyhaRb{#8)V2WVyl*#V$HtrUa`NRbMl&Qn!3D$E>eTu
zqYQOs*aI~TLtc*nljJV%6W0zz=~`8My+~+T3
zpf!@Haa|!Ol{{X4*iH;5tGW|)7wWFm=O1=*kEJ7Hw3z))G8JQ?VNt_P4Rh+YEHO$F
zN9r@`%j)aRLUy`~_1o%>)m7COI>|?@x+1$h`5(4Cme^<#?+a|tEZzW`{UO2t

diff --git a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv b/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv
deleted file mode 100644
index fae82a570b..0000000000
--- a/python/python/tests/models/lindera/ipadic/ipadic_simple_userdic.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-東京スカイツリー,カスタム名詞,トウキョウスカイツリー
-東武スカイツリーライン,カスタム名詞,トウブスカイツリーライン
-とうきょうスカイツリー駅,カスタム名詞,トウキョウスカイツリーエキ
diff --git a/python/src/lib.rs b/python/src/lib.rs
index f80399dae2..f534d0c3df 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -44,7 +44,6 @@ use futures::StreamExt;
 use lance_index::DatasetIndexExt;
 use pyo3::exceptions::{PyIOError, PyValueError};
 use pyo3::prelude::*;
-use pyo3::types::{PyNone, PyString};
 use session::Session;
 
 #[macro_use]
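For the Lindera path, the flow mirrors the Japanese test added earlier in this series. A minimal sketch, assuming the ipadic model has been downloaded and compiled per docs/tokenizer.rst (the dataset path is illustrative):

```python
import lance
import pyarrow as pa

data = pa.table({"text": ["成田国際空港", "東京国際空港", "羽田空港"]})
ds = lance.write_dataset(data, "/tmp/lindera_demo", mode="overwrite")
ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic")
# With the stock ipadic dictionary, "成田" is split out as its own token,
# so it matches the first row.
print(ds.to_table(full_text_query="成田", prefilter=True, with_row_id=True))
```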
From d19c957d64753e06dc427566d68adb87fd5a590b Mon Sep 17 00:00:00 2001
From: Chongchen Chen
Date: Sat, 28 Dec 2024 16:37:44 +0800
Subject: [PATCH 15/22] update test

---
 docs/tokenizer.rst                            |   4 +-
 .../models/jieba/invalid_dict/config.json     |   6 +
 .../models/jieba/invalid_dict2/config.json    |   3 +
 .../tests/models/jieba/user_dict/config.json  |   2 +-
 .../tests/models/jieba/user_dict/user.txt     |   2 +-
 python/python/tests/models/lindera/README.md  |  28 ++++
 .../models/lindera/invalid_dict/config.json   |   4 +
 .../models/lindera/invalid_dict2/config.json  |   4 +
 .../tests/models/lindera/ipadic/main.zip      | Bin 0 -> 5910 bytes
 .../models/lindera/ipadic/raw/Noun.mock.csv   |   3 +
 .../models/lindera/user_dict/config.json      |   5 +
 .../models/lindera/user_dict/userdic.csv      |   1 +
 .../models/lindera/user_dict2/config.json     |   4 +
 .../models/lindera/user_dict2/userdic.bin     | Bin 0 -> 1226 bytes
 python/python/tests/test_scalar_index.py      | 147 +++++++++++++++++-
 .../src/scalar/inverted/tokenizer.rs          |   6 +-
 .../src/scalar/inverted/tokenizer/lindera.rs  |   2 +-
 17 files changed, 206 insertions(+), 15 deletions(-)
 create mode 100644 python/python/tests/models/jieba/invalid_dict/config.json
 create mode 100644 python/python/tests/models/jieba/invalid_dict2/config.json
 create mode 100644 python/python/tests/models/lindera/README.md
 create mode 100644 python/python/tests/models/lindera/invalid_dict/config.json
 create mode 100644 python/python/tests/models/lindera/invalid_dict2/config.json
 create mode 100644 python/python/tests/models/lindera/ipadic/main.zip
 create mode 100644 python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv
 create mode 100644 python/python/tests/models/lindera/user_dict/config.json
 create mode 100644 python/python/tests/models/lindera/user_dict/userdic.csv
 create mode 100644 python/python/tests/models/lindera/user_dict2/config.json
 create mode 100644 python/python/tests/models/lindera/user_dict2/userdic.bin

diff --git a/docs/tokenizer.rst b/docs/tokenizer.rst
index 602956b26b..306b7919ad 100644
--- a/docs/tokenizer.rst
+++ b/docs/tokenizer.rst
@@ -71,12 +71,12 @@ Create a file named config.json in the root directory of the current model.
 
     {
         "main": "main",
         "user": "path/to/user/dict.bin",
-        "user_type": "ipadic|ko-dic|unidic"
+        "user_kind": "ipadic|ko-dic|unidic"
     }
 
 - The "main" field is optional. If not filled, the default is the "main" directory.
 - "user" is the path of the user dictionary. The user dictionary can be passed as a CSV file or as a binary file compiled by lindera-cli.
-- The "user_type" field can be left blank if the user dictionary is in binary format. If it's in CSV format, you need to specify the type of the language model.
+- The "user_kind" field can be left blank if the user dictionary is in binary format. If it's in CSV format, you need to specify the type of the language model.
 
 
 Create your own language model
 ------------------------------
 
 Put your language model into `LANCE_LANGUAGE_MODEL_HOME`.
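The "Create your own language model" section above is terse, so here is a sketch of materializing a custom Jieba dictionary layout by hand. The `my_dict` name is hypothetical, and the fallback path assumes the Linux local data directory:

```python
import json
import os

home = os.environ.get(
    "LANCE_LANGUAGE_MODEL_HOME",
    os.path.expanduser("~/.local/share/lance/language_models"),  # Linux default
)
root = os.path.join(home, "jieba", "my_dict")
os.makedirs(root, exist_ok=True)

# config.json follows the schema documented above: an optional "main"
# dictionary plus a list of user dictionaries, both relative to this folder.
with open(os.path.join(root, "config.json"), "w") as f:
    json.dump({"main": "../default/dict.txt", "users": ["user.txt"]}, f)

# One `word frequency tag` entry per line, as in jieba's dict format.
with open(os.path.join(root, "user.txt"), "w", encoding="utf-8") as f:
    f.write("光明的前途 318825 n\n")

# The model is then addressable as base_tokenizer="jieba/my_dict".
```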
diff --git a/python/python/tests/models/jieba/invalid_dict/config.json b/python/python/tests/models/jieba/invalid_dict/config.json
new file mode 100644
index 0000000000..cf4301aa2b
--- /dev/null
+++ b/python/python/tests/models/jieba/invalid_dict/config.json
@@ -0,0 +1,6 @@
+{
+    "main": "../default/dict.txt",
+    "users": [
+        "invalid_user.txt"
+    ]
+}
diff --git a/python/python/tests/models/jieba/invalid_dict2/config.json b/python/python/tests/models/jieba/invalid_dict2/config.json
new file mode 100644
index 0000000000..d0216419a5
--- /dev/null
+++ b/python/python/tests/models/jieba/invalid_dict2/config.json
@@ -0,0 +1,3 @@
+{
+    "main": "invalid_dict.txt"
+}
diff --git a/python/python/tests/models/jieba/user_dict/config.json b/python/python/tests/models/jieba/user_dict/config.json
index 5f0541ed4f..0d65334ca2 100644
--- a/python/python/tests/models/jieba/user_dict/config.json
+++ b/python/python/tests/models/jieba/user_dict/config.json
@@ -1,6 +1,6 @@
 {
     "main": "../default/dict.txt",
-    "user": [
+    "users": [
         "user.txt"
     ]
 }
diff --git a/python/python/tests/models/jieba/user_dict/user.txt b/python/python/tests/models/jieba/user_dict/user.txt
index be2d8a9582..bb6ffa4d85 100644
--- a/python/python/tests/models/jieba/user_dict/user.txt
+++ b/python/python/tests/models/jieba/user_dict/user.txt
@@ -1 +1 @@
-光明的前途 1219 n
+光明的前途 318825 n
diff --git a/python/python/tests/models/lindera/README.md b/python/python/tests/models/lindera/README.md
new file mode 100644
index 0000000000..c4073b65d5
--- /dev/null
+++ b/python/python/tests/models/lindera/README.md
@@ -0,0 +1,28 @@
+# How to build this test language model
+
+The IPADIC model is about 45 MB, so we created a tiny IPADIC and shipped it as a zip.
+
+- Download the language model
+
+```bash
+curl -L -o mecab-ipadic-2.7.0-20070801.tar.gz "https://github.com/lindera-morphology/mecab-ipadic/archive/refs/tags/2.7.0-20070801.tar.gz"
+tar xvf mecab-ipadic-2.7.0-20070801.tar.gz
+```
+
+- Remove the CSV files from the extracted folder
+
+- Copy the files in `ipadic/raw` into the folder
+
+- Edit matrix.def and reset the last column (weight) to zero, except for the first row.
+ +- build + +```bash +lindera build --dictionary-kind=ipadic mecab-ipadic-2.7.0-20070801 main +``` + +- build user dict + +```bash +lindera build --build-user-dictionary --dictionary-kind=ipadic user_dict/userdict.csv user_dict2 +``` diff --git a/python/python/tests/models/lindera/invalid_dict/config.json b/python/python/tests/models/lindera/invalid_dict/config.json new file mode 100644 index 0000000000..b486aeba24 --- /dev/null +++ b/python/python/tests/models/lindera/invalid_dict/config.json @@ -0,0 +1,4 @@ +{ + "main": "../main", + "user": "invalid.bin" +} diff --git a/python/python/tests/models/lindera/invalid_dict2/config.json b/python/python/tests/models/lindera/invalid_dict2/config.json new file mode 100644 index 0000000000..11c22e9f1c --- /dev/null +++ b/python/python/tests/models/lindera/invalid_dict2/config.json @@ -0,0 +1,4 @@ +{ + "main": "../main", + "user": "ipadic_simple_userdic.csv" +} diff --git a/python/python/tests/models/lindera/ipadic/main.zip b/python/python/tests/models/lindera/ipadic/main.zip new file mode 100644 index 0000000000000000000000000000000000000000..25966ae2a1d06f509cc06ba46cc1555cca2cbeae GIT binary patch literal 5910 zcmWIWW@Zs#0Du4mnWrBb!pp$kwm3h%8;C)4X$3a}Bg*wPl?Wr`fX48_j7Z5$F3~GX%qa$&rMd)S4v0oGD>(rO(h``Q+!VT5J&c%c z^6;!lFk}euX6Kmu{BWH(&`^-U*sa{e&A=cCvobj&u_!(zHBB!mGmnt1Z?Eq67jYCh z@NxV1-+bKJ+)StV1DUzh)NWbmoZfivUZq@uQ-_$8H$63F0 z+BAK+ET^5}#OBhc*t~|ZdP29vBDNyV>4mKo5h?OUN}G%??OLHzbYgMC?SPCV-u54> z^#4iiRa|_AUGi}0#NJi^rbP%}I-$_d)TQx|QSK4%y+^ioFK$(z>Gzznu_e{!nDC0s ziG@369C%kd(ZAy7oZ92Rw2JEmCG8#rZCG;N{f+FjhccV5TeQo4c_b$CY-2}%#v|iz z0(ZIl{#040+uT03yw@W6WV-d4*{?gL=WqJrD>eJGnC{P@_R8CTJips!-pRDTU6uQD zzwQ11@A9wB{+@iP=GF>t$@#NO(`rS;lU_Z3b~f_U&z6n3HJ57pZ<`m-Kl9q`^YYT= zrS(DI*Ux`H*^YhN{p{kJ<^KjPPFzr>>$ySe5R>)uf!Q+8yO)dCJhlW76Ug^%64T2cRUvirTc`Fl5o2PBJr+Sv2F%PwSQfE@#z;tRs(wenRS`o~q z0qn11zlBfP37x8P zpYk_M^R;Pa#myy-W|JiwAb2o?+ABBngQD5W?vrGij~2Ng3Lm>yXvuz`z~ z(m0DJz_r0pEGf#Y(910$XwTb&hCmhP{(-AHv9OU zuW$B!|bt#NHE+cT5*jD@qlgF2Gz92Z>p{qlg01vwbIR~T53MiRhYNl7H+kK`E# z&V1O?)cH`7S@`g0aUY(|_6CM;Vh^7AVAS537{)N+{HX&6P8_&!;o+ep1`8W!KCI*r zK0L+Phex{MQJ2HVsrfS`%6t>kD!w&$Chk>XKHSIZ!_((b*cNEO<#uoZJJ{tvbPDGu z09_4oUw}6wlL#~JQ9_V15P0hdB8eO`gcu1MHAEiV1C1KOz>-F1n2|(}A;O#sGaGq0 z2Q)kg14|mW!OSLQun^sN*~D;?WI9PMxS>o+XTIIN`h$ U;LXYg@;4~4+yRa>seqF?04AJJ#{d8T literal 0 HcmV?d00001 diff --git a/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv b/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv new file mode 100644 index 0000000000..4201b57a54 --- /dev/null +++ b/python/python/tests/models/lindera/ipadic/raw/Noun.mock.csv @@ -0,0 +1,3 @@ +,1293,1293,5686,̾,ͭ̾,ϰ,,*,*,,ʥ꥿,ʥ꥿ +,1285,1285,553,̾,,*,*,*,*,,, +,1285,1285,7778,̾,,*,*,*,*,,, \ No newline at end of file diff --git a/python/python/tests/models/lindera/user_dict/config.json b/python/python/tests/models/lindera/user_dict/config.json new file mode 100644 index 0000000000..e554849af2 --- /dev/null +++ b/python/python/tests/models/lindera/user_dict/config.json @@ -0,0 +1,5 @@ +{ + "main": "../ipadic/main", + "user": "userdic.csv", + "user_kind": "ipadic" +} diff --git a/python/python/tests/models/lindera/user_dict/userdic.csv b/python/python/tests/models/lindera/user_dict/userdic.csv new file mode 100644 index 0000000000..652c3f7791 --- /dev/null +++ b/python/python/tests/models/lindera/user_dict/userdic.csv @@ -0,0 +1 @@ +成田国際空港,カスタム名詞,トウキョウスカイツリー diff --git a/python/python/tests/models/lindera/user_dict2/config.json b/python/python/tests/models/lindera/user_dict2/config.json new file mode 100644 index 
0000000000..e06bd8c71b --- /dev/null +++ b/python/python/tests/models/lindera/user_dict2/config.json @@ -0,0 +1,4 @@ +{ + "main": "../ipadic/main", + "user": "userdic.bin" +} diff --git a/python/python/tests/models/lindera/user_dict2/userdic.bin b/python/python/tests/models/lindera/user_dict2/userdic.bin new file mode 100644 index 0000000000000000000000000000000000000000..a0410fa0798689aaaea53c73af4972e8ea805aca GIT binary patch literal 1226 zcmeHHF$%&!5FGck(9$2+iqG%~KEx-4JBu{J6&8{cu@EJSn8HFu@dG}Gr%#+~t32{J3ESrIVuL;qZ zaJ&ob?R;AUDu9!3EvZbPOyCa^Xnei#c}tt(Fr?a~#iE`OnmMyvvplf8u$qN>`0%Ip x@4wOhMHFiySI46uH0Q)Kv44#A+g4$qT$T%#8&=D=ux8eB&T7DF#p?92!3)L#ORoR` literal 0 HcmV?d00001 diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 0e51ca70b5..83a45ec499 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -3,9 +3,11 @@ import os import random +import shutil import string from datetime import date, datetime, timedelta from pathlib import Path +import zipfile import lance import numpy as np @@ -33,6 +35,23 @@ def gen_str(n, split="", char_set=string.ascii_letters + string.digits): ) return tbl +def set_language_model_path(): + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join(os.path.dirname(__file__), "models") + + +@pytest.fixture() +def lindera_ipadic(): + set_language_model_path() + model_path = os.path.join(os.path.dirname(__file__), "models", "lindera", "ipadic") + cwd = os.getcwd() + try: + os.chdir(model_path) + with zipfile.ZipFile("main.zip", 'r') as zip_ref: + zip_ref.extractall() + os.chdir(cwd) + yield + finally: + shutil.rmtree(os.path.join(model_path, "main")) @pytest.fixture() def dataset(tmp_path): @@ -325,8 +344,8 @@ def test_fts_all_deleted(dataset): dataset.delete(f"doc = '{first_row_doc}'") dataset.to_table(full_text_query=first_row_doc) - -def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path): +def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path, lindera_ipadic): + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join(os.path.dirname(__file__), "models") data = pa.table( { "text": [ @@ -346,22 +365,136 @@ def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path ) assert results["_rowid"].to_pylist() == [0] +def test_lindera_ipadic_jp_tokenizer_invalid_user_dict_path(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/invalid_dict") + +def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/invalid_dict2") + +def test_lindera_ipadic_jp_tokenizer_csv_user_dict(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + "東京国際空港", + "羽田空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/user_dict") + results = ds.to_table( + full_text_query="成田", + prefilter=True, + with_row_id=True, + ) + assert len(results) == 0 + results = ds.to_table( + full_text_query="成田国際空港", + prefilter=True, + with_row_id=True, + ) + assert 
results["_rowid"].to_pylist() == [0] + +def test_lindera_ipadic_jp_tokenizer_bin_user_dict(tmp_path, lindera_ipadic): + data = pa.table( + { + "text": [ + "成田国際空港", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/user_dict2") + +def test_jieba_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": [ + "我们都有光明的前途", + "光明的前途" + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default") + results = ds.to_table( + full_text_query="我们", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [0] + +def test_jieba_invalid_user_dict_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": [ + "我们都有光明的前途", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/invalid_dict") + -def test_indexed_filter_with_fts_index_with_lindera_ko_tokenizer(tmp_path): +def test_jieba_invalid_main_dict_tokenizer(tmp_path): + set_language_model_path() data = pa.table( { - "text": ["하네다공항한정토트백", "나리타공항한정토트백"], + "text": [ + "我们都有光明的前途", + ], } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ko-dic") + with pytest.raises(OSError): + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/invalid_dict2") +def test_jieba_user_dict_tokenizer(tmp_path): + set_language_model_path() + data = pa.table( + { + "text": [ + "我们都有光明的前途", + "光明的前途" + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/user_dict") + results = ds.to_table( + full_text_query="的前", + prefilter=True, + with_row_id=True, + ) + assert len(results) == 0 results = ds.to_table( - full_text_query="나리타", + full_text_query="光明的前途", prefilter=True, with_row_id=True, ) - assert results["_rowid"].to_pylist() == [1] + assert results["_rowid"].to_pylist() == [1, 0] def test_bitmap_index(tmp_path: Path): diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index cdb60ff0f8..11c161c2f6 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -20,8 +20,8 @@ pub struct TokenizerConfig { /// - `simple`: splits tokens on whitespace and punctuation /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization - /// - `lindera-ipadic`: Japanese tokenizer - /// - `lindera-ko-dic`: Korea tokenizer + /// - `lindera/*`: Lindera tokenizer + /// - `jieba/*`: Jieba tokenizer /// /// `simple` is recommended for most cases and the default value base_tokenizer: String, @@ -170,7 +170,7 @@ fn build_base_tokenizer_builder(name: &str) -> Result Err(Error::invalid_input( format!("unknown base tokenizer {}", name), diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs index ab60790de6..fedb2eb59e 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -93,7 +93,7 @@ impl TokenizerBuilder for LinderaBuilder { Some(conf) => { let user_dictionary = load_user_dictionary_from_config(&conf).map_err(|e| { Error::io( - format!("load lindera tokenizer 
user dictionary err: {e}"), + format!("load lindera tokenizer user dictionary, conf:{conf}, err: {e}"), location!(), ) })?; From d1d98c3b0de03cc78dcc71b03cd1e10b7d240770 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:38:22 +0800 Subject: [PATCH 16/22] format --- python/python/tests/test_scalar_index.py | 48 ++++++++++++++++-------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 83a45ec499..c24b7abb20 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -5,9 +5,9 @@ import random import shutil import string +import zipfile from datetime import date, datetime, timedelta from pathlib import Path -import zipfile import lance import numpy as np @@ -35,8 +35,11 @@ def gen_str(n, split="", char_set=string.ascii_letters + string.digits): ) return tbl + def set_language_model_path(): - os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join(os.path.dirname(__file__), "models") + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join( + os.path.dirname(__file__), "models" + ) @pytest.fixture() @@ -46,13 +49,14 @@ def lindera_ipadic(): cwd = os.getcwd() try: os.chdir(model_path) - with zipfile.ZipFile("main.zip", 'r') as zip_ref: + with zipfile.ZipFile("main.zip", "r") as zip_ref: zip_ref.extractall() os.chdir(cwd) yield finally: shutil.rmtree(os.path.join(model_path, "main")) + @pytest.fixture() def dataset(tmp_path): tbl = create_table() @@ -344,8 +348,13 @@ def test_fts_all_deleted(dataset): dataset.delete(f"doc = '{first_row_doc}'") dataset.to_table(full_text_query=first_row_doc) -def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path, lindera_ipadic): - os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join(os.path.dirname(__file__), "models") + +def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer( + tmp_path, lindera_ipadic +): + os.environ["LANCE_LANGUAGE_MODEL_HOME"] = os.path.join( + os.path.dirname(__file__), "models" + ) data = pa.table( { "text": [ @@ -365,6 +374,7 @@ def test_indexed_filter_with_fts_index_with_lindera_ipadic_jp_tokenizer(tmp_path ) assert results["_rowid"].to_pylist() == [0] + def test_lindera_ipadic_jp_tokenizer_invalid_user_dict_path(tmp_path, lindera_ipadic): data = pa.table( { @@ -375,9 +385,14 @@ def test_lindera_ipadic_jp_tokenizer_invalid_user_dict_path(tmp_path, lindera_ip ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") with pytest.raises(OSError): - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/invalid_dict") + ds.create_scalar_index( + "text", "INVERTED", base_tokenizer="lindera/invalid_dict" + ) + -def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type(tmp_path, lindera_ipadic): +def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type( + tmp_path, lindera_ipadic +): data = pa.table( { "text": [ @@ -387,7 +402,10 @@ def test_lindera_ipadic_jp_tokenizer_csv_user_dict_without_type(tmp_path, linder ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") with pytest.raises(OSError): - ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/invalid_dict2") + ds.create_scalar_index( + "text", "INVERTED", base_tokenizer="lindera/invalid_dict2" + ) + def test_lindera_ipadic_jp_tokenizer_csv_user_dict(tmp_path, lindera_ipadic): data = pa.table( @@ -414,6 +432,7 @@ def test_lindera_ipadic_jp_tokenizer_csv_user_dict(tmp_path, lindera_ipadic): ) assert results["_rowid"].to_pylist() == 
[0] + def test_lindera_ipadic_jp_tokenizer_bin_user_dict(tmp_path, lindera_ipadic): data = pa.table( { @@ -425,14 +444,12 @@ def test_lindera_ipadic_jp_tokenizer_bin_user_dict(tmp_path, lindera_ipadic): ds = lance.write_dataset(data, tmp_path, mode="overwrite") ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/user_dict2") + def test_jieba_tokenizer(tmp_path): set_language_model_path() data = pa.table( { - "text": [ - "我们都有光明的前途", - "光明的前途" - ], + "text": ["我们都有光明的前途", "光明的前途"], } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") @@ -444,6 +461,7 @@ def test_jieba_tokenizer(tmp_path): ) assert results["_rowid"].to_pylist() == [0] + def test_jieba_invalid_user_dict_tokenizer(tmp_path): set_language_model_path() data = pa.table( @@ -471,14 +489,12 @@ def test_jieba_invalid_main_dict_tokenizer(tmp_path): with pytest.raises(OSError): ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/invalid_dict2") + def test_jieba_user_dict_tokenizer(tmp_path): set_language_model_path() data = pa.table( { - "text": [ - "我们都有光明的前途", - "光明的前途" - ], + "text": ["我们都有光明的前途", "光明的前途"], } ) ds = lance.write_dataset(data, tmp_path, mode="overwrite") From 987e883f1110528dfb2277e7943ae68c5aad04ed Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:40:12 +0800 Subject: [PATCH 17/22] format --- rust/lance-index/src/scalar/inverted/tokenizer.rs | 2 +- rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs | 3 +++ rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 11c161c2f6..252ece7e8f 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -188,7 +188,7 @@ pub const LANCE_LANGUAGE_MODEL_CONFIG_FILE: &str = "config.json"; pub fn language_model_home() -> Option { match env::var(LANCE_LANGUAGE_MODEL_HOME_ENV_KEY) { Ok(p) => Some(PathBuf::from(p)), - Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY)) + Err(_) => dirs::data_local_dir().map(|p| p.join(LANCE_LANGUAGE_MODEL_DEFAULT_DIRECTORY)), } } diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs index a874ca3bd4..0c04dc236a 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + use std::path::PathBuf; use super::TokenizerBuilder; diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs index fedb2eb59e..5ce6a5ab36 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + use std::path::PathBuf; use super::TokenizerBuilder; From e83a18a0e5a159aa96048c3cbf5de087c2e0ed76 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:41:53 +0800 Subject: [PATCH 18/22] format --- Cargo.lock | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7ed15c3203..bcf001e073 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,12 +17,6 @@ version = "2.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" -[[package]] -name = "adler2" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" - [[package]] name = "adler32" version = "1.2.0" From 466d26ecc2e894d7ff0e7f9310e20d1d744fd5b3 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:43:20 +0800 Subject: [PATCH 19/22] format --- python/python/lance/download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/python/lance/download.py b/python/python/lance/download.py index 778949aaff..cff42520e4 100644 --- a/python/python/lance/download.py +++ b/python/python/lance/download.py @@ -37,8 +37,8 @@ def download_jieba(): "https://github.com/messense/jieba-rs/raw/refs/heads/main/src/data/dict.txt" ) content = resp.content - with open(os.path.join(dirname, "dict.txt"), "wb") as fo: - fo.write(content) + with open(os.path.join(dirname, "dict.txt"), "wb") as out: + out.write(content) except Exception as _: traceback.print_exc() print( From f01777cd6653a9938b5fcc1d75b0112dd4568255 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 16:46:03 +0800 Subject: [PATCH 20/22] format --- Cargo.lock | 82 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bcf001e073..751769ff6e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -936,7 +936,7 @@ dependencies = [ "addr2line", "cfg-if", "libc", - "miniz_oxide 0.7.4", + "miniz_oxide", "object", "rustc-demangle", "windows-targets 0.52.6", @@ -1600,7 +1600,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -1611,7 +1611,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -2111,7 +2111,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -2121,7 +2121,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -2452,7 +2452,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide 0.8.0", + "miniz_oxide", ] [[package]] @@ -2467,6 +2467,21 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -2960,7 +2975,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" dependencies = [ "bytes", "http-body-util", - "hyper 1.4.1", + "hyper 1.5.1", "hyper-util", 
"native-tls", "tokio", @@ -3137,6 +3152,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -3178,7 +3199,7 @@ dependencies = [ "libflate", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -4093,7 +4114,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror 2.0.3", + "thiserror 2.0.4", "yada", ] @@ -4239,15 +4260,6 @@ dependencies = [ "adler2", ] -[[package]] -name = "miniz_oxide" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" -dependencies = [ - "adler2", -] - [[package]] name = "mio" version = "1.0.3" @@ -4342,7 +4354,7 @@ dependencies = [ "openssl-probe", "openssl-sys", "schannel", - "security-framework", + "security-framework 2.11.1", "security-framework-sys", "tempfile", ] @@ -4589,7 +4601,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.90", ] [[package]] @@ -6323,6 +6335,27 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.6.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -7026,6 +7059,15 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-segmentation" version = "1.12.0" From a25db786a3281f2491617402d5320513e63c371f Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 18:02:40 +0800 Subject: [PATCH 21/22] format --- python/src/lib.rs | 12 +++++------ .../src/scalar/inverted/tokenizer.rs | 4 ++-- .../src/scalar/inverted/tokenizer/jieba.rs | 19 +++++------------- .../src/scalar/inverted/tokenizer/lindera.rs | 20 +++++-------------- 4 files changed, 18 insertions(+), 37 deletions(-) diff --git a/python/src/lib.rs b/python/src/lib.rs index f534d0c3df..9e46ce8883 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -177,14 +177,14 @@ fn json_to_schema(json: &str) -> PyResult> { #[pyfunction] pub fn language_model_home() -> PyResult { let Some(p) = lance_index::scalar::inverted::language_model_home() else { - return Err(pyo3::exceptions::PyValueError::new_err(format!( - "Failed to get language model home" - ))); + return Err(pyo3::exceptions::PyValueError::new_err( + "Failed to get language model home", + )); }; let Some(pstr) = p.to_str() else { - return Err(pyo3::exceptions::PyValueError::new_err(format!( - "Failed to convert language model home to str" - 
)));
+        return Err(pyo3::exceptions::PyValueError::new_err(
+            "Failed to convert language model home to str",
+        ));
     };
     Ok(String::from(pstr))
 }
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index 252ece7e8f..7d34710286 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -195,7 +195,7 @@ pub fn language_model_home() -> Option<PathBuf> {
 #[cfg(feature = "tokenizer-common")]
 trait TokenizerBuilder: Sized {
     type Config: serde::de::DeserializeOwned + Default;
-    fn load(p: &PathBuf) -> Result<Self> {
+    fn load(p: &std::path::Path) -> Result<Self> {
         if !p.is_dir() {
             return Err(Error::io(
                 format!("{} is not a valid directory", p.display()),
@@ -214,7 +214,7 @@ trait TokenizerBuilder: Sized {
         Self::new(config, p)
     }
 
-    fn new(config: Self::Config, root: &PathBuf) -> Result<Self>;
+    fn new(config: Self::Config, root: &std::path::Path) -> Result<Self>;
 
     fn build(&self) -> Result<tantivy::tokenizer::TextAnalyzerBuilder>;
 }
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs
index 0c04dc236a..063cf0b4a9 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs
@@ -1,28 +1,19 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 use super::TokenizerBuilder;
 use lance_core::{Error, Result};
 use serde::{Deserialize, Serialize};
 use snafu::{location, Location};
 
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Default)]
 pub struct JiebaConfig {
     main: Option<String>,
     users: Option<Vec<String>>,
 }
 
-impl Default for JiebaConfig {
-    fn default() -> Self {
-        Self {
-            main: Default::default(),
-            users: Default::default(),
-        }
-    }
-}
-
 pub struct JiebaBuilder {
     root: PathBuf,
     config: JiebaConfig,
@@ -38,10 +38,10 @@ impl JiebaBuilder {
 impl TokenizerBuilder for JiebaBuilder {
     type Config = JiebaConfig;
 
-    fn new(config: Self::Config, root: &PathBuf) -> Result<Self> {
-        Ok(JiebaBuilder {
+    fn new(config: Self::Config, root: &Path) -> Result<Self> {
+        Ok(Self {
             config,
-            root: root.clone(),
+            root: root.to_path_buf(),
         })
     }
 
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs
index 5ce6a5ab36..23c8042dd0 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer/lindera.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 
 use super::TokenizerBuilder;
 use lance_core::{Error, Result};
@@ -17,23 +17,13 @@ use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value};
 use snafu::{location, Location};
 
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Default)]
 pub struct LinderaConfig {
     main: Option<String>,
     user: Option<String>,
     user_kind: Option<String>,
 }
 
-impl Default for LinderaConfig {
-    fn default() -> Self {
-        Self {
-            main: Default::default(),
-            user: Default::default(),
-            user_kind: Default::default(),
-        }
-    }
-}
-
 pub struct LinderaBuilder {
     root: PathBuf,
     config: LinderaConfig,
@@ -73,10 +63,10 @@ impl LinderaBuilder {
 impl TokenizerBuilder for LinderaBuilder {
     type Config = LinderaConfig;
 
-    fn new(config: Self::Config, root: &PathBuf) -> Result<Self> {
-        Ok(LinderaBuilder {
+    fn new(config: Self::Config, root: &Path) -> Result<Self> {
+        Ok(Self {
             config,
-            root: 
root.clone(), + root: root.to_path_buf(), }) } From eeebd1650ad35ed45e552f095f712019a69176b1 Mon Sep 17 00:00:00 2001 From: Chongchen Chen Date: Sat, 28 Dec 2024 18:15:51 +0800 Subject: [PATCH 22/22] format --- rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs index 063cf0b4a9..95445fb544 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/jieba.rs @@ -73,7 +73,7 @@ impl TokenizerBuilder for JiebaBuilder { ) })? } - let tokenizer = JiebaTokenizer { jieba: jieba }; + let tokenizer = JiebaTokenizer { jieba }; Ok(tantivy::tokenizer::TextAnalyzer::builder(tokenizer).dynamic()) } }
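Finally, the Korean model that the series title mentions can be exercised the same way. A closing sketch, assuming ko-dic has been downloaded and compiled via `python -m lance.download lindera -l ko-dic` and with an illustrative dataset path, based on the ko-dic test that appeared earlier in the series:

```python
import lance
import pyarrow as pa

data = pa.table({"text": ["하네다공항한정토트백", "나리타공항한정토트백"]})
ds = lance.write_dataset(data, "/tmp/kodic_demo", mode="overwrite")
ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ko-dic")

# ko-dic segments the compound, so the reading "나리타" (Narita) matches row 1.
tbl = ds.to_table(full_text_query="나리타", prefilter=True, with_row_id=True)
assert tbl["_rowid"].to_pylist() == [1]
```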