From 8789cc12a9e60ece0fcce83fd4406328d44dd4a5 Mon Sep 17 00:00:00 2001 From: akashmangoai Date: Sun, 1 Sep 2024 22:12:55 +0530 Subject: [PATCH 1/2] added support for lancedb vectordb Signed-off-by: akashmangoai --- README.md | 1 + examples/data_manager/vector_store.py | 1 + gptcache/manager/vector_data/lancedb.py | 85 ++++++++++++++++++++++++ gptcache/manager/vector_data/manager.py | 23 +++++++ gptcache/utils/__init__.py | 3 + tests/unit_tests/manager/test_lancedb.py | 24 +++++++ 6 files changed, 137 insertions(+) create mode 100644 gptcache/manager/vector_data/lancedb.py create mode 100644 tests/unit_tests/manager/test_lancedb.py diff --git a/README.md b/README.md index c5f6f955..c75e1b06 100644 --- a/README.md +++ b/README.md @@ -360,6 +360,7 @@ The **Vector Store** module helps find the K most similar requests from the inpu - [x] Support [DocArray](https://github.com/docarray/docarray), DocArray is a library for representing, sending and storing multi-modal data, perfect for Machine Learning applications. - [x] Support qdrant - [x] Support weaviate + - [x] Support [LanceDB](https://github.com/lancedb/lancedb),Developer-friendly, serverless vector database for AI applications. Easily add long-term memory to your LLM apps! - [ ] Support other vector databases. - **Cache Manager**: The **Cache Manager** is responsible for controlling the operation of both the **Cache Storage** and **Vector Store**. diff --git a/examples/data_manager/vector_store.py b/examples/data_manager/vector_store.py index 4d804d38..194010ed 100644 --- a/examples/data_manager/vector_store.py +++ b/examples/data_manager/vector_store.py @@ -20,6 +20,7 @@ def run(): 'docarray', 'redis', 'weaviate', + 'lancedb', ] for vector_store in vector_stores: cache_base = CacheBase('sqlite') diff --git a/gptcache/manager/vector_data/lancedb.py b/gptcache/manager/vector_data/lancedb.py new file mode 100644 index 00000000..3bdbad6f --- /dev/null +++ b/gptcache/manager/vector_data/lancedb.py @@ -0,0 +1,85 @@ +from typing import List, Optional + +import numpy as np +import pyarrow as pa + +import lancedb +from gptcache.manager.vector_data.base import VectorBase, VectorData +from gptcache.utils import import_lancedb, import_torch + +import_torch() +import_lancedb() + +class LanceDB(VectorBase): + """Vector store: LanceDB + + :param persist_directory: The directory to persist, defaults to '/tmp/lancedb'. + :type persist_directory: str + :param table_name: The name of the table in LanceDB, defaults to 'gptcache'. + :type table_name: str + :param top_k: The number of the vectors results to return, defaults to 1. + :type top_k: int + """ + + def __init__( + self, + persist_directory: Optional[str] = "/tmp/lancedb", + table_name: str = "gptcache", + top_k: int = 1, + ): + self._persist_directory = persist_directory + self._table_name = table_name + self._top_k = top_k + + # Initialize LanceDB database + self._db = lancedb.connect(self._persist_directory) + + # Define the schema if creating a new table + schema = pa.schema([ + pa.field("id", pa.string()), + pa.field("vector", pa.list_(pa.float32(), list_size=10)) # Assuming dimension 10 for vectors + ]) + + # Initialize or open table + if self._table_name not in self._db.table_names(): + self._table = self._db.create_table(self._table_name, schema=schema) + else: + self._table = self._db.open_table(self._table_name) + + def mul_add(self, datas: List[VectorData]): + """Add multiple vectors to the LanceDB table""" + vectors, ids = map(list, zip(*((data.data.tolist(), str(data.id)) for data in datas))) + data = [{"id": id, "vector": vector} for id, vector in zip(ids, vectors)] + self._table.add(data) + + def search(self, data: np.ndarray, top_k: int = -1): + """Search for the most similar vectors in the LanceDB table""" + if len(self._table) == 0: + return [] + + if top_k == -1: + top_k = self._top_k + + results = self._table.search(data.tolist()).limit(top_k).to_list() + return [(result["_distance"], int(result["id"])) for result in results] + + def delete(self, ids: List[int]): + """Delete vectors from the LanceDB table based on IDs""" + for id in ids: + self._table.delete(f"id = '{id}'") + + def rebuild(self, ids: Optional[List[int]] = None): + """Rebuild the index, if applicable""" + return True + + def flush(self): + """Flush changes to disk (if necessary)""" + pass + + def close(self): + """Close the connection to LanceDB""" + pass + + def count(self): + """Return the total number of vectors in the table""" + return len(self._table) diff --git a/gptcache/manager/vector_data/manager.py b/gptcache/manager/vector_data/manager.py index 815fb934..2314654d 100644 --- a/gptcache/manager/vector_data/manager.py +++ b/gptcache/manager/vector_data/manager.py @@ -42,6 +42,7 @@ class VectorBase: `Chromadb` (with `top_k`, `client_settings`, `persist_directory`, `collection_name` params), `Hnswlib` (with `index_file_path`, `dimension`, `top_k`, `max_elements` params). `pgvector` (with `url`, `collection_name`, `index_params`, `top_k`, `dimension` params). + `lancedb` (with `url`, `collection_name`, `index_params`, `top_k`,). :param name: the name of the vectorbase, it is support 'milvus', 'faiss', 'chromadb', 'hnswlib' now. :type name: str @@ -89,6 +90,14 @@ class VectorBase: :param persist_directory: the directory to persist, defaults to '.chromadb/' in the current directory. :type persist_directory: str + :param client_settings: the setting for LanceDB. + :param persist_directory: The directory to persist, defaults to '/tmp/lancedb'. + :type persist_directory: str + :param table_name: The name of the table in LanceDB, defaults to 'gptcache'. + :type table_name: str + :param top_k: The number of the vectors results to return, defaults to 1. + :type top_k: int + :param index_path: the path to hnswlib index, defaults to 'hnswlib_index.bin'. :type index_path: str :param max_elements: max_elements of hnswlib, defaults 100000. @@ -289,6 +298,20 @@ def get(name, **kwargs): class_schema=class_schema, top_k=top_k, ) + + elif name == "lancedb": + from gptcache.manager.vector_data.lancedb import LanceDB + + persist_directory = kwargs.get("persist_directory", None) + table_name = kwargs.get("table_name", COLLECTION_NAME) + top_k: int = kwargs.get("top_k", TOP_K) + + vector_base = LanceDB( + persist_directory=persist_directory, + table_name=table_name, + top_k=top_k, + ) + else: raise NotFoundError("vector store", name) return vector_base diff --git a/gptcache/utils/__init__.py b/gptcache/utils/__init__.py index 093fd354..53251aa7 100644 --- a/gptcache/utils/__init__.py +++ b/gptcache/utils/__init__.py @@ -42,6 +42,7 @@ "import_redis", "import_qdrant", "import_weaviate", + "import_lancedb", ] import importlib.util @@ -147,6 +148,8 @@ def import_duckdb(): _check_library("duckdb", package="duckdb") _check_library("duckdb-engine", package="duckdb-engine") +def import_lancedb(): + _check_library("lancedb", package="lancedb") def import_sql_client(db_name): if db_name == "postgresql": diff --git a/tests/unit_tests/manager/test_lancedb.py b/tests/unit_tests/manager/test_lancedb.py new file mode 100644 index 00000000..f7d98600 --- /dev/null +++ b/tests/unit_tests/manager/test_lancedb.py @@ -0,0 +1,24 @@ +import unittest +import numpy as np +from gptcache.manager import VectorBase +from gptcache.manager.vector_data.base import VectorData + +class TestLanceDB(unittest.TestCase): + def test_normal(self): + # Initialize the LanceDB with a temporary directory and top_k set to 3 + db = VectorBase("lancedb", persist_directory="/tmp/test_lancedb", top_k=3) + + # Add 100 vectors to the LanceDB + db.mul_add([VectorData(id=i, data=np.random.sample(10)) for i in range(100)]) + + # Perform a search with a random query vector + search_res = db.search(np.random.sample(10)) + + # Check that the search returns 3 results + self.assertEqual(len(search_res), 3) + + # Delete vectors with specific IDs + db.delete([1, 3, 5, 7]) + + # Check that the count of vectors in the table is now 96 + self.assertEqual(db.count(), 96) From bdca143844621c2b1047eac99364f07ee2a8303c Mon Sep 17 00:00:00 2001 From: Akash A Desai <62583018+akashAD98@users.noreply.github.com> Date: Mon, 2 Sep 2024 21:33:20 +0530 Subject: [PATCH 2/2] fixed embeddings dim logic --- gptcache/manager/vector_data/lancedb.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/gptcache/manager/vector_data/lancedb.py b/gptcache/manager/vector_data/lancedb.py index 3bdbad6f..72a11ba0 100644 --- a/gptcache/manager/vector_data/lancedb.py +++ b/gptcache/manager/vector_data/lancedb.py @@ -10,9 +10,9 @@ import_torch() import_lancedb() + class LanceDB(VectorBase): """Vector store: LanceDB - :param persist_directory: The directory to persist, defaults to '/tmp/lancedb'. :type persist_directory: str :param table_name: The name of the table in LanceDB, defaults to 'gptcache'. @@ -34,21 +34,28 @@ def __init__( # Initialize LanceDB database self._db = lancedb.connect(self._persist_directory) - # Define the schema if creating a new table - schema = pa.schema([ - pa.field("id", pa.string()), - pa.field("vector", pa.list_(pa.float32(), list_size=10)) # Assuming dimension 10 for vectors - ]) - # Initialize or open table if self._table_name not in self._db.table_names(): - self._table = self._db.create_table(self._table_name, schema=schema) + self._table = None # Table will be created with the first insertion else: self._table = self._db.open_table(self._table_name) def mul_add(self, datas: List[VectorData]): """Add multiple vectors to the LanceDB table""" vectors, ids = map(list, zip(*((data.data.tolist(), str(data.id)) for data in datas))) + + # Infer the dimension of the vectors + vector_dim = len(vectors[0]) if vectors else 0 + + # Create table with the inferred schema if it doesn't exist + if self._table is None: + schema = pa.schema([ + pa.field("id", pa.string()), + pa.field("vector", pa.list_(pa.float32(), list_size=vector_dim)) + ]) + self._table = self._db.create_table(self._table_name, schema=schema) + + # Prepare data for insertion data = [{"id": id, "vector": vector} for id, vector in zip(ids, vectors)] self._table.add(data)