Merge pull request #17 from epinzur/add-colbert-example

added a colbert example
epinzur · Jun 14, 2024 · f409cd8 · f409cd8
2 parents bc107f4 + 0aae88e
commit f409cd8
Show file tree

Hide file tree

Showing 3 changed files with 163 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -47,14 +47,14 @@ pip install ragulate
   any other variables that you will pass during your experimentation. The method should ingest the passed
   file into your vector store.
 
-   See the `ingest()` method in [experiment_chunk_size_and_k.py](experiment_chunk_size_and_k.py) as an example.
+   See the `ingest()` method in [open_ai_chunk_size_and_k.py](open_ai_chunk_size_and_k.py) as an example.
    This method configures an ingest pipeline using the parameter `chunk_size` and ingests the file passed.
 
 1. Wrap your query pipeline in a single python method, and return it. The method should have parameters for
   any variables that you will pass during your experimentation. Currently only LangChain LCEL query pipelines
   are supported.
 
-   See the `query()` method in [experiment_chunk_size_and_k.py](experiment_chunk_size_and_k.py) as an example.
+   See the `query_pipeline()` method in [open_ai_chunk_size_and_k.py](open_ai_chunk_size_and_k.py) as an example.
    This method returns a LangChain LCEL pipeline configured by the parameters `chunk_size` and `k`.
 
 Note: It is helpful to have a `**kwargs` param in your pipeline method definitions, so that if extra params
@@ -81,7 +81,7 @@ commands:
 
 ### Example
 
-For the examples below, we will use the example experiment [experiment_chunk_size_and_k.py](experiment_chunk_size_and_k.py)
+For the examples below, we will use the example experiment [open_ai_chunk_size_and_k.py](open_ai_chunk_size_and_k.py)
 and see how the RAG metrics change for changes in `chunk_size` and `k` (number of documents retrieved).
 
 1. Download a dataset. See available datasets here: https://llamahub.ai/?tab=llama_datasets
@@ -98,12 +98,12 @@ and see how the RAG metrics change for changes in `chunk_size` and `k` (number o
     Examples:
     * Ingest with `chunk_size=500`:
       ```
-      ragulate ingest -n chunk_size_500 -s experiment_chunk_size_and_k.py -m ingest \
+      ragulate ingest -n chunk_size_500 -s open_ai_chunk_size_and_k.py -m ingest \
       --var-name chunk_size --var-value 500 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
       ```
     * Ingest with `chunk_size=1000`:
       ```
-      ragulate ingest -n chunk_size_1000 -s experiment_chunk_size_and_k.py -m ingest \
+      ragulate ingest -n chunk_size_1000 -s open_ai_chunk_size_and_k.py -m ingest \
       --var-name chunk_size --var-value 1000 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
       ```
 
@@ -112,25 +112,25 @@ and see how the RAG metrics change for changes in `chunk_size` and `k` (number o
     Examples:
     * Query with `chunk_size=500` and `k=2`
       ```
-      ragulate query -n chunk_size_500_k_2 -s experiment_chunk_size_and_k.py -m query_pipeline \
+      ragulate query -n chunk_size_500_k_2 -s open_ai_chunk_size_and_k.py -m query_pipeline \
       --var-name chunk_size --var-value 500  --var-name k --var-value 2 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
       ```
 
     * Query with `chunk_size=1000` and `k=2`
       ```
-      ragulate query -n chunk_size_1000_k_2 -s experiment_chunk_size_and_k.py -m query_pipeline \
+      ragulate query -n chunk_size_1000_k_2 -s open_ai_chunk_size_and_k.py -m query_pipeline \
       --var-name chunk_size --var-value 1000  --var-name k --var-value 2 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
       ```
 
     * Query with `chunk_size=500` and `k=5`
       ```
-      ragulate query -n chunk_size_500_k_5 -s experiment_chunk_size_and_k.py -m query_pipeline \
+      ragulate query -n chunk_size_500_k_5 -s open_ai_chunk_size_and_k.py -m query_pipeline \
       --var-name chunk_size --var-value 500  --var-name k --var-value 5 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
       ```
 
     * Query with `chunk_size=1000` and `k=5`
       ```
-      ragulate query -n chunk_size_1000_k_5 -s experiment_chunk_size_and_k.py -m query_pipeline \
+      ragulate query -n chunk_size_1000_k_5 -s open_ai_chunk_size_and_k.py -m query_pipeline \
       --var-name chunk_size --var-value 1000  --var-name k --var-value 5 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
       ```
 

diff --git a/colbert_chunk_size_and_k.py b/colbert_chunk_size_and_k.py
@@ -0,0 +1,154 @@
+import os
+import time
+from pathlib import Path
+from typing import List
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import UnstructuredFileLoader
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_openai import ChatOpenAI
+from ragstack_colbert import (
+    CassandraDatabase,
+    Chunk,
+    ColbertEmbeddingModel,
+    ColbertVectorStore,
+)
+from ragstack_langchain.colbert import ColbertVectorStore as LangChainColbertVectorStore
+from transformers import BertTokenizer
+
+# Chat model name handed to ChatOpenAI when the query pipeline is built.
+LLM_MODEL = "gpt-3.5-turbo"
+
+# Batch size passed to every ColbertEmbeddingModel created by this script.
+batch_size = 640
+
+# Astra connection settings. os.getenv yields None when a variable is unset,
+# and that None is forwarded unchanged to CassandraDatabase.from_astra.
+astra_token = os.getenv("ASTRA_DB_TOKEN_COLBERT2")
+database_id = os.getenv("ASTRA_DB_ID_COLBERT2")
+keyspace = "ragulate"
+
+import logging
+# Configure logging at INFO, but limit the "unstructured" and "cassandra"
+# loggers to ERROR and above.
+logging.basicConfig(level=logging.INFO)
+logging.getLogger("unstructured").setLevel(logging.ERROR)
+logging.getLogger("cassandra").setLevel(logging.ERROR)
+
+
+def get_embedding_model(chunk_size: int) -> ColbertEmbeddingModel:
+    """Create a ColBERT embedding model whose ``doc_maxlen`` is ``chunk_size``.
+
+    The module-level ``batch_size`` is used as the model's batch size.
+    """
+    model = ColbertEmbeddingModel(batch_size=batch_size, doc_maxlen=chunk_size)
+    return model
+
+
+def get_database(chunk_size: int) -> CassandraDatabase:
+    """Build the ``CassandraDatabase`` for the table tied to ``chunk_size``.
+
+    Each chunk size gets its own table, named ``colbert_chunk_size_<chunk_size>``,
+    inside the module-level ``keyspace``. Credentials come from the
+    module-level ``astra_token`` and ``database_id``.
+    """
+    return CassandraDatabase.from_astra(
+        astra_token=astra_token,
+        database_id=database_id,
+        keyspace=keyspace,
+        table_name=f"colbert_chunk_size_{chunk_size}",
+        timeout=500,
+    )
+
+
+def get_lc_vector_store(chunk_size: int) -> LangChainColbertVectorStore:
+    """Return the LangChain ColBERT vector store for the given ``chunk_size``.
+
+    The store is wired to the database and the embedding model that
+    ``get_database`` and ``get_embedding_model`` produce for that chunk size.
+    """
+    return LangChainColbertVectorStore(
+        database=get_database(chunk_size=chunk_size),
+        embedding_model=get_embedding_model(chunk_size=chunk_size),
+    )
+
+
+def get_vector_store(chunk_size: int) -> ColbertVectorStore:
+    """Return a ``ColbertVectorStore`` over the database for ``chunk_size``."""
+    return ColbertVectorStore(database=get_database(chunk_size=chunk_size))
+
+
+# Shared tokenizer, loaded once at import time; len_function measures text with it.
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+
+def len_function(text: str) -> int:
+    """Return how many tokens ``tokenizer`` produces for ``text``."""
+    tokens = tokenizer.tokenize(text)
+    return len(tokens)
+
+
+async def ingest(file_path: str, chunk_size: int, **kwargs):
+    """Load, chunk, embed and store a single file in the ColBERT vector store.
+
+    Steps, each timed and reported with ``print``:
+      1. Load the file as one document with ``UnstructuredFileLoader``.
+      2. Split it with ``RecursiveCharacterTextSplitter``, measuring length
+         in tokens via ``len_function``.
+      3. Embed the chunk texts with the model from ``get_embedding_model``.
+      4. Call ``adelete_chunks`` for this document id, then insert the new
+         chunks with ``aadd_chunks``.
+
+    Args:
+        file_path: Path of the file to ingest; its base name becomes the
+            document id.
+        chunk_size: Maximum chunk length handed to the splitter; also selects
+            the embedding model configuration and the target table.
+        **kwargs: Extra experiment variables; accepted and ignored.
+
+    Raises:
+        AssertionError: If the loader does not return exactly one document.
+    """
+    doc_id = Path(file_path).name
+
+    # Overlap is a quarter of the chunk size, capped at 64. Floor division
+    # keeps it an integer, matching the splitter's integer chunk_overlap.
+    chunk_overlap = min(chunk_size // 4, 64)
+
+    # perf_counter is monotonic, so durations are immune to clock adjustments.
+    start = time.perf_counter()
+    docs = UnstructuredFileLoader(
+        file_path=file_path, mode="single", strategy="fast"
+    ).load()
+    duration = time.perf_counter() - start
+    print(f"It took {duration} seconds to load and parse the document")
+
+    # confirm only one document returned per file
+    assert len(docs) == 1
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len_function,
+    )
+
+    start = time.perf_counter()
+    chunked_docs = text_splitter.split_documents(docs)
+    duration = time.perf_counter() - start
+    print(
+        f"It took {duration} seconds to split the document into {len(chunked_docs)} chunks"
+    )
+
+    texts = [doc.page_content for doc in chunked_docs]
+    start = time.perf_counter()
+    embeddings = get_embedding_model(chunk_size=chunk_size).embed_texts(texts=texts)
+    duration = time.perf_counter() - start
+    print(f"It took {duration} seconds to embed {len(chunked_docs)} chunks")
+
+    colbert_vector_store = get_vector_store(chunk_size=chunk_size)
+
+    # Delete whatever is stored under this document id before inserting.
+    await colbert_vector_store.adelete_chunks(doc_ids=[doc_id])
+
+    # Index into embeddings (rather than zip) so a length mismatch still
+    # raises instead of silently dropping chunks.
+    chunks: List[Chunk] = [
+        Chunk(
+            doc_id=doc_id,
+            chunk_id=i,
+            text=doc.page_content,
+            metadata={} if doc.metadata is None else doc.metadata,
+            embedding=embeddings[i],
+        )
+        for i, doc in enumerate(chunked_docs)
+    ]
+
+    start = time.perf_counter()
+    await colbert_vector_store.aadd_chunks(chunks=chunks, concurrent_inserts=100)
+    duration = time.perf_counter() - start
+    print(
+        f"It took {duration} seconds to insert {len(chunked_docs)} chunks into AstraDB"
+    )
+
+
+def query_pipeline(k: int, chunk_size: int, **kwargs):
+    """Assemble and return the LCEL RAG chain for one experiment setting.
+
+    Args:
+        k: Value passed to the retriever as ``search_kwargs={"k": k}``.
+        chunk_size: Selects which vector store (table and embedding model
+            configuration) the retriever reads from.
+        **kwargs: Extra experiment variables; accepted and ignored.
+
+    Returns:
+        A chain that feeds the retriever output as ``context`` and the
+        incoming input as ``question`` into the prompt, then the chat model,
+        then ``StrOutputParser``.
+    """
+    store = get_lc_vector_store(chunk_size=chunk_size)
+    chat_model = ChatOpenAI(model_name=LLM_MODEL)
+
+    # build a prompt
+    prompt_template = """
+    Answer the question based only on the supplied context. If you don't know the answer, say: "I don't know".
+    Context: {context}
+    Question: {question}
+    Your answer:
+    """
+    rag_prompt = ChatPromptTemplate.from_template(prompt_template)
+
+    # The input is passed through untouched as the question, while the
+    # retriever supplies the context for it.
+    retriever = store.as_retriever(search_kwargs={"k": k})
+    chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
+
+    return chain_inputs | rag_prompt | chat_model | StrOutputParser()
diff --git a/experiment_chunk_size_and_k.py → open_ai_chunk_size_and_k.py b/experiment_chunk_size_and_k.py → open_ai_chunk_size_and_k.py
@@ -1,6 +1,5 @@
 import logging
 import os
-from typing import List
 
 from langchain_astradb import AstraDBVectorStore
 from langchain_community.document_loaders import UnstructuredFileLoader