Merge pull request #17 from epinzur/add-colbert-example
added a colbert example
epinzur authored Jun 14, 2024
2 parents bc107f4 + 0aae88e commit f409cd8
Showing 3 changed files with 163 additions and 10 deletions.
18 changes: 9 additions & 9 deletions README.md
@@ -47,14 +47,14 @@ pip install ragulate
any other variables that you will pass during your experimentation. The method should ingest the passed
file into your vector store.

-See the `ingest()` method in [experiment_chunk_size_and_k.py](experiment_chunk_size_and_k.py) as an example.
+See the `ingest()` method in [open_ai_chunk_size_and_k.py](open_ai_chunk_size_and_k.py) as an example.
This method configures an ingest pipeline using the parameter `chunk_size` and ingests the file passed.
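
For illustration, a minimal sketch of such an ingest method (the loader and splitter follow the LangChain pieces used in this repo's examples; `get_vector_store()` is a hypothetical helper for whatever store your script builds):

```
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader

def ingest(file_path: str, chunk_size: int, **kwargs):
    # chunk_size arrives via --var-name chunk_size --var-value N on the CLI;
    # **kwargs absorbs any extra experiment params ragulate passes.
    docs = UnstructuredFileLoader(file_path=file_path).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size)
    chunks = splitter.split_documents(docs)
    get_vector_store(chunk_size).add_documents(chunks)  # hypothetical store helper
```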

1. Wrap your query pipeline in a single Python method and return it. The method should have parameters for
any variables that you will pass during your experimentation. Currently only LangChain LCEL query pipelines
are supported.

-See the `query()` method in [experiment_chunk_size_and_k.py](experiment_chunk_size_and_k.py) as an example.
+See the `query()` method in [open_ai_chunk_size_and_k.py](open_ai_chunk_size_and_k.py) as an example.
This method returns a LangChain LCEL pipeline configured by the parameters `chunk_size` and `k`.
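
For illustration, a minimal sketch of such a query method (abridged from the pipelines in this repo; `get_lc_vector_store()` stands in for whatever store builder your script defines):

```
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

def query(chunk_size: int, k: int, **kwargs):
    retriever = get_lc_vector_store(chunk_size).as_retriever(search_kwargs={"k": k})
    prompt = ChatPromptTemplate.from_template(
        "Answer based only on this context: {context}\nQuestion: {question}"
    )
    # LCEL: the dict fans the question out to the retriever and the prompt.
    return (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | ChatOpenAI()
        | StrOutputParser()
    )
```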

Note: It is helpful to have a `**kwargs` param in your pipeline method definitions, so that if extra params
@@ -81,7 +81,7 @@ commands:

### Example

-For the examples below, we will use the example experiment [experiment_chunk_size_and_k.py](experiment_chunk_size_and_k.py)
+For the examples below, we will use the example experiment [open_ai_chunk_size_and_k.py](open_ai_chunk_size_and_k.py)
and see how the RAG metrics change for changes in `chunk_size` and `k` (number of documents retrieved).

1. Download a dataset. See available datasets here: https://llamahub.ai/?tab=llama_datasets
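
   For example, pulling one of the LlamaIndex datasets by name (the exact flags here are an assumption; check `ragulate download --help`):
   ```
   ragulate download -k llama BraintrustCodaHelpDesk
   ```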
@@ -98,12 +98,12 @@ and see how the RAG metrics change for changes in `chunk_size` and `k` (number o
Examples:
* Ingest with `chunk_size=500`:
```
-ragulate ingest -n chunk_size_500 -s experiment_chunk_size_and_k.py -m ingest \
+ragulate ingest -n chunk_size_500 -s open_ai_chunk_size_and_k.py -m ingest \
--var-name chunk_size --var-value 500 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
```
* Ingest with `chunk_size=1000`:
```
-ragulate ingest -n chunk_size_1000 -s experiment_chunk_size_and_k.py -m ingest \
+ragulate ingest -n chunk_size_1000 -s open_ai_chunk_size_and_k.py -m ingest \
--var-name chunk_size --var-value 1000 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
```
@@ -112,25 +112,25 @@ and see how the RAG metrics change for changes in `chunk_size` and `k` (number o
Examples:
* Query with `chunk_size=500` and `k=2`
```
-ragulate query -n chunk_size_500_k_2 -s experiment_chunk_size_and_k.py -m query_pipeline \
+ragulate query -n chunk_size_500_k_2 -s open_ai_chunk_size_and_k.py -m query_pipeline \
--var-name chunk_size --var-value 500 --var-name k --var-value 2 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
```
* Query with `chunk_size=1000` and `k=2`
```
-ragulate query -n chunk_size_1000_k_2 -s experiment_chunk_size_and_k.py -m query_pipeline \
+ragulate query -n chunk_size_1000_k_2 -s open_ai_chunk_size_and_k.py -m query_pipeline \
--var-name chunk_size --var-value 1000 --var-name k --var-value 2 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
```
* Query with `chunk_size=500` and `k=5`
```
-ragulate query -n chunk_size_500_k_5 -s experiment_chunk_size_and_k.py -m query_pipeline \
+ragulate query -n chunk_size_500_k_5 -s open_ai_chunk_size_and_k.py -m query_pipeline \
--var-name chunk_size --var-value 500 --var-name k --var-value 5 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
```
* Query with `chunk_size=1000` and `k=5`
```
-ragulate query -n chunk_size_1000_k_5 -s experiment_chunk_size_and_k.py -m query_pipeline \
+ragulate query -n chunk_size_1000_k_5 -s open_ai_chunk_size_and_k.py -m query_pipeline \
--var-name chunk_size --var-value 1000 --var-name k --var-value 5 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
```
154 changes: 154 additions & 0 deletions colbert_chunk_size_and_k.py
@@ -0,0 +1,154 @@
import logging
import os
import time
from pathlib import Path
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from ragstack_colbert import (
    CassandraDatabase,
    Chunk,
    ColbertEmbeddingModel,
    ColbertVectorStore,
)
from ragstack_langchain.colbert import ColbertVectorStore as LangChainColbertVectorStore
from transformers import BertTokenizer

LLM_MODEL = "gpt-3.5-turbo"

batch_size = 640

# Astra DB connection settings are read from the environment.
astra_token = os.getenv("ASTRA_DB_TOKEN_COLBERT2")
database_id = os.getenv("ASTRA_DB_ID_COLBERT2")
keyspace = "ragulate"

logging.basicConfig(level=logging.INFO)
logging.getLogger("unstructured").setLevel(logging.ERROR)
logging.getLogger("cassandra").setLevel(logging.ERROR)


def get_embedding_model(chunk_size: int) -> ColbertEmbeddingModel:
    # doc_maxlen is tied to the chunk size so the model embeds full chunks.
    return ColbertEmbeddingModel(doc_maxlen=chunk_size, batch_size=batch_size)


def get_database(chunk_size: int) -> CassandraDatabase:
    # One Astra table per chunk size keeps experiment runs isolated.
    table_name = f"colbert_chunk_size_{chunk_size}"

    return CassandraDatabase.from_astra(
        astra_token=astra_token,
        database_id=database_id,
        keyspace=keyspace,
        table_name=table_name,
        timeout=500,
    )


def get_lc_vector_store(chunk_size: int) -> LangChainColbertVectorStore:
    database = get_database(chunk_size=chunk_size)
    embedding_model = get_embedding_model(chunk_size=chunk_size)

    return LangChainColbertVectorStore(
        database=database,
        embedding_model=embedding_model,
    )


def get_vector_store(chunk_size: int) -> ColbertVectorStore:
    database = get_database(chunk_size=chunk_size)
    return ColbertVectorStore(database=database)


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def len_function(text: str) -> int:
    # Measure length in BERT tokens so splits line up with ColBERT's doc_maxlen.
    return len(tokenizer.tokenize(text))


async def ingest(file_path: str, chunk_size: int, **kwargs):
    """Load, split, embed, and store a single file in the ColBERT vector store."""
    doc_id = Path(file_path).name

    # Overlap is at most a quarter of the chunk size, capped at 64 tokens.
    chunk_overlap = min(chunk_size // 4, 64)

    start = time.time()
    docs = UnstructuredFileLoader(
        file_path=file_path, mode="single", strategy="fast"
    ).load()
    duration = time.time() - start
    print(f"It took {duration} seconds to load and parse the document")

    # confirm only one document returned per file
    assert len(docs) == 1

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len_function,
    )

    start = time.time()
    chunked_docs = text_splitter.split_documents(docs)
    duration = time.time() - start
    print(
        f"It took {duration} seconds to split the document into {len(chunked_docs)} chunks"
    )

    texts = [doc.page_content for doc in chunked_docs]
    start = time.time()
    embeddings = get_embedding_model(chunk_size=chunk_size).embed_texts(texts=texts)
    duration = time.time() - start
    print(f"It took {duration} seconds to embed {len(chunked_docs)} chunks")

    colbert_vector_store = get_vector_store(chunk_size=chunk_size)

    # Remove any chunks from a previous run of this document before re-inserting.
    await colbert_vector_store.adelete_chunks(doc_ids=[doc_id])

    chunks: List[Chunk] = []
    for i, doc in enumerate(chunked_docs):
        chunks.append(
            Chunk(
                doc_id=doc_id,
                chunk_id=i,
                text=doc.page_content,
                metadata={} if doc.metadata is None else doc.metadata,
                embedding=embeddings[i],
            )
        )

    start = time.time()
    await colbert_vector_store.aadd_chunks(chunks=chunks, concurrent_inserts=100)
    duration = time.time() - start
    print(
        f"It took {duration} seconds to insert {len(chunked_docs)} chunks into AstraDB"
    )


def query_pipeline(k: int, chunk_size: int, **kwargs):
    """Build a RAG chain that retrieves k chunks and answers with the LLM."""
    vector_store = get_lc_vector_store(chunk_size=chunk_size)
    llm = ChatOpenAI(model_name=LLM_MODEL)

    # build a prompt
    prompt_template = """
Answer the question based only on the supplied context. If you don't know the answer, say: "I don't know".
Context: {context}
Question: {question}
Your answer:
"""
    prompt = ChatPromptTemplate.from_template(prompt_template)

    rag_chain = (
        {
            "context": vector_store.as_retriever(search_kwargs={"k": k}),
            "question": RunnablePassthrough(),
        }
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain
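
This new ColBERT experiment plugs into the same CLI workflow as the OpenAI example above; for instance (run names are illustrative):

```
ragulate ingest -n colbert_chunk_size_500 -s colbert_chunk_size_and_k.py -m ingest \
  --var-name chunk_size --var-value 500 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana

ragulate query -n colbert_chunk_size_500_k_2 -s colbert_chunk_size_and_k.py -m query_pipeline \
  --var-name chunk_size --var-value 500 --var-name k --var-value 2 --dataset BraintrustCodaHelpDesk --dataset BlockchainSolana
```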
1 change: 0 additions & 1 deletion
@@ -1,6 +1,5 @@
import logging
import os
-from typing import List

from langchain_astradb import AstraDBVectorStore
from langchain_community.document_loaders import UnstructuredFileLoader
