feat: Semantic splitter #63

Merged 17 commits on Mar 2, 2024
Changes from 8 commits
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"cSpell.words": [
"tiktoken",
"Upserted"
]
}
14 changes: 11 additions & 3 deletions README.md
@@ -11,7 +11,7 @@ Input example:
{
"files": [
{
"type": "PDF",
"name": "My file",
"url": "https://path-to-my-file.pdf"
}
],
@@ -23,8 +23,15 @@ Input example:
}
},
"index_name": "my_index",
"chunk_config": {
"partition_strategy": "auto",
"split_method": "semantic",
"min_chunk_tokens": 400,
"max_token_size": 30,
"rolling_window_size": 1
},
"encoder": {
"type": "openai",
"provider": "openai",
"name": "text-embedding-3-small",
"dimensions": 1536 # encoder depends on the provider and model
},
@@ -47,8 +54,9 @@ Input example:
},
"index_name": "my_index",
"encoder": {
"type": "openai",
"provider": "openai",
"name": "text-embedding-3-small",
"dimensions": 384
}
"session_id": "my_session_id" # keeps micro-vm sessions and enables caching
}
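The new `chunk_config` block in the request body controls how documents are partitioned and split before embedding. As a rough illustration only, a Pydantic model for this config might look like the sketch below; the field names and values are taken from the README example above, while the defaults and `Literal` constraints are assumptions (the actual model lives in `models/ingest.py`, which is not shown in this diff).

```python
# Hypothetical sketch of the chunking config model. Field names follow the
# README example above; defaults and Literal constraints are assumptions.
from typing import Literal

from pydantic import BaseModel


class ChunkConfig(BaseModel):
    partition_strategy: Literal["auto", "hi_res"] = "auto"  # "hi_res" recovers tables
    split_method: Literal["semantic", "by_title"] = "semantic"
    min_chunk_tokens: int = 50       # extend/merge chunks smaller than this
    max_token_size: int = 300        # split chunks larger than this
    rolling_window_size: int = 1     # sentences per window when comparing embeddings
```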
20 changes: 13 additions & 7 deletions api/ingest.py
@@ -6,7 +6,7 @@

from models.ingest import RequestPayload
from service.embedding import EmbeddingService, get_encoder
from service.ingest import handle_urls, handle_google_drive
from service.ingest import handle_google_drive, handle_urls
from utils.summarise import SUMMARY_SUFFIX

router = APIRouter()
@@ -16,23 +16,29 @@
async def ingest(payload: RequestPayload) -> Dict:
encoder = get_encoder(encoder_config=payload.encoder)
embedding_service = EmbeddingService(
encoder=encoder,
index_name=payload.index_name,
vector_credentials=payload.vector_database,
dimensions=payload.encoder.dimensions,
)
chunks = []
summary_documents = []
if payload.files:
chunks, summary_documents = await handle_urls(embedding_service, payload.files)
chunks, summary_documents = await handle_urls(
embedding_service, payload.files, payload.chunk_config
)

elif payload.google_drive:
chunks, summary_documents = await handle_google_drive(
embedding_service, payload.google_drive
)
) # type: ignore TODO: Fix typing

await asyncio.gather(
embedding_service.generate_and_upsert_embeddings(
documents=chunks, encoder=encoder, index_name=payload.index_name
embedding_service.embed_and_upsert(
chunks=chunks, encoder=encoder, index_name=payload.index_name
),
embedding_service.generate_and_upsert_embeddings(
documents=summary_documents,
embedding_service.embed_and_upsert(
chunks=summary_documents,
encoder=encoder,
index_name=f"{payload.index_name}{SUMMARY_SUFFIX}",
),
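As the updated import and call show, `handle_urls` now receives the payload's `chunk_config` as a third argument, and the upsert methods were renamed to `embed_and_upsert`. A rough, hypothetical sketch of how the service layer might thread the config through is below; only the `handle_urls` signature and return shape are grounded in the call above, and the `EmbeddingService` method names are assumptions, since `service/ingest.py` and `service/embedding.py` are not part of this hunk.

```python
# Hypothetical sketch of service/ingest.py after this change. Only the
# handle_urls signature and return shape are grounded in the call above;
# the EmbeddingService method names used here are assumptions.
from typing import Any, List, Tuple


async def handle_urls(
    embedding_service: Any,
    files: List[Any],
    chunk_config: Any,
) -> Tuple[List[Any], List[Any]]:
    embedding_service.files = files
    # Partition and split each file according to chunk_config
    # (partition_strategy, split_method, token bounds, rolling window).
    chunks = await embedding_service.generate_chunks(config=chunk_config)  # assumed name
    # Build one summary document per source file for the "<index>summary" index.
    summaries = await embedding_service.generate_summary_documents(chunks)  # assumed name
    return chunks, summaries
```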
11 changes: 3 additions & 8 deletions api/query.py
@@ -1,6 +1,6 @@
from fastapi import APIRouter

from models.query import RequestPayload, ResponseData, ResponsePayload
from models.query import RequestPayload, ResponsePayload
from service.router import query as _query

router = APIRouter()
@@ -9,10 +9,5 @@
@router.post("/query", response_model=ResponsePayload)
async def query(payload: RequestPayload):
chunks = await _query(payload=payload)
response_data = [
ResponseData(
content=chunk.content, doc_url=chunk.doc_url, page_number=chunk.page_number
)
for chunk in chunks
]
return {"success": True, "data": response_data}
# NOTE: Filter out fields before passing to the LLM
return ResponsePayload(success=True, data=chunks)
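With `ResponseData` gone, the route hands the retrieved chunk models straight back inside `ResponsePayload`. For orientation, a hedged sketch of what the response models could look like after this change; the chunk field names are taken from the query output in `dev/walkthrough.ipynb` further down, while the model names and optionality are assumptions.

```python
# Hypothetical sketch of models/query.py after this change. Field names come
# from the walkthrough query output; model names and defaults are assumptions.
from typing import List, Optional

from pydantic import BaseModel


class DocumentChunk(BaseModel):  # assumed name
    id: str
    doc_url: Optional[str] = None
    document_id: Optional[str] = None
    content: str
    source: Optional[str] = None
    source_type: Optional[str] = None
    chunk_index: Optional[int] = None
    title: Optional[str] = None
    token_count: Optional[int] = None
    page_number: Optional[int] = None
    metadata: Optional[dict] = None
    dense_embedding: Optional[List[float]] = None


class ResponsePayload(BaseModel):
    success: bool
    data: List[DocumentChunk]
```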
2 changes: 1 addition & 1 deletion dev/embedding.ipynb
@@ -40,7 +40,7 @@
"metadata": {},
"outputs": [],
"source": [
"elements = await embedding_service._download_and_extract_elements(file, strategy=\"auto\")\n"
"elements = await embedding_service._partition_file(file, strategy=\"auto\")\n"
]
},
{
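The rename from `_download_and_extract_elements` to `_partition_file` reflects that the method's job is to partition a downloaded file into structured elements. A minimal sketch of the kind of call it likely wraps, assuming the `unstructured` library's auto partitioner; the helper itself and its wiring are assumptions.

```python
# Hypothetical sketch of what _partition_file wraps. The unstructured
# partition() call is a real API; this helper and its wiring are assumptions.
from unstructured.partition.auto import partition


def partition_file(path: str, strategy: str = "auto"):
    # "auto" picks a partitioner per file type; "hi_res" is slower but
    # recovers table structure from PDFs.
    return partition(filename=path, strategy=strategy)
```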
194 changes: 157 additions & 37 deletions dev/walkthrough.ipynb
@@ -2,12 +2,23 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API_URL: http://localhost:8000\n",
"PINECONE_INDEX: simonas-serverless-1536\n",
"PINECONE_HOST: https://simonas-serverless-1536-75c816a.svc.apw5-4e34-81fa.pinecone.io\n"
]
}
],
"source": [
"import os\n",
"import requests\n",
"import json\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"\n",
@@ -17,24 +28,32 @@
"PINECONE_HOST = os.environ.get('PINECONE_HOST', '')\n",
"\n",
"print(\"API_URL:\", API_URL)\n",
"print(\"PINECONE_API_KEY:\", PINECONE_API_KEY)\n",
"# print(\"PINECONE_API_KEY:\", PINECONE_API_KEY)\n",
"print(\"PINECONE_INDEX:\", PINECONE_INDEX)\n",
"print(\"PINECONE_HOST:\", PINECONE_HOST)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'success': True, 'index_name': 'simonas-serverless-1536'}\n"
]
}
],
"source": [
"# Ingest a file\n",
"url = f\"{API_URL}/api/v1/ingest\"\n",
"\n",
"payload = {\n",
" \"files\": [\n",
" {\n",
" \"type\": \"PDF\",\n",
" \"name\": \"chunking\",\n",
" \"url\": \"https://arxiv.org/pdf/2402.05131.pdf\"\n",
" }\n",
" ],\n",
@@ -45,15 +64,135 @@
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"chunk_config\": {\n",
" \"partition_strategy\": \"auto\", # For tables use \"hi_res\"\n",
" \"split_method\": \"semantic\", # or 'by_title'\n",
" \"min_chunk_tokens\": 50,\n",
" \"max_token_size\": 300,\n",
" \"rolling_window_size\": 1\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
" \"encoder\": {\n",
" \"name\": \"text-embedding-ada-002\",\n",
" \"provider\": \"openai\",\n",
" \"dimensions\": 1536\n",
" },\n",
"}\n",
"\n",
"response = requests.post(url, json=payload)\n",
"\n",
"print(response.json())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"success\": true,\n",
" \"data\": [\n",
" {\n",
" \"id\": \"19baf12d-cc24-44a7-bbb8-b3068e2217e3\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"Results in table 5 show that element-based chunking strategies o\\ufb00er the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its e\\ufb03ciency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 112,155). This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the e\\ufb00ectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
" \"chunk_index\": null,\n",
" \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
" \"token_count\": null,\n",
" \"page_number\": 9,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
" ],\n",
" \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n",
" },\n",
" \"dense_embedding\": null\n",
" },\n",
" {\n",
" \"id\": \"20340d3d-f6f0-4dde-98e7-8193c77be46e\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"More speci\\ufb01cally on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though di\\ufb00erent approaches are available, an exhaustive evaluation of chunking applied to RAG and speci\\ufb01cally to \\ufb01nancial reporting, except for some limited chunking analysis [14,36], is non-existent. In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of di\\ufb00erent methods when asking questions about di\\ufb00erent aspects of the reports.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
" \"chunk_index\": null,\n",
" \"title\": \"2 Related work\",\n",
" \"token_count\": null,\n",
" \"page_number\": 3,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
" ],\n",
" \"parent_id\": \"5cdbed1de9473b8856ab0befd08ff7cb\"\n",
" },\n",
" \"dense_embedding\": null\n",
" },\n",
" {\n",
" \"id\": \"d8eca566-f0ab-4a36-a674-3f0b99bf3807\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"Chunking strategy Base 128 Base 256 Base 512 Keywords Chipper Summary Chipper Pre\\ufb01x & Table Description Chipper Furthermore, we would like to study the impact of RAG con\\ufb01guration and ele- meant type based chunking.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
" \"chunk_index\": null,\n",
" \"title\": \"Financial Report Chunking for E\\ufb00ective Retrieval Augmented Generation\",\n",
" \"token_count\": null,\n",
" \"page_number\": 1,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
" ],\n",
" \"parent_id\": \"bd989cd79b9c4cb6019cb168a82ff24d\"\n",
" },\n",
" \"dense_embedding\": null\n",
" }\n",
" ]\n",
"}\n"
]
}
],
"source": [
"# Query the index\n",
"query_url = f\"{API_URL}/api/v1/query\"\n",
"\n",
"query_payload = {\n",
" \"input\": \"What are the chunking strategies?\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
" \"api_key\": PINECONE_API_KEY,\n",
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": {\n",
" \"name\": \"text-embedding-ada-002\",\n",
" \"provider\": \"openai\",\n",
" \"dimensions\": 1536\n",
" },\n",
"}\n",
"\n",
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"# NOTE: Filter out fields before given to LLM\n",
"# Include title, content, source, page_number, chunk_index\n",
"formatted_json = json.dumps(query_response.json(), indent=4)\n",
"print(formatted_json)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -87,33 +226,6 @@
"print(response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Query the index\n",
"query_url = f\"{API_URL}/api/v1/query\"\n",
"\n",
"query_payload = {\n",
" \"input\": \"What are the chunking strategies?\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
" \"api_key\": PINECONE_API_KEY,\n",
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"print(query_response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -163,9 +275,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'detail': [{'type': 'missing', 'loc': ['body', 'files'], 'msg': 'Field required', 'input': {'file_url': 'https://arxiv.org/pdf/2210.03629.pdf', 'vector_database': {'type': 'pinecone', 'config': {'api_key': 'f4adc79e-ad40-4426-a78a-9878e2ed4a79', 'host': 'https://simonas-serverless-1536-75c816a.svc.apw5-4e34-81fa.pinecone.io'}}, 'index_name': 'simonas-serverless-1536', 'encoder': 'openai'}, 'url': 'https://errors.pydantic.dev/2.6/v/missing'}, {'type': 'model_attributes_type', 'loc': ['body', 'encoder'], 'msg': 'Input should be a valid dictionary or object to extract fields from', 'input': 'openai', 'url': 'https://errors.pydantic.dev/2.6/v/model_attributes_type'}]}\n"
]
}
],
"source": [
"# Delete the index\n",
"query_url = f\"{API_URL}/api/v1/delete\"\n",
@@ -180,7 +300,7 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
" \"encoder\": \"openai\",\n",
"}\n",
"\n",
"delete_response = requests.delete(query_url, json=delete_payload)\n",
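For readers new to the `split_method: "semantic"` option exercised in the walkthrough above: the general technique embeds a rolling window of sentences and starts a new chunk where the similarity between consecutive windows drops, while keeping each chunk between the minimum and maximum token bounds. The sketch below is a minimal, self-contained illustration of that idea, not this PR's actual implementation; `embed` is a stand-in for whatever encoder is configured, and the similarity threshold is an assumption.

```python
# Minimal sketch of rolling-window semantic splitting, for illustration only.
# embed() stands in for the configured encoder; threshold is an assumption.
from typing import Callable, List

import numpy as np


def _count_tokens(text: str) -> int:
    # Crude whitespace token count; a real implementation would use tiktoken.
    return len(text.split())


def semantic_split(
    sentences: List[str],
    embed: Callable[[List[str]], np.ndarray],
    rolling_window_size: int = 1,
    min_chunk_tokens: int = 50,
    max_token_size: int = 300,
    threshold: float = 0.8,
) -> List[str]:
    # Embed a rolling window of sentences ending at each position.
    windows = [
        " ".join(sentences[max(0, i - rolling_window_size + 1) : i + 1])
        for i in range(len(sentences))
    ]
    vectors = embed(windows)
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

    chunks: List[str] = []
    current: List[str] = []
    for i, sentence in enumerate(sentences):
        current.append(sentence)
        text = " ".join(current)
        # A semantic boundary is a drop in cosine similarity between
        # this window and the next one.
        boundary = (
            i + 1 < len(sentences)
            and float(vectors[i] @ vectors[i + 1]) < threshold
        )
        if _count_tokens(text) >= max_token_size or (
            boundary and _count_tokens(text) >= min_chunk_tokens
        ):
            chunks.append(text)
            current = []
    if current:
        chunks.append(" ".join(current))
    return chunks
```

With a `rolling_window_size` of 1 each sentence is compared to the next on its own; larger windows smooth out noisy sentence-level similarity, which is how the `rolling_window_size` knob in `chunk_config` is typically used.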
Empty file added models/__init__.py
Empty file.