feat: Semantic splitter #63

Merged 17 commits on Mar 2, 2024
Changes from 8 commits
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"cSpell.words": [
"tiktoken",
"Upserted"
]
}
14 changes: 11 additions & 3 deletions README.md
@@ -11,7 +11,7 @@ Input example:
{
"files": [
{
"type": "PDF",
"name": "My file",
"url": "https://path-to-my-file.pdf"
}
],
@@ -23,8 +23,15 @@ Input example:
}
},
"index_name": "my_index",
"chunk_config": {
"partition_strategy": "auto",
"split_method": "semantic",
"min_chunk_tokens": 400,
"max_token_size": 30,
"rolling_window_size": 1
},
"encoder": {
"type": "openai",
"provider": "openai",
"name": "text-embedding-3-small",
"dimensions": 1536 # encoder depends on the provider and model
},
@@ -47,8 +54,9 @@ Input example:
},
"index_name": "my_index",
"encoder": {
"type": "openai",
"provider": "openai",
"name": "text-embedding-3-small",
"dimensions": 384
}
"session_id": "my_session_id" # keeps micro-vm sessions and enables caching
}
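The new `chunk_config` block in the request body controls how documents are partitioned and split before embedding. As a rough illustration only, a Pydantic model for this config might look like the sketch below; the field names and values are taken from the README example above, while the defaults and `Literal` constraints are assumptions (the actual model lives in `models/ingest.py`, which is not shown in this diff).

```python
# Hypothetical sketch of the chunking config model. Field names follow the
# README example above; defaults and Literal constraints are assumptions.
from typing import Literal

from pydantic import BaseModel


class ChunkConfig(BaseModel):
    partition_strategy: Literal["auto", "hi_res"] = "auto"  # "hi_res" recovers tables
    split_method: Literal["semantic", "by_title"] = "semantic"
    min_chunk_tokens: int = 50       # extend/merge chunks smaller than this
    max_token_size: int = 300        # split chunks larger than this
    rolling_window_size: int = 1     # sentences per window when comparing embeddings
```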
20 changes: 13 additions & 7 deletions api/ingest.py
@@ -6,7 +6,7 @@

from models.ingest import RequestPayload
from service.embedding import EmbeddingService, get_encoder
from service.ingest import handle_urls, handle_google_drive
from service.ingest import handle_google_drive, handle_urls
from utils.summarise import SUMMARY_SUFFIX

router = APIRouter()
@@ -16,23 +16,29 @@
async def ingest(payload: RequestPayload) -> Dict:
encoder = get_encoder(encoder_config=payload.encoder)
embedding_service = EmbeddingService(
encoder=encoder,
index_name=payload.index_name,
vector_credentials=payload.vector_database,
dimensions=payload.encoder.dimensions,
)
chunks = []
summary_documents = []
if payload.files:
chunks, summary_documents = await handle_urls(embedding_service, payload.files)
chunks, summary_documents = await handle_urls(
embedding_service, payload.files, payload.chunk_config
)

elif payload.google_drive:
chunks, summary_documents = await handle_google_drive(
embedding_service, payload.google_drive
)
) # type: ignore TODO: Fix typing

await asyncio.gather(
embedding_service.generate_and_upsert_embeddings(
documents=chunks, encoder=encoder, index_name=payload.index_name
embedding_service.embed_and_upsert(
chunks=chunks, encoder=encoder, index_name=payload.index_name
),
embedding_service.generate_and_upsert_embeddings(
documents=summary_documents,
embedding_service.embed_and_upsert(
chunks=summary_documents,
encoder=encoder,
index_name=f"{payload.index_name}{SUMMARY_SUFFIX}",
),
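As the updated import and call show, `handle_urls` now receives the payload's `chunk_config` as a third argument, and the upsert methods were renamed to `embed_and_upsert`. A rough, hypothetical sketch of how the service layer might thread the config through is below; only the `handle_urls` signature and return shape are grounded in the call above, and the `EmbeddingService` method names are assumptions, since `service/ingest.py` and `service/embedding.py` are not part of this hunk.

```python
# Hypothetical sketch of service/ingest.py after this change. Only the
# handle_urls signature and return shape are grounded in the call above;
# the EmbeddingService method names used here are assumptions.
from typing import Any, List, Tuple


async def handle_urls(
    embedding_service: Any,
    files: List[Any],
    chunk_config: Any,
) -> Tuple[List[Any], List[Any]]:
    embedding_service.files = files
    # Partition and split each file according to chunk_config
    # (partition_strategy, split_method, token bounds, rolling window).
    chunks = await embedding_service.generate_chunks(config=chunk_config)  # assumed name
    # Build one summary document per source file for the "<index>summary" index.
    summaries = await embedding_service.generate_summary_documents(chunks)  # assumed name
    return chunks, summaries
```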
11 changes: 3 additions & 8 deletions api/query.py
@@ -1,6 +1,6 @@
from fastapi import APIRouter

from models.query import RequestPayload, ResponseData, ResponsePayload
from models.query import RequestPayload, ResponsePayload
from service.router import query as _query

router = APIRouter()
@@ -9,10 +9,5 @@
@router.post("/query", response_model=ResponsePayload)
async def query(payload: RequestPayload):
chunks = await _query(payload=payload)
response_data = [
ResponseData(
content=chunk.content, doc_url=chunk.doc_url, page_number=chunk.page_number
)
for chunk in chunks
]
return {"success": True, "data": response_data}
# NOTE: Filter out fields before passing to the LLM
return ResponsePayload(success=True, data=chunks)
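With `ResponseData` gone, the route hands the retrieved chunk models straight back inside `ResponsePayload`. For orientation, a hedged sketch of what the response models could look like after this change; the chunk field names are taken from the query output in `dev/walkthrough.ipynb` further down, while the model names and optionality are assumptions.

```python
# Hypothetical sketch of models/query.py after this change. Field names come
# from the walkthrough query output; model names and defaults are assumptions.
from typing import List, Optional

from pydantic import BaseModel


class DocumentChunk(BaseModel):  # assumed name
    id: str
    doc_url: Optional[str] = None
    document_id: Optional[str] = None
    content: str
    source: Optional[str] = None
    source_type: Optional[str] = None
    chunk_index: Optional[int] = None
    title: Optional[str] = None
    token_count: Optional[int] = None
    page_number: Optional[int] = None
    metadata: Optional[dict] = None
    dense_embedding: Optional[List[float]] = None


class ResponsePayload(BaseModel):
    success: bool
    data: List[DocumentChunk]
```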
2 changes: 1 addition & 1 deletion dev/embedding.ipynb
@@ -40,7 +40,7 @@
"metadata": {},
"outputs": [],
"source": [
"elements = await embedding_service._download_and_extract_elements(file, strategy=\"auto\")\n"
"elements = await embedding_service._partition_file(file, strategy=\"auto\")\n"
]
},
{
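The rename from `_download_and_extract_elements` to `_partition_file` reflects that the method's job is to partition a downloaded file into structured elements. A minimal sketch of the kind of call it likely wraps, assuming the `unstructured` library's auto partitioner; the helper itself and its wiring are assumptions.

```python
# Hypothetical sketch of what _partition_file wraps. The unstructured
# partition() call is a real API; this helper and its wiring are assumptions.
from unstructured.partition.auto import partition


def partition_file(path: str, strategy: str = "auto"):
    # "auto" picks a partitioner per file type; "hi_res" is slower but
    # recovers table structure from PDFs.
    return partition(filename=path, strategy=strategy)
```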
194 changes: 157 additions & 37 deletions dev/walkthrough.ipynb
@@ -2,12 +2,23 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API_URL: http://localhost:8000\n",
"PINECONE_INDEX: simonas-serverless-1536\n",
"PINECONE_HOST: https://simonas-serverless-1536-75c816a.svc.apw5-4e34-81fa.pinecone.io\n"
]
}
],
"source": [
"import os\n",
"import requests\n",
"import json\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n",
"\n",
@@ -17,24 +28,32 @@
"PINECONE_HOST = os.environ.get('PINECONE_HOST', '')\n",
"\n",
"print(\"API_URL:\", API_URL)\n",
"print(\"PINECONE_API_KEY:\", PINECONE_API_KEY)\n",
"# print(\"PINECONE_API_KEY:\", PINECONE_API_KEY)\n",
"print(\"PINECONE_INDEX:\", PINECONE_INDEX)\n",
"print(\"PINECONE_HOST:\", PINECONE_HOST)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'success': True, 'index_name': 'simonas-serverless-1536'}\n"
]
}
],
"source": [
"# Ingest a file\n",
"url = f\"{API_URL}/api/v1/ingest\"\n",
"\n",
"payload = {\n",
" \"files\": [\n",
" {\n",
" \"type\": \"PDF\",\n",
" \"name\": \"chunking\",\n",
" \"url\": \"https://arxiv.org/pdf/2402.05131.pdf\"\n",
" }\n",
" ],\n",
@@ -45,15 +64,135 @@
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"chunk_config\": {\n",
" \"partition_strategy\": \"auto\", # For tables use \"hi_res\"\n",
" \"split_method\": \"semantic\", # or 'by_title'\n",
" \"min_chunk_tokens\": 50,\n",
" \"max_token_size\": 300,\n",
" \"rolling_window_size\": 1\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
" \"encoder\": {\n",
" \"name\": \"text-embedding-ada-002\",\n",
" \"provider\": \"openai\",\n",
" \"dimensions\": 1536\n",
" },\n",
"}\n",
"\n",
"response = requests.post(url, json=payload)\n",
"\n",
"print(response.json())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"success\": true,\n",
" \"data\": [\n",
" {\n",
" \"id\": \"19baf12d-cc24-44a7-bbb8-b3068e2217e3\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"Results in table 5 show that element-based chunking strategies o\\ufb00er the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its e\\ufb03ciency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 112,155). This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the e\\ufb00ectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
" \"chunk_index\": null,\n",
" \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
" \"token_count\": null,\n",
" \"page_number\": 9,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
" ],\n",
" \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n",
" },\n",
" \"dense_embedding\": null\n",
" },\n",
" {\n",
" \"id\": \"20340d3d-f6f0-4dde-98e7-8193c77be46e\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"More speci\\ufb01cally on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though di\\ufb00erent approaches are available, an exhaustive evaluation of chunking applied to RAG and speci\\ufb01cally to \\ufb01nancial reporting, except for some limited chunking analysis [14,36], is non-existent. In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of di\\ufb00erent methods when asking questions about di\\ufb00erent aspects of the reports.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
" \"chunk_index\": null,\n",
" \"title\": \"2 Related work\",\n",
" \"token_count\": null,\n",
" \"page_number\": 3,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
" ],\n",
" \"parent_id\": \"5cdbed1de9473b8856ab0befd08ff7cb\"\n",
" },\n",
" \"dense_embedding\": null\n",
" },\n",
" {\n",
" \"id\": \"d8eca566-f0ab-4a36-a674-3f0b99bf3807\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"Chunking strategy Base 128 Base 256 Base 512 Keywords Chipper Summary Chipper Pre\\ufb01x & Table Description Chipper Furthermore, we would like to study the impact of RAG con\\ufb01guration and ele- meant type based chunking.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
" \"chunk_index\": null,\n",
" \"title\": \"Financial Report Chunking for E\\ufb00ective Retrieval Augmented Generation\",\n",
" \"token_count\": null,\n",
" \"page_number\": 1,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
" \"filetype\": \"application/pdf\",\n",
" \"languages\": [\n",
" \"eng\"\n",
" ],\n",
" \"parent_id\": \"bd989cd79b9c4cb6019cb168a82ff24d\"\n",
" },\n",
" \"dense_embedding\": null\n",
" }\n",
" ]\n",
"}\n"
]
}
],
"source": [
"# Query the index\n",
"query_url = f\"{API_URL}/api/v1/query\"\n",
"\n",
"query_payload = {\n",
" \"input\": \"What are the chunking strategies?\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
" \"api_key\": PINECONE_API_KEY,\n",
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": {\n",
" \"name\": \"text-embedding-ada-002\",\n",
" \"provider\": \"openai\",\n",
" \"dimensions\": 1536\n",
" },\n",
"}\n",
"\n",
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"# NOTE: Filter out fields before given to LLM\n",
"# Include title, content, source, page_number, chunk_index\n",
"formatted_json = json.dumps(query_response.json(), indent=4)\n",
"print(formatted_json)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -87,33 +226,6 @@
"print(response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Query the index\n",
"query_url = f\"{API_URL}/api/v1/query\"\n",
"\n",
"query_payload = {\n",
" \"input\": \"What are the chunking strategies?\",\n",
" \"vector_database\": {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
" \"api_key\": PINECONE_API_KEY,\n",
" \"host\": PINECONE_HOST,\n",
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
"}\n",
"\n",
"query_response = requests.post(query_url, json=query_payload)\n",
"\n",
"print(query_response.json())"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -163,9 +275,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'detail': [{'type': 'missing', 'loc': ['body', 'files'], 'msg': 'Field required', 'input': {'file_url': 'https://arxiv.org/pdf/2210.03629.pdf', 'vector_database': {'type': 'pinecone', 'config': {'api_key': 'f4adc79e-ad40-4426-a78a-9878e2ed4a79', 'host': 'https://simonas-serverless-1536-75c816a.svc.apw5-4e34-81fa.pinecone.io'}}, 'index_name': 'simonas-serverless-1536', 'encoder': 'openai'}, 'url': 'https://errors.pydantic.dev/2.6/v/missing'}, {'type': 'model_attributes_type', 'loc': ['body', 'encoder'], 'msg': 'Input should be a valid dictionary or object to extract fields from', 'input': 'openai', 'url': 'https://errors.pydantic.dev/2.6/v/model_attributes_type'}]}\n"
]
}
],
"source": [
"# Delete the index\n",
"query_url = f\"{API_URL}/api/v1/delete\"\n",
@@ -180,7 +300,7 @@
" }\n",
" },\n",
" \"index_name\": PINECONE_INDEX,\n",
" \"encoder\": \"cohere\",\n",
" \"encoder\": \"openai\",\n",
"}\n",
"\n",
"delete_response = requests.delete(query_url, json=delete_payload)\n",
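For readers new to the `split_method: "semantic"` option exercised in the walkthrough above: the general technique embeds a rolling window of sentences and starts a new chunk where the similarity between consecutive windows drops, while keeping each chunk between the minimum and maximum token bounds. The sketch below is a minimal, self-contained illustration of that idea, not this PR's actual implementation; `embed` is a stand-in for whatever encoder is configured, and the similarity threshold is an assumption.

```python
# Minimal sketch of rolling-window semantic splitting, for illustration only.
# embed() stands in for the configured encoder; threshold is an assumption.
from typing import Callable, List

import numpy as np


def _count_tokens(text: str) -> int:
    # Crude whitespace token count; a real implementation would use tiktoken.
    return len(text.split())


def semantic_split(
    sentences: List[str],
    embed: Callable[[List[str]], np.ndarray],
    rolling_window_size: int = 1,
    min_chunk_tokens: int = 50,
    max_token_size: int = 300,
    threshold: float = 0.8,
) -> List[str]:
    # Embed a rolling window of sentences ending at each position.
    windows = [
        " ".join(sentences[max(0, i - rolling_window_size + 1) : i + 1])
        for i in range(len(sentences))
    ]
    vectors = embed(windows)
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

    chunks: List[str] = []
    current: List[str] = []
    for i, sentence in enumerate(sentences):
        current.append(sentence)
        text = " ".join(current)
        # A semantic boundary is a drop in cosine similarity between
        # this window and the next one.
        boundary = (
            i + 1 < len(sentences)
            and float(vectors[i] @ vectors[i + 1]) < threshold
        )
        if _count_tokens(text) >= max_token_size or (
            boundary and _count_tokens(text) >= min_chunk_tokens
        ):
            chunks.append(text)
            current = []
    if current:
        chunks.append(" ".join(current))
    return chunks
```

With a `rolling_window_size` of 1 each sentence is compared to the next on its own; larger windows smooth out noisy sentence-level similarity, which is how the `rolling_window_size` knob in `chunk_config` is typically used.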
Empty file added models/__init__.py
Empty file.