From 50daea883cb717616991946fc2c5b02f23fec489 Mon Sep 17 00:00:00 2001 From: Sebastian Rojo Date: Mon, 18 Dec 2023 00:59:43 -0500 Subject: [PATCH] Update pinecone ingestor script --- docs/open-source/rag.mdx | 46 +--------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/docs/open-source/rag.mdx b/docs/open-source/rag.mdx index 979298054..0c0b4f5b7 100644 --- a/docs/open-source/rag.mdx +++ b/docs/open-source/rag.mdx @@ -53,49 +53,5 @@ If you have a folder of PDFs, docx files, text files, etc. that you want to add the below script which uses [Unstructured](https://github.com/Unstructured-IOVector/unstructured) to parse many kinds of files types, extract the text, and add it to pinecone. -The script was tested with these package versions: +`apps/voice-rag/manual_pinecone_ingestor.ipynb` -``` -python = "^3.10" -langchain = "^0.0.237" -spacy = "^3.6.0" -unstructured = {extras = ["local-inference"], version = "^0.8.1"} -layoutparser = {extras = ["layoutmodels", "tesseract"], version = "^0.3.4"} -pinecone-client = "^2.2.2" -openai = "^0.27.8" -torch = ">=2.0.0, !=2.0.1" -tiktoken = "^0.4.0" -``` - -```python -import os -import pinecone -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.text_splitter import SpacyTextSplitter -from langchain.vectorstores import Pinecone -from langchain.document_loaders import DirectoryLoader, UnstructuredFileLoader - -PINECONE_API_KEY = os.environ["PINECONE_API_KEY"] -PINECONE_ENVIRONMENT = os.environ["PINECONE_ENVIRONMENT"] -OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] - - -loader = DirectoryLoader('./docs', glob="**/*.*", show_progress=True, loader_cls=UnstructuredFileLoader) -print("Loading documents...") -documents = loader.load() -text_splitter = SpacyTextSplitter(chunk_size=1000) -print("Splitting documents...") -docs = text_splitter.split_documents(documents) - -embeddings = OpenAIEmbeddings() - -pinecone.init( - api_key=PINECONE_API_KEY, - environment=PINECONE_ENVIRONMENT, -) - -index_name = "your_index_name" - -print("Creating index...") -docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name) -```