From 50daea883cb717616991946fc2c5b02f23fec489 Mon Sep 17 00:00:00 2001
From: Sebastian Rojo <arpagon@gmail.com>
Date: Mon, 18 Dec 2023 00:59:43 -0500
Subject: [PATCH] Replace inline Pinecone ingestor script in RAG docs with a notebook reference

---
 docs/open-source/rag.mdx | 46 +---------------------------------------
 1 file changed, 1 insertion(+), 45 deletions(-)

diff --git a/docs/open-source/rag.mdx b/docs/open-source/rag.mdx
index 979298054..0c0b4f5b7 100644
--- a/docs/open-source/rag.mdx
+++ b/docs/open-source/rag.mdx
@@ -53,49 +53,5 @@ If you have a folder of PDFs, docx files, text files, etc. that you want to add
 the below script which uses [Unstructured](https://github.com/Unstructured-IOVector/unstructured) to parse
 many kinds of files types, extract the text, and add it to pinecone.
 
-The script was tested with these package versions:
+The script is maintained as a notebook in the repository: `apps/voice-rag/manual_pinecone_ingestor.ipynb`.
 
-```
-python = "^3.10"
-langchain = "^0.0.237"
-spacy = "^3.6.0"
-unstructured = {extras = ["local-inference"], version = "^0.8.1"}
-layoutparser = {extras = ["layoutmodels", "tesseract"], version = "^0.3.4"}
-pinecone-client = "^2.2.2"
-openai = "^0.27.8"
-torch = ">=2.0.0, !=2.0.1"
-tiktoken = "^0.4.0"
-```
-
-```python
-import os
-import pinecone
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import SpacyTextSplitter
-from langchain.vectorstores import Pinecone
-from langchain.document_loaders import DirectoryLoader, UnstructuredFileLoader
-
-PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
-PINECONE_ENVIRONMENT = os.environ["PINECONE_ENVIRONMENT"]
-OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
-
-
-loader = DirectoryLoader('./docs', glob="**/*.*", show_progress=True, loader_cls=UnstructuredFileLoader)
-print("Loading documents...")
-documents = loader.load()
-text_splitter = SpacyTextSplitter(chunk_size=1000)
-print("Splitting documents...")
-docs = text_splitter.split_documents(documents)
-
-embeddings = OpenAIEmbeddings()
-
-pinecone.init(
-    api_key=PINECONE_API_KEY,
-    environment=PINECONE_ENVIRONMENT,
-)
-
-index_name = "your_index_name"
-
-print("Creating index...")
-docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)
-```