Skip to content

Commit

Permalink
Feature: Pinecone connector (#10)
Browse files Browse the repository at this point in the history
* ignore chroma db on project

* add pinecone dependency

* base pinecone client

* remove namespace as it is unnecessary

* add pinecone variables

* add vscode debugging

* final fixes on pinecone client

* add documentation
  • Loading branch information
BorjaZarco authored Jul 26, 2023
1 parent 52176e1 commit 48b2ca9
Show file tree
Hide file tree
Showing 11 changed files with 383 additions and 156 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,5 @@ terraform.rc

.history

# Ignore ChromaDB
chroma.sqlite3
20 changes: 20 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: FastAPI",
"console": "integratedTerminal",
"type": "python",
"request": "launch",
"module": "uvicorn",
"envFile": "${workspaceFolder}/apps/semantic_search/.env",
"args": ["semantic_search:app", "--reload"],
"jinja": true,
"justMyCode": true,
"pythonArgs": ["-Xfrozen_modules=off"]
}
]
}
1 change: 1 addition & 0 deletions apps/semantic_search/.dockerignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
poetry.lock
.env.example
chroma.sqlite3
10 changes: 9 additions & 1 deletion apps/semantic_search/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,12 @@ CHROMA_COLLECTION= # Collection name to store the embeddings.
SUPABASE_URL= # Your Supabase database URL
SUPABASE_KEY= # Your Supabase API key
SUPABASE_TABLE= # The database table used to save the embeddings.
SUPABASE_FUNCTION= # The database function used to query the embeddings.
SUPABASE_FUNCTION= # The database function used to query the embeddings.

################################################

## Pinecone provider (ONLY IF EMBEDDING_STORE is PINECONE)

PINECONE_KEY= # Your Pinecone API key
PINECONE_ENVIRONMENT= # The environment where your Pinecone project is.
PINECONE_INDEX= # The Pinecone Index used to store and query the embeddings.
2 changes: 1 addition & 1 deletion apps/semantic_search/.env.local.example
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ OPENAI_MODEL=text-embedding-ada-002 # Your OpenAI model. See: https://platform.

# Embedding store configuration

EMBEDDING_STORE=LOCAL # Can be LOCAL | CHROMA | SUPABASE
EMBEDDING_STORE=LOCAL # Can be LOCAL | CHROMA | SUPABASE | PINECONE
MATCH_THRESHOLD=0.5 # Threshold to consider two embeddings as a match. Value from 0 to 1

################################################
Expand Down
383 changes: 230 additions & 153 deletions apps/semantic_search/poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions apps/semantic_search/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ python-dotenv = "^1.0.0"
gradio-client = "^0.2.6"
supabase = "^1.0.3"
openai = "^0.27.8"
pinecone-client = "^2.2.2"

[tool.poetry.scripts]
start = "semantic_search:start"
Expand Down
86 changes: 86 additions & 0 deletions apps/semantic_search/stores/pinecone_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import uuid
from typing import Any

import pinecone
from pinecone import Index, Vector
from pydantic import BaseModel
from stores.base import EmbeddingsStore, SearchResult, StoreRequest


class PineconeEmbeddingsStoreSettings(BaseModel):
    """Connection settings consumed by ``PineconeEmbeddingsStore``.

    All three values are required strings; ``key`` and ``environment`` are
    handed to ``pinecone.init`` and ``index`` names the index to open.
    """

    # API key passed to ``pinecone.init(api_key=...)``.
    key: str
    # Environment passed to ``pinecone.init(environment=...)``.
    environment: str
    # Name of the Pinecone index the store reads from and writes to.
    index: str


class PineconeMatch(BaseModel):
    """Fields read from one entry of a Pinecone query response.

    Used as the type annotation for the items of the ``matches`` list
    returned by the index query.
    """

    # Identifier of the matched vector.
    id: str
    # Score reported for the match; compared against the match threshold.
    score: float
    # Raw vector values of the match.
    values: list[float]
    # Metadata stored alongside the vector (includes ``cluster_id``).
    metadata: dict[str, Any]


class PineconeEmbeddingsStore(EmbeddingsStore):
    """Embeddings store that persists and queries vectors in a Pinecone index.

    The ``cluster_id`` of every stored embedding is written into the vector's
    metadata under the ``"cluster_id"`` key, and searches filter on that key.
    """

    def __init__(self, settings: PineconeEmbeddingsStoreSettings) -> None:
        """Initialise the Pinecone client and open the configured index.

        Args:
            settings: API key, environment and index name to connect with.
        """
        pinecone.init(api_key=settings.key, environment=settings.environment)

        self.index = Index(settings.index)

    def store(self, embeddings: list[StoreRequest]) -> list[str]:
        """Upsert the given embeddings and return their generated ids.

        Args:
            embeddings: Requests carrying the vector, its metadata and the
                cluster it belongs to.

        Returns:
            One freshly generated UUID string per request, in input order.
            An empty input yields an empty list without contacting Pinecone.

        Raises:
            ValueError: If the index is not configured.
            Exception: If Pinecone reports that zero vectors were upserted.
        """
        self._validate_configuration()

        # Nothing to send: skip the round-trip instead of issuing an empty
        # upsert request.
        if not embeddings:
            return []

        vector_ids: list[str] = []
        pinecone_vectors: list[Vector] = []
        for embedding in embeddings:
            # Copy so the caller's metadata dict is not modified when the
            # cluster id is injected.
            pinecone_metadata = embedding.metadata.copy()
            pinecone_metadata["cluster_id"] = embedding.cluster_id

            vector_id = str(uuid.uuid4())
            vector_ids.append(vector_id)
            pinecone_vectors.append(
                Vector(
                    id=vector_id,
                    values=embedding.embedding,
                    metadata=pinecone_metadata,
                )
            )

        result = self.index.upsert(vectors=pinecone_vectors)

        # The upsert response exposes the number of written vectors as
        # ``upserted_count``; the previous ``upsertedData`` key never existed,
        # so this failure check could not trigger.
        if result.get("upserted_count") == 0:
            raise Exception("Error inserting: No rows added")

        return vector_ids

    def search(
        self,
        embedding: list[float],
        cluster_ids: list[str],
        match_threshold: float = 0.8,
        limit: int = 10,
    ) -> list[SearchResult]:
        """Return the stored vectors closest to ``embedding``.

        Args:
            embedding: Query vector.
            cluster_ids: Only vectors whose ``cluster_id`` metadata is in this
                list are considered.
            match_threshold: Matches scoring below this value are dropped.
            limit: Maximum number of matches requested from Pinecone
                (``top_k``); fewer may be returned after thresholding.

        Returns:
            The surviving matches, in the order Pinecone returned them.

        Raises:
            ValueError: If the index is not configured.
        """
        self._validate_configuration()

        # NOTE(review): ``include_values=True`` is kept from the original
        # call, although the vector values are not used below — confirm
        # whether it can be turned off.
        query_response = self.index.query(
            top_k=limit,
            include_values=True,
            include_metadata=True,
            vector=embedding,
            filter={"cluster_id": {"$in": cluster_ids}},
        )

        # Guard against a response without a ``matches`` entry.
        matches: list[PineconeMatch] = query_response.get("matches") or []
        search_results: list[SearchResult] = []
        for match in matches:
            if match.score < match_threshold:
                continue

            # Work on a copy so the response object is left untouched when
            # ``cluster_id`` is split out of the metadata.
            metadata = dict(match.metadata)
            cluster_id = metadata.pop("cluster_id")
            search_results.append(
                SearchResult(
                    id=match.id,
                    metadata=metadata,
                    score=match.score,
                    cluster_id=cluster_id,
                )
            )
        return search_results

    def _validate_configuration(self) -> None:
        """Raise ``ValueError`` when no Pinecone index is set on the store."""
        if not self.index:
            raise ValueError("Pinecone index is required.")
12 changes: 12 additions & 0 deletions apps/semantic_search/stores/provider.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from common.utils import get_env_or_fail
from stores.base import EmbeddingsStore
from stores.chroma import ChromaEmbeddingsStore, ChromaEmbeddingsStoreSettings
from stores.pinecone_client import (
PineconeEmbeddingsStore,
PineconeEmbeddingsStoreSettings,
)
from stores.supabase_client import (
SupabaseEmbeddingsStore,
SupabaseEmbeddingsStoreSettings,
Expand Down Expand Up @@ -29,5 +33,13 @@ def get_client(name: str) -> EmbeddingsStore:
query_function=get_env_or_fail("SUPABASE_FUNCTION"),
)
)
if name == "PINECONE":
return PineconeEmbeddingsStore(
PineconeEmbeddingsStoreSettings(
key=get_env_or_fail("PINECONE_KEY"),
environment=get_env_or_fail("PINECONE_ENVIRONMENT"),
index=get_env_or_fail("PINECONE_INDEX"),
)
)

raise Exception("Missing valid store client name")
3 changes: 3 additions & 0 deletions website/docs/03_semantic_search/030_getting_started.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ The following table describes the different variables that can be set on the `.e
| `SUPABASE_KEY` | :x: | The key to use when connecting to Supabase. This is only required if you are using the `SUPABASE` embedding store |
| `SUPABASE_TABLE` | :x: | The table to use when storing embeddings. This is only required if you are using the `SUPABASE` embedding store |
| `SUPABASE_FUNCTION` | :x: | The function to use when querying embeddings. This is only required if you are using the `SUPABASE` embedding store |
| `PINECONE_KEY` | :x: | The key to use when connecting to Pinecone. This is only required if you are using the `PINECONE` embedding store |
| `PINECONE_ENVIRONMENT` | :x: | The environment of your Pinecone project. This is only required if you are using the `PINECONE` embedding store |
| `PINECONE_INDEX` | :x: | The index (Pinecone database) where embeddings will be stored and queried. This is only required if you are using the `PINECONE` embedding store |



Expand Down
19 changes: 18 additions & 1 deletion website/docs/03_semantic_search/032_embedding_stores.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,21 @@ SUPABASE_FUNCTION="<SUPABASE_FUNCTION>"
- `SUPABASE_TABLE` is the table where the embeddings are stored.
- `SUPABASE_FUNCTION` is the function that is used to perform similarity searches.

You can check out how to set up your Supabase database [here](https://supabase.com/blog/openai-embeddings-postgres-vector).
You can check out how to set up your Supabase database [here](https://supabase.com/blog/openai-embeddings-postgres-vector).

## Pinecone store

Another alternative could be [Pinecone](https://www.pinecone.io/). It is a vector-oriented database created specifically for semantic search. It is a cloud-based service, so you don't have to worry about setting up a database by yourself.

```.env
EMBEDDING_STORE="PINECONE"
PINECONE_KEY="<PINECONE_KEY>"
PINECONE_ENVIRONMENT="<PINECONE_ENVIRONMENT>"
PINECONE_INDEX="<PINECONE_INDEX>"
```

- `PINECONE_KEY` is the access key of the Pinecone database.
- `PINECONE_ENVIRONMENT` is the environment where the Pinecone project is placed.
- `PINECONE_INDEX` is the Pinecone index (database) where embeddings are stored and queried.

You can check out how to set up your Pinecone database [here](https://www.pinecone.io/).

0 comments on commit 48b2ca9

Please sign in to comment.