Merge branch 'feat/chunk' of https://github.com/asr2003/pg_vectorize
…into feat/chunk
asr2003 committed Dec 22, 2024
2 parents 45cb3e0 + fe297b1 commit c0b3d50
Showing 36 changed files with 914 additions and 520 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -1 +1 @@
* @ChuckHend @shhnwz @jasonmp85
* @ChuckHend
43 changes: 34 additions & 9 deletions .github/workflows/extension_ci.yml
@@ -75,12 +75,9 @@ jobs:
- name: Clippy
run: cargo clippy

test:
name: Run tests
needs: dependencies
test-core:
runs-on: ubuntu-24.04
services:
# Label used to access the service container
vector-serve:
image: quay.io/tembo/vector-serve:latest
ports:
@@ -99,18 +96,45 @@ jobs:
/home/runner/.pgrx
- name: Install sys dependencies
run: |
sudo apt-get update && sudo apt-get install -y postgresql-server-dev-16 libopenblas-dev libreadline-dev
- uses: ./.github/actions/pgx-init
with:
working-directory: ./extension
sudo apt-get update && sudo apt-get install -y postgresql postgresql-contrib libopenblas-dev libreadline-dev
- name: Test Core
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
CO_API_KEY: ${{ secrets.CO_API_KEY }}
PORTKEY_API_KEY: ${{ secrets.PORTKEY_API_KEY }}
PORTKEY_VIRTUAL_KEY_OPENAI: ${{ secrets.PORTKEY_VIRTUAL_KEY_OPENAI }}
VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
run: |
cd ../core && cargo test
test:
name: Run tests
needs: dependencies
runs-on: ubuntu-24.04
services:
# Label used to access the service container
vector-serve:
image: quay.io/tembo/vector-serve:latest
ports:
- 3000:3000
steps:
- uses: actions/checkout@v4
- name: Install Rust stable toolchain
uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v2
with:
prefix-key: "extension-test"
workspaces: |
vectorize
# Additional directories to cache
cache-directories: |
/home/runner/.pgrx
- name: Install sys dependencies
run: |
sudo apt-get update && sudo apt-get install -y postgresql-server-dev-16 libopenblas-dev libreadline-dev
- uses: ./.github/actions/pgx-init
with:
working-directory: ./extension
- name: Restore cached binaries
uses: actions/cache@v2
with:
@@ -132,6 +156,7 @@ jobs:
CO_API_KEY: ${{ secrets.CO_API_KEY }}
PORTKEY_API_KEY: ${{ secrets.PORTKEY_API_KEY }}
PORTKEY_VIRTUAL_KEY_OPENAI: ${{ secrets.PORTKEY_VIRTUAL_KEY_OPENAI }}
VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
run: |
echo "\q" | make run
make test-integration
@@ -142,7 +167,7 @@ jobs:
runs-on: ubuntu-24.04
strategy:
matrix:
pg-version: [14, 15, 16]
pg-version: [14, 15, 16, 17]
steps:
- uses: actions/checkout@v2
- name: Install Rust stable toolchain
2 changes: 1 addition & 1 deletion .github/workflows/extension_upgrade.yml
@@ -16,7 +16,7 @@ on:
jobs:
test:
name: Upgrade Test
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
services:
vector-serve:
image: quay.io/tembo/vector-serve:latest
113 changes: 27 additions & 86 deletions CONTRIBUTING.md
@@ -1,5 +1,7 @@
# Contributing to pg_vectorize

If you encounter any issues or have questions, feel free to join the [Tembo Community Slack](https://join.slack.com/t/tembocommunity/shared_invite/zt-2u3ctm86u-XzcyL76T7o~7Mpnt6KUx1g) for support.

## Prerequisites

- [Rust](https://www.rust-lang.org/learn/get-started) - Toolchain including `rustc`, `cargo`, and `rustfmt`
@@ -32,68 +34,25 @@ Confirm a successful set up by running the following:
docker ps
```

:wrench: Note: Consider running the following to see the container logs in real time:

```bash
docker logs <your-container-id> -f
```

### 3. Clone and compile `pg_vectorize` and extension dependencies

:wrench: When progressing through these steps, refer to the following for troubleshooting:

```bash
cat ~/.pgrx/15.log
```

#### 3.1. Apply configurations

Prior to compiling and running `pg_vectorize`, it's essential to update the `postgresql.conf` file.
`pgrx` uses a Postgres version-specific data directory, each containing its own `postgresql.conf` file.
The following example, utilizes Postgres version 15.
If you're using a different version, please alter the file path value `data-<postgres-version>` and run the following:

```bash
<your-editor> ~/.pgrx/data-15/postgresql.conf
```

Within this document, add the following:

```text
shared_preload_libraries = 'pg_cron, vectorize'
cron.database_name = 'postgres'
vectorize.embedding_service_url = 'http://localhost:3000/v1/embeddings'
```

:wrench: Note: If your machine is running a MacOS, you may need to apply the following configurations to Cargo's config file:

```
<your-editor> ~/.cargo/config
```

```text
[target.'cfg(target_os="macos")']
# Postgres symbols won't be available until runtime
rustflags = ["-Clink-arg=-Wl,-undefined,dynamic_lookup"]
```

#### 3.2. Clone and enter directory
#### 3.1. Clone and enter directory

```bash
git clone https://github.com/tembo-io/pg_vectorize.git

cd pg_vectorize
cd pg_vectorize/extension
```

#### 3.3. Install dependencies
#### 3.2. Install dependencies

From within the pg_vectorize directory, run the following, which will install `pg_cron`, `pgmq`, and `pgvector`:
From within the pg_vectorize/extension directory, run the following, which will install `pg_cron`, `pgmq`, and `pgvector`:

```bash
make setup
```

#### 3.4. Compile and run `pg_vectorize`
#### 3.3. Compile and run `pg_vectorize`

```bash
make run
@@ -105,6 +64,12 @@ make run

Once the above command is run, you will be brought into Postgres via `psql`.

Run the following command inside the `psql` console to enable the extensions:

```sql
create extension vectorize cascade;
```

To list out the enabled extensions, run:

```sql
@@ -118,61 +83,29 @@ To list out the enabled extensions, run:
pgmq | 1.1.1 | pgmq | A lightweight message queue. Like AWS SQS and RSMQ but on Postgres.
plpgsql | 1.0 | pg_catalog | PL/pgSQL procedural language
vector | 0.6.0 | public | vector data type and ivfflat and hnsw access methods
vectorize | 0.10.1 | vectorize | The simplest way to do vector search on Postgres
vectorize | 0.19.0 | vectorize | The simplest way to do vector search on Postgres
(6 rows)
```

#### 4.2 Confirm embedding service url is set to localhost

In section 3.1., we set the following postgresql.conf variable:

```text
vectorize.embedding_service_url = 'http://localhost:3000/v1/embeddings'
```

To confirm its success, run the following SHOW command:

```sql
SHOW vectorize.embedding_service_url;
```
```text
vectorize.embedding_service_url
-------------------------------------
http://localhost:3000/v1/embeddings
(1 row)
```

Say, for example, instead of local host, `vector-serve:3000` was the target?
Should you desire to change this from within Postgre, simply run:

```
ALTER SYSTEM SET vectorize.embedding_service_url TO 'http://localhost:3000/v1/embeddings';
```

Making changes such as this requires the following to be run:

```sql
SELECT pg_reload_conf();
```

Running the earlier SHOW command should reveal the appropriate change:
Run the following SHOW command to confirm that the URL is set to `localhost`:

```sql
SHOW vectorize.embedding_service_url;
```

```text
vectorize.embedding_service_url
-------------------------------------
http://localhost:3000/v1/embeddings
http://localhost:3000/v1
(1 row)
```
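As a sanity check outside of Postgres, you could also call the embedding service directly. The sketch below is an assumption-laden illustration: it takes the base URL shown by the SHOW command above, appends an OpenAI-style `/embeddings` route, and assumes an OpenAI-compatible request body — neither the exact route nor the response shape is confirmed by this diff.

```python
import json
import urllib.request

# Base URL from the SHOW output above, plus an assumed OpenAI-style route.
EMBEDDING_URL = "http://localhost:3000/v1/embeddings"

def build_payload(texts, model="sentence-transformers/all-MiniLM-L6-v2"):
    # OpenAI-style embeddings request body: {"input": [...], "model": "..."}
    return {"input": list(texts), "model": model}

def embed(texts, url=EMBEDDING_URL):
    # Requires the vector-serve container from the Docker step to be running.
    req = urllib.request.Request(
        url,
        data=json.dumps(build_payload(texts)).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)

# Payload construction works without the service running:
payload = build_payload(["hello world"])
print(payload["model"])
```

With the container up, `embed(["hello world"])` should return a JSON document containing one embedding vector per input string.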

#### 4.3. Load example data

The following can be found within this project's README, under [Hugging Face Example](https://github.com/tembo-io/pg_vectorize/blob/main/README.md#hugging-face-example).
The following can be found within this project's README, under [Vector Search Example](https://github.com/tembo-io/pg_vectorize/blob/main/README.md#vector-search-example).

Begin by creating a `producs` table with the dataset that comes included with `pg_vectorize`.
Begin by creating a `products` table with the dataset that comes included with `pg_vectorize`.

```sql
CREATE TABLE products (LIKE vectorize.example_products INCLUDING ALL);
@@ -230,9 +163,17 @@ num_results => 3

### 5. Local URL

Once all of the following is complete, you should be able to visit the `Tembo-Embedding-Service` at [http://localhost:3000/docs](http://localhost:3000/docs) and explore.
Once all of the following is complete, you should be able to access Swagger UI for `Tembo-Embedding-Service` at [http://localhost:3000/docs](http://localhost:3000/docs) and explore.
This service allows you, for example, to load [different sentence-transformers models](https://huggingface.co/models?sort=trending&search=sentence-transformers) from Hugging Face.

## Troubleshooting

To check `pgrx` logs for debugging:

```bash
cat ~/.pgrx/17.log
```

# Releases

`pg_vectorize` releases are automated through a [Github workflow](https://github.com/tembo-io/pg_vectorize/blob/main/.github/workflows/extension_ci.yml).
24 changes: 16 additions & 8 deletions README.md
@@ -34,7 +34,7 @@ pg_vectorize powers the [VectorDB Stack](https://tembo.io/docs/product/stacks/ai
## Features

- Workflows for both vector search and RAG
- Integrations with OpenAI's [embeddings](https://platform.openai.com/docs/guides/embeddings) and [chat-completion](https://platform.openai.com/docs/guides/text-generation) endpoints and a self-hosted container for running [Hugging Face Sentence-Transformers](https://huggingface.co/sentence-transformers)
- Integrations with OpenAI's [embeddings](https://platform.openai.com/docs/guides/embeddings) and [Text-Generation](https://platform.openai.com/docs/guides/text-generation) endpoints and a self-hosted container for running [Hugging Face Sentence-Transformers](https://huggingface.co/sentence-transformers)
- Automated creation of Postgres triggers to keep your embeddings up to date
- High level API - one function to initialize embeddings transformations, and another function to search

@@ -88,14 +88,14 @@ Then set the following either in postgresql.conf or as a configuration parameter
```sql
-- requires restart of Postgres
alter system set shared_preload_libraries = 'vectorize,pg_cron';
alter system set cron.database_name = 'postgres'
alter system set cron.database_name = 'postgres';
```

And if you're running the vector-serve container, set the following url as a configuration parameter in Postgres.
The host may need to change from `localhost` to something else depending on where you are running the container.

```sql
alter system set vectorize.embedding_service_url = 'http://localhost:3000/v1/embeddings'
alter system set vectorize.embedding_service_url = 'http://localhost:3000/v1';

SELECT pg_reload_conf();
```
@@ -104,7 +104,7 @@ SELECT pg_reload_conf();

## Vector Search Example

Text-to-embedding transformation can be done with either Hugging Face's Sentence-Transformers or OpenAI's embeddings. The following examples use Hugging Face's Sentence-Transformers. See the project [documentation](https://tembo-io.github.io/pg_vectorize/) for OpenAI examples.
Text-to-embedding transformation can be done with either Hugging Face's Sentence-Transformers or OpenAI's embeddings. The following examples use Hugging Face's Sentence-Transformers. See the project [documentation](https://tembo.io/pg_vectorize/examples/openai_embeddings/) for OpenAI examples.

Follow the [installation](#installation) steps if you haven't already.

@@ -190,15 +190,15 @@ ADD COLUMN context TEXT GENERATED ALWAYS AS (product_name || ': ' || description
```

Initialize the RAG project.
We'll use the `sentence-transformers/all-MiniLM-L6-v2` model to generate embeddings on our source documents.
We'll use the `openai/text-embedding-3-small` model to generate embeddings on our source documents.

```sql
SELECT vectorize.init_rag(
agent_name => 'product_chat',
table_name => 'products',
"column" => 'context',
unique_record_id => 'product_id',
transformer => 'sentence-transformers/all-MiniLM-L6-v2'
transformer => 'openai/text-embedding-3-small'
);
```

@@ -244,8 +244,8 @@ Alternatively, `schedule => 'realtime'` creates triggers on the source table and
Statements below will result in new embeddings being generated either immediately (`schedule => 'realtime'`) or within the cron schedule set in the `schedule` parameter.

```sql
INSERT INTO products (product_id, product_name, description)
VALUES (12345, 'pizza', 'dish of Italian origin consisting of a flattened disk of bread');
INSERT INTO products (product_id, product_name, description, product_category, price)
VALUES (12345, 'pizza', 'dish of Italian origin consisting of a flattened disk of bread', 'food', 5.99);

UPDATE products
SET description = 'sling made of fabric, rope, or netting, suspended between two or more points, used for swinging, sleeping, or resting'
@@ -286,3 +286,11 @@ select vectorize.encode(
{0.0028769304,-0.005826319,-0.0035932811, ...}
```
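The vector returned by `vectorize.encode` is what `vectorize.search` compares against the stored embeddings; pgvector supports several distance measures, cosine among them. As an illustrative sketch only — the vectors below are made up, not real model output — cosine similarity between a query embedding and two candidate embeddings can be computed like this:

```python
import math

def cosine_similarity(a, b):
    # Cosine similarity: dot(a, b) / (|a| * |b|), in [-1, 1] for nonzero vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

query = [0.1, 0.3, 0.5]
doc_a = [0.1, 0.29, 0.51]  # nearly parallel to the query
doc_b = [0.5, -0.3, 0.1]   # points in a different direction

# doc_a ranks above doc_b for this query:
print(cosine_similarity(query, doc_a) > cosine_similarity(query, doc_b))  # True
```

Ranking rows by this score (or equivalently by cosine distance, `1 - similarity`) is the essence of what a vector search returns as its top `num_results` matches.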

## Contributing

We welcome contributions from the community! If you're interested in contributing to `pg_vectorize`, please check out our [Contributing Guide](CONTRIBUTING.md). Your contributions help make this project better for everyone.

## Community Support

If you encounter any issues or have any questions, feel free to join our [Tembo Community Slack](https://join.slack.com/t/tembocommunity/shared_invite/zt-2u3ctm86u-XzcyL76T7o~7Mpnt6KUx1g). We're here to help!

2 changes: 1 addition & 1 deletion core/src/transformers/providers/cohere.rs
@@ -132,7 +132,7 @@ mod integration_tests {
);
assert!(
embeddings.embeddings[0].len() == 384,
"Embeddings should have length 384"
"Embeddings should have dimension 384"
);
}
}
4 changes: 4 additions & 0 deletions core/src/transformers/providers/mod.rs
@@ -3,6 +3,7 @@ pub mod ollama;
pub mod openai;
pub mod portkey;
pub mod vector_serve;
pub mod voyage;

use anyhow::Result;
use async_trait::async_trait;
@@ -66,6 +67,9 @@ pub fn get_provider(
api_key,
virtual_key,
))),
ModelSource::Voyage => Ok(Box::new(providers::voyage::VoyageProvider::new(
url, api_key,
))),
ModelSource::SentenceTransformers => Ok(Box::new(
providers::vector_serve::VectorServeProvider::new(url, api_key),
)),
2 changes: 1 addition & 1 deletion core/src/transformers/providers/openai.rs
@@ -202,7 +202,7 @@ mod integration_tests {
);
assert!(
embeddings.embeddings[0].len() == 1536,
"Embeddings should have length 1536"
"Embeddings should have dimension 1536"
);
}
}
2 changes: 1 addition & 1 deletion core/src/transformers/providers/portkey.rs
@@ -154,7 +154,7 @@ mod portkey_integration_tests {
);
assert!(
embeddings.embeddings[0].len() == 1536,
"Embeddings should have length 1536"
"Embeddings should have dimension 1536"
);

let dim = provider.model_dim("text-embedding-ada-002").await.unwrap();
