diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5934fb62..0f9e0292 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @ChuckHend @shhnwz @jasonmp85 +* @ChuckHend diff --git a/.github/workflows/extension_ci.yml b/.github/workflows/extension_ci.yml index 97100c73..166a7617 100644 --- a/.github/workflows/extension_ci.yml +++ b/.github/workflows/extension_ci.yml @@ -75,12 +75,9 @@ jobs: - name: Clippy run: cargo clippy - test: - name: Run tests - needs: dependencies + test-core: runs-on: ubuntu-24.04 services: - # Label used to access the service container vector-serve: image: quay.io/tembo/vector-serve:latest ports: @@ -99,10 +96,7 @@ jobs: /home/runner/.pgrx - name: Install sys dependencies run: | - sudo apt-get update && sudo apt-get install -y postgresql-server-dev-16 libopenblas-dev libreadline-dev - - uses: ./.github/actions/pgx-init - with: - working-directory: ./extension + sudo apt-get update && sudo apt-get install -y postgresql postgresql-contrib libopenblas-dev libreadline-dev - name: Test Core env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -112,6 +106,35 @@ jobs: VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }} run: | cd ../core && cargo test + + test: + name: Run tests + needs: dependencies + runs-on: ubuntu-24.04 + services: + # Label used to access the service container + vector-serve: + image: quay.io/tembo/vector-serve:latest + ports: + - 3000:3000 + steps: + - uses: actions/checkout@v4 + - name: Install Rust stable toolchain + uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + with: + prefix-key: "extension-test" + workspaces: | + vectorize + # Additional directories to cache + cache-directories: | + /home/runner/.pgrx + - name: Install sys dependencies + run: | + sudo apt-get update && sudo apt-get install -y postgresql-server-dev-16 libopenblas-dev libreadline-dev + - uses: ./.github/actions/pgx-init + with: + working-directory: ./extension - name: Restore cached binaries uses: 
actions/cache@v2 with: @@ -144,7 +167,7 @@ jobs: runs-on: ubuntu-24.04 strategy: matrix: - pg-version: [14, 15, 16] + pg-version: [14, 15, 16, 17] steps: - uses: actions/checkout@v2 - name: Install Rust stable toolchain diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c83bbbab..11a22f58 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,7 @@ # Contributing to pg_vectorize +If you encounter any issues or have questions, feel free to join the [Tembo Community Slack](https://join.slack.com/t/tembocommunity/shared_invite/zt-2u3ctm86u-XzcyL76T7o~7Mpnt6KUx1g) for support. + ## Prerequisites - [Rust](https://www.rust-lang.org/learn/get-started) - Toolchain including `rustc`, `cargo`, and `rustfmt` @@ -32,52 +34,9 @@ Confirm a successful set up by running the following: docker ps ``` -:wrench: Note: Consider running the following to see the container logs real time: - -```bash -docker logs -f -``` - ### 3. Clone and compile `pg_vectorize` and extension dependencies -:wrench: When progressing through these steps, refer to the following for troubleshooting: - -```bash -cat ~/.pgrx/15.log -``` - -#### 3.1. Apply configurations - -Prior to compiling and running `pg_vectorize`, it's essential to update the `postgresql.conf` file. -`pgrx` uses a Postgres version-specific data directory, each containing its own `postgresql.conf` file. -The following example, utilizes Postgres version 15. 
-If you're using a different version, please alter the file path value `data-` and run the following: - -```bash - ~/.pgrx/data-15/postgresql.conf -``` - -Within this document, add the following: - -```text -shared_preload_libraries = 'pg_cron, vectorize' -cron.database_name = 'postgres' -vectorize.embedding_service_url = 'http://localhost:3000/v1/embeddings' -``` - -:wrench: Note: If your machine is running a MacOS, you may need to apply the following configurations to Cargo's config file: - -``` - ~/.cargo/config -``` - -```text -[target.'cfg(target_os="macos")'] -# Postgres symbols won't be available until runtime -rustflags = ["-Clink-arg=-Wl,-undefined,dynamic_lookup"] -``` - -#### 3.2. Clone and enter directory +#### 3.1. Clone and enter directory ```bash git clone https://github.com/tembo-io/pg_vectorize.git @@ -85,7 +44,7 @@ git clone https://github.com/tembo-io/pg_vectorize.git cd pg_vectorize/extension ``` -#### 3.3. Install dependencies +#### 3.2. Install dependencies From within the pg_vectorize/extension directory, run the following, which will install `pg_cron`, `pgmq`, and `pgvector`: @@ -93,7 +52,7 @@ From within the pg_vectorize/extension directory, run the following, which will make setup ``` -#### 3.4. Compile and run `pg_vectorize` +#### 3.3. Compile and run `pg_vectorize` ```bash make run @@ -124,61 +83,29 @@ To list out the enabled extensions, run: pgmq | 1.1.1 | pgmq | A lightweight message queue. Like AWS SQS and RSMQ but on Postgres. 
 plpgsql | 1.0 | pg_catalog | PL/pgSQL procedural language vector | 0.6.0 | public | vector data type and ivfflat and hnsw access methods - vectorize | 0.10.1 | vectorize | The simplest way to do vector search on Postgres + vectorize | 0.19.0 | vectorize | The simplest way to do vector search on Postgres (6 rows) ``` #### 4.2 Confirm embedding service url is set to localhost -In section 3.1., we set the following postgresql.conf variable: - -```text -vectorize.embedding_service_url = 'http://localhost:3000/v1/embeddings' -``` - -To confirm its success, run the following SHOW command: - -```sql -SHOW vectorize.embedding_service_url; -``` -```text - vectorize.embedding_service_url -------------------------------------- - http://localhost:3000/v1/embeddings -(1 row) -``` - -Say, for example, instead of local host, `vector-serve:3000` was the target? -Should you desire to change this from within Postgre, simply run: - -``` -ALTER SYSTEM SET vectorize.embedding_service_url TO 'http://localhost:3000/v1/embeddings'; -``` - -Making changes such as this requires the following to be run: - -```sql -SELECT pg_reload_conf(); -``` - -Running the earlier SHOW command should reveal the appropriate change: +Run the following SHOW command to confirm that the url is set to `localhost`: ```sql SHOW vectorize.embedding_service_url; ``` - ```text vectorize.embedding_service_url ------------------------------------- - http://localhost:3000/v1/embeddings + http://localhost:3000/v1 (1 row) ``` #### 4.3. Load example data -The following can be found within the this project's README, under [Hugging Face Example](https://github.com/tembo-io/pg_vectorize/blob/main/README.md#hugging-face-example). +The following can be found within this project's README, under [Vector Search Example](https://github.com/tembo-io/pg_vectorize/blob/main/README.md#vector-search-example). -Begin by creating a `producs` table with the dataset that comes included with `pg_vectorize`. 
+Begin by creating a `products` table with the dataset that comes included with `pg_vectorize`. ```sql CREATE TABLE products (LIKE vectorize.example_products INCLUDING ALL); @@ -236,9 +163,17 @@ num_results => 3 ### 5. Local URL -Once all of the following is complete, you should be able to visit the `Tembo-Embedding-Service` at [http://localhost:3000/docs](http://localhost:3000/docs) and explore. +Once all of the above is complete, you should be able to access the Swagger UI for `Tembo-Embedding-Service` at [http://localhost:3000/docs](http://localhost:3000/docs) and explore. This is a platform that allows, for example, the input of [different sentence-transformers models](https://huggingface.co/models?sort=trending&search=sentence-transformers) from Hugging Face. +## Troubleshooting + +To check `pgrx` logs for debugging: + +```bash +cat ~/.pgrx/17.log +``` + # Releases `pg_vectorize` releases are automated through a [Github workflow](https://github.com/tembo-io/pg_vectorize/blob/main/.github/workflows/extension_ci.yml). 
diff --git a/README.md b/README.md index 26f3d30c..d40fb6fb 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ pg_vectorize powers the [VectorDB Stack](https://tembo.io/docs/product/stacks/ai ## Features - Workflows for both vector search and RAG -- Integrations with OpenAI's [embeddings](https://platform.openai.com/docs/guides/embeddings) and [chat-completion](https://platform.openai.com/docs/guides/text-generation) endpoints and a self-hosted container for running [Hugging Face Sentence-Transformers](https://huggingface.co/sentence-transformers) +- Integrations with OpenAI's [embeddings](https://platform.openai.com/docs/guides/embeddings) and [Text-Generation](https://platform.openai.com/docs/guides/text-generation) endpoints and a self-hosted container for running [Hugging Face Sentence-Transformers](https://huggingface.co/sentence-transformers) - Automated creation of Postgres triggers to keep your embeddings up to date - High level API - one function to initialize embeddings transformations, and another function to search @@ -88,14 +88,14 @@ Then set the following either in postgresql.conf or as a configuration parameter ```sql -- requires restart of Postgres alter system set shared_preload_libraries = 'vectorize,pg_cron'; -alter system set cron.database_name = 'postgres' +alter system set cron.database_name = 'postgres'; ``` And if you're running the vector-serve container, set the following url as a configuration parameter in Postgres. The host may need to change from `localhost` to something else depending on where you are running the container. ```sql -alter system set vectorize.embedding_service_url = 'http://localhost:3000/v1/embeddings' +alter system set vectorize.embedding_service_url = 'http://localhost:3000/v1'; SELECT pg_reload_conf(); ``` @@ -104,7 +104,7 @@ SELECT pg_reload_conf(); ## Vector Search Example -Text-to-embedding transformation can be done with either Hugging Face's Sentence-Transformers or OpenAI's embeddings. 
The following examples use Hugging Face's Sentence-Transformers. See the project [documentation](https://tembo-io.github.io/pg_vectorize/) for OpenAI examples. +Text-to-embedding transformation can be done with either Hugging Face's Sentence-Transformers or OpenAI's embeddings. The following examples use Hugging Face's Sentence-Transformers. See the project [documentation](https://tembo.io/pg_vectorize/examples/openai_embeddings/) for OpenAI examples. Follow the [installation](#installation) steps if you haven't already. @@ -190,7 +190,7 @@ ADD COLUMN context TEXT GENERATED ALWAYS AS (product_name || ': ' || description ``` Initialize the RAG project. - We'll use the `sentence-transformers/all-MiniLM-L6-v2` model to generate embeddings on our source documents. + We'll use the `openai/text-embedding-3-small` model to generate embeddings on our source documents. ```sql SELECT vectorize.init_rag( @@ -198,7 +198,7 @@ SELECT vectorize.init_rag( table_name => 'products', "column" => 'context', unique_record_id => 'product_id', - transformer => 'sentence-transformers/all-MiniLM-L6-v2' + transformer => 'openai/text-embedding-3-small' ); ``` @@ -286,3 +286,11 @@ select vectorize.encode( {0.0028769304,-0.005826319,-0.0035932811, ...} ``` +## Contributing + +We welcome contributions from the community! If you're interested in contributing to `pg_vectorize`, please check out our [Contributing Guide](CONTRIBUTING.md). Your contributions help make this project better for everyone. + +## Community Support + +If you encounter any issues or have any questions, feel free to join our [Tembo Community Slack](https://join.slack.com/t/tembocommunity/shared_invite/zt-2u3ctm86u-XzcyL76T7o~7Mpnt6KUx1g). We're here to help! 
+
diff --git a/extension/sql/meta.sql b/extension/sql/meta.sql
index 0570668f..322a67f1 100644
--- a/extension/sql/meta.sql
+++ b/extension/sql/meta.sql
@@ -20,6 +20,35 @@ GRANT SELECT ON ALL SEQUENCES IN SCHEMA vectorize TO pg_monitor;
 ALTER DEFAULT PRIVILEGES IN SCHEMA vectorize GRANT SELECT ON TABLES TO pg_monitor;
 ALTER DEFAULT PRIVILEGES IN SCHEMA vectorize GRANT SELECT ON SEQUENCES TO pg_monitor;
 
+-- Event trigger function: when tables are dropped, delete the vectorize jobs
+-- whose params reference them, so no job is left pointing at a missing table.
+CREATE OR REPLACE FUNCTION handle_table_drop()
+RETURNS event_trigger AS $$
+DECLARE
+    dropped RECORD;
+BEGIN
+    -- schema_name and object_name are the unquoted catalog names, whereas
+    -- object_identity is quoted for display (e.g. public."My.Table"), so
+    -- splitting it on '.' mis-parses quoted or dotted identifiers.
+    FOR dropped IN
+        SELECT d.schema_name, d.object_name
+        FROM pg_event_trigger_dropped_objects() AS d
+        WHERE d.object_type = 'table'
+    LOOP
+        DELETE FROM vectorize.job
+        WHERE params ->> 'table' = dropped.object_name
+          AND params ->> 'schema' = dropped.schema_name;
+    END LOOP;
+END;
+$$ LANGUAGE plpgsql;
+
+DROP EVENT TRIGGER IF EXISTS vectorize_job_drop_trigger;
+
+-- Run the cleanup whenever a DROP TABLE command drops objects.
+CREATE EVENT TRIGGER vectorize_job_drop_trigger
+ON sql_drop
+WHEN TAG IN ('DROP TABLE')
+EXECUTE FUNCTION handle_table_drop();
 
 INSERT INTO vectorize.prompts (prompt_type, sys_prompt, user_prompt)
 VALUES (
diff --git a/extension/tests/integration_tests.rs b/extension/tests/integration_tests.rs
index 6361682c..8828b59b 100644
--- a/extension/tests/integration_tests.rs
+++ b/extension/tests/integration_tests.rs
@@ -860,3 +860,80 @@ async fn test_cohere() {
         .unwrap();
     assert_eq!(search_results.len(), 3);
 }
+
+/// Dropping a table must delete the `vectorize.job` rows registered for it and
+/// must leave jobs registered for other tables in place.
+///
+/// Every assertion is scoped to this test's randomly named table, so jobs left
+/// in `vectorize.job` by other tests sharing the database cannot affect it.
+#[ignore]
+#[tokio::test]
+async fn test_event_trigger_on_table_drop() {
+    // Selects the job rows registered for one (table, schema) pair.
+    const JOBS_FOR_TABLE: &str =
+        "SELECT * FROM vectorize.job WHERE params->>'table' = $1 AND params->>'schema' = $2";
+
+    let conn = common::init_database().await;
+    let mut rng = rand::thread_rng();
+    let test_num = rng.gen_range(1..100000);
+    let test_table_name = format!("products_test_{}", test_num);
+    let unrelated_table_name = format!("unrelated_test_{}", test_num);
+    let job_name = format!("job_{}", test_num);
+
+    // Initialize the test table and job
+    common::init_test_table(&test_table_name, &conn).await;
+    common::init_embedding_svc_url(&conn).await;
+
+    sqlx::query(&format!(
+        "SELECT vectorize.table(
+        job_name => '{job_name}',
+        \"table\" => '{test_table_name}',
+        primary_key => 'product_id',
+        columns => ARRAY['product_name'],
+        transformer => 'sentence-transformers/all-MiniLM-L6-v2'
+    );"
+    ))
+    .execute(&conn)
+    .await
+    .expect("failed to initialize vectorize job");
+
+    // The job must exist before anything is dropped
+    let jobs_before = sqlx::query(JOBS_FOR_TABLE)
+        .bind(test_table_name.as_str())
+        .bind("public")
+        .fetch_all(&conn)
+        .await
+        .expect("Failed to fetch job");
+    assert_eq!(jobs_before.len(), 1, "Job entry was not created");
+
+    // Dropping a table without a job must leave the job above untouched
+    common::init_test_table(&unrelated_table_name, &conn).await;
+    sqlx::query(&format!("DROP TABLE {unrelated_table_name};"))
+        .execute(&conn)
+        .await
+        .expect("Failed to drop the unrelated test table");
+    let jobs_after_unrelated_drop = sqlx::query(JOBS_FOR_TABLE)
+        .bind(test_table_name.as_str())
+        .bind("public")
+        .fetch_all(&conn)
+        .await
+        .expect("Failed to fetch job");
+    assert_eq!(
+        jobs_after_unrelated_drop.len(),
+        1,
+        "vectorize.job should remain unaffected by unrelated table drops"
+    );
+
+    // Dropping the vectorized table must remove its job
+    sqlx::query(&format!("DROP TABLE {test_table_name} CASCADE;"))
+        .execute(&conn)
+        .await
+        .expect("Failed to drop the test table");
+    let jobs_after_drop = sqlx::query(JOBS_FOR_TABLE)
+        .bind(test_table_name.as_str())
+        .bind("public")
+        .fetch_all(&conn)
+        .await
+        .expect("Failed to fetch job");
+    assert!(jobs_after_drop.is_empty(), "Job entry was not removed after table drop");
+}