Skip to content

Commit

Permalink
feat: prefilter (#47)
Browse files Browse the repository at this point in the history
* feat: prefilter

Signed-off-by: usamoi <[email protected]>

* docs: fix outdated

Signed-off-by: usamoi <[email protected]>

* fix: ci lint does not setup pgrx

Signed-off-by: usamoi <[email protected]>

* fix: broken test

Signed-off-by: usamoi <[email protected]>

* docs: add cast and fix cosine distance definition

Signed-off-by: usamoi <[email protected]>

* feat: remove GUC

Signed-off-by: usamoi <[email protected]>

---------

Signed-off-by: usamoi <[email protected]>
Co-authored-by: Jinjing Zhou <[email protected]>
  • Loading branch information
usamoi and VoVAllen authored Aug 15, 2023
1 parent 46d133f commit 4ba1360
Show file tree
Hide file tree
Showing 66 changed files with 4,491 additions and 3,017 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,20 @@ env:

jobs:
lint:
strategy:
matrix:
version: [15]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Prepare
run: |
sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt-get update
sudo apt-get -y install libpq-dev postgresql-${{ matrix.version }} postgresql-server-dev-${{ matrix.version }}
cargo install cargo-pgrx --git https://github.com/tensorchord/pgrx.git --rev $(cat Cargo.toml | grep "pgrx =" | awk -F'rev = "' '{print $2}' | cut -d'"' -f1)
cargo pgrx init --pg${{ matrix.version }}=/usr/lib/postgresql/${{ matrix.version }}/bin/pg_config
- name: Format check
run: cargo fmt --check
- name: Semantic check
Expand Down
8 changes: 2 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "vectors"
version = "0.0.0"
version = "0.1.1"
edition = "2021"

[lib]
Expand All @@ -18,7 +18,7 @@ pg_test = []

[dependencies]
pgrx = { git = "https://github.com/tensorchord/pgrx.git", rev = "c0d11a8b78b0d707a5e9106bc4d5f66395ca9a2e" }
openai_api_rust = "0.1.8"
openai_api_rust = { git = "https://github.com/tensorchord/openai-api.git", rev = "228d54b6002e98257b3c81501a054942342f585f" }
static_assertions = "1.1.0"
libc = "~0.2"
serde = "1.0.163"
Expand All @@ -33,15 +33,11 @@ dashmap = "5.4.0"
parking_lot = "0.12.1"
memoffset = "0.9.0"
serde_json = "1"
tokio = { version = "1", features = ["full"] }
thiserror = "1.0.40"
anyhow = { version = "1.0.71", features = ["backtrace"] }
async-channel = "1.8.0"
tempfile = "3.6.0"
cstr = "0.2.11"
arrayvec = { version = "0.7.3", features = ["serde"] }
memmap2 = "0.7.0"
tokio-stream = { version = "0.1.14", features = ["fs"] }
validator = { version = "0.16.1", features = ["derive"] }
toml = "0.7.6"

Expand Down
40 changes: 21 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,18 @@ You can then populate the table with vector data as follows.

INSERT INTO items (embedding)
VALUES ('[1,2,3]'), ('[4,5,6]');

-- or insert values using a casting from array to vector

INSERT INTO items (embedding)
VALUES (ARRAY[1, 2, 3]::real[]), (ARRAY[4, 5, 6]::real[]);
```

We support three operators to calculate the distance between two vectors.

- `<->`: squared Euclidean distance, defined as $\Sigma (x_i - y_i) ^ 2$.
- `<#>`: negative dot product distance, defined as $- \Sigma x_iy_i$.
- `<=>`: negative squared cosine distance, defined as $- \frac{(\Sigma x_iy_i)^2}{\Sigma x_i^2 \Sigma y_i^2}$.
- `<=>`: negative cosine distance, defined as $- \frac{\Sigma x_iy_i}{\sqrt{\Sigma x_i^2 \Sigma y_i^2}}$.

```sql
-- call the distance function through operators
Expand Down Expand Up @@ -142,23 +147,21 @@ You can create an index, using squared Euclidean distance with the following SQL
CREATE INDEX ON items USING vectors (embedding l2_ops)
WITH (options = $$
capacity = 2097152
size_ram = 4294967296
storage_vectors = "ram"
[vectors]
memmap = "ram"
[algorithm.hnsw]
storage = "ram"
m = 32
ef = 256
memmap = "ram"
$$);

--- Or using IVFFlat algorithm.

CREATE INDEX ON items USING vectors (embedding l2_ops)
WITH (options = $$
capacity = 2097152
size_ram = 2147483648
storage_vectors = "ram"
[vectors]
memmap = "ram"
[algorithm.ivf]
storage = "ram"
memmap = "ram"
nlist = 1000
nprobe = 10
$$);
Expand Down Expand Up @@ -203,15 +206,14 @@ We utilize TOML syntax to express the index's configuration. Here's what each ke
| Key | Type | Description |
| ---------------------- | ------- | --------------------------------------------------------------------------------------------------------------------- |
| capacity | integer | The index's capacity. The value should be greater than the number of rows in your table. |
| size_ram | integer | (Optional) The maximum amount of memory the persisent part of index can occupy. |
| size_disk | integer | (Optional) The maximum amount of disk-backed memory-mapped file size the persisent part of index can occupy. |
| storage_vectors | string | `ram` ensures that the vectors always stays in memory while `disk` suggests otherwise. |
| vectors | table | Configuration of background process vector storage. |
| vectors.memmap | string | (Optional) `ram` ensures that the vectors always stay in memory while `disk` suggests otherwise. |
| algorithm.ivf | table | If this table is set, the IVF algorithm will be used for the index. |
| algorithm.ivf.storage | string | (Optional) `ram` ensures that the persisent part of algorithm always stays in memory while `disk` suggests otherwise. |
| algorithm.ivf.nlist | integer | (Optional) Number of cluster units. |
| algorithm.ivf.nprobe | integer | (Optional) Number of units to query. |
| algorithm.ivf.memmap | string | (Optional) `ram` ensures that the persistent part of algorithm always stays in memory while `disk` suggests otherwise. |
| algorithm.ivf.nlist | integer | Number of cluster units. |
| algorithm.ivf.nprobe | integer | Number of units to query. |
| algorithm.hnsw | table | If this table is set, the HNSW algorithm will be used for the index. |
| algorithm.hnsw.storage | string | (Optional) `ram` ensures that the persisent part of algorithm always stays in memory while `disk` suggests otherwise. |
| algorithm.hnsw.memmap | string | (Optional) `ram` ensures that the persistent part of algorithm always stays in memory while `disk` suggests otherwise. |
| algorithm.hnsw.m | integer | (Optional) Maximum degree of the node. |
| algorithm.hnsw.ef | integer | (Optional) Search scope in building. |

Expand All @@ -229,10 +231,10 @@ UPDATE documents SET embedding = ai_embedding_vector(content) WHERE length(embed
CREATE INDEX ON documents USING vectors (embedding l2_ops)
WITH (options = $$
capacity = 2097152
size_ram = 4294967296
storage_vectors = "ram"
[vectors]
memmap = "ram"
[algorithm.hnsw]
storage = "ram"
memmap = "ram"
m = 32
ef = 256
$$);
Expand Down
4 changes: 2 additions & 2 deletions rust-toolchain.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[toolchain]
channel = "nightly-2023-05-09"
components = ["rustfmt", "clippy"]
channel = "nightly-2023-08-03"
components = ["rustfmt", "clippy", "miri"]
targets = ["x86_64-unknown-linux-gnu"]
68 changes: 50 additions & 18 deletions src/algorithms/flat.rs
Original file line number Diff line number Diff line change
@@ -1,49 +1,81 @@
use crate::algorithms::Vectors;
use crate::memory::Address;
use super::utils::filtered_fixed_heap::FilteredFixedHeap;
use super::Algo;
use crate::bgworker::index::IndexOptions;
use crate::bgworker::storage::Storage;
use crate::bgworker::storage::StoragePreallocator;
use crate::bgworker::vectors::Vectors;
use crate::prelude::*;
use crate::utils::fixed_heap::FixedHeap;
use serde::{Deserialize, Serialize};
use std::marker::PhantomData;
use std::sync::Arc;
use thiserror::Error;

#[derive(Debug, Clone, Error, Serialize, Deserialize)]
pub enum FlatError {
//
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlatOptions {}

pub struct Flat {
distance: Distance,
pub struct Flat<D: DistanceFamily> {
vectors: Arc<Vectors>,
_maker: PhantomData<D>,
}

impl Algorithm for Flat {
type Options = FlatOptions;
impl<D: DistanceFamily> Algo for Flat<D> {
type Error = FlatError;

fn build(options: Options, vectors: Arc<Vectors>, _: usize) -> anyhow::Result<Self> {
type Save = ();

fn prebuild(_: &mut StoragePreallocator, _: IndexOptions) -> Result<(), Self::Error> {
Ok(())
}

fn build(
_: &mut Storage,
_: IndexOptions,
vectors: Arc<Vectors>,
_: usize,
) -> Result<Self, FlatError> {
Ok(Self {
distance: options.distance,
vectors,
_maker: PhantomData,
})
}

fn address(&self) -> Address {
Address::DANGLING
}
fn save(&self) {}

fn load(options: Options, vectors: Arc<Vectors>, _: Address) -> anyhow::Result<Self> {
fn load(
_: &mut Storage,
_: IndexOptions,
vectors: Arc<Vectors>,
_: (),
) -> Result<Self, FlatError> {
Ok(Self {
distance: options.distance,
vectors,
_maker: PhantomData,
})
}

fn insert(&self, _: usize) -> anyhow::Result<()> {
fn insert(&self, _: usize) -> Result<(), FlatError> {
Ok(())
}

fn search(&self, (vector, k): (Box<[Scalar]>, usize)) -> anyhow::Result<Vec<(Scalar, u64)>> {
let mut result = FixedHeap::<(Scalar, u64)>::new(k);
fn search<F>(
&self,
target: Box<[Scalar]>,
k: usize,
filter: F,
) -> Result<Vec<(Scalar, u64)>, FlatError>
where
F: FnMut(u64) -> bool,
{
let mut result = FilteredFixedHeap::new(k, filter);
for i in 0..self.vectors.len() {
let this_vector = self.vectors.get_vector(i);
let this_data = self.vectors.get_data(i);
let dis = self.distance.distance(&vector, this_vector);
let dis = D::distance(&target, this_vector);
result.push((dis, this_data));
}
Ok(result.into_sorted_vec())
Expand Down
126 changes: 126 additions & 0 deletions src/algorithms/flat_q.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
use super::impls::quantization::*;
use super::utils::filtered_fixed_heap::FilteredFixedHeap;
use super::Algo;
use crate::bgworker::index::IndexOptions;
use crate::bgworker::storage::Storage;
use crate::bgworker::storage::StoragePreallocator;
use crate::bgworker::vectors::Vectors;
use crate::prelude::*;
use serde::{Deserialize, Serialize};
use std::marker::PhantomData;
use std::sync::Arc;
use thiserror::Error;

/// Errors that can occur while building, loading, or querying a
/// quantized flat index.
#[derive(Debug, Clone, Error, Serialize, Deserialize)]
pub enum FlatQError {
    /// Failure propagated from the underlying quantization implementation.
    #[error("Quantization {0}")]
    Quantization(#[from] QuantizationError),
}

/// User-supplied options for the quantized flat ("flat_q") algorithm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlatQOptions {
    /// Whether the quantization data lives in RAM or in a disk-backed
    /// memory map.
    pub memmap: Memmap,
    /// Sample size forwarded to `QuantizationImpl::new` at build time —
    /// presumably the number of vectors sampled to train the quantizer;
    /// confirm against `QuantizationImpl`.
    pub sample_size: usize,
}

/// Brute-force (flat) index that scans quantized vectors.
pub struct FlatQ<D: DistanceFamily, Q: Quantization> {
    /// Shared handle to the raw vector storage; used for the element
    /// count and the per-vector payload data.
    vectors: Arc<Vectors>,
    /// Quantization backend that stores compressed vectors and computes
    /// distances between them.
    implementation: QuantizationImpl<Q>,
    /// Zero-sized marker tying this index to the distance family `D`.
    _maker: PhantomData<D>,
}

impl<D: DistanceFamily, Q: Quantization> Algo for FlatQ<D, Q> {
    type Error = FlatQError;

    /// Persisted state is the quantizer itself.
    type Save = Q;

    /// Preallocates storage for the quantization data before the index
    /// is built.
    fn prebuild(
        storage: &mut StoragePreallocator,
        options: IndexOptions,
    ) -> Result<(), Self::Error> {
        let flat_q_options = options.algorithm.clone().unwrap_flat_q();
        QuantizationImpl::<Q>::prebuild(
            storage,
            options.dims,
            options.capacity,
            flat_q_options.memmap,
        )?;
        Ok(())
    }

    /// Builds the index by constructing the quantization backend over the
    /// first `n` vectors.
    fn build(
        storage: &mut Storage,
        options: IndexOptions,
        vectors: Arc<Vectors>,
        n: usize,
    ) -> Result<Self, FlatQError> {
        let flat_q_options = options.algorithm.clone().unwrap_flat_q();
        let implementation = QuantizationImpl::new(
            storage,
            vectors.clone(),
            options.dims,
            n,
            flat_q_options.sample_size,
            options.capacity,
            flat_q_options.memmap,
        )?;
        Ok(Self {
            vectors,
            implementation,
            _maker: PhantomData,
        })
    }

    /// Returns the quantizer state to be persisted.
    fn save(&self) -> Q {
        self.implementation.save()
    }

    /// Restores the index from a previously saved quantizer.
    fn load(
        storage: &mut Storage,
        options: IndexOptions,
        vectors: Arc<Vectors>,
        save: Q,
    ) -> Result<Self, FlatQError> {
        let flat_q_options = options.algorithm.clone().unwrap_flat_q();
        Ok(Self {
            vectors: vectors.clone(),
            implementation: QuantizationImpl::load(
                storage,
                vectors,
                save,
                options.capacity,
                flat_q_options.memmap,
            )?,
            _maker: PhantomData,
        })
    }

    /// Inserts the vector at index `x` into the quantization backend.
    fn insert(&self, x: usize) -> Result<(), FlatQError> {
        self.implementation.insert(x)?;
        Ok(())
    }

    /// Exhaustively scans all stored vectors, keeping the `k` nearest
    /// candidates that pass `filter`, sorted by ascending distance.
    fn search<F>(
        &self,
        target: Box<[Scalar]>,
        k: usize,
        filter: F,
    ) -> Result<Vec<(Scalar, u64)>, FlatQError>
    where
        F: FnMut(u64) -> bool,
    {
        let mut result = FilteredFixedHeap::new(k, filter);
        // Quantize the query once, then compare it against every stored
        // (quantized) vector.
        let vector = self.implementation.process(&target);
        for i in 0..self.vectors.len() {
            let this_vector = self.implementation.get_vector(i);
            let this_data = self.vectors.get_data(i);
            let dis = self.implementation.distance(&vector, this_vector);
            result.push((dis, this_data));
        }
        // `into_sorted_vec` already yields the `(distance, data)` pairs in
        // order; returning it directly avoids the previous redundant
        // element-by-element copy into a second Vec (and matches the
        // sibling `Flat::search` implementation).
        Ok(result.into_sorted_vec())
    }
}
Loading

0 comments on commit 4ba1360

Please sign in to comment.