Skip to content

Commit

Permalink
feat: prefilter (#47)
Browse files Browse the repository at this point in the history
* feat: prefilter

Signed-off-by: usamoi <[email protected]>

* docs: fix outdated

Signed-off-by: usamoi <[email protected]>

* fix: ci lint does not setup pgrx

Signed-off-by: usamoi <[email protected]>

* fix: broken test

Signed-off-by: usamoi <[email protected]>

* docs: add cast and fix cosine distance definition

Signed-off-by: usamoi <[email protected]>

* feat: remove GUC

Signed-off-by: usamoi <[email protected]>

---------

Signed-off-by: usamoi <[email protected]>
Co-authored-by: Jinjing Zhou <[email protected]>
  • Loading branch information
usamoi and VoVAllen authored Aug 15, 2023
1 parent 46d133f commit 4ba1360
Show file tree
Hide file tree
Showing 66 changed files with 4,491 additions and 3,017 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,20 @@ env:

jobs:
lint:
strategy:
matrix:
version: [15]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Prepare
run: |
sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt-get update
sudo apt-get -y install libpq-dev postgresql-${{ matrix.version }} postgresql-server-dev-${{ matrix.version }}
cargo install cargo-pgrx --git https://github.com/tensorchord/pgrx.git --rev $(cat Cargo.toml | grep "pgrx =" | awk -F'rev = "' '{print $2}' | cut -d'"' -f1)
cargo pgrx init --pg${{ matrix.version }}=/usr/lib/postgresql/${{ matrix.version }}/bin/pg_config
- name: Format check
run: cargo fmt --check
- name: Semantic check
Expand Down
8 changes: 2 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "vectors"
version = "0.0.0"
version = "0.1.1"
edition = "2021"

[lib]
Expand All @@ -18,7 +18,7 @@ pg_test = []

[dependencies]
pgrx = { git = "https://github.com/tensorchord/pgrx.git", rev = "c0d11a8b78b0d707a5e9106bc4d5f66395ca9a2e" }
openai_api_rust = "0.1.8"
openai_api_rust = { git = "https://github.com/tensorchord/openai-api.git", rev = "228d54b6002e98257b3c81501a054942342f585f" }
static_assertions = "1.1.0"
libc = "~0.2"
serde = "1.0.163"
Expand All @@ -33,15 +33,11 @@ dashmap = "5.4.0"
parking_lot = "0.12.1"
memoffset = "0.9.0"
serde_json = "1"
tokio = { version = "1", features = ["full"] }
thiserror = "1.0.40"
anyhow = { version = "1.0.71", features = ["backtrace"] }
async-channel = "1.8.0"
tempfile = "3.6.0"
cstr = "0.2.11"
arrayvec = { version = "0.7.3", features = ["serde"] }
memmap2 = "0.7.0"
tokio-stream = { version = "0.1.14", features = ["fs"] }
validator = { version = "0.16.1", features = ["derive"] }
toml = "0.7.6"

Expand Down
40 changes: 21 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,18 @@ You can then populate the table with vector data as follows.

INSERT INTO items (embedding)
VALUES ('[1,2,3]'), ('[4,5,6]');

-- or insert values using a casting from array to vector

INSERT INTO items (embedding)
VALUES (ARRAY[1, 2, 3]::real[]), (ARRAY[4, 5, 6]::real[]);
```

We support three operators to calculate the distance between two vectors.

- `<->`: squared Euclidean distance, defined as $\Sigma (x_i - y_i) ^ 2$.
- `<#>`: negative dot product distance, defined as $- \Sigma x_iy_i$.
- `<=>`: negative squared cosine distance, defined as $- \frac{(\Sigma x_iy_i)^2}{\Sigma x_i^2 \Sigma y_i^2}$.
- `<=>`: negative cosine distance, defined as $- \frac{\Sigma x_iy_i}{\sqrt{\Sigma x_i^2 \Sigma y_i^2}}$.

```sql
-- call the distance function through operators
Expand Down Expand Up @@ -142,23 +147,21 @@ You can create an index, using squared Euclidean distance with the following SQL
CREATE INDEX ON items USING vectors (embedding l2_ops)
WITH (options = $$
capacity = 2097152
size_ram = 4294967296
storage_vectors = "ram"
[vectors]
memmap = "ram"
[algorithm.hnsw]
storage = "ram"
m = 32
ef = 256
memmap = "ram"
$$);

--- Or using IVFFlat algorithm.

CREATE INDEX ON items USING vectors (embedding l2_ops)
WITH (options = $$
capacity = 2097152
size_ram = 2147483648
storage_vectors = "ram"
[vectors]
memmap = "ram"
[algorithm.ivf]
storage = "ram"
memmap = "ram"
nlist = 1000
nprobe = 10
$$);
Expand Down Expand Up @@ -203,15 +206,14 @@ We utilize TOML syntax to express the index's configuration. Here's what each ke
| Key | Type | Description |
| ---------------------- | ------- | --------------------------------------------------------------------------------------------------------------------- |
| capacity | integer | The index's capacity. The value should be greater than the number of rows in your table. |
| size_ram | integer | (Optional) The maximum amount of memory the persisent part of index can occupy. |
| size_disk | integer | (Optional) The maximum amount of disk-backed memory-mapped file size the persisent part of index can occupy. |
| storage_vectors | string | `ram` ensures that the vectors always stays in memory while `disk` suggests otherwise. |
| vectors | table | Configuration of background process vector storage. |
| vectors.memmap | string | (Optional) `ram` ensures that the vectors always stay in memory while `disk` suggests otherwise. |
| algorithm.ivf | table | If this table is set, the IVF algorithm will be used for the index. |
| algorithm.ivf.storage | string | (Optional) `ram` ensures that the persisent part of algorithm always stays in memory while `disk` suggests otherwise. |
| algorithm.ivf.nlist | integer | (Optional) Number of cluster units. |
| algorithm.ivf.nprobe | integer | (Optional) Number of units to query. |
| algorithm.ivf.memmap | string | (Optional) `ram` ensures that the persistent part of algorithm always stays in memory while `disk` suggests otherwise. |
| algorithm.ivf.nlist | integer | Number of cluster units. |
| algorithm.ivf.nprobe | integer | Number of units to query. |
| algorithm.hnsw | table | If this table is set, the HNSW algorithm will be used for the index. |
| algorithm.hnsw.storage | string | (Optional) `ram` ensures that the persisent part of algorithm always stays in memory while `disk` suggests otherwise. |
| algorithm.hnsw.memmap | string | (Optional) `ram` ensures that the persistent part of algorithm always stays in memory while `disk` suggests otherwise. |
| algorithm.hnsw.m | integer | (Optional) Maximum degree of the node. |
| algorithm.hnsw.ef | integer | (Optional) Search scope in building. |

Expand All @@ -229,10 +231,10 @@ UPDATE documents SET embedding = ai_embedding_vector(content) WHERE length(embed
CREATE INDEX ON documents USING vectors (embedding l2_ops)
WITH (options = $$
capacity = 2097152
size_ram = 4294967296
storage_vectors = "ram"
[vectors]
memmap = "ram"
[algorithm.hnsw]
storage = "ram"
memmap = "ram"
m = 32
ef = 256
$$);
Expand Down
4 changes: 2 additions & 2 deletions rust-toolchain.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[toolchain]
channel = "nightly-2023-05-09"
components = ["rustfmt", "clippy"]
channel = "nightly-2023-08-03"
components = ["rustfmt", "clippy", "miri"]
targets = ["x86_64-unknown-linux-gnu"]
68 changes: 50 additions & 18 deletions src/algorithms/flat.rs
Original file line number Diff line number Diff line change
@@ -1,49 +1,81 @@
use crate::algorithms::Vectors;
use crate::memory::Address;
use super::utils::filtered_fixed_heap::FilteredFixedHeap;
use super::Algo;
use crate::bgworker::index::IndexOptions;
use crate::bgworker::storage::Storage;
use crate::bgworker::storage::StoragePreallocator;
use crate::bgworker::vectors::Vectors;
use crate::prelude::*;
use crate::utils::fixed_heap::FixedHeap;
use serde::{Deserialize, Serialize};
use std::marker::PhantomData;
use std::sync::Arc;
use thiserror::Error;

#[derive(Debug, Clone, Error, Serialize, Deserialize)]
pub enum FlatError {
//
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlatOptions {}

pub struct Flat {
distance: Distance,
pub struct Flat<D: DistanceFamily> {
vectors: Arc<Vectors>,
_maker: PhantomData<D>,
}

impl Algorithm for Flat {
type Options = FlatOptions;
impl<D: DistanceFamily> Algo for Flat<D> {
type Error = FlatError;

fn build(options: Options, vectors: Arc<Vectors>, _: usize) -> anyhow::Result<Self> {
type Save = ();

fn prebuild(_: &mut StoragePreallocator, _: IndexOptions) -> Result<(), Self::Error> {
Ok(())
}

fn build(
_: &mut Storage,
_: IndexOptions,
vectors: Arc<Vectors>,
_: usize,
) -> Result<Self, FlatError> {
Ok(Self {
distance: options.distance,
vectors,
_maker: PhantomData,
})
}

fn address(&self) -> Address {
Address::DANGLING
}
fn save(&self) {}

fn load(options: Options, vectors: Arc<Vectors>, _: Address) -> anyhow::Result<Self> {
fn load(
_: &mut Storage,
_: IndexOptions,
vectors: Arc<Vectors>,
_: (),
) -> Result<Self, FlatError> {
Ok(Self {
distance: options.distance,
vectors,
_maker: PhantomData,
})
}

fn insert(&self, _: usize) -> anyhow::Result<()> {
fn insert(&self, _: usize) -> Result<(), FlatError> {
Ok(())
}

fn search(&self, (vector, k): (Box<[Scalar]>, usize)) -> anyhow::Result<Vec<(Scalar, u64)>> {
let mut result = FixedHeap::<(Scalar, u64)>::new(k);
fn search<F>(
&self,
target: Box<[Scalar]>,
k: usize,
filter: F,
) -> Result<Vec<(Scalar, u64)>, FlatError>
where
F: FnMut(u64) -> bool,
{
let mut result = FilteredFixedHeap::new(k, filter);
for i in 0..self.vectors.len() {
let this_vector = self.vectors.get_vector(i);
let this_data = self.vectors.get_data(i);
let dis = self.distance.distance(&vector, this_vector);
let dis = D::distance(&target, this_vector);
result.push((dis, this_data));
}
Ok(result.into_sorted_vec())
Expand Down
126 changes: 126 additions & 0 deletions src/algorithms/flat_q.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
use super::impls::quantization::*;
use super::utils::filtered_fixed_heap::FilteredFixedHeap;
use super::Algo;
use crate::bgworker::index::IndexOptions;
use crate::bgworker::storage::Storage;
use crate::bgworker::storage::StoragePreallocator;
use crate::bgworker::vectors::Vectors;
use crate::prelude::*;
use serde::{Deserialize, Serialize};
use std::marker::PhantomData;
use std::sync::Arc;
use thiserror::Error;

/// Errors that can occur while building, loading, or querying a
/// quantized flat index.
#[derive(Debug, Clone, Error, Serialize, Deserialize)]
pub enum FlatQError {
    /// Failure propagated from the underlying quantization implementation.
    #[error("Quantization {0}")]
    Quantization(#[from] QuantizationError),
}

/// User-supplied options for the quantized flat ("flat_q") algorithm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlatQOptions {
    /// Whether the quantization data lives in RAM or in a disk-backed
    /// memory map.
    pub memmap: Memmap,
    /// Sample size forwarded to `QuantizationImpl::new` at build time —
    /// presumably the number of vectors sampled to train the quantizer;
    /// confirm against `QuantizationImpl`.
    pub sample_size: usize,
}

/// Brute-force (flat) index that scans quantized vectors.
pub struct FlatQ<D: DistanceFamily, Q: Quantization> {
    /// Shared handle to the raw vector storage; used for the element
    /// count and the per-vector payload data.
    vectors: Arc<Vectors>,
    /// Quantization backend that stores compressed vectors and computes
    /// distances between them.
    implementation: QuantizationImpl<Q>,
    /// Zero-sized marker tying this index to the distance family `D`.
    _maker: PhantomData<D>,
}

impl<D: DistanceFamily, Q: Quantization> Algo for FlatQ<D, Q> {
    type Error = FlatQError;

    /// Persisted state is the quantizer itself.
    type Save = Q;

    /// Preallocates storage for the quantization data before the index
    /// is built.
    fn prebuild(
        storage: &mut StoragePreallocator,
        options: IndexOptions,
    ) -> Result<(), Self::Error> {
        let flat_q_options = options.algorithm.clone().unwrap_flat_q();
        QuantizationImpl::<Q>::prebuild(
            storage,
            options.dims,
            options.capacity,
            flat_q_options.memmap,
        )?;
        Ok(())
    }

    /// Builds the index by constructing the quantization backend over the
    /// first `n` vectors.
    fn build(
        storage: &mut Storage,
        options: IndexOptions,
        vectors: Arc<Vectors>,
        n: usize,
    ) -> Result<Self, FlatQError> {
        let flat_q_options = options.algorithm.clone().unwrap_flat_q();
        let implementation = QuantizationImpl::new(
            storage,
            vectors.clone(),
            options.dims,
            n,
            flat_q_options.sample_size,
            options.capacity,
            flat_q_options.memmap,
        )?;
        Ok(Self {
            vectors,
            implementation,
            _maker: PhantomData,
        })
    }

    /// Returns the quantizer state to be persisted.
    fn save(&self) -> Q {
        self.implementation.save()
    }

    /// Restores the index from a previously saved quantizer.
    fn load(
        storage: &mut Storage,
        options: IndexOptions,
        vectors: Arc<Vectors>,
        save: Q,
    ) -> Result<Self, FlatQError> {
        let flat_q_options = options.algorithm.clone().unwrap_flat_q();
        Ok(Self {
            vectors: vectors.clone(),
            implementation: QuantizationImpl::load(
                storage,
                vectors,
                save,
                options.capacity,
                flat_q_options.memmap,
            )?,
            _maker: PhantomData,
        })
    }

    /// Inserts the vector at index `x` into the quantization backend.
    fn insert(&self, x: usize) -> Result<(), FlatQError> {
        self.implementation.insert(x)?;
        Ok(())
    }

    /// Exhaustively scans all stored vectors, keeping the `k` nearest
    /// candidates that pass `filter`, sorted by ascending distance.
    fn search<F>(
        &self,
        target: Box<[Scalar]>,
        k: usize,
        filter: F,
    ) -> Result<Vec<(Scalar, u64)>, FlatQError>
    where
        F: FnMut(u64) -> bool,
    {
        let mut result = FilteredFixedHeap::new(k, filter);
        // Quantize the query once, then compare it against every stored
        // (quantized) vector.
        let vector = self.implementation.process(&target);
        for i in 0..self.vectors.len() {
            let this_vector = self.implementation.get_vector(i);
            let this_data = self.vectors.get_data(i);
            let dis = self.implementation.distance(&vector, this_vector);
            result.push((dis, this_data));
        }
        // `into_sorted_vec` already yields the `(distance, data)` pairs in
        // order; returning it directly avoids the previous redundant
        // element-by-element copy into a second Vec (and matches the
        // sibling `Flat::search` implementation).
        Ok(result.into_sorted_vec())
    }
}
Loading

0 comments on commit 4ba1360

Please sign in to comment.