# Image for benchmarking pgvector (PostgreSQL 16 + HNSW) in BigVectorBench.
FROM bigvectorbench-base

ENV TZ=Asia/Shanghai
# Double quotes are required so the shell expands $TZ; single quotes would
# write the literal string "$TZ" into /etc/timezone.
RUN ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone

RUN git clone https://github.com/pgvector/pgvector /tmp/pgvector
# Refresh the package index first so the install does not depend on whatever
# apt lists the base image happens to ship with.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -y install tzdata

RUN apt-get update && apt-get install -y --no-install-recommends build-essential postgresql-common
RUN /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y
RUN apt-get install -y --no-install-recommends postgresql-16 postgresql-server-dev-16

# Trust every local (unix-socket) connection; the database only lives inside
# the benchmark container.
RUN sh -c 'echo "local all all trust" > /etc/postgresql/16/main/pg_hba.conf'

# Dynamically set OPTFLAGS based on the architecture
RUN ARCH=$(uname -m) && \
    if [ "$ARCH" = "aarch64" ]; then \
        OPTFLAGS="-march=native -msve-vector-bits=512"; \
    elif [ "$ARCH" = "x86_64" ]; then \
        OPTFLAGS="-march=native -mprefer-vector-width=512"; \
    else \
        OPTFLAGS="-march=native"; \
    fi && \
    cd /tmp/pgvector && \
    make clean && \
    make OPTFLAGS="$OPTFLAGS" && \
    make install

# Create the benchmark role/database and enable the vector extension.
USER postgres
RUN service postgresql start && \
    psql -c "CREATE USER bvb WITH ENCRYPTED PASSWORD 'bvb'" && \
    psql -c "CREATE DATABASE bvb" && \
    psql -c "GRANT ALL PRIVILEGES ON DATABASE bvb TO bvb" && \
    psql -d bvb -c "GRANT ALL ON SCHEMA public TO bvb" && \
    psql -d bvb -c "CREATE EXTENSION vector" && \
    psql -c "ALTER USER bvb SET maintenance_work_mem = '4GB'" && \
    psql -c "ALTER USER bvb SET max_parallel_maintenance_workers = 0" && \
    psql -c "ALTER SYSTEM SET shared_buffers = '4GB'"
USER root

# Quoted so the shell never treats the brackets as a glob pattern.
RUN pip install "psycopg[binary]" pgvector

# ENTRYPOINT ["bash"]
""" Pgvector module for BigVectorBench framework. """

import subprocess
import sys
import numpy as np
import pgvector.psycopg
import psycopg
import os

from bigvectorbench.algorithms.base.module import BaseANN


class PGVector(BaseANN):
    """HNSW search on PostgreSQL + pgvector.

    Vectors are stored in a table named ``items`` (``id``, ``embedding`` and
    optionally one column per label) and indexed with pgvector's ``hnsw``
    access method.
    """

    # metric name -> (pgvector distance operator, HNSW operator class)
    _METRICS = {
        "angular": ("<=>", "vector_cosine_ops"),
        "euclidean": ("<->", "vector_l2_ops"),
    }

    def __init__(self, metric, method_param):
        """Store the index parameters and prepare the unfiltered query.

        Args:
            metric: ``"angular"`` or ``"euclidean"``.
            method_param: mapping with the keys ``"M"`` and
                ``"efConstruction"``.

        Raises:
            RuntimeError: if ``metric`` is not supported.
        """
        self._metric = metric
        self._m = method_param['M']
        self._ef_construction = method_param['efConstruction']
        # Set by set_query_arguments(); initialised here so __str__ is safe
        # to call before any query arguments have been applied.
        self._ef_search = None
        self._cur = None
        self.labels = None
        self.label_names = None
        self.label_types = None

        if metric not in self._METRICS:
            raise RuntimeError(f"unknown metric {metric}")
        self._distance_op, self._ops_class = self._METRICS[metric]
        self._query = (
            f"SELECT id FROM items ORDER BY embedding {self._distance_op} %s LIMIT %s"
        )

    def load_data(
        self,
        embeddings: np.ndarray,
        labels: np.ndarray | None = None,
        label_names: list[str] | None = None,
        label_types: list[str] | None = None,
    ) -> None:
        """Start PostgreSQL, (re)create the ``items`` table, load and index it.

        Label columns are created and filled only when ``labels``,
        ``label_names`` and ``label_types`` are all provided and non-empty;
        otherwise only ``id`` and ``embedding`` are loaded.

        Args:
            embeddings: 2-D array; ``embeddings.shape[1]`` is used as the
                vector dimension and each row becomes one table row.
            labels: per-row label values, indexed like ``embeddings``.
            label_names: column name for each label.
            label_types: SQL type name for each label column.
        """
        subprocess.run(
            ["service", "postgresql", "start"],
            check=True,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        conn = psycopg.connect(user="bvb", password="bvb", dbname="bvb", autocommit=True)
        pgvector.psycopg.register_vector(conn)
        cur = conn.cursor()
        cur.execute("DROP TABLE IF EXISTS items")

        # One single condition drives both the table definition and the COPY
        # statement so the two can never disagree about the column list.
        has_labels = (
            labels is not None
            and label_names is not None
            and len(label_names) > 0
            and label_types is not None
            and len(label_types) > 0
        )

        columns = ["id", "embedding"]
        column_defs = ["id int", f"embedding vector({embeddings.shape[1]})"]
        copy_types = ["int4", "vector"]
        if has_labels:
            names = list(label_names)
            sql_types = list(label_types)
            columns += names
            column_defs += [
                f"{name} {sql_type}" for name, sql_type in zip(names, sql_types)
            ]
            copy_types += sql_types
            # Remember the label schema: query() needs the column names to
            # build its WHERE clause.
            self.labels = labels
            self.label_names = names
            self.label_types = sql_types
        else:
            self.labels = None
            self.label_names = None
            self.label_types = None

        cur.execute(f"CREATE TABLE items ({', '.join(column_defs)})")
        cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN")

        copy_sql = f"COPY items ({', '.join(columns)}) FROM STDIN WITH (FORMAT BINARY)"
        with cur.copy(copy_sql) as copy:
            copy.set_types(copy_types)
            for i, embedding in enumerate(embeddings):
                row = (i, embedding)
                if has_labels:
                    row += tuple(labels[i])
                copy.write_row(row)

        print("creating index...")
        cur.execute(
            "CREATE INDEX ON items USING hnsw (embedding %s) WITH (m = %d, ef_construction = %d)"
            % (self._ops_class, self._m, self._ef_construction)
        )
        print("done!")
        self._cur = cur

    def set_query_arguments(self, ef_search):
        """Apply ``hnsw.ef_search`` to the current database session."""
        self._ef_search = ef_search
        # %d forces an integer, so nothing but a number can reach the SQL text.
        self._cur.execute("SET hnsw.ef_search = %d" % ef_search)

    def query(self, v: np.ndarray, n: int, filter_expr: str | None = None) -> list[int]:
        """Return the ids of the ``n`` nearest rows to ``v``.

        Args:
            v: query vector.
            n: number of neighbours to return.
            filter_expr: optional Python expression that evaluates to a
                sequence of values, matched by equality against the label
                columns in the order they were loaded.

        Raises:
            RuntimeError: if ``filter_expr`` is given but no label columns
                were loaded by ``load_data``.
        """
        sql = self._query
        params = [v, n]
        if filter_expr:
            if not self.label_names:
                raise RuntimeError(
                    "filter_expr was given but no label columns were loaded"
                )
            # SECURITY NOTE(review): eval() executes arbitrary code. It is
            # kept because the expected format of filter_expr is not visible
            # here; only ever pass expressions from a trusted source.
            values = eval(filter_expr)
            pairs = list(zip(self.label_names, values))
            if pairs:
                # Values are bound as parameters rather than pasted into the
                # SQL text; the WHERE clause has to precede ORDER BY.
                conditions = " AND ".join(f"{name} = %s" for name, _ in pairs)
                sql = (
                    f"SELECT id FROM items WHERE {conditions} "
                    f"ORDER BY embedding {self._distance_op} %s LIMIT %s"
                )
                params = [value for _, value in pairs] + params
        self._cur.execute(sql, params, binary=True, prepare=True)
        return [row[0] for row in self._cur.fetchall()]

    def get_memory_usage(self):
        """Return ``pg_relation_size('items_embedding_idx')`` divided by 1024.

        Returns 0 when no data has been loaded yet.
        """
        if self._cur is None:
            return 0
        self._cur.execute("SELECT pg_relation_size('items_embedding_idx')")
        return self._cur.fetchone()[0] / 1024

    def __str__(self):
        return f"PGVector(m={self._m}, ef_construction={self._ef_construction}, ef_search={self._ef_search})"