BenchCouncil · Lewisvan · Sep 2, 2024 · Oct 18, 2024
diff --git a/bigvectorbench/algorithms/pgvector/Dockerfile b/bigvectorbench/algorithms/pgvector/Dockerfile
@@ -0,0 +1,43 @@
+# Image for the pgvector benchmark: PostgreSQL 16 with the pgvector extension
+# built from source, plus the Python client libraries used by module.py.
+FROM bigvectorbench-base
+
+ENV TZ=Asia/Shanghai
+# Double quotes so the shell expands $TZ; single quotes would write the literal
+# text "$TZ" into /etc/timezone.
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo "$TZ" > /etc/timezone
+
+RUN git clone https://github.com/pgvector/pgvector /tmp/pgvector
+# Refresh the package index in the same layer so the install never runs against
+# missing or stale apt lists.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -y install tzdata
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential postgresql-common
+# Registers the apt.postgresql.org (PGDG) repository that provides postgresql-16.
+RUN /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y
+RUN apt-get update && apt-get install -y --no-install-recommends postgresql-16 postgresql-server-dev-16
+
+# Replace pg_hba.conf with a single rule: every local (Unix-socket) connection
+# is trusted, so no password check is performed.
+RUN sh -c 'echo "local all all trust" > /etc/postgresql/16/main/pg_hba.conf'
+
+# Dynamically set OPTFLAGS based on the architecture
+RUN ARCH=$(uname -m) && \
+    if [ "$ARCH" = "aarch64" ]; then \
+        OPTFLAGS="-march=native -msve-vector-bits=512"; \
+    elif [ "$ARCH" = "x86_64" ]; then \
+        OPTFLAGS="-march=native -mprefer-vector-width=512"; \
+    else \
+        OPTFLAGS="-march=native"; \
+    fi && \
+    cd /tmp/pgvector && \
+    make clean && \
+    make OPTFLAGS="$OPTFLAGS" && \
+    make install
+
+# Create the benchmark role and database, enable the extension and set the
+# server/user parameters used for index builds. Runs as the postgres OS user
+# because that is the superuser role of a fresh cluster.
+USER postgres
+RUN service postgresql start && \
+    psql -c "CREATE USER bvb WITH ENCRYPTED PASSWORD 'bvb'" && \
+    psql -c "CREATE DATABASE bvb" && \
+    psql -c "GRANT ALL PRIVILEGES ON DATABASE bvb TO bvb" && \
+    psql -d bvb -c "GRANT ALL ON SCHEMA public TO bvb" && \
+    psql -d bvb -c "CREATE EXTENSION vector" && \
+    psql -c "ALTER USER bvb SET maintenance_work_mem = '4GB'" && \
+    psql -c "ALTER USER bvb SET max_parallel_maintenance_workers = 0" && \
+    psql -c "ALTER SYSTEM SET shared_buffers = '4GB'"
+USER root
+
+# Quoted so the shell cannot treat [binary] as a glob pattern.
+RUN pip install "psycopg[binary]" pgvector
+
+# ENTRYPOINT ["bash"]
diff --git a/bigvectorbench/algorithms/pgvector/config.yml b/bigvectorbench/algorithms/pgvector/config.yml
@@ -0,0 +1,17 @@
+# Benchmark definition for the PGVector wrapper (HNSW index on PostgreSQL).
+float:
+  any:
+  # "@metric" is presumably replaced by the dataset's metric before it is passed
+  # to the constructor — verify against the framework's config loader.
+  - base_args: ["@metric"]
+    constructor: PGVector
+    disabled: false
+    docker_tag: bigvectorbench-pgvector
+    module: bigvectorbench.algorithms.pgvector
+    name: pgvector
+    run_groups:
+      M-16:
+        # Keys match what PGVector.__init__ reads from method_param.
+        arg_groups: [{M: 16, efConstruction: 200}]
+        # args: {}
+        # Presumably each value is handed to set_query_arguments as ef_search —
+        # verify against the runner.
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
+      M-24:
+        # Keys match what PGVector.__init__ reads from method_param.
+        arg_groups: [{M: 24, efConstruction: 200}]
+        # args: {}
+        # Presumably each value is handed to set_query_arguments as ef_search —
+        # verify against the runner.
+        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
diff --git a/bigvectorbench/algorithms/pgvector/module.py b/bigvectorbench/algorithms/pgvector/module.py
@@ -0,0 +1,92 @@
+""" Pgvector module for BigVectorBench framework. """
+
+import subprocess
+import sys
+import numpy as np
+import pgvector.psycopg
+import psycopg
+import os
+
+from bigvectorbench.algorithms.base.module import BaseANN
+
+class PGVector(BaseANN):
+    """HNSW search on PostgreSQL through the pgvector extension.
+
+    Vectors live in a table named ``items`` with the columns ``id`` and
+    ``embedding`` plus one column per label when labels are loaded. Queries
+    order the rows by the distance operator that belongs to the metric.
+    """
+
+    # metric name -> (pgvector distance operator, HNSW operator class)
+    _METRICS = {
+        "angular": ("<=>", "vector_cosine_ops"),
+        "euclidean": ("<->", "vector_l2_ops"),
+    }
+
+    def __init__(self, metric, method_param):
+        """Store the index parameters and select the distance operator.
+
+        Args:
+            metric: ``"angular"`` or ``"euclidean"``.
+            method_param: mapping with the keys ``M`` and ``efConstruction``.
+
+        Raises:
+            RuntimeError: if ``metric`` is not a supported name.
+        """
+        self._metric = metric
+        self._m = method_param["M"]
+        self._ef_construction = method_param["efConstruction"]
+        # Defined up front so __str__ works before set_query_arguments is called.
+        self._ef_search = None
+        self._cur = None
+        self.labels = None
+        self.label_names = None
+        self.label_types = None
+
+        if metric not in self._METRICS:
+            raise RuntimeError(f"unknown metric {metric}")
+        self._distance_op, self._opclass = self._METRICS[metric]
+        self._query = f"SELECT id FROM items ORDER BY embedding {self._distance_op} %s LIMIT %s"
+
+    def load_data(
+        self,
+        embeddings: np.ndarray,
+        labels: np.ndarray | None = None,
+        label_names: list[str] | None = None,
+        label_types: list[str] | None = None,
+    ) -> None:
+        """Start PostgreSQL, (re)create ``items``, bulk-load it and build the index.
+
+        Args:
+            embeddings: 2-D array; row ``i`` is stored with ``id = i``.
+            labels: optional per-row label values, one row per embedding.
+            label_names: column names for the labels.
+            label_types: type names for the label columns. They are used both
+                in ``CREATE TABLE`` and in ``copy.set_types``.
+
+        Label columns are created and filled only when ``labels``,
+        ``label_names`` and ``label_types`` are all provided; otherwise only
+        ``id`` and ``embedding`` are loaded.
+        """
+        subprocess.run(
+            ["service", "postgresql", "start"],
+            check=True,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        conn = psycopg.connect(user="bvb", password="bvb", dbname="bvb", autocommit=True)
+        pgvector.psycopg.register_vector(conn)
+        cur = conn.cursor()
+        cur.execute("DROP TABLE IF EXISTS items")
+
+        # One flag drives both the table definition and the COPY so the two can
+        # never disagree about which columns exist.
+        has_labels = labels is not None and bool(label_names) and bool(label_types)
+
+        # Remember the label schema: query() needs the column names for filters.
+        self.labels = labels if has_labels else None
+        self.label_names = list(label_names) if has_labels else None
+        self.label_types = list(label_types) if has_labels else None
+
+        column_defs = ["id int", f"embedding vector({embeddings.shape[1]})"]
+        copy_columns = ["id", "embedding"]
+        copy_types = ["int4", "vector"]
+        if has_labels:
+            column_defs += [
+                f"{name} {sql_type}"
+                for name, sql_type in zip(self.label_names, self.label_types)
+            ]
+            copy_columns += self.label_names
+            copy_types += self.label_types
+
+        cur.execute(f"CREATE TABLE items ({', '.join(column_defs)})")
+        cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN")
+
+        copy_sql = f"COPY items ({', '.join(copy_columns)}) FROM STDIN WITH (FORMAT BINARY)"
+        with cur.copy(copy_sql) as copy:
+            copy.set_types(copy_types)
+            for i, embedding in enumerate(embeddings):
+                if has_labels:
+                    copy.write_row((i, embedding, *labels[i]))
+                else:
+                    copy.write_row((i, embedding))
+
+        print("creating index...")
+        cur.execute(
+            f"CREATE INDEX ON items USING hnsw (embedding {self._opclass}) "
+            f"WITH (m = {int(self._m)}, ef_construction = {int(self._ef_construction)})"
+        )
+        print("done!")
+        self._cur = cur
+
+    def set_query_arguments(self, ef_search):
+        """Set ``hnsw.ef_search`` for the session used by later queries.
+
+        Must be called after ``load_data``, which creates the cursor.
+        """
+        self._ef_search = ef_search
+        self._cur.execute("SET hnsw.ef_search = %d" % ef_search)
+
+    def query(self, v: np.ndarray, n: int, filter_expr: str | None = None) -> list[int]:
+        """Return the ids of up to ``n`` rows closest to ``v``.
+
+        Args:
+            v: query vector.
+            n: maximum number of ids to return.
+            filter_expr: optional string that evaluates to a sequence of label
+                values, matched positionally against the loaded label columns
+                with equality.
+
+        Raises:
+            RuntimeError: if a filter is given but no labels were loaded.
+        """
+        sql = self._query
+        params = (v, n)
+        if filter_expr:
+            if not self.label_names:
+                raise RuntimeError("filter_expr given but no labels were loaded")
+            # NOTE(review): eval() executes arbitrary code. It is kept because
+            # the expected format of filter_expr is not visible here; it must
+            # only ever receive trusted, benchmark-generated strings.
+            pairs = list(zip(self.label_names, eval(filter_expr)))
+            if pairs:
+                # The predicate has to sit in a WHERE clause before ORDER BY.
+                # Values are bound as parameters, so the statement text stays
+                # the same for every filter value.
+                where = " AND ".join(f"{name} = %s" for name, _ in pairs)
+                sql = (
+                    f"SELECT id FROM items WHERE {where} "
+                    f"ORDER BY embedding {self._distance_op} %s LIMIT %s"
+                )
+                params = (*(value for _, value in pairs), v, n)
+        self._cur.execute(sql, params, binary=True, prepare=True)
+        return [row_id for row_id, in self._cur.fetchall()]
+
+    def get_memory_usage(self):
+        """Return the size of the HNSW index in KiB, or 0 before data is loaded.
+
+        Relies on the index having the name ``items_embedding_idx``.
+        """
+        if self._cur is None:
+            return 0
+        self._cur.execute("SELECT pg_relation_size('items_embedding_idx')")
+        return self._cur.fetchone()[0] / 1024
+
+    def __str__(self):
+        """Return a label with the index and search parameters of this instance."""
+        return f"PGVector(m={self._m}, ef_construction={self._ef_construction}, ef_search={self._ef_search})"