Skip to content

Commit

Permalink
Merge pull request qdrant#67 from qdrant/feat/benchmark-automation-sc…
Browse files Browse the repository at this point in the history
…ript

feat: Scripts to automate benchmarking
  • Loading branch information
KShivendu authored Nov 6, 2023
2 parents c52f9e8 + 1be2407 commit 0b716a4
Show file tree
Hide file tree
Showing 13 changed files with 335 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ __pycache__
*.pyc
NOTES.md

results/*
results/*
tools/custom/data.json
7 changes: 3 additions & 4 deletions engine/base_client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def run_experiment(
existing_results = list(RESULTS_DIR.glob(glob_pattern))
if len(existing_results) == len(self.searchers):
print(
f"Skipping run (upload + search) for {self.name} since it already ran {len(self.searchers)} search configs previously"
f"Skipping run for {self.name} since it already ran {len(self.searchers)} search configs previously"
)
return

Expand Down Expand Up @@ -103,10 +103,9 @@ def run_experiment(
)
existing_results = list(RESULTS_DIR.glob(glob_pattern))
print("Pattern", glob_pattern, "Results:", existing_results)
if len(existing_results) == 1:
if len(existing_results) >= 1:
print(
f"Skipping search {search_id} as it already exists in",
existing_results[0],
f"Skipping search {search_id} as it already exists",
)
continue

Expand Down
32 changes: 32 additions & 0 deletions tools/custom/create_and_install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#
# Prepare an existing server for benchmarking.
#
# Looks up the server's public IP and SSH user in data.json (located next to
# this script), waits until the server accepts SSH connections, then streams
# setup_vm.sh to it and runs it under sudo.
#
# Environment:
#   SERVER_NAME  key of the server entry in data.json (default: test-server-1)

set -e

# Resolve paths relative to this script so it can be run from any directory.
SCRIPT=$(realpath "$0")
SCRIPTPATH=$(dirname "$SCRIPT")
DATA_FILE="${SCRIPTPATH}/data.json"

SERVER_NAME=${SERVER_NAME:-test-server-1}

# Pass the name with --arg instead of splicing it into the jq program, so a
# name containing quotes or backslashes cannot break (or alter) the filter.
SERVER_IP=$(jq -r --arg name "$SERVER_NAME" '.[$name].public_ip' "$DATA_FILE")
SSH_USER=$(jq -r --arg name "$SERVER_NAME" '.[$name].user' "$DATA_FILE")

# jq prints the literal string "null" for a missing entry/field. Without this
# check the wait loop below would retry ssh against "null" forever.
if [[ -z "$SERVER_IP" || "$SERVER_IP" == "null" ]]; then
  echo "No public_ip found for server '${SERVER_NAME}' in ${DATA_FILE}" >&2
  exit 1
fi

if [[ -z "$SSH_USER" || "$SSH_USER" == "null" ]]; then
  echo "No user found for server '${SERVER_NAME}' in ${DATA_FILE}" >&2
  exit 1
fi

echo "Server IP: ${SERVER_IP}"

# Drop any previously recorded host key for this IP; best-effort, so a missing
# known_hosts file or entry must not abort the script.
ssh-keygen -f "$HOME/.ssh/known_hosts" -R "${SERVER_IP}" || true

# Wait for server to be ready
while ! ssh -oStrictHostKeyChecking=no "${SSH_USER}@${SERVER_IP}" echo "Server is ready"; do
  sleep 1
done

# Install docker, poetry, etc. by feeding setup_vm.sh to a root shell on the server.
ssh "${SSH_USER}@${SERVER_IP}" sudo bash < "${SCRIPTPATH}/setup_vm.sh"
12 changes: 12 additions & 0 deletions tools/custom/example.data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"benchmark-client-glove-100": {
"public_ip": "aa.bb.cc.dd",
"private_ip": "xx.y.z.a",
"user": "root"
},
"benchmark-server-glove-100": {
"public_ip": "aa.bb.cc.de",
"private_ip": "xx.y.z.b",
"user": "root"
}
}
16 changes: 16 additions & 0 deletions tools/custom/get_private_ip.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Print the private-network IP of a custom cloud server, as recorded in the
# data.json file located next to this script.
#
# Usage:   ./get_private_ip.sh <server_name>
# Example: ./get_private_ip.sh benchmark-server-1
#
# Output: the "private_ip" field of the named entry on stdout. jq prints the
#         literal string "null" when the entry or the field is missing.

set -e

SCRIPT=$(realpath "$0")
SCRIPTPATH=$(dirname "$SCRIPT")

# The server name is handed to jq via --arg rather than interpolated into the
# filter text, so special characters in the name cannot change the program.
SERVER_IP=$(jq -r --arg name "${1}" '.[$name].private_ip' "${SCRIPTPATH}/data.json")

echo "${SERVER_IP}"
16 changes: 16 additions & 0 deletions tools/custom/get_public_ip.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Print the public IP of a custom cloud server, as recorded in the data.json
# file located next to this script.
#
# Usage:   ./get_public_ip.sh <server_name>
# Example: ./get_public_ip.sh benchmark-server-1
#
# Output: the "public_ip" field of the named entry on stdout. jq prints the
#         literal string "null" when the entry or the field is missing.

set -e

SCRIPT=$(realpath "$0")
SCRIPTPATH=$(dirname "$SCRIPT")

# The server name is handed to jq via --arg rather than interpolated into the
# filter text, so special characters in the name cannot change the program.
SERVER_IP=$(jq -r --arg name "${1}" '.[$name].public_ip' "${SCRIPTPATH}/data.json")

echo "${SERVER_IP}"
16 changes: 16 additions & 0 deletions tools/custom/get_ssh_user.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Print the SSH user name for a custom cloud server, as recorded in the
# data.json file located next to this script.
#
# Usage:   ./get_ssh_user.sh <server_name>
# Example: ./get_ssh_user.sh benchmark-server-1
#
# Output: the "user" field of the named entry on stdout. jq prints the literal
#         string "null" when the entry or the field is missing.

set -e

SCRIPT=$(realpath "$0")
SCRIPTPATH=$(dirname "$SCRIPT")

# The server name is handed to jq via --arg rather than interpolated into the
# filter text, so special characters in the name cannot change the program.
SSH_USER=$(jq -r --arg name "${1}" '.[$name].user' "${SCRIPTPATH}/data.json")

echo "${SSH_USER}"
34 changes: 34 additions & 0 deletions tools/custom/setup_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
#
# Provision a VM for benchmarking: installs base tools (curl, gnupg, jq, ...),
# Docker from Docker's Ubuntu apt repository, and poetry via pip.
#
# The script calls apt-get and writes under /etc without sudo, so it must be
# run as root (create_and_install.sh pipes it into `sudo bash`).

set -e

# The script is typically fed to bash over stdin. Keep apt/dpkg from opening
# interactive prompts, which would block or swallow the rest of the script.
export DEBIAN_FRONTEND=noninteractive

mkdir -p projects

# Install docker

apt-get update
apt-get install -y \
  ca-certificates \
  curl \
  gnupg \
  lsb-release \
  jq

mkdir -p /etc/apt/keyrings

# --batch --yes: overwrite an existing keyring without prompting, so the
# script can be re-run on a machine that was already provisioned.
curl -fsSL https://download.docker.com/linux/ubuntu/gpg \
  | gpg --batch --yes --dearmor -o /etc/apt/keyrings/docker.gpg

# Resolve these as plain assignments so that a failure stops the script under
# `set -e` (a failing substitution inside echo's arguments would go unnoticed).
DOCKER_ARCH=$(dpkg --print-architecture)
DISTRO_CODENAME=$(lsb_release -cs)

echo "deb [arch=${DOCKER_ARCH} signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu ${DISTRO_CODENAME} stable" \
  | tee /etc/apt/sources.list.d/docker.list > /dev/null

apt-get update

apt-get install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin

# Smoke test: verifies the daemon can pull and run a container.
docker run hello-world

# Install poetry
apt-get install -y python3-pip
python3 -m pip install poetry
python3 -m poetry --version
48 changes: 48 additions & 0 deletions tools/remote/setup_benchmark_client.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash
#
# Run the benchmark client against a vector DB server.
#
# Clones (or reuses) the vector-db-benchmark repo in the current directory,
# checks out BRANCH, installs dependencies with poetry and runs run.py.
# Output of run.py is appended to <VECTOR_DB>.log inside the repo directory.
#
# Environment:
#   VECTOR_DB          engine to benchmark (default: qdrant)
#   BRANCH             git branch of vector-db-benchmark (default: master)
#   DATASET            dataset name passed to run.py --datasets (required)
#   PRIVATE_SERVER_IP  server address passed to run.py --host (required)

set -e
set -x

VECTOR_DB=${VECTOR_DB:-qdrant}
BRANCH=${BRANCH:-master}

# Fail fast: with these empty, run.py would be invoked with shifted arguments.
: "${DATASET:?DATASET must be set}"
: "${PRIVATE_SERVER_IP:?PRIVATE_SERVER_IP must be set}"

LOG_FILE="${VECTOR_DB}.log"

if [ -d "./vector-db-benchmark" ]; then
  echo "vector-db-benchmark repo already exists"
else
  git clone https://github.com/qdrant/vector-db-benchmark
fi

cd vector-db-benchmark

# Separate statements on purpose: `set -e` ignores failures of every command
# in an `a && b && c` list except the last one, so a failed fetch/checkout
# would otherwise let the benchmark continue on the wrong branch.
git fetch
git checkout "$BRANCH"
git pull

python3 -m poetry install

# Remove a PID file left by an earlier run; it is only rewritten below when a
# background benchmark process is actually started.
rm -f benchmark.pid

# if using qdrant vector db
if [ "$VECTOR_DB" == "qdrant" ]; then
  QDRANT_CONFIGS=$(cat experiments/configurations/qdrant-single-node{-rps,}.json | jq '.[] | .name' | grep -E 'qdrant(-rps)?-m-.*-ef-.*' | sed 's/"//g')

  # Word splitting is intended: one configuration name per word.
  for QDRANT_CONFIG in $QDRANT_CONFIGS; do
    # upload
    python3 -m poetry run python run.py --engines "${QDRANT_CONFIG}" --datasets "$DATASET" --host "$PRIVATE_SERVER_IP" --skip-search >> "$LOG_FILE" 2>&1

    # now run search (retry on errors); a command used as the `until`
    # condition does not trigger `set -e` when it fails.
    until python3 -m poetry run python run.py --engines "${QDRANT_CONFIG}" --datasets "$DATASET" --host "$PRIVATE_SERVER_IP" --skip-upload >> "$LOG_FILE" 2>&1; do
      echo "retrying" | tee -a "$LOG_FILE"
      sleep 1
    done
    echo "done" | tee -a "$LOG_FILE"
  done
else
  nohup python3 -m poetry run python run.py --engines "${VECTOR_DB}-m-*-ef-*" --datasets "$DATASET" --host "$PRIVATE_SERVER_IP" >> "$LOG_FILE" 2>&1 &

  # $! is only meaningful here, right after starting the background job.
  PID_BENCHMARK=$!
  echo "$PID_BENCHMARK" > benchmark.pid
  wait "$PID_BENCHMARK"
fi
45 changes: 45 additions & 0 deletions tools/remote/setup_benchmark_server.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
#
# Start the vector DB server to be benchmarked.
#
# Clones (or reuses) the vector-db-benchmark repo in the current directory,
# checks out BRANCH, force-removes all running docker containers, starts
# engine/servers/<VECTOR_DB>-single-node with docker compose and, for engines
# with a known health URL, polls that URL with curl.
#
# Environment:
#   VECTOR_DB  engine to start (default: qdrant)
#   BRANCH     git branch of vector-db-benchmark (default: master)

set -e
set -x

VECTOR_DB=${VECTOR_DB:-qdrant}
BRANCH=${BRANCH:-master}

if [ -d "./vector-db-benchmark" ]; then
  echo "vector-db-benchmark repo already exists"
else
  git clone https://github.com/qdrant/vector-db-benchmark
fi

cd vector-db-benchmark

# Separate statements on purpose: `set -e` ignores failures of every command
# in an `a && b && c` list except the last one, so a failed fetch/checkout
# would otherwise let the server start from the wrong branch.
git fetch
git checkout "$BRANCH"
git pull

# remove all running containers:
RUNNING_CONTAINERS=$(docker ps -q)
if [ -n "$RUNNING_CONTAINERS" ]; then
  # shellcheck disable=SC2086 # one container ID per word; splitting is intended
  docker container rm -f $RUNNING_CONTAINERS
fi

cd "engine/servers/${VECTOR_DB}-single-node"
docker compose up -d

# if vector DB is milvus or elasticsearch, wait for them to be up
if [ "$VECTOR_DB" == "milvus" ] || [ "$VECTOR_DB" == "elasticsearch" ]; then
  sleep 30 # Throws connection reset which isn't handled by --retry-connrefused in curl. So we need to wait
fi

# Define a map for database types and their health check URLs
declare -A db_health_urls
db_health_urls["milvus"]="http://localhost:19530/v1/vector/collections"
db_health_urls["qdrant"]="http://localhost:6333"
db_health_urls["elasticsearch"]="http://localhost:9200/_cluster/health"

# Check if the specified database type exists in the map
if [ -n "${db_health_urls[$VECTOR_DB]}" ]; then
  url="${db_health_urls[$VECTOR_DB]}"
  # Retry logic for the specified URL
  curl --max-time 120 --retry-connrefused --retry 10 --retry-delay 10 "$url"
else
  echo "Assuming engine $VECTOR_DB is already up"
fi
41 changes: 41 additions & 0 deletions tools/run_benchmarks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
#
# Benchmark every engine in VECTOR_DBS on one dataset, one engine at a time:
# for each engine, start it on the server machine, then run the benchmark
# client on the client machine (both through run_remote.sh).
#
# Usage:   tools/run_benchmarks.sh <dataset> <server_name> [client_name]
# Example: tools/run_benchmarks.sh "deep-image-96-angular" "new-benchmark-server"
#
# When client_name is omitted it is derived from server_name by replacing the
# first "server" with "client".
#
# Environment:
#   CLOUD_NAME  directory under tools/ holding the get_*.sh helpers (default: custom)

set -e
set -x

SCRIPT_PATH="$( cd "$(dirname "$0")" &>/dev/null ; pwd -P )"
export CLOUD_NAME=${CLOUD_NAME:-"custom"}

# DATASETS is only referenced by the commented-out provisioning loop below.
DATASETS=("glove-100-angular" "deep-image-96" "gist-960-euclidean" "dbpedia-openai-1M-1536-angular")
VECTOR_DBS=("qdrant" "milvus" "elasticsearch" "weaviate" "redis")
BRANCH="master"

# Run only while setting up new benchmark server and client:
# Create different servers and clients for each dataset so benchmarking can be done in parallel
# for dataset in "${DATASETS[@]}"; do
#     SERVER_NAME=benchmark-client-${dataset} bash -x $SCRIPT_PATH/$CLOUD_NAME/create_and_install.sh
#     SERVER_NAME=benchmark-server-${dataset} bash -x $SCRIPT_PATH/$CLOUD_NAME/create_and_install.sh
# done

if [[ $# -lt 2 ]]; then
  echo "Usage: $0 <dataset> <server_name> [client_name]" >&2
  exit 1
fi

DATASET=$1
SERVER_NAME=$2

# replace "server" with "client" if 3rd argument is not passed
CLIENT_NAME=${3:-"${SERVER_NAME/server/client}"}
PRIVATE_SERVER_IP=$(bash "$SCRIPT_PATH/$CLOUD_NAME/get_private_ip.sh" "$SERVER_NAME")

# The jq-based helper prints "null" for an unknown server; stop here instead
# of running every benchmark against a bogus host.
if [[ -z "$PRIVATE_SERVER_IP" || "$PRIVATE_SERVER_IP" == "null" ]]; then
  echo "Could not resolve private IP for server '${SERVER_NAME}'" >&2
  exit 1
fi

for VECTOR_DB in "${VECTOR_DBS[@]}"; do
  echo "Running benchmark for ${VECTOR_DB} on ${DATASET}"

  # ${VAR@A} expands to an assignment statement (VAR='value') that the remote
  # shell executes before the script body.
  RUN_SCRIPT="${SCRIPT_PATH}/remote/setup_benchmark_server.sh" \
    ENV_CONTEXT="${VECTOR_DB@A} ${BRANCH@A}" \
    SERVER_NAME="${SERVER_NAME}" \
    bash -x "$SCRIPT_PATH/run_remote.sh"

  RUN_SCRIPT="${SCRIPT_PATH}/remote/setup_benchmark_client.sh" \
    ENV_CONTEXT="${VECTOR_DB@A} ${BRANCH@A} ${PRIVATE_SERVER_IP@A} ${DATASET@A}" \
    SERVER_NAME="${CLIENT_NAME}" \
    bash -x "$SCRIPT_PATH/run_remote.sh"
done
40 changes: 40 additions & 0 deletions tools/run_remote.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# Run a local script on a remote server over SSH.
#
# The script is not copied: ENV_CONTEXT (one line of variable assignments)
# followed by the contents of RUN_SCRIPT is streamed to `sudo bash -x` on the
# remote side.
#
# Environment:
#   RUN_SCRIPT   path of the local script to execute remotely (required)
#   SERVER_NAME  server name understood by the <CLOUD_NAME>/get_*.sh helpers (required)
#   CLOUD_NAME   directory next to this script holding the helpers (default: hetzner)
#   SSH_USER     overrides the user reported by get_ssh_user.sh
#   ENV_CONTEXT  variable assignments to prepend to the remote script

set -e

SCRIPT_PATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
CLOUD_NAME=${CLOUD_NAME:-"hetzner"}

RUN_SCRIPT=${RUN_SCRIPT:-""}
SERVER_NAME=${SERVER_NAME:-""}

# List of env variables with values to pass to remote script
# Should be constructed as `${VAR_1@A} ${VAR_2@A}`
ENV_CONTEXT=${ENV_CONTEXT:-""}

# Validate inputs before they are used to query the cloud helpers.
if [[ -z "$RUN_SCRIPT" ]]; then
  echo "Please specify RUN_SCRIPT env variable" >&2
  exit 1
fi

if [[ -z "$SERVER_NAME" ]]; then
  echo "Please specify SERVER_NAME env variable" >&2
  exit 1
fi

# Without this check a missing script would still open the SSH session and
# execute only the ENV_CONTEXT line, reporting success.
if [[ ! -r "$RUN_SCRIPT" ]]; then
  echo "RUN_SCRIPT '${RUN_SCRIPT}' does not exist or is not readable" >&2
  exit 1
fi

DEFAULT_SSH_USER=$(bash "$SCRIPT_PATH/$CLOUD_NAME/get_ssh_user.sh" "$SERVER_NAME")
SSH_USER=${SSH_USER:-${DEFAULT_SSH_USER}}

# Get server ip
SERVER_IP=$(bash "$SCRIPT_PATH/$CLOUD_NAME/get_public_ip.sh" "$SERVER_NAME")

# printf with a quoted argument sends ENV_CONTEXT verbatim; an unquoted echo
# would collapse whitespace inside quoted values and expand glob characters.
{
  printf '%s\n' "$ENV_CONTEXT"
  cat -- "$RUN_SCRIPT"
} | ssh -oStrictHostKeyChecking=no "$SSH_USER@$SERVER_IP" sudo bash -x

30 changes: 30 additions & 0 deletions tools/ssh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
#
# Open an interactive SSH session to a benchmark server.
#
# Usage: tools/ssh.sh custom|hetzner <server-name>
#
# Environment (each takes precedence over the positional argument):
#   CLOUD_NAME   directory next to this script holding the get_*.sh helpers
#   SERVER_NAME  server name understood by those helpers
#   SSH_USER     overrides the user reported by get_ssh_user.sh

set -e

SCRIPT_PATH="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
CLOUD_NAME=${CLOUD_NAME:-$1}

SERVER_NAME=${SERVER_NAME:-$2}

# Validate before calling the helpers: with an empty CLOUD_NAME the helper
# path does not exist and `set -e` would abort before any hint is printed.
if [[ -z "$CLOUD_NAME" ]]; then
  echo "Please pass CLOUD_NAME env variable" >&2
  exit 1
fi

if [[ -z "$SERVER_NAME" ]]; then
  echo "Please specify SERVER_NAME env variable" >&2
  exit 1
fi

DEFAULT_SSH_USER=$(bash "$SCRIPT_PATH/$CLOUD_NAME/get_ssh_user.sh" "$SERVER_NAME")
SSH_USER=${SSH_USER:-${DEFAULT_SSH_USER}}

# Get server ip
SERVER_IP=$(bash "$SCRIPT_PATH/$CLOUD_NAME/get_public_ip.sh" "$SERVER_NAME")

ssh "$SSH_USER@$SERVER_IP"

0 comments on commit 0b716a4

Please sign in to comment.