Fix minor conflicts
Signed-off-by: Thomas Parnell <[email protected]>
tdoublep committed Jan 28, 2025
2 parents bf931dd + 310468c commit e7dc638
Showing 208 changed files with 6,795 additions and 4,381 deletions.
107 changes: 107 additions & 0 deletions .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -301,6 +301,104 @@ run_serving_tests() {
kill_gpu_processes
}

run_genai_perf_tests() {
  # run genai-perf tests

  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
  genai_perf_test_file=$1

  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      #TODO: add output dir.
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend vllm \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
        --request-rate $qps \
        --num-prompts $num_prompts \
        "

      echo "Client command: $client_command"

      eval "$client_command"

      #TODO: process/record outputs
    done
  done

  kill_gpu_processes

}

prepare_dataset() {

@@ -328,12 +426,17 @@ main() {

pip install -U transformers

pip install -r requirements-dev.txt
which genai-perf

# check storage
df -h

ensure_installed wget
ensure_installed curl
ensure_installed jq
# genai-perf dependency
ensure_installed libb64-0d

prepare_dataset

@@ -345,6 +448,10 @@
# run the test
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

# run genai-perf tests
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
mv artifacts/ $RESULTS_FOLDER/

# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
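
As context for the new function above, here is a minimal standalone sketch (not part of this commit) of how the `qps_list` extraction and the `inf` substitution behave, assuming `jq` is installed; the JSON fragment and echoed messages are hypothetical:

```bash
#!/usr/bin/env bash
# Hypothetical test-case fragment, mirroring the structure used by run_genai_perf_tests.
params='{"qps_list": [4, 8, "inf"], "common_parameters": {"num_prompts": 500}}'

num_prompts=$(echo "$params" | jq -r '.common_parameters.num_prompts')

# Same extraction as in the script: each list element becomes a
# shell-quoted word ("inf" keeps its single quotes).
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"   # prints 4, 8 and 'inf' on separate lines

for qps in $qps_list; do
  # As in the script above, an "inf" entry is replaced by num_prompts
  # before being passed as --request-rate.
  if [[ "$qps" == *"inf"* ]]; then
    qps=$num_prompts
  fi
  echo "would pass --request-rate $qps to genai-perf"
done
```
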
23 changes: 23 additions & 0 deletions .buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@@ -0,0 +1,23 @@
[
  {
    "test_name": "llama8B_tp1_genai_perf",
    "qps_list": [4,8,16,32],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "port": 8000,
      "num_prompts": 500,
      "reuse_server": false
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "genai_perf_input_parameters": {
    }
  }
]
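
The `vllm_server_parameters` key in this file is looked up dynamically by `run_genai_perf_tests` above via `${CURRENT_LLM_SERVING_ENGINE}_server_parameters`, so each serving engine can carry its own server flags in the same test case. A minimal sketch of that lookup, assuming `jq` is installed and the engine is vllm:

```bash
#!/usr/bin/env bash
CURRENT_LLM_SERVING_ENGINE=vllm
genai_perf_test_file=.buildkite/nightly-benchmarks/tests/genai-perf-tests.json

jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
  test_name=$(echo "$params" | jq -r '.test_name')
  # The key name is built from the engine, e.g. "vllm_server_parameters".
  server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
  echo "$test_name -> $server_params"
done
```
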
4 changes: 2 additions & 2 deletions .buildkite/run-cpu-test.sh
@@ -83,6 +83,6 @@ function cpu_tests() {
tests/lora/test_qwen2vl.py"
}

# All of CPU tests are expected to be finished less than 25 mins.
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
12 changes: 10 additions & 2 deletions .buildkite/run-hpu-test.sh
@@ -8,9 +8,17 @@ set -ex
docker build -t hpu-test-env -f Dockerfile.hpu .

# Setup cleanup
# certain versions of HPU software stack have a bug that can
# override the exit code of the script, so we need to use
# separate remove_docker_container and remove_docker_container_and_exit
# functions, while other platforms only need one remove_docker_container
# function.
EXITCODE=1
remove_docker_container() { docker rm -f hpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
trap remove_docker_container_and_exit EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
EXITCODE=$?
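
The cleanup change above uses a common trap pattern: keep the test's exit status in a variable and re-raise it from the EXIT trap, so that container cleanup cannot mask a failure. A generic sketch of the same pattern, with a placeholder container name, image, and test command:

```bash
#!/bin/bash

EXITCODE=1  # pessimistic default: exiting before the test ran counts as failure

remove_docker_container() { docker rm -f my-test-container || true; }
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
trap remove_docker_container_and_exit EXIT
remove_docker_container  # clean up leftovers from a previous run

docker run --name=my-test-container my-test-image ./run_tests.sh
EXITCODE=$?  # captured before the EXIT trap fires, so the real status is propagated
```
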
9 changes: 8 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -107,7 +107,7 @@ steps:
source_file_dependencies:
- vllm/
commands:
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
@@ -126,11 +126,15 @@
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
- examples/offline_inference/rlhf.py
commands:
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- python3 ../examples/offline_inference/rlhf.py

- label: Metrics, Tracing Test # 10min
num_gpus: 2
@@ -462,7 +466,10 @@ steps:
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
- entrypoints/llm/test_collective_rpc.py
commands:
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
13 changes: 4 additions & 9 deletions .github/workflows/test-spyre.yml
@@ -1,14 +1,6 @@
name: test-sypre

on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- main
pull_request:
branches:
- main
on: pull_request

jobs:
test-spyre:
@@ -28,5 +20,8 @@ jobs:
python3.12 -c "from sentence_transformers import SentenceTransformer; SentenceTransformer(\"sentence-transformers/all-roberta-large-v1\")" && \
export VARIANT=$(ls /root/.cache/huggingface/hub/models--sentence-transformers--all-roberta-large-v1/snapshots/) && \
ln -s /root/.cache/huggingface/hub/models--sentence-transformers--all-roberta-large-v1/snapshots/${VARIANT} /models/all-roberta-large-v1 && \
export MASTER_PORT=12355 && \
export MASTER_ADDR=localhost && \
export DISTRIBUTED_STRATEGY_IGNORE_MODULES=WordEmbedding && \
python3.12 -m pytest tests/spyre -v
'''
6 changes: 3 additions & 3 deletions Dockerfile.cpu
@@ -26,20 +26,20 @@ RUN pip install intel_extension_for_pytorch==2.5.0

WORKDIR /workspace

COPY requirements-build.txt requirements-build.txt
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm

COPY requirements-common.txt requirements-common.txt
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt

COPY . .
2 changes: 1 addition & 1 deletion Dockerfile.hpu
@@ -1,4 +1,4 @@
FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

COPY ./ /workspace/vllm

12 changes: 7 additions & 5 deletions README.md
@@ -42,6 +42,8 @@ We will try to rebase against upstream frequently and we plan to contribute thes
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.

Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.

vLLM is fast with:

- State-of-the-art serving throughput
@@ -76,16 +78,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod

## Getting Started

Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):

```bash
pip install vllm
```

Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)

## Contributing

27 changes: 24 additions & 3 deletions benchmarks/backend_request_func.py
@@ -417,14 +417,35 @@ def get_model(pretrained_model_name_or_path: str) -> str:


def get_tokenizer(
    pretrained_model_name_or_path: str, trust_remote_code: bool
    pretrained_model_name_or_path: str,
    tokenizer_mode: str = "auto",
    trust_remote_code: bool = False,
    **kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
            pretrained_model_name_or_path):
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                         trust_remote_code=trust_remote_code)
    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):
            raise ValueError(
                "Cannot use the fast tokenizer in slow tokenizer mode.")
        kwargs["use_fast"] = False
    if tokenizer_mode == "mistral":
        try:
            from vllm.transformers_utils.tokenizer import MistralTokenizer
        except ImportError as e:
            raise ImportError("MistralTokenizer requires vllm package.\n"
                              "Please install it with `pip install vllm` "
                              "to use mistral tokenizer mode.") from e
        return MistralTokenizer.from_pretrained(
            str(pretrained_model_name_or_path))
    else:
        return AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust_remote_code,
            **kwargs,
        )


ASYNC_REQUEST_FUNCS = {