Commit

Merge branch 'main' of github.com:triton-inference-server/server into yinggeh-DLIS-7061-add-vllm-metrics
yinggeh committed Aug 15, 2024
2 parents 6f601f4 + 5320009 commit 50551f6
Showing 2 changed files with 19 additions and 48 deletions.
65 changes: 18 additions & 47 deletions qa/L0_perf_tensorrt_llm/test.sh
@@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt"
 MODEL_NAME="gpt2_tensorrt_llm"
 NAME="tensorrt_llm_benchmarking_test"
 MODEL_REPOSITORY="$(pwd)/triton_model_repo"
-TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend"
+TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
 GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
 TOKENIZER_DIR="$GPT_DIR/gpt2"
 ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
@@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120}
 function clone_tensorrt_llm_backend_repo {
     rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR
     apt-get update && apt-get install git-lfs -y --no-install-recommends
-    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
+    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
     cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive
 }

 # Update Open MPI to a version compatible with SLURM.
 function upgrade_openmpi {
-    cd /tmp/
     local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}')

     if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then
         # Uninstall the current version of Open MPI
-        wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || {
-            echo "Failed to download Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || {
-            echo "Failed to extract Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || {
-            echo "Failed to uninstall Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || {
-            echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories"
+        rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || {
+            echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION."
             exit 1
         }
-        cd ../ && rm -r openmpi-${CURRENT_VERSION}
     else
-        echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade."
+        echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade."
         return
     fi

     # Install SLURM supported Open MPI version
+    cd /tmp/
     wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || {
         echo "Failed to download Open MPI 5.0.1"
         exit 1
@@ -108,18 +95,6 @@ function upgrade_openmpi {
     mpirun --version
 }

-function install_tensorrt_llm {
-    # Install CMake
-    bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh
-    export PATH="/usr/local/cmake/bin:${PATH}"
-
-    TORCH_INSTALL_TYPE="pypi" &&
-        (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm &&
-            bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE &&
-            python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt &&
-            pip3 install ./build/tensorrt_llm*.whl)
-}
-
 function build_gpt2_base_model {
     # Download weights from HuggingFace Transformers
     cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2
@@ -131,24 +106,21 @@ function build_gpt2_base_model {
     cd ${GPT_DIR}

     # Convert weights from HF Transformers to FT format
-    python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16
+    python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/"
     cd ${BASE_DIR}
 }

 function build_gpt2_tensorrt_engine {
     # Build TensorRT engines
     cd ${GPT_DIR}
-    python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \
-        --world_size="${NUM_GPUS}" \
-        --dtype float16 \
-        --use_inflight_batching \
-        --use_gpt_attention_plugin float16 \
-        --paged_kv_cache \
-        --use_gemm_plugin float16 \
-        --remove_input_padding \
-        --hidden_act gelu \
-        --parallel_build \
-        --output_dir="${ENGINES_DIR}"
+    trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \
+        --gpt_attention_plugin float16 \
+        --remove_input_padding enable \
+        --paged_kv_cache enable \
+        --gemm_plugin float16 \
+        --workers "${NUM_GPUS}" \
+        --output_dir "${ENGINES_DIR}"
+
     cd ${BASE_DIR}
 }

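Note: the conversion and build steps above replace the old single-step build.py flow. With NUM_GPUS=1, the new two-step flow expands to roughly the following (a sketch assuming the gpt2 weights were already cloned into ${GPT_DIR} by build_gpt2_base_model):

# Step 1: convert the HF checkpoint into TensorRT-LLM checkpoint format (tensor parallelism 1).
python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size 1 --output_dir "./c-model/gpt2/1-gpu/"
# Step 2: compile that checkpoint into TensorRT engines with paged KV cache and inflight batching support.
trtllm-build --checkpoint_dir "./c-model/gpt2/1-gpu/" \
    --gpt_attention_plugin float16 \
    --remove_input_padding enable \
    --paged_kv_cache enable \
    --gemm_plugin float16 \
    --workers 1 \
    --output_dir "./engines/inflight_batcher_llm/1-gpu"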
@@ -172,18 +144,18 @@ function prepare_model_repository {
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"

     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"

     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
 }

 # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on
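Note: replace_config_tags is defined elsewhere in test.sh and does not appear in this diff. A hypothetical minimal implementation, showing only what the calls above assume (the real helper may differ):

function replace_config_tags {
    local tag_to_replace="$1"    # literal placeholder, e.g. '${triton_max_batch_size}'
    local new_value="$2"         # replacement text, e.g. "128"
    local config_file_path="$3"  # config.pbtxt to edit in place
    # Substitute every occurrence of the placeholder; the '|' delimiter avoids clashes with '/' in paths.
    sed -i "s|${tag_to_replace}|${new_value}|g" "${config_file_path}"
}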
@@ -244,13 +216,12 @@ function kill_server {

 upgrade_openmpi
 clone_tensorrt_llm_backend_repo
-install_tensorrt_llm
 build_gpt2_base_model
 build_gpt2_tensorrt_engine
 prepare_model_repository

 # Install perf_analyzer
-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient

 ARCH="amd64"
 STATIC_BATCH=1
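Note: the tritonclient wheel bundles the perf_analyzer binary on Linux, so the single pip install above covers the benchmarking steps later in the script. A hypothetical smoke-test invocation against the prepared model (not the exact command this test uses, which is defined further down in the file):

# Stream requests to the decoupled TensorRT-LLM model over gRPC;
# input_data.json must supply the model's required input tensors.
perf_analyzer -m gpt2_tensorrt_llm \
    -i grpc --streaming --async \
    --input-data input_data.json \
    --concurrency-range 1 \
    --measurement-interval 10000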
2 changes: 1 addition & 1 deletion qa/L0_perf_vllm/test.sh
@@ -41,7 +41,7 @@ SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR}
 export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0}
 EXPORT_FILE=profile-export-vllm-model.json

-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient
 rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv

 mkdir -p $MODEL_REPO/$MODEL_NAME/1
