diff --git a/examples/large_models/vllm/requirements.txt b/examples/large_models/vllm/requirements.txt
index 6d0209c820..a3f418ffdd 100644
--- a/examples/large_models/vllm/requirements.txt
+++ b/examples/large_models/vllm/requirements.txt
@@ -1 +1 @@
-vllm==0.6.1.post2
+vllm==0.6.2
diff --git a/ts/torch_handler/vllm_handler.py b/ts/torch_handler/vllm_handler.py
index 910a9461cc..31d84eb758 100644
--- a/ts/torch_handler/vllm_handler.py
+++ b/ts/torch_handler/vllm_handler.py
@@ -13,7 +13,7 @@
 )
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.entrypoints.openai.serving_engine import BaseModelPath, LoRAModulePath

 from ts.handler_utils.utils import send_intermediate_predict_response
 from ts.service import PredictionException
@@ -54,6 +54,11 @@ def initialize(self, ctx):
         else:
             served_model_names = [vllm_engine_config.model]

+        base_model_paths = [
+            BaseModelPath(name=name, model_path=vllm_engine_config.model)
+            for name in served_model_names
+        ]
+
         chat_template = ctx.model_yaml_config.get("handler", {}).get(
             "chat_template", None
         )
@@ -64,7 +69,7 @@ def initialize(self, ctx):
         self.completion_service = OpenAIServingCompletion(
             self.vllm_engine,
             model_config,
-            served_model_names,
+            base_model_paths,
             lora_modules=lora_modules,
             prompt_adapters=None,
             request_logger=None,
@@ -73,7 +78,7 @@ def initialize(self, ctx):
         self.chat_completion_service = OpenAIServingChat(
             self.vllm_engine,
             model_config,
-            served_model_names,
+            base_model_paths,
             "assistant",
             lora_modules=lora_modules,
             prompt_adapters=None,
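
For reference, a minimal sketch (not part of the patch) of the API change this diff adapts to: with vLLM 0.6.2, OpenAIServingCompletion and OpenAIServingChat take a list of BaseModelPath entries (served name plus model path) where the previous release accepted a bare list of served model names. The served name and model path below are hypothetical placeholders.

    # Sketch only, assuming vLLM 0.6.2; the served name and model path are hypothetical.
    from vllm.entrypoints.openai.serving_engine import BaseModelPath

    served_model_names = ["my-served-model"]   # alias(es) exposed to API clients
    model_path = "facebook/opt-125m"           # model path/ID the engine was loaded from

    # vLLM 0.6.2 expects BaseModelPath objects in place of plain served model names,
    # which is exactly the mapping the updated handler builds above.
    base_model_paths = [
        BaseModelPath(name=name, model_path=model_path)
        for name in served_model_names
    ]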