Use startup time in async worker thread instead of worker timeout #3315

Merged · 7 commits · Sep 18, 2024
2 changes: 1 addition & 1 deletion examples/large_models/vllm/llama3/model-config.yaml
@@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

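In these example configs, the 1200-second budget formerly assigned to responseTimeout is now declared as startupTimeout, the per-model load budget that the async worker thread (see the AsyncWorkerThread change below) waits on before failing initialization. The same substitution applies to the lora and mistral configs that follow.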
2 changes: 1 addition & 1 deletion examples/large_models/vllm/lora/Readme.md
@@ -55,7 +55,7 @@ The vllm integration uses an OpenAI compatible interface which lets you perform

Curl:
```bash
-curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1
+curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions
```

Python + Request:
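The readme's own "Python + Request" example falls outside this hunk. As a minimal sketch, an equivalent call with Python's requests library could look like the following; the payload fields follow the OpenAI completions schema and are assumptions, not the contents of the example's prompt.json:

```python
import requests

# Corrected endpoint from the curl example above (note the /completions suffix).
url = "http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions"

# Hypothetical payload in OpenAI completions format; the real prompt.json may differ.
payload = {
    "model": "llama-8b-lora",
    "prompt": "A robot may not injure a human being",
    "max_tokens": 50,
}

response = requests.post(url, json=payload)
print(response.json())
```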
2 changes: 1 addition & 1 deletion examples/large_models/vllm/lora/model-config.yaml
@@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

2 changes: 1 addition & 1 deletion examples/large_models/vllm/mistral/model-config.yaml
@@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
-responseTimeout: 1200
+startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

AsyncWorkerThread.java
@@ -33,7 +33,7 @@
public class AsyncWorkerThread extends WorkerThread {
// protected ConcurrentHashMap requestsInBackend;
protected static final Logger logger = LoggerFactory.getLogger(AsyncWorkerThread.class);
-protected static final long MODEL_LOAD_TIMEOUT = 10L;
+protected static final long WORKER_TIMEOUT = 2L;

protected boolean loadingFinished;
protected CountDownLatch latch;
@@ -53,6 +53,7 @@ public AsyncWorkerThread(
@Override
public void run() {
responseTimeout = model.getResponseTimeout();
+startupTimeout = model.getStartupTimeout();
Thread thread = Thread.currentThread();
thread.setName(getWorkerName());
currentThread.set(thread);
@@ -80,11 +81,11 @@ public void run() {

if (loadingFinished == false) {
latch = new CountDownLatch(1);
-if (!latch.await(MODEL_LOAD_TIMEOUT, TimeUnit.MINUTES)) {
+if (!latch.await(startupTimeout, TimeUnit.SECONDS)) {
throw new WorkerInitializationException(
"Worker did not load the model within"
+ MODEL_LOAD_TIMEOUT
+ " mins");
"Worker did not load the model within "
+ startupTimeout
+ " seconds");
}
}

@@ -99,7 +100,7 @@ public void run() {
logger.debug("Shutting down the thread .. Scaling down.");
} else {
logger.debug(
"Backend worker monitoring thread interrupted or backend worker process died., responseTimeout:"
"Backend worker monitoring thread interrupted or backend worker process died. responseTimeout:"
+ responseTimeout
+ "sec",
e);
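The net effect of the Java change: the async worker now waits on the loading latch for up to the model's configurable startupTimeout (in seconds) instead of a hard-coded 10-minute constant, and reports the timeout in seconds when the budget is exceeded. A purely illustrative Python sketch of the same wait-with-timeout pattern (not TorchServe code):

```python
import threading
import time

startup_timeout = 5  # seconds; plays the role of model.getStartupTimeout()
loading_finished = threading.Event()  # plays the role of the CountDownLatch

def load_model():
    time.sleep(2)            # simulate model loading
    loading_finished.set()   # corresponds to latch.countDown()

threading.Thread(target=load_model, daemon=True).start()

# Corresponds to latch.await(startupTimeout, TimeUnit.SECONDS)
if not loading_finished.wait(timeout=startup_timeout):
    raise RuntimeError(
        f"Worker did not load the model within {startup_timeout} seconds"
    )
print("Model loaded before the startup timeout expired")
```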
11 changes: 10 additions & 1 deletion ts/llm_launcher.py
@@ -67,6 +67,7 @@ def get_model_config(args, model_snapshot_path=None):
"batchSize": 1,
"maxBatchDelay": 100,
"responseTimeout": 1200,
"startupTimeout": args.startup_timeout,
"deviceType": "gpu",
"asyncCommunication": True,
}
@@ -227,7 +228,7 @@ def main(args):
parser.add_argument(
"--vllm_engine.max_num_seqs",
type=int,
-default=16,
+default=256,
help="Max sequences in vllm engine",
)

@@ -245,6 +246,13 @@
help="Cache dir",
)

+parser.add_argument(
+    "--startup_timeout",
+    type=int,
+    default=1200,
+    help="Model startup timeout in seconds",
+)
+
parser.add_argument(
"--engine",
type=str,
@@ -272,6 +280,7 @@
default=0.1,
help="KV Cache free gpu memory fraction",
)

args = parser.parse_args()

main(args)
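With this change the launcher exposes the model load budget on the command line; for example, something like `python -m ts.llm_launcher --model_id <model> --startup_timeout 1800` would raise it to 30 minutes (the invocation beyond the --startup_timeout flag added here is an assumption about the launcher's other arguments). The default of 1200 seconds matches the startupTimeout value set in the example model configs above, and the bump of --vllm_engine.max_num_seqs from 16 to 256 aligns with vLLM's own default for maximum concurrent sequences.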