diff --git a/docs/configuration.md b/docs/configuration.md index 22e3a8bf06..d9fe733ab5 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -245,6 +245,7 @@ A model's parameters are defined in [model source code](https://github.com/pytor * `maxWorkers`: the maximum number of workers of a model * `batchSize`: the batch size of a model * `maxBatchDelay`: the maximum delay in msec of a batch of a model +* `startupTimeout`: the timeout in sec of a specific model's startup. This setting takes priority over `default_startup_timeout` which is a default timeout over all models * `responseTimeout`: the timeout in sec of a specific model's response. This setting takes priority over `default_response_timeout` which is a default timeout over all models * `defaultVersion`: the default version of a model * `marName`: the mar file name of a model @@ -295,6 +296,7 @@ Most of the following properties are designed for performance tuning. Adjusting * `job_queue_size`: Number inference jobs that frontend will queue before backend can serve. Default: 100. * `async_logging`: Enable asynchronous logging for higher throughput, log output may be delayed if this is enabled. Default: false. * `default_response_timeout`: Timeout, in seconds, used for all models backend workers before they are deemed unresponsive and rebooted. Default: 120 seconds. +* `default_startup_timeout`: Specifies the maximum time, in seconds, allowed for model backend workers to initialize and become ready. If a worker fails to start within this timeframe, it is considered unresponsive and will be restarted. Default: 120 seconds. * `unregister_model_timeout`: Timeout, in seconds, used when handling an unregister model request when cleaning a process before it is deemed unresponsive and an error response is sent. Default: 120 seconds. * `decode_input_request`: Configuration to let backend workers to decode requests, when the content type is known. If this is set to "true", backend workers do "Bytearray to JSON object" conversion when the content type is "application/json" and diff --git a/docs/large_model_inference.md b/docs/large_model_inference.md index 440da33cde..173f002177 100644 --- a/docs/large_model_inference.md +++ b/docs/large_model_inference.md @@ -233,7 +233,8 @@ To reduce model latency we recommend: #### Tune [model config YAML file](https://github.com/pytorch/serve/blob/5ee02e4f050c9b349025d87405b246e970ee710b/model-archiver/README.md) You can tune the model config YAML file to get better performance in the following ways: -* Update the [responseTimeout](https://github.com/pytorch/serve/blob/5ee02e4f050c9b349025d87405b246e970ee710b/docs/configuration.md?plain=1#L216) if high model loading or inference latency causes response timeout. +* Update the [responseTimeout](https://github.com/pytorch/serve/blob/5ee02e4f050c9b349025d87405b246e970ee710b/docs/configuration.md?plain=1#L216) if high model inference latency causes response timeout. +* Update the [startupTimeout](https://github.com/pytorch/serve/blob/5ee02e4f050c9b349025d87405b246e970ee710b/docs/configuration.md?plain=1#L216) if high model loading latency causes startup timeout. * Tune the [torchrun parameters](https://github.com/pytorch/serve/blob/2f1f52f553e83703b5c380c2570a36708ee5cafa/model-archiver/README.md?plain=1#L179). The supported parameters are defined at [here](https://github.com/pytorch/serve/blob/2f1f52f553e83703b5c380c2570a36708ee5cafa/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java#L329). 
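As a concrete illustration of the `startupTimeout` bullet above, the sketch below writes a minimal model-config YAML that gives a slow-loading model extra time to initialize while keeping the usual response timeout. The key names mirror this change; the specific values and the output file name are only illustrative, and the archive step is only noted in a comment.

```python
# Minimal sketch: write a model-config YAML that raises the per-model startup
# timeout while keeping the usual response timeout. Values are illustrative.
from pathlib import Path

model_config_yaml = """\
minWorkers: 1
maxWorkers: 1
responseTimeout: 120   # seconds allowed for each inference response
startupTimeout: 600    # seconds allowed for the worker to load the model
"""

Path("model_config.yaml").write_text(model_config_yaml)
# The file would typically be passed to torch-model-archiver via
# --config-file model_config.yaml when building the .mar, so the frontend
# picks up both timeouts when the model is registered.
```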
For example, by default, `OMP_NUMBER_THREADS` is 1. This can be modified in the YAML file. ```yaml #frontend settings diff --git a/docs/management_api.md b/docs/management_api.md index 6d2392492b..ed14849e21 100644 --- a/docs/management_api.md +++ b/docs/management_api.md @@ -39,6 +39,7 @@ To use this API after TorchServe starts, model API control has to be enabled. Ad * `initial_workers` - the number of initial workers to create. The default value is `0`. TorchServe will not run inference until there is at least one work assigned. * `synchronous` - whether or not the creation of worker is synchronous. The default value is false. TorchServe will create new workers without waiting for acknowledgement that the previous worker is online. * `response_timeout` - If the model's backend worker doesn't respond with inference response within this timeout period, the worker will be deemed unresponsive and rebooted. The units is seconds. The default value is 120 seconds. +* `startup_timeout` - If the model's backend worker doesn't load the model within this timeout period, the worker will be deemed unresponsive and rebooted. The units is seconds. The default value is 120 seconds. ```bash curl -X POST "http://localhost:8081/models?url=https://torchserve.pytorch.org/mar_files/squeezenet1_1.mar" diff --git a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java index 978967af35..c83da0523c 100644 --- a/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java +++ b/frontend/archive/src/main/java/org/pytorch/serve/archive/model/ModelConfig.java @@ -21,6 +21,8 @@ public class ModelConfig { private int maxBatchDelay; /** the timeout in sec of a specific model's response. */ private int responseTimeout = 120; // unit: sec + /** the timeout in sec of a specific model's startup. */ + private int startupTimeout = 120; // unit: sec /** * the device type where the model is loaded. It can be gpu, cpu. The model is loaded on CPU if * deviceType: "cpu" is set on a GPU host. 
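For reference, registering a model with the new `startup_timeout` query parameter could look like the sketch below. It assumes a running TorchServe instance with the management API on the default `localhost:8081`; the mar URL is the one used in the curl example above, and the timeout values are illustrative.

```python
# Minimal sketch: register a model and give its worker a longer startup window
# than the inference response timeout. Assumes TorchServe is already running.
import requests

params = {
    "url": "https://torchserve.pytorch.org/mar_files/squeezenet1_1.mar",
    "initial_workers": 1,
    "synchronous": "true",
    "response_timeout": 120,  # seconds allowed per inference response
    "startup_timeout": 300,   # seconds allowed for the worker to load the model
}

resp = requests.post("http://localhost:8081/models", params=params)
print(resp.status_code, resp.text)
```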
@@ -122,6 +124,13 @@ public static ModelConfig build(Map yamlMap) { logger.warn("Invalid responseTimeout: {}, should be integer", v); } break; + case "startupTimeout": + if (v instanceof Integer) { + modelConfig.setStartupTimeout((int) v); + } else { + logger.warn("Invalid startupTimeout: {}, should be integer", v); + } + break; case "deviceType": if (v instanceof String) { modelConfig.setDeviceType((String) v); @@ -319,6 +328,18 @@ public void setResponseTimeout(int responseTimeout) { this.responseTimeout = responseTimeout; } + public int getStartupTimeout() { + return startupTimeout; + } + + public void setStartupTimeout(int startupTimeout) { + if (startupTimeout <= 0) { + logger.warn("Invalid startupTimeout:{}", startupTimeout); + return; + } + this.startupTimeout = startupTimeout; + } + public List getDeviceIds() { return deviceIds; } diff --git a/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelConfigTest.java b/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelConfigTest.java index 7a6171d198..4c3db572c7 100644 --- a/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelConfigTest.java +++ b/frontend/archive/src/test/java/org/pytorch/serve/archive/model/ModelConfigTest.java @@ -21,6 +21,7 @@ public void testValidYamlConfig() throws InvalidModelException, IOException { Assert.assertEquals(modelConfig.getBatchSize(), 1); Assert.assertEquals(modelConfig.getMaxBatchDelay(), 100); Assert.assertEquals(modelConfig.getResponseTimeout(), 120); + Assert.assertEquals(modelConfig.getStartupTimeout(), 120); Assert.assertEquals(modelConfig.getDeviceType(), ModelConfig.DeviceType.GPU); Assert.assertEquals(modelConfig.getParallelLevel(), 4); Assert.assertEquals(modelConfig.getParallelType(), ModelConfig.ParallelType.PP); @@ -42,6 +43,7 @@ public void testInvalidYamlConfig() throws InvalidModelException, IOException { Assert.assertEquals(modelConfig.getBatchSize(), 1); Assert.assertEquals(modelConfig.getMaxBatchDelay(), 100); Assert.assertEquals(modelConfig.getResponseTimeout(), 120); + Assert.assertEquals(modelConfig.getStartupTimeout(), 120); Assert.assertNotEquals(modelConfig.getDeviceType(), ModelConfig.DeviceType.GPU); Assert.assertEquals(modelConfig.getParallelLevel(), 0); Assert.assertNotEquals(modelConfig.getParallelType(), ModelConfig.ParallelType.PPTP); diff --git a/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java b/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java index cbc4613dfb..f5b02ee222 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java +++ b/frontend/server/src/main/java/org/pytorch/serve/ModelServer.java @@ -270,6 +270,7 @@ private void initModelStore() throws InvalidSnapshotException, IOException { -1 * RegisterModelRequest.DEFAULT_BATCH_SIZE, -1 * RegisterModelRequest.DEFAULT_MAX_BATCH_DELAY, configManager.getDefaultResponseTimeout(), + configManager.getDefaultStartupTimeout(), defaultModelName, false, false, diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/messages/DescribeModelResponse.java b/frontend/server/src/main/java/org/pytorch/serve/http/messages/DescribeModelResponse.java index 00b5a62142..3780168c33 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/messages/DescribeModelResponse.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/messages/DescribeModelResponse.java @@ -22,6 +22,7 @@ public class DescribeModelResponse { private int batchSize; private int maxBatchDelay; private int responseTimeout; + private int 
startupTimeout; private long maxRetryTimeoutInSec; private long clientTimeoutInMills; private String parallelType; @@ -132,10 +133,18 @@ public int getResponseTimeout() { return responseTimeout; } + public int getStartupTimeout() { + return startupTimeout; + } + public void setResponseTimeout(int responseTimeout) { this.responseTimeout = responseTimeout; } + public void setStartupTimeout(int startupTimeout) { + this.startupTimeout = startupTimeout; + } + public long getMaxRetryTimeoutInSec() { return maxRetryTimeoutInSec; } diff --git a/frontend/server/src/main/java/org/pytorch/serve/http/messages/RegisterModelRequest.java b/frontend/server/src/main/java/org/pytorch/serve/http/messages/RegisterModelRequest.java index 4cf9024299..45691d3388 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/http/messages/RegisterModelRequest.java +++ b/frontend/server/src/main/java/org/pytorch/serve/http/messages/RegisterModelRequest.java @@ -35,6 +35,9 @@ public class RegisterModelRequest { @SerializedName("response_timeout") private int responseTimeout; + @SerializedName("startup_timeout") + private int startupTimeout; + @SerializedName("url") private String modelUrl; @@ -56,6 +59,7 @@ public RegisterModelRequest(QueryStringDecoder decoder) { ConfigManager.getInstance().getConfiguredDefaultWorkersPerModel()); synchronous = Boolean.parseBoolean(NettyUtils.getParameter(decoder, "synchronous", "true")); responseTimeout = NettyUtils.getIntParameter(decoder, "response_timeout", -1); + startupTimeout = NettyUtils.getIntParameter(decoder, "startup_timeout", -1); modelUrl = NettyUtils.getParameter(decoder, "url", null); s3SseKms = Boolean.parseBoolean(NettyUtils.getParameter(decoder, "s3_sse_kms", "false")); } @@ -74,6 +78,7 @@ public RegisterModelRequest(org.pytorch.serve.grpc.management.RegisterModelReque ConfigManager.getInstance().getConfiguredDefaultWorkersPerModel()); synchronous = request.getSynchronous(); responseTimeout = GRPCUtils.getRegisterParam(request.getResponseTimeout(), -1); + startupTimeout = GRPCUtils.getRegisterParam(request.getStartupTimeout(), -1); modelUrl = GRPCUtils.getRegisterParam(request.getUrl(), null); s3SseKms = request.getS3SseKms(); } @@ -84,6 +89,7 @@ public RegisterModelRequest() { synchronous = true; initialWorkers = ConfigManager.getInstance().getConfiguredDefaultWorkersPerModel(); responseTimeout = -1; + startupTimeout = -1; s3SseKms = false; } @@ -119,6 +125,10 @@ public Integer getResponseTimeout() { return responseTimeout; } + public Integer getStartupTimeout() { + return startupTimeout; + } + public String getModelUrl() { return modelUrl; } diff --git a/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java b/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java index b621bf39fb..0dd4fe81b0 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java +++ b/frontend/server/src/main/java/org/pytorch/serve/openapi/OpenApiUtils.java @@ -294,6 +294,12 @@ private static Operation getRegisterOperation() { "integer", "2", "Maximum time, in seconds, the TorchServe waits for a response from the model inference code, default: 120.")); + operation.addParameter( + new QueryParameter( + "startup_timeout", + "integer", + "120", + "Maximum time, in seconds, the TorchServe waits for the model to startup/initialize, default: 120.")); operation.addParameter( new QueryParameter( "initial_workers", diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java 
b/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java index 30ce8b156d..46c476affd 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java +++ b/frontend/server/src/main/java/org/pytorch/serve/util/ApiUtils.java @@ -122,10 +122,14 @@ public static StatusResponse registerModel(RegisterModelRequest registerModelReq int maxBatchDelay = registerModelRequest.getMaxBatchDelay(); int initialWorkers = registerModelRequest.getInitialWorkers(); int responseTimeout = registerModelRequest.getResponseTimeout(); + int startupTimeout = registerModelRequest.getStartupTimeout(); boolean s3SseKms = registerModelRequest.getS3SseKms(); if (responseTimeout == -1) { responseTimeout = ConfigManager.getInstance().getDefaultResponseTimeout(); } + if (startupTimeout == -1) { + startupTimeout = ConfigManager.getInstance().getDefaultStartupTimeout(); + } Manifest.RuntimeType runtimeType = null; if (runtime != null) { @@ -144,6 +148,7 @@ public static StatusResponse registerModel(RegisterModelRequest registerModelReq batchSize, maxBatchDelay, responseTimeout, + startupTimeout, initialWorkers, registerModelRequest.getSynchronous(), false, @@ -158,6 +163,7 @@ public static StatusResponse handleRegister( int batchSize, int maxBatchDelay, int responseTimeout, + int startupTimeout, int initialWorkers, boolean isSync, boolean isWorkflowModel, @@ -177,6 +183,7 @@ public static StatusResponse handleRegister( batchSize, maxBatchDelay, responseTimeout, + startupTimeout, null, false, isWorkflowModel, @@ -403,6 +410,7 @@ private static DescribeModelResponse createModelResponse( resp.setModelVersion(manifest.getModel().getModelVersion()); resp.setRuntime(manifest.getRuntime().getValue()); resp.setResponseTimeout(model.getResponseTimeout()); + resp.setStartupTimeout(model.getStartupTimeout()); resp.setMaxRetryTimeoutInSec(model.getMaxRetryTimeoutInMill() / 1000); resp.setClientTimeoutInMills(model.getClientTimeoutInMills()); resp.setParallelType(model.getParallelType().getParallelType()); diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java index 12ece0b54b..791dac511c 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java +++ b/frontend/server/src/main/java/org/pytorch/serve/util/ConfigManager.java @@ -65,6 +65,7 @@ public final class ConfigManager { private static final String TS_BLACKLIST_ENV_VARS = "blacklist_env_vars"; private static final String TS_DEFAULT_WORKERS_PER_MODEL = "default_workers_per_model"; private static final String TS_DEFAULT_RESPONSE_TIMEOUT = "default_response_timeout"; + private static final String TS_DEFAULT_STARTUP_TIMEOUT = "default_startup_timeout"; private static final String TS_UNREGISTER_MODEL_TIMEOUT = "unregister_model_timeout"; private static final String TS_NUMBER_OF_NETTY_THREADS = "number_of_netty_threads"; private static final String TS_NETTY_CLIENT_THREADS = "netty_client_threads"; @@ -879,6 +880,10 @@ public int getDefaultResponseTimeout() { return Integer.parseInt(prop.getProperty(TS_DEFAULT_RESPONSE_TIMEOUT, "120")); } + public int getDefaultStartupTimeout() { + return Integer.parseInt(prop.getProperty(TS_DEFAULT_STARTUP_TIMEOUT, "120")); + } + public int getUnregisterModelTimeout() { return Integer.parseInt(prop.getProperty(TS_UNREGISTER_MODEL_TIMEOUT, "120")); } diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java index 
5debd238d0..c41a495260 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/Model.java @@ -32,6 +32,7 @@ public class Model { public static final String BATCH_SIZE = "batchSize"; public static final String MAX_BATCH_DELAY = "maxBatchDelay"; public static final String RESPONSE_TIMEOUT = "responseTimeout"; + public static final String STARTUP_TIMEOUT = "startupTimeout"; public static final String PARALLEL_LEVEL = "parallelLevel"; public static final String DEFAULT_VERSION = "defaultVersion"; public static final String MAR_NAME = "marName"; @@ -57,6 +58,7 @@ public class Model { private ReentrantLock lock; private ReentrantLock jobGroupLock; private int responseTimeout; + private int startupTimeout; private long sequenceMaxIdleMSec; private long sequenceTimeoutMSec; private int maxNumSequence; @@ -178,6 +180,7 @@ public JsonObject getModelState(boolean isDefaultVersion) { modelInfo.addProperty(BATCH_SIZE, getBatchSize()); modelInfo.addProperty(MAX_BATCH_DELAY, getMaxBatchDelay()); modelInfo.addProperty(RESPONSE_TIMEOUT, getResponseTimeout()); + modelInfo.addProperty(STARTUP_TIMEOUT, getStartupTimeout()); modelInfo.addProperty(RUNTIME_TYPE, getRuntimeType().getValue()); if (parallelLevel > 0) { modelInfo.addProperty(PARALLEL_LEVEL, parallelLevel); @@ -191,6 +194,7 @@ public void setModelState(JsonObject modelInfo) { maxWorkers = modelInfo.get(MAX_WORKERS).getAsInt(); maxBatchDelay = modelInfo.get(MAX_BATCH_DELAY).getAsInt(); responseTimeout = modelInfo.get(RESPONSE_TIMEOUT).getAsInt(); + startupTimeout = modelInfo.get(STARTUP_TIMEOUT).getAsInt(); batchSize = modelInfo.get(BATCH_SIZE).getAsInt(); JsonElement runtime = modelInfo.get(RUNTIME_TYPE); @@ -537,10 +541,18 @@ public int getResponseTimeout() { return ConfigManager.getInstance().isDebug() ? Integer.MAX_VALUE : responseTimeout; } + public int getStartupTimeout() { + return ConfigManager.getInstance().isDebug() ? 
Integer.MAX_VALUE : startupTimeout; + } + public void setResponseTimeout(int responseTimeout) { this.responseTimeout = responseTimeout; } + public void setStartupTimeout(int startupTimeout) { + this.startupTimeout = startupTimeout; + } + public List getDeviceIds() { return this.deviceIds; } diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/ModelManager.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/ModelManager.java index e3935c9d56..d5f78c7f53 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/ModelManager.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/ModelManager.java @@ -79,6 +79,7 @@ public ModelArchive registerModel(String url, String defaultModelName) -1 * RegisterModelRequest.DEFAULT_BATCH_SIZE, -1 * RegisterModelRequest.DEFAULT_MAX_BATCH_DELAY, configManager.getDefaultResponseTimeout(), + configManager.getDefaultStartupTimeout(), defaultModelName, false, false, @@ -120,6 +121,7 @@ public ModelArchive registerModel( int batchSize, int maxBatchDelay, int responseTimeout, + int startupTimeout, String defaultModelName, boolean ignoreDuplicate, boolean isWorkflowModel, @@ -143,7 +145,13 @@ public ModelArchive registerModel( } Model tempModel = - createModel(archive, batchSize, maxBatchDelay, responseTimeout, isWorkflowModel); + createModel( + archive, + batchSize, + maxBatchDelay, + responseTimeout, + startupTimeout, + isWorkflowModel); String versionId = archive.getModelVersion(); @@ -386,6 +394,7 @@ private Model createModel( int batchSize, int maxBatchDelay, int responseTimeout, + int startupTimeout, boolean isWorkflowModel) { Model model = new Model(archive, configManager.getJobQueueSize()); @@ -435,6 +444,7 @@ private Model createModel( if (archive.getModelConfig() != null) { int marResponseTimeout = archive.getModelConfig().getResponseTimeout(); + int marStartupTimeout = archive.getModelConfig().getStartupTimeout(); responseTimeout = marResponseTimeout > 0 ? marResponseTimeout @@ -443,6 +453,14 @@ private Model createModel( archive.getModelVersion(), Model.RESPONSE_TIMEOUT, responseTimeout); + startupTimeout = + marStartupTimeout > 0 + ? marStartupTimeout + : configManager.getJsonIntValue( + archive.getModelName(), + archive.getModelVersion(), + Model.STARTUP_TIMEOUT, + startupTimeout); } else { responseTimeout = configManager.getJsonIntValue( @@ -450,8 +468,15 @@ private Model createModel( archive.getModelVersion(), Model.RESPONSE_TIMEOUT, responseTimeout); + startupTimeout = + configManager.getJsonIntValue( + archive.getModelName(), + archive.getModelVersion(), + Model.STARTUP_TIMEOUT, + startupTimeout); } model.setResponseTimeout(responseTimeout); + model.setStartupTimeout(startupTimeout); model.setWorkflowModel(isWorkflowModel); model.setRuntimeType( configManager.getJsonRuntimeTypeValue( diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java index 8a73e91412..bedf5fac3e 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java @@ -79,6 +79,7 @@ public class WorkerThread implements Runnable { protected WorkerState state; protected WorkerLifeCycle lifeCycle; protected int responseTimeout; + protected int startupTimeout; protected long recoveryStartTS; // 0: default value. 
no recovery needed, in healthy mode protected BaseModelRequest req = null; @@ -180,6 +181,7 @@ public WorkerLifeCycle getLifeCycle() { @Override public void run() { responseTimeout = model.getResponseTimeout(); + startupTimeout = model.getStartupTimeout(); Thread thread = Thread.currentThread(); thread.setName(getWorkerName()); currentThread.set(thread); @@ -192,6 +194,8 @@ public void run() { while (isRunning()) { req = aggregator.getRequest(workerId, state); WorkerCommands workerCmd = req.getCommand(); + // depending on type of worker command we determine which timeout we should use + int timeout = (workerCmd == WorkerCommands.LOAD) ? startupTimeout : responseTimeout; long wtStartTime = System.currentTimeMillis(); int repeats = getRepeats(workerCmd); @@ -225,8 +229,9 @@ public void run() { do { long begin = System.currentTimeMillis(); + for (int i = 0; i < repeats; i++) { - reply = replies.poll(responseTimeout, TimeUnit.SECONDS); + reply = replies.poll(timeout, TimeUnit.SECONDS); if (req.getCommand() != WorkerCommands.LOAD) { break; } @@ -291,11 +296,19 @@ public void run() { if (state == WorkerState.WORKER_SCALED_DOWN || state == WorkerState.WORKER_STOPPED) { logger.debug("Shutting down the thread .. Scaling down."); } else { - logger.debug( - "Backend worker monitoring thread interrupted or backend worker process died., responseTimeout:" - + responseTimeout - + "sec", - e); + if (state == WorkerState.WORKER_STARTED) { + logger.debug( + "Backend worker monitoring thread interrupted or backend worker process died., startupTimeout:" + + startupTimeout + + "sec", + e); + } else { + logger.debug( + "Backend worker monitoring thread interrupted or backend worker process died., responseTimeout:" + + responseTimeout + + "sec", + e); + } } } catch (WorkerInitializationException e) { logger.error("Backend worker error", e); diff --git a/frontend/server/src/main/java/org/pytorch/serve/workflow/WorkflowManager.java b/frontend/server/src/main/java/org/pytorch/serve/workflow/WorkflowManager.java index 8f8bf87c08..e5844dbbe0 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/workflow/WorkflowManager.java +++ b/frontend/server/src/main/java/org/pytorch/serve/workflow/WorkflowManager.java @@ -103,15 +103,21 @@ private WorkFlow createWorkflow(WorkflowArchive archive) } public StatusResponse registerWorkflow( - String workflowName, String url, int responseTimeout, boolean synchronous) + String workflowName, + String url, + int responseTimeout, + int startupTimeout, + boolean synchronous) throws WorkflowException { - return registerWorkflow(workflowName, url, responseTimeout, synchronous, false); + return registerWorkflow( + workflowName, url, responseTimeout, startupTimeout, synchronous, false); } public StatusResponse registerWorkflow( String workflowName, String url, int responseTimeout, + int startupTimeout, boolean synchronous, boolean s3SseKms) throws WorkflowException { @@ -149,7 +155,12 @@ public StatusResponse registerWorkflow( futures.add( executorCompletionService.submit( - () -> registerModelWrapper(wfm, responseTimeout, synchronous))); + () -> + registerModelWrapper( + wfm, + responseTimeout, + startupTimeout, + synchronous))); } int i = 0; @@ -225,7 +236,7 @@ public StatusResponse registerWorkflow( } public ModelRegistrationResult registerModelWrapper( - WorkflowModel wfm, int responseTimeout, boolean synchronous) { + WorkflowModel wfm, int responseTimeout, int startupTimeout, boolean synchronous) { StatusResponse status = new StatusResponse(); try { status = @@ -237,6 +248,7 @@ 
public ModelRegistrationResult registerModelWrapper( wfm.getBatchSize(), wfm.getMaxBatchDelay(), responseTimeout, + startupTimeout, wfm.getMaxWorkers(), synchronous, true, diff --git a/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowMgmtRequestHandler.java b/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowMgmtRequestHandler.java index 6125de61e7..f3e045a07f 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowMgmtRequestHandler.java +++ b/frontend/server/src/main/java/org/pytorch/serve/workflow/api/http/WorkflowMgmtRequestHandler.java @@ -153,6 +153,7 @@ private void handleRegisterWorkflows( registerWFRequest.getWorkflowName(), registerWFRequest.getWorkflowUrl(), registerWFRequest.getResponseTimeout(), + registerWFRequest.getStartupTimeout(), true, registerWFRequest.getS3SseKms()); diff --git a/frontend/server/src/main/java/org/pytorch/serve/workflow/messages/RegisterWorkflowRequest.java b/frontend/server/src/main/java/org/pytorch/serve/workflow/messages/RegisterWorkflowRequest.java index 5da63feb2b..f61a026a79 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/workflow/messages/RegisterWorkflowRequest.java +++ b/frontend/server/src/main/java/org/pytorch/serve/workflow/messages/RegisterWorkflowRequest.java @@ -11,6 +11,9 @@ public class RegisterWorkflowRequest { @SerializedName("response_timeout") private int responseTimeout; + @SerializedName("startup_timeout") + private int startupTimeout; + @SerializedName("url") private String workflowUrl; @@ -20,6 +23,7 @@ public class RegisterWorkflowRequest { public RegisterWorkflowRequest(QueryStringDecoder decoder) { workflowName = NettyUtils.getParameter(decoder, "workflow_name", null); responseTimeout = NettyUtils.getIntParameter(decoder, "response_timeout", 120); + startupTimeout = NettyUtils.getIntParameter(decoder, "startup_timeout", 120); workflowUrl = NettyUtils.getParameter(decoder, "url", null); s3SseKms = Boolean.parseBoolean(NettyUtils.getParameter(decoder, "s3_sse_kms", "false")); } @@ -40,6 +44,14 @@ public void setResponseTimeout(int responseTimeout) { this.responseTimeout = responseTimeout; } + public int getStartupTimeout() { + return startupTimeout; + } + + public void setStartupTimeout(int startupTimeout) { + this.startupTimeout = startupTimeout; + } + public String getWorkflowUrl() { return workflowUrl; } diff --git a/frontend/server/src/main/resources/proto/management.proto b/frontend/server/src/main/resources/proto/management.proto index ad65c33caa..0d53258565 100644 --- a/frontend/server/src/main/resources/proto/management.proto +++ b/frontend/server/src/main/resources/proto/management.proto @@ -56,6 +56,9 @@ message RegisterModelRequest { // Decides whether S3 SSE KMS enabled or not, default: false. bool s3_sse_kms = 10; //optional + + // Maximum time, in seconds, the TorchServe waits for a model to startup, default: 120. 
+ int32 startup_timeout = 11; //optional } message ScaleWorkerRequest { diff --git a/frontend/server/src/test/java/org/pytorch/serve/EnsembleTest.java b/frontend/server/src/test/java/org/pytorch/serve/EnsembleTest.java index 0d1165d999..b879e71e16 100644 --- a/frontend/server/src/test/java/org/pytorch/serve/EnsembleTest.java +++ b/frontend/server/src/test/java/org/pytorch/serve/EnsembleTest.java @@ -169,6 +169,7 @@ public void testWorkflowYaml() throws Exception { "test.war", "file:///Users/demo/git/serve/frontend/server/src/test/resources/test.war", 300, + 300, true); } catch (Exception e) { System.out.println(e.getMessage()); diff --git a/frontend/server/src/test/resources/management_open_api.json b/frontend/server/src/test/resources/management_open_api.json index 21f23577c8..315abf0095 100644 --- a/frontend/server/src/test/resources/management_open_api.json +++ b/frontend/server/src/test/resources/management_open_api.json @@ -242,6 +242,16 @@ "default": "2" } }, + { + "in": "query", + "name": "startup_timeout", + "description": "Maximum time, in seconds, the TorchServe waits for the model to startup/initialize, default: 120.", + "required": false, + "schema": { + "type": "integer", + "default": "120" + } + }, { "in": "query", "name": "initial_workers", diff --git a/frontend/server/src/test/resources/snapshots/snapshot1.cfg b/frontend/server/src/test/resources/snapshots/snapshot1.cfg index bbd4eae8a2..470ebab953 100644 --- a/frontend/server/src/test/resources/snapshots/snapshot1.cfg +++ b/frontend/server/src/test/resources/snapshots/snapshot1.cfg @@ -9,7 +9,7 @@ default_workers_per_model=4 model_store=../archive/src/test/resources/models async_logging=true number_of_gpu=0 -model_snapshot={\n "name"\: "20200329045334828-startup.cfg",\n "modelCount"\: 1,\n "created"\: 1585457614832,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 4,\n "maxWorkers"\: 4,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} +model_snapshot={\n "name"\: "20200329045334828-startup.cfg",\n "modelCount"\: 1,\n "created"\: 1585457614832,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 4,\n "maxWorkers"\: 4,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} tsConfigFile=src/test/resources/config.properties max_response_size=2047093252 max_request_size=2047093252 diff --git a/frontend/server/src/test/resources/snapshots/snapshot3.cfg b/frontend/server/src/test/resources/snapshots/snapshot3.cfg index d1e23e021e..d27868c653 100644 --- a/frontend/server/src/test/resources/snapshots/snapshot3.cfg +++ b/frontend/server/src/test/resources/snapshots/snapshot3.cfg @@ -9,7 +9,7 @@ default_workers_per_model=4 model_store=../archive/src/test/resources/models async_logging=true number_of_gpu=0 -model_snapshot={\n "name"\: "20200329045335910-snapshot.cfg",\n "modelCount"\: 1,\n "created"\: 1585457615910,\n "models"\: {\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 0,\n "maxWorkers"\: 0,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} +model_snapshot={\n "name"\: "20200329045335910-snapshot.cfg",\n "modelCount"\: 1,\n "created"\: 1585457615910,\n "models"\: {\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: 
"noop.mar",\n "minWorkers"\: 0,\n "maxWorkers"\: 0,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} tsConfigFile=src/test/resources/config.properties max_response_size=2047093252 max_request_size=2047093252 diff --git a/frontend/server/src/test/resources/snapshots/snapshot4.cfg b/frontend/server/src/test/resources/snapshots/snapshot4.cfg index 7479f088ac..03574b2237 100644 --- a/frontend/server/src/test/resources/snapshots/snapshot4.cfg +++ b/frontend/server/src/test/resources/snapshots/snapshot4.cfg @@ -9,7 +9,7 @@ default_workers_per_model=4 model_store=../archive/src/test/resources/models async_logging=true number_of_gpu=0 -model_snapshot={\n "name"\: "20200329065301226-snapshot.cfg",\n "modelCount"\: 1,\n "created"\: 1585464781226,\n "models"\: {\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} +model_snapshot={\n "name"\: "20200329065301226-snapshot.cfg",\n "modelCount"\: 1,\n "created"\: 1585464781226,\n "models"\: {\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} tsConfigFile=src/test/resources/config.properties max_response_size=2047093252 max_request_size=2047093252 diff --git a/frontend/server/src/test/resources/snapshots/snapshot5.cfg b/frontend/server/src/test/resources/snapshots/snapshot5.cfg index bfd6cd7069..eed52178f0 100644 --- a/frontend/server/src/test/resources/snapshots/snapshot5.cfg +++ b/frontend/server/src/test/resources/snapshots/snapshot5.cfg @@ -9,7 +9,7 @@ default_workers_per_model=4 model_store=../archive/src/test/resources/models async_logging=true number_of_gpu=0 -model_snapshot={\n "name"\: "20200329072056820-snapshot.cfg",\n "modelCount"\: 2,\n "created"\: 1585466456820,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} +model_snapshot={\n "name"\: "20200329072056820-snapshot.cfg",\n "modelCount"\: 2,\n "created"\: 1585466456820,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} tsConfigFile=src/test/resources/config.properties max_response_size=2047093252 max_request_size=2047093252 diff --git a/frontend/server/src/test/resources/snapshots/snapshot6.cfg b/frontend/server/src/test/resources/snapshots/snapshot6.cfg index c62d659597..bd19e0a727 100644 --- 
a/frontend/server/src/test/resources/snapshots/snapshot6.cfg +++ b/frontend/server/src/test/resources/snapshots/snapshot6.cfg @@ -9,7 +9,7 @@ default_workers_per_model=4 model_store=../archive/src/test/resources/models async_logging=true number_of_gpu=0 -model_snapshot={\n "name"\: "20200329072056831-snapshot.cfg",\n "modelCount"\: 3,\n "created"\: 1585466456831,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noopversioned"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} +model_snapshot={\n "name"\: "20200329072056831-snapshot.cfg",\n "modelCount"\: 3,\n "created"\: 1585466456831,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noopversioned"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} tsConfigFile=src/test/resources/config.properties max_response_size=2047093252 max_request_size=2047093252 diff --git a/frontend/server/src/test/resources/snapshots/snapshot7.cfg b/frontend/server/src/test/resources/snapshots/snapshot7.cfg index cc38f325cf..96c9a606e8 100644 --- a/frontend/server/src/test/resources/snapshots/snapshot7.cfg +++ b/frontend/server/src/test/resources/snapshots/snapshot7.cfg @@ -9,7 +9,7 @@ default_workers_per_model=4 model_store=../archive/src/test/resources/models async_logging=true number_of_gpu=0 -model_snapshot={\n "name"\: "20200329072056839-snapshot.cfg",\n "modelCount"\: 4,\n "created"\: 1585466456840,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noopversioned"\: {\n "1.2.1"\: {\n "defaultVersion"\: false,\n "marName"\: "noop_v2.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n },\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} 
+model_snapshot={\n "name"\: "20200329072056839-snapshot.cfg",\n "modelCount"\: 4,\n "created"\: 1585466456840,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noopversioned"\: {\n "1.2.1"\: {\n "defaultVersion"\: false,\n "marName"\: "noop_v2.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n },\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} tsConfigFile=src/test/resources/config.properties max_response_size=2047093252 max_request_size=2047093252 diff --git a/frontend/server/src/test/resources/snapshots/snapshot8.cfg b/frontend/server/src/test/resources/snapshots/snapshot8.cfg index bd5c5b3d64..2094b21761 100644 --- a/frontend/server/src/test/resources/snapshots/snapshot8.cfg +++ b/frontend/server/src/test/resources/snapshots/snapshot8.cfg @@ -9,7 +9,7 @@ default_workers_per_model=4 model_store=../archive/src/test/resources/models async_logging=true number_of_gpu=0 -model_snapshot={\n "name"\: "20200329072459543-snapshot.cfg",\n "modelCount"\: 4,\n "created"\: 1585466699543,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noopversioned"\: {\n "1.2.1"\: {\n "defaultVersion"\: true,\n "marName"\: "noop_v2.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n },\n "1.11"\: {\n "defaultVersion"\: false,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} +model_snapshot={\n "name"\: "20200329072459543-snapshot.cfg",\n "modelCount"\: 4,\n "created"\: 1585466699543,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noopversioned"\: {\n "1.2.1"\: {\n "defaultVersion"\: true,\n "marName"\: "noop_v2.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 
120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n },\n "1.11"\: {\n "defaultVersion"\: false,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} tsConfigFile=src/test/resources/config.properties max_response_size=2047093252 max_request_size=2047093252 diff --git a/frontend/server/src/test/resources/snapshots/snapshot9.cfg b/frontend/server/src/test/resources/snapshots/snapshot9.cfg index 7b34b649b3..11b5cc1ca2 100644 --- a/frontend/server/src/test/resources/snapshots/snapshot9.cfg +++ b/frontend/server/src/test/resources/snapshots/snapshot9.cfg @@ -9,7 +9,7 @@ default_workers_per_model=4 model_store=../archive/src/test/resources/models async_logging=true number_of_gpu=0 -model_snapshot={\n "name"\: "20200331062429525-snapshot.cfg",\n "modelCount"\: 4,\n "created"\: 1585635869526,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 2,\n "maxWorkers"\: 2,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noopversioned"\: {\n "1.2.1"\: {\n "defaultVersion"\: true,\n "marName"\: "noop_v2.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n },\n "1.11"\: {\n "defaultVersion"\: false,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} +model_snapshot={\n "name"\: "20200331062429525-snapshot.cfg",\n "modelCount"\: 4,\n "created"\: 1585635869526,\n "models"\: {\n "noop"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noop_v1.0"\: {\n "1.11"\: {\n "defaultVersion"\: true,\n "marName"\: "noop.mar",\n "minWorkers"\: 2,\n "maxWorkers"\: 2,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n },\n "noopversioned"\: {\n "1.2.1"\: {\n "defaultVersion"\: true,\n "marName"\: "noop_v2.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n },\n "1.11"\: {\n "defaultVersion"\: false,\n "marName"\: "noop.mar",\n "minWorkers"\: 1,\n "maxWorkers"\: 1,\n "batchSize"\: 1,\n "maxBatchDelay"\: 100,\n "responseTimeout"\: 120,\n "startupTimeout"\: 120,\n "runtimeType"\: "python"\n }\n }\n }\n} tsConfigFile=src/test/resources/config.properties max_response_size=2047093252 max_request_size=2047093252 diff --git a/test/postman/management_data.json b/test/postman/management_data.json index c5e123b22b..9e642099aa 100644 --- a/test/postman/management_data.json +++ b/test/postman/management_data.json @@ -200,17 +200,17 @@ }, { "type": "register", - "test": "Register Model with Additional Params and response_timeout as 0", + "test": "Register Model with Additional Params and startup_timeout as 0", "METHOD": "POST", - 
"path": "models?url={{mar_path_squeezenet1_1}}&model_name=squeezenet1_1&batch_size=3&initial_workers=3&response_timeout=0", + "path": "models?url={{mar_path_squeezenet1_1}}&model_name=squeezenet1_1&batch_size=3&initial_workers=3&startup_timeout=0", "status_code": 500, "grpc_status_code": 13 }, { "type": "register", - "test": "Register Model with response_timeout as 0", + "test": "Register Model with startup_timeout as 0", "METHOD": "POST", - "path": "models?url={{mar_path_squeezenet1_1}}&model_name=squeezenet1_1&response_timeout=0" + "path": "models?url={{mar_path_squeezenet1_1}}&model_name=squeezenet1_1&startup_timeout=0" }, { "type": "unregister", diff --git a/test/pytest/test_gRPC_management_apis.py b/test/pytest/test_gRPC_management_apis.py index b1dbe7f268..9b606b8315 100644 --- a/test/pytest/test_gRPC_management_apis.py +++ b/test/pytest/test_gRPC_management_apis.py @@ -35,6 +35,7 @@ def __get_query_params(parsed_url): "batch_size", "max_batch_delay", "response_timeout", + "startup_timeout", "limit", "next_page_token", ]: diff --git a/test/pytest/test_startup_timeout.py b/test/pytest/test_startup_timeout.py new file mode 100644 index 0000000000..620ba1f0e4 --- /dev/null +++ b/test/pytest/test_startup_timeout.py @@ -0,0 +1,140 @@ +import shutil +import time +from pathlib import Path +from unittest.mock import patch + +import pytest +import requests +import test_utils +from model_archiver import ModelArchiverConfig + +REPO_ROOT_DIR = Path(__file__).parents[2] +EXAMPLES_DIR = REPO_ROOT_DIR / "examples" +MNIST_DIR = EXAMPLES_DIR / "image_classifier" / "mnist" + +model_pt_file = MNIST_DIR / "mnist_cnn.pt" +model_py_file = MNIST_DIR / "mnist.py" + +HANDLER_PY = """ +import time +from ts.torch_handler.base_handler import BaseHandler + +class CustomHandler(BaseHandler): + def initialize(self, context): + time.sleep(10) # to simulate long startup time + super().initialize(context) + + def handle(self, data, context): + return ["Dummy response"] +""" + +MODEL_CONFIG_YAML_TEMPLATE = """ +minWorkers: 1 +maxWorkers: 1 +responseTimeout: 1 +startupTimeout: {startup_timeout} +""" + + +@pytest.fixture(scope="module") +def model_name(): + return "startup_timeout_test_model" + + +@pytest.fixture(scope="module") +def work_dir(tmp_path_factory): + return tmp_path_factory.mktemp("startup_timeout_test") + + +@pytest.fixture(scope="module", name="mar_file_path", params=[30, 5]) +def create_mar_file(work_dir, model_archiver, model_name, request): + startup_timeout = request.param + model_name = model_name + f"_{startup_timeout}" + mar_file_path = work_dir / f"{model_name}.mar" + + handler_file = work_dir / f"handler_{startup_timeout}.py" + handler_file.write_text(HANDLER_PY) + + model_config_file = work_dir / f"model_config_{startup_timeout}.yaml" + model_config_yaml = MODEL_CONFIG_YAML_TEMPLATE.format( + startup_timeout=startup_timeout + ) + model_config_file.write_text(model_config_yaml) + + config = ModelArchiverConfig( + model_name=model_name, + version="1.0", + serialized_file=str(model_pt_file), + model_file=str(model_py_file), + handler=str(handler_file), + export_path=str(work_dir), + config_file=str(model_config_file), + ) + + with patch("archiver.ArgParser.export_model_args_parser", return_value=config): + model_archiver.generate_model_archive() + + assert mar_file_path.exists() + return str(mar_file_path) + + +@pytest.fixture(scope="function") +def register_model(mar_file_path, model_store, torchserve, request): + shutil.copy(mar_file_path, model_store) + model_name = Path(mar_file_path).stem + 
startup_timeout = request.node.callspec.params["mar_file_path"] + + params = ( + ("model_name", model_name), + ("url", Path(mar_file_path).name), + ("initial_workers", "1"), + ("synchronous", "false"), + ) + + response = test_utils.register_model_with_params(params) + assert response.status_code == 202, "Model registration failed" + + yield model_name, torchserve, startup_timeout + + test_utils.unregister_model(model_name) + + +def test_startup_timeout(register_model): + model_name, torchserve, startup_timeout = register_model + + model_status = "LOADING" + max_wait = 30 + start_time = time.time() + + # We expect model to timeout in this case, since in handler initialization we set time.sleep(10) + if startup_timeout == 5: + expected_error = "org.pytorch.serve.wlm.WorkerInitializationException: Backend worker did not respond in given time" + while (time.time() - start_time) < max_wait: + output = torchserve.get() + if "Backend worker error" in output: + error_message = torchserve.get() + break + else: + assert ( + False + ), f"Timeout waiting for 'Backend worker error' (waited {max_wait} seconds)" + assert ( + expected_error in error_message + ), "Unexpected error message, expected model to timeout during startup" + return + # If startup timeout is set to 30 then we expect model to startup, even though + # response timeout is set to 1 + while (time.time() - start_time) < max_wait: + response = requests.get(f"http://localhost:8081/models/{model_name}") + if response.status_code == 200: + model_status = response.json()[0]["workers"][0]["status"] + if model_status == "READY": + break + time.sleep(1) + + end_time = time.time() + elapsed_time = end_time - start_time + + assert response.status_code == 200, "Model startup failed" + assert model_status == "READY", f"Unexpected model status: {model_status}" + assert 10 <= elapsed_time < 20, f"Unexpected startup time: {elapsed_time} seconds" diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index fad4e8b724..0e3cf7ee17 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -218,6 +218,8 @@ preflight readthedocs req responseTimeout +startupTimeout +timeframe scalability storepass storetype
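To summarize how the pieces above fit together, the following Python sketch (not the Java implementation itself) mirrors the resolution order implemented in ApiUtils/ModelManager and the per-command selection in WorkerThread; the helper names and sample values are illustrative only.

```python
# Simplified sketch of the startup-timeout behavior added in this change:
# 1. ApiUtils: a register request's startup_timeout of -1 falls back to the
#    config.properties default_startup_timeout (120 seconds).
# 2. ModelManager: a positive startupTimeout in the model-config YAML overrides
#    the value from step 1.
# 3. WorkerThread: the startup timeout applies only to LOAD commands; all other
#    commands keep using responseTimeout.

DEFAULT_STARTUP_TIMEOUT = 120  # seconds, mirrors default_startup_timeout


def resolve_startup_timeout(request_timeout: int, yaml_timeout: int = 0) -> int:
    timeout = DEFAULT_STARTUP_TIMEOUT if request_timeout == -1 else request_timeout
    if yaml_timeout > 0:
        timeout = yaml_timeout  # model-config YAML takes priority
    return timeout


def poll_timeout(command: str, startup_timeout: int, response_timeout: int) -> int:
    # The worker waits on the backend reply with the startup timeout only while
    # the model is being loaded.
    return startup_timeout if command == "LOAD" else response_timeout


assert resolve_startup_timeout(-1) == 120
assert resolve_startup_timeout(-1, yaml_timeout=600) == 600
assert poll_timeout("LOAD", 600, 120) == 600
assert poll_timeout("PREDICT", 600, 120) == 120
```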