diff --git a/llgtrt/src/routes/health_check.rs b/llgtrt/src/routes/health_check.rs index 34f9689..5927ee3 100644 --- a/llgtrt/src/routes/health_check.rs +++ b/llgtrt/src/routes/health_check.rs @@ -34,11 +34,13 @@ pub async fn model_check( headers: HeaderMap, State(app_state): State>, ) -> Result { - let req: CompletionCreateParams = serde_json::from_value(json!({ + let mut req: CompletionCreateParams = serde_json::from_value(json!({ "model": "model", "prompt": "Hi", "max_tokens": 2 }))?; + // set very high priority for this request, so that it returns quickly + req.params.priority = Some(10.0); let resp = completions::route_completions(headers, State(app_state), Json(req)).await?; let status = resp.status(); let body = axum::body::to_bytes(resp.into_body(), 1024 * 1024).await?; diff --git a/llgtrt/src/routes/openai.rs b/llgtrt/src/routes/openai.rs index 1a67913..f2630b2 100644 --- a/llgtrt/src/routes/openai.rs +++ b/llgtrt/src/routes/openai.rs @@ -173,6 +173,7 @@ pub struct CommonCreateParams { pub logprobs: Option, /// Defaults to 0.5. We don't allow it in JSON requests, but can be set internally. + /// Setting it to a higher value, like 1.0 or 10.0, will make the request complete faster. 
#[serde(skip)] pub priority: Option, } diff --git a/llguidance b/llguidance index cfef3df..0ca091a 160000 --- a/llguidance +++ b/llguidance @@ -1 +1 @@ -Subproject commit cfef3df97372a7b84d74976ff41cc9cb78bca6cc +Subproject commit 0ca091a701a50134e0503fa03c5c12b206e182a3 diff --git a/scripts/req.py b/scripts/req.py old mode 100644 new mode 100755 index 9d8bd15..046a2f5 --- a/scripts/req.py +++ b/scripts/req.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import requests import os import threading @@ -247,7 +249,7 @@ def one_round(): def main(): - random.seed(0) + # random.seed(0) parser = argparse.ArgumentParser() parser.add_argument("--max_threads", type=int, default=0) parser.add_argument("--sessions", type=int, default=0) @@ -259,6 +261,7 @@ def main(): LLG = True NUM_THREADS = args.sessions PROMPT_SIZE = 2600 + PROMPT_SIZE = 40_000 NUM_REPS = 1 NUM_JOKES = 100 MAX_TOKENS = 4000 diff --git a/scripts/test-infer.sh b/scripts/test-infer.sh index a86b81b..2e10b59 100755 --- a/scripts/test-infer.sh +++ b/scripts/test-infer.sh @@ -91,6 +91,10 @@ curl -X POST "${TRT_API_BASE}chat/completions" \ curl -v "${TRT_API_BASE}health/live" ;; + health) + curl -v "${TRT_API_BASE}health/model" + ;; + ready) curl -v "${TRT_API_BASE}health/ready" ;;