[perf] Optimize HTTP inference client and Uvicorn server configuration
charlieyl committed Feb 12, 2025
1 parent 3e1ae10 commit d6d67e8
Showing 2 changed files with 19 additions and 12 deletions.
Changed file 1 of 2:
@@ -19,7 +19,11 @@ class FedMLHttpInference:
     @classmethod
     async def get_http_client(cls):
         if cls._http_client is None:
-            limits = httpx.Limits(max_keepalive_connections=50, max_connections=1000)
+            limits = httpx.Limits(
+                max_keepalive_connections=100,
+                max_connections=1000,
+                keepalive_expiry=60
+            )
             cls._http_client = httpx.AsyncClient(limits=limits)
         return cls._http_client

@@ -39,8 +43,9 @@ async def is_inference_ready(inference_url, path="ready", timeout=None):
 
         # TODO (Raphael): Support more methods and return codes rules.
         try:
-            async with httpx.AsyncClient() as client:
-                ready_response = await client.get(url=ready_url, timeout=timeout)
+            # async with httpx.AsyncClient() as client:
+            client = await FedMLHttpInference.get_http_client()
+            ready_response = await client.get(url=ready_url, timeout=timeout)
 
         if isinstance(ready_response, (Response, StreamingResponse)):
             error_code = ready_response.status_code
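The readiness probe previously opened and closed a fresh AsyncClient on every call, tearing the connection down each time; with the shared client, repeated probes against the same worker can reuse an existing keep-alive connection. A hedged sketch of probing several workers concurrently over one pooled client (the helper names and URLs below are illustrative, not taken from this code):

import asyncio
import httpx

async def probe(client: httpx.AsyncClient, ready_url: str) -> bool:
    # mirrors is_inference_ready: a single GET against the worker's readiness path
    try:
        response = await client.get(ready_url, timeout=2)
        return response.status_code == 200
    except httpx.HTTPError:
        return False

async def probe_all(ready_urls):
    limits = httpx.Limits(max_keepalive_connections=100, keepalive_expiry=60)
    async with httpx.AsyncClient(limits=limits) as client:
        # all probes share one pool, so back-to-back checks hit warm connections
        return await asyncio.gather(*(probe(client, url) for url in ready_urls))

# Example: asyncio.run(probe_all(["http://10.0.0.5:2345/ready", "http://10.0.0.6:2345/ready"]))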
@@ -99,12 +104,13 @@ async def run_http_inference_with_curl_request(
 
 
 async def stream_generator(inference_url, input_json, method="POST"):
-    async with httpx.AsyncClient() as client:
-        async with client.stream(method, inference_url, json=input_json,
-                                 timeout=ClientConstants.WORKER_STREAM_API_TIMEOUT) as response:
-            async for chunk in response.aiter_lines():
-                # we consumed a newline, need to put it back
-                yield f"{chunk}\n"
+    # async with httpx.AsyncClient() as client:
+    client = await FedMLHttpInference.get_http_client()
+    async with client.stream(method, inference_url, json=input_json,
+                             timeout=ClientConstants.WORKER_STREAM_API_TIMEOUT) as response:
+        async for chunk in response.aiter_lines():
+            # we consumed a newline, need to put it back
+            yield f"{chunk}\n"
 
 
 async def redirect_non_stream_req_to_worker(inference_type, inference_url, model_api_headers, model_inference_json,
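stream_generator now borrows the same shared client; client.stream(...) still closes each individual response when the block exits, but the underlying connection is returned to the pool rather than discarded. A hedged sketch of how such a generator is typically consumed from a FastAPI route (the endpoint path and worker URL are placeholders, not values from this repository):

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/predict/stream")
async def predict_stream(payload: dict):
    # assumes stream_generator from the module above is importable in this scope;
    # chunks are forwarded to the caller as the worker produces them
    return StreamingResponse(
        stream_generator("http://127.0.0.1:2345/predict", payload, method="POST"),
        media_type="text/plain"
    )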
@@ -115,6 +121,7 @@ async def redirect_non_stream_req_to_worker(inference_type, inference_url, model
     logging.info(f"[Request-{request_id}] Starting HTTP request to {inference_url}")
 
     try:
+        # async with httpx.AsyncClient() as client:
         client = await FedMLHttpInference.get_http_client()
         response = await client.request(
             method=method, url=inference_url, headers=model_api_headers, json=model_inference_json, timeout=timeout
Changed file 2 of 2:
@@ -451,7 +451,7 @@ def start_device_inference_gateway():
         if inference_gateway_pids is None or len(inference_gateway_pids) <= 0:
             cur_dir = os.path.dirname(__file__)
             fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir)))
-            workers = 2
+            workers = 4
             logging.info(f"start the model inference gateway workers[{workers}] no uvloop/httptools...")
             inference_gateway_process = ServerConstants.exec_console_with_script(
                 f"{python_program} -m uvicorn {inference_gw_cmd} "
@@ -460,9 +460,9 @@
                 f"--workers {workers} "
                 # f"--loop uvloop "
                 # f"--http httptools "
-                f"--limit-concurrency 1000 "
+                f"--limit-concurrency 1024 "
                 f"--backlog 2048 "
-                f"--timeout-keep-alive 75 "
+                f"--timeout-keep-alive 60 "
                 f"--log-level warning ",
                 should_capture_stdout=False,
                 should_capture_stderr=False
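For reference, the flags assembled by the f-strings above amount to roughly the following programmatic Uvicorn configuration; the app import string, host, and port are placeholders rather than the real value of {inference_gw_cmd}:

import uvicorn

uvicorn.run(
    "gateway_app:api",          # placeholder for the real {inference_gw_cmd}
    host="0.0.0.0",             # placeholder
    port=2345,                  # placeholder
    workers=4,                  # raised from 2
    limit_concurrency=1024,     # raised from 1000
    backlog=2048,
    timeout_keep_alive=60,      # lowered from 75
    log_level="warning"
)

The 60-second --timeout-keep-alive also matches the client pool's keepalive_expiry of 60 seconds in the first file, keeping the two idle-connection windows aligned.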
