[perf] Optimize HTTP inference client and Uvicorn server configuration
charlieyl committed Feb 12, 2025
1 parent 3e1ae10 commit d6d67e8
Showing 2 changed files with 19 additions and 12 deletions.
Changed file 1 of 2:
@@ -19,7 +19,11 @@ class FedMLHttpInference:
     @classmethod
     async def get_http_client(cls):
         if cls._http_client is None:
-            limits = httpx.Limits(max_keepalive_connections=50, max_connections=1000)
+            limits = httpx.Limits(
+                max_keepalive_connections=100,
+                max_connections=1000,
+                keepalive_expiry=60
+            )
             cls._http_client = httpx.AsyncClient(limits=limits)
         return cls._http_client

@@ -39,8 +43,9 @@ async def is_inference_ready(inference_url, path="ready", timeout=None):
 
         # TODO (Raphael): Support more methods and return codes rules.
         try:
-            async with httpx.AsyncClient() as client:
-                ready_response = await client.get(url=ready_url, timeout=timeout)
+            # async with httpx.AsyncClient() as client:
+            client = await FedMLHttpInference.get_http_client()
+            ready_response = await client.get(url=ready_url, timeout=timeout)
 
         if isinstance(ready_response, (Response, StreamingResponse)):
             error_code = ready_response.status_code
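The readiness probe previously opened and closed a fresh AsyncClient on every call, tearing the connection down each time; with the shared client, repeated probes against the same worker can reuse an existing keep-alive connection. A hedged sketch of probing several workers concurrently over one pooled client (the helper names and URLs below are illustrative, not taken from this code):

import asyncio
import httpx

async def probe(client: httpx.AsyncClient, ready_url: str) -> bool:
    # mirrors is_inference_ready: a single GET against the worker's readiness path
    try:
        response = await client.get(ready_url, timeout=2)
        return response.status_code == 200
    except httpx.HTTPError:
        return False

async def probe_all(ready_urls):
    limits = httpx.Limits(max_keepalive_connections=100, keepalive_expiry=60)
    async with httpx.AsyncClient(limits=limits) as client:
        # all probes share one pool, so back-to-back checks hit warm connections
        return await asyncio.gather(*(probe(client, url) for url in ready_urls))

# Example: asyncio.run(probe_all(["http://10.0.0.5:2345/ready", "http://10.0.0.6:2345/ready"]))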
@@ -99,12 +104,13 @@ async def run_http_inference_with_curl_request(
 
 
 async def stream_generator(inference_url, input_json, method="POST"):
-    async with httpx.AsyncClient() as client:
-        async with client.stream(method, inference_url, json=input_json,
-                                 timeout=ClientConstants.WORKER_STREAM_API_TIMEOUT) as response:
-            async for chunk in response.aiter_lines():
-                # we consumed a newline, need to put it back
-                yield f"{chunk}\n"
+    # async with httpx.AsyncClient() as client:
+    client = await FedMLHttpInference.get_http_client()
+    async with client.stream(method, inference_url, json=input_json,
+                             timeout=ClientConstants.WORKER_STREAM_API_TIMEOUT) as response:
+        async for chunk in response.aiter_lines():
+            # we consumed a newline, need to put it back
+            yield f"{chunk}\n"
 
 
 async def redirect_non_stream_req_to_worker(inference_type, inference_url, model_api_headers, model_inference_json,
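stream_generator now borrows the same shared client; client.stream(...) still closes each individual response when the block exits, but the underlying connection is returned to the pool rather than discarded. A hedged sketch of how such a generator is typically consumed from a FastAPI route (the endpoint path and worker URL are placeholders, not values from this repository):

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.post("/predict/stream")
async def predict_stream(payload: dict):
    # assumes stream_generator from the module above is importable in this scope;
    # chunks are forwarded to the caller as the worker produces them
    return StreamingResponse(
        stream_generator("http://127.0.0.1:2345/predict", payload, method="POST"),
        media_type="text/plain"
    )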
@@ -115,6 +121,7 @@ async def redirect_non_stream_req_to_worker(inference_type, inference_url, model
     logging.info(f"[Request-{request_id}] Starting HTTP request to {inference_url}")
 
     try:
+        # async with httpx.AsyncClient() as client:
         client = await FedMLHttpInference.get_http_client()
         response = await client.request(
             method=method, url=inference_url, headers=model_api_headers, json=model_inference_json, timeout=timeout
Changed file 2 of 2:
@@ -451,7 +451,7 @@ def start_device_inference_gateway():
         if inference_gateway_pids is None or len(inference_gateway_pids) <= 0:
             cur_dir = os.path.dirname(__file__)
             fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir)))
-            workers = 2
+            workers = 4
             logging.info(f"start the model inference gateway workers[{workers}] no uvloop/httptools...")
             inference_gateway_process = ServerConstants.exec_console_with_script(
                 f"{python_program} -m uvicorn {inference_gw_cmd} "
@@ -460,9 +460,9 @@
                 f"--workers {workers} "
                 # f"--loop uvloop "
                 # f"--http httptools "
-                f"--limit-concurrency 1000 "
+                f"--limit-concurrency 1024 "
                 f"--backlog 2048 "
-                f"--timeout-keep-alive 75 "
+                f"--timeout-keep-alive 60 "
                 f"--log-level warning ",
                 should_capture_stdout=False,
                 should_capture_stderr=False
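For reference, the flags assembled by the f-strings above amount to roughly the following programmatic Uvicorn configuration; the app import string, host, and port are placeholders rather than the real value of {inference_gw_cmd}:

import uvicorn

uvicorn.run(
    "gateway_app:api",          # placeholder for the real {inference_gw_cmd}
    host="0.0.0.0",             # placeholder
    port=2345,                  # placeholder
    workers=4,                  # raised from 2
    limit_concurrency=1024,     # raised from 1000
    backlog=2048,
    timeout_keep_alive=60,      # lowered from 75
    log_level="warning"
)

The 60-second --timeout-keep-alive also matches the client pool's keepalive_expiry of 60 seconds in the first file, keeping the two idle-connection windows aligned.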
