diff --git a/.ci/options.ini b/.ci/options.ini
index e1ae8b9..76ec00f 100644
--- a/.ci/options.ini
+++ b/.ci/options.ini
@@ -5,8 +5,8 @@
 llamacpp = https://github.com/ggerganov/llama.cpp
 
 [revisions]
 automatic = 2ec6e9eec481223b8abe50cb5165d5fd9be2d086
-comfyui = 29c2e26724d4982a3e33114eb9064f1a11f4f4ed
-llamacpp = b3376
+comfyui = 7914c47d5afb1c5ffab49d665f9a1ac86a458821
+llamacpp = b3400
 
 [files]
diff --git a/airootfs/home/tori/.local/share/tori/patches/0000-automatic-drop-pstate-in-idle.patch b/airootfs/home/tori/.local/share/tori/patches/0000-automatic-drop-pstate-in-idle.patch
index 59138b1..170a4c7 100644
--- a/airootfs/home/tori/.local/share/tori/patches/0000-automatic-drop-pstate-in-idle.patch
+++ b/airootfs/home/tori/.local/share/tori/patches/0000-automatic-drop-pstate-in-idle.patch
@@ -8,7 +8,7 @@
  debug_install = installer.log.debug if os.environ.get('SD_INSTALL_DEBUG', None) is not None else lambda *args, **kwargs: None
 
-@@ -260,4 +261,5 @@ def main():
+@@ -262,4 +263,5 @@ def main():
 
 
  if __name__ == "__main__":
@@ -24,7 +24,7 @@
 
 class State:
 
-@@ -81,6 +82,8 @@ class State:
+@@ -83,6 +84,8 @@ class State:
          self.time_start = time.time()
          if self.debug_output:
              log.debug(f'State begin: {self.job}')
@@ -33,7 +33,7 @@
             modules.devices.torch_gc()
 
     def end(self, api=None):
-@@ -90,6 +93,8 @@ class State:
+@@ -92,6 +95,8 @@ class State:
          self.time_start = time.time()
          if self.debug_output:
              log.debug(f'State end: {self.job} time={time.time() - self.time_start:.2f}')
diff --git a/airootfs/home/tori/.local/share/tori/patches/0000-llamacpp-server-drop-pstate-in-idle.patch b/airootfs/home/tori/.local/share/tori/patches/0000-llamacpp-server-drop-pstate-in-idle.patch
index 54e689c..ec9c947 100644
--- a/airootfs/home/tori/.local/share/tori/patches/0000-llamacpp-server-drop-pstate-in-idle.patch
+++ b/airootfs/home/tori/.local/share/tori/patches/0000-llamacpp-server-drop-pstate-in-idle.patch
@@ -1,6 +1,6 @@
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1103,6 +1103,7 @@ struct server_context {
+@@ -1106,6 +1106,7 @@ struct server_context {
              {"id_task", slot.id_task},
          });
 
@@ -8,7 +8,7 @@
          return true;
      }
 
-@@ -1910,6 +1911,7 @@ struct server_context {
+@@ -1913,6 +1914,7 @@ struct server_context {
          kv_cache_clear();
      }
 
@@ -16,7 +16,7 @@
          return;
      }
  }
-@@ -2466,6 +2468,7 @@ inline void signal_handler(int signal) {
+@@ -2485,6 +2487,7 @@ inline void signal_handler(int signal) {
  }
 
  int main(int argc, char ** argv) {
diff --git a/airootfs/home/tori/.local/share/tori/patches/0000-vllm-drop-pstate-in-idle.patch b/airootfs/home/tori/.local/share/tori/patches/0000-vllm-drop-pstate-in-idle.patch
index 379fc38..f5300d2 100644
--- a/airootfs/home/tori/.local/share/tori/patches/0000-vllm-drop-pstate-in-idle.patch
+++ b/airootfs/home/tori/.local/share/tori/patches/0000-vllm-drop-pstate-in-idle.patch
@@ -20,7 +20,7 @@
              self.num_generation_tokens.append(stats.num_generation_tokens_iter)
 --- a/vllm/entrypoints/openai/api_server.py
 +++ b/vllm/entrypoints/openai/api_server.py
-@@ -12,6 +12,7 @@ from fastapi import Request
+@@ -12,6 +12,7 @@ from fastapi import APIRouter, Request
  from fastapi.exceptions import RequestValidationError
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -28,30 +28,46 @@
  from prometheus_client import make_asgi_app
  from starlette.routing import Mount
 
-@@ -129,6 +130,7 @@ async def show_version():
- @app.post("/v1/chat/completions")
+@@ -121,6 +122,7 @@ async def show_version():
+ @router.post("/v1/chat/completions")
  async def create_chat_completion(request: ChatCompletionRequest,
                                   raw_request: Request):
 +    set_pstate_high()
      generator = await openai_serving_chat.create_chat_completion(
          request, raw_request)
      if isinstance(generator, ErrorResponse):
-@@ -144,6 +146,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
- @app.post("/v1/completions")
+@@ -136,6 +138,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
+ @router.post("/v1/completions")
  async def create_completion(request: CompletionRequest, raw_request: Request):
 +    set_pstate_high()
      generator = await openai_serving_completion.create_completion(
          request, raw_request)
      if isinstance(generator, ErrorResponse):
-@@ -168,6 +171,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
+@@ -273,6 +276,7 @@ def run_server(args, llm_engine=None):
 
 
  if __name__ == "__main__":
 +    set_pstate_low()
-     args = parse_args()
+     # NOTE(simon):
+     # This section should be in sync with vllm/scripts.py for CLI entrypoints.
+     parser = FlexibleArgumentParser(
+--- a/vllm/scripts.py
++++ b/vllm/scripts.py
+@@ -5,6 +5,7 @@ import signal
+ import sys
+ from typing import Optional
 
-     app.add_middleware(
++from nvidia_pstate import set_pstate_high, set_pstate_low
+ from openai import OpenAI
+ 
+ from vllm.entrypoints.openai.api_server import run_server
+@@ -151,4 +152,5 @@ def main():
+ 
+ 
+ if __name__ == "__main__":
++    set_pstate_low()
+     main()
 --- a/vllm/worker/model_runner.py
 +++ b/vllm/worker/model_runner.py
 @@ -10,6 +10,7 @@ import numpy as np
  import torch
  import torch.distributed
 +from nvidia_pstate import set_pstate_high, set_pstate_low
 
  try:
      from flashinfer import BatchDecodeWithPagedKVCacheWrapper
-@@ -903,6 +904,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
+@@ -1003,6 +1004,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
          Since it is used for decoding-only, it assumes there's only 1 token
          per sequence in the batch.
          """
 +        set_pstate_high()
          assert not self.model_config.enforce_eager
          logger.info("Capturing the model for CUDA graphs. This may lead to "
                      "unexpected consequences if the model is not static. To "
-@@ -1098,6 +1100,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
+@@ -1206,6 +1208,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
          elapsed_time = end_time - start_time
          # This usually takes < 10 seconds.
          logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
 +        set_pstate_low()
 --- a/vllm/worker/worker.py
 +++ b/vllm/worker/worker.py
 +from nvidia_pstate import set_pstate_high, set_pstate_low
  from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                           ModelConfig, MultiModalConfig, ParallelConfig,
-@@ -164,6 +165,7 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -170,6 +171,7 @@ class Worker(LocalOrDistributedWorkerBase):
          You may limit the usage of GPU memory by adjusting the
          `gpu_memory_utilization` parameter.
          """
 +        set_pstate_high()
          # Profile the memory usage of the model and get the maximum number of
          # cache blocks that can be allocated with the remaining free memory.
          torch.cuda.empty_cache()
-@@ -179,6 +181,8 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -185,6 +187,8 @@ class Worker(LocalOrDistributedWorkerBase):
          # NOTE(woosuk): Here we assume that the other processes using the same
          # GPU did not change their memory usage during the profiling.
          peak_memory = self.init_gpu_memory - free_gpu_memory
 +
 +        set_pstate_low()
          assert peak_memory > 0, (
              "Error in memory profiling. This happens when the GPU memory was "
              "not properly cleaned up before initializing the vLLM instance.")
-@@ -195,6 +199,7 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -201,6 +205,7 @@ class Worker(LocalOrDistributedWorkerBase):
              self.model_runner.remove_all_loras()
          gc.collect()
          torch.cuda.empty_cache()
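
All four patches follow the same idle/busy pattern around nvidia_pstate: drop the GPU P-state when the process starts, raise it right before real GPU work (request handling, memory profiling, CUDA graph capture), and drop it again once the work is done. Below is a minimal illustrative sketch of that pattern, not part of the diff; only set_pstate_high() and set_pstate_low() are taken from the patches above, and the gpu_burst() helper is a hypothetical name.

    # Sketch only: the idle-by-default pattern applied by the patches above.
    from contextlib import contextmanager

    from nvidia_pstate import set_pstate_high, set_pstate_low


    @contextmanager
    def gpu_burst():
        """Raise the P-state for the duration of one job, then drop it again."""
        set_pstate_high()
        try:
            yield
        finally:
            set_pstate_low()


    if __name__ == "__main__":
        set_pstate_low()   # idle at startup, as the patched entrypoints do
        with gpu_burst():
            pass           # inference / profiling / graph capture goes here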