Commit

[projects] Update
sasha0552 authored Jul 15, 2024
1 parent 807cb4f commit ca9cae5
Showing 4 changed files with 37 additions and 21 deletions.
.ci/options.ini (4 changes: 2 additions & 2 deletions)
@@ -5,8 +5,8 @@ llamacpp = https://github.com/ggerganov/llama.cpp

[revisions]
automatic = 2ec6e9eec481223b8abe50cb5165d5fd9be2d086
-comfyui = 29c2e26724d4982a3e33114eb9064f1a11f4f4ed
-llamacpp = b3376
+comfyui = 7914c47d5afb1c5ffab49d665f9a1ac86a458821
+llamacpp = b3400

[files]

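The [revisions] section pins each project to an exact upstream revision (this change moves ComfyUI to 7914c47d5afb1c5ffab49d665f9a1ac86a458821 and llama.cpp to b3400), so the CI builds against known-good code instead of branch heads. As a rough sketch only, since the scripts that consume this file are not part of the commit and the section holding the repository URLs is assumed here to be named [urls], a pinned checkout driven by options.ini could look like this:

    # Hypothetical consumer of .ci/options.ini: clone each project and check out
    # its pinned revision. Everything except the [revisions] section is an assumption.
    import configparser
    import subprocess

    config = configparser.ConfigParser()
    config.read(".ci/options.ini")

    for name, url in config["urls"].items():      # assumed section mapping name -> repo URL
        revision = config["revisions"][name]      # e.g. llamacpp -> b3400
        subprocess.run(["git", "clone", url, name], check=True)
        subprocess.run(["git", "-C", name, "checkout", revision], check=True)

Keeping all pins in one INI file makes a bump like this commit a one-line change per project.
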
@@ -8,7 +8,7 @@


debug_install = installer.log.debug if os.environ.get('SD_INSTALL_DEBUG', None) is not None else lambda *args, **kwargs: None
-@@ -260,4 +261,5 @@ def main():
+@@ -262,4 +263,5 @@ def main():


if __name__ == "__main__":
@@ -24,7 +24,7 @@


class State:
-@@ -81,6 +82,8 @@ class State:
+@@ -83,6 +84,8 @@ class State:
self.time_start = time.time()
if self.debug_output:
log.debug(f'State begin: {self.job}')
@@ -33,7 +33,7 @@
modules.devices.torch_gc()

def end(self, api=None):
-@@ -90,6 +93,8 @@ class State:
+@@ -92,6 +95,8 @@ class State:
self.time_start = time.time()
if self.debug_output:
log.debug(f'State end: {self.job} time={time.time() - self.time_start:.2f}')
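
The hunks above only renumber headers inside a stored patch file; judging from its context lines (installer.log.debug, class State, modules.devices.torch_gc()), the patch targets SD.Next's installer and shared state module, although the file name itself is not visible here. Renumbering keeps a stored patch applying cleanly once the lines it targets shift in the upstream tree. One way to verify that the stored patches still apply against a freshly pinned checkout, sketched below rather than taken from this repository's CI, is git apply --check:

    # Sketch: check whether a stored patch still applies to a pinned checkout.
    # The paths and CLI wrapper are assumptions; only "git apply --check" is standard git.
    import subprocess
    import sys

    def patch_applies(repo_dir: str, patch_file: str) -> bool:
        """Return True if patch_file applies cleanly inside repo_dir."""
        result = subprocess.run(
            ["git", "-C", repo_dir, "apply", "--check", patch_file],
            capture_output=True,
        )
        return result.returncode == 0

    if __name__ == "__main__":
        repo, patch = sys.argv[1], sys.argv[2]
        if patch_applies(repo, patch):
            print(f"{patch}: applies cleanly")
        else:
            print(f"{patch}: needs rebasing (for example, renumbered hunk headers)")
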
@@ -1,22 +1,22 @@
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
-@@ -1103,6 +1103,7 @@ struct server_context {
+@@ -1106,6 +1106,7 @@ struct server_context {
{"id_task", slot.id_task},
});

+ system("nvidia-pstate -s -ps 16");
return true;
}

-@@ -1910,6 +1911,7 @@ struct server_context {
+@@ -1913,6 +1914,7 @@ struct server_context {
kv_cache_clear();
}

+ system("nvidia-pstate -s -ps 8");
return;
}
}
-@@ -2466,6 +2468,7 @@ inline void signal_handler(int signal) {
+@@ -2485,6 +2487,7 @@ inline void signal_handler(int signal) {
}

int main(int argc, char ** argv) {
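
The llama.cpp server patch shells out to the nvidia-pstate command-line tool: nvidia-pstate -s -ps 16 when a slot picks up a task, and nvidia-pstate -s -ps 8 once the work queue drains and the KV cache has been cleared, so the GPU can sit in a low-power performance state between requests. The sketch below expresses the same idea as a per-request wrapper in Python; the helper names are illustrative, and the -ps values are simply the ones the patch uses (16 around active work, 8 while idle):

    # Illustrative wrapper around the same CLI calls the server.cpp patch injects.
    # run_inference is a hypothetical callable standing in for the real request handling.
    import subprocess

    def set_pstate(ps: int) -> None:
        # Same invocation as the patch: nvidia-pstate -s -ps <value>
        subprocess.run(["nvidia-pstate", "-s", "-ps", str(ps)], check=False)

    def serve_request(run_inference):
        set_pstate(16)             # value the patch uses when a task starts
        try:
            return run_inference()
        finally:
            set_pstate(8)          # value the patch uses once the server goes idle

The patch itself is not per-request: it raises the state when a slot starts a task and only drops it in the server's idle path, which avoids toggling the pstate on every call while the server is busy.
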
@@ -20,38 +20,54 @@
self.num_generation_tokens.append(stats.num_generation_tokens_iter)
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
-@@ -12,6 +12,7 @@ from fastapi import Request
+@@ -12,6 +12,7 @@ from fastapi import APIRouter, Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
+from nvidia_pstate import set_pstate_high, set_pstate_low
from prometheus_client import make_asgi_app
from starlette.routing import Mount

-@@ -129,6 +130,7 @@ async def show_version():
- @app.post("/v1/chat/completions")
+@@ -121,6 +122,7 @@ async def show_version():
+ @router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
+ set_pstate_high()
generator = await openai_serving_chat.create_chat_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
-@@ -144,6 +146,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
+@@ -136,6 +138,7 @@ async def create_chat_completion(request: ChatCompletionRequest,

- @app.post("/v1/completions")
+ @router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
+ set_pstate_high()
generator = await openai_serving_completion.create_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
-@@ -168,6 +171,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
+@@ -273,6 +276,7 @@ def run_server(args, llm_engine=None):


if __name__ == "__main__":
+ set_pstate_low()
args = parse_args()
# NOTE(simon):
# This section should be in sync with vllm/scripts.py for CLI entrypoints.
parser = FlexibleArgumentParser(
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -5,6 +5,7 @@ import signal
import sys
from typing import Optional

- app.add_middleware(
+from nvidia_pstate import set_pstate_high, set_pstate_low
from openai import OpenAI

from vllm.entrypoints.openai.api_server import run_server
@@ -151,4 +152,5 @@ def main():


if __name__ == "__main__":
+ set_pstate_low()
main()
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -10,6 +10,7 @@ import numpy as np
@@ -62,15 +78,15 @@

try:
from flashinfer import BatchDecodeWithPagedKVCacheWrapper
-@@ -903,6 +904,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
+@@ -1003,6 +1004,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
Since it is used for decoding-only, it assumes there's only 1 token
per sequence in the batch.
"""
+ set_pstate_high()
assert not self.model_config.enforce_eager
logger.info("Capturing the model for CUDA graphs. This may lead to "
"unexpected consequences if the model is not static. To "
-@@ -1098,6 +1100,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
+@@ -1206,6 +1208,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
elapsed_time = end_time - start_time
# This usually takes < 10 seconds.
logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
@@ -88,15 +104,15 @@

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
-@@ -164,6 +165,7 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -170,6 +171,7 @@ class Worker(LocalOrDistributedWorkerBase):
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
+ set_pstate_high()
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
torch.cuda.empty_cache()
-@@ -179,6 +181,8 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -185,6 +187,8 @@ class Worker(LocalOrDistributedWorkerBase):
# NOTE(woosuk): Here we assume that the other processes using the same
# GPU did not change their memory usage during the profiling.
peak_memory = self.init_gpu_memory - free_gpu_memory
@@ -105,7 +121,7 @@
assert peak_memory > 0, (
"Error in memory profiling. This happens when the GPU memory was "
"not properly cleaned up before initializing the vLLM instance.")
-@@ -195,6 +199,7 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -201,6 +205,7 @@ class Worker(LocalOrDistributedWorkerBase):
self.model_runner.remove_all_loras()
gc.collect()
torch.cuda.empty_cache()
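
Across the vLLM patch, set_pstate_high() is inserted at the start of the chat and completions endpoints and before expensive startup work (memory profiling in the worker, CUDA-graph capture in the model runner), while set_pstate_low() runs in the __main__ blocks of api_server.py and of the new vllm/scripts.py entrypoint so the process starts out in a low-power state. The remaining hunks rebase the stored patch onto newer vLLM code: @app.post becomes @router.post, the __main__ block now goes through FlexibleArgumentParser and run_server, and the hunk offsets shift accordingly. Below is a condensed sketch of the endpoint side of this pattern, not the vLLM code itself; the route body is a placeholder and the uvicorn wiring is an assumption:

    # Condensed sketch of the pattern the vLLM patch applies, on a bare FastAPI app.
    from fastapi import APIRouter, FastAPI, Request
    from nvidia_pstate import set_pstate_high, set_pstate_low  # same import as the patch

    router = APIRouter()

    @router.post("/v1/completions")
    async def create_completion(raw_request: Request):
        set_pstate_high()          # raise clocks before generation starts
        # ... hand the request off to the inference engine here (placeholder) ...
        return {"object": "text_completion"}

    app = FastAPI()
    app.include_router(router)

    if __name__ == "__main__":
        set_pstate_low()           # start idle in a low-power state, as the patch does
        import uvicorn
        uvicorn.run(app, host="0.0.0.0", port=8000)

On the worker side the patch places set_pstate_high() directly before the memory-profiling and graph-capture steps instead of using a wrapper, since those run once at startup.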
