Commit

[projects] Update
sasha0552 authored Jul 15, 2024
1 parent 807cb4f commit ca9cae5
Showing 4 changed files with 37 additions and 21 deletions.
.ci/options.ini (4 changes: 2 additions & 2 deletions)
@@ -5,8 +5,8 @@ llamacpp = https://github.com/ggerganov/llama.cpp

[revisions]
automatic = 2ec6e9eec481223b8abe50cb5165d5fd9be2d086
-comfyui = 29c2e26724d4982a3e33114eb9064f1a11f4f4ed
-llamacpp = b3376
+comfyui = 7914c47d5afb1c5ffab49d665f9a1ac86a458821
+llamacpp = b3400

[files]

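The [revisions] section pins each project to an exact upstream revision (this change moves ComfyUI to 7914c47d5afb1c5ffab49d665f9a1ac86a458821 and llama.cpp to b3400), so the CI builds against known-good code instead of branch heads. As a rough sketch only, since the scripts that consume this file are not part of the commit and the section holding the repository URLs is assumed here to be named [urls], a pinned checkout driven by options.ini could look like this:

    # Hypothetical consumer of .ci/options.ini: clone each project and check out
    # its pinned revision. Everything except the [revisions] section is an assumption.
    import configparser
    import subprocess

    config = configparser.ConfigParser()
    config.read(".ci/options.ini")

    for name, url in config["urls"].items():      # assumed section mapping name -> repo URL
        revision = config["revisions"][name]      # e.g. llamacpp -> b3400
        subprocess.run(["git", "clone", url, name], check=True)
        subprocess.run(["git", "-C", name, "checkout", revision], check=True)

Keeping all pins in one INI file makes a bump like this commit a one-line change per project.
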
@@ -8,7 +8,7 @@


debug_install = installer.log.debug if os.environ.get('SD_INSTALL_DEBUG', None) is not None else lambda *args, **kwargs: None
-@@ -260,4 +261,5 @@ def main():
+@@ -262,4 +263,5 @@ def main():


if __name__ == "__main__":
@@ -24,7 +24,7 @@


class State:
-@@ -81,6 +82,8 @@ class State:
+@@ -83,6 +84,8 @@ class State:
self.time_start = time.time()
if self.debug_output:
log.debug(f'State begin: {self.job}')
@@ -33,7 +33,7 @@
modules.devices.torch_gc()

def end(self, api=None):
-@@ -90,6 +93,8 @@ class State:
+@@ -92,6 +95,8 @@ class State:
self.time_start = time.time()
if self.debug_output:
log.debug(f'State end: {self.job} time={time.time() - self.time_start:.2f}')
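
The hunks above only renumber headers inside a stored patch file; judging from its context lines (installer.log.debug, class State, modules.devices.torch_gc()), the patch targets SD.Next's installer and shared state module, although the file name itself is not visible here. Renumbering keeps a stored patch applying cleanly once the lines it targets shift in the upstream tree. One way to verify that the stored patches still apply against a freshly pinned checkout, sketched below rather than taken from this repository's CI, is git apply --check:

    # Sketch: check whether a stored patch still applies to a pinned checkout.
    # The paths and CLI wrapper are assumptions; only "git apply --check" is standard git.
    import subprocess
    import sys

    def patch_applies(repo_dir: str, patch_file: str) -> bool:
        """Return True if patch_file applies cleanly inside repo_dir."""
        result = subprocess.run(
            ["git", "-C", repo_dir, "apply", "--check", patch_file],
            capture_output=True,
        )
        return result.returncode == 0

    if __name__ == "__main__":
        repo, patch = sys.argv[1], sys.argv[2]
        if patch_applies(repo, patch):
            print(f"{patch}: applies cleanly")
        else:
            print(f"{patch}: needs rebasing (for example, renumbered hunk headers)")
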
@@ -1,22 +1,22 @@
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
-@@ -1103,6 +1103,7 @@ struct server_context {
+@@ -1106,6 +1106,7 @@ struct server_context {
{"id_task", slot.id_task},
});

+ system("nvidia-pstate -s -ps 16");
return true;
}

-@@ -1910,6 +1911,7 @@ struct server_context {
+@@ -1913,6 +1914,7 @@ struct server_context {
kv_cache_clear();
}

+ system("nvidia-pstate -s -ps 8");
return;
}
}
-@@ -2466,6 +2468,7 @@ inline void signal_handler(int signal) {
+@@ -2485,6 +2487,7 @@ inline void signal_handler(int signal) {
}

int main(int argc, char ** argv) {
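
The llama.cpp server patch shells out to the nvidia-pstate command-line tool: nvidia-pstate -s -ps 16 when a slot picks up a task, and nvidia-pstate -s -ps 8 once the work queue drains and the KV cache has been cleared, so the GPU can sit in a low-power performance state between requests. The sketch below expresses the same idea as a per-request wrapper in Python; the helper names are illustrative, and the -ps values are simply the ones the patch uses (16 around active work, 8 while idle):

    # Illustrative wrapper around the same CLI calls the server.cpp patch injects.
    # run_inference is a hypothetical callable standing in for the real request handling.
    import subprocess

    def set_pstate(ps: int) -> None:
        # Same invocation as the patch: nvidia-pstate -s -ps <value>
        subprocess.run(["nvidia-pstate", "-s", "-ps", str(ps)], check=False)

    def serve_request(run_inference):
        set_pstate(16)             # value the patch uses when a task starts
        try:
            return run_inference()
        finally:
            set_pstate(8)          # value the patch uses once the server goes idle

The patch itself is not per-request: it raises the state when a slot starts a task and only drops it in the server's idle path, which avoids toggling the pstate on every call while the server is busy.
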
@@ -20,38 +20,54 @@
self.num_generation_tokens.append(stats.num_generation_tokens_iter)
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
-@@ -12,6 +12,7 @@ from fastapi import Request
+@@ -12,6 +12,7 @@ from fastapi import APIRouter, Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
+from nvidia_pstate import set_pstate_high, set_pstate_low
from prometheus_client import make_asgi_app
from starlette.routing import Mount

-@@ -129,6 +130,7 @@ async def show_version():
- @app.post("/v1/chat/completions")
+@@ -121,6 +122,7 @@ async def show_version():
+ @router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
+ set_pstate_high()
generator = await openai_serving_chat.create_chat_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
-@@ -144,6 +146,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
+@@ -136,6 +138,7 @@ async def create_chat_completion(request: ChatCompletionRequest,

- @app.post("/v1/completions")
+ @router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
+ set_pstate_high()
generator = await openai_serving_completion.create_completion(
request, raw_request)
if isinstance(generator, ErrorResponse):
-@@ -168,6 +171,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
+@@ -273,6 +276,7 @@ def run_server(args, llm_engine=None):


if __name__ == "__main__":
+ set_pstate_low()
args = parse_args()
# NOTE(simon):
# This section should be in sync with vllm/scripts.py for CLI entrypoints.
parser = FlexibleArgumentParser(
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -5,6 +5,7 @@ import signal
import sys
from typing import Optional

- app.add_middleware(
+from nvidia_pstate import set_pstate_high, set_pstate_low
from openai import OpenAI

from vllm.entrypoints.openai.api_server import run_server
@@ -151,4 +152,5 @@ def main():


if __name__ == "__main__":
+ set_pstate_low()
main()
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -10,6 +10,7 @@ import numpy as np
@@ -62,15 +78,15 @@

try:
from flashinfer import BatchDecodeWithPagedKVCacheWrapper
-@@ -903,6 +904,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
+@@ -1003,6 +1004,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
Since it is used for decoding-only, it assumes there's only 1 token
per sequence in the batch.
"""
+ set_pstate_high()
assert not self.model_config.enforce_eager
logger.info("Capturing the model for CUDA graphs. This may lead to "
"unexpected consequences if the model is not static. To "
-@@ -1098,6 +1100,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
+@@ -1206,6 +1208,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
elapsed_time = end_time - start_time
# This usually takes < 10 seconds.
logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
@@ -88,15 +104,15 @@

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
-@@ -164,6 +165,7 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -170,6 +171,7 @@ class Worker(LocalOrDistributedWorkerBase):
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
+ set_pstate_high()
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
torch.cuda.empty_cache()
-@@ -179,6 +181,8 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -185,6 +187,8 @@ class Worker(LocalOrDistributedWorkerBase):
# NOTE(woosuk): Here we assume that the other processes using the same
# GPU did not change their memory usage during the profiling.
peak_memory = self.init_gpu_memory - free_gpu_memory
@@ -105,7 +121,7 @@
assert peak_memory > 0, (
"Error in memory profiling. This happens when the GPU memory was "
"not properly cleaned up before initializing the vLLM instance.")
-@@ -195,6 +199,7 @@ class Worker(LocalOrDistributedWorkerBase):
+@@ -201,6 +205,7 @@ class Worker(LocalOrDistributedWorkerBase):
self.model_runner.remove_all_loras()
gc.collect()
torch.cuda.empty_cache()
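
Across the vLLM patch, set_pstate_high() is inserted at the start of the chat and completions endpoints and before expensive startup work (memory profiling in the worker, CUDA-graph capture in the model runner), while set_pstate_low() runs in the __main__ blocks of api_server.py and of the new vllm/scripts.py entrypoint so the process starts out in a low-power state. The remaining hunks rebase the stored patch onto newer vLLM code: @app.post becomes @router.post, the __main__ block now goes through FlexibleArgumentParser and run_server, and the hunk offsets shift accordingly. Below is a condensed sketch of the endpoint side of this pattern, not the vLLM code itself; the route body is a placeholder and the uvicorn wiring is an assumption:

    # Condensed sketch of the pattern the vLLM patch applies, on a bare FastAPI app.
    from fastapi import APIRouter, FastAPI, Request
    from nvidia_pstate import set_pstate_high, set_pstate_low  # same import as the patch

    router = APIRouter()

    @router.post("/v1/completions")
    async def create_completion(raw_request: Request):
        set_pstate_high()          # raise clocks before generation starts
        # ... hand the request off to the inference engine here (placeholder) ...
        return {"object": "text_completion"}

    app = FastAPI()
    app.include_router(router)

    if __name__ == "__main__":
        set_pstate_low()           # start idle in a low-power state, as the patch does
        import uvicorn
        uvicorn.run(app, host="0.0.0.0", port=8000)

On the worker side the patch places set_pstate_high() directly before the memory-profiling and graph-capture steps instead of using a wrapper, since those run once at startup.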
