diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py
deleted file mode 100644
index f04c2a98f..000000000
--- a/06_gpu_and_ml/llm-serving/vllm_gemma.py
+++ /dev/null
@@ -1,265 +0,0 @@
-# ---
-# tags: ["use-case-lm-inference"]
-# ---
-# # Fast inference with vLLM (Gemma 7B)
-#
-# In this example, we show how to run basic LLM inference, using [`vLLM`](https://github.com/vllm-project/vllm)
-# to take advantage of [PagedAttention](https://arxiv.org/abs/2309.06180), which speeds up inference on longer sequences with optimized key-value caching.
-# You can read more about PagedAttention [here](https://charlesfrye.github.io/programming/2023/11/10/llms-systems.html).
-#
-# We'll run the [Gemma 7B Instruct](https://huggingface.co/google/gemma-7b-it) large language model.
-# Gemma is the weights-available version of Google's Gemini model series.
-#
-# The "7B" in the name refers to the number of parameters (floating point numbers used to control inference)
-# in the model. Applying those 7,000,000,000 numbers onto an input is a lot of work,
-# so we'll use a GPU to speed up the process -- specifically, a top-of-the-line [NVIDIA H100](https://modal.com/blog/introducing-h100).
-#
-# "Instruct" means that this version of Gemma is not simply a statistical model of language,
-# but has been fine-tuned to follow instructions -- like ChatGPT or Claude,
-# it is a model of an assistant that can understand and follow instructions.
-#
-# You can expect cold starts in under 30 seconds and well over 1000 tokens/second throughput.
-# The larger the batch of prompts, the higher the throughput. For example, with the 64 prompts below,
-# we can produce nearly 15k tokens with a latency just over 5 seconds, for a throughput of >2.5k tokens/second.
-# That's a lot of text!
-#
-#
-# To run
-# [any of the other supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html),
-# just change the model name. You may also need to change engine configuration, like `trust_remote_code`,
-# or GPU configuration, in order to run some models.
-#
-# ## Setup
-#
-# First we import the components we need from `modal`.
-
-import os
-import time
-
-import modal
-
-MODEL_DIR = "/model"
-MODEL_NAME = "google/gemma-7b-it"
-
-
-# ## Define a container image
-#
-# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
-# is that the container no longer has to re-download the model from Hugging Face - instead, it will take
-# advantage of Modal's internal filesystem for faster cold starts.
-#
-# ### Download the weights
-# Make sure you have created a [HuggingFace access token](https://huggingface.co/settings/tokens).
-# To access the token in a Modal function, we can create a secret on the [secrets page](https://modal.com/secrets).
-# Now the token will be available via the environment variable named `HF_TOKEN`. Functions that inject this secret
-# will have access to the environment variable.
-#
-# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`.
-#
-# You may need to accept the license agreement from an account associated with that Hugging Face Token
-# to download the model.
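As an aside before the download helper below: the long `snapshot_download` step depends on the injected `HF_TOKEN`, so it can be worth confirming the secret is wired up before kicking off a twenty-minute image build. A minimal sketch, assuming the same `huggingface-secret` Secret used in this example; the app and function names here are illustrative only:

```python
import os

import modal

app = modal.App("hf-token-check")  # illustrative name, not part of the example


@app.function(secrets=[modal.Secret.from_name("huggingface-secret")])
def check_token():
    # Modal injects the secret's keys as environment variables in the container.
    token = os.environ.get("HF_TOKEN")
    assert token, "HF_TOKEN is missing -- check the secret's keys on modal.com/secrets"
    print("HF_TOKEN found, starts with:", token[:4] + "...")


@app.local_entrypoint()
def main():
    check_token.remote()
```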
-
-
-def download_model_to_image(model_dir, model_name):
-    from huggingface_hub import snapshot_download
-    from transformers.utils import move_cache
-
-    os.makedirs(model_dir, exist_ok=True)
-
-    snapshot_download(
-        model_name,
-        local_dir=model_dir,
-        token=os.environ["HF_TOKEN"],
-        ignore_patterns=["*.pt", "*.gguf"],  # Using safetensors
-    )
-    move_cache()
-
-
-# ### Image definition
-# We’ll start from Modal's basic Debian Slim image and install `vLLM`.
-# Then we’ll use `run_function` to execute `download_model_to_image`
-# and save the resulting files to the container image -- that way we don't need
-# to redownload the weights every time we change the server's code or start up more instances of the server.
-image = (
-    modal.Image.debian_slim(python_version="3.10")
-    .pip_install(
-        "vllm==0.4.0.post1",
-        "torch==2.1.2",
-        "transformers==4.39.3",
-        "ray==2.10.0",
-        "huggingface_hub==0.19.4",
-        "hf-transfer==0.1.4",
-    )
-    # Use the barebones hf-transfer package for maximum download speeds. Varies from 100MB/s to 1.5 GB/s,
-    # so download times can vary from under a minute to tens of minutes.
-    # If your download slows down or times out, try interrupting and restarting.
-    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
-    .run_function(
-        download_model_to_image,
-        secrets=[
-            modal.Secret.from_name(
-                "huggingface-secret", required_keys=["HF_TOKEN"]
-            )
-        ],
-        timeout=60 * 20,
-        kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME},
-    )
-)
-
-app = modal.App(f"example-vllm-{MODEL_NAME}", image=image)
-
-# Using `image.imports` allows us to have a reference to vLLM in global scope without getting an error when our script executes locally.
-with image.imports():
-    import vllm
-
-# ## Encapsulate the model in a class
-#
-# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator.
-# This enables us to load the model into memory just once every time a container starts up, and keep it cached
-# on the GPU for each subsequent invocation of the function.
-#
-# The `vLLM` library allows the code to remain quite clean!
-
-GPU_CONFIG = modal.gpu.H100(count=1)
-
-
-@app.cls(gpu=GPU_CONFIG, secrets=[modal.Secret.from_name("huggingface-secret")])
-class Model:
-    @modal.enter()
-    def load(self):
-        self.template = (
-            "<start_of_turn>user\n{user}<end_of_turn>\n<start_of_turn>model\n"
-        )
-
-        # Load the model. Tip: Some models, like MPT, may require `trust_remote_code=True`.
-        self.llm = vllm.LLM(
-            MODEL_DIR,
-            enforce_eager=True,  # skip graph capturing for faster cold starts
-            tensor_parallel_size=GPU_CONFIG.count,
-        )
-
-    @modal.method()
-    def generate(self, user_questions):
-        prompts = [self.template.format(user=q) for q in user_questions]
-
-        sampling_params = vllm.SamplingParams(
-            temperature=0.75,
-            top_p=0.99,
-            max_tokens=256,
-            presence_penalty=1.15,
-        )
-        start = time.monotonic_ns()
-        result = self.llm.generate(prompts, sampling_params)
-        duration_s = (time.monotonic_ns() - start) / 1e9
-        num_tokens = 0
-
-        COLOR = {
-            "HEADER": "\033[95m",
-            "BLUE": "\033[94m",
-            "GREEN": "\033[92m",
-            "RED": "\033[91m",
-            "ENDC": "\033[0m",
-        }
-
-        for output in result:
-            num_tokens += len(output.outputs[0].token_ids)
-            print(
-                f"{COLOR['HEADER']}{COLOR['GREEN']}{output.prompt}",
-                f"\n{COLOR['BLUE']}{output.outputs[0].text}",
-                "\n\n",
-                sep=COLOR["ENDC"],
-            )
-            time.sleep(0.01)
-        print(
-            f"{COLOR['HEADER']}{COLOR['GREEN']}Generated {num_tokens} tokens from {MODEL_NAME} in {duration_s:.1f} seconds,"
-            f" throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.{COLOR['ENDC']}"
-        )
-
-    @modal.exit()
-    def stop_engine(self):
-        if GPU_CONFIG.count > 1:
-            import ray
-
-            ray.shutdown()
-
-
-# ## Run the model
-# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
-# sequentially for a list of inputs. Run it by executing the command `modal run vllm_gemma.py`.
-#
-# The examples below are meant to put the model through its paces, with a variety of questions and prompts.
-# We also calculate the throughput and latency we achieve.
-@app.local_entrypoint()
-def main():
-    questions = [
-        # Coding questions
-        "Implement a Python function to compute the Fibonacci numbers.",
-        "Write a Rust function that performs binary exponentiation.",
-        "How do I allocate memory in C?",
-        "What are the differences between Javascript and Python?",
-        "How do I find invalid indices in Postgres?",
-        "How can you implement a LRU (Least Recently Used) cache in Python?",
-        "What approach would you use to detect and prevent race conditions in a multithreaded application?",
-        "Can you explain how a decision tree algorithm works in machine learning?",
-        "How would you design a simple key-value store database from scratch?",
-        "How do you handle deadlock situations in concurrent programming?",
-        "What is the logic behind the A* search algorithm, and where is it used?",
-        "How can you design an efficient autocomplete system?",
-        "What approach would you take to design a secure session management system in a web application?",
-        "How would you handle collision in a hash table?",
-        "How can you implement a load balancer for a distributed system?",
-        # Literature
-        "What is the fable involving a fox and grapes?",
-        "Write a story in the style of James Joyce about a trip to the Australian outback in 2083, to see robots in the beautiful desert.",
-        "Who does Harry turn into a balloon?",
-        "Write a tale about a time-traveling historian who's determined to witness the most significant events in human history.",
-        "Describe a day in the life of a secret agent who's also a full-time parent.",
-        "Create a story about a detective who can communicate with animals.",
-        "What is the most unusual thing about living in a city floating in the clouds?",
-        "In a world where dreams are shared, what happens when a nightmare invades a peaceful dream?",
-        "Describe the adventure of a lifetime for a group of friends who found a map leading to a parallel universe.",
-        "Tell a story about a musician who discovers that their music has magical powers.",
-        "In a world where people age backwards, describe the life of a 5-year-old man.",
-        "Create a tale about a painter whose artwork comes to life every night.",
-        "What happens when a poet's verses start to predict future events?",
-        "Imagine a world where books can talk. How does a librarian handle them?",
-        "Tell a story about an astronaut who discovered a planet populated by plants.",
-        "Describe the journey of a letter traveling through the most sophisticated postal service ever.",
-        "Write a tale about a chef whose food can evoke memories from the eater's past.",
-        # History
-        "What were the major contributing factors to the fall of the Roman Empire?",
-        "How did the invention of the printing press revolutionize European society?",
-        "What are the effects of quantitative easing?",
-        "How did the Greek philosophers influence economic thought in the ancient world?",
-        "What were the economic and philosophical factors that led to the fall of the Soviet Union?",
-        "How did decolonization in the 20th century change the geopolitical map?",
-        "What was the influence of the Khmer Empire on Southeast Asia's history and culture?",
-        # Thoughtfulness
-        "Describe the city of the future, considering advances in technology, environmental changes, and societal shifts.",
-        "In a dystopian future where water is the most valuable commodity, how would society function?",
-        "If a scientist discovers immortality, how could this impact society, economy, and the environment?",
-        "What could be the potential implications of contact with an advanced alien civilization?",
-        "Describe how you would mediate a conflict between two roommates about doing the dishes using techniques of non-violent communication.",
-        # Math
-        "What is the product of 9 and 8?",
-        "If a train travels 120 kilometers in 2 hours, what is its average speed?",
-        "Think through this step by step. If the sequence a_n is defined by a_1 = 3, a_2 = 5, and a_n = a_(n-1) + a_(n-2) for n > 2, find a_6.",
-        "Think through this step by step. Calculate the sum of an arithmetic series with first term 3, last term 35, and total terms 11.",
-        "Think through this step by step. What is the area of a triangle with vertices at the points (1,2), (3,-4), and (-2,5)?",
-        "Think through this step by step. Solve the following system of linear equations: 3x + 2y = 14, 5x - y = 15.",
-        # Facts
-        "Who was Emperor Norton I, and what was his significance in San Francisco's history?",
-        "What is the Voynich manuscript, and why has it perplexed scholars for centuries?",
-        "What was Project A119 and what were its objectives?",
-        "What is the 'Dyatlov Pass incident' and why does it remain a mystery?",
-        "What is the 'Emu War' that took place in Australia in the 1930s?",
-        "What is the 'Phantom Time Hypothesis' proposed by Heribert Illig?",
-        "Who was the 'Green Children of Woolpit' as per 12th-century English legend?",
-        "What are 'zombie stars' in the context of astronomy?",
-        "Who were the 'Dog-Headed Saint' and the 'Lion-Faced Saint' in medieval Christian traditions?",
-        "What is the story of the 'Globsters', unidentified organic masses washed up on the shores?",
-        # Multilingual
-        "战国时期最重要的人物是谁?",
-        "Tuende hatua kwa hatua. Hesabu jumla ya mfululizo wa kihesabu wenye neno la kwanza 2, neno la mwisho 42, na jumla ya maneno 21.",
-        "Kannst du die wichtigsten Eigenschaften und Funktionen des NMDA-Rezeptors beschreiben?",
-    ]
-    model = Model()
-    model.generate.remote(questions)
diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py
deleted file mode 100644
index bfce0771d..000000000
--- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# ---
-# tags: ["use-case-lm-inference"]
-# ---
-# # Fast inference with vLLM (Mixtral 8x7B)
-#
-# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
-# to take advantage of PagedAttention, which speeds up inference on longer sequences with optimized key-value caching.
-#
-# We are running a [variant](https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO)
-# of [Mistral AI](https://mistral.ai/)'s ~56 billion parameter mixture-of-experts model Mixtral 8x7B
-# that has been additionally fine-tuned by [Nous Research](https://nousresearch.com/).
-# You can expect ~3 minute cold starts.
-# For a single request, the throughput is around 50 tokens/second.
-# The larger the batch of prompts, the higher the throughput (up to hundreds of tokens per second).
-#
-# ## Setup

-import os
-import time
-
-import modal
-
-MODEL_DIR = "/model"
-MODEL_NAME = "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"
-MODEL_REVISION = "286ae6737d048ad1d965c2e830864df02db50f2f"
-GPU_CONFIG = modal.gpu.A100(size="80GB", count=2)
-
-
-# ## Define a container image
-#
-# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
-# is that the container no longer has to re-download the model from Hugging Face - instead, it will take
-# advantage of Modal's internal filesystem for faster cold starts.
-#
-# ### Download the weights
-#
-# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`.
-#
-# If you adapt this example to run another model,
-# note that for this step to work on a [gated model](https://huggingface.co/docs/hub/en/models-gated)
-# the `HF_TOKEN` environment variable must be set and provided as a [Modal Secret](https://modal.com/secrets).
-#
-# Mixtral is beefy, at nearly 100 GB in `safetensors` format, so this can take some time -- at least a few minutes.
-
-
-def download_model_to_image(model_dir, model_name, model_revision):
-    from huggingface_hub import snapshot_download
-    from transformers.utils import move_cache
-
-    os.makedirs(model_dir, exist_ok=True)
-
-    snapshot_download(
-        model_name,
-        revision=model_revision,
-        local_dir=model_dir,
-        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
-    )
-    move_cache()
-
-
-# ### Image definition
-#
-# We'll start from a basic Linux container image, install `vllm` and related libraries,
-# and then use `run_function` to run the function defined above and ensure the weights of
-# the model are saved within the container image.
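One concrete note on the gated-model caveat above: this Mixtral variant is public, so `download_model_to_image` never touches `HF_TOKEN`. If you swap in a gated model, the adjustment might look roughly like the following sketch -- authenticate `snapshot_download` and attach the Secret when `run_function` is called in the image definition below (the helper name here is illustrative, not part of the example):

```python
import os


def download_gated_model_to_image(model_dir, model_name, model_revision):
    # Same as download_model_to_image above, but authenticated for gated repos.
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(model_dir, exist_ok=True)
    snapshot_download(
        model_name,
        revision=model_revision,
        local_dir=model_dir,
        token=os.environ["HF_TOKEN"],  # injected by the Modal Secret
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    move_cache()


# ...and in the image definition, attach the secret to the build step:
# .run_function(
#     download_gated_model_to_image,
#     secrets=[modal.Secret.from_name("huggingface-secret", required_keys=["HF_TOKEN"])],
#     timeout=60 * 20,
#     kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME, "model_revision": MODEL_REVISION},
# )
```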
-
-vllm_image = (
-    modal.Image.debian_slim(python_version="3.10")
-    .pip_install(
-        "vllm==0.4.0.post1",
-        "torch==2.1.2",
-        "transformers==4.39.3",
-        "ray==2.10.0",
-        "hf-transfer==0.1.6",
-        "huggingface_hub==0.22.2",
-    )
-    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
-    .run_function(
-        download_model_to_image,
-        timeout=60 * 20,
-        kwargs={
-            "model_dir": MODEL_DIR,
-            "model_name": MODEL_NAME,
-            "model_revision": MODEL_REVISION,
-        },
-    )
-)
-
-app = modal.App("example-vllm-mixtral")
-
-
-# ## The model class
-#
-# The inference function is best represented with Modal's [class syntax](/docs/guide/lifecycle-functions) and the `@enter` decorator.
-# This enables us to load the model into memory just once every time a container starts up, and keep it cached
-# on the GPU for each subsequent invocation of the function.
-#
-# The `vLLM` library allows the code to remain quite clean. We do have to patch the multi-GPU setup due to issues with Ray.
-@app.cls(
-    gpu=GPU_CONFIG,
-    timeout=60 * 10,
-    container_idle_timeout=60 * 10,
-    allow_concurrent_inputs=10,
-    image=vllm_image,
-)
-class Model:
-    @modal.enter()
-    def start_engine(self):
-        from vllm.engine.arg_utils import AsyncEngineArgs
-        from vllm.engine.async_llm_engine import AsyncLLMEngine
-
-        print("🥶 cold starting inference")
-        start = time.monotonic_ns()
-
-        engine_args = AsyncEngineArgs(
-            model=MODEL_DIR,
-            tensor_parallel_size=GPU_CONFIG.count,
-            gpu_memory_utilization=0.90,
-            enforce_eager=False,  # capture the graph for faster inference, but slower cold starts
-            disable_log_stats=True,  # disable logging so we can stream tokens
-            disable_log_requests=True,
-        )
-
-        # this can take some time!
-        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
-        duration_s = (time.monotonic_ns() - start) / 1e9
-        print(f"🏎️ engine started in {duration_s:.0f}s")
-
-    @modal.method()
-    async def completion_stream(self, user_question):
-        from vllm import SamplingParams
-        from vllm.utils import random_uuid
-
-        sampling_params = SamplingParams(
-            temperature=0.75,
-            max_tokens=128,
-            repetition_penalty=1.1,
-        )
-
-        request_id = random_uuid()
-        result_generator = self.engine.generate(
-            user_question,
-            sampling_params,
-            request_id,
-        )
-        index, num_tokens = 0, 0
-        start = time.monotonic_ns()
-        async for output in result_generator:
-            if (
-                output.outputs[0].text
-                and "\ufffd" == output.outputs[0].text[-1]
-            ):
-                continue
-            text_delta = output.outputs[0].text[index:]
-            index = len(output.outputs[0].text)
-            num_tokens = len(output.outputs[0].token_ids)
-
-            yield text_delta
-        duration_s = (time.monotonic_ns() - start) / 1e9
-
-        yield (
-            f"\n\tGenerated {num_tokens} tokens from {MODEL_NAME} in {duration_s:.1f}s,"
-            f" throughput = {num_tokens / duration_s:.0f} tokens/second on {GPU_CONFIG}.\n"
-        )
-
-    @modal.exit()
-    def stop_engine(self):
-        if GPU_CONFIG.count > 1:
-            import ray
-
-            ray.shutdown()
-
-
-# ## Run the model
-# We define a [`local_entrypoint`](/docs/guide/apps#entrypoints-for-ephemeral-apps) to call our remote function
-# sequentially for a list of inputs. You can run this locally with `modal run -q vllm_mixtral.py`. The `-q` flag
-# enables the text to stream in your local terminal.
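One aside before the entrypoint: `completion_stream` sends the raw question string to the engine with no chat template, unlike the Gemma example above. Nous Research's Hermes 2 models are trained on ChatML-style conversations, so prompts may work better when rendered with the tokenizer's chat template. A hedged sketch, assuming the tokenizer files downloaded into `MODEL_DIR` define that template:

```python
from transformers import AutoTokenizer

# Assumes the tokenizer config was downloaded into MODEL_DIR along with the weights.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)


def format_prompt(user_question: str) -> str:
    messages = [{"role": "user", "content": user_question}]
    # Renders the model's chat template and appends the assistant-turn marker.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
```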
-@app.local_entrypoint()
-def main():
-    questions = [
-        "Implement a Python function to compute the Fibonacci numbers.",
-        "What is the fable involving a fox and grapes?",
-        "What were the major contributing factors to the fall of the Roman Empire?",
-        "Describe the city of the future, considering advances in technology, environmental changes, and societal shifts.",
-        "What is the product of 9 and 8?",
-        "Who was Emperor Norton I, and what was his significance in San Francisco's history?",
-    ]
-    model = Model()
-    for question in questions:
-        print("Sending new request:", question, "\n\n")
-        for text in model.completion_stream.remote_gen(question):
-            print(text, end="", flush=text.endswith("\n"))
-
-
-# ## Deploy and invoke the model
-# Once we deploy this model with `modal deploy vllm_mixtral.py`,
-# we can invoke inference from other apps, sharing the same pool
-# of GPU containers with all other apps we might need.
-#
-# ```
-# $ python
-# >>> import modal
-# >>> f = modal.Function.lookup("example-vllm-mixtral", "Model.completion_stream")
-# >>> for text in f.remote_gen("What is the story about the fox and grapes?"):
-# >>>     print(text, end="", flush=text.endswith("\n"))
-# 'The story about the fox and grapes ...
-# ```
-
-# ## Coupling a frontend web application
-#
-# We can stream inference from a FastAPI backend, also deployed on Modal.
-#
-# You can try our deployment [here](https://modal-labs--vllm-mixtral.modal.run).
-
-from pathlib import Path
-
-import modal
-
-frontend_image = modal.Image.debian_slim().pip_install("fastapi[standard]")
-frontend_path = Path(__file__).parent.parent / "llm-frontend"
-
-
-@app.function(
-    image=frontend_image,
-    mounts=[modal.Mount.from_local_dir(frontend_path, remote_path="/assets")],
-    keep_warm=1,
-    allow_concurrent_inputs=20,
-    timeout=60 * 10,
-)
-@modal.asgi_app(label="vllm-mixtral")
-def vllm_mixtral():
-    import json
-
-    import fastapi
-    import fastapi.staticfiles
-    from fastapi.responses import StreamingResponse
-
-    web_app = fastapi.FastAPI()
-
-    @web_app.get("/stats")
-    async def stats():
-        stats = await Model().completion_stream.get_current_stats.aio()
-        return {
-            "backlog": stats.backlog,
-            "num_total_runners": stats.num_total_runners,
-            "model": MODEL_NAME + " (vLLM)",
-        }
-
-    @web_app.get("/completion/{question}")
-    async def completion(question: str):
-        from urllib.parse import unquote
-
-        async def generate():
-            async for text in Model().completion_stream.remote_gen.aio(
-                unquote(question)
-            ):
-                yield f"data: {json.dumps(dict(text=text), ensure_ascii=False)}\n\n"
-
-        return StreamingResponse(generate(), media_type="text/event-stream")
-
-    web_app.mount(
-        "/", fastapi.staticfiles.StaticFiles(directory="/assets", html=True)
-    )
-    return web_app
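For completeness, here is a sketch of consuming that `text/event-stream` endpoint from a plain Python client. The URL shown is the demo deployment linked above; substitute the URL of your own deployment, and note that the `requests` dependency is an assumption of this sketch rather than part of the example:

```python
import json
from urllib.parse import quote

import requests  # assumes `pip install requests`

BASE_URL = "https://modal-labs--vllm-mixtral.modal.run"  # replace with your deployment
question = "What is the story about the fox and grapes?"

with requests.get(f"{BASE_URL}/completion/{quote(question)}", stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # Each server-sent event is a line of the form `data: {"text": "..."}`.
        if line and line.startswith("data: "):
            payload = json.loads(line[len("data: "):])
            print(payload["text"], end="", flush=True)
print()
```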