From 0e6f4d817136e9746fd017b587ba14744673b9b2 Mon Sep 17 00:00:00 2001 From: Alonso Astroza Tagle Date: Sun, 28 Apr 2024 23:50:46 -0400 Subject: [PATCH 01/15] python_version in Image declaration (#720) --- 06_gpu_and_ml/llm-serving/vllm_inference.py | 2 +- 06_gpu_and_ml/llm-serving/vllm_mixtral.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index c3d0375c4..9f6b23a6a 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -56,7 +56,7 @@ def download_model_to_image(model_dir, model_name): # We’ll start from Modal's Debian slim image. # Then we’ll use `run_function` with `download_model_to_image` to write the model into the container image. image = ( - modal.Image.debian_slim() + modal.Image.debian_slim(python_version="3.10") .pip_install( "vllm==0.4.0.post1", "torch==2.1.2", diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py index 9bb9471ba..57618ae28 100644 --- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py +++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py @@ -63,7 +63,7 @@ def download_model_to_image(model_dir, model_name, model_revision): # the model are saved within the container image. vllm_image = ( - modal.Image.debian_slim() + modal.Image.debian_slim(python_version="3.10") .pip_install( "vllm==0.4.0.post1", "torch==2.1.2", From dabdf155706bf8f56b3a9b4ebc15afedd95030a3 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Sun, 28 Apr 2024 22:45:00 -0700 Subject: [PATCH 02/15] Clean up instructor example (#722) * removes extra inference file * cleans up instructor example --- .../llm-structured/instructor/inference.py | 79 --------- .../instructor/instructor_generate.py | 161 ++++++++++-------- 2 files changed, 90 insertions(+), 150 deletions(-) delete mode 100644 06_gpu_and_ml/llm-structured/instructor/inference.py diff --git a/06_gpu_and_ml/llm-structured/instructor/inference.py b/06_gpu_and_ml/llm-structured/instructor/inference.py deleted file mode 100644 index d8e765764..000000000 --- a/06_gpu_and_ml/llm-structured/instructor/inference.py +++ /dev/null @@ -1,79 +0,0 @@ -# # Fast inference with vLLM (Mistral 7B) -# -# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm) -# to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching. - -import os -import subprocess - -from modal import App, Image, Secret, gpu, web_server - -MODEL_DIR = "/model" -BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1" - - -# ## Define a container image - - -# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this -# is that the container no longer has to re-download the model from Huggingface - instead, it will take -# advantage of Modal's internal filesystem for faster cold starts. -# -# ### Download the weights -# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. -# -# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run. 
-def download_model_to_folder(): - from huggingface_hub import snapshot_download - from transformers.utils import move_cache - - os.makedirs(MODEL_DIR, exist_ok=True) - - snapshot_download( - BASE_MODEL, - local_dir=MODEL_DIR, - ignore_patterns=["*.pt", "*.bin"], # Using safetensors - ) - move_cache() - - -# ### Image definition -# We'll start from a recommended Docker Hub image and install `vLLM`. -# Then we'll use `run_function` to run the function defined above to ensure the weights of -# the model are saved within the container image. -image = ( - Image.from_registry( - "nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10" - ) - .pip_install( - "vllm==0.2.5", - "huggingface_hub==0.19.4", - "hf-transfer==0.1.4", - "torch==2.1.2", - ) - # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s. - .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) - .run_function( - download_model_to_folder, - secrets=[Secret.from_name("huggingface")], - timeout=60 * 20, - ) -) - -app = App( - "vllm-inference-openai-compatible", image=image -) # Note: prior to April 2024, "app" was called "stub" - - -GPU_CONFIG = gpu.A100(count=1) # 40GB A100 by default - - -@app.function( - allow_concurrent_inputs=100, - gpu=GPU_CONFIG, -) -@web_server(8000, startup_timeout=90) -def openai_compatible_server(): - target = BASE_MODEL - cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --host 0.0.0.0 --port 8000" - subprocess.Popen(cmd, shell=True) diff --git a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py index ca6d66fec..242f419d3 100644 --- a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py +++ b/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py @@ -3,47 +3,46 @@ # --- # # Structured Data Extraction using `instructor` # -# This example demonstrates how to use the `instructor` library to extract structured data from unstructured text. +# This example demonstrates how to use the `instructor` library to extract structured, schematized data from unstructured text. # -# Structured output is a powerful but under-appreciated feature of LLMs, -# because it makes it easier to connect LLMs to other software, -# for example enabling the ingestion of unstructured data into structured databases. +# Structured output is a powerful but under-appreciated feature of LLMs. +# Structured output allows LLMs and multimodal models to connect to traditional software, +# for example enabling the ingestion of unstructured data like text files into structured databases. +# Applied properly, it makes them an extreme example of the [Robustness Principle](https://en.wikipedia.org/wiki/Robustness_principle) +# Jon Postel formulated for TCP: "Be conservative in what you send, be liberal in what you accept". # -# The unstructured data in this example is the code from the examples in the Modal examples repository -- -# including this one! -# -# We use this exact code to monitor the coverage of the examples -# and to make decisions about which examples to write next! +# The unstructured data used in this example code is the code from the examples in the Modal examples repository -- +# including this example's code! # # The output includes a JSONL file containing, on each line, the metadata extracted from the code in one example. # This can be consumed downstream by other software systems, like a database or a dashboard. 
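# As a purely hypothetical illustration (not actual output of this pipeline), one line of
# that JSONL file might look something like:
#
#     {"summary": "Renders a spinning logo with Blender on many GPUs.", "has_thorough_explanation": true,
#      "domains": ["parallel_computing"], "difficulty": 2, "freshness": 0.8, "filename": "blender_video.py"}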
# -# We include in this folder a Jupyter notebook with some basic analyses. -# # ## Environment setup # -# We setup the environment our code will run in first. +# We set up the environment our code will run in first. # In Modal, we define environments via [container images](https://modal.com/docs/guide/custom-container), # much like Docker images, by iteratively chaining together commands. # -# This example also uses models from Anthropic, so if you want to run it yourself, -# you'll need to set up a Modal [`Secret`](https://modal.com/docs/guide/secrets) -# called `my-anthropic-secret` for your OpenAI API key. +# Here there's just one command, installing instructor and the Python SDK for Anthropic's LLM API. from pathlib import Path from typing import Literal, Optional import modal +from pydantic import BaseModel, Field image = modal.Image.debian_slim(python_version="3.11").pip_install( - "instructor~=1.0.0", "anthropic~=0.23.1", "matplotlib~=3.8.3" + "instructor~=1.0.0", "anthropic~=0.23.1" ) +# This example uses models from Anthropic, so if you want to run it yourself, +# you'll need to set up a Modal [`Secret`](https://modal.com/docs/guide/secrets) +# called `my-anthropic-secret` for your OpenAI API key. + app = modal.App( image=image, secrets=[modal.Secret.from_name("my-anthropic-secret")] ) # Note: prior to April 2024, "app" was called "stub" - -# ## The overall flow +# ## Running Modal functions from the command line # # We'll run the example by calling `modal run instructor_generate.py` from the command line. # @@ -64,7 +63,7 @@ @app.local_entrypoint() -def main(limit: int = 15, with_opus: bool = False): +def main(limit: int = 1, with_opus: bool = False): # find all of the examples in the repo examples = get_examples() # optionally limit the number of examples we process @@ -72,17 +71,17 @@ def main(limit: int = 15, with_opus: bool = False): examples = [None] # just run on this example else: examples = examples[:limit] - if examples: - # use Modal to map our extraction function over the examples concurrently - results = extract_example_metadata.map( - [ - f"{example.stem}\n" + Path(example.filename).read_text() - if example - else None - for example in examples - ], - kwargs={"with_opus": with_opus}, - ) + # use Modal to map our extraction function over the examples concurrently + results = extract_example_metadata.map( + ( # iterable of file contents + Path(example.filename).read_text() if example else None + for example in examples + ), + ( # iterable of filenames + example.stem if example else None for example in examples + ), + kwargs={"with_opus": with_opus}, + ) # save the results to a local file results_path = Path("/tmp") / "instructor_generate" / "results.jsonl" @@ -97,15 +96,65 @@ def main(limit: int = 15, with_opus: bool = False): f.write(result + "\n") -# ## Extracting JSON from unstructured text with `instructor` +# ## Extracting JSON from unstructured text with `instructor` and Pydantic # -# The real meat of this example is here, in the `extract_example_metadata` function. +# The real meat of this example is in this section, in the `extract_example_metadata` function and its schemas. # -# TODO: write this up -# TODO: refactor classes out of this function, explain separately +# We define a schema for the data we want the LLM to extract, using Pydantic. +# Instructor ensures that the LLM's output matches this schema. 
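# To make the mechanism concrete, the pattern looks roughly like this (an illustrative
# sketch; the real call appears further down in this file):
#
#     client = instructor.from_anthropic(Anthropic())
#     metadata = client.messages.create(
#         model="claude-3-haiku-20240307",
#         max_tokens=1024,
#         messages=[{"role": "user", "content": "Extract metadata from this example: ..."}],
#         response_model=ExampleMetadataExtraction,  # the Pydantic schema defined below
#     )  # returns a validated ExampleMetadataExtraction instance, not raw text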
+# +# We can use the type system provided by Python and Pydantic to express many useful features +# of the data we want to extract -- ranging from wide-open fields like a `str`ing-valued `summary` +# to constrained fields like `difficulty`, which can only take on value between 1 and 5. + +class ExampleMetadataExtraction(BaseModel): + """Extracted metadata about an example from the Modal examples repo.""" -@app.function(concurrency_limit=5) # watch those rate limits! + summary: str = Field(..., description="A brief summary of the example.") + has_thorough_explanation: bool = Field( + ..., + description="The example contains, in the form of inline comments with markdown formatting, a thorough explanation of what the code does.", + ) + domains: list[ + Literal[ + "artificial_intelligence", + "machine_learning", + "data_science", + "web_serving", + "parallel_computing", + ] + ] = Field(..., description="The") + difficulty: Literal[1, 2, 3, 4, 5] = Field( + ..., + description="The difficulty of the example, from 1 to 5. An example that uses only one or two basic Modal features and is understandable by a professional Python developer familiar with the basics of the relevant domains is a 1, while an example that uses many Modal features and uses advanced Python features like async generator coroutines or metaclasses is a 5.", + ) + freshness: float = Field( + ..., + description="The freshness of the example, from 0 to 1. This is relative to your knowledge cutoff. Examples are less fresh if they use older libraries and tools.", + ) + + +# That schema describes the data to be extracted by the LLM, but not all data is best extracted by an LLM. +# For example, the filename is easily determined in software. +# +# So we inject that information into the output after the LLM has done its work. That necessitates +# an additional schema, which inherits from the first. + + +class ExampleMetadata(ExampleMetadataExtraction): + """Metadata about an example from the Modal examples repo.""" + + filename: Optional[str] = Field( + ..., description="The filename of the example." + ) + + +# With these schemas in hand, it's straightforward to write the function that extracts the metadata. +# Note that we decorate it with `@app.function` to make it run on Modal. + + +@app.function(concurrency_limit=5) # watch those LLM API rate limits! def extract_example_metadata( example_contents: Optional[str] = None, filename: Optional[str] = None, @@ -113,47 +162,16 @@ def extract_example_metadata( ): import instructor from anthropic import Anthropic - from pydantic import BaseModel, Field + # if no example is provided, use the contents of this example if example_contents is None: example_contents = Path(__file__).read_text() filename = Path(__file__).name - class ExampleMetadataExtraction(BaseModel): - """Extracted metadata about an example from the Modal examples repo.""" - - summary: str = Field(..., description="A brief summary of the example.") - has_thorough_explanation: bool = Field( - ..., - description="The example contains, in the form of inline comments with markdown formatting, a thorough explanation of what the code does.", - ) - domains: list[ - Literal[ - "artificial_intelligence", - "machine_learning", - "data_science", - "web_serving", - "parallel_computing", - ] - ] = Field(..., description="The") - difficulty: Literal[1, 2, 3, 4, 5] = Field( - ..., - description="The difficulty of the example, from 1 to 5. 
An example that uses only one or two basic Modal features and is understandable by a professional Python developer familiar with the basics of the relevant domains is a 1, while an example that uses many Modal features and uses advanced Python features like async generator coroutines or metaclasses is a 5.", - ) - freshness: float = Field( - ..., - description="The freshness of the example, from 0 to 1. This is relative to your knowledge cutoff. Examples are less fresh if they use older libraries and tools.", - ) - - class ExampleMetadata(ExampleMetadataExtraction): - """Metadata about an example from the Modal examples repo.""" - - filename: str = Field(..., description="The filename of the example.") - client = instructor.from_anthropic(Anthropic()) - model = "claude-3-opus-20240229" if with_opus else "claude-3-haiku-20240307" + # add the schema as the `response_model` argument in what otherwise looks like a normal LLM API call extracted_metadata = client.messages.create( model=model, temperature=0.0, @@ -167,18 +185,19 @@ class ExampleMetadata(ExampleMetadataExtraction): ], ) + # inject the filename full_metadata = ExampleMetadata( **extracted_metadata.dict(), filename=filename ) + # return it as JSON return full_metadata.model_dump_json() # ## Addenda # # The rest of the code used in this example is not particularly interesting: -# some boilerplate matplotlib code to generate the figures, -# and a utility function to find all of the examples. +# just a utility function to find all of the examples, which we invoke in the `local_entrypoint` above. def get_examples(silent=True): @@ -195,7 +214,7 @@ def get_examples(silent=True): spec.loader.exec_module(example_utils) examples = [ example - for example in example_utils.get_examples(silent=silent) + for example in example_utils.get_examples() if example.type != 2 # filter out non-code assets ] return examples From d35dd4386b8c5248f5e83e2899f9177fa6cdb1fc Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Sun, 28 Apr 2024 23:24:26 -0700 Subject: [PATCH 03/15] resurrect blender (#723) * adds a refreshed blender example * adds gif of final render --- 06_gpu_and_ml/blender/blender_video.py | 296 +++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 06_gpu_and_ml/blender/blender_video.py diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py new file mode 100644 index 000000000..f5cc55622 --- /dev/null +++ b/06_gpu_and_ml/blender/blender_video.py @@ -0,0 +1,296 @@ +# --- +# output-directory: "/tmp/render" +# --- +# # Render a video with Blender on many GPUs or CPUs in parallel +# +# This example shows how you can render an animated 3D scene using +# [Blender](https://www.blender.org/)'s Python interface. +# +# You can run it on CPUs to scale out on one hundred of containers +# or run it on GPUs to get higher throughput per node. +# Even with this simple scene, GPUs render 2x faster than CPUs. +# +# The final render looks something like this: +# +# ![Spinning Modal logo](https://modal-public-assets.s3.amazonaws.com/modal-blender-render.gif) +# +# ## Defining a Modal app + +import io +import math +from pathlib import Path + +import modal + +# Modal runs your Python functions for you in the cloud. +# You organize your code into apps, collections of functions that work together. + +app = modal.App("examples-blender-logo") + +# We need to define the environment each function runs in -- its container image. 
+# The block below defines a container image, starting from a basic Debian Linux image +# adding Blender's system-level dependencies +# and then installing the `bpy` package, which is Blender's Python API. + +rendering_image = ( + modal.Image.debian_slim(python_version="3.11") + .apt_install("xorg", "libxkbcommon0") # X11 (Unix GUI) dependencies + .pip_install("bpy") # Blender as a Python package +) + +# ## Rendering a single frame +# +# We define a function that renders a single frame. We'll scale this function out on Modal later. +# +# Functions in Modal are defined along with their hardware and their dependencies. +# This function can be run with GPU acceleration or without it, and we'll use a global flag in the code to switch between the two. + +WITH_GPU = True # try changing this to False to run rendering massively in parallel on CPUs! + +# We decorate the function with `@app.function` to define it as a Modal function. +# Note that in addition to defining the hardware requirements of the function, +# we also specify the container image that the function runs in (the one we defined above). + +# The details of the rendering function aren't too important for this example, +# so we abstract them out into functions defined at the end of the file. +# We draw a simple version of the Modal logo: +# two neon green rectangular prisms facing different directions. +# We include a parameter to rotate the prisms around the vertical/Z axis, +# which we'll use to animate the logo. + + +@app.function( + gpu="T4" if WITH_GPU else None, + concurrency_limit=10 + if WITH_GPU + else 100, # default limits on Modal free tier + image=rendering_image, +) +def render(angle: int = 0) -> bytes: + """ + Renders Modal's logo, two neon green rectangular prisms. + + + Args: + angle: How much to rotate the two prisms around the vertical/Z axis, in degrees. + + Returns: + The rendered frame as a PNG image. + """ + import bpy + + # clear existing objects + bpy.ops.object.select_all(action="DESELECT") + bpy.ops.object.select_by_type(type="MESH") + bpy.ops.object.delete() + + # ctx: the current Blender state, which we mutate + ctx = bpy.context + + # scene: the 3D environment we are rendering and its camera(s) + scene = ctx.scene + + # configure rendering -- CPU or GPU, resolution, etc. + # see function definition below for details + configure_rendering(ctx, WITH_GPU) + + scene.render.image_settings.file_format = "PNG" + scene.render.filepath = "output.png" + + # set background to black + black = (0, 0, 0, 1) + scene.world.node_tree.nodes["Background"].inputs[0].default_value = black + + # add the Modal logo: two neon green rectangular prisms + iridescent_material = create_iridescent_material() + + add_prism(ctx, (-1, 0, 0), 45, angle, iridescent_material) + add_prism(ctx, (3, 0, 0), -45, angle, iridescent_material) + + # set up the lighting and camera + bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) + bpy.context.object.data.energy = 10 + bpy.ops.object.camera_add(location=(7, -7, 5)) + scene.camera = bpy.context.object + ctx.object.rotation_euler = (1.1, 0, 0.785) + + # render + bpy.ops.render.render(write_still=True) + + # return the bytes to the caller + with open(scene.render.filepath, "rb") as image_file: + image_bytes = image_file.read() + + return image_bytes + + +# ### Rendering with acceleration +# +# We can configure the rendering process to use GPU acceleration with NVIDIA CUDA. 
+# We select the [Cycles rendering engine](https://www.cycles-renderer.org/), which is compatible with CUDA, +# and then activate the GPU. + + +def configure_rendering(ctx, with_gpu: bool): + # configure the rendering process + ctx.scene.render.engine = "CYCLES" + ctx.scene.render.resolution_x = 1920 + ctx.scene.render.resolution_y = 1080 + ctx.scene.render.resolution_percentage = 100 + + # add GPU acceleration if available + if with_gpu: + ctx.preferences.addons[ + "cycles" + ].preferences.compute_device_type = "CUDA" + ctx.scene.cycles.device = "GPU" + + # reload the devices to update the configuration + ctx.preferences.addons["cycles"].preferences.get_devices() + for device in ctx.preferences.addons["cycles"].preferences.devices: + device.use = True + + else: + ctx.scene.cycles.device = "CPU" + + # report rendering devices -- a nice snippet for debugging and ensuring the accelerators are being used + for dev in ctx.preferences.addons["cycles"].preferences.devices: + print( + f"ID:{dev['id']} Name:{dev['name']} Type:{dev['type']} Use:{dev['use']}" + ) + + +# ## Combining frames into a GIF +# +# Rendering 3D images is fun, and GPUs can make it faster, but rendering 3D videos is better! +# We add another function to our app, running on a different, simpler container image +# and different hardware, to combine the frames into a GIF. + +combination_image = modal.Image.debian_slim(python_version="3.11").pip_install( + "pillow==10.3.0" +) + +# The video has a few parameters, which we set here. + +FPS = 60 +FRAME_DURATION_MS = 1000 // FPS +NUM_FRAMES = 360 # drop this for faster iteration while playing around + +# The function to combine the frames into a GIF takes a sequence of byte sequences, one for each rendered frame, +# and converts them into a single sequence of bytes, the GIF. + + +@app.function(image=combination_image) +def combine( + frames_bytes: list[bytes], frame_duration: int = FRAME_DURATION_MS +) -> bytes: + print("🎞️ combining frames into a gif") + from PIL import Image + + frames = [ + Image.open(io.BytesIO(frame_bytes)) for frame_bytes in frames_bytes + ] + + gif_image = io.BytesIO() + frames[0].save( + gif_image, + format="GIF", + save_all=True, + append_images=frames[1:], + duration=frame_duration, + loop=0, + ) + + gif_image.seek(0) + + return gif_image.getvalue() + + +# ## Rendering in parallel in the cloud from the comfort of the command line +# +# With these two functions defined, we need only a few more lines to run our rendering at scale on Modal. +# +# First, we need a function that coordinates our functions to `render` frames and `combine` them. +# We decorate that function with `@app.local_entrypoint` so that we can run it with `modal run blender_video.py`. +# +# In that function, we use `render.map` to map the `render` function over a `range` of `angle`s, +# so that the logo will appear to spin in the final video. +# +# We collect the bytes from each frame into a `list` locally and then send it to `combine` with `.remote`. +# +# The bytes for the video come back to our local machine, and we write them to a file. +# +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes between five and ten minutes on 10 T4 GPUs. 
+ + +@app.local_entrypoint() +def main(): + output_directory = Path("/tmp") / "render" + output_directory.mkdir(parents=True, exist_ok=True) + filename = output_directory / "output.gif" + with open(filename, "wb") as out_file: + out_file.write( + combine.remote(list(render.map(range(0, 360, 360 // NUM_FRAMES)))) + ) + print(f"Image saved to {filename}") + + +# ## Addenda +# +# The remainder of the code in this example defines the details of the render. +# It's not particularly interesting, so we put it the end of the file. + + +def add_prism(ctx, location, initial_rotation, angle, material): + """Add a prism at a given location, rotation, and angle, made of the provided material.""" + import bpy + import mathutils + + bpy.ops.mesh.primitive_cube_add(size=2, location=location) + obj = ctx.object # the newly created object + + # assign the material to the object + obj.data.materials.append(material) + + obj.scale = (1, 1, 2) # square base, 2x taller than wide + # Modal logo is rotated 45 degrees + obj.rotation_euler[1] = math.radians(initial_rotation) + + # apply initial transformations + bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) + + # to "animate" the rendering, we rotate the prisms around the Z axis + angle_radians = math.radians(angle) + rotation_matrix = mathutils.Matrix.Rotation(angle_radians, 4, "Z") + obj.matrix_world = rotation_matrix @ obj.matrix_world + bpy.ops.object.transform_apply(location=True, rotation=True, scale=True) + + +def create_iridescent_material(): + import bpy + + mat = bpy.data.materials.new(name="IridescentGreen") + mat.use_nodes = True + nodes = mat.node_tree.nodes + links = mat.node_tree.links + + nodes.clear() + + output_node = nodes.new(type="ShaderNodeOutputMaterial") + emission_node = nodes.new(type="ShaderNodeEmission") + layer_weight = nodes.new(type="ShaderNodeLayerWeight") + color_ramp = nodes.new(type="ShaderNodeValToRGB") + + color_ramp.color_ramp.elements[0].color = (0, 0, 0, 1) + color_ramp.color_ramp.elements[1].color = (0, 1, 0, 1) + layer_weight.inputs["Blend"].default_value = 0.4 + + links.new(layer_weight.outputs["Fresnel"], color_ramp.inputs["Fac"]) + links.new(color_ramp.outputs["Color"], emission_node.inputs["Color"]) + + emission_node.inputs["Strength"].default_value = 5.0 + emission_node.inputs["Color"].default_value = (0.0, 1.0, 0.0, 1) + + links.new(emission_node.outputs["Emission"], output_node.inputs["Surface"]) + + return mat From f23c5c20168e5585341d39a27cf825b5117c6c4c Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Sun, 28 Apr 2024 23:53:47 -0700 Subject: [PATCH 04/15] Add detailed example for Fooocus on Modal (#721) * Add Fooocus Modal example with detailed comments and Markdown sections * edits fooocus example --------- Co-authored-by: devin-ai-integration[bot] <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Charles Frye --- misc/run_fooocus.py | 99 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 misc/run_fooocus.py diff --git a/misc/run_fooocus.py b/misc/run_fooocus.py new file mode 100644 index 000000000..904f0d530 --- /dev/null +++ b/misc/run_fooocus.py @@ -0,0 +1,99 @@ +# # Generate: Fooocus +# +# This example demonstrates how to set up and run a web server using the Modal library with Fooocus as the frontend. +# Fooocus provides a beginner-friendly interface to work with the SDXL 1.0 model for image generation tasks. 
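# (To try it yourself, you would most likely run something like `modal serve misc/run_fooocus.py`
# during development or `modal deploy misc/run_fooocus.py` for a persistent deployment, then open
# the URL Modal prints -- adjust the path to wherever you keep this file.)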
+# The script includes the setup of a Docker image, initialization of Fooocus, and launching a web server with GPU support. +# +# ## Basic setup + +import modal + +# To create an image that can run Fooocus, we start from an official NVIDIA base image and then add Python +# and a few system packages. +# +# We then download the Fooocus repository. + +image = ( + modal.Image.from_registry( + "nvidia/cuda:12.3.1-base-ubuntu22.04", add_python="3.10" + ) + .apt_install( + "software-properties-common", + "git", + "git-lfs", + "coreutils", + "aria2", + "libgl1", + "libglib2.0-0", + "curl", + "wget", + "libsm6", + "libxrender1", + "libxext6", + "ffmpeg", + ) + .run_commands("git clone https://github.com/lllyasviel/Fooocus.git") +) + +# ## Initialize Fooocus +# +# We are not limited to running shell commands and package installers in the image setup. +# We can also run Python functions by defining them in our code and passing them to the `run_function` method. +# +# This function installs Fooocus's dependencies and downloads the SDXL 1.0 model to the container image. +# +# This all happens at the time the container image is defined, so that the image is ready to run Fooocus when it is deployed. + + +def init_Fooocus(): + import os + import subprocess + + # change the working directory to the Fooocus directory and install the required Python packages from the requirements file. + os.chdir("/Fooocus") + os.system("pip install -r requirements_versions.txt") + + # change the directory to the models' checkpoints and download the SDXL 1.0 model using wget. + os.chdir("./models/checkpoints") + subprocess.run( + "wget -O juggernautXL_v8Rundiffusion.safetensors 'https://huggingface.co/lllyasviel/fav_models/resolve/main/fav/juggernautXL_v8Rundiffusion.safetensors'", + shell=True, + ) + + +GPU_CONFIG = modal.gpu.T4() +image = image.run_function(init_Fooocus, gpu=GPU_CONFIG) + +# ## Run Fooocus +# +# The `run` function is decorated with `app.function` to define it as a Modal function. +# The `web_server` decorator indicates that this function will serve a web application on the specified port. +# We increase the startup timeout to three minutes to account for the time it takes to load the model and start the server. + +app = modal.App("Fooocus", image=image) + +PORT = 8000 +MINUTES = 60 + + +@app.function(gpu=GPU_CONFIG, timeout=10 * MINUTES) +@modal.web_server(port=PORT, startup_timeout=3 * MINUTES) +def run(): + import os + import subprocess + + # change the working directory to the Fooocus directory. 
+ os.chdir("/Fooocus") + + # launch the Fooocus application using a subprocess that listens on the specified port + subprocess.Popen( + [ + "python", + "launch.py", + "--listen", + "0.0.0.0", + "--port", + str(PORT), + "--always-high-vram", + ] + ) From f87d12deca8f8e14072dd91461ad17d87546ecd8 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 29 Apr 2024 14:21:10 -0700 Subject: [PATCH 05/15] remove unnecessary nesting of instructor (#725) will add local inference version as a peer when ready --- .../llm-structured/{instructor => }/instructor_generate.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 06_gpu_and_ml/llm-structured/{instructor => }/instructor_generate.py (100%) diff --git a/06_gpu_and_ml/llm-structured/instructor/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor_generate.py similarity index 100% rename from 06_gpu_and_ml/llm-structured/instructor/instructor_generate.py rename to 06_gpu_and_ml/llm-structured/instructor_generate.py From f3341012c93f69fdd4729809de948d8753d1d04d Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 29 Apr 2024 19:15:01 -0700 Subject: [PATCH 06/15] faster renders, numbers on throughput and latency (#726) --- 06_gpu_and_ml/blender/blender_video.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index f5cc55622..ee1ed85a7 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -137,6 +137,7 @@ def configure_rendering(ctx, with_gpu: bool): ctx.scene.render.resolution_x = 1920 ctx.scene.render.resolution_y = 1080 ctx.scene.render.resolution_percentage = 100 + ctx.scene.cycles.samples = 128 # add GPU acceleration if available if with_gpu: @@ -220,7 +221,8 @@ def combine( # # The bytes for the video come back to our local machine, and we write them to a file. # -# The whole rendering process (for six seconds of 1080p 60 FPS video) takes between five and ten minutes on 10 T4 GPUs. +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 T4 GPUs, +# with a per-frame latency of under 10 seconds, and about two minutes to run on 100 CPUs, with a per-frame latency of about 30 seconds. @app.local_entrypoint() From 54c379561c03fde020d43014721bd7a857fe875d Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 19:30:03 -0700 Subject: [PATCH 07/15] fixes newly-gated models in certain examples (#727) * remove extra line * adds instructions for handling gated model * handles gating for Mistral 7B in outlines example --- 06_gpu_and_ml/llm-serving/tgi_mixtral.py | 1 - 06_gpu_and_ml/llm-serving/vllm_inference.py | 9 +++++++++ 06_gpu_and_ml/llm-structured/outlines_generate.py | 14 ++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/tgi_mixtral.py b/06_gpu_and_ml/llm-serving/tgi_mixtral.py index c4313043c..5ca7da284 100644 --- a/06_gpu_and_ml/llm-serving/tgi_mixtral.py +++ b/06_gpu_and_ml/llm-serving/tgi_mixtral.py @@ -48,7 +48,6 @@ # We can use the included utilities to download the model weights (and convert to safetensors, if necessary) # as part of the image build. # -# # For this step to work on a [gated model](https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/gated_model_access) # like Mixtral 8x7B, the `HF_TOKEN` environment variable must be set. 
# diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index 9f6b23a6a..3f67aa908 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -36,6 +36,13 @@ # ### Download the weights # We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`. # +# For this step to work on a [gated model](https://huggingface.co/docs/hub/en/models-gated) +# like Mistral 7B, the `HF_TOKEN` environment variable must be set. +# +# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens) +# and accepting the [terms of use](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), +# head to the [secrets page](https://modal.com/secrets) to share it with Modal as `huggingface-secret`. +# # Tip: avoid using global variables in this function. # Changes to code outside this function will not be detected, and the download step will not re-run. def download_model_to_image(model_dir, model_name): @@ -48,6 +55,7 @@ def download_model_to_image(model_dir, model_name): model_name, local_dir=model_dir, ignore_patterns=["*.pt", "*.bin"], # Using safetensors + token=os.environ["HF_TOKEN"], ) move_cache() @@ -71,6 +79,7 @@ def download_model_to_image(model_dir, model_name): download_model_to_image, timeout=60 * 20, kwargs={"model_dir": MODEL_DIR, "model_name": MODEL_NAME}, + secrets=[modal.Secret.from_name("huggingface-secret")], ) ) diff --git a/06_gpu_and_ml/llm-structured/outlines_generate.py b/06_gpu_and_ml/llm-structured/outlines_generate.py index 19e7ae763..b54acadbf 100644 --- a/06_gpu_and_ml/llm-structured/outlines_generate.py +++ b/06_gpu_and_ml/llm-structured/outlines_generate.py @@ -24,7 +24,7 @@ # First, you'll want to build an image and install the relevant Python dependencies: # `outlines` and a Hugging Face inference stack. -from modal import App, Image, gpu +from modal import App, Image, Secret, gpu app = App( name="outlines-app" @@ -42,6 +42,13 @@ # Next, we download the Mistral-7B model from Hugging Face. # We do this as part of the definition of our Modal image so that # we don't need to download it every time our inference function is run. +# +# For this step to work on a [gated model](https://huggingface.co/docs/hub/en/models-gated) +# like Mistral 7B, the `HF_TOKEN` environment variable must be set. +# +# After [creating a HuggingFace access token](https://huggingface.co/settings/tokens) +# and accepting the [terms of use](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1), +# head to the [secrets page](https://modal.com/secrets) to share it with Modal as `huggingface-secret`. 
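# (If you prefer the CLI to the web UI, you can likely create the same Secret with a command along
# the lines of `modal secret create huggingface-secret HF_TOKEN=hf_...`; check
# `modal secret create --help` for the exact syntax in your client version.)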
def import_model(): @@ -50,7 +57,10 @@ def import_model(): outlines.models.transformers("mistralai/Mistral-7B-v0.1") -outlines_image = outlines_image.run_function(import_model) +outlines_image = outlines_image.run_function( + import_model, + secrets=[Secret.from_name("huggingface-secret")], +) # ## Define the schema From ad9346a7bf38272470ce20e1a3c6d4f578b2cd2c Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 20:09:25 -0700 Subject: [PATCH 08/15] fixes relative path between instructor_generate and utils (#728) --- 06_gpu_and_ml/llm-structured/instructor_generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/06_gpu_and_ml/llm-structured/instructor_generate.py b/06_gpu_and_ml/llm-structured/instructor_generate.py index 242f419d3..57ba0ed0d 100644 --- a/06_gpu_and_ml/llm-structured/instructor_generate.py +++ b/06_gpu_and_ml/llm-structured/instructor_generate.py @@ -206,7 +206,7 @@ def get_examples(silent=True): We use importlib to avoid the need to define the repo as a package.""" import importlib - examples_root = Path(__file__).parent.parent.parent.parent + examples_root = Path(__file__).parent.parent.parent spec = importlib.util.spec_from_file_location( "utils", f"{examples_root}/internal/utils.py" ) From 8ff22cc373be59f9331f55c7aff799a41b7c0360 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Thu, 2 May 2024 22:34:50 -0700 Subject: [PATCH 09/15] centers logo, positions prisms, nicer material, match CPU + GPU throughput (#729) --- 06_gpu_and_ml/blender/blender_video.py | 63 +++++++++++++++++++++----- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index ee1ed85a7..c07547691 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -6,9 +6,9 @@ # This example shows how you can render an animated 3D scene using # [Blender](https://www.blender.org/)'s Python interface. # -# You can run it on CPUs to scale out on one hundred of containers +# You can run it on CPUs to scale out on one hundred containers # or run it on GPUs to get higher throughput per node. -# Even with this simple scene, GPUs render 2x faster than CPUs. +# Even with this simple scene, GPUs render 10x faster than CPUs. 
# # The final render looks something like this: # @@ -60,7 +60,7 @@ @app.function( - gpu="T4" if WITH_GPU else None, + gpu="A10G" if WITH_GPU else None, concurrency_limit=10 if WITH_GPU else 100, # default limits on Modal free tier @@ -104,12 +104,33 @@ def render(angle: int = 0) -> bytes: # add the Modal logo: two neon green rectangular prisms iridescent_material = create_iridescent_material() - add_prism(ctx, (-1, 0, 0), 45, angle, iridescent_material) - add_prism(ctx, (3, 0, 0), -45, angle, iridescent_material) + add_prism(ctx, (-2.07, -1, 0), 45, angle, iridescent_material) + add_prism(ctx, (2.07, 1, 0), -45, angle, iridescent_material) - # set up the lighting and camera + # set up the lighting + # warm key light bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) - bpy.context.object.data.energy = 10 + key_light = bpy.context.object + key_light.data.energy = 100 + key_light.data.color = (1, 0.8, 0.5) # warm + + # tight, cool spotlight + bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) + spot_light = bpy.context.object + spot_light.data.energy = 500 + spot_light.data.spot_size = 0.5 + spot_light.data.color = (0.8, 0.8, 1) # cool + spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) + + # soft overall illumination + bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) + area_light = bpy.context.object + area_light.data.energy = 50 # softer + area_light.data.size = 5 # larger + area_light.data.color = (1, 1, 1) # neutral + area_light.rotation_euler = (3.14 / 2, 0, 3.14) + + # add camera bpy.ops.object.camera_add(location=(7, -7, 5)) scene.camera = bpy.context.object ctx.object.rotation_euler = (1.1, 0, 0.785) @@ -221,8 +242,8 @@ def combine( # # The bytes for the video come back to our local machine, and we write them to a file. # -# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 T4 GPUs, -# with a per-frame latency of under 10 seconds, and about two minutes to run on 100 CPUs, with a per-frame latency of about 30 seconds. +# The whole rendering process (for six seconds of 1080p 60 FPS video) takes about five minutes to run on 10 A10G GPUs, +# with a per-frame latency of about 10 seconds, and about five minutes to run on 100 CPUs, with a per-frame latency of about one minute. 
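# (Back-of-the-envelope: 360 frames at ~10 s per frame spread across 10 GPUs is roughly 360 s of
# pure rendering, and 360 frames at ~60 s per frame across 100 CPUs is roughly 216 s, so most of
# the quoted wall-clock time is render work, with the remainder going to container startup and the
# GIF-combination step.)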
@app.local_entrypoint() @@ -251,6 +272,11 @@ def add_prism(ctx, location, initial_rotation, angle, material): bpy.ops.mesh.primitive_cube_add(size=2, location=location) obj = ctx.object # the newly created object + bevel = obj.modifiers.new(name="Bevel", type="BEVEL") + bevel.width = 0.2 + bevel.segments = 5 + bevel.profile = 1.0 + # assign the material to the object obj.data.materials.append(material) @@ -278,13 +304,22 @@ def create_iridescent_material(): nodes.clear() - output_node = nodes.new(type="ShaderNodeOutputMaterial") + principled_node = nodes.new(type="ShaderNodeBsdfPrincipled") + emission_node = nodes.new(type="ShaderNodeEmission") layer_weight = nodes.new(type="ShaderNodeLayerWeight") color_ramp = nodes.new(type="ShaderNodeValToRGB") + mix_shader_node = nodes.new(type="ShaderNodeMixShader") + + output_node = nodes.new(type="ShaderNodeOutputMaterial") + + principled_node.inputs["Base Color"].default_value = (1, 1, 1, 1) + principled_node.inputs["Metallic"].default_value = 1.0 + principled_node.inputs["Roughness"].default_value = 0.5 + color_ramp.color_ramp.elements[0].color = (0, 0, 0, 1) - color_ramp.color_ramp.elements[1].color = (0, 1, 0, 1) + color_ramp.color_ramp.elements[1].color = (0, 0.5, 0, 1) layer_weight.inputs["Blend"].default_value = 0.4 links.new(layer_weight.outputs["Fresnel"], color_ramp.inputs["Fac"]) @@ -293,6 +328,10 @@ def create_iridescent_material(): emission_node.inputs["Strength"].default_value = 5.0 emission_node.inputs["Color"].default_value = (0.0, 1.0, 0.0, 1) - links.new(emission_node.outputs["Emission"], output_node.inputs["Surface"]) + links.new(emission_node.outputs["Emission"], mix_shader_node.inputs[1]) + links.new(principled_node.outputs["BSDF"], mix_shader_node.inputs[2]) + links.new(layer_weight.outputs["Fresnel"], mix_shader_node.inputs["Fac"]) + + links.new(mix_shader_node.outputs["Shader"], output_node.inputs["Surface"]) return mat From 03c44cb42a7440fc31ef00631f1a0cf0589161bb Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Fri, 3 May 2024 10:19:43 -0700 Subject: [PATCH 10/15] refactors lighting out of main render function (#730) --- 06_gpu_and_ml/blender/blender_video.py | 52 ++++++++++++++------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/06_gpu_and_ml/blender/blender_video.py b/06_gpu_and_ml/blender/blender_video.py index c07547691..c58c4cbf4 100644 --- a/06_gpu_and_ml/blender/blender_video.py +++ b/06_gpu_and_ml/blender/blender_video.py @@ -107,30 +107,8 @@ def render(angle: int = 0) -> bytes: add_prism(ctx, (-2.07, -1, 0), 45, angle, iridescent_material) add_prism(ctx, (2.07, 1, 0), -45, angle, iridescent_material) - # set up the lighting - # warm key light - bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) - key_light = bpy.context.object - key_light.data.energy = 100 - key_light.data.color = (1, 0.8, 0.5) # warm - - # tight, cool spotlight - bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) - spot_light = bpy.context.object - spot_light.data.energy = 500 - spot_light.data.spot_size = 0.5 - spot_light.data.color = (0.8, 0.8, 1) # cool - spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) - - # soft overall illumination - bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) - area_light = bpy.context.object - area_light.data.energy = 50 # softer - area_light.data.size = 5 # larger - area_light.data.color = (1, 1, 1) # neutral - area_light.rotation_euler = (3.14 / 2, 0, 3.14) - - # add camera + # add lighting and camera + add_lighting() 
bpy.ops.object.camera_add(location=(7, -7, 5)) scene.camera = bpy.context.object ctx.object.rotation_euler = (1.1, 0, 0.785) @@ -335,3 +313,29 @@ def create_iridescent_material(): links.new(mix_shader_node.outputs["Shader"], output_node.inputs["Surface"]) return mat + + +def add_lighting(): + import bpy + + # warm key light + bpy.ops.object.light_add(type="POINT", location=(5, 5, 5)) + key_light = bpy.context.object + key_light.data.energy = 100 + key_light.data.color = (1, 0.8, 0.5) # warm + + # tight, cool spotlight + bpy.ops.object.light_add(type="SPOT", radius=1, location=(4, 0, 6)) + spot_light = bpy.context.object + spot_light.data.energy = 500 + spot_light.data.spot_size = 0.5 + spot_light.data.color = (0.8, 0.8, 1) # cool + spot_light.rotation_euler = (3.14 / 4, 0, -3.14 / 4) + + # soft overall illumination + bpy.ops.object.light_add(type="AREA", radius=3, location=(-3, 3, 5)) + area_light = bpy.context.object + area_light.data.energy = 50 # softer + area_light.data.size = 5 # larger + area_light.data.color = (1, 1, 1) # neutral + area_light.rotation_euler = (3.14 / 2, 0, 3.14) From 5923bff5ab734633ae06b6ab4493838014794d06 Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Fri, 3 May 2024 10:36:41 -0700 Subject: [PATCH 11/15] adds rate limit handler from slack SDK (#731) --- 10_integrations/webscraper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/10_integrations/webscraper.py b/10_integrations/webscraper.py index e85135b08..d817b323e 100644 --- a/10_integrations/webscraper.py +++ b/10_integrations/webscraper.py @@ -39,7 +39,9 @@ async def get_links(url: str) -> set[str]: return set(links) -slack_sdk_image = modal.Image.debian_slim().pip_install("slack-sdk") +slack_sdk_image = modal.Image.debian_slim(python_version="3.10").pip_install( + "slack-sdk==3.27.1" +) @app.function( @@ -48,9 +50,13 @@ async def get_links(url: str) -> set[str]: ) def bot_token_msg(channel, message): import slack_sdk + from slack_sdk.http_retry.builtin_handlers import RateLimitErrorRetryHandler - print(f"Posting {message} to #{channel}") client = slack_sdk.WebClient(token=os.environ["SLACK_BOT_TOKEN"]) + rate_limit_handler = RateLimitErrorRetryHandler(max_retry_count=3) + client.retry_handlers.append(rate_limit_handler) + + print(f"Posting {message} to #{channel}") client.chat_postMessage(channel=channel, text=message) From e0b46deb9889d25832fb392307e9fdccb52d3528 Mon Sep 17 00:00:00 2001 From: Talha SARI Date: Sun, 5 May 2024 04:00:56 +0300 Subject: [PATCH 12/15] Fix whisper streaming (#733) * change endpoint name to transcribe to match example usage * add remote method to modal function usage * use aio to convert synch map into asynch * minor fix * change sleep to 0, fixed the curl giving error otherwise * correct old typo --------- Co-authored-by: Charles Frye --- 06_gpu_and_ml/openai_whisper/streaming/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/06_gpu_and_ml/openai_whisper/streaming/main.py b/06_gpu_and_ml/openai_whisper/streaming/main.py index cc8ae23b3..676d2b485 100644 --- a/06_gpu_and_ml/openai_whisper/streaming/main.py +++ b/06_gpu_and_ml/openai_whisper/streaming/main.py @@ -183,16 +183,16 @@ async def stream_whisper(audio_data: bytes): f.flush() segment_gen = split_silences(f.name) - for result in transcribe_segment.starmap( + async for result in transcribe_segment.starmap( segment_gen, kwargs=dict(audio_data=audio_data, model="base.en") ): - # Must cooperatively yeild here otherwise `StreamingResponse` will not iteratively 
return stream parts. - # see: https://github.com/python/asyncio/issues/284 - await asyncio.sleep(0.5) + # Must cooperatively yield here otherwise `StreamingResponse` will not iteratively return stream parts. + # see: https://github.com/python/asyncio/issues/284#issuecomment-154162668 + await asyncio.sleep(0) yield result["text"] -@web_app.get("/") +@web_app.get("/transcribe") async def transcribe(url: str): """ Usage: @@ -213,7 +213,7 @@ async def transcribe(url: str): print(f"downloading {url}") try: - audio_data = download_mp3_from_youtube(url) + audio_data = download_mp3_from_youtube.remote(url) except pytube.exceptions.RegexMatchError: raise HTTPException( status_code=422, detail=f"Could not process url {url}" From a238c9758583ccaeccdcbc217dddee75651cf26e Mon Sep 17 00:00:00 2001 From: bofeng huang Date: Sun, 5 May 2024 03:08:48 +0200 Subject: [PATCH 13/15] Fix vLLM template (#734) * Update vllm_mixtral.py * Fix template * Fix template --- 06_gpu_and_ml/llm-serving/vllm_gemma.py | 2 +- 06_gpu_and_ml/llm-serving/vllm_inference.py | 4 ++-- 06_gpu_and_ml/llm-serving/vllm_mixtral.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/06_gpu_and_ml/llm-serving/vllm_gemma.py b/06_gpu_and_ml/llm-serving/vllm_gemma.py index 2a3545961..634c6d47a 100644 --- a/06_gpu_and_ml/llm-serving/vllm_gemma.py +++ b/06_gpu_and_ml/llm-serving/vllm_gemma.py @@ -121,7 +121,7 @@ class Model: @modal.enter() def load(self): self.template = ( - "start_of_turn>user\n{user}\nmodel" + "user\n{user}\nmodel\n" ) # Load the model. Tip: Some models, like MPT, may require `trust_remote_code=true`. diff --git a/06_gpu_and_ml/llm-serving/vllm_inference.py b/06_gpu_and_ml/llm-serving/vllm_inference.py index 3f67aa908..c24e345db 100644 --- a/06_gpu_and_ml/llm-serving/vllm_inference.py +++ b/06_gpu_and_ml/llm-serving/vllm_inference.py @@ -109,11 +109,11 @@ class Model: def load_model(self): # Tip: models that are not fully implemented by Hugging Face may require `trust_remote_code=true`. self.llm = vllm.LLM(MODEL_DIR, tensor_parallel_size=GPU_CONFIG.count) - self.template = """[INST] <> + self.template = """[INST] <> {system} <> -{user} [/INST] """ +{user} [/INST]""" @modal.method() def generate(self, user_questions): diff --git a/06_gpu_and_ml/llm-serving/vllm_mixtral.py b/06_gpu_and_ml/llm-serving/vllm_mixtral.py index 57618ae28..eb236b9cb 100644 --- a/06_gpu_and_ml/llm-serving/vllm_mixtral.py +++ b/06_gpu_and_ml/llm-serving/vllm_mixtral.py @@ -121,7 +121,7 @@ def start_engine(self): disable_log_stats=True, # disable logging so we can stream tokens disable_log_requests=True, ) - self.template = " [INST] {user} [/INST] " + self.template = "[INST] {user} [/INST]" # this can take some time! 
self.engine = AsyncLLMEngine.from_engine_args(engine_args) From 2ac53ebc35b38e30d2288efb3cecaf41f19c8733 Mon Sep 17 00:00:00 2001 From: Akshat Bubna Date: Mon, 6 May 2024 00:10:06 -0400 Subject: [PATCH 14/15] install numpy explicitly in wikipedia example (#736) --- 06_gpu_and_ml/embeddings/wikipedia/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/06_gpu_and_ml/embeddings/wikipedia/main.py b/06_gpu_and_ml/embeddings/wikipedia/main.py index 95d898c22..0c3ffb5cc 100644 --- a/06_gpu_and_ml/embeddings/wikipedia/main.py +++ b/06_gpu_and_ml/embeddings/wikipedia/main.py @@ -78,7 +78,7 @@ def spawn_server() -> subprocess.Popen: add_python="3.10", ) .dockerfile_commands("ENTRYPOINT []") - .pip_install("httpx") + .pip_install("httpx", "numpy") ) with tei_image.imports(): From 75d6c997ecedb953e81b5b00cadf677eb96aed9f Mon Sep 17 00:00:00 2001 From: Charles Frye Date: Mon, 6 May 2024 10:44:15 -0700 Subject: [PATCH 15/15] Run examples on change (#735) * inital draft of action to test monitoring * cleans up monitoring workflow * more complete draft of monitoring test action * removes draft monitoring workflow, reorganizes existing workflows * update internal development requirements * turn off dry run now that we're going back to prod * reorganize environment setup * WIP version of example execution * adds .secrets file from act * handles modal serve, proper system exit, drops extra script * updates actions, better environment setup * handle PRs with no changed files * add back dev dependencies for jupytext and pydantic in deploy * reverts changes to typechecking to avoid slowdown --- .github/actions/setup/action.yml | 36 ++++++++++++++ .github/workflows/cd.yml | 8 +--- .github/workflows/check.yml | 27 +++-------- .github/workflows/run-examples.yml | 76 ++++++++++++++++++++++++++++++ .gitignore | 3 ++ internal/requirements.txt | 7 ++- internal/run_example.py | 50 ++++++++++++++++++++ 7 files changed, 179 insertions(+), 28 deletions(-) create mode 100644 .github/actions/setup/action.yml create mode 100644 .github/workflows/run-examples.yml create mode 100644 internal/run_example.py diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml new file mode 100644 index 000000000..0312efef1 --- /dev/null +++ b/.github/actions/setup/action.yml @@ -0,0 +1,36 @@ +name: setup + +description: Set up a Python environment for the examples. 
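# This composite action is consumed by the workflows further down in this patch
# (cd.yml, check.yml, and run-examples.yml) via `uses: ./.github/actions/setup`.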
+ +inputs: + version: + description: Which Python version to install + required: false + default: "3.11" + devDependencies: + description: Whether to skip dependencies + required: false + default: "no-skip" + +runs: + using: composite + steps: + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.version }} + + - name: Install base packages + shell: bash + run: | + pip install uv + uv pip install --system setuptools wheel + + - name: Install development Python packages + if: ${{ inputs.devDependencies != 'skip' }} + shell: bash + run: uv pip install --system -r internal/requirements.txt + + - name: Install the modal client + shell: bash + run: uv pip install --system modal diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 50ab209a0..451c08f46 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -17,13 +17,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.9" - - - name: Install Modal client package and jupytext - run: pip install modal-client jupytext pydantic~=1.10 + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Run deployment script run: | diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 389875d8d..9f058e4c0 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -13,13 +13,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - # keep version here in sync with .pre-commit-config.yaml and other modal repos - - run: pip install ruff==0.2.1 + fetch-depth: 1 + - uses: ./.github/actions/setup - run: ruff check @@ -31,16 +27,14 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - name: Install NbConvert - run: pip install jupyter nbconvert + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Check notebooks are cleaned run: | jupyter nbconvert --clear-output --inplace 11_notebooks/*.ipynb - git diff --quiet && git diff --cached --quiet || exit 1 + git diff --quiet 11_notebooks/*.ipynb && git diff --cached --quiet 11_notebooks/*.ipynb || exit 1 pytest: name: Pytest @@ -48,16 +42,9 @@ jobs: steps: - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 with: - python-version: "3.11" - - - name: Install dev dependencies - run: pip install pytest jupytext pydantic~=1.10 - - - name: Install the Modal client - run: pip install modal-client + fetch-depth: 1 + - uses: ./.github/actions/setup - name: Run run: pytest -v . 
diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml new file mode 100644 index 000000000..bf27d0adb --- /dev/null +++ b/.github/workflows/run-examples.yml @@ -0,0 +1,76 @@ +name: Run + +on: + pull_request: + branches: + - main + paths: + - "**.py" + push: + branches: + - main + paths: + - "**.py" + workflow_dispatch: + +# Cancel previous runs of the same PR but do not cancel previous runs on main +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +env: + TERM: linux + TERMINFO: /etc/terminfo + MODAL_TOKEN_ID: ${{ secrets.MODAL_MODAL_LABS_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_MODAL_LABS_TOKEN_SECRET }} + MODAL_ENVIRONMENT: main + +jobs: + # Output all changed files in a JSON format compatible with GitHub Actions job matrices + diff-matrix: + name: Generate matrix of changed examples + runs-on: ubuntu-20.04 + outputs: + matrix: ${{ steps.diff.outputs.all_changed_files }} + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Find changed examples + id: diff + uses: tj-actions/changed-files@v44 + with: + files: "**.py" + files_ignore: "internal/**,misc/**" + matrix: true + + - name: List all changed examples + run: echo '${{ steps.diff.outputs.all_changed_files }}' + + # Run each changed example, using the output of the previous step as a job matrix + run-changed: + name: Run changed example + needs: [diff-matrix] + if: + ${{ needs.diff-matrix.outputs.matrix != '[]' && + needs.diff-matrix.outputs.matrix != '' }} + runs-on: ubuntu-20.04 + strategy: + matrix: + file: ${{ fromJson(needs.diff-matrix.outputs.matrix) }} + fail-fast: false + + steps: + - name: Checkout Repository + uses: actions/checkout@v3 + with: + fetch-depth: 1 + - uses: ./.github/actions/setup + + - name: Run example + run: | + echo "Running ${{ matrix.file }}" + stem=$(basename "${{ matrix.file }}" .py) + python3 -m internal.run_example $stem || exit $? diff --git a/.gitignore b/.gitignore index 53fe8b69e..3218fc050 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ venv .venv + +# secrets file for act, tool for local GitHub Actions testing +.secrets diff --git a/internal/requirements.txt b/internal/requirements.txt index 42bf85702..5c5120ec8 100644 --- a/internal/requirements.txt +++ b/internal/requirements.txt @@ -1,5 +1,8 @@ -modal pytest +jupyter +ipython +nbconvert jupytext~=1.16.1 pydantic~=1.10.14 -mypy==0.950 +mypy==1.2.0 +ruff==0.2.1 diff --git a/internal/run_example.py b/internal/run_example.py new file mode 100644 index 000000000..3b06a3cb0 --- /dev/null +++ b/internal/run_example.py @@ -0,0 +1,50 @@ +import os +import subprocess +import sys +import time + +from . 
import utils + +MINUTES = 60 +TIMEOUT = 12 * MINUTES + + +def run_script(example): + t0 = time.time() + + try: + print(f"cli args: {example.cli_args}") + process = subprocess.run( + example.cli_args, + env=os.environ | {"MODAL_SERVE_TIMEOUT": "5.0"}, + timeout=TIMEOUT, + ) + total_time = time.time() - t0 + if process.returncode == 0: + print(f"Success after {total_time:.2f}s :)") + else: + print( + f"Failed after {total_time:.2f}s with return code {process.returncode} :(" + ) + + returncode = process.returncode + + except subprocess.TimeoutExpired: + print(f"Past timeout of {TIMEOUT}s :(") + returncode = 999 + + return returncode + + +def run_single_example(stem): + examples = utils.get_examples() + for example in examples: + if stem == example.stem: + return run_script(example) + else: + print(f"Could not find example name {stem}") + return 0 + + +if __name__ == "__main__": + sys.exit(run_single_example(sys.argv[1]))
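# Example invocation, mirroring the "Run example" step in run-examples.yml above:
#
#   python3 -m internal.run_example vllm_inference
#
# where the argument is the stem (filename without .py) of the example to run.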