Group offloading with cuda stream prefetching #10516

Merged: 2 commits merged into groupwise-offloading from cuda-stream-group-offloading on Jan 11, 2025

Conversation

@a-r-r-o-w (Member) commented on Jan 10, 2025

See #10503 for more details.

Code
import argparse
import gc
import os
import time

# os.environ["TORCH_LOGS"] = "+recompiles_verbose,guards"
os.environ["TORCH_LOGS"] = "+recompiles_verbose"

import torch
import torch._dynamo
import torch._dynamo.utils
from diffusers import LTXPipeline
from diffusers.utils import export_to_video
from diffusers.hooks import apply_group_offloading

torch._dynamo.config.cache_size_limit = 4

def cleanup():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


def benchmark_fn(f, *args, **kwargs):
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    output = f(*args, **kwargs)
    end.record()
    torch.cuda.synchronize()
    elapsed_time = round(start.elapsed_time(end) / 1000, 3)

    return elapsed_time, output


def normal():
    pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-diffusers", torch_dtype=torch.bfloat16)
    pipe.to("cuda")
    
    cleanup()
    print(f"Model memory: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")

    prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
    negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

    t1 = time.time()
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=768,
        height=512,
        num_frames=161,
        num_inference_steps=50,
    ).frames[0]
    torch.cuda.synchronize()
    t2 = time.time()
    export_to_video(video, "output.mp4", fps=24)
    
    print(f"Inference time: {t2 - t1:.2f} s")
    print(f"Inference memory: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")


def model_cpu_offload(compile: bool = False):
    pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-diffusers", torch_dtype=torch.bfloat16)
    pipe.enable_model_cpu_offload()

    if compile:
        pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")
    
    cleanup()
    print(f"Model memory: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")

    prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
    negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

    t1 = time.time()
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=768,
        height=512,
        num_frames=161,
        num_inference_steps=50,
    ).frames[0]
    torch.cuda.synchronize()
    t2 = time.time()
    export_to_video(video, "output.mp4", fps=24)
    
    print(f"Inference time: {t2 - t1:.2f} s")
    print(f"Inference memory: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")


def sequential_cpu_offload(compile: bool = False):
    pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-diffusers", torch_dtype=torch.bfloat16)
    pipe.enable_sequential_cpu_offload()

    if compile:
        pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")
    
    cleanup()
    print(f"Model memory: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")

    prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
    negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

    t1 = time.time()
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=768,
        height=512,
        num_frames=161,
        num_inference_steps=50,
    ).frames[0]
    torch.cuda.synchronize()
    t2 = time.time()
    export_to_video(video, "output.mp4", fps=24)
    
    print(f"Inference time: {t2 - t1:.2f} s")
    print(f"Inference memory: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")


def group_offloading(offload_group_patterns: str = "diffusers_block", num_blocks_per_group: int = 4, compile: bool = False):
    pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-diffusers", torch_dtype=torch.bfloat16)
    pipe.text_encoder.to("cuda")
    pipe.vae.to("cuda")
    apply_group_offloading(
        module=pipe.transformer,
        offload_group_patterns=offload_group_patterns,
        num_blocks_per_group=num_blocks_per_group,
        offload_device=torch.device("cpu"),
        onload_device=torch.device("cuda"),
        force_offload=True,
        non_blocking=True,
        # cuda_stream=True,
    )
    if compile:
        pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")
        # pipe.transformer = torch.compile(pipe.transformer, backend="eager")
    
    cleanup()
    print(f"Model memory: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")

    prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
    negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

    t1 = time.time()
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=768,
        height=512,
        num_frames=161,
        num_inference_steps=50,
    ).frames[0]
    torch.cuda.synchronize()
    t2 = time.time()
    export_to_video(video, "output.mp4", fps=24)

    compile_times = torch._dynamo.utils.compile_times()
    print(f"Compile times: {compile_times}")
    
    print(f"Inference time: {t2 - t1:.2f} s")
    print(f"Inference memory: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--offload_type", type=str, default="normal", choices=["normal", "model", "sequential", "group"])
    parser.add_argument("--offload_group_patterns", type=str, default="diffusers_block")
    parser.add_argument("--num_blocks_per_group", type=int, default=4)
    parser.add_argument("--compile", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()

    if args.offload_type == "normal":
        normal()
    elif args.offload_type == "model":
        model_cpu_offload(args.compile)
    elif args.offload_type == "sequential":
        sequential_cpu_offload(args.compile)
    else:
        group_offloading(args.offload_group_patterns, args.num_blocks_per_group, args.compile)

Results for:

python3 test.py --offload_type group --offload_group_patterns diffusers_block --num_blocks_per_group 4

Without cuda stream prefetching:

Model memory: 9.70 GB
Inference time: 85.36 s
Inference memory: 21.25 GB

With cuda stream prefetching:

# 4 blocks per group
Model memory: 10.26 GB
Inference time: 47.71 s
Inference memory: 21.89 GB

# 1 block per group
Model memory: 9.84 GB
Inference time: 46.15 s
Inference memory: 21.04 GB

This is currently not compatible with torch.compile and triggers several recompilations.
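
For anyone following the hooks, here is a minimal sketch of the stream-prefetching pattern, assuming a hypothetical list of block groups (an illustration of the idea, not the actual diffusers implementation; a real version also pins the CPU copies so the transfers are truly asynchronous, and offloads each group back to CPU after it runs):

import torch
import torch.nn as nn

transfer_stream = torch.cuda.Stream()

def onload(group: nn.Module) -> None:
    # Queue the host-to-device copies on a side stream so they can overlap
    # with compute running on the default stream.
    with torch.cuda.stream(transfer_stream):
        for param in group.parameters():
            param.data = param.data.to("cuda", non_blocking=True)

def run_with_prefetch(groups: list[nn.Module], hidden_states: torch.Tensor) -> torch.Tensor:
    onload(groups[0])
    for i, group in enumerate(groups):
        # Block the default stream until this group's weights have arrived.
        torch.cuda.current_stream().wait_stream(transfer_stream)
        # Kick off the copy of the next group while this one computes.
        if i + 1 < len(groups):
            onload(groups[i + 1])
        hidden_states = group(hidden_states)
    return hidden_states

The speedup in the numbers above comes from the fact that the default stream only has to wait when a copy has not already finished in the background.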

Reading:

cc @gau-nernst because the original ideas for layer prefetching come from his implementation

cc @DN6 @yiyixuxu

@HuggingFaceDocBuilderDev

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.

Comment on lines +204 to +207
if len(module_groups) > 1:
    # Assign the first module_group as the next_group for the last module_group
    hook_registry = HookRegistry.check_if_exists_or_initialize(module_groups[-1].onload_leader)
    hook_registry.hooks["group_offloading"].next_group = module_groups[0]
@a-r-r-o-w (Member, Author) commented on Jan 10, 2025
This is a bit hacky for the moment, just so that I could get it running quickly without putting much thought into it. Will try to improve soon.
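
For context, the effect of this wrap-around link is that the first group's weights start copying back to the GPU while the last group is still computing, so the next forward pass (i.e. the next denoising step) does not begin with a blocking transfer. In terms of the hypothetical sketch above, it amounts to replacing the prefetch condition with something like:

# Prefetch with wrap-around (hypothetical continuation of the sketch above):
# while the last group runs, start copying the first group again so the next
# forward pass begins with its weights already on the GPU.
next_index = (i + 1) % len(groups)
onload(groups[next_index])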

@a-r-r-o-w (Member, Author)
To avoid confusion: the numbers in this PR description are not a 1:1 comparison with those in the group offloading PR.

In that PR, the benchmark only times, and notes the memory requirement for, the transformer forward pass (prompt embeddings are precomputed and latents are decoded separately), whereas the numbers here include both the prompt-embedding and decoding steps. I will report a fair comparison shortly after completing the benchmark script.
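
One rough way to get a transformer-only number without the full benchmark script would be to wrap the transformer's forward method and accumulate its time, so text encoding and VAE decoding are excluded (a hedged sketch with hypothetical names, not the benchmark used in the other PR):

import time
import torch

transformer_seconds = 0.0
_original_forward = pipe.transformer.forward

def _timed_forward(*args, **kwargs):
    # Synchronize around the call so the measured interval covers the actual
    # GPU work of this forward pass (this adds some overhead of its own).
    global transformer_seconds
    torch.cuda.synchronize()
    start = time.time()
    out = _original_forward(*args, **kwargs)
    torch.cuda.synchronize()
    transformer_seconds += time.time() - start
    return out

pipe.transformer.forward = _timed_forward
# ... run the pipe(...) call from the script above ...
print(f"Transformer-only time: {transformer_seconds:.2f} s")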

@a-r-r-o-w a-r-r-o-w merged commit d579037 into groupwise-offloading Jan 11, 2025
2 checks passed
@a-r-r-o-w a-r-r-o-w deleted the cuda-stream-group-offloading branch January 11, 2025 07:16