"""
This script demonstrates how to generate an image using the CogView3-Plus-3B model with the Hugging Face `diffusers` pipeline.
It showcases memory-efficient techniques like model offloading, VAE slicing, and tiling to reduce memory consumption during inference.
The prompt describes an image to be generated by the model, and the final image is saved to disk.
Running the Script:
To run the script, use the following command with appropriate arguments:
```bash
python cli_demo.py --prompt "A beautiful sunset over a mountain" --width 1024 --height 1024
```
Additional options are available to specify the model path, guidance scale, number of inference steps, image generation type, and output paths.
"""

import argparse
import os

import torch
from diffusers import CogView3PlusPipeline


def generate_image(
    prompt, model_path, guidance_scale, num_images_per_prompt, num_inference_steps, width, height, output_path, dtype
):
    # Load the pre-trained model with the specified precision
    pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype)

    # Enable CPU offloading to free up GPU memory when layers are not actively being used
    pipe.enable_model_cpu_offload()

    # Enable VAE slicing and tiling for memory optimization
    pipe.vae.enable_slicing()
    pipe.vae.enable_tiling()
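    # Note: if memory is still tight, diffusers also provides
    # pipe.enable_sequential_cpu_offload() as a slower but even more
    # memory-frugal alternative to model-level offloading.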

    # Generate the image(s) based on the prompt
    images = pipe(
        prompt=prompt,
        guidance_scale=guidance_scale,
        num_images_per_prompt=num_images_per_prompt,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
    ).images

    # Save the generated image(s) to the local file system; when several images
    # are requested, an index is appended before the file extension
    for i, image in enumerate(images):
        if len(images) == 1:
            path = output_path
        else:
            root, ext = os.path.splitext(output_path)
            path = f"{root}_{i}{ext}"
        image.save(path)
        print(f"Image saved to {path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate an image using the CogView3-Plus-3B model.")

    # Define arguments for prompt, model path, etc.
    parser.add_argument("--prompt", type=str, required=True, help="The text description for generating the image.")
    parser.add_argument(
        "--model_path", type=str, default="THUDM/CogView3-Plus-3B", help="Path to the pre-trained model."
    )
    parser.add_argument(
        "--guidance_scale", type=float, default=7.0, help="The guidance scale for classifier-free guidance."
    )
    parser.add_argument(
        "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt."
    )
    parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of denoising steps for inference.")
    parser.add_argument("--width", type=int, default=1024, help="Width of the generated image.")
    parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.")
    parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.")
    parser.add_argument(
        "--dtype", type=str, default="bfloat16", choices=["bfloat16", "float16"], help="Precision type."
    )

    # Parse the arguments
    args = parser.parse_args()

    # Convert the dtype argument to the corresponding torch dtype
    dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16
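    # Note: bfloat16 is the default used by this demo; float16 is mainly useful
    # on GPUs without native bfloat16 support (e.g. pre-Ampere NVIDIA cards).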

    # Call the function to generate the image
    generate_image(
        prompt=args.prompt,
        model_path=args.model_path,
        guidance_scale=args.guidance_scale,
        num_images_per_prompt=args.num_images_per_prompt,
        num_inference_steps=args.num_inference_steps,
        width=args.width,
        height=args.height,
        output_path=args.output_path,
        dtype=dtype,
    )