"""
This script demonstrates how to generate an image using the CogView3-Plus-3B model with the Hugging Face `diffusers` pipeline.
It showcases memory-efficient techniques like model offloading, VAE slicing, and tiling to reduce memory consumption during inference.
The prompt describes an image to be generated by the model, and the final image is saved to disk.
Running the Script:
To run the script, use the following command with appropriate arguments:
```bash
python cli_demo.py --prompt "A beautiful sunset over a mountain" --width 1024 --height 1024
```
Additional options are available to specify the model path, guidance scale, number of inference steps, image generation type, and output paths.
"""

import argparse
import os

import torch
from diffusers import CogView3PlusPipeline


def generate_image(
    prompt, model_path, guidance_scale, num_images_per_prompt, num_inference_steps, width, height, output_path, dtype
):
    # Load the pre-trained model with the specified precision
    pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype)

    # Enable CPU offloading to free up GPU memory when layers are not actively being used
    pipe.enable_model_cpu_offload()

    # Enable VAE slicing and tiling for memory optimization
    pipe.vae.enable_slicing()
    pipe.vae.enable_tiling()
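    # Note: if memory is still tight, diffusers also provides
    # pipe.enable_sequential_cpu_offload() as a slower but even more
    # memory-frugal alternative to model-level offloading.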

    # Generate the image(s) based on the prompt
    images = pipe(
        prompt=prompt,
        guidance_scale=guidance_scale,
        num_images_per_prompt=num_images_per_prompt,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
    ).images

    # Save the generated image(s) to the local file system; when several images
    # are requested, an index is appended before the file extension
    for i, image in enumerate(images):
        if len(images) == 1:
            path = output_path
        else:
            root, ext = os.path.splitext(output_path)
            path = f"{root}_{i}{ext}"
        image.save(path)
        print(f"Image saved to {path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate an image using the CogView3-Plus-3B model.")

    # Define arguments for prompt, model path, etc.
    parser.add_argument("--prompt", type=str, required=True, help="The text description for generating the image.")
    parser.add_argument(
        "--model_path", type=str, default="THUDM/CogView3-Plus-3B", help="Path to the pre-trained model."
    )
    parser.add_argument(
        "--guidance_scale", type=float, default=7.0, help="The guidance scale for classifier-free guidance."
    )
    parser.add_argument(
        "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt."
    )
    parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of denoising steps for inference.")
    parser.add_argument("--width", type=int, default=1024, help="Width of the generated image.")
    parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.")
    parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.")
    parser.add_argument(
        "--dtype", type=str, default="bfloat16", choices=["bfloat16", "float16"], help="Precision type."
    )

    # Parse the arguments
    args = parser.parse_args()

    # Convert the dtype argument to the corresponding torch dtype
    dtype = torch.bfloat16 if args.dtype == "bfloat16" else torch.float16
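    # Note: bfloat16 is the default used by this demo; float16 is mainly useful
    # on GPUs without native bfloat16 support (e.g. pre-Ampere NVIDIA cards).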

    # Call the function to generate the image
    generate_image(
        prompt=args.prompt,
        model_path=args.model_path,
        guidance_scale=args.guidance_scale,
        num_images_per_prompt=args.num_images_per_prompt,
        num_inference_steps=args.num_inference_steps,
        width=args.width,
        height=args.height,
        output_path=args.output_path,
        dtype=dtype,
    )