diff --git a/examples/dreambooth/train_dreambooth_lora_sana.py b/examples/dreambooth/train_dreambooth_lora_sana.py index 7bec9c799cae..7956efb4471e 100644 --- a/examples/dreambooth/train_dreambooth_lora_sana.py +++ b/examples/dreambooth/train_dreambooth_lora_sana.py @@ -158,6 +158,9 @@ def log_validation( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" f" {args.validation_prompt}." ) + if args.enable_vae_tiling: + pipeline.vae.enable_tiling(tile_sample_min_height=1024, tile_sample_stride_width=1024) + pipeline.text_encoder = pipeline.text_encoder.to(torch.bfloat16) pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) @@ -597,6 +600,7 @@ def parse_args(input_args=None): help="Whether to offload the VAE and the text encoder to CPU when they are not used.", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--enable_vae_tiling", action="store_true", help="Enabla vae tiling in log validation") if input_args is not None: args = parser.parse_args(input_args) diff --git a/src/diffusers/pipelines/mochi/pipeline_mochi.py b/src/diffusers/pipelines/mochi/pipeline_mochi.py index 435470064633..a3028c50d8b7 100644 --- a/src/diffusers/pipelines/mochi/pipeline_mochi.py +++ b/src/diffusers/pipelines/mochi/pipeline_mochi.py @@ -21,7 +21,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...loaders import Mochi1LoraLoaderMixin -from ...models.autoencoders import AutoencoderKL +from ...models.autoencoders import AutoencoderKLMochi from ...models.transformers import MochiTransformer3DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( @@ -151,8 +151,8 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin): Conditional Transformer architecture to denoise the encoded video latents. scheduler ([`FlowMatchEulerDiscreteScheduler`]): A scheduler to be used in combination with `transformer` to denoise the encoded image latents. - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + vae ([`AutoencoderKLMochi`]): + Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. text_encoder ([`T5EncoderModel`]): [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant. @@ -171,7 +171,7 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin): def __init__( self, scheduler: FlowMatchEulerDiscreteScheduler, - vae: AutoencoderKL, + vae: AutoencoderKLMochi, text_encoder: T5EncoderModel, tokenizer: T5TokenizerFast, transformer: MochiTransformer3DModel, diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index b223c71cb5ce..2661196afc70 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -18,6 +18,7 @@ import numpy as np import pytest +from huggingface_hub import hf_hub_download from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel, logging from diffusers.utils import is_accelerate_version @@ -30,6 +31,7 @@ numpy_cosine_similarity_distance, require_accelerate, require_bitsandbytes_version_greater, + require_peft_version_greater, require_torch, require_torch_gpu, require_transformers_version_greater, @@ -509,6 +511,29 @@ def test_quality(self): max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) self.assertTrue(max_diff < 1e-3) + @require_peft_version_greater("0.14.0") + def test_lora_loading(self): + self.pipeline_8bit.load_lora_weights( + hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors"), adapter_name="hyper-sd" + ) + self.pipeline_8bit.set_adapters("hyper-sd", adapter_weights=0.125) + + output = self.pipeline_8bit( + prompt=self.prompt, + height=256, + width=256, + max_sequence_length=64, + output_type="np", + num_inference_steps=8, + generator=torch.manual_seed(42), + ).images + out_slice = output[0, -3:, -3:, -1].flatten() + + expected_slice = np.array([0.3916, 0.3916, 0.3887, 0.4243, 0.4155, 0.4233, 0.4570, 0.4531, 0.4248]) + + max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) + self.assertTrue(max_diff < 1e-3) + @slow class BaseBnb8bitSerializationTests(Base8bitTests):