diff --git a/tests/lora/test_lora_layers_peft.py b/tests/lora/test_lora_layers_peft.py index 180d45b6803e..38e55b9ed7b4 100644 --- a/tests/lora/test_lora_layers_peft.py +++ b/tests/lora/test_lora_layers_peft.py @@ -115,9 +115,12 @@ def get_dummy_components(self, scheduler_cls=None): torch.manual_seed(0) unet = UNet2DConditionModel(**self.unet_kwargs) + scheduler = scheduler_cls(**self.scheduler_kwargs) + torch.manual_seed(0) vae = AutoencoderKL(**self.vae_kwargs) + text_encoder = CLIPTextModel.from_pretrained("peft-internal-testing/tiny-clip-text-2") tokenizer = CLIPTokenizer.from_pretrained("peft-internal-testing/tiny-clip-text-2") @@ -1402,6 +1405,35 @@ class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase): @slow @require_torch_gpu class LoraIntegrationTests(PeftLoraLoaderMixinTests, unittest.TestCase): + pipeline_class = StableDiffusionPipeline + scheduler_cls = DDIMScheduler + scheduler_kwargs = { + "beta_start": 0.00085, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "clip_sample": False, + "set_alpha_to_one": False, + "steps_offset": 1, + } + unet_kwargs = { + "block_out_channels": (32, 64), + "layers_per_block": 2, + "sample_size": 32, + "in_channels": 4, + "out_channels": 4, + "down_block_types": ("DownBlock2D", "CrossAttnDownBlock2D"), + "up_block_types": ("CrossAttnUpBlock2D", "UpBlock2D"), + "cross_attention_dim": 32, + } + vae_kwargs = { + "block_out_channels": [32, 64], + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], + "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], + "latent_channels": 4, + } + def tearDown(self): import gc @@ -1655,6 +1687,42 @@ def test_load_unload_load_kohya_lora(self): @slow @require_torch_gpu class LoraSDXLIntegrationTests(PeftLoraLoaderMixinTests, unittest.TestCase): + has_two_text_encoders = True + pipeline_class = StableDiffusionXLPipeline + scheduler_cls = EulerDiscreteScheduler + scheduler_kwargs = { + "beta_start": 0.00085, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "timestep_spacing": "leading", + "steps_offset": 1, + } + unet_kwargs = { + "block_out_channels": (32, 64), + "layers_per_block": 2, + "sample_size": 32, + "in_channels": 4, + "out_channels": 4, + "down_block_types": ("DownBlock2D", "CrossAttnDownBlock2D"), + "up_block_types": ("CrossAttnUpBlock2D", "UpBlock2D"), + "attention_head_dim": (2, 4), + "use_linear_projection": True, + "addition_embed_type": "text_time", + "addition_time_embed_dim": 8, + "transformer_layers_per_block": (1, 2), + "projection_class_embeddings_input_dim": 80, # 6 * 8 + 32 + "cross_attention_dim": 64, + } + vae_kwargs = { + "block_out_channels": [32, 64], + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], + "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], + "latent_channels": 4, + "sample_size": 128, + } + def tearDown(self): import gc