diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index cc0cd3da0218..678a0591ae3b 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -83,7 +83,7 @@ jobs: python utils/print_env.py - name: PyTorch CUDA checkpoint tests on Ubuntu env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | @@ -137,7 +137,7 @@ jobs: - name: Run PyTorch CUDA tests env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms CUBLAS_WORKSPACE_CONFIG: :16:8 run: | diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md index fd2c07e59f3f..f6e524af88db 100644 --- a/docs/source/en/api/pipelines/flux.md +++ b/docs/source/en/api/pipelines/flux.md @@ -367,7 +367,7 @@ transformer_8bit = FluxTransformer2DModel.from_pretrained( pipeline = FluxPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", - text_encoder=text_encoder_8bit, + text_encoder_2=text_encoder_8bit, transformer=transformer_8bit, torch_dtype=torch.float16, device_map="balanced", diff --git a/docs/source/en/api/pipelines/hunyuan_video.md b/docs/source/en/api/pipelines/hunyuan_video.md index df43c7f8568d..5148a97b754a 100644 --- a/docs/source/en/api/pipelines/hunyuan_video.md +++ b/docs/source/en/api/pipelines/hunyuan_video.md @@ -16,7 +16,7 @@ [HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent. -*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. 
In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/Tencent/HunyuanVideo).* +*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. 
HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. 
The code is publicly available at [this https URL](https://github.com/tencent/HunyuanVideo).* @@ -45,14 +45,14 @@ from diffusers.utils import export_to_video quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True) transformer_8bit = HunyuanVideoTransformer3DModel.from_pretrained( - "tencent/HunyuanVideo", + "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", quantization_config=quant_config, - torch_dtype=torch.float16, + torch_dtype=torch.bfloat16, ) pipeline = HunyuanVideoPipeline.from_pretrained( - "tencent/HunyuanVideo", + "hunyuanvideo-community/HunyuanVideo", transformer=transformer_8bit, torch_dtype=torch.float16, device_map="balanced", diff --git a/docs/source/en/using-diffusers/other-formats.md b/docs/source/en/using-diffusers/other-formats.md index 24ac9ced84ce..e662e3940a38 100644 --- a/docs/source/en/using-diffusers/other-formats.md +++ b/docs/source/en/using-diffusers/other-formats.md @@ -240,6 +240,47 @@ Benefits of using a single-file layout include: 1. Easy compatibility with diffusion interfaces such as [ComfyUI](https://github.com/comfyanonymous/ComfyUI) or [Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) which commonly use a single-file layout. 2. Easier to manage (download and share) a single file. +### DDUF + +> [!WARNING] +> DDUF is an experimental file format and APIs related to it can change in the future. + +DDUF (**D**DUF **D**iffusion **U**nified **F**ormat) is a file format designed to make storing, distributing, and using diffusion models much easier. Built on the ZIP file format, DDUF offers a standardized, efficient, and flexible way to package all parts of a diffusion model into a single, easy-to-manage file. It provides a balance between Diffusers multi-folder format and the widely popular single-file format. + +Learn more details about DDUF on the Hugging Face Hub [documentation](https://huggingface.co/docs/hub/dduf). 
+ +Pass a checkpoint to the `dduf_file` parameter to load it in [`DiffusionPipeline`]. + +```py +from diffusers import DiffusionPipeline +import torch + +pipe = DiffusionPipeline.from_pretrained( + "DDUF/FLUX.1-dev-DDUF", dduf_file="FLUX.1-dev.dduf", torch_dtype=torch.bfloat16 +).to("cuda") +image = pipe( + "photo a cat holding a sign that says Diffusers", num_inference_steps=50, guidance_scale=3.5 +).images[0] +image.save("cat.png") +``` + +To save a pipeline as a `.dduf` checkpoint, use the [`~huggingface_hub.export_folder_as_dduf`] utility, which takes care of all the necessary file-level validations. + +```py +from huggingface_hub import export_folder_as_dduf +from diffusers import DiffusionPipeline +import torch + +pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16) + +save_folder = "flux-dev" +pipe.save_pretrained("flux-dev") +export_folder_as_dduf("flux-dev.dduf", folder_path=save_folder) +``` + +> [!TIP] +> Packaging and loading quantized checkpoints in the DDUF format is supported as long as they respect the multi-folder structure. + ## Convert layout and files Diffusers provides many scripts and methods to convert storage layouts and file formats to enable broader support across the diffusion ecosystem. 
diff --git a/docs/source/en/using-diffusers/text-img2vid.md b/docs/source/en/using-diffusers/text-img2vid.md index 7b27a258f247..92e740bb579d 100644 --- a/docs/source/en/using-diffusers/text-img2vid.md +++ b/docs/source/en/using-diffusers/text-img2vid.md @@ -78,10 +78,10 @@ from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel from diffusers.utils import export_to_video transformer = HunyuanVideoTransformer3DModel.from_pretrained( - "tencent/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16 + "hunyuanvideo-community/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16 ) pipe = HunyuanVideoPipeline.from_pretrained( - "tencent/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16 + "hunyuanvideo-community/HunyuanVideo", transformer=transformer, torch_dtype=torch.float16 ) # reduce memory requirements diff --git a/examples/dreambooth/train_dreambooth_lora_sana.py b/examples/dreambooth/train_dreambooth_lora_sana.py index 7bec9c799cae..7956efb4471e 100644 --- a/examples/dreambooth/train_dreambooth_lora_sana.py +++ b/examples/dreambooth/train_dreambooth_lora_sana.py @@ -158,6 +158,9 @@ def log_validation( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" f" {args.validation_prompt}." 
) + if args.enable_vae_tiling: + pipeline.vae.enable_tiling(tile_sample_min_height=1024, tile_sample_stride_width=1024) + pipeline.text_encoder = pipeline.text_encoder.to(torch.bfloat16) pipeline = pipeline.to(accelerator.device) pipeline.set_progress_bar_config(disable=True) @@ -597,6 +600,7 @@ def parse_args(input_args=None): help="Whether to offload the VAE and the text encoder to CPU when they are not used.", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--enable_vae_tiling", action="store_true", help="Enabla vae tiling in log validation") if input_args is not None: args = parser.parse_args(input_args) diff --git a/examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py b/examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py index 163ff8f08931..e883d8ef95a7 100644 --- a/examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py +++ b/examples/research_projects/sd3_lora_colab/train_dreambooth_lora_sd3_miniature.py @@ -765,7 +765,7 @@ def load_model_hook(models, input_dir): lora_state_dict = StableDiffusion3Pipeline.lora_state_dict(input_dir) transformer_state_dict = { - f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.") + f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.") } transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict) incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default") diff --git a/setup.py b/setup.py index 35ce34920f2a..0acdcbbb9c52 100644 --- a/setup.py +++ b/setup.py @@ -101,7 +101,7 @@ "filelock", "flax>=0.4.1", "hf-doc-builder>=0.3.0", - "huggingface-hub>=0.23.2", + "huggingface-hub>=0.27.0", "requests-mock==1.10.0", "importlib_metadata", "invisible-watermark>=0.2.0", @@ -135,6 +135,7 @@ "transformers>=4.41.2", 
"urllib3<=2.0.0", "black", + "phonemizer", ] # this is a lookup table with items like: @@ -227,6 +228,7 @@ def run(self): "scipy", "torchvision", "transformers", + "phonemizer", ) extras["torch"] = deps_list("torch", "accelerate") diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index d21ada6fbe60..9dd4f0121a44 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -24,10 +24,10 @@ import re from collections import OrderedDict from pathlib import Path -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np -from huggingface_hub import create_repo, hf_hub_download +from huggingface_hub import DDUFEntry, create_repo, hf_hub_download from huggingface_hub.utils import ( EntryNotFoundError, RepositoryNotFoundError, @@ -347,6 +347,7 @@ def load_config( _ = kwargs.pop("mirror", None) subfolder = kwargs.pop("subfolder", None) user_agent = kwargs.pop("user_agent", {}) + dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None) user_agent = {**user_agent, "file_type": "config"} user_agent = http_user_agent(user_agent) @@ -358,8 +359,15 @@ def load_config( "`self.config_name` is not defined. Note that one should not load a config from " "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`" ) - - if os.path.isfile(pretrained_model_name_or_path): + # Custom path for now + if dduf_entries: + if subfolder is not None: + raise ValueError( + "DDUF file only allow for 1 level of directory (e.g transformer/model1/model.safetentors is not allowed). 
" + "Please check the DDUF structure" + ) + config_file = cls._get_config_file_from_dduf(pretrained_model_name_or_path, dduf_entries) + elif os.path.isfile(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path elif os.path.isdir(pretrained_model_name_or_path): if subfolder is not None and os.path.isfile( @@ -426,10 +434,8 @@ def load_config( f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " f"containing a {cls.config_name} file" ) - try: - # Load config dict - config_dict = cls._dict_from_json_file(config_file) + config_dict = cls._dict_from_json_file(config_file, dduf_entries=dduf_entries) commit_hash = extract_commit_hash(config_file) except (json.JSONDecodeError, UnicodeDecodeError): @@ -552,9 +558,14 @@ def extract_init_dict(cls, config_dict, **kwargs): return init_dict, unused_kwargs, hidden_config_dict @classmethod - def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): - with open(json_file, "r", encoding="utf-8") as reader: - text = reader.read() + def _dict_from_json_file( + cls, json_file: Union[str, os.PathLike], dduf_entries: Optional[Dict[str, DDUFEntry]] = None + ): + if dduf_entries: + text = dduf_entries[json_file].read_text() + else: + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() return json.loads(text) def __repr__(self): @@ -616,6 +627,20 @@ def to_json_file(self, json_file_path: Union[str, os.PathLike]): with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string()) + @classmethod + def _get_config_file_from_dduf(cls, pretrained_model_name_or_path: str, dduf_entries: Dict[str, DDUFEntry]): + # paths inside a DDUF file must always be "/" + config_file = ( + cls.config_name + if pretrained_model_name_or_path == "" + else "/".join([pretrained_model_name_or_path, cls.config_name]) + ) + if config_file not in dduf_entries: + raise ValueError( + f"We did not manage to find the file {config_file} in the dduf 
file. We only have the following files {dduf_entries.keys()}" + ) + return config_file + def register_to_config(init): r""" diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 9e7bf242eca7..7999368f1417 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -9,7 +9,7 @@ "filelock": "filelock", "flax": "flax>=0.4.1", "hf-doc-builder": "hf-doc-builder>=0.3.0", - "huggingface-hub": "huggingface-hub>=0.23.2", + "huggingface-hub": "huggingface-hub>=0.27.0", "requests-mock": "requests-mock==1.10.0", "importlib_metadata": "importlib_metadata", "invisible-watermark": "invisible-watermark>=0.2.0", @@ -43,4 +43,5 @@ "transformers": "transformers>=4.41.2", "urllib3": "urllib3<=2.0.0", "black": "black", + "phonemizer": "phonemizer", } diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py index 7492ba028c81..efefe5264daa 100644 --- a/src/diffusers/loaders/lora_pipeline.py +++ b/src/diffusers/loaders/lora_pipeline.py @@ -21,6 +21,7 @@ from ..utils import ( USE_PEFT_BACKEND, deprecate, + get_submodule_by_name, is_peft_available, is_peft_version, is_torch_version, @@ -1981,10 +1982,17 @@ def _maybe_expand_transformer_param_shape_or_error_( in_features = state_dict[lora_A_weight_name].shape[1] out_features = state_dict[lora_B_weight_name].shape[0] + # Model maybe loaded with different quantization schemes which may flatten the params. + # `bitsandbytes`, for example, flatten the weights when using 4bit. 8bit bnb models + # preserve weight shape. + module_weight_shape = cls._calculate_module_shape(model=transformer, base_module=module) + # This means there's no need for an expansion in the params, so we simply skip. 
- if tuple(module_weight.shape) == (out_features, in_features): + if tuple(module_weight_shape) == (out_features, in_features): continue + # TODO (sayakpaul): We still need to consider if the module we're expanding is + # quantized and handle it accordingly if that is the case. module_out_features, module_in_features = module_weight.shape debug_message = "" if in_features > module_in_features: @@ -2080,13 +2088,16 @@ def _maybe_expand_lora_state_dict(cls, transformer, lora_state_dict): base_weight_param = transformer_state_dict[base_param_name] lora_A_param = lora_state_dict[f"{prefix}{k}.lora_A.weight"] - if base_weight_param.shape[1] > lora_A_param.shape[1]: + # TODO (sayakpaul): Handle the cases when we actually need to expand when using quantization. + base_module_shape = cls._calculate_module_shape(model=transformer, base_weight_param_name=base_param_name) + + if base_module_shape[1] > lora_A_param.shape[1]: shape = (lora_A_param.shape[0], base_weight_param.shape[1]) expanded_state_dict_weight = torch.zeros(shape, device=base_weight_param.device) expanded_state_dict_weight[:, : lora_A_param.shape[1]].copy_(lora_A_param) lora_state_dict[f"{prefix}{k}.lora_A.weight"] = expanded_state_dict_weight expanded_module_names.add(k) - elif base_weight_param.shape[1] < lora_A_param.shape[1]: + elif base_module_shape[1] < lora_A_param.shape[1]: raise NotImplementedError( f"This LoRA param ({k}.lora_A.weight) has an incompatible shape {lora_A_param.shape}. Please open an issue to file for a feature request - https://github.com/huggingface/diffusers/issues/new." 
) @@ -2098,6 +2109,28 @@ def _maybe_expand_lora_state_dict(cls, transformer, lora_state_dict): return lora_state_dict + @staticmethod + def _calculate_module_shape( + model: "torch.nn.Module", + base_module: "torch.nn.Linear" = None, + base_weight_param_name: str = None, + ) -> "torch.Size": + def _get_weight_shape(weight: torch.Tensor): + return weight.quant_state.shape if weight.__class__.__name__ == "Params4bit" else weight.shape + + if base_module is not None: + return _get_weight_shape(base_module.weight) + elif base_weight_param_name is not None: + if not base_weight_param_name.endswith(".weight"): + raise ValueError( + f"Invalid `base_weight_param_name` passed as it does not end with '.weight' {base_weight_param_name=}." + ) + module_path = base_weight_param_name.rsplit(".weight", 1)[0] + submodule = get_submodule_by_name(model, module_path) + return _get_weight_shape(submodule.weight) + + raise ValueError("Either `base_module` or `base_weight_param_name` must be provided.") + # The reason why we subclass from `StableDiffusionLoraLoaderMixin` here is because Amused initially # relied on `StableDiffusionLoraLoaderMixin` for its LoRA support. 
diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py index c4932796f44d..454496ff04d4 100644 --- a/src/diffusers/loaders/peft.py +++ b/src/diffusers/loaders/peft.py @@ -300,15 +300,17 @@ def load_lora_adapter(self, pretrained_model_name_or_path_or_dict, prefix="trans try: inject_adapter_in_model(lora_config, self, adapter_name=adapter_name, **peft_kwargs) incompatible_keys = set_peft_model_state_dict(self, state_dict, adapter_name, **peft_kwargs) - except RuntimeError as e: - for module in self.modules(): - if isinstance(module, BaseTunerLayer): - active_adapters = module.active_adapters - for active_adapter in active_adapters: - if adapter_name in active_adapter: - module.delete_adapter(adapter_name) - - self.peft_config.pop(adapter_name) + except Exception as e: + # In case `inject_adapter_in_model()` was unsuccessful even before injecting the `peft_config`. + if hasattr(self, "peft_config"): + for module in self.modules(): + if isinstance(module, BaseTunerLayer): + active_adapters = module.active_adapters + for active_adapter in active_adapters: + if adapter_name in active_adapter: + module.delete_adapter(adapter_name) + + self.peft_config.pop(adapter_name) logger.error(f"Loading {adapter_name} was unsucessful with the following error: \n{e}") raise diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index b2b21675054c..1f52efbcc1f7 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -186,6 +186,7 @@ "inpainting": 512, "inpainting_v2": 512, "controlnet": 512, + "instruct-pix2pix": 512, "v2": 768, "v1": 512, } @@ -605,10 +606,14 @@ def infer_diffusers_model_type(checkpoint): if any( g in checkpoint for g in ["guidance_in.in_layer.bias", "model.diffusion_model.guidance_in.in_layer.bias"] ): - if checkpoint["img_in.weight"].shape[1] == 384: - model_type = "flux-fill" + if "model.diffusion_model.img_in.weight" in checkpoint: + key = 
"model.diffusion_model.img_in.weight" + else: + key = "img_in.weight" - elif checkpoint["img_in.weight"].shape[1] == 128: + if checkpoint[key].shape[1] == 384: + model_type = "flux-fill" + elif checkpoint[key].shape[1] == 128: model_type = "flux-depth" else: model_type = "flux-dev" diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py index 095d154cb4fe..e756bb5d4956 100644 --- a/src/diffusers/loaders/textual_inversion.py +++ b/src/diffusers/loaders/textual_inversion.py @@ -40,7 +40,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) - token = kwargs.pop("token", None) + hf_token = kwargs.pop("hf_token", None) revision = kwargs.pop("revision", None) subfolder = kwargs.pop("subfolder", None) weight_name = kwargs.pop("weight_name", None) @@ -73,7 +73,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs) force_download=force_download, proxies=proxies, local_files_only=local_files_only, - token=token, + token=hf_token, revision=revision, subfolder=subfolder, user_agent=user_agent, @@ -93,7 +93,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs) force_download=force_download, proxies=proxies, local_files_only=local_files_only, - token=token, + token=hf_token, revision=revision, subfolder=subfolder, user_agent=user_agent, @@ -312,7 +312,7 @@ def load_textual_inversion( local_files_only (`bool`, *optional*, defaults to `False`): Whether to only load local model weights and configuration files or not. If set to `True`, the model won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): + hf_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. 
If `True`, the token generated from `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py index 9aa53f7af243..25753afd5ce6 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx.py @@ -1010,10 +1010,12 @@ def __init__( # The minimal tile height and width for spatial tiling to be used self.tile_sample_min_height = 512 self.tile_sample_min_width = 512 + self.tile_sample_min_num_frames = 16 # The minimal distance between two spatial tiles self.tile_sample_stride_height = 448 self.tile_sample_stride_width = 448 + self.tile_sample_stride_num_frames = 8 def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (LTXVideoEncoder3d, LTXVideoDecoder3d)): @@ -1023,8 +1025,10 @@ def enable_tiling( self, tile_sample_min_height: Optional[int] = None, tile_sample_min_width: Optional[int] = None, + tile_sample_min_num_frames: Optional[int] = None, tile_sample_stride_height: Optional[float] = None, tile_sample_stride_width: Optional[float] = None, + tile_sample_stride_num_frames: Optional[float] = None, ) -> None: r""" Enable tiled VAE decoding. 
When this option is enabled, the VAE will split the input tensor into tiles to @@ -1046,8 +1050,10 @@ def enable_tiling( self.use_tiling = True self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_sample_min_num_frames = tile_sample_min_num_frames or self.tile_sample_min_num_frames self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width + self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames def disable_tiling(self) -> None: r""" @@ -1073,18 +1079,13 @@ def disable_slicing(self) -> None: def _encode(self, x: torch.Tensor) -> torch.Tensor: batch_size, num_channels, num_frames, height, width = x.shape + if self.use_framewise_decoding and num_frames > self.tile_sample_min_num_frames: + return self._temporal_tiled_encode(x) + if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height): return self.tiled_encode(x) - if self.use_framewise_encoding: - # TODO(aryan): requires investigation - raise NotImplementedError( - "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to " - "quality issues caused by splitting inference across frame dimension. If you believe this " - "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls." 
- ) - else: - enc = self.encoder(x) + enc = self.encoder(x) return enc @@ -1121,19 +1122,15 @@ def _decode( batch_size, num_channels, num_frames, height, width = z.shape tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio tile_latent_min_width = self.tile_sample_stride_width // self.spatial_compression_ratio + tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio + + if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames: + return self._temporal_tiled_decode(z, temb, return_dict=return_dict) if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height): return self.tiled_decode(z, temb, return_dict=return_dict) - if self.use_framewise_decoding: - # TODO(aryan): requires investigation - raise NotImplementedError( - "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to " - "quality issues caused by splitting inference across frame dimension. If you believe this " - "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls." - ) - else: - dec = self.decoder(z, temb) + dec = self.decoder(z, temb) if not return_dict: return (dec,) @@ -1189,6 +1186,14 @@ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch. ) return b + def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[-3], b.shape[-3], blend_extent) + for x in range(blend_extent): + b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * ( + x / blend_extent + ) + return b + def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: r"""Encode a batch of images using a tiled encoder. 
@@ -1217,17 +1222,9 @@ def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: for i in range(0, height, self.tile_sample_stride_height): row = [] for j in range(0, width, self.tile_sample_stride_width): - if self.use_framewise_encoding: - # TODO(aryan): requires investigation - raise NotImplementedError( - "Frame-wise encoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to " - "quality issues caused by splitting inference across frame dimension. If you believe this " - "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls." - ) - else: - time = self.encoder( - x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width] - ) + time = self.encoder( + x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width] + ) row.append(time) rows.append(row) @@ -1283,17 +1280,7 @@ def tiled_decode( for i in range(0, height, tile_latent_stride_height): row = [] for j in range(0, width, tile_latent_stride_width): - if self.use_framewise_decoding: - # TODO(aryan): requires investigation - raise NotImplementedError( - "Frame-wise decoding has not been implemented for AutoencoderKLLTXVideo, at the moment, due to " - "quality issues caused by splitting inference across frame dimension. If you believe this " - "should be possible, please submit a PR to https://github.com/huggingface/diffusers/pulls." 
- ) - else: - time = self.decoder( - z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb - ) + time = self.decoder(z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width], temb) row.append(time) rows.append(row) @@ -1318,6 +1305,74 @@ def tiled_decode( return DecoderOutput(sample=dec) + def _temporal_tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput: + batch_size, num_channels, num_frames, height, width = x.shape + latent_num_frames = (num_frames - 1) // self.temporal_compression_ratio + 1 + + tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio + tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio + blend_num_frames = tile_latent_min_num_frames - tile_latent_stride_num_frames + + row = [] + for i in range(0, num_frames, self.tile_sample_stride_num_frames): + tile = x[:, :, i : i + self.tile_sample_min_num_frames + 1, :, :] + if self.use_tiling and (height > self.tile_sample_min_height or width > self.tile_sample_min_width): + tile = self.tiled_encode(tile) + else: + tile = self.encoder(tile) + if i > 0: + tile = tile[:, :, 1:, :, :] + row.append(tile) + + result_row = [] + for i, tile in enumerate(row): + if i > 0: + tile = self.blend_t(row[i - 1], tile, blend_num_frames) + result_row.append(tile[:, :, :tile_latent_stride_num_frames, :, :]) + else: + result_row.append(tile[:, :, : tile_latent_stride_num_frames + 1, :, :]) + + enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames] + return enc + + def _temporal_tiled_decode( + self, z: torch.Tensor, temb: Optional[torch.Tensor], return_dict: bool = True + ) -> Union[DecoderOutput, torch.Tensor]: + batch_size, num_channels, num_frames, height, width = z.shape + num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1 + + tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio + tile_latent_min_width = 
self.tile_sample_min_width // self.spatial_compression_ratio + tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio + tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio + blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames + + row = [] + for i in range(0, num_frames, tile_latent_stride_num_frames): + tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :] + if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height): + decoded = self.tiled_decode(tile, temb, return_dict=True).sample + else: + decoded = self.decoder(tile, temb) + if i > 0: + decoded = decoded[:, :, :-1, :, :] + row.append(decoded) + + result_row = [] + for i, tile in enumerate(row): + if i > 0: + tile = self.blend_t(row[i - 1], tile, blend_num_frames) + tile = tile[:, :, : self.tile_sample_stride_num_frames, :, :] + result_row.append(tile) + else: + result_row.append(tile[:, :, : self.tile_sample_stride_num_frames + 1, :, :]) + + dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames] + + if not return_dict: + return (dec,) + return DecoderOutput(sample=dec) + def forward( self, sample: torch.Tensor, @@ -1334,5 +1389,5 @@ def forward( z = posterior.mode() dec = self.decode(z, temb) if not return_dict: - return (dec,) + return (dec.sample,) return dec diff --git a/src/diffusers/models/model_loading_utils.py b/src/diffusers/models/model_loading_utils.py index a3d006f18994..386c07e8747c 100644 --- a/src/diffusers/models/model_loading_utils.py +++ b/src/diffusers/models/model_loading_utils.py @@ -20,10 +20,11 @@ from array import array from collections import OrderedDict from pathlib import Path -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import safetensors import torch +from huggingface_hub import DDUFEntry from huggingface_hub.utils import EntryNotFoundError from ..utils 
import ( @@ -132,7 +133,10 @@ def _fetch_remapped_cls_from_config(config, old_class): def load_state_dict( - checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None, disable_mmap: bool = False + checkpoint_file: Union[str, os.PathLike], + variant: Optional[str] = None, + dduf_entries: Optional[Dict[str, DDUFEntry]] = None, + disable_mmap: bool = False, ): """ Reads a checkpoint file, returning properly formatted errors if they arise. @@ -144,6 +148,10 @@ def load_state_dict( try: file_extension = os.path.basename(checkpoint_file).split(".")[-1] if file_extension == SAFETENSORS_FILE_EXTENSION: + if dduf_entries: + # tensors are loaded on cpu + with dduf_entries[checkpoint_file].as_mmap() as mm: + return safetensors.torch.load(mm) if disable_mmap: return safetensors.torch.load(open(checkpoint_file, "rb").read()) else: @@ -284,6 +292,7 @@ def _fetch_index_file( revision, user_agent, commit_hash, + dduf_entries: Optional[Dict[str, DDUFEntry]] = None, ): if is_local: index_file = Path( @@ -309,8 +318,10 @@ def _fetch_index_file( subfolder=None, user_agent=user_agent, commit_hash=commit_hash, + dduf_entries=dduf_entries, ) - index_file = Path(index_file) + if not dduf_entries: + index_file = Path(index_file) except (EntryNotFoundError, EnvironmentError): index_file = None @@ -319,7 +330,9 @@ def _fetch_index_file( # Adapted from # https://github.com/bghira/SimpleTuner/blob/cea2457ab063f6dedb9e697830ae68a96be90641/helpers/training/save_hooks.py#L64 -def _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata): +def _merge_sharded_checkpoints( + sharded_ckpt_cached_folder, sharded_metadata, dduf_entries: Optional[Dict[str, DDUFEntry]] = None +): weight_map = sharded_metadata.get("weight_map", None) if weight_map is None: raise KeyError("'weight_map' key not found in the shard index file.") @@ -332,14 +345,23 @@ def _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata): # Load tensors from each unique file for file_name in 
files_to_load: part_file_path = os.path.join(sharded_ckpt_cached_folder, file_name) - if not os.path.exists(part_file_path): - raise FileNotFoundError(f"Part file {file_name} not found.") + if dduf_entries: + if part_file_path not in dduf_entries: + raise FileNotFoundError(f"Part file {file_name} not found.") + else: + if not os.path.exists(part_file_path): + raise FileNotFoundError(f"Part file {file_name} not found.") if is_safetensors: - with safetensors.safe_open(part_file_path, framework="pt", device="cpu") as f: - for tensor_key in f.keys(): - if tensor_key in weight_map: - merged_state_dict[tensor_key] = f.get_tensor(tensor_key) + if dduf_entries: + with dduf_entries[part_file_path].as_mmap() as mm: + tensors = safetensors.torch.load(mm) + merged_state_dict.update(tensors) + else: + with safetensors.safe_open(part_file_path, framework="pt", device="cpu") as f: + for tensor_key in f.keys(): + if tensor_key in weight_map: + merged_state_dict[tensor_key] = f.get_tensor(tensor_key) else: merged_state_dict.update(torch.load(part_file_path, weights_only=True, map_location="cpu")) @@ -360,6 +382,7 @@ def _fetch_index_file_legacy( revision, user_agent, commit_hash, + dduf_entries: Optional[Dict[str, DDUFEntry]] = None, ): if is_local: index_file = Path( @@ -400,6 +423,7 @@ def _fetch_index_file_legacy( subfolder=None, user_agent=user_agent, commit_hash=commit_hash, + dduf_entries=dduf_entries, ) index_file = Path(index_file) deprecation_message = f"This serialization format is now deprecated to standardize the serialization format between `transformers` and `diffusers`. We recommend you to remove the existing files associated with the current variant ({variant}) and re-obtain them by running a `save_pretrained()`." 
diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 17e9d2043150..fcd7775fb608 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -23,11 +23,11 @@ from collections import OrderedDict from functools import partial, wraps from pathlib import Path -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import safetensors import torch -from huggingface_hub import create_repo, split_torch_state_dict_into_shards +from huggingface_hub import DDUFEntry, create_repo, split_torch_state_dict_into_shards from huggingface_hub.utils import validate_hf_hub_args from torch import Tensor, nn @@ -607,6 +607,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None) quantization_config = kwargs.pop("quantization_config", None) + dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None) disable_mmap = kwargs.pop("disable_mmap", False) allow_pickle = False @@ -700,6 +701,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P revision=revision, subfolder=subfolder, user_agent=user_agent, + dduf_entries=dduf_entries, **kwargs, ) # no in-place modification of the original config. @@ -776,13 +778,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P "revision": revision, "user_agent": user_agent, "commit_hash": commit_hash, + "dduf_entries": dduf_entries, } index_file = _fetch_index_file(**index_file_kwargs) # In case the index file was not found we still have to consider the legacy format. # this becomes applicable when the variant is not None. 
if variant is not None and (index_file is None or not os.path.exists(index_file)): index_file = _fetch_index_file_legacy(**index_file_kwargs) - if index_file is not None and index_file.is_file(): + if index_file is not None and (dduf_entries or index_file.is_file()): is_sharded = True if is_sharded and from_flax: @@ -811,6 +814,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P model = load_flax_checkpoint_in_pytorch_model(model, model_file) else: + # in the case it is sharded, we have already the index if is_sharded: sharded_ckpt_cached_folder, sharded_metadata = _get_checkpoint_shard_files( pretrained_model_name_or_path, @@ -822,10 +826,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P user_agent=user_agent, revision=revision, subfolder=subfolder or "", + dduf_entries=dduf_entries, ) # TODO: https://github.com/huggingface/diffusers/issues/10013 - if hf_quantizer is not None: - model_file = _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata) + if hf_quantizer is not None or dduf_entries: + model_file = _merge_sharded_checkpoints( + sharded_ckpt_cached_folder, sharded_metadata, dduf_entries=dduf_entries + ) logger.info("Merged sharded checkpoints as `hf_quantizer` is not None.") is_sharded = False @@ -843,6 +850,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P subfolder=subfolder, user_agent=user_agent, commit_hash=commit_hash, + dduf_entries=dduf_entries, ) except IOError as e: @@ -866,6 +874,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P subfolder=subfolder, user_agent=user_agent, commit_hash=commit_hash, + dduf_entries=dduf_entries, ) if low_cpu_mem_usage: @@ -887,7 +896,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # TODO (sayakpaul, SunMarc): remove this after model loading refactor else: param_device = torch.device(torch.cuda.current_device()) - 
state_dict = load_state_dict(model_file, variant=variant, disable_mmap=disable_mmap) + state_dict = load_state_dict( + model_file, variant=variant, dduf_entries=dduf_entries, disable_mmap=disable_mmap + ) model._convert_deprecated_attention_blocks(state_dict) # move the params from meta device to cpu @@ -983,7 +994,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P else: model = cls.from_config(config, **unused_kwargs) - state_dict = load_state_dict(model_file, variant=variant, disable_mmap=disable_mmap) + state_dict = load_state_dict( + model_file, variant=variant, dduf_entries=dduf_entries, disable_mmap=disable_mmap + ) model._convert_deprecated_attention_blocks(state_dict) model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model( diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py index 044f2048775f..4495623119e5 100644 --- a/src/diffusers/models/transformers/transformer_hunyuan_video.py +++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py @@ -727,7 +727,8 @@ def forward( for i in range(batch_size): attention_mask[i, : effective_sequence_length[i]] = True - attention_mask = attention_mask.unsqueeze(1) # [B, 1, N], for broadcasting across attention heads + # [B, 1, 1, N], for broadcasting across attention heads + attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) # 4. 
Transformer blocks if torch.is_grad_enabled() and self.gradient_checkpointing: diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py index 63a8b702f5e1..b8b5d07af529 100644 --- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -237,7 +237,7 @@ def disable_vae_slicing(self): """ self.vae.disable_slicing() - def enable_model_cpu_offload(self, gpu_id=0): + def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): r""" Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` @@ -249,11 +249,23 @@ def enable_model_cpu_offload(self, gpu_id=0): else: raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") - device = torch.device(f"cuda:{gpu_id}") + torch_device = torch.device(device) + device_index = torch_device.index + + if gpu_id is not None and device_index is not None: + raise ValueError( + f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}" + f"Cannot pass both. 
Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}" + ) + + device_type = torch_device.type + device = torch.device(f"{device_type}:{gpu_id or torch_device.index}") if self.device.type != "cpu": self.to("cpu", silence_dtype_warnings=True) - torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + device_mod = getattr(torch, device.type, None) + if hasattr(device_mod, "empty_cache") and device_mod.is_available(): + device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) model_sequence = [ self.text_encoder.text_model, diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index 8bbf1ebe9fa5..b9bba4174121 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -68,6 +68,7 @@ from .pag import ( HunyuanDiTPAGPipeline, PixArtSigmaPAGPipeline, + SanaPAGPipeline, StableDiffusion3PAGImg2ImgPipeline, StableDiffusion3PAGPipeline, StableDiffusionControlNetPAGInpaintPipeline, @@ -82,6 +83,7 @@ StableDiffusionXLPAGPipeline, ) from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline +from .sana import SanaPipeline from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline from .stable_diffusion import ( StableDiffusionImg2ImgPipeline, @@ -121,6 +123,8 @@ ("lcm", LatentConsistencyModelPipeline), ("pixart-alpha", PixArtAlphaPipeline), ("pixart-sigma", PixArtSigmaPipeline), + ("sana", SanaPipeline), + ("sana-pag", SanaPAGPipeline), ("stable-diffusion-pag", StableDiffusionPAGPipeline), ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGPipeline), ("stable-diffusion-xl-pag", StableDiffusionXLPAGPipeline), diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py index 
d2e3e0f34519..7f85fcc1d90d 100644 --- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py @@ -404,9 +404,9 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. - negative_prompt_2 (`str` or `List[str]`, *optional*): + negative_prompt_3 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and - `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders + `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders. prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py index 1040ff265985..35e47f4d650e 100644 --- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py @@ -17,14 +17,16 @@ import torch from transformers import ( + BaseImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, + PreTrainedModel, T5EncoderModel, T5TokenizerFast, ) from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin +from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin from ...models.autoencoders import AutoencoderKL from ...models.controlnets.controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel from ...models.transformers import SD3Transformer2DModel @@ -159,7 +161,9 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin): +class StableDiffusion3ControlNetInpaintingPipeline( + DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin +): r""" Args: transformer ([`SD3Transformer2DModel`]): @@ -192,13 +196,17 @@ class StableDiffusion3ControlNetInpaintingPipeline(DiffusionPipeline, SD3LoraLoa Tokenizer of class [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). controlnet ([`SD3ControlNetModel`] or `List[SD3ControlNetModel]` or [`SD3MultiControlNetModel`]): - Provides additional conditioning to the `unet` during the denoising process. If you set multiple + Provides additional conditioning to the `transformer` during the denoising process. 
If you set multiple ControlNets as a list, the outputs from each ControlNet are added together to create one combined additional conditioning. + image_encoder (`PreTrainedModel`, *optional*): + Pre-trained Vision Model for IP Adapter. + feature_extractor (`BaseImageProcessor`, *optional*): + Image processor for IP Adapter. """ - model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae" - _optional_components = [] + model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae" + _optional_components = ["image_encoder", "feature_extractor"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"] def __init__( @@ -215,6 +223,8 @@ def __init__( controlnet: Union[ SD3ControlNetModel, List[SD3ControlNetModel], Tuple[SD3ControlNetModel], SD3MultiControlNetModel ], + image_encoder: PreTrainedModel = None, + feature_extractor: BaseImageProcessor = None, ): super().__init__() @@ -229,6 +239,8 @@ def __init__( transformer=transformer, scheduler=scheduler, controlnet=controlnet, + image_encoder=image_encoder, + feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8 self.image_processor = VaeImageProcessor( @@ -410,9 +422,9 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. - negative_prompt_2 (`str` or `List[str]`, *optional*): + negative_prompt_3 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and - `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders + `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders. 
prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. @@ -775,6 +787,84 @@ def num_timesteps(self): def interrupt(self): return self._interrupt + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image + def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor: + """Encodes the given image into a feature representation using a pre-trained image encoder. + + Args: + image (`PipelineImageInput`): + Input image to be encoded. + device: (`torch.device`): + Torch device. + + Returns: + `torch.Tensor`: The encoded image feature representation. + """ + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=self.dtype) + + return self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[torch.Tensor] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + ) -> torch.Tensor: + """Prepares image embeddings for use in the IP-Adapter. + + Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed. + + Args: + ip_adapter_image (`PipelineImageInput`, *optional*): + The input image to extract features from for IP-Adapter. + ip_adapter_image_embeds (`torch.Tensor`, *optional*): + Precomputed image embeddings. + device: (`torch.device`, *optional*): + Torch device. 
+ num_images_per_prompt (`int`, defaults to 1): + Number of images that should be generated per prompt. + do_classifier_free_guidance (`bool`, defaults to True): + Whether to use classifier free guidance or not. + """ + device = device or self._execution_device + + if ip_adapter_image_embeds is not None: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2) + else: + single_image_embeds = ip_adapter_image_embeds + elif ip_adapter_image is not None: + single_image_embeds = self.encode_image(ip_adapter_image, device) + if do_classifier_free_guidance: + single_negative_image_embeds = torch.zeros_like(single_image_embeds) + else: + raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.") + + image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0) + + return image_embeds.to(device=device) + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, *args, **kwargs): + if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload: + logger.warning( + "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses " + "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling " + "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`." 
+ ) + + super().enable_sequential_cpu_offload(*args, **kwargs) + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -803,6 +893,8 @@ def __call__( negative_prompt_embeds: Optional[torch.FloatTensor] = None, pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, joint_attention_kwargs: Optional[Dict[str, Any]] = None, @@ -896,6 +988,12 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + ip_adapter_image (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`torch.Tensor`, *optional*): + Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images, + emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to + `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. @@ -1057,7 +1155,22 @@ def __call__( ] controlnet_keep.append(keeps[0] if isinstance(self.controlnet, SD3ControlNetModel) else keeps) - # 7. Denoising loop + # 7. 
Prepare image embeddings + if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None: + ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + if self.joint_attention_kwargs is None: + self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds} + else: + self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds) + + # 8. Denoising loop with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): if self.interrupt: diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index c23b660300db..f5716dc9c8ea 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -665,7 +665,16 @@ def __call__( instead. prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is - will be used instead + will be used instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is + not greater than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. + true_cfg_scale (`float`, *optional*, defaults to 1.0): + When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. 
This is set to 1024 by default for the best results. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): @@ -709,6 +718,14 @@ def __call__( Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
@@ -773,7 +790,10 @@ def __call__( lora_scale = ( self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None ) - do_true_cfg = true_cfg_scale > 1 and negative_prompt is not None + has_neg_prompt = negative_prompt is not None or ( + negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None + ) + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt ( prompt_embeds, pooled_prompt_embeds, diff --git a/src/diffusers/pipelines/latte/pipeline_latte.py b/src/diffusers/pipelines/latte/pipeline_latte.py index 852a2b7b795e..1b70650dfa11 100644 --- a/src/diffusers/pipelines/latte/pipeline_latte.py +++ b/src/diffusers/pipelines/latte/pipeline_latte.py @@ -30,6 +30,7 @@ from ...utils import ( BACKENDS_MAPPING, BaseOutput, + deprecate, is_bs4_available, is_ftfy_available, is_torch_xla_available, @@ -848,7 +849,14 @@ def __call__( if XLA_AVAILABLE: xm.mark_step() - if not output_type == "latents": + if output_type == "latents": + deprecation_message = ( + "Passing `output_type='latents'` is deprecated. Please pass `output_type='latent'` instead." 
+ ) + deprecate("output_type_latents", "1.0.0", deprecation_message, standard_warn=False) + output_type = "latent" + + if not output_type == "latent": video = self.decode_latents(latents, video_length, decode_chunk_size=14) video = self.video_processor.postprocess_video(video=video, output_type=output_type) else: diff --git a/src/diffusers/pipelines/ltx/pipeline_ltx.py b/src/diffusers/pipelines/ltx/pipeline_ltx.py index c49918cb7d21..e04290b45754 100644 --- a/src/diffusers/pipelines/ltx/pipeline_ltx.py +++ b/src/diffusers/pipelines/ltx/pipeline_ltx.py @@ -769,7 +769,7 @@ def __call__( if not self.vae.config.timestep_conditioning: timestep = None else: - noise = torch.randn(latents.shape, generator=generator, device=device, dtype=latents.dtype) + noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype) if not isinstance(decode_timestep, list): decode_timestep = [decode_timestep] * batch_size if decode_noise_scale is None: diff --git a/src/diffusers/pipelines/mochi/pipeline_mochi.py b/src/diffusers/pipelines/mochi/pipeline_mochi.py index 435470064633..a3028c50d8b7 100644 --- a/src/diffusers/pipelines/mochi/pipeline_mochi.py +++ b/src/diffusers/pipelines/mochi/pipeline_mochi.py @@ -21,7 +21,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...loaders import Mochi1LoraLoaderMixin -from ...models.autoencoders import AutoencoderKL +from ...models.autoencoders import AutoencoderKLMochi from ...models.transformers import MochiTransformer3DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import ( @@ -151,8 +151,8 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin): Conditional Transformer architecture to denoise the encoded video latents. scheduler ([`FlowMatchEulerDiscreteScheduler`]): A scheduler to be used in combination with `transformer` to denoise the encoded image latents. 
- vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + vae ([`AutoencoderKLMochi`]): + Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. text_encoder ([`T5EncoderModel`]): [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant. @@ -171,7 +171,7 @@ class MochiPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin): def __init__( self, scheduler: FlowMatchEulerDiscreteScheduler, - vae: AutoencoderKL, + vae: AutoencoderKLMochi, text_encoder: T5EncoderModel, tokenizer: T5TokenizerFast, transformer: MochiTransformer3DModel, diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sana.py b/src/diffusers/pipelines/pag/pipeline_pag_sana.py index 2cdc1c70cdcc..416b2f7c60f2 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sana.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sana.py @@ -16,6 +16,7 @@ import inspect import re import urllib.parse as ul +import warnings from typing import Callable, Dict, List, Optional, Tuple, Union import torch @@ -41,6 +42,7 @@ ASPECT_RATIO_1024_BIN, ) from ..pixart_alpha.pipeline_pixart_sigma import ASPECT_RATIO_2048_BIN +from ..sana.pipeline_sana import ASPECT_RATIO_4096_BIN from .pag_utils import PAGMixin @@ -639,7 +641,7 @@ def __call__( negative_prompt_attention_mask: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - clean_caption: bool = True, + clean_caption: bool = False, use_resolution_binning: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], @@ -755,7 +757,9 @@ def __call__( callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs if use_resolution_binning: - if self.transformer.config.sample_size == 64: + if 
self.transformer.config.sample_size == 128: + aspect_ratio_bin = ASPECT_RATIO_4096_BIN + elif self.transformer.config.sample_size == 64: aspect_ratio_bin = ASPECT_RATIO_2048_BIN elif self.transformer.config.sample_size == 32: aspect_ratio_bin = ASPECT_RATIO_1024_BIN @@ -912,7 +916,14 @@ def __call__( image = latents else: latents = latents.to(self.vae.dtype) - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + try: + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + except torch.cuda.OutOfMemoryError as e: + warnings.warn( + f"{e}. \n" + f"Try to use VAE tiling for large images. For example: \n" + f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)" + ) if use_resolution_binning: image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height) diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py index 0285239aaa8d..fde3e500a573 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py @@ -375,9 +375,9 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. - negative_prompt_2 (`str` or `List[str]`, *optional*): + negative_prompt_3 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and - `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders + `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders. prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not provided, text embeddings will be generated from `prompt` input argument. diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py index 121be4ce2c07..d64582a26f7a 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py @@ -391,9 +391,9 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. - negative_prompt_2 (`str` or `List[str]`, *optional*): + negative_prompt_3 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and - `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders + `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders. prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index 23f1279e203d..a100dfe77bdf 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -12,19 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - import importlib import os import re import warnings from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union +import requests import torch -from huggingface_hub import ModelCard, model_info -from huggingface_hub.utils import validate_hf_hub_args +from huggingface_hub import DDUFEntry, ModelCard, model_info, snapshot_download +from huggingface_hub.utils import OfflineModeIsEnabled, validate_hf_hub_args from packaging import version +from requests.exceptions import HTTPError from .. import __version__ from ..utils import ( @@ -38,14 +38,16 @@ is_accelerate_available, is_peft_available, is_transformers_available, + is_transformers_version, logging, ) from ..utils.torch_utils import is_compiled_module +from .transformers_loading_utils import _load_tokenizer_from_dduf, _load_transformers_model_from_dduf if is_transformers_available(): import transformers - from transformers import PreTrainedModel + from transformers import PreTrainedModel, PreTrainedTokenizerBase from transformers.utils import FLAX_WEIGHTS_NAME as TRANSFORMERS_FLAX_WEIGHTS_NAME from transformers.utils import SAFE_WEIGHTS_NAME as TRANSFORMERS_SAFE_WEIGHTS_NAME from transformers.utils import WEIGHTS_NAME as TRANSFORMERS_WEIGHTS_NAME @@ -627,6 +629,7 @@ def load_sub_model( low_cpu_mem_usage: bool, cached_folder: Union[str, os.PathLike], use_safetensors: bool, + dduf_entries: Optional[Dict[str, DDUFEntry]], ): """Helper method to load the module `name` from `library_name` and `class_name`""" @@ -663,7 +666,7 @@ def load_sub_model( f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}." 
) - load_method = getattr(class_obj, load_method_name) + load_method = _get_load_method(class_obj, load_method_name, is_dduf=dduf_entries is not None) # add kwargs to loading method diffusers_module = importlib.import_module(__name__.split(".")[0]) @@ -721,7 +724,10 @@ def load_sub_model( loading_kwargs["low_cpu_mem_usage"] = False # check if the module is in a subdirectory - if os.path.isdir(os.path.join(cached_folder, name)): + if dduf_entries: + loading_kwargs["dduf_entries"] = dduf_entries + loaded_sub_model = load_method(name, **loading_kwargs) + elif os.path.isdir(os.path.join(cached_folder, name)): loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) else: # else load from the root directory @@ -746,6 +752,22 @@ def load_sub_model( return loaded_sub_model +def _get_load_method(class_obj: object, load_method_name: str, is_dduf: bool) -> Callable: + """ + Return the method to load the sub model. + + In practice, this method will return the `"from_pretrained"` (or `load_method_name`) method of the class object + except if loading from a DDUF checkpoint. In that case, transformers models and tokenizers have a specific loading + method that we need to use. 
+ """ + if is_dduf: + if issubclass(class_obj, PreTrainedTokenizerBase): + return lambda *args, **kwargs: _load_tokenizer_from_dduf(class_obj, *args, **kwargs) + if issubclass(class_obj, PreTrainedModel): + return lambda *args, **kwargs: _load_transformers_model_from_dduf(class_obj, *args, **kwargs) + return getattr(class_obj, load_method_name) + + def _fetch_class_library_tuple(module): # import it here to avoid circular import diffusers_module = importlib.import_module(__name__.split(".")[0]) @@ -968,3 +990,70 @@ def _get_ignore_patterns( ) return ignore_patterns + + +def _download_dduf_file( + pretrained_model_name: str, + dduf_file: str, + pipeline_class_name: str, + cache_dir: str, + proxies: str, + local_files_only: bool, + token: str, + revision: str, +): + model_info_call_error = None + if not local_files_only: + try: + info = model_info(pretrained_model_name, token=token, revision=revision) + except (HTTPError, OfflineModeIsEnabled, requests.ConnectionError) as e: + logger.warning(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.") + local_files_only = True + model_info_call_error = e # save error to reraise it if model is not cached locally + + if ( + not local_files_only + and dduf_file is not None + and dduf_file not in (sibling.rfilename for sibling in info.siblings) + ): + raise ValueError(f"Requested {dduf_file} file is not available in {pretrained_model_name}.") + + try: + user_agent = {"pipeline_class": pipeline_class_name, "dduf": True} + cached_folder = snapshot_download( + pretrained_model_name, + cache_dir=cache_dir, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + allow_patterns=[dduf_file], + user_agent=user_agent, + ) + return cached_folder + except FileNotFoundError: + # Means we tried to load pipeline with `local_files_only=True` but the files have not been found in local cache. + # This can happen in two cases: + # 1. 
If the user passed `local_files_only=True` => we raise the error directly + # 2. If we forced `local_files_only=True` when `model_info` failed => we raise the initial error + if model_info_call_error is None: + # 1. user passed `local_files_only=True` + raise + else: + # 2. we forced `local_files_only=True` when `model_info` failed + raise EnvironmentError( + f"Cannot load model {pretrained_model_name}: model is not cached locally and an error occurred" + " while trying to fetch metadata from the Hub. Please check out the root cause in the stacktrace" + " above." + ) from model_info_call_error + + +def _maybe_raise_error_for_incorrect_transformers(config_dict): + has_transformers_component = False + for k in config_dict: + if isinstance(config_dict[k], list): + has_transformers_component = config_dict[k][0] == "transformers" + if has_transformers_component: + break + if has_transformers_component and not is_transformers_version(">", "4.47.1"): + raise ValueError("Please upgrade your `transformers` installation to the latest version to use DDUF.") diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 527724d1de1a..3cafb77e5d63 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -29,10 +29,12 @@ import requests import torch from huggingface_hub import ( + DDUFEntry, ModelCard, create_repo, hf_hub_download, model_info, + read_dduf_file, snapshot_download, ) from huggingface_hub.utils import OfflineModeIsEnabled, validate_hf_hub_args @@ -72,6 +74,7 @@ CONNECTED_PIPES_KEYS, CUSTOM_PIPELINE_FILE_NAME, LOADABLE_CLASSES, + _download_dduf_file, _fetch_class_library_tuple, _get_custom_components_and_folders, _get_custom_pipeline_class, @@ -79,6 +82,7 @@ _get_ignore_patterns, _get_pipeline_class, _identify_model_variants, + _maybe_raise_error_for_incorrect_transformers, _maybe_raise_warning_for_inpainting, _resolve_custom_pipeline_and_cls, _unwrap_model, @@ -218,6 +222,7 
@@ class implements both a save and loading method. The pipeline is easily reloaded Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the repository you want to push to with `repo_id` (will default to the name of `save_directory` in your namespace). + kwargs (`Dict[str, Any]`, *optional*): Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. """ @@ -531,6 +536,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights saved using [`~DiffusionPipeline.save_pretrained`]. + - A path to a *directory* (for example `./my_pipeline_directory/`) containing a dduf file torch_dtype (`str` or `torch.dtype`, *optional*): Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the dtype is automatically derived from the model's weights. @@ -625,6 +631,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P variant (`str`, *optional*): Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when loading `from_flax`. + dduf_file(`str`, *optional*): + Load weights from the specified dduf file. @@ -674,6 +682,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P offload_state_dict = kwargs.pop("offload_state_dict", False) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) variant = kwargs.pop("variant", None) + dduf_file = kwargs.pop("dduf_file", None) use_safetensors = kwargs.pop("use_safetensors", None) use_onnx = kwargs.pop("use_onnx", None) load_connected_pipeline = kwargs.pop("load_connected_pipeline", False) @@ -722,6 +731,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P " dispatching. Please make sure to set `low_cpu_mem_usage=True`." 
) + if dduf_file: + if custom_pipeline: + raise NotImplementedError("Custom pipelines are not supported with DDUF at the moment.") + if load_connected_pipeline: + raise NotImplementedError("Connected pipelines are not supported with DDUF at the moment.") + # 1. Download the checkpoints and configs # use snapshot download here to get it working from from_pretrained if not os.path.isdir(pretrained_model_name_or_path): @@ -744,6 +759,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P custom_pipeline=custom_pipeline, custom_revision=custom_revision, variant=variant, + dduf_file=dduf_file, load_connected_pipeline=load_connected_pipeline, **kwargs, ) @@ -765,7 +781,17 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ) logger.warning(warn_msg) - config_dict = cls.load_config(cached_folder) + dduf_entries = None + if dduf_file: + dduf_file_path = os.path.join(cached_folder, dduf_file) + dduf_entries = read_dduf_file(dduf_file_path) + # The reader contains already all the files needed, no need to check it again + cached_folder = "" + + config_dict = cls.load_config(cached_folder, dduf_entries=dduf_entries) + + if dduf_file: + _maybe_raise_error_for_incorrect_transformers(config_dict) # pop out "_ignore_files" as it is only needed for download config_dict.pop("_ignore_files", None) @@ -943,6 +969,7 @@ def load_module(name, value): low_cpu_mem_usage=low_cpu_mem_usage, cached_folder=cached_folder, use_safetensors=use_safetensors, + dduf_entries=dduf_entries, ) logger.info( f"Loaded {name} as {class_name} from `{name}` subfolder of {pretrained_model_name_or_path}." @@ -1256,6 +1283,8 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: variant (`str`, *optional*): Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when loading `from_flax`. + dduf_file(`str`, *optional*): + Load weights from the specified DDUF file. 
use_safetensors (`bool`, *optional*, defaults to `None`): If set to `None`, the safetensors weights are downloaded if they're available **and** if the safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors @@ -1296,6 +1325,23 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: use_onnx = kwargs.pop("use_onnx", None) load_connected_pipeline = kwargs.pop("load_connected_pipeline", False) trust_remote_code = kwargs.pop("trust_remote_code", False) + dduf_file: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_file", None) + + if dduf_file: + if custom_pipeline: + raise NotImplementedError("Custom pipelines are not supported with DDUF at the moment.") + if load_connected_pipeline: + raise NotImplementedError("Connected pipelines are not supported with DDUF at the moment.") + return _download_dduf_file( + pretrained_model_name=pretrained_model_name, + dduf_file=dduf_file, + pipeline_class_name=cls.__name__, + cache_dir=cache_dir, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + ) allow_pickle = False if use_safetensors is None: @@ -1375,7 +1421,6 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: allow_patterns += [f"{custom_pipeline}.py"] if f"{custom_pipeline}.py" in filenames else [] # also allow downloading config.json files with the model allow_patterns += [os.path.join(k, "config.json") for k in model_folder_names] - allow_patterns += [ SCHEDULER_CONFIG_NAME, CONFIG_NAME, @@ -1471,7 +1516,6 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: user_agent=user_agent, ) - # retrieve pipeline class from local file cls_name = cls.load_config(os.path.join(cached_folder, "model_index.json")).get("_class_name", None) cls_name = cls_name[4:] if isinstance(cls_name, str) and cls_name.startswith("Flax") else cls_name diff --git a/src/diffusers/pipelines/sana/pipeline_sana.py 
b/src/diffusers/pipelines/sana/pipeline_sana.py index 8b318597c12d..cca4dfe5e8ba 100644 --- a/src/diffusers/pipelines/sana/pipeline_sana.py +++ b/src/diffusers/pipelines/sana/pipeline_sana.py @@ -16,6 +16,7 @@ import inspect import re import urllib.parse as ul +import warnings from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch @@ -953,7 +954,14 @@ def __call__( image = latents else: latents = latents.to(self.vae.dtype) - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + try: + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + except torch.cuda.OutOfMemoryError as e: + warnings.warn( + f"{e}. \n" + f"Try to use VAE tiling for large images. For example: \n" + f"pipe.vae.enable_tiling(tile_sample_min_width=512, tile_sample_min_height=512)" + ) if use_resolution_binning: image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height) diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index dc0d64144e12..23950f895aae 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -383,9 +383,9 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. - negative_prompt_2 (`str` or `List[str]`, *optional*): + negative_prompt_3 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and - `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders + `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders. 
prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py index 6a3a4abe7696..b6e95844b3bd 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py @@ -400,9 +400,9 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. - negative_prompt_2 (`str` or `List[str]`, *optional*): + negative_prompt_3 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and - `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders + `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders. prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py index 23cc4983d54f..de9842913e98 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py @@ -13,19 +13,21 @@ # limitations under the License. 
import inspect -from typing import Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import torch from transformers import ( + BaseImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, + PreTrainedModel, T5EncoderModel, T5TokenizerFast, ) from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin +from ...loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin from ...models.autoencoders import AutoencoderKL from ...models.transformers import SD3Transformer2DModel from ...schedulers import FlowMatchEulerDiscreteScheduler @@ -162,7 +164,7 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin): +class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin): r""" Args: transformer ([`SD3Transformer2DModel`]): @@ -194,10 +196,14 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro tokenizer_3 (`T5TokenizerFast`): Tokenizer of class [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + image_encoder (`PreTrainedModel`, *optional*): + Pre-trained Vision Model for IP Adapter. + feature_extractor (`BaseImageProcessor`, *optional*): + Image processor for IP Adapter. 
""" - model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae" - _optional_components = [] + model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae" + _optional_components = ["image_encoder", "feature_extractor"] _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"] def __init__( @@ -211,6 +217,8 @@ def __init__( tokenizer_2: CLIPTokenizer, text_encoder_3: T5EncoderModel, tokenizer_3: T5TokenizerFast, + image_encoder: PreTrainedModel = None, + feature_extractor: BaseImageProcessor = None, ): super().__init__() @@ -224,6 +232,8 @@ def __init__( tokenizer_3=tokenizer_3, transformer=transformer, scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8 latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16 @@ -406,9 +416,9 @@ def encode_prompt( negative_prompt_2 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. - negative_prompt_2 (`str` or `List[str]`, *optional*): + negative_prompt_3 (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and - `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders + `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders. prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. 
@@ -818,6 +828,10 @@ def clip_skip(self): def do_classifier_free_guidance(self): return self._guidance_scale > 1 + @property + def joint_attention_kwargs(self): + return self._joint_attention_kwargs + @property def num_timesteps(self): return self._num_timesteps @@ -826,6 +840,84 @@ def num_timesteps(self): def interrupt(self): return self._interrupt + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_image + def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor: + """Encodes the given image into a feature representation using a pre-trained image encoder. + + Args: + image (`PipelineImageInput`): + Input image to be encoded. + device: (`torch.device`): + Torch device. + + Returns: + `torch.Tensor`: The encoded image feature representation. + """ + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=self.dtype) + + return self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[torch.Tensor] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + ) -> torch.Tensor: + """Prepares image embeddings for use in the IP-Adapter. + + Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed. + + Args: + ip_adapter_image (`PipelineImageInput`, *optional*): + The input image to extract features from for IP-Adapter. + ip_adapter_image_embeds (`torch.Tensor`, *optional*): + Precomputed image embeddings. + device: (`torch.device`, *optional*): + Torch device. 
+ num_images_per_prompt (`int`, defaults to 1): + Number of images that should be generated per prompt. + do_classifier_free_guidance (`bool`, defaults to True): + Whether to use classifier free guidance or not. + """ + device = device or self._execution_device + + if ip_adapter_image_embeds is not None: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2) + else: + single_image_embeds = ip_adapter_image_embeds + elif ip_adapter_image is not None: + single_image_embeds = self.encode_image(ip_adapter_image, device) + if do_classifier_free_guidance: + single_negative_image_embeds = torch.zeros_like(single_image_embeds) + else: + raise ValueError("Neither `ip_adapter_image_embeds` or `ip_adapter_image_embeds` were provided.") + + image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0) + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0) + + return image_embeds.to(device=device) + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, *args, **kwargs): + if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload: + logger.warning( + "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses " + "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling " + "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`." 
+ ) + + super().enable_sequential_cpu_offload(*args, **kwargs) + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -853,8 +945,11 @@ def __call__( negative_prompt_embeds: Optional[torch.Tensor] = None, pooled_prompt_embeds: Optional[torch.Tensor] = None, negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], @@ -890,9 +985,9 @@ def __call__( mask_image_latent (`torch.Tensor`, `List[torch.Tensor]`): `Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask latents tensor will ge generated by `mask_image`. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): The height in pixels of the generated image. This is set to 1024 by default for the best results. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): The width in pixels of the generated image. This is set to 1024 by default for the best results. padding_mask_crop (`int`, *optional*, defaults to `None`): The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to @@ -953,12 +1048,22 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. 
+ ip_adapter_image (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`torch.Tensor`, *optional*): + Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images, + emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to + `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of a plain tuple. + joint_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). callback_on_step_end (`Callable`, *optional*): A function that calls at the end of each denoising steps during the inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, @@ -1006,6 +1111,7 @@ def __call__( self._guidance_scale = guidance_scale self._clip_skip = clip_skip + self._joint_attention_kwargs = joint_attention_kwargs self._interrupt = False # 2. Define call parameters @@ -1160,7 +1266,22 @@ def __call__( f"The transformer {self.transformer.__class__} should have 16 input channels or 33 input channels, not {self.transformer.config.in_channels}." ) - # 7. Denoising loop + # 7. 
Prepare image embeddings + if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None: + ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + if self.joint_attention_kwargs is None: + self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds} + else: + self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds) + + # 8. Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -1181,6 +1302,7 @@ def __call__( timestep=timestep, encoder_hidden_states=prompt_embeds, pooled_projections=pooled_prompt_embeds, + joint_attention_kwargs=self.joint_attention_kwargs, return_dict=False, )[0] diff --git a/src/diffusers/pipelines/transformers_loading_utils.py b/src/diffusers/pipelines/transformers_loading_utils.py new file mode 100644 index 000000000000..f080adb23deb --- /dev/null +++ b/src/diffusers/pipelines/transformers_loading_utils.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import contextlib +import os +import tempfile +from typing import TYPE_CHECKING, Dict + +from huggingface_hub import DDUFEntry +from tqdm import tqdm + +from ..utils import is_safetensors_available, is_transformers_available, is_transformers_version + + +if TYPE_CHECKING: + from transformers import PreTrainedModel, PreTrainedTokenizer + +if is_transformers_available(): + from transformers import PreTrainedModel, PreTrainedTokenizer + +if is_safetensors_available(): + import safetensors.torch + + +def _load_tokenizer_from_dduf( + cls: "PreTrainedTokenizer", name: str, dduf_entries: Dict[str, DDUFEntry], **kwargs +) -> "PreTrainedTokenizer": + """ + Load a tokenizer from a DDUF archive. + + In practice, `transformers` do not provide a way to load a tokenizer from a DDUF archive. This function is a + workaround by extracting the tokenizer files from the DDUF archive and loading the tokenizer from the extracted + files. There is an extra cost of extracting the files, but of limited impact as the tokenizer files are usually + small-ish. + """ + with tempfile.TemporaryDirectory() as tmp_dir: + for entry_name, entry in dduf_entries.items(): + if entry_name.startswith(name + "/"): + tmp_entry_path = os.path.join(tmp_dir, *entry_name.split("/")) + # need to create intermediary directory if they don't exist + os.makedirs(os.path.dirname(tmp_entry_path), exist_ok=True) + with open(tmp_entry_path, "wb") as f: + with entry.as_mmap() as mm: + f.write(mm) + return cls.from_pretrained(os.path.dirname(tmp_entry_path), **kwargs) + + +def _load_transformers_model_from_dduf( + cls: "PreTrainedModel", name: str, dduf_entries: Dict[str, DDUFEntry], **kwargs +) -> "PreTrainedModel": + """ + Load a transformers model from a DDUF archive. + + In practice, `transformers` do not provide a way to load a model from a DDUF archive. This function is a workaround + by instantiating a model from the config file and loading the weights from the DDUF archive directly. 
+ """ + config_file = dduf_entries.get(f"{name}/config.json") + if config_file is None: + raise EnvironmentError( + f"Could not find a config.json file for component {name} in DDUF file (contains {dduf_entries.keys()})." + ) + generation_config = dduf_entries.get(f"{name}/generation_config.json", None) + + weight_files = [ + entry + for entry_name, entry in dduf_entries.items() + if entry_name.startswith(f"{name}/") and entry_name.endswith(".safetensors") + ] + if not weight_files: + raise EnvironmentError( + f"Could not find any weight file for component {name} in DDUF file (contains {dduf_entries.keys()})." + ) + if not is_safetensors_available(): + raise EnvironmentError( + "Safetensors is not available, cannot load model from DDUF. Please `pip install safetensors`." + ) + if is_transformers_version("<", "4.47.0"): + raise ImportError( + "You need to install `transformers>4.47.0` in order to load a transformers model from a DDUF file. " + "You can install it with: `pip install --upgrade transformers`" + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + from transformers import AutoConfig, GenerationConfig + + tmp_config_file = os.path.join(tmp_dir, "config.json") + with open(tmp_config_file, "w") as f: + f.write(config_file.read_text()) + config = AutoConfig.from_pretrained(tmp_config_file) + if generation_config is not None: + tmp_generation_config_file = os.path.join(tmp_dir, "generation_config.json") + with open(tmp_generation_config_file, "w") as f: + f.write(generation_config.read_text()) + generation_config = GenerationConfig.from_pretrained(tmp_generation_config_file) + state_dict = {} + with contextlib.ExitStack() as stack: + for entry in tqdm(weight_files, desc="Loading state_dict"): # Loop over safetensors files + # Memory-map the safetensors file + mmap = stack.enter_context(entry.as_mmap()) + # Load tensors from the memory-mapped file + tensors = safetensors.torch.load(mmap) + # Update the state dictionary with tensors + 
state_dict.update(tensors) + return cls.from_pretrained( + pretrained_model_name_or_path=None, + config=config, + generation_config=generation_config, + state_dict=state_dict, + **kwargs, + ) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index f8de48ecfc78..0c0613f3c43e 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -70,6 +70,7 @@ is_gguf_available, is_gguf_version, is_google_colab, + is_hf_hub_version, is_inflect_available, is_invisible_watermark_available, is_k_diffusion_available, @@ -100,7 +101,7 @@ is_xformers_available, requires_backends, ) -from .loading_utils import get_module_from_name, load_image, load_video +from .loading_utils import get_module_from_name, get_submodule_by_name, load_image, load_video from .logging import get_logger from .outputs import BaseOutput from .peft_utils import ( diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index a6dfe18433e3..839e696c0ce9 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -26,6 +26,7 @@ from uuid import uuid4 from huggingface_hub import ( + DDUFEntry, ModelCard, ModelCardData, create_repo, @@ -291,9 +292,26 @@ def _get_model_file( user_agent: Optional[Union[Dict, str]] = None, revision: Optional[str] = None, commit_hash: Optional[str] = None, + dduf_entries: Optional[Dict[str, DDUFEntry]] = None, ): pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isfile(pretrained_model_name_or_path): + + if dduf_entries: + if subfolder is not None: + raise ValueError( + "DDUF file only allow for 1 level of directory (e.g transformer/model1/model.safetentors is not allowed). 
" + "Please check the DDUF structure" + ) + model_file = ( + weights_name + if pretrained_model_name_or_path == "" + else "/".join([pretrained_model_name_or_path, weights_name]) + ) + if model_file in dduf_entries: + return model_file + else: + raise EnvironmentError(f"Error no file named {weights_name} found in archive {dduf_entries.keys()}.") + elif os.path.isfile(pretrained_model_name_or_path): return pretrained_model_name_or_path elif os.path.isdir(pretrained_model_name_or_path): if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)): @@ -419,6 +437,7 @@ def _get_checkpoint_shard_files( user_agent=None, revision=None, subfolder="", + dduf_entries: Optional[Dict[str, DDUFEntry]] = None, ): """ For a given model: @@ -430,11 +449,18 @@ def _get_checkpoint_shard_files( For the description of each arg, see [`PreTrainedModel.from_pretrained`]. `index_filename` is the full path to the index (downloaded and cached if `pretrained_model_name_or_path` is a model ID on the Hub). 
""" - if not os.path.isfile(index_filename): - raise ValueError(f"Can't find a checkpoint index ({index_filename}) in {pretrained_model_name_or_path}.") + if dduf_entries: + if index_filename not in dduf_entries: + raise ValueError(f"Can't find a checkpoint index ({index_filename}) in {pretrained_model_name_or_path}.") + else: + if not os.path.isfile(index_filename): + raise ValueError(f"Can't find a checkpoint index ({index_filename}) in {pretrained_model_name_or_path}.") - with open(index_filename, "r") as f: - index = json.loads(f.read()) + if dduf_entries: + index = json.loads(dduf_entries[index_filename].read_text()) + else: + with open(index_filename, "r") as f: + index = json.loads(f.read()) original_shard_filenames = sorted(set(index["weight_map"].values())) sharded_metadata = index["metadata"] @@ -448,6 +474,8 @@ def _get_checkpoint_shard_files( pretrained_model_name_or_path, subfolder=subfolder, original_shard_filenames=original_shard_filenames ) return shards_path, sharded_metadata + elif dduf_entries: + return shards_path, sharded_metadata # At this stage pretrained_model_name_or_path is a model identifier on the Hub allow_patterns = original_shard_filenames diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 3014efebc82e..c7d002651f3a 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -115,6 +115,13 @@ except importlib_metadata.PackageNotFoundError: _transformers_available = False +_hf_hub_available = importlib.util.find_spec("huggingface_hub") is not None +try: + _hf_hub_version = importlib_metadata.version("huggingface_hub") + logger.debug(f"Successfully imported huggingface_hub version {_hf_hub_version}") +except importlib_metadata.PackageNotFoundError: + _hf_hub_available = False + _inflect_available = importlib.util.find_spec("inflect") is not None try: @@ -767,6 +774,21 @@ def is_transformers_version(operation: str, version: str): return 
compare_versions(parse(_transformers_version), operation, version) +def is_hf_hub_version(operation: str, version: str): + """ + Compares the current Hugging Face Hub version to a given reference with an operation. + + Args: + operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A version string + """ + if not _hf_hub_available: + return False + return compare_versions(parse(_hf_hub_version), operation, version) + + def is_accelerate_version(operation: str, version: str): """ Compares the current Accelerate version to a given reference with an operation. diff --git a/src/diffusers/utils/loading_utils.py b/src/diffusers/utils/loading_utils.py index bac24fa23e63..fd66aaa4da6e 100644 --- a/src/diffusers/utils/loading_utils.py +++ b/src/diffusers/utils/loading_utils.py @@ -148,3 +148,15 @@ def get_module_from_name(module, tensor_name: str) -> Tuple[Any, str]: module = new_module tensor_name = splits[-1] return module, tensor_name + + +def get_submodule_by_name(root_module, module_path: str): + current = root_module + parts = module_path.split(".") + for part in parts: + if part.isdigit(): + idx = int(part) + current = current[idx] # e.g., for nn.ModuleList or nn.Sequential + else: + current = getattr(current, part) + return current diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 3ae74cddcbbf..62156786c6c8 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -478,6 +478,18 @@ def decorator(test_case): return decorator +def require_hf_hub_version_greater(hf_hub_version): + def decorator(test_case): + correct_hf_hub_version = version.parse( + version.parse(importlib.metadata.version("huggingface_hub")).base_version + ) > version.parse(hf_hub_version) + return unittest.skipUnless( + correct_hf_hub_version, f"Test requires huggingface_hub with the version greater than {hf_hub_version}." 
+ )(test_case) + + return decorator + + def require_gguf_version_greater_or_equal(gguf_version): def decorator(test_case): correct_gguf_version = is_gguf_available() and version.parse( diff --git a/tests/models/autoencoders/test_models_autoencoder_ltx_video.py b/tests/models/autoencoders/test_models_autoencoder_ltx_video.py index 37f9837c8245..66d170b28eee 100644 --- a/tests/models/autoencoders/test_models_autoencoder_ltx_video.py +++ b/tests/models/autoencoders/test_models_autoencoder_ltx_video.py @@ -167,3 +167,34 @@ def test_outputs_equivalence(self): @unittest.skip("AutoencoderKLLTXVideo does not support `norm_num_groups` because it does not use GroupNorm.") def test_forward_with_norm_groups(self): pass + + def test_enable_disable_tiling(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_tiling() + output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(), + 0.5, + "VAE tiling should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_tiling() + output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_tiling.detach().cpu().numpy().all(), + output_without_tiling_2.detach().cpu().numpy().all(), + "Without tiling outputs should match with the outputs when tiling is manually disabled.", + ) diff --git a/tests/models/transformers/test_models_transformer_cogvideox.py b/tests/models/transformers/test_models_transformer_cogvideox.py index 73b83b9eb514..2b3cca883d17 100644 --- a/tests/models/transformers/test_models_transformer_cogvideox.py 
+++ b/tests/models/transformers/test_models_transformer_cogvideox.py @@ -33,6 +33,7 @@ class CogVideoXTransformerTests(ModelTesterMixin, unittest.TestCase): model_class = CogVideoXTransformer3DModel main_input_name = "hidden_states" uses_custom_attn_processor = True + model_split_percents = [0.7, 0.7, 0.8] @property def dummy_input(self): diff --git a/tests/models/transformers/test_models_transformer_cogview3plus.py b/tests/models/transformers/test_models_transformer_cogview3plus.py index ec6c58a6734c..91c7c35fbd07 100644 --- a/tests/models/transformers/test_models_transformer_cogview3plus.py +++ b/tests/models/transformers/test_models_transformer_cogview3plus.py @@ -33,6 +33,7 @@ class CogView3PlusTransformerTests(ModelTesterMixin, unittest.TestCase): model_class = CogView3PlusTransformer2DModel main_input_name = "hidden_states" uses_custom_attn_processor = True + model_split_percents = [0.7, 0.6, 0.6] @property def dummy_input(self): diff --git a/tests/pipelines/allegro/test_allegro.py b/tests/pipelines/allegro/test_allegro.py index d09fc0488378..6ca96b19b8ab 100644 --- a/tests/pipelines/allegro/test_allegro.py +++ b/tests/pipelines/allegro/test_allegro.py @@ -14,6 +14,8 @@ import gc import inspect +import os +import tempfile import unittest import numpy as np @@ -24,7 +26,9 @@ from diffusers.utils.testing_utils import ( enable_full_determinism, numpy_cosine_similarity_distance, + require_hf_hub_version_greater, require_torch_gpu, + require_transformers_version_greater, slow, torch_device, ) @@ -297,6 +301,35 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2): "VAE tiling should not affect the inference results", ) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_save_load_dduf(self): + # reimplement because it needs `enable_tiling()` on the loaded pipe. 
+ from huggingface_hub import export_folder_as_dduf + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device="cpu") + inputs.pop("generator") + inputs["generator"] = torch.manual_seed(0) + + pipeline_out = pipe(**inputs)[0].cpu() + + with tempfile.TemporaryDirectory() as tmpdir: + dduf_filename = os.path.join(tmpdir, f"{pipe.__class__.__name__.lower()}.dduf") + pipe.save_pretrained(tmpdir, safe_serialization=True) + export_folder_as_dduf(dduf_filename, folder_path=tmpdir) + loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, dduf_file=dduf_filename).to(torch_device) + + loaded_pipe.vae.enable_tiling() + inputs["generator"] = torch.manual_seed(0) + loaded_pipeline_out = loaded_pipe(**inputs)[0].cpu() + + assert np.allclose(pipeline_out, loaded_pipeline_out) + @slow @require_torch_gpu diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index eddab54a3c03..aaf44985aafd 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -63,6 +63,8 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ] ) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py index fb550dd3219d..95aaa370ef8b 100644 --- a/tests/pipelines/audioldm2/test_audioldm2.py +++ b/tests/pipelines/audioldm2/test_audioldm2.py @@ -70,6 +70,8 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase): ] ) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = AudioLDM2UNet2DConditionModel( @@ -469,8 +471,8 @@ def test_xformers_attention_forwardGenerator_pass(self): pass def test_dict_tuple_outputs_equivalent(self): - # increase tolerance from 
1e-4 -> 2e-4 to account for large composite model - super().test_dict_tuple_outputs_equivalent(expected_max_difference=2e-4) + # increase tolerance from 1e-4 -> 3e-4 to account for large composite model + super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-4) def test_inference_batch_single_identical(self): # increase tolerance from 1e-4 -> 2e-4 to account for large composite model diff --git a/tests/pipelines/blipdiffusion/test_blipdiffusion.py b/tests/pipelines/blipdiffusion/test_blipdiffusion.py index 7e85cef65129..6d422745ce5a 100644 --- a/tests/pipelines/blipdiffusion/test_blipdiffusion.py +++ b/tests/pipelines/blipdiffusion/test_blipdiffusion.py @@ -60,6 +60,8 @@ class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "prompt_reps", ] + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) text_encoder_config = CLIPTextConfig( diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index b12655d989d4..fc8ea5284ccc 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -291,6 +291,8 @@ class StableDiffusionMultiControlNetPipelineFastTests( batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( @@ -523,6 +525,8 @@ class StableDiffusionMultiControlNetOneModelPipelineFastTests( batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py b/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py index 99a238caf53a..b4d3e3aaa8ed 100644 --- 
a/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py +++ b/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py @@ -68,6 +68,8 @@ class BlipDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.Tes "prompt_reps", ] + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) text_encoder_config = CLIPTextConfig( diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index 7c4ae716b37d..516fcc513b99 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -198,6 +198,8 @@ class StableDiffusionMultiControlNetPipelineFastTests( batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index e49106334c2e..0e4dba4265e2 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -257,6 +257,8 @@ class MultiControlNetInpaintPipelineFastTests( params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py index d2c63137c99e..6e752804e2e0 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py @@ -78,6 +78,8 @@ class ControlNetPipelineSDXLFastTests( } ) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = 
UNet2DConditionModel( diff --git a/tests/pipelines/controlnet/test_controlnet_sdxl.py b/tests/pipelines/controlnet/test_controlnet_sdxl.py index ea7fff5537a5..fc15973faeaf 100644 --- a/tests/pipelines/controlnet/test_controlnet_sdxl.py +++ b/tests/pipelines/controlnet/test_controlnet_sdxl.py @@ -487,6 +487,8 @@ class StableDiffusionXLMultiControlNetPipelineFastTests( batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( @@ -692,6 +694,8 @@ class StableDiffusionXLMultiControlNetOneModelPipelineFastTests( batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py b/tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py index 9a2a0019d68b..2cd57ce56d52 100644 --- a/tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py +++ b/tests/pipelines/controlnet_sd3/test_controlnet_inpaint_sd3.py @@ -137,6 +137,8 @@ def get_dummy_components(self): "transformer": transformer, "vae": vae, "controlnet": controlnet, + "image_encoder": None, + "feature_extractor": None, } def get_dummy_inputs(self, device, seed=0): diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py index 13a05855f145..2231821fbc4a 100644 --- a/tests/pipelines/deepfloyd_if/test_if.py +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -26,7 +26,9 @@ from diffusers.utils.testing_utils import ( load_numpy, require_accelerator, + require_hf_hub_version_greater, require_torch_gpu, + require_transformers_version_greater, skip_mps, slow, torch_device, @@ -89,6 +91,11 @@ def 
test_inference_batch_single_identical(self): def test_xformers_attention_forwardGenerator_pass(self): self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_save_load_dduf(self): + super().test_save_load_dduf(atol=1e-2, rtol=1e-2) + @slow @require_torch_gpu diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py index 26ac42831b8b..c6d5384e2467 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -26,7 +26,9 @@ floats_tensor, load_numpy, require_accelerator, + require_hf_hub_version_greater, require_torch_gpu, + require_transformers_version_greater, skip_mps, slow, torch_device, @@ -100,6 +102,11 @@ def test_inference_batch_single_identical(self): expected_max_diff=1e-2, ) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_save_load_dduf(self): + super().test_save_load_dduf(atol=1e-2, rtol=1e-2) + @slow @require_torch_gpu diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index 1d1244c96c33..7cdd8cd147f8 100644 --- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -26,7 +26,9 @@ floats_tensor, load_numpy, require_accelerator, + require_hf_hub_version_greater, require_torch_gpu, + require_transformers_version_greater, skip_mps, slow, torch_device, @@ -97,6 +99,11 @@ def test_inference_batch_single_identical(self): expected_max_diff=1e-2, ) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_save_load_dduf(self): + super().test_save_load_dduf(atol=1e-2, rtol=1e-2) + @slow @require_torch_gpu diff --git 
a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py index 1c4f27403332..9f151190251f 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -26,7 +26,9 @@ floats_tensor, load_numpy, require_accelerator, + require_hf_hub_version_greater, require_torch_gpu, + require_transformers_version_greater, skip_mps, slow, torch_device, @@ -97,6 +99,11 @@ def test_inference_batch_single_identical(self): expected_max_diff=1e-2, ) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_save_load_dduf(self): + super().test_save_load_dduf(atol=1e-2, rtol=1e-2) + @slow @require_torch_gpu diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index fc1b04aacb9b..c2b48bfd6d77 100644 --- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -26,7 +26,9 @@ floats_tensor, load_numpy, require_accelerator, + require_hf_hub_version_greater, require_torch_gpu, + require_transformers_version_greater, skip_mps, slow, torch_device, @@ -99,6 +101,11 @@ def test_inference_batch_single_identical(self): expected_max_diff=1e-2, ) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_save_load_dduf(self): + super().test_save_load_dduf(atol=1e-2, rtol=1e-2) + @slow @require_torch_gpu diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py index bdb9f8a76d8a..57e12899e4fd 100644 --- a/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -26,7 +26,9 @@ floats_tensor, load_numpy, require_accelerator, + require_hf_hub_version_greater, require_torch_gpu, + 
require_transformers_version_greater, skip_mps, slow, torch_device, @@ -92,6 +94,11 @@ def test_inference_batch_single_identical(self): expected_max_diff=1e-2, ) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_save_load_dduf(self): + super().test_save_load_dduf(atol=1e-2, rtol=1e-2) + @slow @require_torch_gpu diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py index ab36333c4056..addc29e14670 100644 --- a/tests/pipelines/flux/test_pipeline_flux.py +++ b/tests/pipelines/flux/test_pipeline_flux.py @@ -209,6 +209,17 @@ def test_flux_image_output_shape(self): output_height, output_width, _ = image.shape assert (output_height, output_width) == (expected_height, expected_width) + def test_flux_true_cfg(self): + pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device) + inputs = self.get_dummy_inputs(torch_device) + inputs.pop("generator") + + no_true_cfg_out = pipe(**inputs, generator=torch.manual_seed(0)).images[0] + inputs["negative_prompt"] = "bad quality" + inputs["true_cfg_scale"] = 2.0 + true_cfg_out = pipe(**inputs, generator=torch.manual_seed(0)).images[0] + assert not np.allclose(no_true_cfg_out, true_cfg_out) + @nightly @require_big_gpu_with_torch_cuda diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py index 592ebd35f4a9..f4d6165f9010 100644 --- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py +++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py @@ -59,6 +59,8 @@ class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unit # No `output_type`. 
required_optional_params = frozenset(["num_inference_steps", "generator", "latents", "return_dict"]) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) scheduler = DDIMScheduler( diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 8553ed96e9e1..1a13ec75d082 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -204,6 +204,8 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ] test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): dummy = Dummies() return dummy.get_dummy_components() diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index a7f861565cc9..3c8767a708d4 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -52,6 +52,8 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase) ] test_xformers_attention = True + supports_dduf = False + def get_dummy_components(self): dummy = Dummies() prior_dummy = PriorDummies() @@ -160,6 +162,8 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te ] test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): dummy = Img2ImgDummies() prior_dummy = PriorDummies() @@ -269,6 +273,8 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te ] test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): dummy = InpaintDummies() prior_dummy = PriorDummies() diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index ea289c5ccd71..23f13ffee223 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -226,6 +226,8 @@ 
class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ] test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): dummies = Dummies() return dummies.get_dummy_components() diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 740046678744..ebb1a4d88739 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -220,6 +220,8 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ] test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): dummies = Dummies() return dummies.get_dummy_components() diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index 5f42447bd9d5..abb53bfb792f 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -184,6 +184,8 @@ class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ] test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): dummy = Dummies() return dummy.get_dummy_components() diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py index dbba0831397b..bbf2f08a7b08 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_combined.py @@ -57,6 +57,8 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa test_xformers_attention = True callback_cfg_params = ["image_embds"] + supports_dduf = False + def get_dummy_components(self): dummy = Dummies() prior_dummy = PriorDummies() @@ -181,6 +183,8 @@ class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest test_xformers_attention = False callback_cfg_params = 
["image_embds"] + supports_dduf = False + def get_dummy_components(self): dummy = Img2ImgDummies() prior_dummy = PriorDummies() @@ -302,6 +306,8 @@ class KandinskyV22PipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest ] test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): dummy = InpaintDummies() prior_dummy = PriorDummies() diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py b/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py index be0bc238d4da..bdec6c132f80 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_prior.py @@ -186,6 +186,8 @@ class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase) callback_cfg_params = ["prompt_embeds", "text_encoder_hidden_states", "text_mask"] test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): dummies = Dummies() return dummies.get_dummy_components() diff --git a/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py index e898824e2d17..0ea32981d518 100644 --- a/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py +++ b/tests/pipelines/kandinsky2_2/test_kandinsky_prior_emb2emb.py @@ -59,6 +59,8 @@ class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.Te ] test_xformers_attention = False + supports_dduf = False + @property def text_embedder_hidden_size(self): return 32 diff --git a/tests/pipelines/kolors/test_kolors.py b/tests/pipelines/kolors/test_kolors.py index de44af6d5908..e88ba0282096 100644 --- a/tests/pipelines/kolors/test_kolors.py +++ b/tests/pipelines/kolors/test_kolors.py @@ -47,6 +47,8 @@ class KolorsPipelineFastTests(PipelineTesterMixin, unittest.TestCase): image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"}) + supports_dduf = False + def 
get_dummy_components(self, time_cond_proj_dim=None): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/kolors/test_kolors_img2img.py b/tests/pipelines/kolors/test_kolors_img2img.py index 2010dbd7055a..9f1ca43a081f 100644 --- a/tests/pipelines/kolors/test_kolors_img2img.py +++ b/tests/pipelines/kolors/test_kolors_img2img.py @@ -51,6 +51,8 @@ class KolorsPipelineImg2ImgFastTests(PipelineTesterMixin, unittest.TestCase): image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"}) + supports_dduf = False + # Copied from tests.pipelines.kolors.test_kolors.KolorsPipelineFastTests.get_dummy_components def get_dummy_components(self, time_cond_proj_dim=None): torch.manual_seed(0) diff --git a/tests/pipelines/lumina/test_lumina_nextdit.py b/tests/pipelines/lumina/test_lumina_nextdit.py index 5fd0dbf06050..e0fd06847b77 100644 --- a/tests/pipelines/lumina/test_lumina_nextdit.py +++ b/tests/pipelines/lumina/test_lumina_nextdit.py @@ -31,6 +31,8 @@ class LuminaText2ImgPipelinePipelineFastTests(unittest.TestCase, PipelineTesterM ) batch_params = frozenset(["prompt", "negative_prompt"]) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) transformer = LuminaNextDiT2DModel( diff --git a/tests/pipelines/musicldm/test_musicldm.py b/tests/pipelines/musicldm/test_musicldm.py index e51f5103933a..bdd536b6ff86 100644 --- a/tests/pipelines/musicldm/test_musicldm.py +++ b/tests/pipelines/musicldm/test_musicldm.py @@ -65,6 +65,8 @@ class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ] ) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/pag/test_pag_kolors.py b/tests/pipelines/pag/test_pag_kolors.py index 8cfb2c3fd16a..cf9466988d85 100644 --- a/tests/pipelines/pag/test_pag_kolors.py +++ b/tests/pipelines/pag/test_pag_kolors.py @@ -56,6 +56,8 @@ 
class KolorsPAGPipelineFastTests( image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"}) + supports_dduf = False + # Copied from tests.pipelines.kolors.test_kolors.KolorsPipelineFastTests.get_dummy_components def get_dummy_components(self, time_cond_proj_dim=None): torch.manual_seed(0) diff --git a/tests/pipelines/pag/test_pag_sana.py b/tests/pipelines/pag/test_pag_sana.py index 12addabeb0a8..a2c657297860 100644 --- a/tests/pipelines/pag/test_pag_sana.py +++ b/tests/pipelines/pag/test_pag_sana.py @@ -53,6 +53,8 @@ class SanaPAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ) test_xformers_attention = False + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) transformer = SanaTransformer2DModel( diff --git a/tests/pipelines/pag/test_pag_sdxl_img2img.py b/tests/pipelines/pag/test_pag_sdxl_img2img.py index 7e5fc5fa28b9..33bd47bfee10 100644 --- a/tests/pipelines/pag/test_pag_sdxl_img2img.py +++ b/tests/pipelines/pag/test_pag_sdxl_img2img.py @@ -82,6 +82,8 @@ class StableDiffusionXLPAGImg2ImgPipelineFastTests( {"add_text_embeds", "add_time_ids", "add_neg_time_ids"} ) + supports_dduf = False + # based on tests.pipelines.stable_diffusion_xl.test_stable_diffusion_xl_img2img_pipeline.get_dummy_components def get_dummy_components( self, skip_first_text_encoder=False, time_cond_proj_dim=None, requires_aesthetics_score=False diff --git a/tests/pipelines/pag/test_pag_sdxl_inpaint.py b/tests/pipelines/pag/test_pag_sdxl_inpaint.py index efc37abd0682..8378b07e9f74 100644 --- a/tests/pipelines/pag/test_pag_sdxl_inpaint.py +++ b/tests/pipelines/pag/test_pag_sdxl_inpaint.py @@ -82,6 +82,8 @@ class StableDiffusionXLPAGInpaintPipelineFastTests( {"add_text_embeds", "add_time_ids", "mask", "masked_image_latents"} ) + supports_dduf = False + # based on 
tests.pipelines.stable_diffusion_xl.test_stable_diffusion_xl_inpaint.StableDiffusionXLInpaintPipelineFastTests.get_dummy_components def get_dummy_components( self, skip_first_text_encoder=False, time_cond_proj_dim=None, requires_aesthetics_score=False diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py index c71e2d4761c2..6b668de2762a 100644 --- a/tests/pipelines/paint_by_example/test_paint_by_example.py +++ b/tests/pipelines/paint_by_example/test_paint_by_example.py @@ -46,6 +46,8 @@ class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase): batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS image_params = frozenset([]) # TO_DO: update the image_prams once refactored VaeImageProcessor.preprocess + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py index f3661355e9dd..ac7096874b31 100644 --- a/tests/pipelines/shap_e/test_shap_e_img2img.py +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -50,6 +50,8 @@ class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ] test_xformers_attention = False + supports_dduf = False + @property def text_embedder_hidden_size(self): return 16 diff --git a/tests/pipelines/stable_audio/test_stable_audio.py b/tests/pipelines/stable_audio/test_stable_audio.py index 41ac94891c6f..b2ca3ddd0e84 100644 --- a/tests/pipelines/stable_audio/test_stable_audio.py +++ b/tests/pipelines/stable_audio/test_stable_audio.py @@ -70,6 +70,7 @@ class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ) # There is not xformers version of the StableAudioPipeline custom attention processor test_xformers_attention = False + supports_dduf = False def get_dummy_components(self): torch.manual_seed(0) diff --git 
a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 01a0a3abe4ee..430d99781a25 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -76,6 +76,8 @@ class StableDiffusionDepth2ImgPipelineFastTests( image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"depth_mask"}) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_inpaint.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_inpaint.py index 464ef6d017df..a37ea3fc39c5 100644 --- a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_inpaint.py +++ b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3_inpaint.py @@ -106,6 +106,8 @@ def get_dummy_components(self): "tokenizer_3": tokenizer_3, "transformer": transformer, "vae": vae, + "image_encoder": None, + "feature_extractor": None, } def get_dummy_inputs(self, device, seed=0): diff --git a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py index 2a1e691e9e8f..15f298c67e11 100644 --- a/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py +++ b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py @@ -389,6 +389,8 @@ def test_stable_diffusion_adapter_default_case(self): class StableDiffusionMultiAdapterPipelineFastTests(AdapterTests, PipelineTesterMixin, unittest.TestCase): + supports_dduf = False + def get_dummy_components(self, time_cond_proj_dim=None): return super().get_dummy_components("multi_adapter", time_cond_proj_dim=time_cond_proj_dim) diff --git 
a/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py b/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py index 748702541b1e..15e4c60db82d 100644 --- a/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py +++ b/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py @@ -66,6 +66,8 @@ class GligenTextImagePipelineFastTests( image_params = TEXT_TO_IMAGE_IMAGE_PARAMS image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py index 7a3b0f70ccb1..d7567afdee1f 100644 --- a/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py @@ -58,6 +58,8 @@ class StableDiffusionImageVariationPipelineFastTests( # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess image_latents_params = frozenset([]) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py index 7c7b03786563..23291b0407aa 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_adapter.py @@ -422,6 +422,8 @@ def test_adapter_sdxl_lcm_custom_timesteps(self): class StableDiffusionXLMultiAdapterPipelineFastTests( StableDiffusionXLAdapterPipelineFastTests, PipelineTesterMixin, unittest.TestCase ): + 
supports_dduf = False + def get_dummy_components(self, time_cond_proj_dim=None): return super().get_dummy_components("multi_adapter", time_cond_proj_dim=time_cond_proj_dim) diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py index db0905a48310..ceec86a811c0 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -77,6 +77,8 @@ class StableDiffusionXLImg2ImgPipelineFastTests( {"add_text_embeds", "add_time_ids", "add_neg_time_ids"} ) + supports_dduf = False + def get_dummy_components(self, skip_first_text_encoder=False, time_cond_proj_dim=None): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py index 964c7123dd32..c759f4c112d9 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py @@ -72,6 +72,8 @@ class StableDiffusionXLInpaintPipelineFastTests( } ) + supports_dduf = False + def get_dummy_components(self, skip_first_text_encoder=False, time_cond_proj_dim=None): torch.manual_seed(0) unet = UNet2DConditionModel( diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index a5cbf7761501..34f2553a9184 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -51,6 +51,8 @@ class StableUnCLIPImg2ImgPipelineFastTests( ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess image_latents_params = frozenset([]) + supports_dduf = False + def get_dummy_components(self): embedder_hidden_size = 32 
embedder_projection_dim = embedder_hidden_size diff --git a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py index ac9acb26afd3..352477ecec56 100644 --- a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py +++ b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py @@ -58,6 +58,8 @@ class StableVideoDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCa ] ) + supports_dduf = False + def get_dummy_components(self): torch.manual_seed(0) unet = UNetSpatioTemporalConditionModel( diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 423c82e0602e..6665a005ba96 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -75,9 +75,11 @@ nightly, require_compel, require_flax, + require_hf_hub_version_greater, require_onnxruntime, require_torch_2, require_torch_gpu, + require_transformers_version_greater, run_test_in_subprocess, slow, torch_device, @@ -981,6 +983,18 @@ def test_download_ignore_files(self): assert not any(f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"] for f in files) assert len(files) == 14 + def test_download_dduf_with_custom_pipeline_raises_error(self): + with self.assertRaises(NotImplementedError): + _ = DiffusionPipeline.download( + "DDUF/tiny-flux-dev-pipe-dduf", dduf_file="fluxpipeline.dduf", custom_pipeline="my_pipeline" + ) + + def test_download_dduf_with_connected_pipeline_raises_error(self): + with self.assertRaises(NotImplementedError): + _ = DiffusionPipeline.download( + "DDUF/tiny-flux-dev-pipe-dduf", dduf_file="fluxpipeline.dduf", load_connected_pipeline=True + ) + def test_get_pipeline_class_from_flax(self): flax_config = {"_class_name": "FlaxStableDiffusionPipeline"} config = {"_class_name": "StableDiffusionPipeline"} @@ -1802,6 +1816,55 @@ def test_pipe_same_device_id_offload(self): sd.maybe_free_model_hooks() assert 
sd._offload_gpu_id == 5 + @parameterized.expand([torch.float32, torch.float16]) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_load_dduf_from_hub(self, dtype): + with tempfile.TemporaryDirectory() as tmpdir: + pipe = DiffusionPipeline.from_pretrained( + "DDUF/tiny-flux-dev-pipe-dduf", dduf_file="fluxpipeline.dduf", cache_dir=tmpdir, torch_dtype=dtype + ).to(torch_device) + out_1 = pipe(prompt="dog", num_inference_steps=5, generator=torch.manual_seed(0), output_type="np").images + + pipe.save_pretrained(tmpdir) + loaded_pipe = DiffusionPipeline.from_pretrained(tmpdir, torch_dtype=dtype).to(torch_device) + + out_2 = loaded_pipe( + prompt="dog", num_inference_steps=5, generator=torch.manual_seed(0), output_type="np" + ).images + + self.assertTrue(np.allclose(out_1, out_2, atol=1e-4, rtol=1e-4)) + + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_load_dduf_from_hub_local_files_only(self): + with tempfile.TemporaryDirectory() as tmpdir: + pipe = DiffusionPipeline.from_pretrained( + "DDUF/tiny-flux-dev-pipe-dduf", dduf_file="fluxpipeline.dduf", cache_dir=tmpdir + ).to(torch_device) + out_1 = pipe(prompt="dog", num_inference_steps=5, generator=torch.manual_seed(0), output_type="np").images + + local_files_pipe = DiffusionPipeline.from_pretrained( + "DDUF/tiny-flux-dev-pipe-dduf", dduf_file="fluxpipeline.dduf", cache_dir=tmpdir, local_files_only=True + ).to(torch_device) + out_2 = local_files_pipe( + prompt="dog", num_inference_steps=5, generator=torch.manual_seed(0), output_type="np" + ).images + + self.assertTrue(np.allclose(out_1, out_2, atol=1e-4, rtol=1e-4)) + + def test_dduf_raises_error_with_custom_pipeline(self): + with self.assertRaises(NotImplementedError): + _ = DiffusionPipeline.from_pretrained( + "DDUF/tiny-flux-dev-pipe-dduf", dduf_file="fluxpipeline.dduf", custom_pipeline="my_pipeline" + ) + + def 
test_dduf_raises_error_with_connected_pipeline(self): + with self.assertRaises(NotImplementedError): + _ = DiffusionPipeline.from_pretrained( + "DDUF/tiny-flux-dev-pipe-dduf", dduf_file="fluxpipeline.dduf", load_connected_pipeline=True + ) + def test_wrong_model(self): tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") with self.assertRaises(ValueError) as error_context: @@ -1812,6 +1875,27 @@ def test_wrong_model(self): assert "is of type" in str(error_context.exception) assert "but should be" in str(error_context.exception) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_dduf_load_sharded_checkpoint_diffusion_model(self): + with tempfile.TemporaryDirectory() as tmpdir: + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-flux-dev-pipe-sharded-checkpoint-DDUF", + dduf_file="tiny-flux-dev-pipe-sharded-checkpoint.dduf", + cache_dir=tmpdir, + ).to(torch_device) + + out_1 = pipe(prompt="dog", num_inference_steps=5, generator=torch.manual_seed(0), output_type="np").images + + pipe.save_pretrained(tmpdir) + loaded_pipe = DiffusionPipeline.from_pretrained(tmpdir).to(torch_device) + + out_2 = loaded_pipe( + prompt="dog", num_inference_steps=5, generator=torch.manual_seed(0), output_type="np" + ).images + + self.assertTrue(np.allclose(out_1, out_2, atol=1e-4, rtol=1e-4)) + @slow @require_torch_gpu diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index f5494fbade2e..83b628e09f88 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -43,7 +43,9 @@ CaptureLogger, require_accelerate_version_greater, require_accelerator, + require_hf_hub_version_greater, require_torch, + require_transformers_version_greater, skip_mps, torch_device, ) @@ -986,6 +988,8 @@ class PipelineTesterMixin: test_xformers_attention = True + supports_dduf = True + def get_generator(self, seed): device = 
torch_device if torch_device != "mps" else "cpu" generator = torch.Generator(device).manual_seed(seed) @@ -1990,6 +1994,39 @@ def test_StableDiffusionMixin_component(self): ) ) + @require_hf_hub_version_greater("0.26.5") + @require_transformers_version_greater("4.47.1") + def test_save_load_dduf(self, atol=1e-4, rtol=1e-4): + if not self.supports_dduf: + return + + from huggingface_hub import export_folder_as_dduf + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device="cpu") + inputs.pop("generator") + inputs["generator"] = torch.manual_seed(0) + + pipeline_out = pipe(**inputs)[0] + + with tempfile.TemporaryDirectory() as tmpdir: + dduf_filename = os.path.join(tmpdir, f"{pipe.__class__.__name__.lower()}.dduf") + pipe.save_pretrained(tmpdir, safe_serialization=True) + export_folder_as_dduf(dduf_filename, folder_path=tmpdir) + loaded_pipe = self.pipeline_class.from_pretrained(tmpdir, dduf_file=dduf_filename).to(torch_device) + + inputs["generator"] = torch.manual_seed(0) + loaded_pipeline_out = loaded_pipe(**inputs)[0] + + if isinstance(pipeline_out, np.ndarray) and isinstance(loaded_pipeline_out, np.ndarray): + assert np.allclose(pipeline_out, loaded_pipeline_out, atol=atol, rtol=rtol) + elif isinstance(pipeline_out, torch.Tensor) and isinstance(loaded_pipeline_out, torch.Tensor): + assert torch.allclose(pipeline_out, loaded_pipeline_out, atol=atol, rtol=rtol) + @is_staging_test class PipelinePushToHubTester(unittest.TestCase): diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py index dfc3acc0c0f2..23a6cd6663b7 100644 --- a/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/tests/pipelines/unclip/test_unclip_image_variation.py @@ -66,6 +66,7 @@ class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCa 
"super_res_num_inference_steps", ] test_xformers_attention = False + supports_dduf = False @property def text_embedder_hidden_size(self): diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py index 2e0ba1cfb8eb..310e46a2e8c6 100644 --- a/tests/pipelines/unidiffuser/test_unidiffuser.py +++ b/tests/pipelines/unidiffuser/test_unidiffuser.py @@ -86,6 +86,8 @@ class UniDiffuserPipelineFastTests( # vae_latents, not latents, is the argument that corresponds to VAE latent inputs image_latents_params = frozenset(["vae_latents"]) + supports_dduf = False + def get_dummy_components(self): unet = UniDiffuserModel.from_pretrained( "hf-internal-testing/unidiffuser-diffusers-test", diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 1e631114f038..a9b9ab753084 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -20,6 +20,7 @@ import numpy as np import pytest import safetensors.torch +from huggingface_hub import hf_hub_download from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel from diffusers.utils import is_accelerate_version, logging @@ -568,6 +569,27 @@ def test_quality(self): max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) self.assertTrue(max_diff < 1e-3) + def test_lora_loading(self): + self.pipeline_4bit.load_lora_weights( + hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors"), adapter_name="hyper-sd" + ) + self.pipeline_4bit.set_adapters("hyper-sd", adapter_weights=0.125) + + output = self.pipeline_4bit( + prompt=self.prompt, + height=256, + width=256, + max_sequence_length=64, + output_type="np", + num_inference_steps=8, + generator=torch.Generator().manual_seed(42), + ).images + out_slice = output[0, -3:, -3:, -1].flatten() + expected_slice = np.array([0.5347, 0.5342, 0.5283, 0.5093, 0.4988, 0.5093, 0.5044, 0.5015, 0.4946]) + + 
max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) + self.assertTrue(max_diff < 1e-3) + @slow class BaseBnb4BitSerializationTests(Base4bitTests): diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index b223c71cb5ce..2661196afc70 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -18,6 +18,7 @@ import numpy as np import pytest +from huggingface_hub import hf_hub_download from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel, logging from diffusers.utils import is_accelerate_version @@ -30,6 +31,7 @@ numpy_cosine_similarity_distance, require_accelerate, require_bitsandbytes_version_greater, + require_peft_version_greater, require_torch, require_torch_gpu, require_transformers_version_greater, @@ -509,6 +511,29 @@ def test_quality(self): max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) self.assertTrue(max_diff < 1e-3) + @require_peft_version_greater("0.14.0") + def test_lora_loading(self): + self.pipeline_8bit.load_lora_weights( + hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors"), adapter_name="hyper-sd" + ) + self.pipeline_8bit.set_adapters("hyper-sd", adapter_weights=0.125) + + output = self.pipeline_8bit( + prompt=self.prompt, + height=256, + width=256, + max_sequence_length=64, + output_type="np", + num_inference_steps=8, + generator=torch.manual_seed(42), + ).images + out_slice = output[0, -3:, -3:, -1].flatten() + + expected_slice = np.array([0.3916, 0.3916, 0.3887, 0.4243, 0.4155, 0.4233, 0.4570, 0.4531, 0.4248]) + + max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) + self.assertTrue(max_diff < 1e-3) + @slow class BaseBnb8bitSerializationTests(Base8bitTests): diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py index 3c3f13db9b1c..7d1503b91f97 100644 --- 
a/tests/quantization/torchao/test_torchao.py +++ b/tests/quantization/torchao/test_torchao.py @@ -476,6 +476,18 @@ def test_wrong_config(self): with self.assertRaises(ValueError): self.get_dummy_components(TorchAoConfig("int42")) + def test_sequential_cpu_offload(self): + r""" + A test that checks if inference runs as expected when sequential cpu offloading is enabled. + """ + quantization_config = TorchAoConfig("int8wo") + components = self.get_dummy_components(quantization_config) + pipe = FluxPipeline(**components) + pipe.enable_sequential_cpu_offload() + + inputs = self.get_dummy_inputs(torch_device) + _ = pipe(**inputs) + # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners @require_torch diff --git a/tests/single_file/single_file_testing_utils.py b/tests/single_file/single_file_testing_utils.py index 0917bbe2b0d7..4e7bc0af6842 100644 --- a/tests/single_file/single_file_testing_utils.py +++ b/tests/single_file/single_file_testing_utils.py @@ -47,6 +47,8 @@ def download_diffusers_config(repo_id, tmpdir): class SDSingleFileTesterMixin: + single_file_kwargs = {} + def _compare_component_configs(self, pipe, single_file_pipe): for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): if param_name in ["torch_dtype", "architectures", "_name_or_path"]: @@ -154,7 +156,7 @@ def test_single_file_components_with_original_config_local_files_only( self._compare_component_configs(pipe, single_file_pipe) def test_single_file_format_inference_is_same_as_pretrained(self, expected_max_diff=1e-4): - sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None) + sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None, **self.single_file_kwargs) sf_pipe.unet.set_attn_processor(AttnProcessor()) sf_pipe.enable_model_cpu_offload(device=torch_device) @@ -170,7 +172,7 @@ def test_single_file_format_inference_is_same_as_pretrained(self, expected_max_d max_diff = 
numpy_cosine_similarity_distance(image.flatten(), image_single_file.flatten()) - assert max_diff < expected_max_diff + assert max_diff < expected_max_diff, f"{image.flatten()} != {image_single_file.flatten()}" def test_single_file_components_with_diffusers_config( self, diff --git a/tests/single_file/test_model_flux_transformer_single_file.py b/tests/single_file/test_model_flux_transformer_single_file.py new file mode 100644 index 000000000000..0ec97db26a9e --- /dev/null +++ b/tests/single_file/test_model_flux_transformer_single_file.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import torch + +from diffusers import ( + FluxTransformer2DModel, +) +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + require_torch_accelerator, + torch_device, +) + + +enable_full_determinism() + + +@require_torch_accelerator +class FluxTransformer2DModelSingleFileTests(unittest.TestCase): + model_class = FluxTransformer2DModel + ckpt_path = "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors" + alternate_keys_ckpt_paths = ["https://huggingface.co/Comfy-Org/flux1-dev/blob/main/flux1-dev-fp8.safetensors"] + + repo_id = "black-forest-labs/FLUX.1-dev" + + def setUp(self): + super().setUp() + gc.collect() + backend_empty_cache(torch_device) + + def tearDown(self): + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def test_single_file_components(self): + model = self.model_class.from_pretrained(self.repo_id, subfolder="transformer") + model_single_file = self.model_class.from_single_file(self.ckpt_path) + + PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"] + for param_name, param_value in model_single_file.config.items(): + if param_name in PARAMS_TO_IGNORE: + continue + assert ( + model.config[param_name] == param_value + ), f"{param_name} differs between single file loading and pretrained loading" + + def test_checkpoint_loading(self): + for ckpt_path in self.alternate_keys_ckpt_paths: + torch.cuda.empty_cache() + model = self.model_class.from_single_file(ckpt_path) + + del model + gc.collect() + torch.cuda.empty_cache() diff --git a/tests/single_file/test_stable_diffusion_single_file.py b/tests/single_file/test_stable_diffusion_single_file.py index dd15a5c7c071..78baeb94929c 100644 --- a/tests/single_file/test_stable_diffusion_single_file.py +++ b/tests/single_file/test_stable_diffusion_single_file.py @@ -132,6 +132,7 @@ class 
StableDiffusionInstructPix2PixPipelineSingleFileSlowTests(unittest.TestCas "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/refs/heads/main/configs/generate.yaml" ) repo_id = "timbrooks/instruct-pix2pix" + single_file_kwargs = {"extract_ema": True} def setUp(self): super().setUp()