diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index b623576e3990..1fa1bdf259cc 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -109,6 +109,7 @@
     "autoencoder-dc-sana": "encoder.project_in.conv.bias",
     "mochi-1-preview": ["model.diffusion_model.blocks.0.attn.qkv_x.weight", "blocks.0.attn.qkv_x.weight"],
     "hunyuan-video": "txt_in.individual_token_refiner.blocks.0.adaLN_modulation.1.bias",
+    "instruct-pix2pix": "model.diffusion_model.input_blocks.0.0.weight",
 }
 
 DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
@@ -165,6 +166,7 @@
     "autoencoder-dc-f32c32-sana": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers"},
     "mochi-1-preview": {"pretrained_model_name_or_path": "genmo/mochi-1-preview"},
     "hunyuan-video": {"pretrained_model_name_or_path": "hunyuanvideo-community/HunyuanVideo"},
+    "instruct-pix2pix": {"pretrained_model_name_or_path": "timbrooks/instruct-pix2pix"},
 }
 
 # Use to configure model sample size when original config is provided
@@ -633,6 +635,12 @@ def infer_diffusers_model_type(checkpoint):
     elif CHECKPOINT_KEY_NAMES["hunyuan-video"] in checkpoint:
         model_type = "hunyuan-video"
 
+    elif (
+        CHECKPOINT_KEY_NAMES["instruct-pix2pix"] in checkpoint
+        and checkpoint[CHECKPOINT_KEY_NAMES["instruct-pix2pix"]].shape[1] == 8
+    ):
+        model_type = "instruct-pix2pix"
+
     else:
         model_type = "v1"
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index fd89b195c778..af40fe14f8ab 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -22,16 +22,23 @@
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
-from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ...utils import PIL_INTERPOLATION, deprecate, is_torch_xla_available, logging
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
 
 
+if is_torch_xla_available():
+    import torch_xla.core.xla_model as xm
+
+    XLA_AVAILABLE = True
+else:
+    XLA_AVAILABLE = False
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
@@ -79,6 +86,7 @@ class StableDiffusionInstructPix2PixPipeline(
     TextualInversionLoaderMixin,
     StableDiffusionLoraLoaderMixin,
     IPAdapterMixin,
+    FromSingleFileMixin,
 ):
     r"""
     Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion).
@@ -457,6 +465,9 @@ def __call__(
                         step_idx = i // getattr(self.scheduler, "order", 1)
                         callback(step_idx, t, latents)
 
+                if XLA_AVAILABLE:
+                    xm.mark_step()
+
         if not output_type == "latent":
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
diff --git a/tests/single_file/test_stable_diffusion_single_file.py b/tests/single_file/test_stable_diffusion_single_file.py
index 71afda1b80bb..dd15a5c7c071 100644
--- a/tests/single_file/test_stable_diffusion_single_file.py
+++ b/tests/single_file/test_stable_diffusion_single_file.py
@@ -4,11 +4,13 @@
 import torch
 
-from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline
+from diffusers import EulerDiscreteScheduler, StableDiffusionInstructPix2PixPipeline, StableDiffusionPipeline
 from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name
+from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
+    nightly,
     require_torch_accelerator,
     slow,
     torch_device,
@@ -118,3 +120,44 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0
 
     def test_single_file_format_inference_is_same_as_pretrained(self):
         super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
+
+
+@nightly
+@slow
+@require_torch_accelerator
+class StableDiffusionInstructPix2PixPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
+    pipeline_class = StableDiffusionInstructPix2PixPipeline
+    ckpt_path = "https://huggingface.co/timbrooks/instruct-pix2pix/blob/main/instruct-pix2pix-00-22000.safetensors"
+    original_config = (
+        "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/refs/heads/main/configs/generate.yaml"
+    )
+    repo_id = "timbrooks/instruct-pix2pix"
+
+    def setUp(self):
+        super().setUp()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        backend_empty_cache(torch_device)
+
+    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
+        generator = torch.Generator(device=generator_device).manual_seed(seed)
+        image = load_image(
+            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
+        )
+        inputs = {
+            "prompt": "turn him into a cyborg",
+            "image": image,
+            "generator": generator,
+            "num_inference_steps": 3,
+            "guidance_scale": 7.5,
+            "image_guidance_scale": 1.0,
+            "output_type": "np",
+        }
+        return inputs
+
+    def test_single_file_format_inference_is_same_as_pretrained(self):
+        super().test_single_file_format_inference_is_same_as_pretrained(expected_max_diff=1e-3)
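
For context, and not part of the patch itself: a minimal sketch of the end-to-end usage this change enables, namely loading the original InstructPix2Pix .safetensors checkpoint directly through from_single_file() (the entry point provided by FromSingleFileMixin). The checkpoint and image URLs are the ones used in the test above; the device, dtype, step count, and output filename are illustrative assumptions.

import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.utils import load_image

# Single-file checkpoint referenced by the test above; infer_diffusers_model_type()
# now detects it as "instruct-pix2pix" via the 8-channel input_blocks.0.0.weight key.
ckpt_path = "https://huggingface.co/timbrooks/instruct-pix2pix/blob/main/instruct-pix2pix-00-22000.safetensors"

pipe = StableDiffusionInstructPix2PixPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16)
pipe.to("cuda")  # assumed device; on XLA devices the new xm.mark_step() call runs each denoising step

image = load_image(
    "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
)
edited = pipe(
    "turn him into a cyborg",
    image=image,
    num_inference_steps=20,  # illustrative; the test uses 3 for speed
    guidance_scale=7.5,
    image_guidance_scale=1.0,
).images[0]
edited.save("cyborg.png")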