From 72b0cce27d350be33524e234ac903f8e8daa2fce Mon Sep 17 00:00:00 2001
From: sayakpaul <spsayakpaul@gmail.com>
Date: Mon, 11 Dec 2023 10:29:57 +0530
Subject: [PATCH 1/5] feat: introduce autoencoders module

---
 src/diffusers/models/__init__.py              | 20 +++++++------
 src/diffusers/models/autoencoders/__init__.py |  5 ++++
 .../{ => autoencoders}/autoencoder_asym_kl.py |  8 +++---
 .../{ => autoencoders}/autoencoder_kl.py      | 12 ++++----
 .../autoencoder_kl_temporal_decoder.py        | 16 +++++------
 .../{ => autoencoders}/autoencoder_tiny.py    |  8 +++---
 .../consistency_decoder_vae.py                | 28 +++++++++----------
 .../models/{ => autoencoders}/vae.py          | 10 +++----
 .../wuerstchen/modeling_paella_vq_model.py    |  2 +-
 9 files changed, 58 insertions(+), 51 deletions(-)
 create mode 100644 src/diffusers/models/autoencoders/__init__.py
 rename src/diffusers/models/{ => autoencoders}/autoencoder_asym_kl.py (97%)
 rename src/diffusers/models/{ => autoencoders}/autoencoder_kl.py (98%)
 rename src/diffusers/models/{ => autoencoders}/autoencoder_kl_temporal_decoder.py (97%)
 rename src/diffusers/models/{ => autoencoders}/autoencoder_tiny.py (98%)
 rename src/diffusers/models/{ => autoencoders}/consistency_decoder_vae.py (95%)
 rename src/diffusers/models/{ => autoencoders}/vae.py (99%)

diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index ec1c7ab43494..99a14fed52f3 100644
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -26,11 +26,11 @@
 
 if is_torch_available():
     _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"]
-    _import_structure["autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"]
-    _import_structure["autoencoder_kl"] = ["AutoencoderKL"]
     _import_structure["autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
-    _import_structure["autoencoder_tiny"] = ["AutoencoderTiny"]
-    _import_structure["consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
+    _import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"]
+    _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
+    _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"]
+    _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
     _import_structure["controlnet"] = ["ControlNetModel"]
     _import_structure["controlnetxs"] = ["ControlNetXSModel"]
     _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"]
@@ -58,11 +58,13 @@
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     if is_torch_available():
         from .adapter import MultiAdapter, T2IAdapter
-        from .autoencoder_asym_kl import AsymmetricAutoencoderKL
-        from .autoencoder_kl import AutoencoderKL
-        from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
-        from .autoencoder_tiny import AutoencoderTiny
-        from .consistency_decoder_vae import ConsistencyDecoderVAE
+        from .autoencoders import (
+            AsymmetricAutoencoderKL,
+            AutoencoderKL,
+            AutoencoderKLTemporalDecoder,
+            AutoencoderTiny,
+            ConsistencyDecoderVAE,
+        )
         from .controlnet import ControlNetModel
         from .controlnetxs import ControlNetXSModel
         from .dual_transformer_2d import DualTransformer2DModel
diff --git a/src/diffusers/models/autoencoders/__init__.py b/src/diffusers/models/autoencoders/__init__.py
new file mode 100644
index 000000000000..201a40ff17b2
--- /dev/null
+++ b/src/diffusers/models/autoencoders/__init__.py
@@ -0,0 +1,5 @@
+from .autoencoder_asym_kl import AsymmetricAutoencoderKL
+from .autoencoder_kl import AutoencoderKL
+from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
+from .autoencoder_tiny import AutoencoderTiny
+from .consistency_decoder_vae import ConsistencyDecoderVAE
diff --git a/src/diffusers/models/autoencoder_asym_kl.py b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py
similarity index 97%
rename from src/diffusers/models/autoencoder_asym_kl.py
rename to src/diffusers/models/autoencoders/autoencoder_asym_kl.py
index 678e47234096..9114650619fc 100644
--- a/src/diffusers/models/autoencoder_asym_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_asym_kl.py
@@ -16,10 +16,10 @@
 import torch
 import torch.nn as nn
 
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils.accelerate_utils import apply_forward_hook
-from .modeling_outputs import AutoencoderKLOutput
-from .modeling_utils import ModelMixin
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils.accelerate_utils import apply_forward_hook
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
 from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder
 
 
diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
similarity index 98%
rename from src/diffusers/models/autoencoder_kl.py
rename to src/diffusers/models/autoencoders/autoencoder_kl.py
index 8fa3574125f9..ae2d90c548f8 100644
--- a/src/diffusers/models/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -16,10 +16,10 @@
 import torch
 import torch.nn as nn
 
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..loaders import FromOriginalVAEMixin
-from ..utils.accelerate_utils import apply_forward_hook
-from .attention_processor import (
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalVAEMixin
+from ...utils.accelerate_utils import apply_forward_hook
+from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
     Attention,
@@ -27,8 +27,8 @@
     AttnAddedKVProcessor,
     AttnProcessor,
 )
-from .modeling_outputs import AutoencoderKLOutput
-from .modeling_utils import ModelMixin
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
diff --git a/src/diffusers/models/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
similarity index 97%
rename from src/diffusers/models/autoencoder_kl_temporal_decoder.py
rename to src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
index 176b6e0df924..0b7f8d1f5336 100644
--- a/src/diffusers/models/autoencoder_kl_temporal_decoder.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -16,14 +16,14 @@
 import torch
 import torch.nn as nn
 
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..loaders import FromOriginalVAEMixin
-from ..utils import is_torch_version
-from ..utils.accelerate_utils import apply_forward_hook
-from .attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
-from .modeling_outputs import AutoencoderKLOutput
-from .modeling_utils import ModelMixin
-from .unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalVAEMixin
+from ...utils import is_torch_version
+from ...utils.accelerate_utils import apply_forward_hook
+from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
+from ..unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder
 from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
diff --git a/src/diffusers/models/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py
similarity index 98%
rename from src/diffusers/models/autoencoder_tiny.py
rename to src/diffusers/models/autoencoders/autoencoder_tiny.py
index 56ccf30e0402..08b1c0e74d70 100644
--- a/src/diffusers/models/autoencoder_tiny.py
+++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py
@@ -18,10 +18,10 @@
 
 import torch
 
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput
-from ..utils.accelerate_utils import apply_forward_hook
-from .modeling_utils import ModelMixin
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...utils import BaseOutput
+from ...utils.accelerate_utils import apply_forward_hook
+from ..modeling_utils import ModelMixin
 from .vae import DecoderOutput, DecoderTiny, EncoderTiny
 
 
diff --git a/src/diffusers/models/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
similarity index 95%
rename from src/diffusers/models/consistency_decoder_vae.py
rename to src/diffusers/models/autoencoders/consistency_decoder_vae.py
index 34176a35e835..d92423eafc31 100644
--- a/src/diffusers/models/consistency_decoder_vae.py
+++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
@@ -18,20 +18,20 @@
 import torch.nn.functional as F
 from torch import nn
 
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..schedulers import ConsistencyDecoderScheduler
-from ..utils import BaseOutput
-from ..utils.accelerate_utils import apply_forward_hook
-from ..utils.torch_utils import randn_tensor
-from .attention_processor import (
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...schedulers import ConsistencyDecoderScheduler
+from ...utils import BaseOutput
+from ...utils.accelerate_utils import apply_forward_hook
+from ...utils.torch_utils import randn_tensor
+from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
     CROSS_ATTENTION_PROCESSORS,
     AttentionProcessor,
     AttnAddedKVProcessor,
     AttnProcessor,
 )
-from .modeling_utils import ModelMixin
-from .unet_2d import UNet2DModel
+from ..modeling_utils import ModelMixin
+from ..unet_2d import UNet2DModel
 from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
@@ -153,7 +153,7 @@ def __init__(
         self.use_slicing = False
         self.use_tiling = False
 
-    # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.enable_tiling
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_tiling
     def enable_tiling(self, use_tiling: bool = True):
         r"""
         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
@@ -162,7 +162,7 @@ def enable_tiling(self, use_tiling: bool = True):
         """
         self.use_tiling = use_tiling
 
-    # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.disable_tiling
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_tiling
     def disable_tiling(self):
         r"""
         Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
@@ -170,7 +170,7 @@ def disable_tiling(self):
         """
         self.enable_tiling(False)
 
-    # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.enable_slicing
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_slicing
     def enable_slicing(self):
         r"""
         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
@@ -178,7 +178,7 @@ def enable_slicing(self):
         """
         self.use_slicing = True
 
-    # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.disable_slicing
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_slicing
     def disable_slicing(self):
         r"""
         Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
@@ -333,14 +333,14 @@ def decode(
 
         return DecoderOutput(sample=x_0)
 
-    # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.blend_v
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_v
     def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
         blend_extent = min(a.shape[2], b.shape[2], blend_extent)
         for y in range(blend_extent):
             b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
         return b
 
-    # Copied from diffusers.models.autoencoder_kl.AutoencoderKL.blend_h
+    # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_h
     def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
         blend_extent = min(a.shape[3], b.shape[3], blend_extent)
         for x in range(blend_extent):
diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/autoencoders/vae.py
similarity index 99%
rename from src/diffusers/models/vae.py
rename to src/diffusers/models/autoencoders/vae.py
index 0049456e2187..9ed0232e6983 100644
--- a/src/diffusers/models/vae.py
+++ b/src/diffusers/models/autoencoders/vae.py
@@ -18,11 +18,11 @@
 import torch
 import torch.nn as nn
 
-from ..utils import BaseOutput, is_torch_version
-from ..utils.torch_utils import randn_tensor
-from .activations import get_activation
-from .attention_processor import SpatialNorm
-from .unet_2d_blocks import (
+from ...utils import BaseOutput, is_torch_version
+from ...utils.torch_utils import randn_tensor
+from ..activations import get_activation
+from ..attention_processor import SpatialNorm
+from ..unet_2d_blocks import (
     AutoencoderTinyBlock,
     UNetMidBlock2D,
     get_down_block,
diff --git a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py
index 7ee42faa0e82..3115cc2d9d3d 100644
--- a/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py
+++ b/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py
@@ -19,8 +19,8 @@
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
+from ...models.autoencoders.vae import DecoderOutput, VectorQuantizer
 from ...models.modeling_utils import ModelMixin
-from ...models.vae import DecoderOutput, VectorQuantizer
 from ...models.vq_model import VQEncoderOutput
 from ...utils.accelerate_utils import apply_forward_hook
 

From ce20aa1e3cd228df6f6c2c4bfed64d99171ec467 Mon Sep 17 00:00:00 2001
From: sayakpaul <spsayakpaul@gmail.com>
Date: Mon, 11 Dec 2023 10:30:48 +0530
Subject: [PATCH 2/5] fix: point remaining vae imports at models.autoencoders.vae

---
 scripts/convert_consistency_decoder.py | 2 +-
 src/diffusers/models/vq_model.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/convert_consistency_decoder.py b/scripts/convert_consistency_decoder.py
index 6a294038a5a3..3319f4c4665e 100644
--- a/scripts/convert_consistency_decoder.py
+++ b/scripts/convert_consistency_decoder.py
@@ -12,9 +12,9 @@
 from tqdm import tqdm
 
 from diffusers import AutoencoderKL, ConsistencyDecoderVAE, DiffusionPipeline, StableDiffusionPipeline, UNet2DModel
+from diffusers.models.autoencoders.vae import Encoder
 from diffusers.models.embeddings import TimestepEmbedding
 from diffusers.models.unet_2d_blocks import ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, UNetMidBlock2D
-from diffusers.models.vae import Encoder
 
 
 args = ArgumentParser()
diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py
index f4a6c8fb227f..bfe62ec863b3 100644
--- a/src/diffusers/models/vq_model.py
+++ b/src/diffusers/models/vq_model.py
@@ -20,8 +20,8 @@
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..utils import BaseOutput
 from ..utils.accelerate_utils import apply_forward_hook
+from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
 from .modeling_utils import ModelMixin
-from .vae import Decoder, DecoderOutput, Encoder, VectorQuantizer
 
 
 @dataclass

From 43ca493f8834e0ab940eb981409bfc91f01848f4 Mon Sep 17 00:00:00 2001
From: sayakpaul <spsayakpaul@gmail.com>
Date: Mon, 11 Dec 2023 10:33:12 +0530
Subject: [PATCH 3/5] docs: update autodoc paths to models.autoencoders

---
 docs/source/en/api/models/asymmetricautoencoderkl.md | 6 +++---
 docs/source/en/api/models/autoencoder_tiny.md        | 2 +-
 docs/source/en/api/models/autoencoderkl.md           | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/en/api/models/asymmetricautoencoderkl.md b/docs/source/en/api/models/asymmetricautoencoderkl.md
index 1e102943c5e4..fdc71df7a999 100644
--- a/docs/source/en/api/models/asymmetricautoencoderkl.md
+++ b/docs/source/en/api/models/asymmetricautoencoderkl.md
@@ -49,12 +49,12 @@ make_image_grid([original_image, mask_image, image], rows=1, cols=3)
 
 ## AsymmetricAutoencoderKL
 
-[[autodoc]] models.autoencoder_asym_kl.AsymmetricAutoencoderKL
+[[autodoc]] models.autoencoders.autoencoder_asym_kl.AsymmetricAutoencoderKL
 
 ## AutoencoderKLOutput
 
-[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
 
 ## DecoderOutput
 
-[[autodoc]] models.vae.DecoderOutput
+[[autodoc]] models.autoencoders.vae.DecoderOutput
diff --git a/docs/source/en/api/models/autoencoder_tiny.md b/docs/source/en/api/models/autoencoder_tiny.md
index 1d19539bffe8..b5c9dc638e6f 100644
--- a/docs/source/en/api/models/autoencoder_tiny.md
+++ b/docs/source/en/api/models/autoencoder_tiny.md
@@ -54,4 +54,4 @@ image
 
 ## AutoencoderTinyOutput
 
-[[autodoc]] models.autoencoder_tiny.AutoencoderTinyOutput
+[[autodoc]] models.autoencoders.autoencoder_tiny.AutoencoderTinyOutput
diff --git a/docs/source/en/api/models/autoencoderkl.md b/docs/source/en/api/models/autoencoderkl.md
index f42a4d2941dd..72427ab30e6a 100644
--- a/docs/source/en/api/models/autoencoderkl.md
+++ b/docs/source/en/api/models/autoencoderkl.md
@@ -36,11 +36,11 @@ model = AutoencoderKL.from_single_file(url)
 
 ## AutoencoderKLOutput
 
-[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput
+[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
 
 ## DecoderOutput
 
-[[autodoc]] models.vae.DecoderOutput
+[[autodoc]] models.autoencoders.vae.DecoderOutput
 
 ## FlaxAutoencoderKL
 

From 45e61d69781e0703420c64dd4726295f1d512ca5 Mon Sep 17 00:00:00 2001
From: sayakpaul <spsayakpaul@gmail.com>
Date: Mon, 11 Dec 2023 10:39:09 +0530
Subject: [PATCH 4/5] fix: import structure in init.

---
 src/diffusers/models/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index 99a14fed52f3..7487bbf2f98e 100644
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -26,9 +26,9 @@
 
 if is_torch_available():
     _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"]
-    _import_structure["autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
     _import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"]
     _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
+    _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
     _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"]
     _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
     _import_structure["controlnet"] = ["ControlNetModel"]

From 6f489532adf875cadedfda3d986a3e5d1e6358c0 Mon Sep 17 00:00:00 2001
From: sayakpaul <spsayakpaul@gmail.com>
Date: Mon, 11 Dec 2023 10:44:22 +0530
Subject: [PATCH 5/5] fix controlnetxs import

---
 src/diffusers/models/controlnetxs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py
index 41f2d8af01b1..3cc77fe70d72 100644
--- a/src/diffusers/models/controlnetxs.py
+++ b/src/diffusers/models/controlnetxs.py
@@ -26,7 +26,7 @@
 from .attention_processor import (
     AttentionProcessor,
 )
-from .autoencoder_kl import AutoencoderKL
+from .autoencoders import AutoencoderKL
 from .lora import LoRACompatibleConv
 from .modeling_utils import ModelMixin
 from .unet_2d_blocks import (