Add docstrings to Foundation Models #248

Merged: 17 commits, Feb 2, 2024
12 changes: 11 additions & 1 deletion docs/reference/foundationals/latent_diffusion.md
@@ -1 +1,11 @@
::: refiners.foundationals.latent_diffusion
::: refiners.foundationals.latent_diffusion.auto_encoder

::: refiners.foundationals.latent_diffusion.stable_diffusion_xl

::: refiners.foundationals.latent_diffusion.stable_diffusion_1

::: refiners.foundationals.latent_diffusion.solvers

::: refiners.foundationals.latent_diffusion.lora

::: refiners.foundationals.latent_diffusion.image_prompt
2 changes: 1 addition & 1 deletion src/refiners/fluxion/utils.py
@@ -221,7 +221,7 @@ def load_tensors(path: Path | str, /, device: Device | str = "cpu") -> dict[str,

Warning:
Still, **only load data you trust** and favor using
[`load_from_safetensors`](refiners.fluxion.utils.load_from_safetensors) instead.
[`load_from_safetensors`][refiners.fluxion.utils.load_from_safetensors] instead.
"""
# see https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
with warnings.catch_warnings():
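The warning in this hunk steers readers from `load_tensors` toward `load_from_safetensors`. A minimal sketch of that recommendation, assuming `load_from_safetensors` keeps its documented `(path, device=...)` signature and using a hypothetical checkpoint filename:

```python
from refiners.fluxion.utils import load_from_safetensors

# Safetensors files carry no pickled executable code, which is why the docstring
# favors this loader over load_tensors. The path below is illustrative only.
state_dict = load_from_safetensors("clip_text.safetensors", device="cpu")

# Inspect a few tensor names to confirm the checkpoint loaded as expected.
print(sorted(state_dict)[:5])
```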
21 changes: 21 additions & 0 deletions src/refiners/foundationals/clip/__init__.py
@@ -0,0 +1,21 @@
from refiners.foundationals.clip.image_encoder import (
CLIPImageEncoder,
CLIPImageEncoderG,
CLIPImageEncoderH,
)
from refiners.foundationals.clip.text_encoder import (
CLIPTextEncoder,
CLIPTextEncoderG,
CLIPTextEncoderH,
CLIPTextEncoderL,
)

__all__ = [
"CLIPTextEncoder",
"CLIPTextEncoderL",
"CLIPTextEncoderH",
"CLIPTextEncoderG",
"CLIPImageEncoder",
"CLIPImageEncoderG",
"CLIPImageEncoderH",
]
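This new `__init__.py` re-exports the CLIP encoders at the package level, so downstream code no longer needs to spell out the submodules. A small sketch of what the re-exports enable:

```python
# Package-level imports made possible by the __init__.py added above.
from refiners.foundationals.clip import CLIPImageEncoderH, CLIPTextEncoderL

# Equivalent pre-existing submodule imports, kept here for comparison:
# from refiners.foundationals.clip.image_encoder import CLIPImageEncoderH
# from refiners.foundationals.clip.text_encoder import CLIPTextEncoderL
```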
60 changes: 60 additions & 0 deletions src/refiners/foundationals/clip/image_encoder.py
@@ -108,6 +108,12 @@ def __init__(


class CLIPImageEncoder(fl.Chain):
"""Contrastive Language-Image Pretraining (CLIP) image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.
"""

def __init__(
self,
image_size: int = 224,
@@ -121,6 +127,20 @@ def __init__(
device: Device | str | None = None,
dtype: DType | None = None,
) -> None:
"""Initialize a CLIP image encoder.

Args:
image_size: The size of the input image.
embedding_dim: The dimension of the embedding.
output_dim: The dimension of the output.
patch_size: The size of the patches.
num_layers: The number of layers.
num_attention_heads: The number of attention heads.
feedforward_dim: The dimension of the feedforward layer.
layer_norm_eps: The epsilon value for normalization.
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
self.image_size = image_size
self.embedding_dim = embedding_dim
self.output_dim = output_dim
@@ -152,7 +172,27 @@ def __init__(


class CLIPImageEncoderH(CLIPImageEncoder):
"""CLIP huge image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 1280
output_dim (int): 1024
patch_size (int): 14
num_layers (int): 32
num_attention_heads (int): 16
feedforward_dim (int): 5120
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP huge image encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__(
embedding_dim=1280,
output_dim=1024,
@@ -166,7 +206,27 @@ def __init__(self, device: Device | str | None = None, dtype: DType | None = Non


class CLIPImageEncoderG(CLIPImageEncoder):
"""CLIP giant image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 1664
output_dim (int): 1280
patch_size (int): 14
num_layers (int): 48
num_attention_heads (int): 16
feedforward_dim (int): 8192
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP giant image encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__(
embedding_dim=1664,
output_dim=1280,
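As the new docstrings state, the preconfigured variants only expose `device` and `dtype`; the architecture hyperparameters are fixed. A hedged sketch of constructing the huge variant (the forward call and output shape are assumptions based on the documented defaults, not shown in this diff):

```python
import torch
from refiners.foundationals.clip import CLIPImageEncoderH

encoder = CLIPImageEncoderH(device="cpu", dtype=torch.float32)

# A dummy batch of one 224x224 RGB image, matching the default image_size=224.
image = torch.randn(1, 3, 224, 224)

# The encoder is an fl.Chain, so it is called directly on the tensor; the
# resulting embedding's feature dimension should match the documented
# output_dim=1024 for this variant (unverified here).
embedding = encoder(image)
print(embedding.shape)
```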
97 changes: 75 additions & 22 deletions src/refiners/foundationals/clip/text_encoder.py
@@ -71,6 +71,12 @@ def __init__(


class CLIPTextEncoder(fl.Chain):
"""Contrastive Language-Image Pretraining (CLIP) text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.
"""

def __init__(
self,
embedding_dim: int = 768,
@@ -85,6 +91,21 @@ def __init__(
device: Device | str | None = None,
dtype: DType | None = None,
) -> None:
"""Initialize CLIP text encoder.

Args:
embedding_dim: The embedding dimension.
max_sequence_length: The maximum sequence length.
vocabulary_size: The vocabulary size.
num_layers: The number of layers.
num_attention_heads: The number of attention heads.
feedforward_dim: The feedforward dimension.
layer_norm_eps: The epsilon value for layer normalization.
use_quick_gelu: Whether to use the quick GeLU activation function.
tokenizer: The tokenizer.
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
self.embedding_dim = embedding_dim
self.max_sequence_length = max_sequence_length
self.vocabulary_size = vocabulary_size
@@ -129,19 +150,30 @@ def __init__(


class CLIPTextEncoderL(CLIPTextEncoder):
"""
CLIPTextEncoderL is the CLIP text encoder with the following parameters:
embedding_dim=768
num_layers=12
num_attention_heads=12
feedforward_dim=3072
use_quick_gelu=True

We replace the GeLU activation function with an approximate GeLU to comply with the original CLIP implementation
of OpenAI (https://github.com/openai/CLIP/blob/main/clip/model.py#L166)
"""CLIP large text encoder.

Note:
We replace the GeLU activation function with an approximate GeLU to comply with the original CLIP implementation
of OpenAI (https://github.com/openai/CLIP/blob/a1d0717/clip/model.py#L166)

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 768
num_layers (int): 12
num_attention_heads (int): 12
feedforward_dim (int): 3072
use_quick_gelu (bool): True
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP large text encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__(
embedding_dim=768,
num_layers=12,
@@ -154,15 +186,25 @@ def __init__(self, device: Device | str | None = None, dtype: DType | None = Non


class CLIPTextEncoderH(CLIPTextEncoder):
"""
CLIPTextEncoderH is the CLIP text encoder with the following parameters:
embedding_dim=1024
num_layers=23
num_attention_heads=16
feedforward_dim=4096
"""CLIP huge text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 1024
num_layers (int): 23
num_attention_heads (int): 16
feedforward_dim (int): 4096
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP huge text encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__(
embedding_dim=1024,
num_layers=23,
@@ -174,15 +216,26 @@ def __init__(self, device: Device | str | None = None, dtype: DType | None = Non


class CLIPTextEncoderG(CLIPTextEncoder):
"""
CLIPTextEncoderG is the CLIP text encoder with the following parameters:
embedding_dim=1280
num_layers=32
num_attention_heads=16
feedforward_dim=5120
"""CLIP giant text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 1280
num_layers (int): 32
num_attention_heads (int): 20
feedforward_dim (int): 5120
tokenizer (CLIPTokenizer): CLIPTokenizer(pad_token_id=0)
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP giant text encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
tokenizer = CLIPTokenizer(pad_token_id=0)
super().__init__(
embedding_dim=1280,
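Because `CLIPTextEncoder` is an `fl.Chain` whose `tokenizer` argument is documented above, the preconfigured variants can plausibly be called directly on a prompt string. A minimal sketch under that assumption (output shape is inferred from the documented defaults, 77 tokens and `embedding_dim=768` for the L variant, and should be verified against the full source):

```python
import torch
from refiners.foundationals.clip import CLIPTextEncoderL

text_encoder = CLIPTextEncoderL(device="cpu", dtype=torch.float32)

# The chain is assumed to tokenize internally, so a plain string is passed in.
embedding = text_encoder("a photo of a cat")

# Expected roughly (1, max_sequence_length, embedding_dim), i.e. (1, 77, 768).
print(embedding.shape)
```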