Add docstrings to Foundation Models #248

Merged: 17 commits, Feb 2, 2024
12 changes: 11 additions & 1 deletion docs/reference/foundationals/latent_diffusion.md
@@ -1 +1,11 @@
::: refiners.foundationals.latent_diffusion
::: refiners.foundationals.latent_diffusion.auto_encoder

::: refiners.foundationals.latent_diffusion.stable_diffusion_xl

::: refiners.foundationals.latent_diffusion.stable_diffusion_1

::: refiners.foundationals.latent_diffusion.solvers

::: refiners.foundationals.latent_diffusion.lora

::: refiners.foundationals.latent_diffusion.image_prompt
2 changes: 1 addition & 1 deletion src/refiners/fluxion/utils.py
@@ -221,7 +221,7 @@ def load_tensors(path: Path | str, /, device: Device | str = "cpu") -> dict[str,

Warning:
Still, **only load data you trust** and favor using
[`load_from_safetensors`](refiners.fluxion.utils.load_from_safetensors) instead.
[`load_from_safetensors`][refiners.fluxion.utils.load_from_safetensors] instead.
"""
# see https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
with warnings.catch_warnings():
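The warning in this hunk steers readers from `load_tensors` toward `load_from_safetensors`. A minimal sketch of that recommendation, assuming `load_from_safetensors` keeps its documented `(path, device=...)` signature and using a hypothetical checkpoint filename:

```python
from refiners.fluxion.utils import load_from_safetensors

# Safetensors files carry no pickled executable code, which is why the docstring
# favors this loader over load_tensors. The path below is illustrative only.
state_dict = load_from_safetensors("clip_text.safetensors", device="cpu")

# Inspect a few tensor names to confirm the checkpoint loaded as expected.
print(sorted(state_dict)[:5])
```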
21 changes: 21 additions & 0 deletions src/refiners/foundationals/clip/__init__.py
@@ -0,0 +1,21 @@
from refiners.foundationals.clip.image_encoder import (
CLIPImageEncoder,
CLIPImageEncoderG,
CLIPImageEncoderH,
)
from refiners.foundationals.clip.text_encoder import (
CLIPTextEncoder,
CLIPTextEncoderG,
CLIPTextEncoderH,
CLIPTextEncoderL,
)

__all__ = [
"CLIPTextEncoder",
"CLIPTextEncoderL",
"CLIPTextEncoderH",
"CLIPTextEncoderG",
"CLIPImageEncoder",
"CLIPImageEncoderG",
"CLIPImageEncoderH",
]
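This new `__init__.py` re-exports the CLIP encoders at the package level, so downstream code no longer needs to spell out the submodules. A small sketch of what the re-exports enable:

```python
# Package-level imports made possible by the __init__.py added above.
from refiners.foundationals.clip import CLIPImageEncoderH, CLIPTextEncoderL

# Equivalent pre-existing submodule imports, kept here for comparison:
# from refiners.foundationals.clip.image_encoder import CLIPImageEncoderH
# from refiners.foundationals.clip.text_encoder import CLIPTextEncoderL
```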
60 changes: 60 additions & 0 deletions src/refiners/foundationals/clip/image_encoder.py
@@ -108,6 +108,12 @@ def __init__(


class CLIPImageEncoder(fl.Chain):
"""Contrastive Language-Image Pretraining (CLIP) image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.
"""

def __init__(
self,
image_size: int = 224,
@@ -121,6 +127,20 @@ def __init__(
device: Device | str | None = None,
dtype: DType | None = None,
) -> None:
"""Initialize a CLIP image encoder.

Args:
image_size: The size of the input image.
embedding_dim: The dimension of the embedding.
output_dim: The dimension of the output.
patch_size: The size of the patches.
num_layers: The number of layers.
num_attention_heads: The number of attention heads.
feedforward_dim: The dimension of the feedforward layer.
layer_norm_eps: The epsilon value for normalization.
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
self.image_size = image_size
self.embedding_dim = embedding_dim
self.output_dim = output_dim
@@ -152,7 +172,27 @@ def __init__(


class CLIPImageEncoderH(CLIPImageEncoder):
"""CLIP huge image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 1280
output_dim (int): 1024
patch_size (int): 14
num_layers (int): 32
num_attention_heads (int): 16
feedforward_dim (int): 5120
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP huge image encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__(
embedding_dim=1280,
output_dim=1024,
@@ -166,7 +206,27 @@ def __init__(self, device: Device | str | None = None, dtype: DType | None = Non


class CLIPImageEncoderG(CLIPImageEncoder):
"""CLIP giant image encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 1664
output_dim (int): 1280
patch_size (int): 14
num_layers (int): 48
num_attention_heads (int): 16
feedforward_dim (int): 8192
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP giant image encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__(
embedding_dim=1664,
output_dim=1280,
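As the new docstrings state, the preconfigured variants only expose `device` and `dtype`; the architecture hyperparameters are fixed. A hedged sketch of constructing the huge variant (the forward call and output shape are assumptions based on the documented defaults, not shown in this diff):

```python
import torch
from refiners.foundationals.clip import CLIPImageEncoderH

encoder = CLIPImageEncoderH(device="cpu", dtype=torch.float32)

# A dummy batch of one 224x224 RGB image, matching the default image_size=224.
image = torch.randn(1, 3, 224, 224)

# The encoder is an fl.Chain, so it is called directly on the tensor; the
# resulting embedding's feature dimension should match the documented
# output_dim=1024 for this variant (unverified here).
embedding = encoder(image)
print(embedding.shape)
```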
97 changes: 75 additions & 22 deletions src/refiners/foundationals/clip/text_encoder.py
@@ -71,6 +71,12 @@ def __init__(


class CLIPTextEncoder(fl.Chain):
"""Contrastive Language-Image Pretraining (CLIP) text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.
"""

def __init__(
self,
embedding_dim: int = 768,
@@ -85,6 +91,21 @@ def __init__(
device: Device | str | None = None,
dtype: DType | None = None,
) -> None:
"""Initialize CLIP text encoder.

Args:
embedding_dim: The embedding dimension.
max_sequence_length: The maximum sequence length.
vocabulary_size: The vocabulary size.
num_layers: The number of layers.
num_attention_heads: The number of attention heads.
feedforward_dim: The feedforward dimension.
layer_norm_eps: The epsilon value for layer normalization.
use_quick_gelu: Whether to use the quick GeLU activation function.
tokenizer: The tokenizer.
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
self.embedding_dim = embedding_dim
self.max_sequence_length = max_sequence_length
self.vocabulary_size = vocabulary_size
@@ -129,19 +150,30 @@ def __init__(


class CLIPTextEncoderL(CLIPTextEncoder):
"""
CLIPTextEncoderL is the CLIP text encoder with the following parameters:
embedding_dim=768
num_layers=12
num_attention_heads=12
feedforward_dim=3072
use_quick_gelu=True

We replace the GeLU activation function with an approximate GeLU to comply with the original CLIP implementation
of OpenAI (https://github.com/openai/CLIP/blob/main/clip/model.py#L166)
"""CLIP large text encoder.

Note:
We replace the GeLU activation function with an approximate GeLU to comply with the original CLIP implementation
of OpenAI (https://github.com/openai/CLIP/blob/a1d0717/clip/model.py#L166)

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 768
num_layers (int): 12
num_attention_heads (int): 12
feedforward_dim (int): 3072
use_quick_gelu (bool): True
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP large text encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__(
embedding_dim=768,
num_layers=12,
@@ -154,15 +186,25 @@ def __init__(self, device: Device | str | None = None, dtype: DType | None = Non


class CLIPTextEncoderH(CLIPTextEncoder):
"""
CLIPTextEncoderH is the CLIP text encoder with the following parameters:
embedding_dim=1024
num_layers=23
num_attention_heads=16
feedforward_dim=4096
"""CLIP huge text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 1024
num_layers (int): 23
num_attention_heads (int): 16
feedforward_dim (int): 4096
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP huge text encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
super().__init__(
embedding_dim=1024,
num_layers=23,
@@ -174,15 +216,26 @@ def __init__(self, device: Device | str | None = None, dtype: DType | None = Non


class CLIPTextEncoderG(CLIPTextEncoder):
"""
CLIPTextEncoderG is the CLIP text encoder with the following parameters:
embedding_dim=1280
num_layers=32
num_attention_heads=16
feedforward_dim=5120
"""CLIP giant text encoder.

See [[arXiv:2103.00020] Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
for more details.

Attributes:
embedding_dim (int): 1280
num_layers (int): 32
num_attention_heads (int): 20
feedforward_dim (int): 5120
tokenizer (CLIPTokenizer): CLIPTokenizer(pad_token_id=0)
"""

def __init__(self, device: Device | str | None = None, dtype: DType | None = None) -> None:
"""Initialize CLIP giant text encoder.

Args:
device: The PyTorch device to use.
dtype: The PyTorch data type to use.
"""
tokenizer = CLIPTokenizer(pad_token_id=0)
super().__init__(
embedding_dim=1280,
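Because `CLIPTextEncoder` is an `fl.Chain` whose `tokenizer` argument is documented above, the preconfigured variants can plausibly be called directly on a prompt string. A minimal sketch under that assumption (output shape is inferred from the documented defaults, 77 tokens and `embedding_dim=768` for the L variant, and should be verified against the full source):

```python
import torch
from refiners.foundationals.clip import CLIPTextEncoderL

text_encoder = CLIPTextEncoderL(device="cpu", dtype=torch.float32)

# The chain is assumed to tokenize internally, so a plain string is passed in.
embedding = text_encoder("a photo of a cat")

# Expected roughly (1, max_sequence_length, embedding_dim), i.e. (1, 77, 768).
print(embedding.shape)
```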