From 2246f13ab3fabf2acc048892e9d95305b05df0dc Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 16 Apr 2024 02:36:57 +0000
Subject: [PATCH 01/40] Improve: Fetch modalities separately
---
.gitignore | 3 +-
python/uform/__init__.py | 78 ++++++++++++++++++++-----
python/uform/onnx_models.py | 13 +++--
python/uform/preprocessing.py | 105 ----------------------------------
python/uform/torch_models.py | 5 +-
5 files changed, 78 insertions(+), 126 deletions(-)
delete mode 100644 python/uform/preprocessing.py
diff --git a/.gitignore b/.gitignore
index af7d4af..fbc703a 100755
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ package-lock.json
*.onnx
__pycache__
.build
-.swiftpm
\ No newline at end of file
+.swiftpm
+node_modules
\ No newline at end of file
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 1ecb242..f1bca3a 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,30 +1,80 @@
from json import load
-from os.path import join
+from os.path import join, exists
from typing import Mapping, Optional, Tuple
+from enum import Enum
from huggingface_hub import snapshot_download
-def get_checkpoint(model_name: str, token: str) -> Tuple[str, Mapping, str]:
- import torch
-
- model_path = snapshot_download(repo_id=model_name, token=token)
- config_path = join(model_path, "torch_config.json")
+class Modality(Enum):
+ TEXT = "text"
+ IMAGE = "image"
- state = torch.load(join(model_path, "torch_weight.pt"))
- return config_path, state, join(model_path, "tokenizer.json")
+def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str]) -> Tuple[str, Mapping, str]:
+ import torch
-def get_model(model_name: str, token: Optional[str] = None):
- from uform.torch_models import VLM
+ # It is not recommended to use `.pth` extension when checkpointing models
+ # because it collides with Python path (`.pth`) configuration files.
+ merged_model_names = ["torch_weight.pt", "weights.pt", "model.pt"]
+ separate_modality_names = [str(x) + ".pt" for x in modalities]
+ config_names = ["config.json", "torch_config.json"]
+ tokenizer_names = ["tokenizer.json"]
+
+ # The download stats depend on the number of times the `config.json` is pulled
+ # https://huggingface.co/docs/hub/models-download-stats
+ model_path = snapshot_download(
+ repo_id=model_name,
+ token=token,
+ allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names,
+ )
+
+ # Find the first name in `config_names` that is present
+ config_path = None
+ for config_name in config_names:
+ if exists(join(model_path, config_name)):
+ config_path = join(model_path, config_name)
+ break
+
+ # Same for the tokenizer
+ tokenizer_path = None
+ for tokenizer_name in tokenizer_names:
+ if exists(join(model_path, tokenizer_name)):
+ tokenizer_path = join(model_path, tokenizer_name)
+ break
+
+    # Ideally, we want to fetch each modality separately. For now, prefer a merged
+    # checkpoint if one is present; otherwise load the per-modality checkpoints and merge them.
+ state = None
+ for file_name in merged_model_names:
+ if exists(join(model_path, file_name)):
+ state = torch.load(join(model_path, file_name))
+ break
+
+ if state is None:
+ state = {}
+ for file_name in separate_modality_names:
+ if exists(join(model_path, file_name)):
+ modality_name, _, _ = file_name.partition(".")
+ property_name = modality_name + "_encoder"
+ state[property_name] = torch.load(join(model_path, file_name))
+
+ return config_path, state, tokenizer_path
+
+
+def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None):
+ from uform.torch_models import TextVisualEncoder
from uform.torch_preprocessor import TorchProcessor
- config_path, state, tokenizer_path = get_checkpoint(model_name, token)
+ if modalities is None:
+ modalities = (Modality.TEXT, Modality.IMAGE)
+
+ config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities)
with open(config_path) as f:
config = load(f)
- model = VLM(config, tokenizer_path)
+ model = TextVisualEncoder(config, tokenizer_path)
model.image_encoder.load_state_dict(state["image_encoder"])
model.text_encoder.load_state_dict(state["text_encoder"])
processor = TorchProcessor(config, tokenizer_path)
@@ -33,7 +83,7 @@ def get_model(model_name: str, token: Optional[str] = None):
def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None):
- from uform.onnx_models import VLM_ONNX
+ from uform.onnx_models import TextVisualEncoder
from uform.numpy_preprocessor import NumPyProcessor
assert device in (
@@ -53,7 +103,7 @@ def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str
with open(join(model_path, "config.json")) as f:
config = load(f)
- model = VLM_ONNX(model_path, config, device, dtype)
+ model = TextVisualEncoder(model_path, config, device, dtype)
processor = NumPyProcessor(config, join(model_path, "tokenizer.json"))
return model, processor
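
For context, a minimal usage sketch of the reworked loader, assuming the `unum-cloud/uform-vl-english-small` checkpoint used elsewhere in this repo (the token may stay `None` for public models):

```python
# Sketch only: both modalities are fetched by default when `modalities` is omitted.
import uform
from uform import Modality

model, processor = uform.get_model(
    "unum-cloud/uform-vl-english-small",
    token=None,  # or a HuggingFace Hub access token
    modalities=(Modality.TEXT, Modality.IMAGE),
)
```
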
diff --git a/python/uform/onnx_models.py b/python/uform/onnx_models.py
index 8e2a87a..68255de 100644
--- a/python/uform/onnx_models.py
+++ b/python/uform/onnx_models.py
@@ -23,7 +23,7 @@ def available_providers(device: str) -> Tuple[str, ...]:
return cpu_providers
-class VisualEncoderONNX:
+class VisualEncoder:
def __init__(self, model_path: str, device: str):
"""
:param model_path: Path to onnx model
@@ -43,7 +43,7 @@ def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]:
return self.session.run(None, {"images": images})
-class TextEncoderONNX:
+class TextEncoder:
def __init__(self, text_encoder_path: str, reranker_path: str, device: str):
"""
:param text_encoder_path: Path to onnx of text encoder
@@ -82,7 +82,7 @@ def forward_multimodal(
)
-class VLM_ONNX:
+class TextVisualEncoder:
def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
assert device in (
"cpu",
@@ -103,13 +103,13 @@ def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
self._text_encoder_dim = config["text_encoder"]["dim"]
self._image_encoder_dim = config["image_encoder"]["dim"]
- self.text_encoder = TextEncoderONNX(
+ self.text_encoder = TextEncoder(
join(checkpoint_path, f"text_encoder.onnx"),
join(checkpoint_path, f"reranker.onnx"),
device,
)
- self.image_encoder = VisualEncoderONNX(join(checkpoint_path, f"image_encoder.onnx"), device)
+ self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device)
def encode_image(
self,
@@ -229,3 +229,6 @@ def embedding_dim(self) -> int:
def multimodal_embedding_dim(self) -> int:
"""Dimensionality of multimodal joint embedding."""
return self._text_encoder_dim
+
+
+VLM_ONNX = TextVisualEncoder # legacy
diff --git a/python/uform/preprocessing.py b/python/uform/preprocessing.py
deleted file mode 100644
index d3d833e..0000000
--- a/python/uform/preprocessing.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from os import PathLike
-from typing import Dict, List, Union
-
-import torch
-from PIL import Image
-from tokenizers import Tokenizer
-from torch import Tensor
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
- Normalize, Resize, ToTensor)
-
-
-# lambda is not pickable
-def convert_to_rgb(image):
- return image.convert("RGB")
-
-
-class Processor:
- def __init__(self, config: Dict, tokenizer_path: PathLike, tensor_type: str = "pt"):
- """
- :param config: model config
- :param tokenizer_path: path to tokenizer file
- :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
- """
-
- assert tensor_type in ("pt", "np"), "`tensor_type` must be either `pt` or `np`"
-
- self._image_size = config["image_encoder"]["image_size"]
- self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
- self._tokenizer = Tokenizer.from_file(tokenizer_path)
- self._tokenizer.no_padding()
- self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
- self.tensor_type = tensor_type
-
- self._image_transform = Compose(
- [
- Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
- convert_to_rgb,
- CenterCrop(self._image_size),
- ToTensor(),
- Normalize(
- mean=(0.48145466, 0.4578275, 0.40821073),
- std=(0.26862954, 0.26130258, 0.27577711),
- ),
- ],
- )
-
- def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
- """Transforms one or more strings into dictionary with tokenized strings and attention masks.
-
- :param texts: text of list of texts to tokenizer
- """
- if isinstance(texts, str):
- texts = [texts]
-
- input_ids = torch.full(
- (len(texts), self._max_seq_len),
- fill_value=self._pad_token_idx,
- dtype=torch.int64,
- )
-
- attention_mask = torch.zeros(
- len(texts),
- self._max_seq_len,
- dtype=torch.int32,
- )
- encoded = self._tokenizer.encode_batch(texts)
-
- for i, seq in enumerate(encoded):
- seq_len = min(len(seq), self._max_seq_len)
- input_ids[i, :seq_len] = torch.LongTensor(
- seq.ids[: self._max_seq_len],
- )
- attention_mask[i, :seq_len] = 1
-
- if self.tensor_type == "np":
- return {
- "input_ids": input_ids.numpy(),
- "attention_mask": attention_mask.numpy(),
- }
-
- return {"input_ids": input_ids, "attention_mask": attention_mask}
-
- def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
- """Transforms one or more Pillow images into Torch Tensors.
-
- :param images: image or list of images to preprocess
- """
-
- if isinstance(images, list):
- batch_images = torch.empty(
- (len(images), 3, self._image_size, self._image_size),
- dtype=torch.float32,
- )
-
- for i, image in enumerate(images):
- batch_images[i] = self._image_transform(image)
-
- else:
- batch_images = self._image_transform(images).unsqueeze(0)
-
- if self.tensor_type == "np":
- return batch_images.numpy()
-
- return batch_images
diff --git a/python/uform/torch_models.py b/python/uform/torch_models.py
index ab86622..c4f0bcb 100644
--- a/python/uform/torch_models.py
+++ b/python/uform/torch_models.py
@@ -353,7 +353,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return embeddings
-class VLM(nn.Module):
+class TextVisualEncoder(nn.Module):
"""
Vision-Language Model for Multimodal embeddings.
"""
@@ -503,3 +503,6 @@ def embedding_dim(self) -> int:
def multimodal_embedding_dim(self) -> int:
"""Dimensionality of multimodal joint embedding."""
return self.text_encoder.dim
+
+
+VLM = TextVisualEncoder # legacy
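
The `VLM` and `VLM_ONNX` aliases added at the end of both modules keep pre-rename imports working; a short sketch of what that preserves:

```python
# Sketch: legacy names still resolve, but now point at the renamed classes.
from uform.torch_models import VLM        # alias of TextVisualEncoder
from uform.onnx_models import VLM_ONNX    # alias of the ONNX TextVisualEncoder

assert VLM.__name__ == "TextVisualEncoder"
```
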
From b310e908e2e9fbd9be58fb9e36527ee767e16600 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 16 Apr 2024 02:55:30 +0000
Subject: [PATCH 02/40] Fix: Compatibility with older models
---
python/scripts/test_embeddings.py | 19 +++++++++++++++----
python/uform/__init__.py | 12 ++++++------
python/uform/torch_models.py | 3 ++-
3 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_embeddings.py
index d71bf0b..9cdd4c5 100644
--- a/python/scripts/test_embeddings.py
+++ b/python/scripts/test_embeddings.py
@@ -1,4 +1,5 @@
from typing import Tuple
+import os
import pytest
from PIL import Image
@@ -21,6 +22,7 @@
onnx_available = False
torch_models = [
+ "unum-cloud/uform-vl2-english-small",
"unum-cloud/uform-vl-english",
"unum-cloud/uform-vl-multilingual-v2",
]
@@ -34,11 +36,20 @@
("unum-cloud/uform-vl-english-large", "gpu", "fp16"),
]
+# Let's check if the HuggingFace Hub API token is set in the environment variable.
+# If it's not there, check if the `.hf_token` file is present in the current working directory.
+token = os.getenv("HUGGINGFACE_HUB_TOKEN", None)
+if token is None:
+ token_path = "./.hf_token"
+ if os.path.exists(token_path):
+ with open(token_path, "r") as file:
+ token = file.read().strip()
+
@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
@pytest.mark.parametrize("model_name", torch_models)
def test_torch_one_embedding(model_name: str):
- model, processor = uform.get_model(model_name)
+ model, processor = uform.get_model(model_name, token=token)
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
@@ -67,7 +78,7 @@ def test_torch_one_embedding(model_name: str):
@pytest.mark.parametrize("model_name", torch_models)
@pytest.mark.parametrize("batch_size", [1, 2])
def test_torch_many_embeddings(model_name: str, batch_size: int):
- model, processor = uform.get_model(model_name)
+ model, processor = uform.get_model(model_name, token=token)
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
@@ -90,7 +101,7 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]):
try:
- model, processor = uform.get_model_onnx(*model_specs)
+ model, processor = uform.get_model_onnx(*model_specs, token=token)
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
@@ -126,7 +137,7 @@ def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int
try:
- model, processor = uform.get_model_onnx(*model_specs)
+ model, processor = uform.get_model_onnx(*model_specs, token=token)
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
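
To run these tests with an authenticated Hub session, the lookup added above expects the credential in one of two places; a minimal sketch, with a placeholder token value:

```python
# Sketch: provide the HuggingFace Hub token via the environment (checked first)
# or a `.hf_token` file in the working directory (the fallback).
import os

os.environ["HUGGINGFACE_HUB_TOKEN"] = "<your-hf-token>"

with open(".hf_token", "w") as f:
    f.write("<your-hf-token>")
```
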
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index f1bca3a..1d2d41f 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -11,14 +11,14 @@ class Modality(Enum):
IMAGE = "image"
-def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str]) -> Tuple[str, Mapping, str]:
+def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]:
import torch
# It is not recommended to use `.pth` extension when checkpointing models
# because it collides with Python path (`.pth`) configuration files.
- merged_model_names = ["torch_weight.pt", "weights.pt", "model.pt"]
- separate_modality_names = [str(x) + ".pt" for x in modalities]
- config_names = ["config.json", "torch_config.json"]
+ merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"]
+ separate_modality_names = [(x.value if isinstance(x, Modality) else x) + ".pt" for x in modalities]
+ config_names = ["torch_config.json", "config.json"]
tokenizer_names = ["tokenizer.json"]
# The download stats depend on the number of times the `config.json` is pulled
@@ -75,8 +75,8 @@ def get_model(model_name: str, token: Optional[str] = None, modalities: Optional
config = load(f)
model = TextVisualEncoder(config, tokenizer_path)
- model.image_encoder.load_state_dict(state["image_encoder"])
- model.text_encoder.load_state_dict(state["text_encoder"])
+ model.image_encoder.load_state_dict(state.get("image_encoder", None))
+ model.text_encoder.load_state_dict(state.get("text_encoder", None))
processor = TorchProcessor(config, tokenizer_path)
return model.eval(), processor
diff --git a/python/uform/torch_models.py b/python/uform/torch_models.py
index c4f0bcb..4339765 100644
--- a/python/uform/torch_models.py
+++ b/python/uform/torch_models.py
@@ -364,8 +364,9 @@ def __init__(self, config: Dict, tokenizer_path: PathLike):
"""
super().__init__()
- self._embedding_dim = config["text_encoder"]["embedding_dim"]
+ config["text_encoder"].pop("tokenizer_class", None)
+ self._embedding_dim = config["text_encoder"]["embedding_dim"]
self.text_encoder = TextEncoder(**config["text_encoder"])
self.image_encoder = VisualEncoder(**config["image_encoder"])
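
The `pop` above is the compatibility shim: some configs carry a `tokenizer_class` field that the `TextEncoder` constructor evidently does not accept as a keyword, so it is dropped before the dict is splatted. A minimal sketch of the pattern:

```python
# Sketch: strip config keys the constructor does not expect before unpacking the dict.
from uform.torch_models import TextEncoder

def make_text_encoder(text_config: dict) -> TextEncoder:
    text_config = dict(text_config)           # avoid mutating the caller's config
    text_config.pop("tokenizer_class", None)  # no-op when the key is absent
    return TextEncoder(**text_config)
```
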
From a2f77d280df72ba39b5530b40dd1eee09a7538e7 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 16 Apr 2024 05:03:06 +0000
Subject: [PATCH 03/40] Make: Rename files
---
python/uform/__init__.py | 8 ++--
...py_preprocessor.py => numpy_processors.py} | 0
.../{onnx_models.py => onnx_encoders.py} | 0
.../uform/{gen_model.py => torch_decoders.py} | 37 ++++++++-----------
.../{torch_models.py => torch_encoders.py} | 0
...ch_preprocessor.py => torch_processors.py} | 0
6 files changed, 19 insertions(+), 26 deletions(-)
rename python/uform/{numpy_preprocessor.py => numpy_processors.py} (100%)
rename python/uform/{onnx_models.py => onnx_encoders.py} (100%)
rename python/uform/{gen_model.py => torch_decoders.py} (94%)
rename python/uform/{torch_models.py => torch_encoders.py} (100%)
rename python/uform/{torch_preprocessor.py => torch_processors.py} (100%)
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 1d2d41f..cdb1250 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -63,8 +63,8 @@ def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str,
def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None):
- from uform.torch_models import TextVisualEncoder
- from uform.torch_preprocessor import TorchProcessor
+    from uform.torch_encoders import TextVisualEncoder
+    from uform.torch_processors import TorchProcessor
if modalities is None:
modalities = (Modality.TEXT, Modality.IMAGE)
@@ -83,8 +83,8 @@ def get_model(model_name: str, token: Optional[str] = None, modalities: Optional
def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None):
- from uform.onnx_models import TextVisualEncoder
- from uform.numpy_preprocessor import NumPyProcessor
+    from uform.onnx_encoders import TextVisualEncoder
+    from uform.numpy_processors import NumPyProcessor
assert device in (
"cpu",
diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_processors.py
similarity index 100%
rename from python/uform/numpy_preprocessor.py
rename to python/uform/numpy_processors.py
diff --git a/python/uform/onnx_models.py b/python/uform/onnx_encoders.py
similarity index 100%
rename from python/uform/onnx_models.py
rename to python/uform/onnx_encoders.py
diff --git a/python/uform/gen_model.py b/python/uform/torch_decoders.py
similarity index 94%
rename from python/uform/gen_model.py
rename to python/uform/torch_decoders.py
index 35faae1..79b058d 100644
--- a/python/uform/gen_model.py
+++ b/python/uform/torch_decoders.py
@@ -3,19 +3,24 @@
import torch
import torch.nn.functional as F
from torch import nn
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
- Normalize, RandomResizedCrop, Resize,
- ToTensor)
+from torchvision.transforms import (
+ CenterCrop,
+ Compose,
+ InterpolationMode,
+ Normalize,
+ RandomResizedCrop,
+ Resize,
+ ToTensor,
+)
from transformers import AutoConfig, AutoTokenizer
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.modeling_utils import PreTrainedModel
-from transformers.models.auto.modeling_auto import (AutoModel,
- AutoModelForCausalLM)
+from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import BatchEncoding
-from uform.models import VisualEncoder
+from uform.torch_encoders import VisualEncoder
IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
@@ -213,21 +218,13 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[dict, Tuple, CausalLMOutputWithPast]:
- output_attentions = (
- output_attentions
- if output_attentions is not None
- else self.config.output_attentions
- )
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
- output_hidden_states
- if output_hidden_states is not None
- else self.config.output_hidden_states
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
- return_dict = (
- return_dict if return_dict is not None else self.config.use_return_dict
- )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
@@ -248,11 +245,7 @@ def forward(
)
if position_ids is None:
- seq_length = (
- inputs_embeds.shape[1]
- if inputs_embeds is not None
- else input_ids.shape[1]
- )
+ seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1]
past_key_values_length = 0
if past_key_values is not None:
diff --git a/python/uform/torch_models.py b/python/uform/torch_encoders.py
similarity index 100%
rename from python/uform/torch_models.py
rename to python/uform/torch_encoders.py
diff --git a/python/uform/torch_preprocessor.py b/python/uform/torch_processors.py
similarity index 100%
rename from python/uform/torch_preprocessor.py
rename to python/uform/torch_processors.py
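
After these renames, the package separates encoders, decoders, and processors by backend; a sketch of the resulting import surface:

```python
# Sketch of the module layout introduced by this patch.
from uform.torch_encoders import TextVisualEncoder    # PyTorch encoders
from uform.torch_decoders import VLMForCausalLM       # generative decoder
from uform.torch_processors import TorchProcessor     # PyTorch pre-processing
from uform.numpy_processors import NumPyProcessor     # NumPy pre-processing
from uform.onnx_encoders import TextVisualEncoder as OnnxTextVisualEncoder
```
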
From acbb77ad87a32f8d2fdc8154ea149bbbccdd2a6f Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 16 Apr 2024 22:23:24 +0000
Subject: [PATCH 04/40] Add: Placeholder for JavaScript SDK
---
.gitignore | 9 +-
README.md | 8 +-
javascript/README.md | 10 +
package.json | 11 +
python/scripts/bench.py | 2 +-
python/scripts/export.ipynb | 666 ------------------
python/scripts/export_encoders.ipynb | 436 ++++++++++++
.../{test_generative.py => test_decoders.py} | 0
.../{test_embeddings.py => test_encoders.py} | 2 +-
python/uform/__init__.py | 4 +-
python/uform/chat.py | 2 +-
python/uform/gen_model.py | 1 +
swift/EmbeddingsTests.swift | 6 +-
swift/README.md | 44 ++
14 files changed, 522 insertions(+), 679 deletions(-)
create mode 100644 javascript/README.md
create mode 100644 package.json
delete mode 100644 python/scripts/export.ipynb
create mode 100644 python/scripts/export_encoders.ipynb
rename python/scripts/{test_generative.py => test_decoders.py} (100%)
rename python/scripts/{test_embeddings.py => test_encoders.py} (99%)
create mode 100644 python/uform/gen_model.py
create mode 100644 swift/README.md
diff --git a/.gitignore b/.gitignore
index fbc703a..4db8e17 100755
--- a/.gitignore
+++ b/.gitignore
@@ -4,8 +4,13 @@ test
build/
package-lock.json
*.egg-info
-*.onnx
__pycache__
.build
.swiftpm
-node_modules
\ No newline at end of file
+.hf_token
+node_modules
+
+# Tensors & ML Model
+*.onnx
+*.pt
+*.safetensors
diff --git a/README.md b/README.md
index 031c484..32957e7 100755
--- a/README.md
+++ b/README.md
@@ -20,9 +20,11 @@ For Content Understanding and Generation
Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips
+Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX
+ONNX • CoreML • PyTorch
+
+Python • JavaScript • Swift
---
@@ -279,7 +281,7 @@ The generative model can be used to caption images, summarize their content, or
The exact behavior is controlled by prompts.
```python
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen')
processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen')
diff --git a/javascript/README.md b/javascript/README.md
new file mode 100644
index 0000000..5626d39
--- /dev/null
+++ b/javascript/README.md
@@ -0,0 +1,10 @@
+# UForm for JavaScript
+
+
+
+```bash
+pnpm add uform
+npm add uform
+yarn add uform
+```
+
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..7331231
--- /dev/null
+++ b/package.json
@@ -0,0 +1,11 @@
+{
+ "name": "uform",
+ "private": true,
+ "version": "2.0.2",
+ "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
+ "dependencies": {
+ "@huggingface/hub": "^0.14.8",
+ "@xenova/transformers": "^2.17.0",
+ "onnxruntime-web": "^1.17.3"
+ }
+}
diff --git a/python/scripts/bench.py b/python/scripts/bench.py
index 49c7004..8bcaf37 100644
--- a/python/scripts/bench.py
+++ b/python/scripts/bench.py
@@ -13,7 +13,7 @@
)
from uform import get_model
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
dtype = torch.bfloat16
low_cpu_mem_usage = False
diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb
deleted file mode 100644
index ce8cf10..0000000
--- a/python/scripts/export.ipynb
+++ /dev/null
@@ -1,666 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Scripts for Exporting PyTorch Models to ONNX and CoreML"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install --upgrade \"uform[torch]\" coremltools"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n",
- " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n",
- " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n",
- " warn(f\"Failed to load image Python extension: {e}\")\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "fadffc0299c04e249fd4f7a5b40ba0af",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Fetching 5 files: 0%| | 0/5 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "(torch.Size([1, 197, 384]),\n",
- " torch.Size([1, 64, 768]),\n",
- " torch.Size([1, 256]),\n",
- " torch.Size([1, 256]))"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import uform\n",
- "from PIL import Image\n",
- "\n",
- "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
- "text = 'a small red panda in a zoo'\n",
- "image = Image.open('../../assets/unum.png')\n",
- "\n",
- "image_data = processor.preprocess_image(image)\n",
- "text_data = processor.preprocess_text(text)\n",
- "\n",
- "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
- "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
- "\n",
- "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "TextEncoder(model_type='bert', dim=768, context_dim=384, vocab_size=30522, padding_idx=0, num_layers=4, num_heads=12, embedding_dim=256, multimodal_layers_ids=[2, 3], head_one_neuron=False, pooling='cls', max_position_embeddings=64, dropout_prob=0.1)"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.text_encoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisualEncoder(dim=384, patch_size=16, image_size=224, num_layers=12, num_heads=6, embedding_dim=256, pooling='cls', num_reg_tokens=0)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.image_encoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "First layer of image_encoder: patch_embed\n",
- "First layer of text_encoder: word_embeddings\n"
- ]
- }
- ],
- "source": [
- "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
- "for name, module in model.image_encoder.named_children():\n",
- " print(f\"First layer of image_encoder: {name}\")\n",
- " break # We break after the first layer\n",
- "\n",
- "for name, module in model.text_encoder.named_children():\n",
- " print(f\"First layer of text_encoder: {name}\")\n",
- " break # We break after the first layer"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ONNX"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## CoreML"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "scikit-learn version 1.2.1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n",
- "Torch version 2.1.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.\n"
- ]
- }
- ],
- "source": [
- "import coremltools as ct\n",
- "import torch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
- "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
- "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
- "text_features = ct.TensorType(name=\"features\")\n",
- "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
- "image_features = ct.TensorType(name=\"features\")\n",
- "image_embeddings = ct.TensorType(name=\"embeddings\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisualEncoder(\n",
- " original_name=VisualEncoder\n",
- " (patch_embed): Conv2d(original_name=Conv2d)\n",
- " (blocks): Sequential(\n",
- " original_name=Sequential\n",
- " (0): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (1): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (2): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (3): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (4): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (5): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (6): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (7): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (8): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (9): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (10): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (11): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " )\n",
- " (norm): LayerNorm(original_name=LayerNorm)\n",
- " (embedding_projection): Linear(original_name=Linear)\n",
- ")"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "module = model.image_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
- "traced_script_module"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Tuple detected at graph output. This will be flattened in the converted model.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n",
- "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n",
- "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n",
- "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n"
- ]
- }
- ],
- "source": [
- "coreml_model = ct.convert(\n",
- " traced_script_module, source=\"pytorch\",\n",
- " inputs=[image_input], outputs=[image_features, image_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
- "\n",
- "coreml_model.author = 'Unum Cloud'\n",
- "coreml_model.license = 'Apache 2.0'\n",
- "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "TextEncoder(\n",
- " original_name=TextEncoder\n",
- " (word_embeddings): Embedding(original_name=Embedding)\n",
- " (position_embeddings): Embedding(original_name=Embedding)\n",
- " (layer_norm): LayerNorm(original_name=LayerNorm)\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " (blocks): ModuleList(\n",
- " original_name=ModuleList\n",
- " (0): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (1): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (2): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
- " (crossattn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (3): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
- " (crossattn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " )\n",
- " (embedding_projection): Linear(original_name=Linear)\n",
- " (matching_head): Linear(original_name=Linear)\n",
- " (context_projection): Linear(original_name=Linear)\n",
- ")"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "module = model.text_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
- "traced_script_module"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Tuple detected at graph output. This will be flattened in the converted model.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n",
- "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n",
- "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n",
- "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n"
- ]
- }
- ],
- "source": [
- "coreml_model = ct.convert(\n",
- " traced_script_module, source=\"pytorch\",\n",
- " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
- "\n",
- "coreml_model.author = 'Unum Cloud'\n",
- "coreml_model.license = 'Apache 2.0'\n",
- "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "base",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
new file mode 100644
index 0000000..369c938
--- /dev/null
+++ b/python/scripts/export_encoders.ipynb
@@ -0,0 +1,436 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Scripts for Exporting PyTorch Models to ONNX and CoreML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install --upgrade \"uform[torch]\" coremltools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import uform\n",
+ "from PIL import Image\n",
+ "\n",
+ "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
+ "text = 'a small red panda in a zoo'\n",
+ "image = Image.open('../../assets/unum.png')\n",
+ "\n",
+ "image_data = processor.preprocess_image(image)\n",
+ "text_data = processor.preprocess_text(text)\n",
+ "\n",
+ "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
+ "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.text_encoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.image_encoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
+ "for name, module in model.image_encoder.named_children():\n",
+ " print(f\"First layer of image_encoder: {name}\")\n",
+ " break # We break after the first layer\n",
+ "\n",
+ "for name, module in model.text_encoder.named_children():\n",
+ " print(f\"First layer of text_encoder: {name}\")\n",
+ " break # We break after the first layer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## CoreML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import coremltools as ct\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
+ "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
+ "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
+ "text_features = ct.TensorType(name=\"features\")\n",
+ "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
+ "image_features = ct.TensorType(name=\"features\")\n",
+ "image_embeddings = ct.TensorType(name=\"embeddings\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
+ "traced_script_module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coreml_model = ct.convert(\n",
+ " traced_script_module, source=\"pytorch\",\n",
+ " inputs=[image_input], outputs=[image_features, image_embeddings],\n",
+ " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
+ "\n",
+ "coreml_model.author = 'Unum Cloud'\n",
+ "coreml_model.license = 'Apache 2.0'\n",
+ "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+ "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
+ "traced_script_module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coreml_model = ct.convert(\n",
+ " traced_script_module, source=\"pytorch\",\n",
+ " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
+ " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
+ "\n",
+ "coreml_model.author = 'Unum Cloud'\n",
+ "coreml_model.license = 'Apache 2.0'\n",
+ "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+ "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PyTorch\n",
+ "\n",
+    "Let's ensure that the input layers and the model itself work fine in `f16` half-precision, so that the model is lighter and easier to download."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from safetensors import safe_open\n",
+ "from safetensors.torch import save_file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.image_encoder.eval()\n",
+ "model.image_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.image_encoder.state_dict(), 'image.pt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.image_encoder.state_dict(), \"image.safetensors\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.text_encoder.eval()\n",
+ "model.text_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.text_encoder.state_dict(), 'text.pt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.text_encoder.state_dict(), \"text.safetensors\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
+ "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## ONNX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install onnx onnxconverter-common"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.onnx import export as onnx_export"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is going to fail. So let's export to a `float32` ONNX file first."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "onnx_export(\n",
+ " module,\n",
+ " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
+ " \"text.onnx\", \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input_ids', 'attention_mask'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input_ids' : {0 : 'batch_size'}, \n",
+ " 'attention_mask' : {0 : 'batch_size'}, \n",
+ " 'features' : {0 : 'batch_size'}, \n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16\n",
+ "\n",
+ "module = onnx.load(\"text.onnx\")\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, \"text.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now repeat the same for images."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "torch.onnx.export(\n",
+ " module,\n",
+ " image_data, \n",
+ " \"image.onnx\", \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input' : {0 : 'batch_size'},\n",
+ " 'features' : {0 : 'batch_size'},\n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16\n",
+ "\n",
+ "module = onnx.load(\"image.onnx\")\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, \"image.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py
similarity index 100%
rename from python/scripts/test_generative.py
rename to python/scripts/test_decoders.py
diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_encoders.py
similarity index 99%
rename from python/scripts/test_embeddings.py
rename to python/scripts/test_encoders.py
index 9cdd4c5..e7541c1 100644
--- a/python/scripts/test_embeddings.py
+++ b/python/scripts/test_encoders.py
@@ -22,7 +22,7 @@
onnx_available = False
torch_models = [
- "unum-cloud/uform-vl2-english-small",
+ "unum-cloud/uform2-vl-english-small",
"unum-cloud/uform-vl-english",
"unum-cloud/uform-vl-multilingual-v2",
]
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index cdb1250..f5a15c2 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -7,8 +7,8 @@
class Modality(Enum):
- TEXT = "text"
- IMAGE = "image"
+ TEXT_ENCODER = "text_encoder"
+ IMAGE_ENCODER = "image_encoder"
def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]:
diff --git a/python/uform/chat.py b/python/uform/chat.py
index 5ef44b7..c9f8dc3 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -5,7 +5,7 @@
from PIL import Image
from transformers import TextStreamer
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
EOS_TOKEN = 32001
diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py
new file mode 100644
index 0000000..6792120
--- /dev/null
+++ b/python/uform/gen_model.py
@@ -0,0 +1 @@
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path
diff --git a/swift/EmbeddingsTests.swift b/swift/EmbeddingsTests.swift
index 5efb87f..889cdb6 100644
--- a/swift/EmbeddingsTests.swift
+++ b/swift/EmbeddingsTests.swift
@@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase {
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
@@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase {
// A better option is to fetch directly from HuggingFace, similar to how users would do that:
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
let imageModel = try await ImageEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
diff --git a/swift/README.md b/swift/README.md
new file mode 100644
index 0000000..1eebf29
--- /dev/null
+++ b/swift/README.md
@@ -0,0 +1,44 @@
+# UForm for Swift
+
+UForm offers first-party support for Swift.
+To get started, add UForm to your project using Swift Package Manager.
+
+```bash
+swift package init --type executable
+# then declare UForm as a package dependency, as shown below
+```
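+
+For instance, you can declare the dependency in your `Package.swift` manifest. This is a sketch: the `MyApp` target name is a placeholder, and you may prefer pinning a released version rather than a branch.
+
+```swift
+dependencies: [
+    .package(url: "https://github.com/unum-cloud/uform", branch: "main")
+],
+targets: [
+    .executableTarget(
+        name: "MyApp",
+        dependencies: [.product(name: "UForm", package: "uform")]
+    )
+]
+```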
+
+Then, import UForm in your Swift code:
+
+```swift
+import UForm
+```
+
+## Embeddings
+
+### Text Embeddings
+
+```swift
+let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
+let textEmbedding: Embedding = try textModel.forward(with: text)
+let textVector: [Float32] = textEmbedding.asFloats()
+```
+
+### Image Embeddings
+
+```swift
+let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
+guard let url = URL(string: imageURL),
+    let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
+    let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil)
+else {
+    fatalError("Could not load image from URL: \(imageURL)")
+}
+
+let imageEmbedding: Embedding = try imageModel.forward(with: cgImage)
+let imageVector: [Float32] = imageEmbedding.asFloats()
+```
+
+
+### Computing Distances
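+
+Once you have both embeddings, the cosine similarity between the two vectors is a reasonable relevance score. Below is a minimal sketch over the raw `Float32` arrays; it assumes only the `textVector` and `imageVector` values from the snippets above.
+
+```swift
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+    // Dot product over the element-wise pairs, divided by the product of the vector norms
+    let dot = zip(a, b).map(*).reduce(0, +)
+    let normA = a.map { $0 * $0 }.reduce(0, +).squareRoot()
+    let normB = b.map { $0 * $0 }.reduce(0, +).squareRoot()
+    return dot / (normA * normB)
+}
+
+let similarity = cosineSimilarity(textVector, imageVector)
+```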
\ No newline at end of file
From 2351fe9f810d06bfb411566de518ced415c64634 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 16 Apr 2024 15:26:45 -0700
Subject: [PATCH 05/40] Docs: Improve export process
---
python/scripts/export.ipynb | 629 ++++++++----------------------------
1 file changed, 134 insertions(+), 495 deletions(-)
diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb
index ce8cf10..7afa4cc 100644
--- a/python/scripts/export.ipynb
+++ b/python/scripts/export.ipynb
@@ -18,52 +18,25 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n",
- " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n",
- " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n",
- " warn(f\"Failed to load image Python extension: {e}\")\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "fadffc0299c04e249fd4f7a5b40ba0af",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Fetching 5 files: 0%| | 0/5 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "(torch.Size([1, 197, 384]),\n",
- " torch.Size([1, 64, 768]),\n",
- " torch.Size([1, 256]),\n",
- " torch.Size([1, 256]))"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "model_name = \"uform-vl-english-small\"\n",
+ "output_directory = \"../../\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"import uform\n",
"from PIL import Image\n",
"\n",
- "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
+ "model, processor = uform.get_model('unum-cloud/' + model_name)\n",
"text = 'a small red panda in a zoo'\n",
"image = Image.open('../../assets/unum.png')\n",
"\n",
@@ -78,58 +51,27 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "TextEncoder(model_type='bert', dim=768, context_dim=384, vocab_size=30522, padding_idx=0, num_layers=4, num_heads=12, embedding_dim=256, multimodal_layers_ids=[2, 3], head_one_neuron=False, pooling='cls', max_position_embeddings=64, dropout_prob=0.1)"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"model.text_encoder"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisualEncoder(dim=384, patch_size=16, image_size=224, num_layers=12, num_heads=6, embedding_dim=256, pooling='cls', num_reg_tokens=0)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"model.image_encoder"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "First layer of image_encoder: patch_embed\n",
- "First layer of text_encoder: word_embeddings\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
"for name, module in model.image_encoder.named_children():\n",
@@ -141,6 +83,59 @@
" break # We break after the first layer"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PyTorch\n",
+ "\n",
+ "Let's ensure:\n",
+ "\n",
+ "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n",
+ "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n",
+ "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list(name for name, _ in model.text_encoder.named_parameters())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Verify input and output names for text_encoder\n",
+ "text_encoder_input_names = [name for name, _ in model.text_encoder.named_parameters()]\n",
+ "assert 'input_ids' in text_encoder_input_names, \"input_ids not found in text_encoder inputs\"\n",
+ "assert 'attention_mask' in text_encoder_input_names, \"attention_mask not found in text_encoder inputs\"\n",
+ "\n",
+ "text_encoder_output_names = [name for name, _ in model.text_encoder.named_modules()]\n",
+ "assert 'embeddings' in text_encoder_output_names, \"embeddings not found in text_encoder outputs\"\n",
+ "assert 'features' in text_encoder_output_names, \"features not found in text_encoder outputs\"\n",
+ "\n",
+ "# Verify input and output names for image_encoder\n",
+ "image_encoder_input_names = [name for name, _ in model.image_encoder.named_parameters()]\n",
+ "assert 'input' in image_encoder_input_names, \"input not found in image_encoder inputs\"\n",
+ "\n",
+ "image_encoder_output_names = [name for name, _ in model.image_encoder.named_modules()]\n",
+ "assert 'embeddings' in image_encoder_output_names, \"embeddings not found in image_encoder outputs\"\n",
+ "assert 'features' in image_encoder_output_names, \"features not found in image_encoder outputs\"\n",
+ "\n",
+ "# Ensure the model can be converted to f16 half-precision\n",
+ "try:\n",
+ " model.half() # Convert to half precision\n",
+ " print(\"Model successfully converted to half precision (f16).\")\n",
+ "except Exception as e:\n",
+ " print(f\"An error occurred while converting the model to half precision: {e}\")"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -157,18 +152,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "scikit-learn version 1.2.1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n",
- "Torch version 2.1.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import coremltools as ct\n",
"import torch"
@@ -176,13 +162,56 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "precision = ct.precision.FLOAT32"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n",
+ "\n",
+ "```python\n",
+ " image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
+ " text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
+ " text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
+ "```\n",
+ "\n",
+ "That, however, will only work for batch-size one. To support larger batches, we need to override the input shapes.\n",
+ "\n",
+ "```python\n",
+ " ct.RangeDim(lower_bound=25, upper_bound=100, default=45)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
- "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
- "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
+ "def generalize_first_dimensions(input_shape, upper_bound=64):\n",
+ " if upper_bound == 1:\n",
+ " return input_shape\n",
+ " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n",
+ " return input_shape\n",
+ "\n",
+ "generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
+ "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
+ "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
"text_features = ct.TensorType(name=\"features\")\n",
"text_embeddings = ct.TensorType(name=\"embeddings\")\n",
"image_features = ct.TensorType(name=\"features\")\n",
@@ -191,256 +220,9 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisualEncoder(\n",
- " original_name=VisualEncoder\n",
- " (patch_embed): Conv2d(original_name=Conv2d)\n",
- " (blocks): Sequential(\n",
- " original_name=Sequential\n",
- " (0): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (1): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (2): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (3): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (4): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (5): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (6): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (7): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (8): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (9): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (10): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (11): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " )\n",
- " (norm): LayerNorm(original_name=LayerNorm)\n",
- " (embedding_projection): Linear(original_name=Linear)\n",
- ")"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"module = model.image_encoder\n",
"module.eval()\n",
@@ -452,149 +234,26 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Tuple detected at graph output. This will be flattened in the converted model.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n",
- "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n",
- "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n",
- "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"coreml_model = ct.convert(\n",
" traced_script_module, source=\"pytorch\",\n",
" inputs=[image_input], outputs=[image_features, image_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
+ " convert_to='mlprogram', compute_precision=precision)\n",
"\n",
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
+ "coreml_model.save(os.path.join(output_directory, model_name + \"-image.mlpackage\"))"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "TextEncoder(\n",
- " original_name=TextEncoder\n",
- " (word_embeddings): Embedding(original_name=Embedding)\n",
- " (position_embeddings): Embedding(original_name=Embedding)\n",
- " (layer_norm): LayerNorm(original_name=LayerNorm)\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " (blocks): ModuleList(\n",
- " original_name=ModuleList\n",
- " (0): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (1): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (2): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
- " (crossattn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (3): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
- " (crossattn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " )\n",
- " (embedding_projection): Linear(original_name=Linear)\n",
- " (matching_head): Linear(original_name=Linear)\n",
- " (context_projection): Linear(original_name=Linear)\n",
- ")"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"module = model.text_encoder\n",
"module.eval()\n",
@@ -606,40 +265,20 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Tuple detected at graph output. This will be flattened in the converted model.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n",
- "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n",
- "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n",
- "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"coreml_model = ct.convert(\n",
" traced_script_module, source=\"pytorch\",\n",
" inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
+ " convert_to='mlprogram', compute_precision=precision)\n",
"\n",
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
+ "coreml_model.save(os.path.join(output_directory, model_name + \"-text.mlpackage\"))"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
From 94ebd6e1571d29b82fe0730e2122495f937ff07b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 17 Apr 2024 15:32:49 -0700
Subject: [PATCH 06/40] Break: Deprecate old ONNX structure
---
.gitignore | 1 +
.vscode/settings.json | 7 +
Package.swift | 4 +-
pyproject.toml | 3 +-
python/scripts/export_encoders.ipynb | 201 +++++++++++++-----
python/scripts/test_encoders.py | 115 ++++++++--
python/uform/__init__.py | 116 +++++-----
python/uform/numpy_processors.py | 4 +-
python/uform/onnx_encoders.py | 151 ++++++-------
python/uform/torch_encoders.py | 41 +++-
python/uform/torch_processors.py | 6 +-
swift/{Embeddings.swift => Encoders.swift} | 11 +
...eddingsTests.swift => EncodersTests.swift} | 6 +-
swift/README.md | 4 +-
14 files changed, 430 insertions(+), 240 deletions(-)
rename swift/{Embeddings.swift => Encoders.swift} (98%)
rename swift/{EmbeddingsTests.swift => EncodersTests.swift} (97%)
diff --git a/.gitignore b/.gitignore
index 4db8e17..f4fa33b 100755
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ node_modules
*.onnx
*.pt
*.safetensors
+*.mlpackage
diff --git a/.vscode/settings.json b/.vscode/settings.json
index a6cceb8..5052dea 100755
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,8 +1,10 @@
{
"cSpell.words": [
"arange",
+ "astype",
"CFURL",
"coreml",
+ "crossattn",
"cumsum",
"dtype",
"embs",
@@ -25,12 +27,17 @@
"pretrained",
"probs",
"pypi",
+ "pytest",
+ "randn",
"rerank",
"reranker",
"reranking",
+ "sandbeach",
"sess",
"SIMD",
"softmax",
+ "Tensorrt",
+ "torchvision",
"transfromers",
"uform",
"unimodal",
diff --git a/Package.swift b/Package.swift
index 6ac8372..b3b9ffd 100644
--- a/Package.swift
+++ b/Package.swift
@@ -29,13 +29,13 @@ let package = Package(
.product(name: "Transformers", package: "swift-transformers")
],
path: "swift",
- exclude: ["EmbeddingsTests.swift"]
+ exclude: ["EncodersTests.swift"]
),
.testTarget(
name: "UFormTests",
dependencies: ["UForm"],
path: "swift",
- sources: ["EmbeddingsTests.swift"]
+ sources: ["EncodersTests.swift"]
),
]
)
diff --git a/pyproject.toml b/pyproject.toml
index 10f7a9b..1a84808 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,8 @@ classifiers = [
dependencies = [
"huggingface_hub>=0.16.4",
"tokenizers>=0.13.3",
- "pillow"
+ "pillow",
+ "simsimd",
]
description = "Pocket-Sized Multimodal AI for Content Understanding and Generation"
maintainers = [
diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
index df57858..c7a94e0 100644
--- a/python/scripts/export_encoders.ipynb
+++ b/python/scripts/export_encoders.ipynb
@@ -4,7 +4,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Scripts for Exporting PyTorch Models to ONNX and CoreML"
+ "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n",
+ "\n",
+ "Depending on the backend, we prefer different qunatization schemes.\n",
+ "\n",
+ "- For ONNX we use `int8` quantization.\n",
+ "- For PyTorch we use `bfloat16` quantization.\n",
+ "- For CoreML we use `float32` representation."
]
},
{
@@ -181,12 +187,12 @@
"coreml_model = ct.convert(\n",
" traced_script_module, source=\"pytorch\",\n",
" inputs=[image_input], outputs=[image_features, image_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision)\n",
+ " convert_to='mlprogram', compute_precision=precision)\n",
"\n",
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
+ "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))"
]
},
{
@@ -217,7 +223,7 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
+ "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))"
]
},
{
@@ -260,7 +266,7 @@
"metadata": {},
"outputs": [],
"source": [
- "torch.save(model.image_encoder.state_dict(), 'image.pt')"
+ "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))"
]
},
{
@@ -269,7 +275,7 @@
"metadata": {},
"outputs": [],
"source": [
- "save_file(model.image_encoder.state_dict(), \"image.safetensors\")"
+ "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))"
]
},
{
@@ -288,7 +294,7 @@
"metadata": {},
"outputs": [],
"source": [
- "torch.save(model.text_encoder.state_dict(), 'text.pt')"
+ "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))"
]
},
{
@@ -297,7 +303,7 @@
"metadata": {},
"outputs": [],
"source": [
- "save_file(model.text_encoder.state_dict(), \"text.safetensors\")"
+ "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))"
]
},
{
@@ -312,26 +318,6 @@
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n",
- "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n",
- "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -354,7 +340,8 @@
"metadata": {},
"outputs": [],
"source": [
- "from torch.onnx import export as onnx_export"
+ "from torch.onnx import export as onnx_export\n",
+ "import torch"
]
},
{
@@ -378,7 +365,7 @@
"onnx_export(\n",
" module,\n",
" (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
- " \"text.onnx\", \n",
+ " os.path.join(output_directory, \"text_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -391,27 +378,6 @@
" 'embeddings' : {0 : 'batch_size'}})"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import onnx\n",
- "from onnxconverter_common import float16\n",
- "\n",
- "module = onnx.load(\"text.onnx\")\n",
- "module_fp16 = float16.convert_float_to_float16(module)\n",
- "onnx.save(module_fp16, \"text.onnx\")"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -433,7 +399,7 @@
"torch.onnx.export(\n",
" module,\n",
" image_data, \n",
- " \"image.onnx\", \n",
+ " os.path.join(output_directory, \"image_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -445,6 +411,15 @@
" 'embeddings' : {0 : 'batch_size'}})"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Quantizing to `float16`\n",
+ "\n",
+ "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -452,11 +427,115 @@
"outputs": [],
"source": [
"import onnx\n",
- "from onnxconverter_common import float16\n",
- "\n",
- "module = onnx.load(\"image.onnx\")\n",
+ "from onnxconverter_common import float16"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, module_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
- "onnx.save(module_fp16, \"image.onnx\")"
+ "onnx.save(module_fp16, module_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Quantizing to `uint8`\n",
+ "\n",
+ "We can further quantize the model into `uint8` using ONNX quantization tools.\n",
+ "The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from onnxruntime.quantization import quantize_dynamic, QuantType"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's check that the runtime can actually load those models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnxruntime as ort\n",
+ "session_options = ort.SessionOptions()\n",
+ "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "session = ort.InferenceSession(module_path, sess_options=session_options)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "session = ort.InferenceSession(module_path, sess_options=session_options)"
+ ]
+ },
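+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an extra sanity check (a sketch, assuming the preprocessed `text_data` and `image_data` tensors from the beginning of the notebook are still in scope), we can also run the quantized sessions on real inputs and inspect the output shapes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ONNX Runtime expects NumPy arrays, so convert the PyTorch tensors first;\n",
+    "# if the converted graphs expect different input dtypes, adjust the casts accordingly\n",
+    "text_session = ort.InferenceSession(os.path.join(output_directory, \"text_encoder.onnx\"), sess_options=session_options)\n",
+    "text_outputs = text_session.run(None, {\n",
+    "    \"input_ids\": text_data[\"input_ids\"].numpy(),\n",
+    "    \"attention_mask\": text_data[\"attention_mask\"].numpy(),\n",
+    "})\n",
+    "\n",
+    "image_session = ort.InferenceSession(os.path.join(output_directory, \"image_encoder.onnx\"), sess_options=session_options)\n",
+    "image_outputs = image_session.run(None, {\"input\": image_data.numpy()})\n",
+    "\n",
+    "[output.shape for output in text_outputs + image_outputs]"
+   ]
+  },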
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Upload to Hugging Face"
]
},
{
@@ -465,8 +544,12 @@
"metadata": {},
"outputs": [],
"source": [
- "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n",
- "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx"
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
]
}
],
@@ -486,7 +569,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.5"
+ "version": "3.10.11"
}
},
"nbformat": 4,
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
index e7541c1..a58544d 100644
--- a/python/scripts/test_encoders.py
+++ b/python/scripts/test_encoders.py
@@ -1,8 +1,12 @@
from typing import Tuple
+import requests
+from io import BytesIO
import os
import pytest
+import numpy as np
from PIL import Image
+
import uform
# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
@@ -22,18 +26,13 @@
onnx_available = False
torch_models = [
- "unum-cloud/uform2-vl-english-small",
+ "unum-cloud/uform3-image-text-english-small",
"unum-cloud/uform-vl-english",
"unum-cloud/uform-vl-multilingual-v2",
]
-onnx_models_and_providers = [
- ("unum-cloud/uform-vl-english-small", "cpu", "fp32"),
- ("unum-cloud/uform-vl-english-large", "cpu", "fp32"),
- ("unum-cloud/uform-vl-english-small", "gpu", "fp32"),
- ("unum-cloud/uform-vl-english-large", "gpu", "fp32"),
- ("unum-cloud/uform-vl-english-small", "gpu", "fp16"),
- ("unum-cloud/uform-vl-english-large", "gpu", "fp16"),
+onnx_models = [
+ "unum-cloud/uform3-image-text-english-small",
]
# Let's check if the HuggingFace Hub API token is set in the environment variable.
@@ -46,6 +45,71 @@
token = file.read().strip()
+def cosine_similarity(x, y) -> float:
+ if not isinstance(x, np.ndarray):
+ x = x.detach().numpy()
+ if not isinstance(y, np.ndarray):
+ y = y.detach().numpy()
+
+    # Unlike SimSIMD, NumPy does not handle integer and low-precision inputs well, so cast to float32 first
+ x = x.astype(np.float32).flatten()
+ y = y.astype(np.float32).flatten()
+ return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
+
+
+def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding):
+ """Test if the embeddings of text and image are semantically similar
+ using a small set of example text-image pairs."""
+
+ texts = [
+ "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
+ "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
+ "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+ "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
+ "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
+ ]
+
+ image_urls = [
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
+ ]
+
+ text_embeddings = []
+ image_embeddings = []
+
+ for text, image_url in zip(texts, image_urls):
+ # Download and open the image
+ response = requests.get(image_url)
+ image = Image.open(BytesIO(response.content))
+
+ # Get embeddings
+ text_embedding = text_to_embedding(text)
+ image_embedding = image_to_embedding(image)
+
+ text_embeddings.append(text_embedding)
+ image_embeddings.append(image_embedding)
+
+ # Evaluate cosine similarity
+ for i in range(len(texts)):
+ pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i])
+ other_text_similarities = [
+ cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(len(texts)) if j != i
+ ]
+ other_image_similarities = [
+ cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(len(texts)) if j != i
+ ]
+
+ assert pair_similarity > max(
+ other_text_similarities
+ ), "Text should be more similar to its corresponding image than to other images."
+ assert pair_similarity > max(
+ other_image_similarities
+ ), "Image should be more similar to its corresponding text than to other texts."
+
+
@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
@pytest.mark.parametrize("model_name", torch_models)
def test_torch_one_embedding(model_name: str):
@@ -73,6 +137,12 @@ def test_torch_one_embedding(model_name: str):
assert score.shape[0] == 1, "Matching score batch size is not 1"
assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1"
+ # Test if the model outputs actually make sense
+ cross_references_image_and_text_embeddings(
+ lambda text: model.encode_text(processor.preprocess_text(text)),
+ lambda image: model.encode_image(processor.preprocess_image(image)),
+ )
+
@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
@pytest.mark.parametrize("model_name", torch_models)
@@ -94,14 +164,15 @@ def test_torch_many_embeddings(model_name: str, batch_size: int):
@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
-@pytest.mark.parametrize("model_specs", onnx_models_and_providers)
-def test_onnx_one_embedding(model_specs: Tuple[str, str, str]):
+@pytest.mark.parametrize("model_name", onnx_models)
+@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+def test_onnx_one_embedding(model_name: str, device: str):
- from uform.onnx_models import ExecutionProviderError
+ from uform.onnx_encoders import ExecutionProviderError
try:
- model, processor = uform.get_model_onnx(*model_specs, token=token)
+ model, processor = uform.get_model_onnx(model_name, token=token, device=device)
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
@@ -115,29 +186,27 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]):
assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
- score, joint_embedding = model.encode_multimodal(
- image_features=image_features,
- text_features=text_features,
- attention_mask=text_data["attention_mask"],
- return_scores=True,
+ # Test if the model outputs actually make sense
+ cross_references_image_and_text_embeddings(
+ lambda text: model.encode_text(processor.preprocess_text(text)),
+ lambda image: model.encode_image(processor.preprocess_image(image)),
)
- assert score.shape[0] == 1, "Matching score batch size is not 1"
- assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1"
except ExecutionProviderError as e:
pytest.skip(f"Execution provider error: {e}")
@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
-@pytest.mark.parametrize("model_specs", onnx_models_and_providers)
+@pytest.mark.parametrize("model_name", onnx_models)
@pytest.mark.parametrize("batch_size", [1, 2])
-def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int):
+@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
- from uform.onnx_models import ExecutionProviderError
+ from uform.onnx_encoders import ExecutionProviderError
try:
- model, processor = uform.get_model_onnx(*model_specs, token=token)
+ model, processor = uform.get_model_onnx(model_name, token=token, device=device)
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index f5a15c2..44fce13 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,6 +1,6 @@
from json import load
from os.path import join, exists
-from typing import Mapping, Optional, Tuple
+from typing import Dict, Optional, Tuple, Literal
from enum import Enum
from huggingface_hub import snapshot_download
@@ -9,15 +9,38 @@
class Modality(Enum):
TEXT_ENCODER = "text_encoder"
IMAGE_ENCODER = "image_encoder"
+ VIDEO_ENCODER = "video_encoder"
+ TEXT_DECODER = "text_decoder"
-def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]:
- import torch
+def normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]:
+ if modalities is None:
+ return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER)
+
+ return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities)
+
+
+def get_checkpoint(
+ model_name: str,
+ modalities: Tuple[str, Modality],
+ token: Optional[str] = None,
+ format: Literal[".pt", ".onnx"] = ".pt",
+) -> Tuple[str, Dict[Modality, str], Optional[str]]:
+ """Downloads a model checkpoint from the Hugging Face Hub.
+
+ :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small`
+ :param token: The Hugging Face API token, if required
+ :param modalities: The modalities to download, like `("text_encoder", "image_encoder")`
+ :param format: The format of the model checkpoint, either `.pt` or `.onnx`
+ :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path
+ """
+
+ modalities = normalize_modalities(modalities)
# It is not recommended to use `.pth` extension when checkpointing models
# because it collides with Python path (`.pth`) configuration files.
- merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"]
- separate_modality_names = [(x.value if isinstance(x, Modality) else x) + ".pt" for x in modalities]
+ merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]]
+ separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities]
config_names = ["torch_config.json", "config.json"]
tokenizer_names = ["tokenizer.json"]
@@ -45,65 +68,58 @@ def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str,
# Ideally, we want to separately fetch all the models.
# If those aren't available, aggregate separate modalities and merge them.
- state = None
+ modality_paths = None
for file_name in merged_model_names:
if exists(join(model_path, file_name)):
- state = torch.load(join(model_path, file_name))
+ modality_paths = join(model_path, file_name)
break
- if state is None:
- state = {}
- for file_name in separate_modality_names:
- if exists(join(model_path, file_name)):
- modality_name, _, _ = file_name.partition(".")
- property_name = modality_name + "_encoder"
- state[property_name] = torch.load(join(model_path, file_name))
+ if modality_paths is None:
+ modality_paths = {}
+ for separate_modality_name in separate_modality_names:
+ if exists(join(model_path, separate_modality_name)):
+ modality_name, _, _ = separate_modality_name.partition(".")
+ modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name)
- return config_path, state, tokenizer_path
+ return config_path, modality_paths, tokenizer_path
-def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None):
- from python.uform.torch_encoders import TextVisualEncoder
- from python.uform.torch_processors import TorchProcessor
+def get_model(
+ model_name: str,
+ *,
+ token: Optional[str] = None,
+ modalities: Optional[Tuple[str]] = None,
+):
+ from uform.torch_encoders import TextVisualEncoder
+ from uform.torch_processors import TorchProcessor
- if modalities is None:
- modalities = (Modality.TEXT, Modality.IMAGE)
-
- config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities)
-
- with open(config_path) as f:
- config = load(f)
+ config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".pt")
+ modality_paths = (
+ {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths
+ )
- model = TextVisualEncoder(config, tokenizer_path)
- model.image_encoder.load_state_dict(state.get("image_encoder", None))
- model.text_encoder.load_state_dict(state.get("text_encoder", None))
- processor = TorchProcessor(config, tokenizer_path)
+ model = TextVisualEncoder(config_path, modality_paths)
+ processor = TorchProcessor(config_path, tokenizer_path)
return model.eval(), processor
-def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None):
- from python.uform.onnx_encoders import TextVisualEncoder
- from python.uform.numpy_processors import NumPyProcessor
+def get_model_onnx(
+ model_name: str,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu",
+ token: Optional[str] = None,
+ modalities: Optional[Tuple[str]] = None,
+):
+ from uform.onnx_encoders import TextVisualEncoder
+ from uform.numpy_processors import NumPyProcessor
- assert device in (
- "cpu",
- "gpu",
- ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`"
- assert dtype in (
- "fp32",
- "fp16",
- ), f"Invalid `dtype`: {dtype}. Must be either `fp32` or `fp16` (only for gpu)"
- assert (
- device == "cpu" and dtype == "fp32"
- ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported"
-
- model_path = snapshot_download(repo_id=f"{model_name}-{device}-{dtype}", token=token)
-
- with open(join(model_path, "config.json")) as f:
- config = load(f)
+ config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".onnx")
+ modality_paths = (
+ {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths
+ )
- model = TextVisualEncoder(model_path, config, device, dtype)
- processor = NumPyProcessor(config, join(model_path, "tokenizer.json"))
+ model = TextVisualEncoder(config_path, modality_paths, device=device)
+ processor = NumPyProcessor(config_path, tokenizer_path)
return model, processor
diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py
index a556db4..d300504 100644
--- a/python/uform/numpy_processors.py
+++ b/python/uform/numpy_processors.py
@@ -1,5 +1,6 @@
from os import PathLike
from typing import Dict, List, Union
+import json
from PIL.Image import Image, BICUBIC
from tokenizers import Tokenizer
@@ -7,13 +8,14 @@
class NumPyProcessor:
- def __init__(self, config: Dict, tokenizer_path: PathLike):
+ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
:param config: model config
:param tokenizer_path: path to tokenizer file
:param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
"""
+ config = json.load(open(config_path, "r"))
self._image_size = config["image_encoder"]["image_size"]
self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
self._tokenizer = Tokenizer.from_file(tokenizer_path)
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
index 68255de..8201693 100644
--- a/python/uform/onnx_encoders.py
+++ b/python/uform/onnx_encoders.py
@@ -1,5 +1,6 @@
-from os.path import join
-from typing import Dict, Optional, Tuple, Union
+from os import PathLike
+from typing import Dict, Optional, Tuple, Union, Literal
+import json
import onnxruntime as ort
from numpy import ndarray
@@ -9,18 +10,52 @@ class ExecutionProviderError(Exception):
"""Exception raised when a requested execution provider is not available."""
-def available_providers(device: str) -> Tuple[str, ...]:
+def available_providers(device: Optional[str]) -> Tuple[str, ...]:
+ """Returns a tuple of available execution providers based on the requested device.
+ https://onnxruntime.ai/docs/execution-providers/
+
+    :param device: Device name, one of `cpu`, `gpu` (or `cuda`), or a specific execution provider name.
+ :return: Tuple of available execution providers.
+ :raises ExecutionProviderError: If the requested device is not available.
+ """
+
gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider")
cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider")
available = ort.get_available_providers()
- if device == "gpu":
+
+ # If no target device is specified, let's sort all the available ones with respect to our preference
+ if device is None:
+ preferences = gpu_providers + cpu_providers
+ filtered_preferences = tuple(provider for provider in preferences if provider in available)
+ if len(filtered_preferences):
+ return filtered_preferences
+ if len(available):
+ return available
+ raise ExecutionProviderError("No execution providers are available")
+
+ # If a GPU is requested, but no GPU providers are available, raise an error
+ if device == "gpu" or device == "cuda":
if all(provider not in available for provider in gpu_providers):
raise ExecutionProviderError(
f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}"
)
return gpu_providers
- return cpu_providers
+ # If a CPU is requested, but no CPU providers are available, raise an error
+ if device == "cpu":
+ if all(provider not in available for provider in cpu_providers):
+ raise ExecutionProviderError(
+                f"CPU providers are not available, consider installing `onnxruntime` and making sure OpenVINO or CoreML is available on your system. Currently installed: {available}"
+ )
+ return cpu_providers
+
+ if device not in available:
+ available_providers = ", ".join(available)
+ raise ExecutionProviderError(
+ f"Execution provider {device} is not available. Currently installed: {available_providers}"
+ )
+
+ return (device,)
class VisualEncoder:
@@ -40,11 +75,11 @@ def __init__(self, model_path: str, device: str):
)
def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]:
- return self.session.run(None, {"images": images})
+ return self.session.run(None, {"input": images})
class TextEncoder:
- def __init__(self, text_encoder_path: str, reranker_path: str, device: str):
+ def __init__(self, text_encoder_path: str, device: str):
"""
:param text_encoder_path: Path to onnx of text encoder
:param reranker_path: Path to onnx of reranker
@@ -60,56 +95,35 @@ def __init__(self, text_encoder_path: str, reranker_path: str, device: str):
providers=available_providers(device),
)
- self.reranker_session = ort.InferenceSession(
- reranker_path,
- sess_options=session_options,
- providers=available_providers(device),
- )
-
def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]:
return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
- def forward_multimodal(
- self, text_features: ndarray, attention_mask: ndarray, image_features: ndarray
- ) -> Tuple[ndarray, ndarray]:
- return self.reranker_session.run(
- None,
- {
- "text_features": text_features,
- "attention_mask": attention_mask,
- "image_features": image_features,
- },
- )
-
class TextVisualEncoder:
- def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
- assert device in (
- "cpu",
- "gpu",
- ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`"
- assert dtype in (
- "fp32",
- "fp16",
- ), f"Invalid `dtype`: {dtype}. Must be either `fp32` or `fp16` (only for gpu)"
- assert (
- device == "cpu" and dtype == "fp32"
- ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported"
-
+ def __init__(
+ self,
+ config_path: PathLike,
+ modality_paths: Union[Dict[str, PathLike], PathLike] = None,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu",
+ ):
+ """Initializes the model with the configuration and pre-trained weights.
+
+ :param config_path: Path to the JSON model configuration file
+ :param modality_paths: Dictionary with paths to different modalities,
+ or a single path to the model checkpoint
+ """
self.device = device
- self.dtype = dtype
+ config = json.load(open(config_path, "r"))
self._embedding_dim = config["text_encoder"]["embedding_dim"]
self._text_encoder_dim = config["text_encoder"]["dim"]
self._image_encoder_dim = config["image_encoder"]["dim"]
- self.text_encoder = TextEncoder(
- join(checkpoint_path, f"text_encoder.onnx"),
- join(checkpoint_path, f"reranker.onnx"),
- device,
- )
-
- self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device)
+ text_encoder_path = modality_paths.get("text_encoder", None)
+ image_encoder_path = modality_paths.get("image_encoder", None)
+ self.text_encoder = TextEncoder(text_encoder_path, device) if text_encoder_path else None
+ self.image_encoder = VisualEncoder(image_encoder_path, device) if image_encoder_path else None
def encode_image(
self,
@@ -147,51 +161,6 @@ def encode_text(
return embeddings
- def encode_multimodal(
- self,
- image: Optional[ndarray] = None,
- text: Dict[str, ndarray] = None,
- image_features: Optional[ndarray] = None,
- text_features: Optional[ndarray] = None,
- attention_mask: Optional[ndarray] = None,
- return_scores: bool = False,
- ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
- """Passes preprocessed texts (or precomputed texts features) and
- preprocessed images (or precomputed images features) through multimodal encoded to produce matching scores and optionally multimodal joint embeddings.
-
- :param image: Preprocessed images
- :param text: Preprocessed texts
- :param image_features: Precomputed images features
- :param text_features: Precomputed text features
- :param attention_mask: Attention masks, not required if pass `text` instead of text_features
- """
-
- assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None"
- assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None"
-
- if text_features is not None:
- assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`"
-
- if image_features is None:
- image_features = self.image_encoder(image)
-
- if text_features is None:
- text_features = self.text_encoder(
- text["input_ids"],
- text["attention_mask"],
- )
-
- matching_scores, embeddings = self.text_encoder.forward_multimodal(
- text_features,
- attention_mask if attention_mask is not None else text["attention_mask"],
- image_features,
- )
-
- if return_scores:
- return matching_scores, embeddings
-
- return embeddings
-
def forward(
self,
images: ndarray,
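
The reworked ONNX module above resolves execution providers from an optional `device` hint and builds each encoder from its own `.onnx` file, so a modality can be skipped by simply omitting its path. A minimal sketch of how these pieces fit together, assuming the configuration and per-modality files named below already exist locally (the file names are illustrative, not fetched from a real repository):

```python
from uform.onnx_encoders import TextVisualEncoder, available_providers

# With `None`, the installed providers are returned in preference order;
# "cpu", "gpu", or "cuda" restrict the choice to CPU or GPU backends, and any
# other string is treated as an explicit ONNX Runtime provider name.
print(available_providers(None))

# Paths are illustrative; in practice they come from `get_checkpoint`.
model = TextVisualEncoder(
    "config.json",
    {"text_encoder": "text_encoder.onnx", "image_encoder": "image_encoder.onnx"},
    device="cpu",
)
```
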
diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py
index 4339765..2a0a0c9 100644
--- a/python/uform/torch_encoders.py
+++ b/python/uform/torch_encoders.py
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from os import PathLike
from typing import Dict, Optional, Tuple, Union
+import json
import torch
import torch.nn as nn
@@ -358,17 +359,45 @@ class TextVisualEncoder(nn.Module):
Vision-Language Model for Multimodal embeddings.
"""
- def __init__(self, config: Dict, tokenizer_path: PathLike):
- """
- :param config: Model config
+ def __init__(
+ self,
+ config_path: PathLike,
+ modality_paths: Union[Dict[str, PathLike], PathLike] = None,
+ ):
+ """Initializes the model with the configuration and pre-trained weights.
+
+ :param config_path: Path to the JSON model configuration file
+ :param modality_paths: Dictionary with paths to different modalities,
+ or a single path to the model checkpoint
"""
super().__init__()
- config["text_encoder"].pop("tokenizer_class", None)
+ config = json.load(open(config_path, "r"))
self._embedding_dim = config["text_encoder"]["embedding_dim"]
- self.text_encoder = TextEncoder(**config["text_encoder"])
- self.image_encoder = VisualEncoder(**config["image_encoder"])
+
+ # Both `text_encoder` and `image_encoder` are data-classes, so we must strip
+ # all the non-member attributes before initializing the classes.
+ text_fields = TextEncoder.__dataclass_fields__
+ image_fields = VisualEncoder.__dataclass_fields__
+ text_encoder_attrs = {k: v for k, v in config["text_encoder"].items() if k in text_fields}
+ image_encoder_attrs = {k: v for k, v in config["image_encoder"].items() if k in image_fields}
+ self.text_encoder = TextEncoder(**text_encoder_attrs)
+ self.image_encoder = VisualEncoder(**image_encoder_attrs)
+
+ # Load pre-trained weights
+ if modality_paths is not None:
+            if isinstance(modality_paths, (PathLike, str)):
+ state = torch.load(modality_paths)
+ self.text_encoder.load_state_dict(state["text_encoder"])
+ self.image_encoder.load_state_dict(state["image_encoder"])
+ else:
+ text_encoder_path = modality_paths.get("text_encoder", None)
+ image_encoder_path = modality_paths.get("image_encoder", None)
+ if text_encoder_path:
+ self.text_encoder.load_state_dict(torch.load(text_encoder_path))
+ if image_encoder_path:
+ self.image_encoder.load_state_dict(torch.load(image_encoder_path))
def encode_image(
self,
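
As the constructor above shows, the Torch encoders can now be restored either from one merged checkpoint or from separate per-modality state files. A small sketch of both call forms, with the file names assumed for illustration rather than taken from an actual repository:

```python
from uform.torch_encoders import TextVisualEncoder

# One merged checkpoint, expected to contain "text_encoder" and
# "image_encoder" state dicts.
model = TextVisualEncoder("config.json", "model.pt")

# Separate per-modality checkpoints; only the encoders that are listed
# get their weights loaded, the rest keep their random initialization.
model = TextVisualEncoder(
    "config.json",
    {"text_encoder": "text_encoder.pt", "image_encoder": "image_encoder.pt"},
)
```
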
diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py
index 8bdc70b..b435efb 100644
--- a/python/uform/torch_processors.py
+++ b/python/uform/torch_processors.py
@@ -1,5 +1,6 @@
from os import PathLike
from typing import Dict, List, Union
+import json
import torch
from PIL.Image import Image
@@ -15,19 +16,20 @@
)
-# lambda is not pickable
+# lambda is not pickle-able
def convert_to_rgb(image):
return image.convert("RGB")
class TorchProcessor:
- def __init__(self, config: Dict, tokenizer_path: PathLike):
+ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
:param config: model config
:param tokenizer_path: path to tokenizer file
:param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
"""
+ config = json.load(open(config_path, "r"))
self._image_size = config["image_encoder"]["image_size"]
self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
self._tokenizer = Tokenizer.from_file(tokenizer_path)
diff --git a/swift/Embeddings.swift b/swift/Encoders.swift
similarity index 98%
rename from swift/Embeddings.swift
rename to swift/Encoders.swift
index 6d973ac..bc78433 100644
--- a/swift/Embeddings.swift
+++ b/swift/Encoders.swift
@@ -11,6 +11,17 @@ import Foundation
import Hub // `Config`
import Tokenizers // `AutoTokenizer`
+
+enum EncoderError: Error {
+ case configLoadingError(String)
+ case modelLoadingError(String)
+ case unsupportedDataType
+ case invalidInput
+ case unsupportedShapeConstraint
+ case modelPredictionFailed(String)
+}
+
+
public enum Embedding {
case i32s([Int32])
case f16s([Float16])
diff --git a/swift/EmbeddingsTests.swift b/swift/EncodersTests.swift
similarity index 97%
rename from swift/EmbeddingsTests.swift
rename to swift/EncodersTests.swift
index 889cdb6..caab363 100644
--- a/swift/EmbeddingsTests.swift
+++ b/swift/EncodersTests.swift
@@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase {
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform2-vl-english-small",
+ modelName: "unum-cloud/uform3-image-text-english-small",
hubApi: api
)
@@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase {
// A better option is to fetch directly from HuggingFace, similar to how users would do that:
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform2-vl-english-small",
+ modelName: "unum-cloud/uform3-image-text-english-small",
hubApi: api
)
let imageModel = try await ImageEncoder(
- modelName: "unum-cloud/uform2-vl-english-small",
+ modelName: "unum-cloud/uform3-image-text-english-small",
hubApi: api
)
diff --git a/swift/README.md b/swift/README.md
index 1eebf29..66b531f 100644
--- a/swift/README.md
+++ b/swift/README.md
@@ -19,7 +19,7 @@ import UForm
### Text Embeddings
```swift
-let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
let textEmbedding: Embedding = try textModel.forward(with: text)
let textVector: [Float32] = textEmbedding.asFloats()
@@ -28,7 +28,7 @@ let textVector: [Float32] = textEmbedding.asFloats()
### Image Embeddings
```swift
-let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
guard let url = URL(string: imageURL),
let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
From 479ae61d53bf88c0f871765ef2011292986548a8 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 17 Apr 2024 16:01:11 -0700
Subject: [PATCH 07/40] Improve: Support different models with Swift
---
.vscode/settings.json | 3 ++-
CONTRIBUTING.md | 7 +++++++
swift/Encoders.swift | 24 ++++++++++++----------
swift/EncodersTests.swift | 42 ++++++++++++++++++++++++++++++---------
4 files changed, 56 insertions(+), 20 deletions(-)
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 5052dea..3a060e1 100755
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -42,7 +42,8 @@
"uform",
"unimodal",
"unsqueeze",
- "Vardanian"
+ "Vardanian",
+ "whitespaces"
],
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 181d9e2..37bc541 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,6 +20,13 @@ pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loa
## Swift
+To build and test the Swift package, use the following commands:
+
+```bash
+swift build
+swift test
+```
+
Swift formatting is enforced with Apple's default `swift-format` utility.
To install and run it on all the files in the project, use the following command:
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
index bc78433..44c6e71 100644
--- a/swift/Encoders.swift
+++ b/swift/Encoders.swift
@@ -11,7 +11,6 @@ import Foundation
import Hub // `Config`
import Tokenizers // `AutoTokenizer`
-
enum EncoderError: Error {
case configLoadingError(String)
case modelLoadingError(String)
@@ -21,7 +20,6 @@ enum EncoderError: Error {
case modelPredictionFailed(String)
}
-
public enum Embedding {
case i32s([Int32])
case f16s([Float16])
@@ -116,16 +114,22 @@ public class TextEncoder {
let finalConfigPath = configPath ?? modelPath + "/config.json"
let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json"
self.model = try readModel(fromPath: modelPath)
- self.processor = try TextProcessor(configPath: finalConfigPath, tokenizerPath: finalTokenizerPath, model: self.model)
+ self.processor = try TextProcessor(
+ configPath: finalConfigPath,
+ tokenizerPath: finalTokenizerPath,
+ model: self.model
+ )
}
-
public init(modelName: String, hubApi: HubApi = .shared) async throws {
let repo = Hub.Repo(id: modelName)
- let modelURL = try await hubApi.snapshot(from: repo, matching: ["text.mlpackage/*", "config.json", "tokenizer.json"])
+ let modelURL = try await hubApi.snapshot(
+ from: repo,
+ matching: ["text_encoder.mlpackage/*", "config.json", "tokenizer.json"]
+ )
let configPath = modelURL.appendingPathComponent("config.json").path
let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path
- self.model = try readModel(fromURL: modelURL.appendingPathComponent("text.mlpackage", isDirectory: true))
+ self.model = try readModel(fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true))
self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model)
}
@@ -158,12 +162,12 @@ public class ImageEncoder {
public init(modelName: String, hubApi: HubApi = .shared) async throws {
let repo = Hub.Repo(id: modelName)
- let modelURL = try await hubApi.snapshot(from: repo, matching: ["image.mlpackage/*", "config.json"])
+ let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"])
let configPath = modelURL.appendingPathComponent("config.json").path
- self.model = try readModel(fromURL: modelURL.appendingPathComponent("image.mlpackage", isDirectory: true))
+ self.model = try readModel(fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true))
self.processor = try ImageProcessor(configPath: configPath)
}
-
+
public func forward(with image: CGImage) throws -> Embedding {
let inputFeatureProvider = try self.processor.preprocess(image)
let prediction = try self.model.prediction(from: inputFeatureProvider)
@@ -240,7 +244,7 @@ class ImageProcessor {
if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] {
configDict = imageEncoderConfig
}
-
+
let config = Config(configDict)
self.imageSize = config.imageSize!.intValue!
}
diff --git a/swift/EncodersTests.swift b/swift/EncodersTests.swift
index caab363..0096d62 100644
--- a/swift/EncodersTests.swift
+++ b/swift/EncodersTests.swift
@@ -1,11 +1,23 @@
import CoreGraphics
+import Hub
import ImageIO
import UForm
-import Hub
import XCTest
final class TokenizerTests: XCTestCase {
+ var hfToken: String?
+
+ override func setUp() {
+ super.setUp()
+ // Attempt to load the Hugging Face token from the `.hf_token` file in the current directory
+ let fileURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath).appendingPathComponent(".hf_token")
+ if let token = try? String(contentsOf: fileURL, encoding: .utf8).trimmingCharacters(in: .whitespacesAndNewlines)
+ {
+ hfToken = token
+ }
+ }
+
    func cosineSimilarity<T: FloatingPoint>(between vectorA: [T], and vectorB: [T]) -> T {
guard vectorA.count == vectorB.count else {
fatalError("Vectors must be of the same length.")
@@ -23,9 +35,9 @@ final class TokenizerTests: XCTestCase {
return dotProduct / (magnitudeA * magnitudeB)
}
- func testTextEmbeddings() async throws {
+ func testTextEmbeddings(forModel modelName: String) async throws {
- let api = HubApi(hfToken: "xxx")
+ let api = HubApi(hfToken: hfToken)
let textModel = try await TextEncoder(
modelName: "unum-cloud/uform3-image-text-english-small",
hubApi: api
@@ -60,29 +72,35 @@ final class TokenizerTests: XCTestCase {
)
}
- func testImageEmbeddings() async throws {
+ func testTextEmbeddings() async throws {
+ for model in ["unum-cloud/uform3-image-text-english-small"] {
+ try await testTextEmbeddings(forModel: model)
+ }
+ }
+
+ func testImageEmbeddings(forModel modelName: String) async throws {
// One option is to use a local model repository.
//
// let root = "uform/"
// let textModel = try TextEncoder(
- // modelPath: root + "uform-vl-english-large-text.mlpackage",
+ // modelPath: root + "uform-vl-english-large-text_encoder.mlpackage",
// configPath: root + "uform-vl-english-large-text.json",
// tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json"
// )
// let imageModel = try ImageEncoder(
- // modelPath: root + "uform-vl-english-large-image.mlpackage",
+ // modelPath: root + "uform-vl-english-large-image_encoder.mlpackage",
// configPath: root + "uform-vl-english-large-image.json"
// )
//
// A better option is to fetch directly from HuggingFace, similar to how users would do that:
- let api = HubApi(hfToken: "xxx")
+ let api = HubApi(hfToken: hfToken)
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform3-image-text-english-small",
+ modelName: modelName,
hubApi: api
)
let imageModel = try await ImageEncoder(
- modelName: "unum-cloud/uform3-image-text-english-small",
+ modelName: modelName,
hubApi: api
)
@@ -143,4 +161,10 @@ final class TokenizerTests: XCTestCase {
}
}
+ func testImageEmbeddings() async throws {
+ for model in ["unum-cloud/uform3-image-text-english-small"] {
+ try await testImageEmbeddings(forModel: model)
+ }
+ }
+
}
From 45479bdbef457abad69753f3da2876be907898c6 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 18 Apr 2024 04:53:53 +0000
Subject: [PATCH 08/40] Add: JavaScript library placeholder
---
.gitignore | 6 +-
.vscode/launch.json | 10 +
CONTRIBUTING.md | 9 +
javascript/embeddings.mts | 55 ++++
tsconfig.json | 8 +
yarn.lock | 594 ++++++++++++++++++++++++++++++++++++++
6 files changed, 681 insertions(+), 1 deletion(-)
create mode 100644 javascript/embeddings.mts
create mode 100644 tsconfig.json
create mode 100644 yarn.lock
diff --git a/.gitignore b/.gitignore
index 4db8e17..af057d5 100755
--- a/.gitignore
+++ b/.gitignore
@@ -8,9 +8,13 @@ __pycache__
.build
.swiftpm
.hf_token
-node_modules
# Tensors & ML Model
*.onnx
*.pt
*.safetensors
+
+# NodeJS
+node_modules
+node_build
+yarn-error.log
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 59eb78c..305841e 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -10,6 +10,16 @@
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
+ },
+ {
+ "name": "NodeJS Debugger",
+ "type": "node",
+ "request": "launch",
+ "program": "${workspaceFolder}/javascript/embeddings.ts",
+ "preLaunchTask": "tsc: build - tsconfig.json",
+ "outFiles": [
+ "${workspaceFolder}/node_build/**/*.js"
+ ]
}
]
}
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 181d9e2..cff4e0f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -30,3 +30,12 @@ swift-format . -i -r
The style is controlled by the `.swift-format` JSON file in the root of the repository.
As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings.
+
+## JavaScript
+
+Before submitting any changes, please make sure that the tests pass.
+
+```sh
+npm install
+npm run test
+```
diff --git a/javascript/embeddings.mts b/javascript/embeddings.mts
new file mode 100644
index 0000000..6a34344
--- /dev/null
+++ b/javascript/embeddings.mts
@@ -0,0 +1,55 @@
+import * as ort from 'onnxruntime-web';
+import { AutoTokenizer, PreTrainedTokenizer } from '@xenova/transformers';
+
+type ModelConfig = {
+ modelPath: string;
+ tokenizerPath: string;
+};
+
+class TextEncoder {
+ private session: ort.InferenceSession;
+ private tokenizer: PreTrainedTokenizer;
+
+ constructor(private config: ModelConfig) {}
+
+    async init(): Promise<void> {
+ this.tokenizer = await AutoTokenizer.from_pretrained(this.config.tokenizerPath);
+ this.session = await ort.InferenceSession.create(this.config.modelPath);
+ }
+
+ async forward(text: string): Promise<{ features: Uint8Array, embeddings: Uint8Array }> {
+ // Tokenization
+ const { input_ids } = await this.tokenizer(text);
+ const tensorInputIds = new ort.Tensor('float32', Float32Array.from(input_ids), [1, input_ids.length]);
+ const tensorAttentionMask = new ort.Tensor('float32', new Float32Array(input_ids.length).fill(1), [1, input_ids.length]);
+
+ // Model inference
+ const feeds = { input_ids: tensorInputIds, attention_mask: tensorAttentionMask };
+ const results = await this.session.run(feeds);
+
+ // Assume output tensors are in results['features'] and results['embeddings']
+        const features = results['features'].data as Uint8Array;
+        const embeddings = results['embeddings'].data as Uint8Array;
+
+ return { features, embeddings };
+ }
+}
+
+// Usage
+async function main() {
+ const textEncoder = new TextEncoder({
+ modelPath: './text_encoder.onnx',
+ tokenizerPath: 'Xenova/bert-base-uncased'
+ });
+
+ await textEncoder.init();
+ const result = await textEncoder.forward('I love transformers!');
+ console.log('Features:', result.features);
+ console.log('Embeddings:', result.embeddings);
+}
+
+main();
+
+
+
+
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..a77b46b
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,8 @@
+{
+ "compilerOptions": {
+ "target": "ES5",
+ "module": "CommonJS",
+ "outDir": "node_build",
+ "sourceMap": true
+ }
+}
\ No newline at end of file
diff --git a/yarn.lock b/yarn.lock
new file mode 100644
index 0000000..5ab5bbe
--- /dev/null
+++ b/yarn.lock
@@ -0,0 +1,594 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1
+
+
+"@huggingface/hub@^0.14.8":
+ version "0.14.8"
+ resolved "https://registry.npmjs.org/@huggingface/hub/-/hub-0.14.8.tgz"
+ integrity sha512-vdJRham99E5Uzsc4rO0gTz0ykafmx6V78pgPpJ7LGz5X+P2exe/izPFndqczAzy8jVWN55Jjtnuqg+Y0zrjc+Q==
+ dependencies:
+ hash-wasm "^4.9.0"
+
+"@huggingface/jinja@^0.2.2":
+ version "0.2.2"
+ resolved "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz"
+ integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==
+
+"@protobufjs/aspromise@^1.1.1", "@protobufjs/aspromise@^1.1.2":
+ version "1.1.2"
+ resolved "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz"
+ integrity sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==
+
+"@protobufjs/base64@^1.1.2":
+ version "1.1.2"
+ resolved "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz"
+ integrity sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==
+
+"@protobufjs/codegen@^2.0.4":
+ version "2.0.4"
+ resolved "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz"
+ integrity sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==
+
+"@protobufjs/eventemitter@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz"
+ integrity sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==
+
+"@protobufjs/fetch@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz"
+ integrity sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==
+ dependencies:
+ "@protobufjs/aspromise" "^1.1.1"
+ "@protobufjs/inquire" "^1.1.0"
+
+"@protobufjs/float@^1.0.2":
+ version "1.0.2"
+ resolved "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz"
+ integrity sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==
+
+"@protobufjs/inquire@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz"
+ integrity sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==
+
+"@protobufjs/path@^1.1.2":
+ version "1.1.2"
+ resolved "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz"
+ integrity sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==
+
+"@protobufjs/pool@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz"
+ integrity sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==
+
+"@protobufjs/utf8@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz"
+ integrity sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==
+
+"@types/long@^4.0.1":
+ version "4.0.2"
+ resolved "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz"
+ integrity sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==
+
+"@types/node@>=13.7.0":
+ version "20.12.7"
+ resolved "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz"
+ integrity sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==
+ dependencies:
+ undici-types "~5.26.4"
+
+"@xenova/transformers@^2.17.0":
+ version "2.17.0"
+ resolved "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.0.tgz"
+ integrity sha512-usmDut7hwnrc4EqP59cboYqE6C8up63SqMy3E9RjG9nCsOhrsLndEU7DMu+bZ9R+HcAI8jRGabTIxH+B6agBVA==
+ dependencies:
+ "@huggingface/jinja" "^0.2.2"
+ onnxruntime-web "1.14.0"
+ sharp "^0.32.0"
+ optionalDependencies:
+ onnxruntime-node "1.14.0"
+
+b4a@^1.6.4:
+ version "1.6.6"
+ resolved "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz"
+ integrity sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg==
+
+bare-events@^2.0.0, bare-events@^2.2.0:
+ version "2.2.2"
+ resolved "https://registry.npmjs.org/bare-events/-/bare-events-2.2.2.tgz"
+ integrity sha512-h7z00dWdG0PYOQEvChhOSWvOfkIKsdZGkWr083FgN/HyoQuebSew/cgirYqh9SCuy/hRvxc5Vy6Fw8xAmYHLkQ==
+
+bare-fs@^2.1.1:
+ version "2.2.3"
+ resolved "https://registry.npmjs.org/bare-fs/-/bare-fs-2.2.3.tgz"
+ integrity sha512-amG72llr9pstfXOBOHve1WjiuKKAMnebcmMbPWDZ7BCevAoJLpugjuAPRsDINEyjT0a6tbaVx3DctkXIRbLuJw==
+ dependencies:
+ bare-events "^2.0.0"
+ bare-path "^2.0.0"
+ streamx "^2.13.0"
+
+bare-os@^2.1.0:
+ version "2.2.1"
+ resolved "https://registry.npmjs.org/bare-os/-/bare-os-2.2.1.tgz"
+ integrity sha512-OwPyHgBBMkhC29Hl3O4/YfxW9n7mdTr2+SsO29XBWKKJsbgj3mnorDB80r5TiCQgQstgE5ga1qNYrpes6NvX2w==
+
+bare-path@^2.0.0, bare-path@^2.1.0:
+ version "2.1.1"
+ resolved "https://registry.npmjs.org/bare-path/-/bare-path-2.1.1.tgz"
+ integrity sha512-OHM+iwRDRMDBsSW7kl3dO62JyHdBKO3B25FB9vNQBPcGHMo4+eA8Yj41Lfbk3pS/seDY+siNge0LdRTulAau/A==
+ dependencies:
+ bare-os "^2.1.0"
+
+base64-js@^1.3.1:
+ version "1.5.1"
+ resolved "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz"
+ integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==
+
+bl@^4.0.3:
+ version "4.1.0"
+ resolved "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz"
+ integrity sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==
+ dependencies:
+ buffer "^5.5.0"
+ inherits "^2.0.4"
+ readable-stream "^3.4.0"
+
+buffer@^5.5.0:
+ version "5.7.1"
+ resolved "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz"
+ integrity sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==
+ dependencies:
+ base64-js "^1.3.1"
+ ieee754 "^1.1.13"
+
+chownr@^1.1.1:
+ version "1.1.4"
+ resolved "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz"
+ integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==
+
+color-convert@^2.0.1:
+ version "2.0.1"
+ resolved "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz"
+ integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==
+ dependencies:
+ color-name "~1.1.4"
+
+color-name@^1.0.0, color-name@~1.1.4:
+ version "1.1.4"
+ resolved "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz"
+ integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==
+
+color-string@^1.9.0:
+ version "1.9.1"
+ resolved "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz"
+ integrity sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==
+ dependencies:
+ color-name "^1.0.0"
+ simple-swizzle "^0.2.2"
+
+color@^4.2.3:
+ version "4.2.3"
+ resolved "https://registry.npmjs.org/color/-/color-4.2.3.tgz"
+ integrity sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==
+ dependencies:
+ color-convert "^2.0.1"
+ color-string "^1.9.0"
+
+decompress-response@^6.0.0:
+ version "6.0.0"
+ resolved "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz"
+ integrity sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==
+ dependencies:
+ mimic-response "^3.1.0"
+
+deep-extend@^0.6.0:
+ version "0.6.0"
+ resolved "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz"
+ integrity sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==
+
+detect-libc@^2.0.0, detect-libc@^2.0.2:
+ version "2.0.3"
+ resolved "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.3.tgz"
+ integrity sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw==
+
+end-of-stream@^1.1.0, end-of-stream@^1.4.1:
+ version "1.4.4"
+ resolved "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz"
+ integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==
+ dependencies:
+ once "^1.4.0"
+
+expand-template@^2.0.3:
+ version "2.0.3"
+ resolved "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz"
+ integrity sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==
+
+fast-fifo@^1.1.0, fast-fifo@^1.2.0:
+ version "1.3.2"
+ resolved "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz"
+ integrity sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==
+
+flatbuffers@^1.12.0:
+ version "1.12.0"
+ resolved "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.12.0.tgz"
+ integrity sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==
+
+fs-constants@^1.0.0:
+ version "1.0.0"
+ resolved "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz"
+ integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==
+
+github-from-package@0.0.0:
+ version "0.0.0"
+ resolved "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz"
+ integrity sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==
+
+guid-typescript@^1.0.9:
+ version "1.0.9"
+ resolved "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz"
+ integrity sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==
+
+hash-wasm@^4.9.0:
+ version "4.11.0"
+ resolved "https://registry.npmjs.org/hash-wasm/-/hash-wasm-4.11.0.tgz"
+ integrity sha512-HVusNXlVqHe0fzIzdQOGolnFN6mX/fqcrSAOcTBXdvzrXVHwTz11vXeKRmkR5gTuwVpvHZEIyKoePDvuAR+XwQ==
+
+ieee754@^1.1.13:
+ version "1.2.1"
+ resolved "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz"
+ integrity sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==
+
+inherits@^2.0.3, inherits@^2.0.4:
+ version "2.0.4"
+ resolved "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz"
+ integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
+
+ini@~1.3.0:
+ version "1.3.8"
+ resolved "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz"
+ integrity sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==
+
+is-arrayish@^0.3.1:
+ version "0.3.2"
+ resolved "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz"
+ integrity sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==
+
+long@^4.0.0:
+ version "4.0.0"
+ resolved "https://registry.npmjs.org/long/-/long-4.0.0.tgz"
+ integrity sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==
+
+long@^5.0.0:
+ version "5.2.3"
+ resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz"
+ integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==
+
+long@^5.2.3:
+ version "5.2.3"
+ resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz"
+ integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==
+
+lru-cache@^6.0.0:
+ version "6.0.0"
+ resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz"
+ integrity sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==
+ dependencies:
+ yallist "^4.0.0"
+
+mimic-response@^3.1.0:
+ version "3.1.0"
+ resolved "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz"
+ integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==
+
+minimist@^1.2.0, minimist@^1.2.3:
+ version "1.2.8"
+ resolved "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz"
+ integrity sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==
+
+mkdirp-classic@^0.5.2, mkdirp-classic@^0.5.3:
+ version "0.5.3"
+ resolved "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz"
+ integrity sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==
+
+napi-build-utils@^1.0.1:
+ version "1.0.2"
+ resolved "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz"
+ integrity sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==
+
+node-abi@^3.3.0:
+ version "3.57.0"
+ resolved "https://registry.npmjs.org/node-abi/-/node-abi-3.57.0.tgz"
+ integrity sha512-Dp+A9JWxRaKuHP35H77I4kCKesDy5HUDEmScia2FyncMTOXASMyg251F5PhFoDA5uqBrDDffiLpbqnrZmNXW+g==
+ dependencies:
+ semver "^7.3.5"
+
+node-addon-api@^6.1.0:
+ version "6.1.0"
+ resolved "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz"
+ integrity sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==
+
+once@^1.3.1, once@^1.4.0:
+ version "1.4.0"
+ resolved "https://registry.npmjs.org/once/-/once-1.4.0.tgz"
+ integrity sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==
+ dependencies:
+ wrappy "1"
+
+onnx-proto@^4.0.4:
+ version "4.0.4"
+ resolved "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz"
+ integrity sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==
+ dependencies:
+ protobufjs "^6.8.8"
+
+onnxruntime-common@~1.14.0:
+ version "1.14.0"
+ resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz"
+ integrity sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==
+
+onnxruntime-common@1.17.3:
+ version "1.17.3"
+ resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.17.3.tgz"
+ integrity sha512-IkbaDelNVX8cBfHFgsNADRIq2TlXMFWW+nG55mwWvQT4i0NZb32Jf35Pf6h9yjrnK78RjcnlNYaI37w394ovMw==
+
+onnxruntime-node@1.14.0:
+ version "1.14.0"
+ resolved "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz"
+ integrity sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==
+ dependencies:
+ onnxruntime-common "~1.14.0"
+
+onnxruntime-web@^1.17.3:
+ version "1.17.3"
+ resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.17.3.tgz"
+ integrity sha512-MSDrNUWgc1biP0YzY488OJ9n/jTMS9EXysgm9Aw4CUj2A836ALbO2J1sgzguWJeVUHTlM6p7tRzo8IGAgaXWKw==
+ dependencies:
+ flatbuffers "^1.12.0"
+ guid-typescript "^1.0.9"
+ long "^5.2.3"
+ onnxruntime-common "1.17.3"
+ platform "^1.3.6"
+ protobufjs "^7.2.4"
+
+onnxruntime-web@1.14.0:
+ version "1.14.0"
+ resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz"
+ integrity sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==
+ dependencies:
+ flatbuffers "^1.12.0"
+ guid-typescript "^1.0.9"
+ long "^4.0.0"
+ onnx-proto "^4.0.4"
+ onnxruntime-common "~1.14.0"
+ platform "^1.3.6"
+
+platform@^1.3.6:
+ version "1.3.6"
+ resolved "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz"
+ integrity sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==
+
+prebuild-install@^7.1.1:
+ version "7.1.2"
+ resolved "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.2.tgz"
+ integrity sha512-UnNke3IQb6sgarcZIDU3gbMeTp/9SSU1DAIkil7PrqG1vZlBtY5msYccSKSHDqa3hNg436IXK+SNImReuA1wEQ==
+ dependencies:
+ detect-libc "^2.0.0"
+ expand-template "^2.0.3"
+ github-from-package "0.0.0"
+ minimist "^1.2.3"
+ mkdirp-classic "^0.5.3"
+ napi-build-utils "^1.0.1"
+ node-abi "^3.3.0"
+ pump "^3.0.0"
+ rc "^1.2.7"
+ simple-get "^4.0.0"
+ tar-fs "^2.0.0"
+ tunnel-agent "^0.6.0"
+
+protobufjs@^6.8.8:
+ version "6.11.4"
+ resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.4.tgz"
+ integrity sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==
+ dependencies:
+ "@protobufjs/aspromise" "^1.1.2"
+ "@protobufjs/base64" "^1.1.2"
+ "@protobufjs/codegen" "^2.0.4"
+ "@protobufjs/eventemitter" "^1.1.0"
+ "@protobufjs/fetch" "^1.1.0"
+ "@protobufjs/float" "^1.0.2"
+ "@protobufjs/inquire" "^1.1.0"
+ "@protobufjs/path" "^1.1.2"
+ "@protobufjs/pool" "^1.1.0"
+ "@protobufjs/utf8" "^1.1.0"
+ "@types/long" "^4.0.1"
+ "@types/node" ">=13.7.0"
+ long "^4.0.0"
+
+protobufjs@^7.2.4:
+ version "7.2.6"
+ resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.6.tgz"
+ integrity sha512-dgJaEDDL6x8ASUZ1YqWciTRrdOuYNzoOf27oHNfdyvKqHr5i0FV7FSLU+aIeFjyFgVxrpTOtQUi0BLLBymZaBw==
+ dependencies:
+ "@protobufjs/aspromise" "^1.1.2"
+ "@protobufjs/base64" "^1.1.2"
+ "@protobufjs/codegen" "^2.0.4"
+ "@protobufjs/eventemitter" "^1.1.0"
+ "@protobufjs/fetch" "^1.1.0"
+ "@protobufjs/float" "^1.0.2"
+ "@protobufjs/inquire" "^1.1.0"
+ "@protobufjs/path" "^1.1.2"
+ "@protobufjs/pool" "^1.1.0"
+ "@protobufjs/utf8" "^1.1.0"
+ "@types/node" ">=13.7.0"
+ long "^5.0.0"
+
+pump@^3.0.0:
+ version "3.0.0"
+ resolved "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz"
+ integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==
+ dependencies:
+ end-of-stream "^1.1.0"
+ once "^1.3.1"
+
+queue-tick@^1.0.1:
+ version "1.0.1"
+ resolved "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz"
+ integrity sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag==
+
+rc@^1.2.7:
+ version "1.2.8"
+ resolved "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz"
+ integrity sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==
+ dependencies:
+ deep-extend "^0.6.0"
+ ini "~1.3.0"
+ minimist "^1.2.0"
+ strip-json-comments "~2.0.1"
+
+readable-stream@^3.1.1, readable-stream@^3.4.0:
+ version "3.6.2"
+ resolved "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz"
+ integrity sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==
+ dependencies:
+ inherits "^2.0.3"
+ string_decoder "^1.1.1"
+ util-deprecate "^1.0.1"
+
+safe-buffer@^5.0.1, safe-buffer@~5.2.0:
+ version "5.2.1"
+ resolved "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz"
+ integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==
+
+semver@^7.3.5, semver@^7.5.4:
+ version "7.6.0"
+ resolved "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz"
+ integrity sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg==
+ dependencies:
+ lru-cache "^6.0.0"
+
+sharp@^0.32.0:
+ version "0.32.6"
+ resolved "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz"
+ integrity sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==
+ dependencies:
+ color "^4.2.3"
+ detect-libc "^2.0.2"
+ node-addon-api "^6.1.0"
+ prebuild-install "^7.1.1"
+ semver "^7.5.4"
+ simple-get "^4.0.1"
+ tar-fs "^3.0.4"
+ tunnel-agent "^0.6.0"
+
+simple-concat@^1.0.0:
+ version "1.0.1"
+ resolved "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz"
+ integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==
+
+simple-get@^4.0.0, simple-get@^4.0.1:
+ version "4.0.1"
+ resolved "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz"
+ integrity sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==
+ dependencies:
+ decompress-response "^6.0.0"
+ once "^1.3.1"
+ simple-concat "^1.0.0"
+
+simple-swizzle@^0.2.2:
+ version "0.2.2"
+ resolved "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz"
+ integrity sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==
+ dependencies:
+ is-arrayish "^0.3.1"
+
+streamx@^2.13.0, streamx@^2.15.0:
+ version "2.16.1"
+ resolved "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz"
+ integrity sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ==
+ dependencies:
+ fast-fifo "^1.1.0"
+ queue-tick "^1.0.1"
+ optionalDependencies:
+ bare-events "^2.2.0"
+
+string_decoder@^1.1.1:
+ version "1.3.0"
+ resolved "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz"
+ integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
+ dependencies:
+ safe-buffer "~5.2.0"
+
+strip-json-comments@~2.0.1:
+ version "2.0.1"
+ resolved "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz"
+ integrity sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==
+
+tar-fs@^2.0.0:
+ version "2.1.1"
+ resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz"
+ integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==
+ dependencies:
+ chownr "^1.1.1"
+ mkdirp-classic "^0.5.2"
+ pump "^3.0.0"
+ tar-stream "^2.1.4"
+
+tar-fs@^3.0.4:
+ version "3.0.5"
+ resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.5.tgz"
+ integrity sha512-JOgGAmZyMgbqpLwct7ZV8VzkEB6pxXFBVErLtb+XCOqzc6w1xiWKI9GVd6bwk68EX7eJ4DWmfXVmq8K2ziZTGg==
+ dependencies:
+ pump "^3.0.0"
+ tar-stream "^3.1.5"
+ optionalDependencies:
+ bare-fs "^2.1.1"
+ bare-path "^2.1.0"
+
+tar-stream@^2.1.4:
+ version "2.2.0"
+ resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz"
+ integrity sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==
+ dependencies:
+ bl "^4.0.3"
+ end-of-stream "^1.4.1"
+ fs-constants "^1.0.0"
+ inherits "^2.0.3"
+ readable-stream "^3.1.1"
+
+tar-stream@^3.1.5:
+ version "3.1.7"
+ resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz"
+ integrity sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==
+ dependencies:
+ b4a "^1.6.4"
+ fast-fifo "^1.2.0"
+ streamx "^2.15.0"
+
+tunnel-agent@^0.6.0:
+ version "0.6.0"
+ resolved "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz"
+ integrity sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==
+ dependencies:
+ safe-buffer "^5.0.1"
+
+undici-types@~5.26.4:
+ version "5.26.5"
+ resolved "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz"
+ integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==
+
+util-deprecate@^1.0.1:
+ version "1.0.2"
+ resolved "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz"
+ integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==
+
+wrappy@1:
+ version "1.0.2"
+ resolved "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz"
+ integrity sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==
+
+yallist@^4.0.0:
+ version "4.0.0"
+ resolved "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz"
+ integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==
From 2f814135423b1c01a1793461f1c15e1c9328a0b0 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 18 Apr 2024 20:15:00 +0000
Subject: [PATCH 09/40] Make: Consistent naming between Python and TS
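
The Python classes follow suit: `TextVisualEncoder` becomes `TextImageEncoder`
and `VisualEncoder` becomes `ImageEncoder`, matching the `text_encoder` /
`image_encoder` modality names used by the new TypeScript `Modality` enum.
A rough sketch of the resulting high-level Python call, reusing the model
name from the Swift tests in this series:

```python
from uform import get_model_onnx

# Returns a `TextImageEncoder` and a `NumPyProcessor`; encoders whose
# checkpoints are not fetched stay `None` on the model.
model, processor = get_model_onnx(
    "unum-cloud/uform3-image-text-english-small",
    device="cpu",
)
```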
---
javascript/embeddings.mts | 55 -------------------------
javascript/encoders.mts | 74 ++++++++++++++++++++++++++++++++++
javascript/encoders_test.ts | 39 ++++++++++++++++++
package.json | 36 ++++++++++++-----
python/uform/__init__.py | 8 ++--
python/uform/onnx_encoders.py | 8 ++--
python/uform/torch_decoders.py | 4 +-
python/uform/torch_encoders.py | 14 +++----
tsconfig.json | 20 +++++++--
9 files changed, 172 insertions(+), 86 deletions(-)
delete mode 100644 javascript/embeddings.mts
create mode 100644 javascript/encoders.mts
create mode 100644 javascript/encoders_test.ts
diff --git a/javascript/embeddings.mts b/javascript/embeddings.mts
deleted file mode 100644
index 6a34344..0000000
--- a/javascript/embeddings.mts
+++ /dev/null
@@ -1,55 +0,0 @@
-import * as ort from 'onnxruntime-web';
-import { AutoTokenizer, PreTrainedTokenizer } from '@xenova/transformers';
-
-type ModelConfig = {
- modelPath: string;
- tokenizerPath: string;
-};
-
-class TextEncoder {
- private session: ort.InferenceSession;
- private tokenizer: PreTrainedTokenizer;
-
- constructor(private config: ModelConfig) {}
-
-    async init(): Promise<void> {
- this.tokenizer = await AutoTokenizer.from_pretrained(this.config.tokenizerPath);
- this.session = await ort.InferenceSession.create(this.config.modelPath);
- }
-
- async forward(text: string): Promise<{ features: Uint8Array, embeddings: Uint8Array }> {
- // Tokenization
- const { input_ids } = await this.tokenizer(text);
- const tensorInputIds = new ort.Tensor('float32', Float32Array.from(input_ids), [1, input_ids.length]);
- const tensorAttentionMask = new ort.Tensor('float32', new Float32Array(input_ids.length).fill(1), [1, input_ids.length]);
-
- // Model inference
- const feeds = { input_ids: tensorInputIds, attention_mask: tensorAttentionMask };
- const results = await this.session.run(feeds);
-
- // Assume output tensors are in results['features'] and results['embeddings']
-        const features = results['features'].data as Uint8Array;
-        const embeddings = results['embeddings'].data as Uint8Array;
-
- return { features, embeddings };
- }
-}
-
-// Usage
-async function main() {
- const textEncoder = new TextEncoder({
- modelPath: './text_encoder.onnx',
- tokenizerPath: 'Xenova/bert-base-uncased'
- });
-
- await textEncoder.init();
- const result = await textEncoder.forward('I love transformers!');
- console.log('Features:', result.features);
- console.log('Embeddings:', result.embeddings);
-}
-
-main();
-
-
-
-
diff --git a/javascript/encoders.mts b/javascript/encoders.mts
new file mode 100644
index 0000000..cc3b754
--- /dev/null
+++ b/javascript/encoders.mts
@@ -0,0 +1,74 @@
+import { downloadFile, listFiles, RepoDesignation, Credentials } from "@huggingface/hub";
+
+export enum Modality {
+ TextEncoder = "text_encoder",
+ ImageEncoder = "image_encoder",
+ VideoEncoder = "video_encoder",
+ TextDecoder = "text_decoder",
+}
+
+function isModality(key: any): key is keyof typeof Modality {
+ return Object.keys(Modality).includes(key);
+}
+
+function normalizeModalities(modalities: Array<string | Modality>): Array<Modality> {
+ return modalities.map(x => {
+ if (typeof x === "string") {
+ if (isModality(Modality[x as keyof typeof Modality])) {
+ return Modality[x as keyof typeof Modality];
+ } else {
+ throw new Error(`Invalid modality: ${x}`);
+ }
+ }
+ return x;
+ });
+}
+
+export async function getCheckpoint(
+ modelId: string,
+    modalities: Array<string | Modality>,
+ token: string | null = null,
+ format: '.pt' | '.onnx' = '.onnx'
+): Promise<{ configPath: string | null, modalityPaths: Record<string, string> | null, tokenizerPath: string | null }> {
+ modalities = normalizeModalities(modalities);
+
+ const configNames = ['config.json'];
+ const tokenizerNames = ['tokenizer.json'];
+ const modelFileNames = modalities.map(modality => `${modality}${format}`);
+ const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames];
+
+ const repo: RepoDesignation = { type: "model", name: modelId };
+ const credentials: Credentials | undefined = token ? { accessToken: token } : undefined;
+
+ let configPath: string | null = null;
+ let tokenizerPath: string | null = null;
+    const modalityPaths: Record<string, string> = {};
+
+ // List files and directly process
+ const fileIterator = listFiles({ repo, recursive: true, credentials });
+ for await (const file of fileIterator) {
+ const fileName = file.path.split('/').pop();
+ if (fileName && allowedPatterns.includes(fileName)) {
+ const filePath = file.path;
+ if (configNames.includes(fileName)) {
+ configPath = filePath;
+ } else if (tokenizerNames.includes(fileName)) {
+ tokenizerPath = filePath;
+ } else {
+ const modalityName = fileName.split('.')[0];
+ modalityPaths[modalityName] = filePath;
+ }
+
+ // Download the file
+ const response = await downloadFile({ repo, path: filePath, credentials });
+ if (response) {
+ // Handle file response, save locally or process in-memory as needed
+ // Example: Save to a local file or process the file contents
+ console.log(`Downloaded ${fileName} successfully.`);
+ }
+ }
+ }
+
+ return { configPath, modalityPaths, tokenizerPath };
+}
+
diff --git a/javascript/encoders_test.ts b/javascript/encoders_test.ts
new file mode 100644
index 0000000..b5b9f04
--- /dev/null
+++ b/javascript/encoders_test.ts
@@ -0,0 +1,39 @@
+import { getCheckpoint } from "./encoders.mts";
+import { Modality } from "./encoders.mts";
+
+// Simple function to assert conditions
+function assert(condition: boolean, message: string) {
+ if (!condition) {
+ throw new Error(message);
+ }
+}
+
+// Test case for getCheckpoint function
+async function testGetCheckpoint() {
+ console.log("Test getCheckpoint: Start");
+
+ try {
+        const modelId = 'unum-cloud/uform3-image-text-english-small'; // Example model ID
+ const token = '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'; // Example token
+ const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
+
+ const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ modelId,
+ modalities,
+ token,
+ '.onnx'
+ );
+
+ // Asserts to check if the paths are not null (indicating successful file retrieval)
+ assert(configPath !== null, "Config path should not be null");
+ assert(modalityPaths !== null, "Modality paths should not be null");
+ assert(tokenizerPath !== null, "Tokenizer path should not be null");
+
+ console.log("Test getCheckpoint: Success");
+ } catch (error) {
+ console.error("Test getCheckpoint: Failed", error);
+ }
+}
+
+// Run the test
+testGetCheckpoint();
diff --git a/package.json b/package.json
index 7331231..5bf593e 100644
--- a/package.json
+++ b/package.json
@@ -1,11 +1,27 @@
{
- "name": "uform",
- "private": true,
- "version": "2.0.2",
- "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
- "dependencies": {
- "@huggingface/hub": "^0.14.8",
- "@xenova/transformers": "^2.17.0",
- "onnxruntime-web": "^1.17.3"
- }
-}
+ "name": "uform",
+ "private": true,
+ "version": "2.0.2",
+ "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
+ "dependencies": {
+ "@huggingface/hub": "^0.14.8",
+ "@xenova/transformers": "^2.17.0",
+ "onnxruntime-web": "^1.17.3"
+ },
+ "devDependencies": {
+ "typescript": "^4.0.5",
+ "ts-node": "^9.0.0",
+ "@types/node": "^14.14.7"
+ },
+ "scripts": {
+ "build": "tsc",
+ "test": "ts-node javascript/encoders_test.ts"
+ },
+ "main": "node_build/encoders.js",
+ "directories": {
+ "doc": "docs"
+ },
+ "keywords": [],
+ "author": "",
+ "license": "ISC"
+}
\ No newline at end of file
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 44fce13..74d5ee9 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -90,7 +90,7 @@ def get_model(
token: Optional[str] = None,
modalities: Optional[Tuple[str]] = None,
):
- from uform.torch_encoders import TextVisualEncoder
+ from uform.torch_encoders import TextImageEncoder
from uform.torch_processors import TorchProcessor
config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".pt")
@@ -98,7 +98,7 @@ def get_model(
{k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths
)
- model = TextVisualEncoder(config_path, modality_paths)
+ model = TextImageEncoder(config_path, modality_paths)
processor = TorchProcessor(config_path, tokenizer_path)
return model.eval(), processor
@@ -111,7 +111,7 @@ def get_model_onnx(
token: Optional[str] = None,
modalities: Optional[Tuple[str]] = None,
):
- from uform.onnx_encoders import TextVisualEncoder
+ from uform.onnx_encoders import TextImageEncoder
from uform.numpy_processors import NumPyProcessor
config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".onnx")
@@ -119,7 +119,7 @@ def get_model_onnx(
{k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths
)
- model = TextVisualEncoder(config_path, modality_paths, device=device)
+ model = TextImageEncoder(config_path, modality_paths, device=device)
processor = NumPyProcessor(config_path, tokenizer_path)
return model, processor
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
index 8201693..95a0f73 100644
--- a/python/uform/onnx_encoders.py
+++ b/python/uform/onnx_encoders.py
@@ -58,7 +58,7 @@ def available_providers(device: Optional[str]) -> Tuple[str, ...]:
return (device,)
-class VisualEncoder:
+class ImageEncoder:
def __init__(self, model_path: str, device: str):
"""
:param model_path: Path to onnx model
@@ -99,7 +99,7 @@ def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray
return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
-class TextVisualEncoder:
+class TextImageEncoder:
def __init__(
self,
config_path: PathLike,
@@ -123,7 +123,7 @@ def __init__(
text_encoder_path = modality_paths.get("text_encoder", None)
image_encoder_path = modality_paths.get("image_encoder", None)
self.text_encoder = TextEncoder(text_encoder_path, device) if text_encoder_path else None
- self.image_encoder = VisualEncoder(image_encoder_path, device) if image_encoder_path else None
+ self.image_encoder = ImageEncoder(image_encoder_path, device) if image_encoder_path else None
def encode_image(
self,
@@ -200,4 +200,4 @@ def multimodal_embedding_dim(self) -> int:
return self._text_encoder_dim
-VLM_ONNX = TextVisualEncoder # legacy
+VLM_ONNX = TextImageEncoder # legacy
diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py
index 79b058d..db60d63 100644
--- a/python/uform/torch_decoders.py
+++ b/python/uform/torch_decoders.py
@@ -20,7 +20,7 @@
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import BatchEncoding
-from uform.torch_encoders import VisualEncoder
+from uform.torch_encoders import ImageEncoder
IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
@@ -143,7 +143,7 @@ def __init__(self, config: VLMConfig):
self.text_config.vocab_size += 3
self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
- self.image_encoder = VisualEncoder(
+ self.image_encoder = ImageEncoder(
self.config.image_encoder_hidden_size,
self.config.image_encoder_patch_size,
self.config.image_size,
diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py
index 2a0a0c9..f122606 100644
--- a/python/uform/torch_encoders.py
+++ b/python/uform/torch_encoders.py
@@ -133,7 +133,7 @@ def forward(
@dataclass(eq=False)
-class VisualEncoderBlock(nn.Module):
+class ImageEncoderBlock(nn.Module):
dim: int
num_heads: int
@@ -293,7 +293,7 @@ def forward(
@dataclass(eq=False)
-class VisualEncoder(nn.Module):
+class ImageEncoder(nn.Module):
dim: int
patch_size: int
image_size: int
@@ -315,7 +315,7 @@ def __post_init__(self):
self.reg_token = nn.Parameter(torch.zeros(1, self.num_reg_tokens, self.dim))
self.blocks = nn.Sequential(
- *[VisualEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)],
+ *[ImageEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)],
)
self.norm = nn.LayerNorm(self.dim, eps=1e-6)
@@ -354,7 +354,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return embeddings
-class TextVisualEncoder(nn.Module):
+class TextImageEncoder(nn.Module):
"""
Vision-Language Model for Multimodal embeddings.
"""
@@ -379,11 +379,11 @@ def __init__(
# Both `text_encoder` and `image_encoder` are data-classes, so we must strip
# all the non-member attributes before initializing the classes.
text_fields = TextEncoder.__dataclass_fields__
- image_fields = VisualEncoder.__dataclass_fields__
+ image_fields = ImageEncoder.__dataclass_fields__
text_encoder_attrs = {k: v for k, v in config["text_encoder"].items() if k in text_fields}
image_encoder_attrs = {k: v for k, v in config["image_encoder"].items() if k in image_fields}
self.text_encoder = TextEncoder(**text_encoder_attrs)
- self.image_encoder = VisualEncoder(**image_encoder_attrs)
+ self.image_encoder = ImageEncoder(**image_encoder_attrs)
# Load pre-trained weights
if modality_paths is not None:
@@ -535,4 +535,4 @@ def multimodal_embedding_dim(self) -> int:
return self.text_encoder.dim
-VLM = TextVisualEncoder # legacy
+VLM = TextImageEncoder # legacy
diff --git a/tsconfig.json b/tsconfig.json
index a77b46b..a489f33 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -1,8 +1,20 @@
{
"compilerOptions": {
- "target": "ES5",
"module": "CommonJS",
- "outDir": "node_build",
- "sourceMap": true
- }
+ "target": "ES2018",
+ "esModuleInterop": true,
+ "moduleResolution": "node",
+ "baseUrl": ".",
+ "outDir": "dist",
+ "allowImportingTsExtensions": true,
+ "paths": {
+ "*": [
+ "node_modules/*",
+ "javascript/*"
+ ]
+ }
+ },
+ "include": [
+ "javascript/**/*"
+ ]
}
\ No newline at end of file
From eb88296f5397ce5e47b668c3652dcd3f875c20aa Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 18 Apr 2024 23:16:59 +0000
Subject: [PATCH 10/40] Improve: Separate text and image processors
---
python/uform/numpy_processors.py | 32 +++++++++++++++-----
python/uform/torch_processors.py | 50 ++++++++++++++++++++------------
2 files changed, 56 insertions(+), 26 deletions(-)
diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py
index d300504..afda329 100644
--- a/python/uform/numpy_processors.py
+++ b/python/uform/numpy_processors.py
@@ -7,25 +7,20 @@
import numpy as np
-class NumPyProcessor:
+class TextProcessor:
def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
:param config: model config
:param tokenizer_path: path to tokenizer file
- :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
"""
config = json.load(open(config_path, "r"))
- self._image_size = config["image_encoder"]["image_size"]
self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
self._tokenizer = Tokenizer.from_file(tokenizer_path)
self._tokenizer.no_padding()
self._pad_token_idx = config["text_encoder"]["padding_idx"]
- self.image_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)[None, None]
- self.image_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)[None, None]
-
- def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]:
+ def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]:
"""Transforms one or more strings into dictionary with tokenized strings and attention masks.
:param texts: text of list of texts to tokenizer
@@ -53,7 +48,28 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]
return {"input_ids": input_ids, "attention_mask": attention_mask}
- def preprocess_image(self, images: Union[Image, List[Image]]) -> np.ndarray:
+
+class ImageProcessor:
+ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
+ """
+        :param config_path: path to the model config file
+        :param tokenizer_path: path to the tokenizer file (not used by the image processor)
+ """
+
+ config = json.load(open(config_path, "r"))
+ self._image_size = config["image_encoder"]["image_size"]
+ self._normalization_means = config["image_encoder"]["normalization_means"]
+ self._normalization_deviations = config["image_encoder"]["normalization_deviations"]
+
+ assert isinstance(self._image_size, int) and self._image_size > 0
+ assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
+ assert len(self._normalization_means) == len(self._normalization_deviations) == 3
+
+ self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None]
+ self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None]
+
+ def __call__(self, images: Union[Image, List[Image]]) -> np.ndarray:
"""Transforms one or more Pillow images into Torch Tensors.
:param images: image or list of images to preprocess
diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py
index b435efb..340b117 100644
--- a/python/uform/torch_processors.py
+++ b/python/uform/torch_processors.py
@@ -21,35 +21,20 @@ def convert_to_rgb(image):
return image.convert("RGB")
-class TorchProcessor:
+class TextProcessor:
def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
:param config: model config
:param tokenizer_path: path to tokenizer file
- :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
"""
config = json.load(open(config_path, "r"))
- self._image_size = config["image_encoder"]["image_size"]
self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
self._tokenizer = Tokenizer.from_file(tokenizer_path)
self._tokenizer.no_padding()
self._pad_token_idx = config["text_encoder"]["padding_idx"]
- self._image_transform = Compose(
- [
- Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
- convert_to_rgb,
- CenterCrop(self._image_size),
- ToTensor(),
- Normalize(
- mean=(0.48145466, 0.4578275, 0.40821073),
- std=(0.26862954, 0.26130258, 0.27577711),
- ),
- ],
- )
-
- def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
+ def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
"""Transforms one or more strings into dictionary with tokenized strings and attention masks.
:param texts: text of list of texts to tokenizer
@@ -79,7 +64,36 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
return {"input_ids": input_ids, "attention_mask": attention_mask}
- def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
+
+class ImageProcessor:
+ def __init__(self, config_path: PathLike):
+ """
+        :param config_path: path to the model config file
+ """
+
+ config = json.load(open(config_path, "r"))
+ self._image_size = config["image_encoder"]["image_size"]
+ self._normalization_means = config["image_encoder"]["normalization_means"]
+ self._normalization_deviations = config["image_encoder"]["normalization_deviations"]
+
+ assert isinstance(self._image_size, int) and self._image_size > 0
+ assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
+ assert len(self._normalization_means) == len(self._normalization_deviations) == 3
+
+ self._image_transform = Compose(
+ [
+ Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
+ convert_to_rgb,
+ CenterCrop(self._image_size),
+ ToTensor(),
+ Normalize(
+ mean=tuple(self._normalization_means),
+ std=tuple(self._normalization_deviations),
+ ),
+ ],
+ )
+
+ def __call__(self, images: Union[Image, List[Image]]) -> Tensor:
"""Transforms one or more Pillow images into Torch Tensors.
:param images: image or list of images to preprocess
From a391b6d79595a97e89843d64a24fa86119e68356 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 18 Apr 2024 23:17:17 +0000
Subject: [PATCH 11/40] Make: Deprecate TypeScript for JavaScript
---
CONTRIBUTING.md | 14 +++++++
javascript/encoders.mjs | 38 +++++++++++++++++++
javascript/encoders.mts | 74 ------------------------------------
javascript/encoders_test.js | 75 +++++++++++++++++++++++++++++++++++++
javascript/encoders_test.ts | 39 -------------------
javascript/hub.mjs | 68 +++++++++++++++++++++++++++++++++
package.json | 22 ++++++-----
tsconfig.json | 20 ----------
8 files changed, 208 insertions(+), 142 deletions(-)
create mode 100644 javascript/encoders.mjs
delete mode 100644 javascript/encoders.mts
create mode 100644 javascript/encoders_test.js
delete mode 100644 javascript/encoders_test.ts
create mode 100644 javascript/hub.mjs
delete mode 100644 tsconfig.json
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index bf4f409..bcf6d91 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -40,9 +40,23 @@ As there is no standard for Swift formatting, even Apple's own `swift-format` to
## JavaScript
+For rapid development you can run the plain JavaScript sources directly with Node.js, without any precompilation step:
+
+```sh
+node javascript/encoders.mjs
+```
+
Before submitting any changes, please make sure that the tests pass.

```sh
npm install
npm run test
```
+
+To run the test file on its own:
+
+```sh
+node javascript/encoders_test.js
+```
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
new file mode 100644
index 0000000..0107764
--- /dev/null
+++ b/javascript/encoders.mjs
@@ -0,0 +1,38 @@
+import { readFileSync } from 'fs';
+import { InferenceSession } from 'onnxruntime-web';
+
+import { getCheckpoint, Modality } from "./hub.mjs";
+
+import { AutoTokenizer } from '@xenova/transformers';
+
+
+class TextProcessor {
+
+ async init(configPath, tokenizerPath) {
+ const config = JSON.parse(readFileSync(configPath, { encoding: 'utf8' }));
+ this.maxSeqLen = config.text_encoder.max_position_embeddings;
+ this.padTokenIdx = config.text_encoder.padding_idx;
+ this.tokenizer = await AutoTokenizer.from_pretrained(tokenizerPath);
+ }
+
+ async processTexts(texts) {
+ if (typeof texts === 'string') {
+ texts = [texts];
+ }
+
+ const encoded = await this.tokenizer.encodeBatch(texts, {
+ addSpecialTokens: true,
+ returnAttentionMask: true,
+ padding: 'max_length',
+ max_length: this.maxSeqLen,
+ truncation: true,
+ return_tensors: 'np'
+ });
+
+ const inputIds = encoded.map(e => e.input_ids);
+ const attentionMask = encoded.map(e => e.attention_mask);
+ return { inputIds, attentionMask };
+ }
+}
+
+export { TextProcessor };
diff --git a/javascript/encoders.mts b/javascript/encoders.mts
deleted file mode 100644
index cc3b754..0000000
--- a/javascript/encoders.mts
+++ /dev/null
@@ -1,74 +0,0 @@
-import { downloadFile, listFiles, RepoDesignation, Credentials } from "@huggingface/hub";
-
-export enum Modality {
- TextEncoder = "text_encoder",
- ImageEncoder = "image_encoder",
- VideoEncoder = "video_encoder",
- TextDecoder = "text_decoder",
-}
-
-function isModality(key: any): key is keyof typeof Modality {
- return Object.keys(Modality).includes(key);
-}
-
-function normalizeModalities(modalities: Array<string | Modality>): Array<Modality> {
- return modalities.map(x => {
- if (typeof x === "string") {
- if (isModality(Modality[x as keyof typeof Modality])) {
- return Modality[x as keyof typeof Modality];
- } else {
- throw new Error(`Invalid modality: ${x}`);
- }
- }
- return x;
- });
-}
-
-export async function getCheckpoint(
- modelId: string,
-    modalities: Array<string | Modality>,
- token: string | null = null,
- format: '.pt' | '.onnx' = '.onnx'
-): Promise<{ configPath: string | null, modalityPaths: Record<string, string> | null, tokenizerPath: string | null }> {
- modalities = normalizeModalities(modalities);
-
- const configNames = ['config.json'];
- const tokenizerNames = ['tokenizer.json'];
- const modelFileNames = modalities.map(modality => `${modality}${format}`);
- const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames];
-
- const repo: RepoDesignation = { type: "model", name: modelId };
- const credentials: Credentials | undefined = token ? { accessToken: token } : undefined;
-
- let configPath: string | null = null;
- let tokenizerPath: string | null = null;
-    const modalityPaths: Record<string, string> = {};
-
- // List files and directly process
- const fileIterator = listFiles({ repo, recursive: true, credentials });
- for await (const file of fileIterator) {
- const fileName = file.path.split('/').pop();
- if (fileName && allowedPatterns.includes(fileName)) {
- const filePath = file.path;
- if (configNames.includes(fileName)) {
- configPath = filePath;
- } else if (tokenizerNames.includes(fileName)) {
- tokenizerPath = filePath;
- } else {
- const modalityName = fileName.split('.')[0];
- modalityPaths[modalityName] = filePath;
- }
-
- // Download the file
- const response = await downloadFile({ repo, path: filePath, credentials });
- if (response) {
- // Handle file response, save locally or process in-memory as needed
- // Example: Save to a local file or process the file contents
- console.log(`Downloaded ${fileName} successfully.`);
- }
- }
- }
-
- return { configPath, modalityPaths, tokenizerPath };
-}
-
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
new file mode 100644
index 0000000..b3bad21
--- /dev/null
+++ b/javascript/encoders_test.js
@@ -0,0 +1,75 @@
+import { existsSync } from 'fs';
+
+import { getCheckpoint, Modality } from "./hub.mjs";
+import { TextProcessor } from "./encoders.mjs";
+
+function assert(condition, message) {
+ if (!condition) {
+ throw new Error(message);
+ }
+}
+
+async function testGetCheckpoint() {
+ console.log("Test getCheckpoint: Start");
+
+ try {
+ const modelId = 'unum-cloud/uform3-image-text-english-small';
+ const token = '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';
+ const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
+
+ const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ modelId,
+ modalities,
+ token,
+ '.onnx'
+ );
+
+ assert(configPath !== null, "Config path should not be null");
+ assert(modalityPaths !== null, "Modality paths should not be null");
+ assert(tokenizerPath !== null, "Tokenizer path should not be null");
+
+ // Check if the file actually exists
+ assert(existsSync(configPath), `Config file should exist at ${configPath}`);
+ assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`);
+ for (const modalityPath of Object.values(modalityPaths)) {
+ assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`);
+ }
+
+ console.log("Test getCheckpoint: Success");
+ } catch (error) {
+ console.error("Test getCheckpoint: Failed", error);
+ }
+}
+
+async function testTextEncoder() {
+ console.log("Test TextEncoder: Start");
+
+ try {
+ const modelId = 'unum-cloud/uform3-image-text-english-small';
+ const token = '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';
+ const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
+
+ const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ modelId,
+ modalities,
+ token,
+ '.onnx'
+ );
+
+ assert(configPath !== null, "Config path should not be null");
+ assert(modalityPaths !== null, "Modality paths should not be null");
+ assert(tokenizerPath !== null, "Tokenizer path should not be null");
+
+ const textProcessor = new TextProcessor();
+ await textProcessor.init(configPath, tokenizerPath);
+ const processedTexts = await textProcessor.processTexts(["Hello, world!", "Another example text."]);
+ console.log(processedTexts);
+
+ console.log("Test getCheckpoint: Success");
+ } catch (error) {
+ console.error("Test getCheckpoint: Failed", error);
+ }
+}
+
+testGetCheckpoint();
+testTextEncoder();
diff --git a/javascript/encoders_test.ts b/javascript/encoders_test.ts
deleted file mode 100644
index b5b9f04..0000000
--- a/javascript/encoders_test.ts
+++ /dev/null
@@ -1,39 +0,0 @@
-import { getCheckpoint } from "./encoders.mts";
-import { Modality } from "./encoders.mts";
-
-// Simple function to assert conditions
-function assert(condition: boolean, message: string) {
- if (!condition) {
- throw new Error(message);
- }
-}
-
-// Test case for getCheckpoint function
-async function testGetCheckpoint() {
- console.log("Test getCheckpoint: Start");
-
- try {
- const modelId = 'uform3-image-text-english-small'; // Example model ID
- const token = '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'; // Example token
- const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
-
- const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
- modelId,
- modalities,
- token,
- '.onnx'
- );
-
- // Asserts to check if the paths are not null (indicating successful file retrieval)
- assert(configPath !== null, "Config path should not be null");
- assert(modalityPaths !== null, "Modality paths should not be null");
- assert(tokenizerPath !== null, "Tokenizer path should not be null");
-
- console.log("Test getCheckpoint: Success");
- } catch (error) {
- console.error("Test getCheckpoint: Failed", error);
- }
-}
-
-// Run the test
-testGetCheckpoint();
diff --git a/javascript/hub.mjs b/javascript/hub.mjs
new file mode 100644
index 0000000..99ebfee
--- /dev/null
+++ b/javascript/hub.mjs
@@ -0,0 +1,68 @@
+import { downloadFile, listFiles } from "@huggingface/hub";
+
+const Modality = {
+ TextEncoder: "text_encoder",
+ ImageEncoder: "image_encoder",
+ VideoEncoder: "video_encoder",
+ TextDecoder: "text_decoder",
+};
+
+function isModality(value) {
+ return Object.values(Modality).includes(value);
+}
+
+function normalizeModalities(modalities) {
+ return modalities.map(x => {
+ if (typeof x === "string") {
+ if (isModality(x)) {
+ return x;
+ } else {
+ throw new Error(`Invalid modality: ${x}`);
+ }
+ }
+ return x;
+ });
+}
+
+async function getCheckpoint(
+ modelId, modalities, token = null, format = '.onnx',
+) {
+ modalities = normalizeModalities(modalities);
+
+ const configNames = ['config.json'];
+ const tokenizerNames = ['tokenizer.json'];
+ const modelFileNames = modalities.map(modality => `${modality}${format}`);
+ const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames];
+
+ const repo = { type: "model", name: modelId };
+ const credentials = token ? { accessToken: token } : undefined;
+
+ let configPath = null;
+ let tokenizerPath = null;
+ const modalityPaths = {};
+
+ const fileIterator = listFiles({ repo, recursive: true, credentials });
+ for await (const file of fileIterator) {
+ const fileName = file.path.split('/').pop();
+ if (fileName && allowedPatterns.includes(fileName)) {
+ const filePath = file.path;
+ if (configNames.includes(fileName)) {
+ configPath = filePath;
+ } else if (tokenizerNames.includes(fileName)) {
+ tokenizerPath = filePath;
+ } else {
+ const modalityName = fileName.split('.')[0];
+ modalityPaths[modalityName] = filePath;
+ }
+
+ const response = await downloadFile({ repo, path: filePath, credentials });
+ if (response) {
+ console.log(`Downloaded ${fileName} successfully to ${response.json()}`);
+ }
+ }
+ }
+
+ return { configPath, modalityPaths, tokenizerPath };
+}
+
+export { getCheckpoint, Modality };
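For reference, a minimal usage sketch of the `getCheckpoint` helper introduced above. The model ID mirrors the one used in the tests, the `null` token assumes a public repository, and the top-level `await` assumes the snippet runs inside an ES module:

```js
// Hypothetical usage of hub.mjs; the values passed here are illustrative, not part of the patch.
import { getCheckpoint, Modality } from "./hub.mjs";

const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
    'unum-cloud/uform3-image-text-english-small',   // public repo on the HuggingFace Hub
    [Modality.TextEncoder, Modality.ImageEncoder],  // which modality files to pull
    null,                                           // access token, only needed for gated repos
    '.onnx',                                        // fetch the ONNX flavor of the encoders
);
console.log(configPath, tokenizerPath, modalityPaths);
```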
diff --git a/package.json b/package.json
index 5bf593e..a25922f 100644
--- a/package.json
+++ b/package.json
@@ -1,5 +1,6 @@
{
"name": "uform",
+ "type": "module",
"private": true,
"version": "2.0.2",
"description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
@@ -9,19 +10,22 @@
"onnxruntime-web": "^1.17.3"
},
"devDependencies": {
- "typescript": "^4.0.5",
- "ts-node": "^9.0.0",
- "@types/node": "^14.14.7"
+ "nodemon": "^2.0.15"
},
"scripts": {
- "build": "tsc",
- "test": "ts-node javascript/encoders_test.ts"
+ "start": "node javascript/encoders.mjs",
+ "test": "node javascript/encoders_test.js"
},
- "main": "node_build/encoders.js",
+ "main": "javascript/encoders.mjs",
"directories": {
"doc": "docs"
},
- "keywords": [],
- "author": "",
- "license": "ISC"
+ "keywords": [
+ "AI",
+ "multimodal",
+ "content generation",
+ "huggingface"
+ ],
+ "author": "Ash Vardanian, Unum Cloud",
+ "license": "Apache-2.0"
}
\ No newline at end of file
diff --git a/tsconfig.json b/tsconfig.json
deleted file mode 100644
index a489f33..0000000
--- a/tsconfig.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "compilerOptions": {
- "module": "CommonJS",
- "target": "ES2018",
- "esModuleInterop": true,
- "moduleResolution": "node",
- "baseUrl": ".",
- "outDir": "dist",
- "allowImportingTsExtensions": true,
- "paths": {
- "*": [
- "node_modules/*",
- "javascript/*"
- ]
- }
- },
- "include": [
- "javascript/**/*"
- ]
-}
\ No newline at end of file
From 50c71c80741962baac11752333964438a1a1a87e Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 19 Apr 2024 02:37:25 +0000
Subject: [PATCH 12/40] Add: Text processor for JS
---
.vscode/launch.json | 8 +---
javascript/encoders.mjs | 93 ++++++++++++++++++++++++++++++-------
javascript/encoders_test.js | 13 ++++--
javascript/hub.mjs | 50 +++++++++++++++++---
package.json | 3 +-
5 files changed, 132 insertions(+), 35 deletions(-)
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 305841e..3343a11 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -13,13 +13,9 @@
},
{
"name": "NodeJS Debugger",
- "type": "node",
+ "type": "node-terminal",
"request": "launch",
- "program": "${workspaceFolder}/javascript/embeddings.ts",
- "preLaunchTask": "tsc: build - tsconfig.json",
- "outFiles": [
- "${workspaceFolder}/node_build/**/*.js"
- ]
+ "command": "npm run test",
}
]
}
\ No newline at end of file
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index 0107764..bd04ee6 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -1,38 +1,97 @@
import { readFileSync } from 'fs';
-import { InferenceSession } from 'onnxruntime-web';
-
+import { InferenceSession, Tensor } from 'onnxruntime-node';
import { getCheckpoint, Modality } from "./hub.mjs";
-
-import { AutoTokenizer } from '@xenova/transformers';
+import { PreTrainedTokenizer } from '@xenova/transformers';
class TextProcessor {
- async init(configPath, tokenizerPath) {
- const config = JSON.parse(readFileSync(configPath, { encoding: 'utf8' }));
+ constructor(configPath, tokenizerPath) {
+ this.configPath = configPath;
+ this.tokenizerPath = tokenizerPath;
+
+ this.maxSeqLen = 0;
+ this.padTokenIdx = 0;
+ this.tokenizer = null;
+ }
+
+ async init() {
+ const config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' }));
this.maxSeqLen = config.text_encoder.max_position_embeddings;
this.padTokenIdx = config.text_encoder.padding_idx;
- this.tokenizer = await AutoTokenizer.from_pretrained(tokenizerPath);
+
+ const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' }));
+ this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config.text_encoder);
+ this.tokenizer.model_max_length = this.maxSeqLen;
+ this.tokenizer.pad_token_id = this.padTokenIdx;
}
- async processTexts(texts) {
- if (typeof texts === 'string') {
- texts = [texts];
- }
+ async process(texts) {
- const encoded = await this.tokenizer.encodeBatch(texts, {
+ const encoded = await this.tokenizer(texts, {
addSpecialTokens: true,
returnAttentionMask: true,
padding: 'max_length',
max_length: this.maxSeqLen,
truncation: true,
- return_tensors: 'np'
});
- const inputIds = encoded.map(e => e.input_ids);
- const attentionMask = encoded.map(e => e.attention_mask);
- return { inputIds, attentionMask };
+ return {
+ 'input_ids': encoded.input_ids,
+ 'attention_mask': encoded.attention_mask,
+ };
+ }
+}
+
+class TextEncoder {
+
+ constructor(configPath, modelPath, tokenizerPath) {
+ this.configPath = configPath;
+ this.modelPath = modelPath;
+ this.tokenizerPath = tokenizerPath;
+
+ this.session = null;
+ }
+
+ async init() {
+ this.session = await InferenceSession.create(this.modelPath);
}
+
+ async forward(inputs) {
+ // Helper function to convert any BigInt64Array or other numeric arrays to Int32Array
+ function convertToCompatibleInt32(data) {
+ if (data instanceof Int32Array) {
+ return data; // Already the correct type
+ } else if (data instanceof BigInt64Array) {
+ // Convert BigInt64Array to Int32Array, ensuring values are within range
+ return new Int32Array(data.map(bigInt => {
+ if (bigInt > 2147483647n || bigInt < -2147483648n) {
+ throw new Error("Value out of range for Int32.");
+ }
+ return Number(bigInt); // Convert BigInt to Number and store in Int32Array
+ }));
+ } else if (Array.isArray(data) || data instanceof Uint32Array) {
+ // Convert other numeric array types to Int32Array
+ return new Int32Array(data.map(Number));
+ }
+ throw new Error("Unsupported data type for tensor conversion.");
+ }
+
+ // Prepare the tensor data using the helper function
+ const inputIDsData = convertToCompatibleInt32(inputs.input_ids.data);
+ const attentionMaskData = convertToCompatibleInt32(inputs.attention_mask.data);
+
+ // Create ONNX Tensors as int32
+ const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims);
+ const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims);
+
+ // Run the model inference
+ return this.session.run({
+ input_ids: inputIDs,
+ attention_mask: attentionMask,
+ });
+ }
+
}
-export { TextProcessor };
+export { TextProcessor, TextEncoder };
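The `int32` conversion inside `forward` is the subtle part: `onnxruntime-node` tensors are strictly typed, while tokenizers typically emit 64-bit token ids. Below is a standalone sketch of the same down-cast with hypothetical token ids, assuming every id fits into the signed 32-bit range:

```js
// Sketch of the BigInt64Array -> Int32Array down-cast used by TextEncoder.forward above.
import { Tensor } from 'onnxruntime-node';

const ids64 = new BigInt64Array([101n, 7592n, 1010n, 2088n, 102n]); // hypothetical token ids
const ids32 = new Int32Array(Array.from(ids64, value => {
    if (value > 2147483647n || value < -2147483648n) {
        throw new Error("Value out of range for Int32.");
    }
    return Number(value);
}));

// Build an 'int32' tensor of shape [batch_size, seq_len], matching what forward() feeds the session.
const inputIds = new Tensor('int32', ids32, [1, ids32.length]);
console.log(inputIds.type, inputIds.dims); // int32 [ 1, 5 ]
```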
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
index b3bad21..1a09cac 100644
--- a/javascript/encoders_test.js
+++ b/javascript/encoders_test.js
@@ -1,7 +1,7 @@
import { existsSync } from 'fs';
import { getCheckpoint, Modality } from "./hub.mjs";
-import { TextProcessor } from "./encoders.mjs";
+import { TextProcessor, TextEncoder } from "./encoders.mjs";
function assert(condition, message) {
if (!condition) {
@@ -60,11 +60,16 @@ async function testTextEncoder() {
assert(modalityPaths !== null, "Modality paths should not be null");
assert(tokenizerPath !== null, "Tokenizer path should not be null");
- const textProcessor = new TextProcessor();
- await textProcessor.init(configPath, tokenizerPath);
- const processedTexts = await textProcessor.processTexts(["Hello, world!", "Another example text."]);
+ const textProcessor = new TextProcessor(configPath, tokenizerPath);
+ await textProcessor.init();
+ const processedTexts = await textProcessor.process("Hello, world!");
console.log(processedTexts);
+ const textEncoder = new TextEncoder(configPath, modalityPaths.text_encoder, tokenizerPath);
+ await textEncoder.init();
+ const output = await textEncoder.forward(processedTexts);
+ console.log(output);
+
console.log("Test getCheckpoint: Success");
} catch (error) {
console.error("Test getCheckpoint: Failed", error);
diff --git a/javascript/hub.mjs b/javascript/hub.mjs
index 99ebfee..ad534f3 100644
--- a/javascript/hub.mjs
+++ b/javascript/hub.mjs
@@ -1,3 +1,6 @@
+import { join } from "path"
+import { createWriteStream, existsSync, mkdirSync, writeFileSync } from "fs";
+
import { downloadFile, listFiles } from "@huggingface/hub";
const Modality = {
@@ -24,9 +27,13 @@ function normalizeModalities(modalities) {
});
}
-async function getCheckpoint(
- modelId, modalities, token = null, format = '.onnx',
-) {
+async function ensureDirectoryExists(dirPath) {
+ if (!existsSync(dirPath)) {
+ mkdirSync(dirPath, { recursive: true });
+ }
+}
+
+async function getCheckpoint(modelId, modalities, token = null, format = '.onnx', saveDir = './models') {
modalities = normalizeModalities(modalities);
const configNames = ['config.json'];
@@ -40,24 +47,53 @@ async function getCheckpoint(
let configPath = null;
let tokenizerPath = null;
const modalityPaths = {};
+ const modelSaveDir = join(saveDir, modelId);
+
+ await ensureDirectoryExists(modelSaveDir);
const fileIterator = listFiles({ repo, recursive: true, credentials });
for await (const file of fileIterator) {
const fileName = file.path.split('/').pop();
if (fileName && allowedPatterns.includes(fileName)) {
const filePath = file.path;
+ const savePath = join(modelSaveDir, fileName);
+
if (configNames.includes(fileName)) {
- configPath = filePath;
+ configPath = savePath;
} else if (tokenizerNames.includes(fileName)) {
- tokenizerPath = filePath;
+ tokenizerPath = savePath;
} else {
const modalityName = fileName.split('.')[0];
- modalityPaths[modalityName] = filePath;
+ modalityPaths[modalityName] = savePath;
}
const response = await downloadFile({ repo, path: filePath, credentials });
if (response) {
- console.log(`Downloaded ${fileName} successfully to ${response.json()}`);
+ // HuggingFace might be defining the `env.localModelPath` variable
+ // to store the downloaded files in a local directory.
+ // Let's check if the file is there.
+ // const localPath = join(env.localModelPath, repo, filePath);
+ // if (existsSync(localPath)) {
+ // console.log(`File already exists locally at ${localPath}`);
+ // }
+
+ if (response.body && response.body.pipe) {
+ const fileStream = createWriteStream(savePath);
+ response.body.pipe(fileStream);
+ await new Promise((resolve, reject) => {
+ fileStream.on('finish', resolve);
+ fileStream.on('error', reject);
+ });
+ } else if (response.arrayBuffer) {
+ // Handle non-streamable response for environments like Node.js
+ const buffer = await response.arrayBuffer();
+ writeFileSync(savePath, Buffer.from(buffer));
+ } else {
+ console.error('Unexpected response type');
+ }
+ console.log(`Downloaded ${fileName} successfully to ${savePath}`);
+ } else {
+ console.log('No response received for the file download request.');
}
}
}
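A side note on the download branch above: on newer Node.js versions the manual `pipe` plus Promise wrapper can be expressed with `stream/promises`. This is only a sketch, assuming `downloadFile` resolves to a `fetch`-style `Response` whose `body` is a web `ReadableStream`, and that Node 17+ is available for `Readable.fromWeb`; `saveResponse` is a hypothetical helper, not part of the patch:

```js
// Sketch: persisting a fetch-style Response body to disk with stream/promises.
import { createWriteStream } from 'fs';
import { Readable } from 'stream';
import { pipeline } from 'stream/promises';

async function saveResponse(response, savePath) {
    if (!response.body) {
        throw new Error(`Empty response body for ${savePath}`);
    }
    // Bridge the web ReadableStream into a Node stream and pipe it straight to disk.
    await pipeline(Readable.fromWeb(response.body), createWriteStream(savePath));
}
```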
diff --git a/package.json b/package.json
index a25922f..9be073f 100644
--- a/package.json
+++ b/package.json
@@ -7,6 +7,7 @@
"dependencies": {
"@huggingface/hub": "^0.14.8",
"@xenova/transformers": "^2.17.0",
+ "onnxruntime-node": "^1.17.0",
"onnxruntime-web": "^1.17.3"
},
"devDependencies": {
@@ -28,4 +29,4 @@
],
"author": "Ash Vardanian, Unum Cloud",
"license": "Apache-2.0"
-}
\ No newline at end of file
+}
From 19c0c30718b53e81267757e2a3bbf9fe8e7dec9c Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 19 Apr 2024 02:38:57 +0000
Subject: [PATCH 13/40] Fix: Mismatch in the input types for text
---
javascript/encoders.mjs | 32 +++++-----
python/scripts/export_encoders.ipynb | 90 +++++++++++++++++++++++++++-
2 files changed, 106 insertions(+), 16 deletions(-)
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index bd04ee6..c55d2d1 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -58,34 +58,36 @@ class TextEncoder {
}
async forward(inputs) {
- // Helper function to convert any BigInt64Array or other numeric arrays to Int32Array
- function convertToCompatibleInt32(data) {
+ // Helper function to convert BigInt64Array to Int32Array or validate Int32Array
+ function ensureInt32Array(data) {
if (data instanceof Int32Array) {
- return data; // Already the correct type
- } else if (data instanceof BigInt64Array) {
- // Convert BigInt64Array to Int32Array, ensuring values are within range
- return new Int32Array(data.map(bigInt => {
+ return data; // Use as is if already Int32Array
+ }
+ if (data instanceof BigInt64Array) {
+ // Convert BigInt64Array to Int32Array, ensuring all values are in range
+ return new Int32Array(Array.from(data).map(bigInt => {
if (bigInt > 2147483647n || bigInt < -2147483648n) {
throw new Error("Value out of range for Int32.");
}
- return Number(bigInt); // Convert BigInt to Number and store in Int32Array
+ return Number(bigInt); // Convert BigInt to Number
}));
- } else if (Array.isArray(data) || data instanceof Uint32Array) {
- // Convert other numeric array types to Int32Array
- return new Int32Array(data.map(Number));
+ }
+ // Additional case: handle conversion from generic Arrays or other typed arrays to Int32Array
+ if (Array.isArray(data) || data instanceof Uint32Array || data instanceof Uint8Array) {
+ return new Int32Array(data); // Convert directly
}
throw new Error("Unsupported data type for tensor conversion.");
}
- // Prepare the tensor data using the helper function
- const inputIDsData = convertToCompatibleInt32(inputs.input_ids.data);
- const attentionMaskData = convertToCompatibleInt32(inputs.attention_mask.data);
+ // Prepare tensor data
+ const inputIDsData = ensureInt32Array(inputs.input_ids.data);
+ const attentionMaskData = ensureInt32Array(inputs.attention_mask.data);
- // Create ONNX Tensors as int32
+ // Create ONNX Tensors as 'int32'
const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims);
const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims);
- // Run the model inference
+ // Run model inference
return this.session.run({
input_ids: inputIDs,
attention_mask: attentionMask,
diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
index c7a94e0..a8d2ac3 100644
--- a/python/scripts/export_encoders.ipynb
+++ b/python/scripts/export_encoders.ipynb
@@ -493,6 +493,87 @@
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's make sure that all the text inputs are integers of identical type - `int32`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "import os\n",
+ "from onnx import helper\n",
+ "\n",
+ "# Load the ONNX model\n",
+ "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
+ "\n",
+ "# Get the module's graph\n",
+ "graph = module.graph\n",
+ "\n",
+ "# Iterate through the inputs and update the data type of `input_ids`\n",
+ "for input_tensor in graph.input:\n",
+ " # Check if this is the tensor we want to change\n",
+ " if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n",
+ " # Get the tensor type information\n",
+ " tensor_type = input_tensor.type.tensor_type\n",
+ " # Set the element type to INT32 (int32's enum value in onnx is 6)\n",
+ " tensor_type.elem_type = onnx.TensorProto.INT32\n",
+ "\n",
+ "# Optionally, check that the module is still valid\n",
+ "onnx.checker.check_model(module)\n",
+ "\n",
+ "# Save the modified module\n",
+ "onnx.save(module, module_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can use the following function to print and validate the input and output types of the ONNX model files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_model_inputs_and_outputs(onnx_model_path):\n",
+ " model = onnx.load(onnx_model_path)\n",
+ "\n",
+ " # Get the model's graph\n",
+ " graph = model.graph\n",
+ "\n",
+ " # Print input information\n",
+ " print(\"Model Inputs:\")\n",
+ " for input_tensor in graph.input:\n",
+ " tensor_type = input_tensor.type.tensor_type\n",
+ " # Get the element type (data type)\n",
+ " elem_type = tensor_type.elem_type\n",
+ " # Convert numeric type to readable format\n",
+ " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
+ " # Get tensor shape\n",
+ " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
+ " print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n",
+ "\n",
+ " # Print output information similarly if needed\n",
+ " print(\"\\nModel Outputs:\")\n",
+ " for output_tensor in graph.output:\n",
+ " tensor_type = output_tensor.type.tensor_type\n",
+ " elem_type = tensor_type.elem_type\n",
+ " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
+ " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
+ " print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -551,6 +632,13 @@
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -569,7 +657,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.11"
+ "version": "3.11.5"
}
},
"nbformat": 4,
From 7ac33bd836627ae0b9788b2ffe241e826e9ddd32 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 19 Apr 2024 04:59:05 +0000
Subject: [PATCH 14/40] Fix: Passing tests in JavaScript
---
javascript/encoders.mjs | 161 +++++++++++++++++++++++++++++++++---
javascript/encoders_test.js | 33 +++++---
2 files changed, 174 insertions(+), 20 deletions(-)
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index c55d2d1..ae9dbec 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -1,8 +1,9 @@
import { readFileSync } from 'fs';
import { InferenceSession, Tensor } from 'onnxruntime-node';
-import { getCheckpoint, Modality } from "./hub.mjs";
import { PreTrainedTokenizer } from '@xenova/transformers';
+import sharp from 'sharp';
+import { getCheckpoint, Modality } from "./hub.mjs";
class TextProcessor {
@@ -16,12 +17,16 @@ class TextProcessor {
}
async init() {
- const config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' }));
- this.maxSeqLen = config.text_encoder.max_position_embeddings;
- this.padTokenIdx = config.text_encoder.padding_idx;
+ var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' }));
+ if (config.text_encoder !== undefined) {
+ config = config.text_encoder;
+ }
+
+ this.maxSeqLen = config.max_position_embeddings;
+ this.padTokenIdx = config.padding_idx;
const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' }));
- this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config.text_encoder);
+ this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config);
this.tokenizer.model_max_length = this.maxSeqLen;
this.tokenizer.pad_token_id = this.padTokenIdx;
}
@@ -45,11 +50,8 @@ class TextProcessor {
class TextEncoder {
- constructor(configPath, modelPath, tokenizerPath) {
- this.configPath = configPath;
+ constructor(modelPath, processor = null) {
this.modelPath = modelPath;
- this.tokenizerPath = tokenizerPath;
-
this.session = null;
}
@@ -57,7 +59,18 @@ class TextEncoder {
this.session = await InferenceSession.create(this.modelPath);
}
+ async dispose() {
+ if (this.session) {
+ await this.session.release();
+ this.session = null;
+ }
+ }
+
async forward(inputs) {
+ if (!this.session) {
+ throw new Error("Session is not initialized.");
+ }
+
// Helper function to convert BigInt64Array to Int32Array or validate Int32Array
function ensureInt32Array(data) {
if (data instanceof Int32Array) {
@@ -96,4 +109,132 @@ class TextEncoder {
}
-export { TextProcessor, TextEncoder };
+
+class ImageProcessor {
+ constructor(configPath) {
+ this.configPath = configPath;
+ }
+
+ async init() {
+ var config = JSON.parse(readFileSync(this.configPath, 'utf8'));
+ if (config.image_encoder !== undefined) {
+ config = config.image_encoder;
+ }
+
+ this.imageSize = config.image_size;
+ this.normalizationMeans = config.normalization_means;
+ this.normalizationDeviations = config.normalization_deviations;
+
+        this.imageMean = new Float32Array(this.normalizationMeans);
+        this.imageStd = new Float32Array(this.normalizationDeviations);
+ }
+ async process(images) {
+ const processSingle = async (image) => {
+ let img = sharp(image);
+ const metadata = await img.metadata();
+ const scale = this.imageSize / Math.min(metadata.width, metadata.height);
+ const scaledWidth = parseInt(metadata.width * scale);
+ const scaledHeight = parseInt(metadata.height * scale);
+ img = img.resize({
+ width: scaledWidth,
+ height: scaledHeight,
+ fit: sharp.fit.cover,
+ position: sharp.strategy.entropy
+ }).extract({
+                left: Math.max(0, Math.round((scaledWidth - this.imageSize) / 2)),
+                top: Math.max(0, Math.round((scaledHeight - this.imageSize) / 2)),
+ width: this.imageSize,
+ height: this.imageSize
+ }).removeAlpha();
+
+ let buffer = await img.raw().toBuffer();
+ let array = new Float32Array(buffer);
+
+ return array.map((value, index) => {
+ const channel = index % 3;
+ return (value / 255.0 - this.normalizationMeans[channel]) / this.normalizationDeviations[channel];
+ });
+ };
+
+ if (Array.isArray(images)) {
+ return Promise.all(images.map(img => processSingle(img)));
+ } else {
+ return [await processSingle(images)];
+ }
+ }
+}
+
+class ImageEncoder {
+ constructor(modelPath, processor) {
+ this.modelPath = modelPath;
+ this.imageSize = processor.imageSize;
+ }
+
+ async init() {
+ this.session = await InferenceSession.create(this.modelPath);
+ }
+
+ async dispose() {
+ if (this.session) {
+ await this.session.release();
+ this.session = null;
+ }
+ }
+
+ async forward(inputs) {
+ if (!this.session) {
+ throw new Error("Session is not initialized.");
+ }
+
+ // Helper function to ensure data is a Float32Array.
+ const ensureFloat32Array = (data) => {
+ if (!(data instanceof Float32Array)) {
+ throw new Error("Unsupported data type for tensor conversion.");
+ }
+ return data;
+ };
+
+ // Helper function to concatenate multiple Float32Arrays into a single Float32Array.
+ const concatFloat32Arrays = (arrays) => {
+ const totalLength = arrays.reduce((acc, val) => acc + val.length, 0);
+ const result = new Float32Array(totalLength);
+ let offset = 0;
+ for (let arr of arrays) {
+ result.set(arr, offset);
+ offset += arr.length;
+ }
+ return result;
+ };
+
+ let inputData;
+ let dims;
+
+ if (Array.isArray(inputs)) {
+ // Assuming each input in the array is a Float32Array representing an image already processed to a fixed size.
+ const arrays = inputs.map(ensureFloat32Array);
+ inputData = concatFloat32Arrays(arrays);
+ const numImages = arrays.length;
+ const numChannels = 3;
+ const height = this.imageSize;
+ const width = this.imageSize;
+ dims = [numImages, numChannels, height, width];
+ } else {
+ // Single image input, which is already a Float32Array.
+ inputData = ensureFloat32Array(inputs);
+ const numChannels = 3;
+ const height = this.imageSize;
+ const width = this.imageSize;
+ dims = [1, numChannels, height, width];
+ }
+
+ // Create ONNX Tensor
+ const inputTensor = new Tensor('float32', inputData, dims);
+
+ // Run model inference
+ return this.session.run({
+ input: inputTensor,
+ });
+ }
+}
+
+export { TextProcessor, TextEncoder, ImageProcessor, ImageEncoder };
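With both encoders in place, the text and image embeddings can be compared directly. A small sketch, assuming the `embeddings` output is an ONNX tensor of shape `[1, dim]` backed by a `Float32Array` (the test below prints its `.dims`):

```js
// Cosine similarity between two embedding vectors; inputs are plain typed arrays.
function cosineSimilarity(a, b) {
    let dot = 0.0, normA = 0.0, normB = 0.0;
    for (let i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}

// Hypothetical usage with the encoder outputs from the test below:
// const score = cosineSimilarity(textOutput.embeddings.data, imageOutput.embeddings.data);
```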
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
index 1a09cac..fba11f4 100644
--- a/javascript/encoders_test.js
+++ b/javascript/encoders_test.js
@@ -1,7 +1,7 @@
import { existsSync } from 'fs';
import { getCheckpoint, Modality } from "./hub.mjs";
-import { TextProcessor, TextEncoder } from "./encoders.mjs";
+import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs";
function assert(condition, message) {
if (!condition) {
@@ -41,8 +41,10 @@ async function testGetCheckpoint() {
}
}
-async function testTextEncoder() {
- console.log("Test TextEncoder: Start");
+async function testEncoders() {
+ console.log("Test testEncoders: Start");
+ let textEncoder = null;
+ let imageEncoder = null;
try {
const modelId = 'unum-cloud/uform3-image-text-english-small';
@@ -63,18 +65,29 @@ async function testTextEncoder() {
const textProcessor = new TextProcessor(configPath, tokenizerPath);
await textProcessor.init();
const processedTexts = await textProcessor.process("Hello, world!");
- console.log(processedTexts);
- const textEncoder = new TextEncoder(configPath, modalityPaths.text_encoder, tokenizerPath);
+ textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
await textEncoder.init();
- const output = await textEncoder.forward(processedTexts);
- console.log(output);
+ const textOutput = await textEncoder.forward(processedTexts);
+ console.log(textOutput.embeddings.dims);
- console.log("Test getCheckpoint: Success");
+ const imageProcessor = new ImageProcessor(configPath);
+ await imageProcessor.init();
+ const processedImages = await imageProcessor.process("assets/unum.png");
+
+ imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+ await imageEncoder.init();
+ const imageOutput = await imageEncoder.forward(processedImages);
+ console.log(imageOutput.embeddings.dims);
+
+ console.log("Test testEncoders: Success");
} catch (error) {
- console.error("Test getCheckpoint: Failed", error);
+ console.error("Test testEncoders: Failed", error);
+ } finally {
+        if (textEncoder) await textEncoder.dispose();
+        if (imageEncoder) await imageEncoder.dispose();
}
}
testGetCheckpoint();
-testTextEncoder();
+testEncoders();
From 4f1568fb799b3d150e459a633ad8705efd0ca089 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 20 Apr 2024 00:47:58 +0000
Subject: [PATCH 15/40] Fix: Rename image inputs
---
python/scripts/export_encoders.ipynb | 20 +++++++-------------
swift/Encoders.swift | 4 ++--
2 files changed, 9 insertions(+), 15 deletions(-)
diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
index a8d2ac3..029e60a 100644
--- a/python/scripts/export_encoders.ipynb
+++ b/python/scripts/export_encoders.ipynb
@@ -8,7 +8,7 @@
"\n",
"Depending on the backend, we prefer different qunatization schemes.\n",
"\n",
- "- For ONNX we use `int8` quantization.\n",
+ "- For ONNX we use `uint8` quantization.\n",
"- For PyTorch we use `bfloat16` quantization.\n",
"- For CoreML we use `float32` representation."
]
@@ -19,6 +19,7 @@
"metadata": {},
"outputs": [],
"source": [
+ "!pip uninstall -y uform\n",
"!pip install --upgrade \"uform[torch]\" coremltools"
]
},
@@ -42,7 +43,7 @@
"import uform\n",
"from PIL import Image\n",
"\n",
- "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
+ "model, processor = uform.get_model('unum-cloud/' + model_name)\n",
"text = 'a small red panda in a zoo'\n",
"image = Image.open('../../assets/unum.png')\n",
"\n",
@@ -122,7 +123,7 @@
"CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n",
"\n",
"```python\n",
- " image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
+ " image_input = ct.TensorType(name=\"images\", shape=image_data.shape)\n",
" text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
" text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
"```\n",
@@ -155,7 +156,7 @@
"metadata": {},
"outputs": [],
"source": [
- "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
+ "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
"text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
"text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
"text_features = ct.TensorType(name=\"features\")\n",
@@ -403,10 +404,10 @@
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
- " input_names = ['input'], \n",
+ " input_names = ['images'], \n",
" output_names = ['features', 'embeddings'],\n",
" dynamic_axes={\n",
- " 'input' : {0 : 'batch_size'},\n",
+ " 'images' : {0 : 'batch_size'},\n",
" 'features' : {0 : 'batch_size'},\n",
" 'embeddings' : {0 : 'batch_size'}})"
]
@@ -632,13 +633,6 @@
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
"!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
index 44c6e71..3582e91 100644
--- a/swift/Encoders.swift
+++ b/swift/Encoders.swift
@@ -402,14 +402,14 @@ class ImageInput: MLFeatureProvider {
}
var featureNames: Set<String> {
- return Set(["input"])
+ return Set(["images"])
}
// The model expects the input IDs to be an array of integers
// of length `sequenceLength`, padded with `paddingID` if necessary
func featureValue(for featureName: String) -> MLFeatureValue? {
switch featureName {
- case "input":
+ case "images":
return precomputedFeature
default:
return nil
From cccfc620d7d143b642a84b96326d5db49f679ebf Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 20 Apr 2024 01:04:21 +0000
Subject: [PATCH 16/40] Improve: Separate encoders & processors
---
javascript/encoders.mjs | 20 +--
python/scripts/test_encoders.py | 89 +++++++-----
python/uform/__init__.py | 62 +++++---
python/uform/numpy_processors.py | 18 ++-
python/uform/onnx_encoders.py | 123 ++--------------
python/uform/torch_decoders.py | 18 ++-
python/uform/torch_encoders.py | 241 +++++--------------------------
python/uform/torch_processors.py | 22 ++-
8 files changed, 194 insertions(+), 399 deletions(-)
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index ae9dbec..7a287cc 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -181,7 +181,7 @@ class ImageEncoder {
}
}
- async forward(inputs) {
+ async forward(images) {
if (!this.session) {
throw new Error("Session is not initialized.");
}
@@ -206,21 +206,21 @@ class ImageEncoder {
return result;
};
- let inputData;
+ let imagesData;
let dims;
- if (Array.isArray(inputs)) {
- // Assuming each input in the array is a Float32Array representing an image already processed to a fixed size.
- const arrays = inputs.map(ensureFloat32Array);
- inputData = concatFloat32Arrays(arrays);
+ if (Array.isArray(images)) {
+            // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size.
+ const arrays = images.map(ensureFloat32Array);
+ imagesData = concatFloat32Arrays(arrays);
const numImages = arrays.length;
const numChannels = 3;
const height = this.imageSize;
const width = this.imageSize;
dims = [numImages, numChannels, height, width];
} else {
- // Single image input, which is already a Float32Array.
- inputData = ensureFloat32Array(inputs);
+            // A single image, which is already a Float32Array.
+ imagesData = ensureFloat32Array(images);
const numChannels = 3;
const height = this.imageSize;
const width = this.imageSize;
@@ -228,11 +228,11 @@ class ImageEncoder {
}
// Create ONNX Tensor
- const inputTensor = new Tensor('float32', inputData, dims);
+ const imagesTensor = new Tensor('float32', imagesData, dims);
// Run model inference
return this.session.run({
- input: inputTensor,
+ images: imagesTensor,
});
}
}
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
index a58544d..d26e4f2 100644
--- a/python/scripts/test_encoders.py
+++ b/python/scripts/test_encoders.py
@@ -7,7 +7,7 @@
import numpy as np
from PIL import Image
-import uform
+from uform import Modality, get_model, get_model_onnx
# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
try:
@@ -27,12 +27,16 @@
torch_models = [
"unum-cloud/uform3-image-text-english-small",
- "unum-cloud/uform-vl-english",
- "unum-cloud/uform-vl-multilingual-v2",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
]
onnx_models = [
"unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
]
# Let's check if the HuggingFace Hub API token is set in the environment variable.
@@ -113,34 +117,29 @@ def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embed
@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
@pytest.mark.parametrize("model_name", torch_models)
def test_torch_one_embedding(model_name: str):
- model, processor = uform.get_model(model_name, token=token)
+ processors, models = get_model(model_name, token=token)
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
image = Image.open(image_path)
- image_data = processor.preprocess_image(image)
- text_data = processor.preprocess_text(text)
+ image_data = processor_image(image)
+ text_data = processor_text(text)
- image_features, image_embedding = model.encode_image(image_data, return_features=True)
- text_features, text_embedding = model.encode_text(text_data, return_features=True)
+ image_features, image_embedding = model_image.forward(image_data, return_features=True)
+ text_features, text_embedding = model_text.forward(text_data, return_features=True)
assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
- # Test reranking
- score, joint_embedding = model.encode_multimodal(
- image_features=image_features,
- text_features=text_features,
- attention_mask=text_data["attention_mask"],
- return_scores=True,
- )
- assert score.shape[0] == 1, "Matching score batch size is not 1"
- assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1"
-
# Test if the model outputs actually make sense
cross_references_image_and_text_embeddings(
- lambda text: model.encode_text(processor.preprocess_text(text)),
- lambda image: model.encode_image(processor.preprocess_image(image)),
+ lambda text: model_text(processor_text(text)),
+ lambda image: model_image(processor_image(image)),
)
@@ -148,16 +147,22 @@ def test_torch_one_embedding(model_name: str):
@pytest.mark.parametrize("model_name", torch_models)
@pytest.mark.parametrize("batch_size", [1, 2])
def test_torch_many_embeddings(model_name: str, batch_size: int):
- model, processor = uform.get_model(model_name, token=token)
+
+ processors, models = get_model(model_name, token=token)
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
images = [Image.open(path) for path in image_paths]
- image_data = processor.preprocess_image(images)
- text_data = processor.preprocess_text(texts)
+ image_data = processor_image(images)
+ text_data = processor_text(texts)
- image_embeddings = model.encode_image(image_data, return_features=False)
- text_embeddings = model.encode_text(text_data, return_features=False)
+ image_embeddings = model_image.forward(image_data, return_features=False)
+ text_embeddings = model_text.forward(text_data, return_features=False)
assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
@@ -172,24 +177,29 @@ def test_onnx_one_embedding(model_name: str, device: str):
try:
- model, processor = uform.get_model_onnx(model_name, token=token, device=device)
+ processors, models = get_model_onnx(model_name, token=token, device=device)
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
text = "a small red panda in a zoo"
image_path = "assets/unum.png"
image = Image.open(image_path)
- image_data = processor.preprocess_image(image)
- text_data = processor.preprocess_text(text)
+ image_data = processor_image(image)
+ text_data = processor_text(text)
- image_features, image_embedding = model.encode_image(image_data, return_features=True)
- text_features, text_embedding = model.encode_text(text_data, return_features=True)
+ image_features, image_embedding = model_image(image_data)
+ text_features, text_embedding = model_text(text_data)
assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
# Test if the model outputs actually make sense
cross_references_image_and_text_embeddings(
- lambda text: model.encode_text(processor.preprocess_text(text)),
- lambda image: model.encode_image(processor.preprocess_image(image)),
+ lambda text: model_text(processor_text(text)),
+ lambda image: model_image(processor_image(image)),
)
except ExecutionProviderError as e:
@@ -206,16 +216,21 @@ def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
try:
- model, processor = uform.get_model_onnx(model_name, token=token, device=device)
+ processors, models = get_model_onnx(model_name, token=token, device=device)
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
texts = ["a small red panda in a zoo"] * batch_size
image_paths = ["assets/unum.png"] * batch_size
images = [Image.open(path) for path in image_paths]
- image_data = processor.preprocess_image(images)
- text_data = processor.preprocess_text(texts)
+ image_data = processor_image(images)
+ text_data = processor_text(texts)
- image_embeddings = model.encode_image(image_data, return_features=False)
- text_embeddings = model.encode_text(text_data, return_features=False)
+ image_embeddings = model_image(image_data, return_features=False)
+ text_embeddings = model_text(text_data, return_features=False)
assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
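The updated tests keep delegating the sanity check to `cross_references_image_and_text_embeddings`, whose body lies outside this patch. A minimal sketch of the kind of check such a helper can perform — illustrative only, not the repository's exact implementation — assuming the i-th text describes the i-th image:

    import numpy as np

    def embeddings_cross_reference(text_embeddings: np.ndarray, image_embeddings: np.ndarray) -> None:
        # L2-normalize both modalities so dot products become cosine similarities.
        text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True)
        image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
        similarities = text_embeddings @ image_embeddings.T
        # Matching text/image pairs sit on the diagonal and should dominate the mismatched pairs.
        assert np.all(np.diag(similarities) >= similarities.mean(axis=1))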
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 74d5ee9..841440f 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,6 +1,6 @@
from json import load
from os.path import join, exists
-from typing import Dict, Optional, Tuple, Literal
+from typing import Dict, Optional, Tuple, Literal, Union, Callable
from enum import Enum
from huggingface_hub import snapshot_download
@@ -88,20 +88,30 @@ def get_model(
model_name: str,
*,
token: Optional[str] = None,
- modalities: Optional[Tuple[str]] = None,
-):
- from uform.torch_encoders import TextImageEncoder
- from uform.torch_processors import TorchProcessor
+ modalities: Optional[Tuple[Union[str, Modality]]] = None,
+) -> Tuple[Dict[Modality, Callable], Dict]:
+ from uform.torch_encoders import TextEncoder, ImageEncoder
+ from uform.torch_processors import TextProcessor, ImageProcessor
- config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".pt")
- modality_paths = (
- {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths
- )
+ modalities = normalize_modalities(modalities)
+ config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt")
+
+ result_processors = {}
+ result_models = {}
- model = TextImageEncoder(config_path, modality_paths)
- processor = TorchProcessor(config_path, tokenizer_path)
+ if Modality.TEXT_ENCODER in modalities:
+ processor = TextProcessor(config_path, tokenizer_path)
+ encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER)).eval()
+ result_processors[Modality.TEXT_ENCODER] = processor
+ result_models[Modality.TEXT_ENCODER] = encoder
- return model.eval(), processor
+ if Modality.IMAGE_ENCODER in modalities:
+ processor = ImageProcessor(config_path)
+ encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER)).eval()
+ result_processors[Modality.IMAGE_ENCODER] = processor
+ result_models[Modality.IMAGE_ENCODER] = encoder
+
+ return result_processors, result_models
def get_model_onnx(
@@ -111,15 +121,25 @@ def get_model_onnx(
token: Optional[str] = None,
modalities: Optional[Tuple[str]] = None,
):
- from uform.onnx_encoders import TextImageEncoder
- from uform.numpy_processors import NumPyProcessor
+ from uform.onnx_encoders import TextEncoder, ImageEncoder
+ from uform.numpy_processors import TextProcessor, ImageProcessor
- config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".onnx")
- modality_paths = (
- {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths
- )
+ modalities = normalize_modalities(modalities)
+ config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx")
+
+ result_processors = {}
+ result_models = {}
+
+ if Modality.TEXT_ENCODER in modalities:
+ processor = TextProcessor(config_path, tokenizer_path)
+ encoder = TextEncoder(modality_paths.get(Modality.TEXT_ENCODER), device=device)
+ result_processors[Modality.TEXT_ENCODER] = processor
+ result_models[Modality.TEXT_ENCODER] = encoder
- model = TextImageEncoder(config_path, modality_paths, device=device)
- processor = NumPyProcessor(config_path, tokenizer_path)
+ if Modality.IMAGE_ENCODER in modalities:
+ processor = ImageProcessor(config_path)
+ encoder = ImageEncoder(modality_paths.get(Modality.IMAGE_ENCODER), device=device)
+ result_processors[Modality.IMAGE_ENCODER] = processor
+ result_models[Modality.IMAGE_ENCODER] = encoder
- return model, processor
+ return result_processors, result_models
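With this change both loaders return a pair of dictionaries keyed by `Modality`, which is exactly the shape the tests above exercise. A minimal usage sketch of the Torch path — the model name and image path are illustrative:

    from PIL import Image
    from uform import Modality, get_model

    processors, models = get_model("unum-cloud/uform3-image-text-english-small")
    text_data = processors[Modality.TEXT_ENCODER]("a small red panda in a zoo")
    image_data = processors[Modality.IMAGE_ENCODER](Image.open("assets/unum.png"))
    _, text_embedding = models[Modality.TEXT_ENCODER](text_data, return_features=True)
    _, image_embedding = models[Modality.IMAGE_ENCODER](image_data, return_features=True)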
diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py
index afda329..a5faca2 100644
--- a/python/uform/numpy_processors.py
+++ b/python/uform/numpy_processors.py
@@ -15,10 +15,13 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
config = json.load(open(config_path, "r"))
- self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
+ if "text_encoder" in config:
+ config = config["text_encoder"]
+
+ self._max_seq_len = config["max_position_embeddings"]
self._tokenizer = Tokenizer.from_file(tokenizer_path)
self._tokenizer.no_padding()
- self._pad_token_idx = config["text_encoder"]["padding_idx"]
+ self._pad_token_idx = config["padding_idx"]
def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]:
"""Transforms one or more strings into dictionary with tokenized strings and attention masks.
@@ -50,7 +53,7 @@ def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]:
class ImageProcessor:
- def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
+ def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None):
"""
:param config: model config
:param tokenizer_path: path to tokenizer file
@@ -58,9 +61,12 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
config = json.load(open(config_path, "r"))
- self._image_size = config["image_encoder"]["image_size"]
- self._normalization_means = config["image_encoder"]["normalization_means"]
- self._normalization_deviations = config["image_encoder"]["normalization_deviations"]
+ if "image_encoder" in config:
+ config = config["image_encoder"]
+
+ self._image_size = config["image_size"]
+ self._normalization_means = config["normalization_means"]
+ self._normalization_deviations = config["normalization_deviations"]
assert isinstance(self._image_size, int) and self._image_size > 0
assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
index 95a0f73..9f63fa4 100644
--- a/python/uform/onnx_encoders.py
+++ b/python/uform/onnx_encoders.py
@@ -59,7 +59,12 @@ def available_providers(device: Optional[str]) -> Tuple[str, ...]:
class ImageEncoder:
- def __init__(self, model_path: str, device: str):
+ def __init__(
+ self,
+ model_path: str,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu",
+ ):
"""
:param model_path: Path to onnx model
:param device: Device name, either cpu or gpu
@@ -75,14 +80,18 @@ def __init__(self, model_path: str, device: str):
)
def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]:
- return self.session.run(None, {"input": images})
+ return self.session.run(None, {"images": images})
class TextEncoder:
- def __init__(self, text_encoder_path: str, device: str):
+ def __init__(
+ self,
+ model_path: str,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu",
+ ):
"""
:param text_encoder_path: Path to onnx of text encoder
- :param reranker_path: Path to onnx of reranker
:param device: Device name, either cpu or gpu
"""
@@ -90,114 +99,10 @@ def __init__(self, text_encoder_path: str, device: str):
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
self.text_encoder_session = ort.InferenceSession(
- text_encoder_path,
+ model_path,
sess_options=session_options,
providers=available_providers(device),
)
def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]:
return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
-
-
-class TextImageEncoder:
- def __init__(
- self,
- config_path: PathLike,
- modality_paths: Union[Dict[str, PathLike], PathLike] = None,
- *,
- device: Literal["cpu", "cuda"] = "cpu",
- ):
- """Initializes the model with the configuration and pre-trained weights.
-
- :param config_path: Path to the JSON model configuration file
- :param modality_paths: Dictionary with paths to different modalities,
- or a single path to the model checkpoint
- """
- self.device = device
-
- config = json.load(open(config_path, "r"))
- self._embedding_dim = config["text_encoder"]["embedding_dim"]
- self._text_encoder_dim = config["text_encoder"]["dim"]
- self._image_encoder_dim = config["image_encoder"]["dim"]
-
- text_encoder_path = modality_paths.get("text_encoder", None)
- image_encoder_path = modality_paths.get("image_encoder", None)
- self.text_encoder = TextEncoder(text_encoder_path, device) if text_encoder_path else None
- self.image_encoder = ImageEncoder(image_encoder_path, device) if image_encoder_path else None
-
- def encode_image(
- self,
- images: ndarray,
- return_features: bool = False,
- ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
- """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings.
-
- :param images: Preprocessed image
- :param return_features: Whether to return images features or return only embeddings
- """
-
- features, embeddings = self.image_encoder(images)
-
- if return_features:
- return features, embeddings
-
- return embeddings
-
- def encode_text(
- self,
- texts: Dict[str, ndarray],
- return_features: bool = False,
- ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
- """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings.
-
- :param texts: Dictionary with tokenized texts and attention masks
- :param return_features: Whether to return texts features or return only embeddings
- """
-
- features, embeddings = self.text_encoder(**texts)
-
- if return_features:
- return features, embeddings
-
- return embeddings
-
- def forward(
- self,
- images: ndarray,
- texts: Dict[str, ndarray],
- ) -> Union[ndarray, ndarray]:
- """Inference forward method
-
- :param images: Preprocessed images
- :param texts: Preprocessed texts
- :return: embeddings for images and texts
- """
- _, image_embeddings = self.image_encoder(images)
- _, text_embeddings = self.text_encoder(texts)
- return image_embeddings, text_embeddings
-
- @property
- def text_features_dim(self) -> int:
- """Dimensionality of the text encoder features."""
-
- return self._text_encoder_dim
-
- @property
- def image_features_dim(self) -> int:
- """Dimensionality of the image encoder features."""
-
- return self._image_encoder_dim
-
- @property
- def embedding_dim(self) -> int:
- """Dimensionality of shared space embedding."""
-
- return self._embedding_dim
-
- @property
- def multimodal_embedding_dim(self) -> int:
- """Dimensionality of multimodal joint embedding."""
- return self._text_encoder_dim
-
-
-VLM_ONNX = TextImageEncoder # legacy
diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py
index db60d63..475f5b0 100644
--- a/python/uform/torch_decoders.py
+++ b/python/uform/torch_decoders.py
@@ -153,7 +153,7 @@ def __init__(self, config: VLMConfig):
self.config.image_encoder_pooling,
)
- # replace models' layerscales because `transformers` automatically renames keys in state_dict
+ # replace models' layerscales because `transformers` automatically renames keys in `state_dict`
for i in range(len(self.image_encoder.blocks)):
self.image_encoder.blocks[i].ls1 = LayerScale(
self.image_encoder.blocks[i].ls1.dim,
@@ -218,6 +218,7 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[dict, Tuple, CausalLMOutputWithPast]:
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -403,7 +404,10 @@ def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
)
encoding = BatchEncoding(
- data={"input_ids": input_ids, "attention_mask": attention_mask},
+ data={
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ },
)
if images is not None:
@@ -449,7 +453,15 @@ def from_pretrained(
revision: str = "main",
**kwargs,
):
- config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
+ config = AutoConfig.from_pretrained(
+ pretrained_model_name_or_path,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ local_files_only=local_files_only,
+ revision=revision,
+ token=token,
+ **kwargs,
+ )
return cls(config)
diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py
index f122606..8ac7c36 100644
--- a/python/uform/torch_encoders.py
+++ b/python/uform/torch_encoders.py
@@ -1,12 +1,15 @@
+from __future__ import annotations
+
from dataclasses import dataclass
from os import PathLike
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, Optional, Tuple, Union, Callable
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
+from PIL.Image import Image
@dataclass(eq=False)
@@ -220,30 +223,9 @@ def forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
return x
- def forward_multimodal(
- self,
- x: Tensor,
- attn_mask: Tensor,
- context: Tensor,
- ) -> Tensor:
- context = self.context_projection(context)
- expanded_attn_mask = self.get_attention_mask(attn_mask, x.dtype)
- for block in self.blocks:
- if block.cross_attention:
- x = block(x, expanded_attn_mask, context)
-
- return self.pool_features(x, attn_mask)
-
def forward_embedding(self, x: Tensor, attn_mask: Tensor) -> Tensor:
return self.embedding_projection(self.pool_features(x, attn_mask))
- def forward_matching(self, x: Tensor) -> Tensor:
- logits = self.matching_head(x)
- if self.head_one_neuron:
- return torch.sigmoid(logits)[:, 0]
-
- return F.softmax(logits, dim=1)[:, 1]
-
def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
if self.pooling == "cls":
return x[:, 0]
@@ -291,6 +273,22 @@ def forward(
return features, embeddings
return embeddings
+ @staticmethod
+ def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> TextEncoder:
+ if isinstance(config, (PathLike, str)):
+ config = json.load(open(config, "r"))
+ if "text_encoder" in config:
+ config = config["text_encoder"]
+
+ # We must strip all the non-member attributes before initializing the classes.
+ text_fields = TextEncoder.__dataclass_fields__
+ config = {k: v for k, v in config.items() if k in text_fields}
+
+ state = torch.load(model_path)
+ encoder = TextEncoder(**config)
+ encoder.load_state_dict(state)
+ return encoder
+
@dataclass(eq=False)
class ImageEncoder(nn.Module):
@@ -322,19 +320,16 @@ def __post_init__(self):
self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False)
self.return_features = False
- def forward_features(self, x: Tensor) -> Tensor:
+ def forward_features(self, x: Union[Tensor, dict]) -> Tensor:
x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1)
x = x + self.pos_embed
-
special_tokens = [self.cls_token.expand(x.shape[0], -1, -1)]
if self.num_reg_tokens > 0:
special_tokens.append(self.reg_token.expand(x.shape[0], -1, -1))
x = torch.cat(special_tokens + [x], dim=1)
-
x = self.blocks(x)
-
return self.norm(x)
def forward_embedding(self, x: Tensor) -> Tensor:
@@ -346,6 +341,8 @@ def forward_embedding(self, x: Tensor) -> Tensor:
return self.embedding_projection(x)
def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
+ if isinstance(x, dict):
+ x = x["images"]
features = self.forward_features(x)
embeddings = self.forward_embedding(features)
return_features = return_features if return_features is not None else self.return_features
@@ -353,186 +350,18 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return features, embeddings
return embeddings
+ @staticmethod
+ def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> ImageEncoder:
+ if isinstance(config, (PathLike, str)):
+ config = json.load(open(config, "r"))
+ if "image_encoder" in config:
+ config = config["image_encoder"]
-class TextImageEncoder(nn.Module):
- """
- Vision-Language Model for Multimodal embeddings.
- """
-
- def __init__(
- self,
- config_path: PathLike,
- modality_paths: Union[Dict[str, PathLike], PathLike] = None,
- ):
- """Initializes the model with the configuration and pre-trained weights.
-
- :param config_path: Path to the JSON model configuration file
- :param modality_paths: Dictionary with paths to different modalities,
- or a single path to the model checkpoint
- """
-
- super().__init__()
-
- config = json.load(open(config_path, "r"))
- self._embedding_dim = config["text_encoder"]["embedding_dim"]
-
- # Both `text_encoder` and `image_encoder` are data-classes, so we must strip
- # all the non-member attributes before initializing the classes.
- text_fields = TextEncoder.__dataclass_fields__
+ # We must strip all the non-member attributes before initializing the classes.
image_fields = ImageEncoder.__dataclass_fields__
- text_encoder_attrs = {k: v for k, v in config["text_encoder"].items() if k in text_fields}
- image_encoder_attrs = {k: v for k, v in config["image_encoder"].items() if k in image_fields}
- self.text_encoder = TextEncoder(**text_encoder_attrs)
- self.image_encoder = ImageEncoder(**image_encoder_attrs)
-
- # Load pre-trained weights
- if modality_paths is not None:
- if isinstance(modality_paths, Union[PathLike, str]):
- state = torch.load(modality_paths)
- self.text_encoder.load_state_dict(state["text_encoder"])
- self.image_encoder.load_state_dict(state["image_encoder"])
- else:
- text_encoder_path = modality_paths.get("text_encoder", None)
- image_encoder_path = modality_paths.get("image_encoder", None)
- if text_encoder_path:
- self.text_encoder.load_state_dict(torch.load(text_encoder_path))
- if image_encoder_path:
- self.image_encoder.load_state_dict(torch.load(image_encoder_path))
-
- def encode_image(
- self,
- images: Tensor,
- return_features: bool = False,
- ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
- """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings.
-
- :param images: Preprocessed image
- :param return_features: Whether to return images features or return only embeddings
- """
-
- features = self.image_encoder.forward_features(images)
- embeddings = self.image_encoder.forward_embedding(features)
-
- if return_features:
- return features, embeddings
-
- return embeddings
-
- def encode_text(
- self,
- texts: Dict[str, Tensor],
- return_features: bool = False,
- ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
- """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings.
-
- :param texts: Dictionary with tokenized texts and attention masks
- :param return_features: Whether to return texts features or return only embeddings
- """
-
- features = self.text_encoder.forward_features(
- texts["input_ids"],
- texts["attention_mask"],
- )
- embeddings = self.text_encoder.forward_embedding(
- features,
- texts["attention_mask"],
- )
-
- if return_features:
- return features, embeddings
-
- return embeddings
-
- def encode_multimodal(
- self,
- image: Optional[Tensor] = None,
- text: Optional[Dict] = None,
- image_features: Optional[Tensor] = None,
- text_features: Optional[Tensor] = None,
- attention_mask: Optional[Tensor] = None,
- return_scores: bool = False,
- ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
- """Passes preprocessed texts (or precomputed texts features) and
- preprocessed images (or precomputed images features) through multimodal encoded to produce multimodal joint embeddings.
-
- :param image: Preprocessed images
- :param text: Preprocessed texts
- :param image_features: Precomputed images features
- :param text_features: Precomputed text features
- :param attention_mask: Attention masks, not required if pass `text` instead of text_features
- """
-
- assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None"
- assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None"
-
- if text_features is not None:
- assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`"
-
- if image_features is None:
- image_features = self.image_encoder.forward_features(image)
-
- if text_features is None:
- text_features = self.text_encoder.forward_features(
- text["input_ids"],
- text["attention_mask"],
- )
-
- embeddings = self.text_encoder.forward_multimodal(
- text_features,
- attention_mask if attention_mask is not None else text["attention_mask"],
- image_features,
- )
-
- if return_scores:
- return self.get_matching_scores(embeddings), embeddings
-
- return embeddings
-
- def get_matching_scores(self, embeddings: Tensor) -> Tensor:
- """Computes the probability that there is a match between images and texts based on their multimodal embeddings
-
- :param embeddings: multimodal joint embeddings
- """
-
- return self.text_encoder.forward_matching(embeddings)
-
- def forward(
- self,
- images: Tensor,
- texts: Dict[str, Tensor],
- ) -> Union[Tensor, Tensor]:
- """Inference forward method
-
- :param images: Preprocessed images
- :param texts: Preprocessed texts
- :return: embeddings for images and texts
- """
- _, image_embeddings = self.image_encoder(images)
- _, text_embeddings = self.text_encoder(texts)
- return image_embeddings, text_embeddings
-
- @property
- def text_features_dim(self) -> int:
- """Dimensionality of the text encoder features."""
-
- return self.text_encoder.dim
-
- @property
- def image_features_dim(self) -> int:
- """Dimensionality of the image encoder features."""
-
- return self.image_encoder.dim
-
- @property
- def embedding_dim(self) -> int:
- """Dimensionality of shared space embedding."""
-
- return self._embedding_dim
-
- @property
- def multimodal_embedding_dim(self) -> int:
- """Dimensionality of multimodal joint embedding."""
- return self.text_encoder.dim
-
+ config = {k: v for k, v in config.items() if k in image_fields}
-VLM = TextImageEncoder # legacy
+ state = torch.load(model_path)
+ encoder = ImageEncoder(**config)
+ encoder.load_state_dict(state)
+ return encoder
diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py
index 340b117..32697ca 100644
--- a/python/uform/torch_processors.py
+++ b/python/uform/torch_processors.py
@@ -29,15 +29,19 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
config = json.load(open(config_path, "r"))
- self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
+ if "text_encoder" in config:
+ config = config["text_encoder"]
+
+ self._max_seq_len = config["max_position_embeddings"]
self._tokenizer = Tokenizer.from_file(tokenizer_path)
self._tokenizer.no_padding()
- self._pad_token_idx = config["text_encoder"]["padding_idx"]
+ self._pad_token_idx = config["padding_idx"]
def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
"""Transforms one or more strings into dictionary with tokenized strings and attention masks.
:param texts: text of list of texts to tokenizer
+ :return: dictionary with tokenized strings and attention masks as values
"""
if isinstance(texts, str):
texts = [texts]
@@ -72,9 +76,12 @@ def __init__(self, config_path: PathLike):
"""
config = json.load(open(config_path, "r"))
- self._image_size = config["image_encoder"]["image_size"]
- self._normalization_means = config["image_encoder"]["normalization_means"]
- self._normalization_deviations = config["image_encoder"]["normalization_deviations"]
+ if "image_encoder" in config:
+ config = config["image_encoder"]
+
+ self._image_size = config["image_size"]
+ self._normalization_means = config["normalization_means"]
+ self._normalization_deviations = config["normalization_deviations"]
assert isinstance(self._image_size, int) and self._image_size > 0
assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
@@ -93,10 +100,11 @@ def __init__(self, config_path: PathLike):
],
)
- def __call__(self, images: Union[Image, List[Image]]) -> Tensor:
+ def __call__(self, images: Union[Image, List[Image]]) -> Dict[str, Tensor]:
"""Transforms one or more Pillow images into Torch Tensors.
:param images: image or list of images to preprocess
+        :return: dictionary with preprocessed images as float tensors
"""
if isinstance(images, list):
@@ -111,4 +119,4 @@ def __call__(self, images: Union[Image, List[Image]]) -> Tensor:
else:
batch_images = self._image_transform(images).unsqueeze(0)
- return batch_images
+ return {"images": batch_images}
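Since the image processor now wraps its output in a dictionary and `ImageEncoder.forward` unwraps it, both calling conventions below are expected to be equivalent. A sketch assuming a locally downloaded checkpoint; the paths are illustrative:

    from PIL import Image
    from uform.torch_encoders import ImageEncoder
    from uform.torch_processors import ImageProcessor

    config_path = "models/uform3-image-text-english-small/config.json"  # illustrative path
    processor = ImageProcessor(config_path)
    encoder = ImageEncoder.from_pretrained(
        config_path,
        "models/uform3-image-text-english-small/image_encoder.pt",  # illustrative path
    ).eval()

    batch = processor(Image.open("assets/unum.png"))  # {"images": Tensor of shape [1, 3, H, W]}
    embedding_from_dict = encoder(batch)              # the dict is unwrapped inside `forward`
    embedding_from_tensor = encoder(batch["images"])  # passing the raw tensor still works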
From b7905197387b8f6f2b91667a47146068ea21d4e6 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Fri, 19 Apr 2024 21:26:39 -0700
Subject: [PATCH 17/40] Improve: Pass tests for small models
---
.vscode/settings.json | 7 +-
python/scripts/export_encoders.ipynb | 130 ++++++++++++++++-----------
python/scripts/test_encoders.py | 16 ++--
python/uform/numpy_processors.py | 2 +-
python/uform/onnx_encoders.py | 37 +++++++-
python/uform/torch_encoders.py | 39 ++++++--
6 files changed, 156 insertions(+), 75 deletions(-)
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 3a060e1..3275f93 100755
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -21,7 +21,9 @@
"ndarray",
"numpy",
"ONNX",
+ "onnxconverter",
"onnxruntime",
+ "opset",
"packbits",
"preprocess",
"pretrained",
@@ -48,5 +50,8 @@
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
- "python.formatting.provider": "none"
+ "python.formatting.provider": "none",
+ "window.autoDetectColorScheme": true,
+ "workbench.colorTheme": "Default Dark+",
+ "workbench.preferredDarkColorTheme": "Default Dark+"
}
\ No newline at end of file
diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
index 029e60a..a8b868d 100644
--- a/python/scripts/export_encoders.ipynb
+++ b/python/scripts/export_encoders.ipynb
@@ -19,7 +19,6 @@
"metadata": {},
"outputs": [],
"source": [
- "!pip uninstall -y uform\n",
"!pip install --upgrade \"uform[torch]\" coremltools"
]
},
@@ -30,8 +29,13 @@
"outputs": [],
"source": [
"import os\n",
- "model_name = \"uform-vl-english-small\"\n",
- "output_directory = \"../../\""
+ "\n",
+ "working_directory = \"../..\"\n",
+ "model_name = \"uform3-image-text-english-small\"\n",
+ "model_directory = os.path.join(working_directory, \"models\", model_name)\n",
+ "model_weights_path = os.path.join(model_directory, \"torch_weight.pt\")\n",
+ "config_path = os.path.join(model_directory, \"config.json\")\n",
+ "tokenizer_path = os.path.join(model_directory, \"tokenizer.json\")"
]
},
{
@@ -40,20 +44,20 @@
"metadata": {},
"outputs": [],
"source": [
- "import uform\n",
- "from PIL import Image\n",
- "\n",
- "model, processor = uform.get_model('unum-cloud/' + model_name)\n",
- "text = 'a small red panda in a zoo'\n",
- "image = Image.open('../../assets/unum.png')\n",
- "\n",
- "image_data = processor.preprocess_image(image)\n",
- "text_data = processor.preprocess_text(text)\n",
- "\n",
- "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
- "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "import torch\n",
"\n",
- "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ "state_dict = torch.load(model_weights_path)\n",
+ "list(state_dict.keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from uform.torch_encoders import ImageEncoder, TextEncoder\n",
+ "from uform.torch_processors import ImageProcessor, TextProcessor"
]
},
{
@@ -62,7 +66,9 @@
"metadata": {},
"outputs": [],
"source": [
- "model.text_encoder"
+ "image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)\n",
+ "text_encoder = TextEncoder.from_pretrained(config_path, state_dict)\n",
+ "image_encoder, text_encoder"
]
},
{
@@ -71,7 +77,9 @@
"metadata": {},
"outputs": [],
"source": [
- "model.image_encoder"
+ "text_processor = TextProcessor(config_path, tokenizer_path)\n",
+ "image_processor = ImageProcessor(config_path)\n",
+ "text_processor, image_processor"
]
},
{
@@ -80,14 +88,19 @@
"metadata": {},
"outputs": [],
"source": [
- "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
- "for name, module in model.image_encoder.named_children():\n",
- " print(f\"First layer of image_encoder: {name}\")\n",
- " break # We break after the first layer\n",
+ "import uform\n",
+ "from PIL import Image\n",
"\n",
- "for name, module in model.text_encoder.named_children():\n",
- " print(f\"First layer of text_encoder: {name}\")\n",
- " break # We break after the first layer"
+ "text = 'a small red panda in a zoo'\n",
+ "image = Image.open('../../assets/unum.png')\n",
+ "\n",
+ "text_data = text_processor(text)\n",
+ "image_data = image_processor(image)\n",
+ "\n",
+ "image_features, image_embedding = image_encoder.forward(image_data, return_features=True)\n",
+ "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
]
},
{
@@ -147,7 +160,7 @@
" input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n",
" return input_shape\n",
"\n",
- "generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
+ "generalize_first_dimensions(image_data[\"images\"].shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
]
},
{
@@ -156,7 +169,7 @@
"metadata": {},
"outputs": [],
"source": [
- "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
+ "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data[\"images\"].shape, 1))\n",
"text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
"text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
"text_features = ct.TensorType(name=\"features\")\n",
@@ -171,11 +184,11 @@
"metadata": {},
"outputs": [],
"source": [
- "module = model.image_encoder\n",
+ "module = image_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=image_data[\"images\"])\n",
"traced_script_module"
]
},
@@ -193,7 +206,7 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))"
+ "coreml_model.save(os.path.join(model_directory, \"image_encoder.mlpackage\"))"
]
},
{
@@ -202,7 +215,7 @@
"metadata": {},
"outputs": [],
"source": [
- "module = model.text_encoder\n",
+ "module = text_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"\n",
@@ -224,7 +237,7 @@
"coreml_model.author = 'Unum Cloud'\n",
"coreml_model.license = 'Apache 2.0'\n",
"coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))"
+ "coreml_model.save(os.path.join(model_directory, \"text_encoder.mlpackage\"))"
]
},
{
@@ -257,8 +270,8 @@
"metadata": {},
"outputs": [],
"source": [
- "model.image_encoder.eval()\n",
- "model.image_encoder.to(dtype=torch.bfloat16)"
+ "image_encoder.eval()\n",
+ "image_encoder.to(dtype=torch.bfloat16)"
]
},
{
@@ -267,7 +280,7 @@
"metadata": {},
"outputs": [],
"source": [
- "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))"
+ "torch.save(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.pt\"))"
]
},
{
@@ -276,7 +289,7 @@
"metadata": {},
"outputs": [],
"source": [
- "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))"
+ "save_file(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.safetensors\"))"
]
},
{
@@ -285,8 +298,8 @@
"metadata": {},
"outputs": [],
"source": [
- "model.text_encoder.eval()\n",
- "model.text_encoder.to(dtype=torch.bfloat16)"
+ "text_encoder.eval()\n",
+ "text_encoder.to(dtype=torch.bfloat16)"
]
},
{
@@ -295,7 +308,7 @@
"metadata": {},
"outputs": [],
"source": [
- "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))"
+ "torch.save(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.pt\"))"
]
},
{
@@ -304,7 +317,7 @@
"metadata": {},
"outputs": [],
"source": [
- "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))"
+ "save_file(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.safetensors\"))"
]
},
{
@@ -313,8 +326,8 @@
"metadata": {},
"outputs": [],
"source": [
- "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
- "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "image_features, image_embedding = image_encoder.forward(image_data[\"images\"].to(dtype=torch.bfloat16), return_features=True)\n",
+ "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
"\n",
"image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
]
@@ -358,7 +371,7 @@
"metadata": {},
"outputs": [],
"source": [
- "module = model.text_encoder\n",
+ "module = text_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"module.to(dtype=torch.float32)\n",
@@ -366,7 +379,7 @@
"onnx_export(\n",
" module,\n",
" (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
- " os.path.join(output_directory, \"text_encoder.onnx\"), \n",
+ " os.path.join(model_directory, \"text_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -392,15 +405,15 @@
"metadata": {},
"outputs": [],
"source": [
- "module = model.image_encoder\n",
+ "module = image_encoder\n",
"module.eval()\n",
"module.return_features = True\n",
"module.to(dtype=torch.float32)\n",
"\n",
"torch.onnx.export(\n",
" module,\n",
- " image_data, \n",
- " os.path.join(output_directory, \"image_encoder.onnx\"), \n",
+ " image_data[\"images\"], \n",
+ " os.path.join(model_directory, \"image_encoder.onnx\"), \n",
" export_params=True,\n",
" opset_version=15,\n",
" do_constant_folding=True,\n",
@@ -437,7 +450,7 @@
"metadata": {},
"outputs": [],
"source": [
- "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, module_path)"
@@ -449,7 +462,7 @@
"metadata": {},
"outputs": [],
"source": [
- "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"module_fp16 = float16.convert_float_to_float16(module)\n",
"onnx.save(module_fp16, module_path)"
@@ -480,7 +493,7 @@
"metadata": {},
"outputs": [],
"source": [
- "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
@@ -490,7 +503,7 @@
"metadata": {},
"outputs": [],
"source": [
- "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
]
},
@@ -512,7 +525,7 @@
"from onnx import helper\n",
"\n",
"# Load the ONNX model\n",
- "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"module = onnx.load(module_path)\n",
"\n",
"# Get the module's graph\n",
@@ -599,7 +612,7 @@
"metadata": {},
"outputs": [],
"source": [
- "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
"session = ort.InferenceSession(module_path, sess_options=session_options)"
]
},
@@ -609,7 +622,7 @@
"metadata": {},
"outputs": [],
"source": [
- "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
"session = ort.InferenceSession(module_path, sess_options=session_options)"
]
},
@@ -620,6 +633,15 @@
"# Upload to Hugging Face"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/ . --exclude=\"torch_weight.pt\""
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
index d26e4f2..bd26690 100644
--- a/python/scripts/test_encoders.py
+++ b/python/scripts/test_encoders.py
@@ -27,16 +27,16 @@
torch_models = [
"unum-cloud/uform3-image-text-english-small",
- "unum-cloud/uform3-image-text-english-base",
- "unum-cloud/uform3-image-text-english-large",
- "unum-cloud/uform3-image-text-multilingual-base",
+ # "unum-cloud/uform3-image-text-english-base",
+ # "unum-cloud/uform3-image-text-english-large",
+ # "unum-cloud/uform3-image-text-multilingual-base",
]
onnx_models = [
"unum-cloud/uform3-image-text-english-small",
- "unum-cloud/uform3-image-text-english-base",
- "unum-cloud/uform3-image-text-english-large",
- "unum-cloud/uform3-image-text-multilingual-base",
+ # "unum-cloud/uform3-image-text-english-base",
+ # "unum-cloud/uform3-image-text-english-large",
+ # "unum-cloud/uform3-image-text-multilingual-base",
]
# Let's check if the HuggingFace Hub API token is set in the environment variable.
@@ -198,8 +198,8 @@ def test_onnx_one_embedding(model_name: str, device: str):
# Test if the model outputs actually make sense
cross_references_image_and_text_embeddings(
- lambda text: model_text(processor_text(text)),
- lambda image: model_image(processor_image(image)),
+ lambda text: model_text(processor_text(text))[1],
+ lambda image: model_image(processor_image(image))[1],
)
except ExecutionProviderError as e:
diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py
index a5faca2..027bc0d 100644
--- a/python/uform/numpy_processors.py
+++ b/python/uform/numpy_processors.py
@@ -34,7 +34,7 @@ def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]:
input_ids = np.full(
(len(texts), self._max_seq_len),
fill_value=self._pad_token_idx,
- dtype=np.int64,
+ dtype=np.int32,
)
attention_mask = np.zeros(
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
index 9f63fa4..a6f27d3 100644
--- a/python/uform/onnx_encoders.py
+++ b/python/uform/onnx_encoders.py
@@ -64,6 +64,7 @@ def __init__(
model_path: str,
*,
device: Literal["cpu", "cuda"] = "cpu",
+ return_features: bool = True,
):
"""
:param model_path: Path to onnx model
@@ -73,14 +74,21 @@ def __init__(
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+ self.return_features = return_features
self.session = ort.InferenceSession(
model_path,
sess_options=session_options,
providers=available_providers(device),
)
- def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]:
- return self.session.run(None, {"images": images})
+ def __call__(
+ self, images: ndarray, return_features: Optional[bool] = None
+ ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
+ features, embeddings = self.session.run(None, {"images": images})
+ return_features = return_features if return_features is not None else self.return_features
+ if return_features:
+ return features, embeddings
+ return embeddings
class TextEncoder:
@@ -89,6 +97,7 @@ def __init__(
model_path: str,
*,
device: Literal["cpu", "cuda"] = "cpu",
+ return_features: bool = True,
):
"""
:param text_encoder_path: Path to onnx of text encoder
@@ -98,11 +107,31 @@ def __init__(
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+ self.return_features = return_features
self.text_encoder_session = ort.InferenceSession(
model_path,
sess_options=session_options,
providers=available_providers(device),
)
- def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]:
- return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
+ def __call__(
+ self,
+ x: Union[ndarray, dict],
+ attention_mask: Optional[ndarray] = None,
+ return_features: Optional[bool] = None,
+ ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
+ if isinstance(x, dict):
+ assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None"
+ attention_mask = x["attention_mask"]
+ input_ids = x["input_ids"]
+ else:
+ input_ids = x
+
+ features, embeddings = self.text_encoder_session.run(
+ None, {"input_ids": input_ids, "attention_mask": attention_mask}
+ )
+
+ return_features = return_features if return_features is not None else self.return_features
+ if return_features:
+ return features, embeddings
+ return embeddings
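With `return_features` defaulting to `True`, both ONNX encoders now return a `(features, embeddings)` pair, and the text encoder accepts the processor's dictionary directly. A rough usage sketch — the model name and device are illustrative:

    from uform import Modality, get_model_onnx

    processors, models = get_model_onnx("unum-cloud/uform3-image-text-english-small", device="cpu")
    text_data = processors[Modality.TEXT_ENCODER]("a small red panda in a zoo")

    features, embedding = models[Modality.TEXT_ENCODER](text_data)                    # default: features and embeddings
    embedding_only = models[Modality.TEXT_ENCODER](text_data, return_features=False)  # embeddings alone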
diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py
index 8ac7c36..0504a74 100644
--- a/python/uform/torch_encoders.py
+++ b/python/uform/torch_encoders.py
@@ -2,7 +2,7 @@
from dataclasses import dataclass
from os import PathLike
-from typing import Dict, Optional, Tuple, Union, Callable
+from typing import Dict, Optional, Union, Mapping, Any
import json
import torch
@@ -274,7 +274,12 @@ def forward(
return embeddings
@staticmethod
- def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> TextEncoder:
+ def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> TextEncoder:
+        """Load the text encoder from the given configuration and model path.
+
+ :param config: the configuration dictionary or path to the JSON configuration file
+ :param model: the model state dictionary or path to the `.pt` model file
+ """
if isinstance(config, (PathLike, str)):
config = json.load(open(config, "r"))
if "text_encoder" in config:
@@ -283,9 +288,15 @@ def from_pretrained(config: Union[PathLike, str, object], model_path: Union[Path
# We must strip all the non-member attributes before initializing the classes.
text_fields = TextEncoder.__dataclass_fields__
config = {k: v for k, v in config.items() if k in text_fields}
-
- state = torch.load(model_path)
encoder = TextEncoder(**config)
+
+ # Load from disk
+ if isinstance(model, (PathLike, str)):
+ state = torch.load(model)
+ else:
+ state = model
+ if "text_encoder" in state:
+ state = state["text_encoder"]
encoder.load_state_dict(state)
return encoder
@@ -351,7 +362,15 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return embeddings
@staticmethod
- def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> ImageEncoder:
+ def from_pretrained(
+ config: Union[PathLike, str, object],
+ model: Union[PathLike, str, Mapping[str, Any]],
+ ) -> ImageEncoder:
+ """Load the image encoder from the given configuration and model path.
+
+ :param config: the configuration dictionary or path to the JSON configuration file
+ :param model: the model state dictionary or path to the `.pt` model file
+ """
if isinstance(config, (PathLike, str)):
config = json.load(open(config, "r"))
if "image_encoder" in config:
@@ -360,8 +379,14 @@ def from_pretrained(config: Union[PathLike, str, object], model_path: Union[Path
# We must strip all the non-member attributes before initializing the classes.
image_fields = ImageEncoder.__dataclass_fields__
config = {k: v for k, v in config.items() if k in image_fields}
-
- state = torch.load(model_path)
encoder = ImageEncoder(**config)
+
+ # Load from disk
+ if isinstance(model, (PathLike, str)):
+ state = torch.load(model)
+ else:
+ state = model
+ if "image_encoder" in state:
+ state = state["image_encoder"]
encoder.load_state_dict(state)
return encoder
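After this patch `from_pretrained` accepts either a path to a `.pt` file or an already-loaded state dictionary, optionally nested under a per-modality key — which is what the export notebook above relies on. A sketch under that assumption, with illustrative paths:

    import torch
    from uform.torch_encoders import ImageEncoder, TextEncoder

    config_path = "models/uform3-image-text-english-small/config.json"              # illustrative
    state_dict = torch.load("models/uform3-image-text-english-small/torch_weight.pt")

    # The merged checkpoint keeps weights under "text_encoder" / "image_encoder" keys,
    # which `from_pretrained` strips before calling `load_state_dict`.
    image_encoder = ImageEncoder.from_pretrained(config_path, state_dict).eval()
    text_encoder = TextEncoder.from_pretrained(config_path, state_dict).eval()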
From 605bfc8cf4a9164051cd63003176f9db690d79e1 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 20 Apr 2024 22:18:22 -0700
Subject: [PATCH 18/40] Improve: Test more models
---
CONTRIBUTING.md | 7 -------
Package.resolved | 2 +-
Package.swift | 2 +-
python/scripts/test_encoders.py | 12 ++++++------
swift/EncodersTests.swift | 14 ++++++++++++--
5 files changed, 20 insertions(+), 17 deletions(-)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index bcf6d91..ceafee9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -51,12 +51,5 @@ Before submitting any changes, please make sure that the tests pass.
```sh
npm install
-npm run build
npm run test
```
-
-```
-tsc
-node node_build/embeddings.mjs
-```
-
diff --git a/Package.resolved b/Package.resolved
index fe63c94..6e3b1f7 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -14,7 +14,7 @@
"kind" : "remoteSourceControl",
"location" : "https://github.com/ashvardanian/swift-transformers",
"state" : {
- "revision" : "9ef46a51eca46978b62773f8887926dfe72b0ab4"
+ "revision" : "89fb5d97e1df347f9f588f62fc538dcad6fdb16c"
}
}
],
diff --git a/Package.swift b/Package.swift
index b3b9ffd..c2f7fe7 100644
--- a/Package.swift
+++ b/Package.swift
@@ -19,7 +19,7 @@ let package = Package(
dependencies: [
.package(
url: "https://github.com/ashvardanian/swift-transformers",
- revision: "9ef46a51eca46978b62773f8887926dfe72b0ab4"
+ revision: "89fb5d97e1df347f9f588f62fc538dcad6fdb16c"
)
],
targets: [
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
index bd26690..29c5119 100644
--- a/python/scripts/test_encoders.py
+++ b/python/scripts/test_encoders.py
@@ -27,16 +27,16 @@
torch_models = [
"unum-cloud/uform3-image-text-english-small",
- # "unum-cloud/uform3-image-text-english-base",
- # "unum-cloud/uform3-image-text-english-large",
- # "unum-cloud/uform3-image-text-multilingual-base",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
]
onnx_models = [
"unum-cloud/uform3-image-text-english-small",
- # "unum-cloud/uform3-image-text-english-base",
- # "unum-cloud/uform3-image-text-english-large",
- # "unum-cloud/uform3-image-text-multilingual-base",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
]
# Let's check if the HuggingFace Hub API token is set in the environment variable.
diff --git a/swift/EncodersTests.swift b/swift/EncodersTests.swift
index 0096d62..5816446 100644
--- a/swift/EncodersTests.swift
+++ b/swift/EncodersTests.swift
@@ -73,7 +73,12 @@ final class TokenizerTests: XCTestCase {
}
func testTextEmbeddings() async throws {
- for model in ["unum-cloud/uform3-image-text-english-small"] {
+ for model in [
+ "unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
+ ] {
try await testTextEmbeddings(forModel: model)
}
}
@@ -162,7 +167,12 @@ final class TokenizerTests: XCTestCase {
}
func testImageEmbeddings() async throws {
- for model in ["unum-cloud/uform3-image-text-english-small"] {
+ for model in [
+ "unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
+ ] {
try await testImageEmbeddings(forModel: model)
}
}
From 0c2aa2828693edd6002c34cdd87f43e196c7775a Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 21 Apr 2024 05:19:23 +0000
Subject: [PATCH 19/40] Improve: Test many models in JS
---
javascript/encoders_test.js | 159 ++++++++++++++++++++++--------------
1 file changed, 98 insertions(+), 61 deletions(-)
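The JavaScript tests below read the Hugging Face token from the HUGGINGFACE_HUB_TOKEN environment variable and fall back to a local `.hf_token` file, similar in spirit to the environment-variable check in the Python tests. A Python-flavoured sketch of that fallback — the function name is illustrative:

    import os
    from typing import Optional

    def read_hf_token(project_root: str = ".") -> Optional[str]:
        # Prefer the environment variable; fall back to a local ".hf_token" file if one exists.
        token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
        token_path = os.path.join(project_root, ".hf_token")
        if not token and os.path.exists(token_path):
            with open(token_path) as handle:
                token = handle.read().strip()
        return token or None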
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
index fba11f4..f50d3b6 100644
--- a/javascript/encoders_test.js
+++ b/javascript/encoders_test.js
@@ -1,4 +1,6 @@
-import { existsSync } from 'fs';
+import { existsSync, readFileSync } from 'fs';
+import { fileURLToPath } from 'url';
+import path from 'path';
import { getCheckpoint, Modality } from "./hub.mjs";
import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs";
@@ -9,83 +11,118 @@ function assert(condition, message) {
}
}
+// Check if the HuggingFace Hub API token is set in the environment variable.
+let hf_token = process.env.HUGGINGFACE_HUB_TOKEN;
+if (!hf_token) {
+ const dirname = path.dirname(fileURLToPath(import.meta.url));
+ const tokenPath = path.join(dirname, '../', '.hf_token');
+ if (existsSync(tokenPath)) {
+ hf_token = readFileSync(tokenPath, 'utf8').trim();
+ }
+}
+
+async function tryGettingCheckpoint(modelId, modalities) {
+ const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ modelId,
+ modalities,
+ hf_token,
+ '.onnx'
+ );
+
+ assert(configPath !== null, "Config path should not be null");
+ assert(modalityPaths !== null, "Modality paths should not be null");
+ assert(tokenizerPath !== null, "Tokenizer path should not be null");
+
+ // Check if the file actually exists
+ assert(existsSync(configPath), `Config file should exist at ${configPath}`);
+ assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`);
+ for (const modalityPath of Object.values(modalityPaths)) {
+ assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`);
+ }
+}
+
async function testGetCheckpoint() {
- console.log("Test getCheckpoint: Start");
+ console.log("- `testGetCheckpoint`: Start");
try {
- const modelId = 'unum-cloud/uform3-image-text-english-small';
- const token = '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';
const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
- const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
- modelId,
- modalities,
- token,
- '.onnx'
- );
-
- assert(configPath !== null, "Config path should not be null");
- assert(modalityPaths !== null, "Modality paths should not be null");
- assert(tokenizerPath !== null, "Tokenizer path should not be null");
-
- // Check if the file actually exists
- assert(existsSync(configPath), `Config file should exist at ${configPath}`);
- assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`);
- for (const modalityPath of Object.values(modalityPaths)) {
- assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`);
+ for (const modelId of [
+ 'unum-cloud/uform3-image-text-english-small',
+ 'unum-cloud/uform3-image-text-english-base',
+ 'unum-cloud/uform3-image-text-english-large',
+ 'unum-cloud/uform3-image-text-multilingual-base',
+ ]) {
+ await tryGettingCheckpoint(modelId, modalities, hf_token);
}
- console.log("Test getCheckpoint: Success");
+ console.log("- `testGetCheckpoint`: Success");
} catch (error) {
- console.error("Test getCheckpoint: Failed", error);
+ console.error("- `testGetCheckpoint`: Failed", error);
}
}
-async function testEncoders() {
- console.log("Test testEncoders: Start");
- let textEncoder = null;
- let imageEncoder = null;
-
- try {
- const modelId = 'unum-cloud/uform3-image-text-english-small';
- const token = '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';
- const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
-
- const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
- modelId,
- modalities,
- token,
- '.onnx'
- );
-
- assert(configPath !== null, "Config path should not be null");
- assert(modalityPaths !== null, "Modality paths should not be null");
- assert(tokenizerPath !== null, "Tokenizer path should not be null");
+async function tryTextEncoderForwardPass(modelId) {
+ const modalities = [Modality.TextEncoder];
+ const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ modelId,
+ modalities,
+ hf_token,
+ '.onnx'
+ );
+
+ const textProcessor = new TextProcessor(configPath, tokenizerPath);
+ await textProcessor.init();
+ const processedTexts = await textProcessor.process("Hello, world!");
+
+ const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+ await textEncoder.init();
+ const textOutput = await textEncoder.forward(processedTexts);
+ assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
+
+ await textEncoder.dispose();
+}
- const textProcessor = new TextProcessor(configPath, tokenizerPath);
- await textProcessor.init();
- const processedTexts = await textProcessor.process("Hello, world!");
+async function tryImageEncoderForwardPass(modelId) {
+ const modalities = [Modality.ImageEncoder];
+ const { configPath, modalityPaths } = await getCheckpoint(
+ modelId,
+ modalities,
+ hf_token,
+ '.onnx'
+ );
+
+ const imageProcessor = new ImageProcessor(configPath);
+ await imageProcessor.init();
+ const processedImages = await imageProcessor.process("assets/unum.png");
+
+ const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+ await imageEncoder.init();
+ const imageOutput = await imageEncoder.forward(processedImages);
+ assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
+
+ await imageEncoder.dispose();
+}
- textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
- await textEncoder.init();
- const textOutput = await textEncoder.forward(processedTexts);
- console.log(textOutput.embeddings.dims);
+async function testEncoders() {
+ console.log("- `testEncoders`: Start");
- const imageProcessor = new ImageProcessor(configPath);
- await imageProcessor.init();
- const processedImages = await imageProcessor.process("assets/unum.png");
+ try {
- imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
- await imageEncoder.init();
- const imageOutput = await imageEncoder.forward(processedImages);
- console.log(imageOutput.embeddings.dims);
+ // Go through the bi-modal models
+ for (const modelId of [
+ 'unum-cloud/uform3-image-text-english-small',
+ 'unum-cloud/uform3-image-text-english-base',
+ 'unum-cloud/uform3-image-text-english-large',
+ 'unum-cloud/uform3-image-text-multilingual-base',
+ ]) {
+ await tryTextEncoderForwardPass(modelId, hf_token);
+ await tryImageEncoderForwardPass(modelId, hf_token);
+ }
- console.log("Test testEncoders: Success");
+ console.log("- `testEncoders`: Success");
} catch (error) {
- console.error("Test testEncoders: Failed", error);
- } finally {
- await textEncoder.dispose();
- await imageEncoder.dispose();
+ console.error("- `testEncoders`: Failed", error);
}
}
From 766963caaa840b230324ccafdf0a02b0aaeaa3e7 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sun, 21 Apr 2024 05:44:48 +0000
Subject: [PATCH 20/40] Add: Text and image cross-referencing in JS
---
javascript/encoders.mjs | 4 +-
javascript/encoders_test.js | 110 ++++++++++++++++++++++++++++++--
python/scripts/test_encoders.py | 2 +-
swift/EncodersTests.swift | 5 +-
4 files changed, 111 insertions(+), 10 deletions(-)
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index 7a287cc..7ebaeb9 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -141,8 +141,8 @@ class ImageProcessor {
fit: sharp.fit.cover,
position: sharp.strategy.entropy
}).extract({
- left: Math.max(0, (scaledWidth - this.imageSize) / 2),
- top: Math.max(0, (scaledHeight - this.imageSize) / 2),
+ left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)),
+ top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)),
width: this.imageSize,
height: this.imageSize
}).removeAlpha();
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
index f50d3b6..28538ee 100644
--- a/javascript/encoders_test.js
+++ b/javascript/encoders_test.js
@@ -1,16 +1,12 @@
import { existsSync, readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import path from 'path';
+import assert from 'assert';
+import fetch from 'node-fetch';
import { getCheckpoint, Modality } from "./hub.mjs";
import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs";
-function assert(condition, message) {
- if (!condition) {
- throw new Error(message);
- }
-}
-
// Check if the HuggingFace Hub API token is set in the environment variable.
let hf_token = process.env.HUGGINGFACE_HUB_TOKEN;
if (!hf_token) {
@@ -104,6 +100,107 @@ async function tryImageEncoderForwardPass(modelId) {
await imageEncoder.dispose();
}
+function cosineSimilarity(vecA, vecB) {
+ // We may be receiving a complex tensor type, so let's check if it
+ // has an array member named `data`.
+ if (vecA.data) {
+ vecA = vecA.data;
+ }
+ if (vecB.data) {
+ vecB = vecB.data;
+ }
+
+ let dotProduct = 0.0;
+ let normA = 0.0;
+ let normB = 0.0;
+ for (let i = 0; i < vecA.length; i++) {
+ dotProduct += vecA[i] * 1.0 * vecB[i];
+ normA += vecA[i] * 1.0 * vecA[i];
+ normB += vecB[i] * 1.0 * vecB[i];
+ }
+ if (normA === 0 || normB === 0) {
+ return 0;
+ } else {
+ return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
+ }
+}
+
+async function fetchImage(url) {
+ const response = await fetch(url);
+ const arrayBuffer = await response.arrayBuffer();
+ const buffer = Buffer.from(arrayBuffer);
+ return buffer;
+}
+
+async function tryCrossReferencingImageAndText(modelId) {
+
+ const modalities = [Modality.ImageEncoder, Modality.TextEncoder];
+ const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ modelId,
+ modalities,
+ hf_token,
+ '.onnx'
+ );
+
+ const imageProcessor = new ImageProcessor(configPath);
+ await imageProcessor.init();
+ const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+ await imageEncoder.init();
+ const textProcessor = new TextProcessor(configPath, tokenizerPath);
+ await textProcessor.init();
+ const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+ await textEncoder.init();
+
+ const texts = [
+ "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
+ "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
+ "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+ "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
+ "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
+ ];
+ const imageUrls = [
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
+ ];
+
+ const textEmbeddings = [];
+ const imageEmbeddings = [];
+
+ for (let i = 0; i < texts.length; i++) {
+ const text = texts[i];
+ const imageUrl = imageUrls[i];
+ const imageBuffer = await fetchImage(imageUrl);
+
+ const processedText = await textProcessor.process(text);
+ const processedImage = await imageProcessor.process(imageBuffer);
+
+ const textEmbedding = await textEncoder.forward(processedText);
+ const imageEmbedding = await imageEncoder.forward(processedImage);
+
+ textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data));
+ imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data));
+ console.log(`Text: ${text}, Image: ${imageUrl}, Similarity: ${cosineSimilarity(textEmbedding.embeddings, imageEmbedding.embeddings)}`);
+ }
+
+ for (let i = 0; i < texts.length; i++) {
+ const pairSimilarity = cosineSimilarity(textEmbeddings[i], imageEmbeddings[i]);
+ const otherTextSimilarities = textEmbeddings.map((te, idx) => idx === i ? -Infinity : cosineSimilarity(te, imageEmbeddings[i]));
+ const otherImageSimilarities = imageEmbeddings.map((ie, idx) => idx === i ? -Infinity : cosineSimilarity(textEmbeddings[i], ie));
+
+ const maxOtherTextSimilarity = Math.max(...otherTextSimilarities);
+ const maxOtherImageSimilarity = Math.max(...otherImageSimilarities);
+
+ assert(pairSimilarity > maxOtherTextSimilarity, "Text should be more similar to its corresponding image than to other images.");
+ assert(pairSimilarity > maxOtherImageSimilarity, "Image should be more similar to its corresponding text than to other texts.");
+ }
+
+ await textEncoder.dispose();
+ await imageEncoder.dispose();
+}
+
async function testEncoders() {
console.log("- `testEncoders`: Start");
@@ -118,6 +215,7 @@ async function testEncoders() {
]) {
await tryTextEncoderForwardPass(modelId, hf_token);
await tryImageEncoderForwardPass(modelId, hf_token);
+ await tryCrossReferencingImageAndText(modelId, hf_token);
}
console.log("- `testEncoders`: Success");
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
index d26e4f2..fd78e54 100644
--- a/python/scripts/test_encoders.py
+++ b/python/scripts/test_encoders.py
@@ -68,7 +68,7 @@ def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embed
texts = [
"A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
"A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
- "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+ "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
"This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
"The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
]
diff --git a/swift/EncodersTests.swift b/swift/EncodersTests.swift
index 0096d62..839a916 100644
--- a/swift/EncodersTests.swift
+++ b/swift/EncodersTests.swift
@@ -16,6 +16,9 @@ final class TokenizerTests: XCTestCase {
{
hfToken = token
}
+
+ hfToken = hfToken ?? ProcessInfo.processInfo.environment["HF_TOKEN"]
+ hfToken = hfToken ?? "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"
}
func cosineSimilarity(between vectorA: [T], and vectorB: [T]) -> T {
@@ -107,7 +110,7 @@ final class TokenizerTests: XCTestCase {
let texts = [
"A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
"A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
- "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+ "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
"This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
"The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
]
From 6b3f8cd351f534b84f7712cc9638c8adef90bcd8 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Mon, 22 Apr 2024 21:16:24 +0000
Subject: [PATCH 21/40] Add: Initial decoder exporters
---
python/scripts/export_decoders.ipynb | 654 +++++++++++++++++++++++++++
1 file changed, 654 insertions(+)
create mode 100644 python/scripts/export_decoders.ipynb
diff --git a/python/scripts/export_decoders.ipynb b/python/scripts/export_decoders.ipynb
new file mode 100644
index 0000000..3aededb
--- /dev/null
+++ b/python/scripts/export_decoders.ipynb
@@ -0,0 +1,654 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n",
+ "\n",
+ "Depending on the backend, we prefer different qunatization schemes.\n",
+ "\n",
+ "- For ONNX we use `uint8` quantization.\n",
+ "- For PyTorch we use `bfloat16` quantization.\n",
+ "- For CoreML we use `float32` representation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install --upgrade \"uform[torch]\" coremltools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "model_name = \"unum-cloud/uform-gen2-dpo\"\n",
+ "output_directory = \"../../\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import uform\n",
+ "from PIL import Image\n",
+ "from transformers import AutoModel, AutoProcessor\n",
+ "\n",
+ "model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)\n",
+ "processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)\n",
+ "\n",
+ "prompt = 'Describe the picture'\n",
+ "image = Image.open('../../assets/unum.png')\n",
+ "inputs = processor(text=[prompt], images=[image], return_tensors='pt')\n",
+ "\n",
+ "with torch.inference_mode():\n",
+ " output = model.generate(\n",
+ " **inputs,\n",
+ " do_sample=False,\n",
+ " use_cache=True,\n",
+ " max_new_tokens=256,\n",
+ " eos_token_id=151645,\n",
+ " pad_token_id=processor.tokenizer.pad_token_id\n",
+ " )\n",
+ "prompt_len = inputs['input_ids'].shape[1]\n",
+ "decoded_text = processor.batch_decode(output[:, prompt_len:])[0]\n",
+ "\n",
+ "print(decoded_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
+ "for name, module in model.named_children():\n",
+ " print(f\"First layer of module: {name}\")\n",
+ " break # We break after the first layer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## CoreML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import coremltools as ct\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "precision = ct.precision.FLOAT32"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n",
+ "\n",
+ "```python\n",
+ " image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
+ " text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
+ " text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
+ "```\n",
+ "\n",
+ "That, however, will only work for batch-size one. To support larger batches, we need to override the input shapes.\n",
+ "\n",
+ "```python\n",
+ " ct.RangeDim(lower_bound=25, upper_bound=100, default=45)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generalize_first_dimensions(input_shape, upper_bound=64):\n",
+ " if upper_bound == 1:\n",
+ " return input_shape\n",
+ " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n",
+ " return input_shape\n",
+ "\n",
+ "generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
+ "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
+ "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
+ "text_features = ct.TensorType(name=\"features\")\n",
+ "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
+ "image_features = ct.TensorType(name=\"features\")\n",
+ "image_embeddings = ct.TensorType(name=\"embeddings\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
+ "traced_script_module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coreml_model = ct.convert(\n",
+ " traced_script_module, source=\"pytorch\",\n",
+ " inputs=[image_input], outputs=[image_features, image_embeddings],\n",
+ " convert_to='mlprogram', compute_precision=precision)\n",
+ "\n",
+ "coreml_model.author = 'Unum Cloud'\n",
+ "coreml_model.license = 'Apache 2.0'\n",
+ "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+ "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
+ "traced_script_module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coreml_model = ct.convert(\n",
+ " traced_script_module, source=\"pytorch\",\n",
+ " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
+ " convert_to='mlprogram', compute_precision=precision)\n",
+ "\n",
+ "coreml_model.author = 'Unum Cloud'\n",
+ "coreml_model.license = 'Apache 2.0'\n",
+ "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+ "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PyTorch\n",
+ "\n",
+ "Let's ensure:\n",
+ "\n",
+ "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n",
+ "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n",
+ "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from safetensors import safe_open\n",
+ "from safetensors.torch import save_file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.image_encoder.eval()\n",
+ "model.image_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.text_encoder.eval()\n",
+ "model.text_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
+ "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## ONNX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install onnx onnxconverter-common"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.onnx import export as onnx_export\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "onnx_export(\n",
+ " module,\n",
+ " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
+ " os.path.join(output_directory, \"text_encoder.onnx\"), \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input_ids', 'attention_mask'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input_ids' : {0 : 'batch_size'}, \n",
+ " 'attention_mask' : {0 : 'batch_size'}, \n",
+ " 'features' : {0 : 'batch_size'}, \n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now repeat the same for images."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "torch.onnx.export(\n",
+ " module,\n",
+ " image_data, \n",
+ " os.path.join(output_directory, \"image_encoder.onnx\"), \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input' : {0 : 'batch_size'},\n",
+ " 'features' : {0 : 'batch_size'},\n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Quantizing to `float16`\n",
+ "\n",
+ "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, module_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, module_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Quantizing to `uint8`\n",
+ "\n",
+ "We can further quantize the model into `uint8` using ONNX quantization tools.\n",
+ "The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from onnxruntime.quantization import quantize_dynamic, QuantType"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's make sure that all the text inputs are integers of identical type - `int32`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "import os\n",
+ "from onnx import helper\n",
+ "\n",
+ "# Load the ONNX model\n",
+ "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
+ "\n",
+ "# Get the module's graph\n",
+ "graph = module.graph\n",
+ "\n",
+ "# Iterate through the inputs and update the data type of `input_ids`\n",
+ "for input_tensor in graph.input:\n",
+ " # Check if this is the tensor we want to change\n",
+ " if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n",
+ " # Get the tensor type information\n",
+ " tensor_type = input_tensor.type.tensor_type\n",
+ " # Set the element type to INT32 (int32's enum value in onnx is 6)\n",
+ " tensor_type.elem_type = onnx.TensorProto.INT32\n",
+ "\n",
+ "# Optionally, check that the module is still valid\n",
+ "onnx.checker.check_model(module)\n",
+ "\n",
+ "# Save the modified module\n",
+ "onnx.save(module, module_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can use the following function to print and validate the input and output types of the ONNX model files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_model_inputs_and_outputs(onnx_model_path):\n",
+ " model = onnx.load(onnx_model_path)\n",
+ "\n",
+ " # Get the model's graph\n",
+ " graph = model.graph\n",
+ "\n",
+ " # Print input information\n",
+ " print(\"Model Inputs:\")\n",
+ " for input_tensor in graph.input:\n",
+ " tensor_type = input_tensor.type.tensor_type\n",
+ " # Get the element type (data type)\n",
+ " elem_type = tensor_type.elem_type\n",
+ " # Convert numeric type to readable format\n",
+ " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
+ " # Get tensor shape\n",
+ " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
+ " print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n",
+ "\n",
+ " # Print output information similarly if needed\n",
+ " print(\"\\nModel Outputs:\")\n",
+ " for output_tensor in graph.output:\n",
+ " tensor_type = output_tensor.type.tensor_type\n",
+ " elem_type = tensor_type.elem_type\n",
+ " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
+ " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
+ " print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's check that the runtime can actually load those models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnxruntime as ort\n",
+ "session_options = ort.SessionOptions()\n",
+ "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
+ "session = ort.InferenceSession(module_path, sess_options=session_options)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
+ "session = ort.InferenceSession(module_path, sess_options=session_options)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Upload to Hugging Face"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
From 4c1ac18d06a4d16e1b36b0cd85a202fe36d1a781 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 01:08:29 +0000
Subject: [PATCH 22/40] Fix: Transposing channels in JS
---
javascript/encoders.mjs | 32 +++++++++++++++++++++-----------
javascript/encoders_test.js | 19 ++++++++++++-------
package.json | 1 +
python/scripts/test_encoders.py | 14 ++++++++++----
4 files changed, 44 insertions(+), 22 deletions(-)
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index 7ebaeb9..6c24b5a 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -125,21 +125,22 @@ class ImageProcessor {
this.normalizationMeans = config.normalization_means;
this.normalizationDeviations = config.normalization_deviations;
- this.imageMean = new Float32Array(this.normalizationMeans).fill(0);
- this.imageStd = new Float32Array(this.normalizationDeviations).fill(0);
+ this.imageMean = new Float32Array(this.normalizationMeans);
+ this.imageStd = new Float32Array(this.normalizationDeviations);
}
async process(images) {
const processSingle = async (image) => {
- let img = sharp(image);
+ let img = sharp(image).toColorspace('srgb');
const metadata = await img.metadata();
const scale = this.imageSize / Math.min(metadata.width, metadata.height);
- const scaledWidth = parseInt(metadata.width * scale);
- const scaledHeight = parseInt(metadata.height * scale);
+ const scaledWidth = Math.ceil(metadata.width * scale);
+ const scaledHeight = Math.ceil(metadata.height * scale);
img = img.resize({
width: scaledWidth,
height: scaledHeight,
fit: sharp.fit.cover,
- position: sharp.strategy.entropy
+ position: sharp.strategy.entropy,
+ options: sharp.interpolators.bicubic
}).extract({
left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)),
top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)),
@@ -148,12 +149,21 @@ class ImageProcessor {
}).removeAlpha();
let buffer = await img.raw().toBuffer();
- let array = new Float32Array(buffer);
+ let array = new Float32Array(buffer.length);
+
+ // When we export into the `array`, we reorder the dimensions of the tensor
+ // from HWC to CHW, and normalize the pixel values.
+ let channelSize = this.imageSize * this.imageSize;
+ for (let i = 0; i < this.imageSize * this.imageSize; i++) {
+ let r = buffer[i * 3];
+ let g = buffer[i * 3 + 1];
+ let b = buffer[i * 3 + 2];
+ array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0];
+ array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1];
+ array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2];
+ }
- return array.map((value, index) => {
- const channel = index % 3;
- return (value / 255.0 - this.normalizationMeans[channel]) / this.normalizationDeviations[channel];
- });
+ return array;
};
if (Array.isArray(images)) {
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
index 28538ee..f45ff4c 100644
--- a/javascript/encoders_test.js
+++ b/javascript/encoders_test.js
@@ -69,7 +69,7 @@ async function tryTextEncoderForwardPass(modelId) {
const textProcessor = new TextProcessor(configPath, tokenizerPath);
await textProcessor.init();
- const processedTexts = await textProcessor.process("Hello, world!");
+ const processedTexts = await textProcessor.process("a small red panda in a zoo");
const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
await textEncoder.init();
@@ -180,9 +180,14 @@ async function tryCrossReferencingImageAndText(modelId) {
const textEmbedding = await textEncoder.forward(processedText);
const imageEmbedding = await imageEncoder.forward(processedImage);
- textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data));
- imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data));
- console.log(`Text: ${text}, Image: ${imageUrl}, Similarity: ${cosineSimilarity(textEmbedding.embeddings, imageEmbedding.embeddings)}`);
+ textEmbeddings.push(new Float32Array(textEmbedding.embeddings.cpuData));
+ imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.cpuData));
+
+ // Print-based debugging at its best :)
+ // console.log(`Text: ${text}, Image: ${imageUrl}`);
+ // console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`);
+ // console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`);
+ console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`)
}
for (let i = 0; i < texts.length; i++) {
@@ -209,9 +214,9 @@ async function testEncoders() {
// Go through the bi-modal models
for (const modelId of [
'unum-cloud/uform3-image-text-english-small',
- 'unum-cloud/uform3-image-text-english-base',
- 'unum-cloud/uform3-image-text-english-large',
- 'unum-cloud/uform3-image-text-multilingual-base',
+ // 'unum-cloud/uform3-image-text-english-base',
+ // 'unum-cloud/uform3-image-text-english-large',
+ // 'unum-cloud/uform3-image-text-multilingual-base',
]) {
await tryTextEncoderForwardPass(modelId, hf_token);
await tryImageEncoderForwardPass(modelId, hf_token);
diff --git a/package.json b/package.json
index 9be073f..948550b 100644
--- a/package.json
+++ b/package.json
@@ -7,6 +7,7 @@
"dependencies": {
"@huggingface/hub": "^0.14.8",
"@xenova/transformers": "^2.17.0",
+ "node-fetch": "^3.3.2",
"onnxruntime-node": "^1.17.0",
"onnxruntime-web": "^1.17.3"
},
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
index ed8dab5..7046217 100644
--- a/python/scripts/test_encoders.py
+++ b/python/scripts/test_encoders.py
@@ -196,11 +196,17 @@ def test_onnx_one_embedding(model_name: str, device: str):
assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
+ # Nested functions are easier to debug than lambdas
+ def get_image_embedding(image_data):
+ features, embedding = model_image(processor_image(image_data))
+ return embedding
+
+ def get_text_embedding(text_data):
+ features, embedding = model_text(processor_text(text_data))
+ return embedding
+
# Test if the model outputs actually make sense
- cross_references_image_and_text_embeddings(
- lambda text: model_text(processor_text(text))[1],
- lambda image: model_image(processor_image(image))[1],
- )
+ cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding)
except ExecutionProviderError as e:
pytest.skip(f"Execution provider error: {e}")
From 9bf5fe319d2c32f75bfda1a7a45b86c060b7f0f2 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 04:18:33 +0000
Subject: [PATCH 23/40] Improve: Uniform APIs across JS, Py, and Swift
---
README.md | 52 ++---
javascript/README.md | 63 ++++-
javascript/encoders.mjs | 6 +-
javascript/encoders_test.js | 18 +-
javascript/hub.mjs | 4 +-
python/README.md | 124 ++++++++++
.../scripts/{bench.py => bench_decoders.py} | 45 +---
python/scripts/bench_encoders.py | 221 ++++++++++++++++++
python/scripts/test_encoders.py | 28 +--
python/uform/__init__.py | 26 ++-
python/uform/numpy_processors.py | 8 +-
python/uform/onnx_encoders.py | 4 +-
python/uform/torch_encoders.py | 25 +-
python/uform/torch_processors.py | 6 +-
swift/Encoders.swift | 12 +-
swift/EncodersTests.swift | 6 +-
swift/README.md | 37 ++-
17 files changed, 564 insertions(+), 121 deletions(-)
create mode 100644 python/README.md
rename python/scripts/{bench.py => bench_decoders.py} (80%)
create mode 100644 python/scripts/bench_encoders.py
diff --git a/README.md b/README.md
index 32957e7..ee62beb 100755
--- a/README.md
+++ b/README.md
@@ -51,13 +51,12 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr
### Embedding Models
-| Model | Parameters | Languages | Architecture |
-| :--------------------------------------- | ---------: | --------: | -------------------------------------------: |
-| [`uform-vl-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers |
-| [`uform-vl-english`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers |
-| [`uform-vl-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers |
-| [`uform-vl-multilingual-v2`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers |
-| [`uform-vl-multilingual`][model-m] | 206M | 12 | 8 text layers, ViT-B/16, 4 multimodal layers |
+| Model | Parameters | Languages | Architecture |
+| :-------------------------------------------------- | ---------: | --------: | -------------------------------------------: |
+| [`uform3-image-text-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers |
+| [`uform3-image-text-english-base`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers |
+| [`uform3-image-text-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers |
+| [`uform3-image-text-multilingual-base`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers |
[model-e-l]: https://huggingface.co/unum-cloud/uform-vl-english-large/
[model-e]: https://huggingface.co/unum-cloud/uform-vl-english/
@@ -307,34 +306,18 @@ prompt_len = inputs['input_ids'].shape[1]
decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
```
-### Multimodal Chat
+### Multimodal Chat in CLI
-The generative models can be used for chat-like experiences, where the user can provide both text and images as input.
-To use that feature, you can start with the following CLI command:
+The generative models can be used for chat-like experiences in the command line.
+For that, you can use the `uform-chat` CLI tool, which is available in the UForm package.
```bash
-uform-chat --model unum-cloud/uform-gen-chat --image=zebra.jpg
-uform-chat --model unum-cloud/uform-gen-chat \
- --image="https://bit.ly/3tIVg9M" \
- --device="cuda:0" \
- --fp16
-```
-
-### Multi-GPU
-
-To achieve higher throughput, you can launch UForm on multiple GPUs.
-For that pick the encoder of the model you want to run in parallel (`text_encoder` or `image_encoder`), and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`).
-
-```python
-import uform
-
-model, processor = uform.get_model('unum-cloud/uform-vl-english')
-model_image = nn.DataParallel(model.image_encoder)
-
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model_image.to(device)
-
-_, res = model_image(images, 0)
+$ pip install uform
+$ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg
+$ uform-chat --model unum-cloud/uform-gen2-dpo \
+> --image="https://bit.ly/3tIVg9M" \
+> --device="cuda:0" \
+> --fp16
```
## Evaluation
@@ -471,3 +454,8 @@ On Apple M2 Arm chips the energy efficiency of inference can exceed that of the
## License
All models come under the same license as the code - Apache 2.0.
+
+
+TODO:
+
+- [ ] Download the image if a URL is provided
\ No newline at end of file
diff --git a/javascript/README.md b/javascript/README.md
index 5626d39..0ef5c54 100644
--- a/javascript/README.md
+++ b/javascript/README.md
@@ -1,10 +1,67 @@
# UForm for JavaScript
+The UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications.
+Built around ONNX, the SDK is designed to work with most JavaScript runtimes and almost any hardware.
+## Installation
+
+There are several ways to install the UForm JavaScript SDK from NPM.
```bash
-pnpm add uform
-npm add uform
-yarn add uform
+pnpm add uform
+npm add uform
+yarn add uform
+```
+
+## Quick Start
+
+### Embeddings
+
+```js
+import { getModel, Modality } from 'uform';
+import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from 'uform';
+
+const { configPath, modalityPaths, tokenizerPath } = await getModel({
+ modelId: 'unum-cloud/uform3-image-text-english-small',
+ modalities: [Modality.TextEncoder, Modality.ImageEncoder],
+ token: null, // Optional Hugging Face token for private models
+ saveDir: null, // Optional directory to save the model to
+});
+
+const textProcessor = new TextProcessor(configPath, tokenizerPath);
+await textProcessor.init();
+const processedTexts = await textProcessor.process("a small red panda in a zoo");
+
+const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+await textEncoder.init();
+const textOutput = await textEncoder.encode(processedTexts);
+assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
+await textEncoder.dispose();
+
+const imageProcessor = new ImageProcessor(configPath);
+await imageProcessor.init();
+const processedImages = await imageProcessor.process("path/to/image.png");
+
+const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+await imageEncoder.init();
+const imageOutput = await imageEncoder.encode(processedImages);
+assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
```
+Both `textOutput` and `imageOutput` contain `features` and `embeddings` properties, matching the outputs of the Python SDK.
+The embeddings can later be compared using cosine similarity or other distance metrics, as sketched below.
+
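+For reference, a minimal cosine-similarity sketch over the raw tensor values may look like the snippet below; it assumes the ONNX output tensors expose their contents through a `data` array (as the test suite does) and is an illustration, not part of the SDK API:
+
+```js
+// Minimal sketch: compare one text embedding against one image embedding.
+function cosineSimilarity(a, b) {
+    let dot = 0.0, normA = 0.0, normB = 0.0;
+    for (let i = 0; i < a.length; i++) {
+        dot += a[i] * b[i];
+        normA += a[i] * a[i];
+        normB += b[i] * b[i];
+    }
+    return normA && normB ? dot / (Math.sqrt(normA) * Math.sqrt(normB)) : 0;
+}
+
+const similarity = cosineSimilarity(textOutput.embeddings.data, imageOutput.embeddings.data);
+console.log(`Text-image similarity: ${similarity}`);
+```
+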
+### Generative Models
+
+Coming soon ...
+
+## Technical Details
+
+### Faster Search
+
+Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall.
+Independent of the quantization level, native JavaScript functionality may be too slow for large-scale search.
+In such cases, consider using [USearch][github-usearch] or [SimSimD][github-simsimd].
+
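+As a rough illustration of what down-casting could look like in plain JavaScript, here is a sketch that assumes `textOutput.embeddings.data` is a dense array of floats roughly in the [-1, 1] range; the scaling strategy mirrors the Python SDK examples and is an assumption, not a built-in SDK feature:
+
+```js
+// Hedged sketch: shrink a Float32Array embedding before handing it to a search engine.
+const embedding = new Float32Array(textOutput.embeddings.data);
+
+// Linear scaling into signed 8-bit integers:
+const i8 = Int8Array.from(embedding, x => Math.max(-127, Math.min(127, Math.round(x * 127))));
+
+// Single-bit packing of the signs, similar to NumPy's `packbits`:
+const b1 = new Uint8Array(Math.ceil(embedding.length / 8));
+for (let i = 0; i < embedding.length; i++) {
+    if (embedding[i] > 0) b1[i >> 3] |= 1 << (7 - (i & 7));
+}
+```
+
+The resulting `i8` and `b1` arrays can then be indexed with [USearch][github-usearch], which supports both representations natively.
+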
+[github-usearch]: https://github.com/unum-cloud/usearch
+[github-simsimd]: https://github.com/ashvardanian/simsimd
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index 6c24b5a..a37b326 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -3,7 +3,7 @@ import { InferenceSession, Tensor } from 'onnxruntime-node';
import { PreTrainedTokenizer } from '@xenova/transformers';
import sharp from 'sharp';
-import { getCheckpoint, Modality } from "./hub.mjs";
+import { getModel, Modality } from "./hub.mjs";
class TextProcessor {
@@ -66,7 +66,7 @@ class TextEncoder {
}
}
- async forward(inputs) {
+ async encode(inputs) {
if (!this.session) {
throw new Error("Session is not initialized.");
}
@@ -191,7 +191,7 @@ class ImageEncoder {
}
}
- async forward(images) {
+ async encode(images) {
if (!this.session) {
throw new Error("Session is not initialized.");
}
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
index f45ff4c..a0a70b2 100644
--- a/javascript/encoders_test.js
+++ b/javascript/encoders_test.js
@@ -4,7 +4,7 @@ import path from 'path';
import assert from 'assert';
import fetch from 'node-fetch';
-import { getCheckpoint, Modality } from "./hub.mjs";
+import { getModel, Modality } from "./hub.mjs";
import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs";
// Check if the HuggingFace Hub API token is set in the environment variable.
@@ -18,7 +18,7 @@ if (!hf_token) {
}
async function tryGettingCheckpoint(modelId, modalities) {
- const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ const { configPath, modalityPaths, tokenizerPath } = await getModel(
modelId,
modalities,
hf_token,
@@ -60,7 +60,7 @@ async function testGetCheckpoint() {
async function tryTextEncoderForwardPass(modelId) {
const modalities = [Modality.TextEncoder];
- const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ const { configPath, modalityPaths, tokenizerPath } = await getModel(
modelId,
modalities,
hf_token,
@@ -73,7 +73,7 @@ async function tryTextEncoderForwardPass(modelId) {
const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
await textEncoder.init();
- const textOutput = await textEncoder.forward(processedTexts);
+ const textOutput = await textEncoder.encode(processedTexts);
assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
await textEncoder.dispose();
@@ -81,7 +81,7 @@ async function tryTextEncoderForwardPass(modelId) {
async function tryImageEncoderForwardPass(modelId) {
const modalities = [Modality.ImageEncoder];
- const { configPath, modalityPaths } = await getCheckpoint(
+ const { configPath, modalityPaths } = await getModel(
modelId,
modalities,
hf_token,
@@ -94,7 +94,7 @@ async function tryImageEncoderForwardPass(modelId) {
const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
await imageEncoder.init();
- const imageOutput = await imageEncoder.forward(processedImages);
+ const imageOutput = await imageEncoder.encode(processedImages);
assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
await imageEncoder.dispose();
@@ -135,7 +135,7 @@ async function fetchImage(url) {
async function tryCrossReferencingImageAndText(modelId) {
const modalities = [Modality.ImageEncoder, Modality.TextEncoder];
- const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint(
+ const { configPath, modalityPaths, tokenizerPath } = await getModel(
modelId,
modalities,
hf_token,
@@ -177,8 +177,8 @@ async function tryCrossReferencingImageAndText(modelId) {
const processedText = await textProcessor.process(text);
const processedImage = await imageProcessor.process(imageBuffer);
- const textEmbedding = await textEncoder.forward(processedText);
- const imageEmbedding = await imageEncoder.forward(processedImage);
+ const textEmbedding = await textEncoder.encode(processedText);
+ const imageEmbedding = await imageEncoder.encode(processedImage);
textEmbeddings.push(new Float32Array(textEmbedding.embeddings.cpuData));
imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.cpuData));
diff --git a/javascript/hub.mjs b/javascript/hub.mjs
index ad534f3..a59fb73 100644
--- a/javascript/hub.mjs
+++ b/javascript/hub.mjs
@@ -33,7 +33,7 @@ async function ensureDirectoryExists(dirPath) {
}
}
-async function getCheckpoint(modelId, modalities, token = null, format = '.onnx', saveDir = './models') {
+async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') {
modalities = normalizeModalities(modalities);
const configNames = ['config.json'];
@@ -101,4 +101,4 @@ async function getCheckpoint(modelId, modalities, token = null, format = '.onnx'
return { configPath, modalityPaths, tokenizerPath };
}
-export { getCheckpoint, Modality };
+export { getModel, Modality };
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..aec9de8
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,124 @@
+# UForm Python SDK
+
+UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your Python applications.
+The SDK doesn't require any deep learning knowledge, PyTorch, or CUDA installation, and can run on almost any hardware.
+
+## Installation
+
+There are several ways to install the UForm Python SDK, depending on the backend you want to use.
+PyTorch is by far the heaviest, but the most capable.
+ONNX is a lightweight alternative that can run on any CPU, and on some GPUs.
+
+```bash
+pip install "uform[torch]" # For PyTorch
+pip install "uform[onnx]" # For ONNX on CPU
+pip install "uform[onnx-gpu]" # For ONNX on GPU, available for some platforms
+pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests
+```
+
+## Quick Start
+
+### Embeddings
+
+```py
+from uform import get_model, Modality
+
+import requests
+from io import BytesIO
+from PIL import Image
+
+model_name = 'unum-cloud/uform3-image-text-english-small'
+modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER]
+processors, models = get_model(model_name, modalities=modalities)
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+
+# Download the image
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content))
+
+# The actual inference
+image_data = processor_image(image)
+text_data = processor_text(text)
+image_features, image_embedding = model_image.encode(image_data, return_features=True)
+text_features, text_embedding = model_text.encode(text_data, return_features=True)
+```
+
+### Generative Models
+
+## Technical Details
+
+### Down-casting, Quantization, Matryoshka, and Slicing
+
+Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall.
+Switching from `f32` to `f16` is recommended in almost all cases, unless you are running on very old hardware without half-precision support.
+Switching to `i8` with linear scaling is also possible, but the accuracy loss will be noticeable in recall on larger collections with millions of searchable entries.
+Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is to quantize them into single-bit representations for faster search.
+
+```python
+import numpy as np
+
+f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
+f16_embedding: np.ndarray = f32_embedding.astype(np.float16)
+i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8)
+b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8))
+```
+
+An alternative approach to quantization is to use Matryoshka embeddings, where the embeddings are sliced into smaller parts, and the search is performed hierarchically.
+
+```python
+import numpy as np
+
+large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
+small_embedding: np.ndarray = large_embedding[:, :256]
+tiny_embedding: np.ndarray = large_embedding[:, :64]
+```
+
+Both approaches are natively supported by the [USearch][github-usearch] vector-search engine and the [SimSIMD][github-simsimd] numerics libraries.
+When dealing with small collections (up to millions of entries) and looking for low-latency cosine distance calculations, you can [achieve 5x-2500x performance improvement][report-simsimd] over Torch, NumPy, SciPy, and vanilla Python using SimSIMD.
+
+```python
+from simsimd import cosine, hamming
+
+distance: float = cosine(f32_embedding, f32_embedding) # 32x SciPy performance on Apple M2 CPU
+distance: float = cosine(f16_embedding, f16_embedding) # 79x SciPy performance on Apple M2 CPU
+distance: float = cosine(i8_embedding, i8_embedding) # 133x SciPy performance on Apple M2 CPU
+distance: float = hamming(b1_embedding, b1_embedding) # 17x SciPy performance on Apple M2 CPU
+```
+
+Similarly, when dealing with large collections (up to billions of entries per server) and looking for high-throughput search, you can [achieve 100x performance improvement][report-usearch] over FAISS and other vector-search solutions using USearch.
+Here are a couple of examples:
+
+```python
+from usearch.index import Index
+
+f32_index = Index(ndim=64, metric='cos', dtype='f32') # for Matryoshka embeddings
+f16_index = Index(ndim=64, metric='cos', dtype='f16') # for Matryoshka embeddings
+i8_index = Index(ndim=256, metric='cos', dtype='i8') # for quantized embeddings
+b1_index = Index(ndim=768, metric='hamming', dtype='b1') # for binary embeddings
+```
+
+[github-usearch]: https://github.com/unum-cloud/usearch
+[github-simsimd]: https://github.com/ashvardanian/simsimd
+[report-usearch]: https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel
+[report-simsimd]: https://ashvardanian.com/posts/python-c-assembly-comparison/
+
+### Multi-GPU Parallelism
+
+To achieve higher throughput, you can launch UForm on multiple GPUs.
+For that pick the encoder of the model you want to run in parallel, and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`).
+
+```python
+from torch import nn
+from uform import get_model, Modality
+
+encoders, processors = get_model('unum-cloud/uform-vl-english-small', backend='torch', device='gpu')
+
+encoder_image = encoders[Modality.IMAGE_ENCODER]
+encoder_image = nn.DataParallel(encoder_image)
+
+_, res = encoder_image(images, 0)
+```
diff --git a/python/scripts/bench.py b/python/scripts/bench_decoders.py
similarity index 80%
rename from python/scripts/bench.py
rename to python/scripts/bench_decoders.py
index 8bcaf37..d98c130 100644
--- a/python/scripts/bench.py
+++ b/python/scripts/bench_decoders.py
@@ -1,5 +1,6 @@
from functools import partial
from time import perf_counter
+from dataclasses import dataclass
from typing import List
import requests
@@ -12,7 +13,6 @@
LlavaForConditionalGeneration,
)
-from uform import get_model
from uform.torch_decoders import VLMForCausalLM, VLMProcessor
dtype = torch.bfloat16
@@ -20,6 +20,17 @@
device = "cuda:0"
+@dataclass
+class BenchmarkResult:
+ model_name: str
+ device_name: str
+ backend_name: str
+ duration_image_preprocessing: float
+ duration_image_embedding: float
+ duration_text_preprocessing: float
+ duration_text_embedding: float
+
+
def caption(model, processor, prompt: str, image: Image.Image) -> str:
inputs = processor(prompt, image, return_tensors="pt")
for possible_key in ["images", "pixel_values"]:
@@ -75,30 +86,6 @@ def caption_image(image, model=model, processor=processor, prompt=prompt):
print(f"Throughput: {total_length/total_duration:.2f} tokens/s")
-def bench_image_embeddings(model, images):
- total_duration = 0
- total_embeddings = 0
- images *= 10
- while total_duration < 10:
- seconds, embeddings = duration(lambda: model.encode_image(processor.preprocess_image(images)))
- total_duration += seconds
- total_embeddings += len(embeddings)
-
- print(f"Throughput: {total_embeddings/total_duration:.2f} images/s")
-
-
-def bench_text_embeddings(model, texts):
- total_duration = 0
- total_embeddings = 0
- texts *= 10
- while total_duration < 10:
- seconds, embeddings = duration(lambda: model.encode_text(processor.preprocess_text(texts)))
- total_duration += seconds
- total_embeddings += len(embeddings)
-
- print(f"Throughput: {total_embeddings/total_duration:.2f} queries/s")
-
-
if __name__ == "__main__":
image_urls = [
"https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
@@ -157,11 +144,3 @@ def bench_text_embeddings(model, texts):
prompt="Summarize the visual content of the image.",
images=images,
)
-
- print("UForm-English")
- bench_image_embeddings(get_model("unum-cloud/uform-vl-english"), images)
- bench_text_embeddings(get_model("unum-cloud/uform-vl-english"), captions)
-
- print("UForm-Multilingual")
- bench_image_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), images)
- bench_text_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), captions)
diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py
new file mode 100644
index 0000000..6b59d05
--- /dev/null
+++ b/python/scripts/bench_encoders.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This script measures the throughput of UForm multimodal embedding models.
+
+The output of the script will cover:
+ - Time to preprocess an image, and throughput in images/s.
+ - Time to tokenize the text, and throughput in queries/s.
+ - Time to encode the image, and throughput in images/s.
+ - Time to encode the text, and throughput in queries/s.
+ - Share of time spent on each part of the pipeline.
+
+Those numbers are presented for every model, device (cpu or gpu), backend (torch or onnx),
+and precision (float32 or bfloat16), producing a pretty comprehensive benchmark.
+
+Before running the script, install all the required packages via `pip install -e ".[torch,onnx,onnx-gpu]"`.
+Before printing the numbers, a warm-up is performed to ensure the model is loaded and the cache is filled.
+"""
+
+from functools import partial
+from time import perf_counter
+from dataclasses import dataclass
+from typing import List, Tuple, Literal, Callable, Generator
+import re
+
+import fire
+import requests
+from PIL import Image
+import pandas as pd
+
+from uform import get_model, get_model_onnx, Modality
+
+# Define global constants for the hardware availability
+torch_available = False
+try:
+ import torch
+
+ torch_available = True
+except ImportError:
+ pass
+onnx_available = False
+try:
+ import onnx
+
+ onnx_available = True
+except ImportError:
+ pass
+cuda_available = False
+try:
+ if torch_available:
+ cuda_available = torch.cuda.is_available()
+ elif onnx_available:
+ import onnxruntime
+
+ cuda_available = onnxruntime.get_device() == "GPU"
+except ImportError:
+ pass
+
+
+@dataclass
+class BenchmarkResult:
+ model_name: str
+ device_name: Literal["cpu", "cuda"] = "cpu"
+ backend_name: Literal["torch", "onnx"] = "torch"
+ duration_image_preprocessing: float = 0
+ duration_image_embedding: float = 0
+ duration_text_preprocessing: float = 0
+ duration_text_embedding: float = 0
+
+
+def duration(callable):
+ """Profile the duration of a callable and return the duration and the result."""
+ start = perf_counter()
+ result = callable()
+ stop = perf_counter()
+ return stop - start, result
+
+
+def get_captioned_images() -> List[Tuple[Image.Image, str]]:
+ """Get a list of pre-downloaded and decoded images and their captions."""
+ image_urls = [
+ "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ ]
+ images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]
+ captions = [
+ "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field",
+ "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta",
+ "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank",
+ "asian girl sleeping in a bed. top down view",
+ "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
+ ]
+ return list(zip(images, captions))
+
+
+def yield_benchmarks() -> Generator[Tuple[BenchmarkResult, Callable], None, None]:
+ """Yields callable benchmarks for all supported backends of the given model."""
+
+ # Pull the content and artificially grow the batch size
+ images, captions = zip(*get_captioned_images())
+ images *= 10
+ captions *= 10
+
+ def run(model_name: str, device: str, backend_name: str):
+ result = BenchmarkResult(
+ model_name=model_name,
+ backend_name=backend_name,
+ device_name=device,
+ duration_image_preprocessing=0,
+ duration_image_embedding=0,
+ duration_text_preprocessing=0,
+ duration_text_embedding=0,
+ )
+
+ processors, models = get_model(
+ model_name,
+ device=device,
+ modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER],
+ backend=backend_name,
+ )
+
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
+ # Image preprocessing
+ total_duration = 0
+ total_iterations = 0
+ while total_duration < 10:
+ seconds, _ = duration(lambda: processor_image(images))
+ total_duration += seconds
+ total_iterations += len(images)
+ duration_per_iteration = total_duration / total_iterations
+ result.duration_image_preprocessing = duration_per_iteration
+
+ # Image embedding
+ total_duration = 0
+ total_iterations = 0
+ while total_duration < 10:
+ images_data = processor_image(images)
+ seconds, _ = duration(lambda: model_image.encode(images_data))
+ total_duration += seconds
+ total_iterations += len(images)
+ duration_per_iteration = total_duration / total_iterations
+ result.duration_image_embedding = duration_per_iteration
+
+ # Text preprocessing
+ total_duration = 0
+ total_iterations = 0
+ while total_duration < 10:
+ seconds, _ = duration(lambda: processor_text(captions))
+ total_duration += seconds
+ total_iterations += len(captions)
+ duration_per_iteration = total_duration / total_iterations
+ result.duration_text_preprocessing = duration_per_iteration
+
+ # Text embedding
+ total_duration = 0
+ total_iterations = 0
+ while total_duration < 10:
+ texts_data = processor_text(captions)
+ seconds, _ = duration(lambda: model_text.encode(texts_data))
+ total_duration += seconds
+ total_iterations += len(captions)
+ duration_per_iteration = total_duration / total_iterations
+ result.duration_text_embedding = duration_per_iteration
+
+ return result
+
+ devices = ["cpu"]
+ if cuda_available:
+ devices.append("cuda")
+ backends = []
+ if torch_available:
+ backends.append("torch")
+ if onnx_available:
+ backends.append("onnx")
+
+ for device in devices:
+ for backend_name in backends:
+ for model_name in [
+ "unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
+ ]:
+ yield BenchmarkResult(
+ model_name=model_name,
+ device_name=device,
+ backend_name=backend_name,
+ ), partial(run, model_name, device, backend_name)
+
+
+def main(filter: str = None):
+ results = []
+ filter_pattern = re.compile(filter) if filter else None
+ for specs, func in yield_benchmarks():
+ if filter_pattern and (
+ not filter_pattern.search(specs.model_name)
+ and not filter_pattern.search(specs.backend_name)
+ and not filter_pattern.search(specs.device_name)
+ ):
+ continue
+
+ print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend")
+ result = func()
+ results.append(result)
+
+ results = sorted(results, key=lambda x: x.model_name)
+ results = [x.__dict__ for x in results]
+
+ df = pd.DataFrame(results)
+ print(df.to_markdown())
+
+
+if __name__ == "__main__":
+ fire.Fire(main)
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
index 7046217..274ed6c 100644
--- a/python/scripts/test_encoders.py
+++ b/python/scripts/test_encoders.py
@@ -117,7 +117,7 @@ def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embed
@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
@pytest.mark.parametrize("model_name", torch_models)
def test_torch_one_embedding(model_name: str):
- processors, models = get_model(model_name, token=token)
+ processors, models = get_model(model_name, token=token, backend="torch")
model_text = models[Modality.TEXT_ENCODER]
model_image = models[Modality.IMAGE_ENCODER]
processor_text = processors[Modality.TEXT_ENCODER]
@@ -130,8 +130,8 @@ def test_torch_one_embedding(model_name: str):
image_data = processor_image(image)
text_data = processor_text(text)
- image_features, image_embedding = model_image.forward(image_data, return_features=True)
- text_features, text_embedding = model_text.forward(text_data, return_features=True)
+ image_features, image_embedding = model_image.encode(image_data, return_features=True)
+ text_features, text_embedding = model_text.encode(text_data, return_features=True)
assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
@@ -148,7 +148,7 @@ def test_torch_one_embedding(model_name: str):
@pytest.mark.parametrize("batch_size", [1, 2])
def test_torch_many_embeddings(model_name: str, batch_size: int):
- processors, models = get_model(model_name, token=token)
+ processors, models = get_model(model_name, token=token, backend="torch")
model_text = models[Modality.TEXT_ENCODER]
model_image = models[Modality.IMAGE_ENCODER]
processor_text = processors[Modality.TEXT_ENCODER]
@@ -161,8 +161,8 @@ def test_torch_many_embeddings(model_name: str, batch_size: int):
image_data = processor_image(images)
text_data = processor_text(texts)
- image_embeddings = model_image.forward(image_data, return_features=False)
- text_embeddings = model_text.forward(text_data, return_features=False)
+ image_embeddings = model_image.encode(image_data, return_features=False)
+ text_embeddings = model_text.encode(text_data, return_features=False)
assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
@@ -177,7 +177,7 @@ def test_onnx_one_embedding(model_name: str, device: str):
try:
- processors, models = get_model_onnx(model_name, token=token, device=device)
+ processors, models = get_model(model_name, token=token, device=device, backend="onnx")
model_text = models[Modality.TEXT_ENCODER]
model_image = models[Modality.IMAGE_ENCODER]
processor_text = processors[Modality.TEXT_ENCODER]
@@ -190,19 +190,19 @@ def test_onnx_one_embedding(model_name: str, device: str):
image_data = processor_image(image)
text_data = processor_text(text)
- image_features, image_embedding = model_image(image_data)
- text_features, text_embedding = model_text(text_data)
+ image_features, image_embedding = model_image.encode(image_data)
+ text_features, text_embedding = model_text.encode(text_data)
assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
    # Nested functions are easier to debug than lambdas
def get_image_embedding(image_data):
- features, embedding = model_image(processor_image(image_data))
+ features, embedding = model_image.encode(processor_image(image_data))
return embedding
def get_text_embedding(text_data):
- features, embedding = model_text(processor_text(text_data))
+ features, embedding = model_text.encode(processor_text(text_data))
return embedding
# Test if the model outputs actually make sense
@@ -222,7 +222,7 @@ def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
try:
- processors, models = get_model_onnx(model_name, token=token, device=device)
+ processors, models = get_model(model_name, token=token, device=device, backend="onnx")
model_text = models[Modality.TEXT_ENCODER]
model_image = models[Modality.IMAGE_ENCODER]
processor_text = processors[Modality.TEXT_ENCODER]
@@ -235,8 +235,8 @@ def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
image_data = processor_image(images)
text_data = processor_text(texts)
- image_embeddings = model_image(image_data, return_features=False)
- text_embeddings = model_text(text_data, return_features=False)
+ image_embeddings = model_image.encode(image_data, return_features=False)
+ text_embeddings = model_text.encode(text_data, return_features=False)
assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 841440f..2be45ed 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -84,10 +84,11 @@ def get_checkpoint(
return config_path, modality_paths, tokenizer_path
-def get_model(
+def get_model_torch(
model_name: str,
*,
token: Optional[str] = None,
+ device: Literal["cpu", "cuda"] = "cpu",
modalities: Optional[Tuple[Union[str, Modality]]] = None,
) -> Tuple[Dict[Modality, Callable], Dict]:
from uform.torch_encoders import TextEncoder, ImageEncoder
@@ -101,13 +102,15 @@ def get_model(
if Modality.TEXT_ENCODER in modalities:
processor = TextProcessor(config_path, tokenizer_path)
- encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER)).eval()
+ encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER))
+ encoder = encoder.eval().to(device)
result_processors[Modality.TEXT_ENCODER] = processor
result_models[Modality.TEXT_ENCODER] = encoder
if Modality.IMAGE_ENCODER in modalities:
processor = ImageProcessor(config_path)
- encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER)).eval()
+ encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER))
+ encoder = encoder.eval().to(device)
result_processors[Modality.IMAGE_ENCODER] = processor
result_models[Modality.IMAGE_ENCODER] = encoder
@@ -143,3 +146,20 @@ def get_model_onnx(
result_models[Modality.IMAGE_ENCODER] = encoder
return result_processors, result_models
+
+
+def get_model(
+ model_name: str,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu", # change this if you have a GPU
+ backend: Literal["onnx", "torch"] = "onnx", # lighter = better
+ modalities: Optional[Tuple[str, Modality]] = None, # all by default
+ token: Optional[str] = None, # optional HuggingFace Hub token for private models
+) -> Tuple[Dict[Modality, Callable], Dict]:
+
+ if backend == "onnx":
+ return get_model_onnx(model_name, device=device, token=token, modalities=modalities)
+ elif backend == "torch":
+ return get_model_torch(model_name, device=device, token=token, modalities=modalities)
+ else:
+ raise ValueError(f"Unknown backend: {backend}")
diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py
index 027bc0d..3782c26 100644
--- a/python/uform/numpy_processors.py
+++ b/python/uform/numpy_processors.py
@@ -1,5 +1,5 @@
from os import PathLike
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Sequence
import json
from PIL.Image import Image, BICUBIC
@@ -23,7 +23,7 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
self._tokenizer.no_padding()
self._pad_token_idx = config["padding_idx"]
- def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]:
+ def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]:
"""Transforms one or more strings into dictionary with tokenized strings and attention masks.
:param texts: text of list of texts to tokenizer
@@ -75,13 +75,13 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None):
self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None]
self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None]
- def __call__(self, images: Union[Image, List[Image]]) -> np.ndarray:
+ def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray:
"""Transforms one or more Pillow images into Torch Tensors.
:param images: image or list of images to preprocess
"""
- if isinstance(images, list):
+ if isinstance(images, Sequence):
batch_images = np.empty(
(len(images), 3, self._image_size, self._image_size),
dtype=np.float32,
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
index a6f27d3..0b88473 100644
--- a/python/uform/onnx_encoders.py
+++ b/python/uform/onnx_encoders.py
@@ -81,7 +81,7 @@ def __init__(
providers=available_providers(device),
)
- def __call__(
+ def encode(
self, images: ndarray, return_features: Optional[bool] = None
) -> Union[ndarray, Tuple[ndarray, ndarray]]:
features, embeddings = self.session.run(None, {"images": images})
@@ -114,7 +114,7 @@ def __init__(
providers=available_providers(device),
)
- def __call__(
+ def encode(
self,
x: Union[ndarray, dict],
attention_mask: Optional[ndarray] = None,
diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py
index 0504a74..1120926 100644
--- a/python/uform/torch_encoders.py
+++ b/python/uform/torch_encoders.py
@@ -2,7 +2,7 @@
from dataclasses import dataclass
from os import PathLike
-from typing import Dict, Optional, Union, Mapping, Any
+from typing import Dict, Optional, Union, Mapping, Any, Tuple
import json
import torch
@@ -256,7 +256,8 @@ def forward(
x: Union[Tensor, dict],
attention_mask: Optional[Tensor] = None,
return_features: Optional[bool] = None,
- ) -> Tensor:
+ ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+
if isinstance(x, dict):
assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None"
attention_mask = x["attention_mask"]
@@ -273,6 +274,19 @@ def forward(
return features, embeddings
return embeddings
+ def encode(
+ self,
+ x: Union[Tensor, dict],
+ attention_mask: Optional[Tensor] = None,
+ return_features: Optional[bool] = None,
+ ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+
+ result = self.forward(x, attention_mask, return_features)
+ if isinstance(result, tuple):
+ return result[0].detach(), result[1].detach()
+ else:
+ return result.detach()
+
@staticmethod
def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> TextEncoder:
"""Load the image encoder from the given configuration and model path.
@@ -361,6 +375,13 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return features, embeddings
return embeddings
+ def encode(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
+ result = self.forward(x, return_features)
+ if isinstance(result, tuple):
+ return result[0].detach(), result[1].detach()
+ else:
+ return result.detach()
+
@staticmethod
def from_pretrained(
config: Union[PathLike, str, object],
diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py
index 32697ca..b61b224 100644
--- a/python/uform/torch_processors.py
+++ b/python/uform/torch_processors.py
@@ -1,5 +1,5 @@
from os import PathLike
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Sequence
import json
import torch
@@ -100,14 +100,14 @@ def __init__(self, config_path: PathLike):
],
)
- def __call__(self, images: Union[Image, List[Image]]) -> Dict[str, Tensor]:
+ def __call__(self, images: Union[Image, Sequence[Image]]) -> Dict[str, Tensor]:
"""Transforms one or more Pillow images into Torch Tensors.
:param images: image or list of images to preprocess
:return: dictionary with float-represented images in tensors as values
"""
- if isinstance(images, list):
+ if isinstance(images, Sequence):
batch_images = torch.empty(
(len(images), 3, self._image_size, self._image_size),
dtype=torch.float32,
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
index 3582e91..2f1e7c1 100644
--- a/swift/Encoders.swift
+++ b/swift/Encoders.swift
@@ -129,11 +129,13 @@ public class TextEncoder {
)
let configPath = modelURL.appendingPathComponent("config.json").path
let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path
- self.model = try readModel(fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true))
+ self.model = try readModel(
+ fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true)
+ )
self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model)
}
- public func forward(with text: String) throws -> Embedding {
+ public func encode(with text: String) throws -> Embedding {
let inputFeatureProvider = try self.processor.preprocess(text)
let prediction = try self.model.prediction(from: inputFeatureProvider)
guard let predictionFeature = prediction.featureValue(for: "embeddings"),
@@ -164,11 +166,13 @@ public class ImageEncoder {
let repo = Hub.Repo(id: modelName)
let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"])
let configPath = modelURL.appendingPathComponent("config.json").path
- self.model = try readModel(fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true))
+ self.model = try readModel(
+ fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true)
+ )
self.processor = try ImageProcessor(configPath: configPath)
}
- public func forward(with image: CGImage) throws -> Embedding {
+ public func encode(with image: CGImage) throws -> Embedding {
let inputFeatureProvider = try self.processor.preprocess(image)
let prediction = try self.model.prediction(from: inputFeatureProvider)
guard let predictionFeature = prediction.featureValue(for: "embeddings"),
diff --git a/swift/EncodersTests.swift b/swift/EncodersTests.swift
index 0395a29..645d531 100644
--- a/swift/EncodersTests.swift
+++ b/swift/EncodersTests.swift
@@ -55,7 +55,7 @@ final class TokenizerTests: XCTestCase {
var textEmbeddings: [[Float32]] = []
for text in texts {
- let embedding: [Float32] = try textModel.forward(with: text).asFloats()
+ let embedding: [Float32] = try textModel.encode(text).asFloats()
textEmbeddings.append(embedding)
}
@@ -141,9 +141,9 @@ final class TokenizerTests: XCTestCase {
)
}
- let textEmbedding: [Float32] = try textModel.forward(with: text).asFloats()
+ let textEmbedding: [Float32] = try textModel.encode(text).asFloats()
textEmbeddings.append(textEmbedding)
- let imageEmbedding: [Float32] = try imageModel.forward(with: cgImage).asFloats()
+ let imageEmbedding: [Float32] = try imageModel.encode(cgImage).asFloats()
imageEmbeddings.append(imageEmbedding)
}
diff --git a/swift/README.md b/swift/README.md
index 66b531f..8fa0eb8 100644
--- a/swift/README.md
+++ b/swift/README.md
@@ -1,4 +1,4 @@
-# UForm for Swift
+# UForm Swift SDK
UForm offers first-party support for Swift.
To get started, add UForm to your project using Swift Package Manager.
@@ -21,7 +21,7 @@ import UForm
```swift
let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
-let textEmbedding: Embedding = try textModel.forward(with: text)
+let textEmbedding: Embedding = try textModel.encode(text)
let textVector: [Float32] = textEmbedding.asFloats()
```
@@ -36,9 +36,38 @@ guard let url = URL(string: imageURL),
throw Exception("Could not load image from URL: \(imageURL)")
}
-var imageEmbedding: Embedding = try imageModel.forward(with: cgImage)
+var imageEmbedding: Embedding = try imageModel.encode(cgImage)
var imageVector: [Float32] = imageEmbedding.asFloats()
```
+### Computing Distances
-### Computing Distances
\ No newline at end of file
+There are several ways to compute distances between embeddings, once you have them.
+Naive Swift code might look like this:
+
+```swift
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+ let dotProduct = zip(a, b).map(*).reduce(0, +)
+ let normA = sqrt(a.map { $0 * $0 }.reduce(0, +))
+ let normB = sqrt(b.map { $0 * $0 }.reduce(0, +))
+ return dotProduct / (normA * normB)
+}
+```
+
+A faster way to compute distances is to use the Accelerate framework:
+
+```swift
+import Accelerate
+
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+ var result: Float32 = 0
+ var aNorm: Float32 = 0
+ var bNorm: Float32 = 0
+ vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count))
+ vDSP_svesq(a, 1, &aNorm, vDSP_Length(a.count))
+ vDSP_svesq(b, 1, &bNorm, vDSP_Length(b.count))
+ return result / sqrt(aNorm * bNorm)
+}
+```
+
+An even faster approach is to use USearch or SimSIMD, which work not only with `Float32` and `Float64`, but also with `Float16`, `Int8`, and binary embeddings.
From 3e1e57664a71290a57ff8e115d8cfea2fa6c501e Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 11:56:40 -0700
Subject: [PATCH 24/40] Improve: Error handling in Swift
---
swift/Encoders.swift | 264 ++++++++++++++++++++++++++++---------------
1 file changed, 174 insertions(+), 90 deletions(-)
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
index 3582e91..17da36a 100644
--- a/swift/Encoders.swift
+++ b/swift/Encoders.swift
@@ -11,21 +11,25 @@ import Foundation
import Hub // `Config`
import Tokenizers // `AutoTokenizer`
+/// Defines custom errors related to the encoder's functionality.
enum EncoderError: Error {
- case configLoadingError(String)
- case modelLoadingError(String)
- case unsupportedDataType
- case invalidInput
- case unsupportedShapeConstraint
+ case downloadError(String)
+ case loadingError(String)
+ case invalidInput(String)
case modelPredictionFailed(String)
+ case unknownError(String)
}
+/// Represents different types of embeddings as arrays of different numeric types.
public enum Embedding {
case i32s([Int32])
case f16s([Float16])
case f32s([Float32])
case f64s([Float64])
+ /// Initializes an embedding from a `MLMultiArray`.
+ /// - Parameter multiArray: The MLMultiArray to convert into an Embedding.
+ /// - Returns: nil if the data type is unsupported.
init?(from multiArray: MLMultiArray) {
switch multiArray.dataType {
case .float64:
@@ -65,51 +69,57 @@ public enum Embedding {
)
)
@unknown default:
- return nil // return nil for unsupported data types
+ return nil
}
}
+ /// Converts the embedding to an array of `Float`.
public func asFloats() -> [Float] {
switch self {
- case .f32s(let array):
- return array
- case .i32s(let array):
- return array.map { Float($0) }
- case .f16s(let array):
- return array.map { Float($0) }
- case .f64s(let array):
- return array.map { Float($0) }
+ case .f32s(let array): return array
+ case .i32s(let array): return array.map(Float.init)
+ case .f16s(let array): return array.map(Float.init)
+ case .f64s(let array): return array.map(Float.init)
}
}
}
-// MARK: - Helpers
-
+/// Provides methods for reading and handling configurations and models.
+/// - Parameter path: The file path where the configuration file is located.
+/// - Returns: A dictionary containing the configuration data.
func readConfig(fromPath path: String) throws -> [String: Any] {
- // If it's not an absolute path, let's assume it's a path relative to the current working directory
let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
let data = try Data(contentsOf: URL(fileURLWithPath: absPath))
return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any]
}
+/// Compiles and loads a machine learning model from a URL.
+/// - Parameter modelURL: The URL where the model package is located.
+/// - Returns: An instance of `MLModel`.
func readModel(fromURL modelURL: URL) throws -> MLModel {
let compiledModelURL = try MLModel.compileModel(at: modelURL)
return try MLModel(contentsOf: compiledModelURL)
}
+/// Loads a machine learning model from a local file path.
+/// - Parameter path: The file path where the model file is located.
+/// - Returns: An instance of `MLModel`.
func readModel(fromPath path: String) throws -> MLModel {
- // If it's not an absolute path, let's assume it's a path relative to the current working directory
let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
let modelURL = URL(fileURLWithPath: absPath, isDirectory: true)
return try readModel(fromURL: modelURL)
}
-// MARK: - Encoders
-
+/// Encodes text input into embeddings using a machine learning model.
public class TextEncoder {
let model: MLModel
let processor: TextProcessor
+ /// Initializes a `TextEncoder` using paths for the model and configuration.
+ /// - Parameters:
+ /// - modelPath: The path to the directory containing the machine learning model.
+ /// - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory.
+ /// - tokenizerPath: Optional. The path to the tokenizer file. Defaults to tokenizer.json in the model directory.
public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws {
let finalConfigPath = configPath ?? modelPath + "/config.json"
let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json"
@@ -121,6 +131,10 @@ public class TextEncoder {
)
}
+ /// Initializes a `TextEncoder` using a model name and an API for fetching models.
+ /// - Parameters:
+ /// - modelName: The identifier for the model repository.
+ /// - hubApi: The API object to interact with the model hub. Defaults to a shared instance.
public init(modelName: String, hubApi: HubApi = .shared) async throws {
let repo = Hub.Repo(id: modelName)
let modelURL = try await hubApi.snapshot(
@@ -129,57 +143,68 @@ public class TextEncoder {
)
let configPath = modelURL.appendingPathComponent("config.json").path
let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path
- self.model = try readModel(fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true))
+ self.model = try readModel(
+ fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true)
+ )
self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model)
}
+ /// Processes text and returns embeddings. Throws an error if processing fails.
+ /// - Parameter text: The text input to encode.
+ /// - Returns: An `Embedding` object containing the model output.
public func forward(with text: String) throws -> Embedding {
let inputFeatureProvider = try self.processor.preprocess(text)
- let prediction = try self.model.prediction(from: inputFeatureProvider)
- guard let predictionFeature = prediction.featureValue(for: "embeddings"),
+ guard let prediction = try? self.model.prediction(from: inputFeatureProvider),
+ let predictionFeature = prediction.featureValue(for: "embeddings"),
let output = predictionFeature.multiArrayValue,
let embedding = Embedding(from: output)
else {
- throw NSError(
- domain: "TextEncoder",
- code: 0,
- userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."]
- )
+ throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.")
}
return embedding
}
}
+/// Encodes image input into embeddings using a machine learning model.
public class ImageEncoder {
let model: MLModel
let processor: ImageProcessor
+ /// Initializes an `ImageEncoder` using a path for the model and optionally a configuration file.
+ /// - Parameters:
+ /// - modelPath: The path to the directory containing the machine learning model.
+ /// - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory.
public init(modelPath: String, configPath: String? = nil) throws {
let finalConfigPath = configPath ?? modelPath + "/config.json"
self.model = try readModel(fromPath: modelPath)
self.processor = try ImageProcessor(configPath: finalConfigPath)
}
+ /// Initializes an `ImageEncoder` using a model name and an API for fetching models.
+ /// - Parameters:
+ /// - modelName: The identifier for the model repository.
+ /// - hubApi: The API object to interact with the model hub. Defaults to a shared instance.
public init(modelName: String, hubApi: HubApi = .shared) async throws {
let repo = Hub.Repo(id: modelName)
let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"])
let configPath = modelURL.appendingPathComponent("config.json").path
- self.model = try readModel(fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true))
+ self.model = try readModel(
+ fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true)
+ )
self.processor = try ImageProcessor(configPath: configPath)
}
+ /// Processes an image and returns embeddings. Throws an error if processing fails.
+ /// - Parameter image: The `CGImage` to encode.
+ /// - Returns: An `Embedding` object containing the model output.
public func forward(with image: CGImage) throws -> Embedding {
let inputFeatureProvider = try self.processor.preprocess(image)
- let prediction = try self.model.prediction(from: inputFeatureProvider)
- guard let predictionFeature = prediction.featureValue(for: "embeddings"),
+ guard let prediction = try? self.model.prediction(from: inputFeatureProvider),
+ let predictionFeature = prediction.featureValue(for: "embeddings"),
let output = predictionFeature.multiArrayValue,
let embedding = Embedding(from: output)
else {
- throw NSError(
- domain: "ImageEncoder",
- code: 0,
- userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."]
- )
+ throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.")
}
return embedding
}
@@ -187,11 +212,18 @@ public class ImageEncoder {
// MARK: - Processors
+/// Handles the preprocessing of text data to be used by a machine learning model.
class TextProcessor {
let tokenizer: Tokenizer
let minContextLength: Int
let maxContextLength: Int
+ /// Initializes a `TextProcessor` with specific configuration.
+ /// - Parameters:
+ /// - configPath: The path to the configuration file specifying tokenizer and model configurations.
+ /// - tokenizerPath: The path to the tokenizer configuration.
+ /// - model: The machine learning model to be used with this processor.
+ /// - Throws: An error if the configuration is invalid or missing necessary components.
public init(configPath: String, tokenizerPath: String, model: MLModel) throws {
var configDict = try readConfig(fromPath: configPath)
let tokenizerDict = try readConfig(fromPath: tokenizerPath)
@@ -201,60 +233,101 @@ class TextProcessor {
configDict = textEncoderConfig // Use the specific 'text_encoder' configuration
}
+ // Initialize the tokenizer with its configuration.
let config = Config(configDict)
let tokenizerData = Config(tokenizerDict)
self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData)
- let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"]
- guard let shapeConstraint = inputDescription?.multiArrayConstraint?.shapeConstraint else {
- fatalError("Cannot obtain shape information")
+ // Extract the model's input shape constraints.
+ guard let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"],
+ let multiArrayConstraint = inputDescription.multiArrayConstraint
+ else {
+ throw EncoderError.invalidInput("Cannot obtain shape information from the model.")
}
+ // Determine the context length constraints based on the model's input shape constraint.
+ let shapeConstraint = multiArrayConstraint.shapeConstraint
switch shapeConstraint.type {
case .enumerated:
minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue
maxContextLength = minContextLength
case .range:
- let range = inputDescription?.multiArrayConstraint?.shapeConstraint.sizeRangeForDimension[1] as? NSRange
- minContextLength = range?.location ?? 1
- maxContextLength = range?.length ?? 128
+ guard let range = shapeConstraint.sizeRangeForDimension[1] as? NSRange else {
+ throw EncoderError.unknownError("Model input shape has a range constraint that cannot be interpreted.")
+ }
+ minContextLength = range.location
+ maxContextLength = range.length
case .unspecified:
- minContextLength = 128
- maxContextLength = 128
+ throw EncoderError.unknownError("Model input shape is unspecified.")
@unknown default:
- minContextLength = 128
- maxContextLength = 128
+ throw EncoderError.unknownError("Unknown model input shape constraint type.")
}
}
+ /// Preprocesses a string of text into a format suitable for model prediction.
+ /// - Parameter text: The text to preprocess.
+ /// - Returns: A `MLFeatureProvider` containing the processed text ready for the model.
+ /// - Throws: An error if the text encoding fails.
public func preprocess(_ text: String) throws -> MLFeatureProvider {
let inputIDs = self.tokenizer.encode(text: text)
return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength)
}
}
+/// Handles the preprocessing of image data to be used by a machine learning model.
class ImageProcessor {
let imageSize: Int
- let mean: [Float] = [0.485, 0.456, 0.406] // Common mean values for normalization
- let std: [Float] = [0.229, 0.224, 0.225] // Common std values for normalization
+ let mean: [Float]
+ let std: [Float]
+ /// Initializes an `ImageProcessor` with specific configuration.
+ /// - Parameter configPath: The path to the configuration file specifying image size, mean, and std.
init(configPath: String) throws {
- var configDict = try readConfig(fromPath: configPath)
- // Check if there's a specific 'image_encoder' configuration within the main configuration
- if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] {
- configDict = imageEncoderConfig
+ let configDict = try readConfig(fromPath: configPath)
+ guard let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] else {
+ throw EncoderError.loadingError("Image encoder configuration is missing.")
}
- let config = Config(configDict)
- self.imageSize = config.imageSize!.intValue!
+ guard let imageSize = imageEncoderConfig["imageSize"] as? Int else {
+ throw EncoderError.invalidInput("Invalid or missing image size.")
+ }
+ self.imageSize = imageSize
+
+ guard let meanArray = imageEncoderConfig["normalizationMeans"] as? [Any],
+ let stdArray = imageEncoderConfig["normalizationDeviations"] as? [Any]
+ else {
+ throw EncoderError.invalidInput("Normalization means or deviations are missing.")
+ }
+
+ self.mean = try meanArray.compactMap({
+ guard let floatValue = $0 as? Float else {
+ throw EncoderError.invalidInput("Normalization means should be an array of floats.")
+ }
+ return floatValue
+ })
+
+ self.std = try stdArray.compactMap({
+ guard let floatValue = $0 as? Float else {
+ throw EncoderError.invalidInput("Normalization deviations should be an array of floats.")
+ }
+ return floatValue
+ })
+
+ // Check if the arrays have 3 values for the 3 channels
+ if self.mean.count != 3 || self.std.count != 3 {
+ throw EncoderError.invalidInput("Normalization means should contain 3 values.")
+ }
}
+ /// Preprocesses a `CGImage` into a format suitable for model prediction.
+ /// - Parameter cgImage: The image to preprocess.
+ /// - Returns: An `MLFeatureProvider` containing the preprocessed image data.
func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider {
- // Populate a tensor of size 3 x `imageSize` x `imageSize`,
- // by resizing the image, then performing a center crop.
- // Then normalize with the `mean` and `std` and export as a provider.
- let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize)!
- let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)!
+ guard let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize),
+ let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)
+ else {
+ throw EncoderError.invalidInput("Image preprocessing failed.")
+ }
let featureValue = MLFeatureValue(multiArray: normalized)
return try ImageInput(precomputedFeature: featureValue)
}
@@ -263,7 +336,6 @@ class ImageProcessor {
let originalWidth = CGFloat(image.width)
let originalHeight = CGFloat(image.height)
- // Calculate new size preserving the aspect ratio
let widthRatio = CGFloat(imageSize) / originalWidth
let heightRatio = CGFloat(imageSize) / originalHeight
let scaleFactor = max(widthRatio, heightRatio)
@@ -271,7 +343,6 @@ class ImageProcessor {
let scaledWidth = originalWidth * scaleFactor
let scaledHeight = originalHeight * scaleFactor
- // Calculate the crop rectangle
let dx = (scaledWidth - CGFloat(imageSize)) / 2.0
let dy = (scaledHeight - CGFloat(imageSize)) / 2.0
guard
@@ -299,18 +370,19 @@ class ImageProcessor {
// Prepare the bitmap context for drawing the image.
var pixelData = [UInt8](repeating: 0, count: width * height * 4)
let colorSpace = CGColorSpaceCreateDeviceRGB()
- let context = CGContext(
- data: &pixelData,
- width: width,
- height: height,
- bitsPerComponent: 8,
- bytesPerRow: 4 * width,
- space: colorSpace,
- bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
- )
- context?.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
+ guard
+ let context = CGContext(
+ data: &pixelData,
+ width: width,
+ height: height,
+ bitsPerComponent: 8,
+ bytesPerRow: 4 * width,
+ space: colorSpace,
+ bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+ )
+ else { return nil }
+ context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
- // Normalize the pixel data
var floatPixels = [Float](repeating: 0, count: width * height * 3)
for c in 0 ..< 3 {
for i in 0 ..< (width * height) {
@@ -318,33 +390,36 @@ class ImageProcessor {
}
}
- // Create the tensor array
- var tensor = [Float](repeating: 0, count: 3 * width * height)
- for i in 0 ..< (width * height) {
- for c in 0 ..< 3 {
- tensor[c * width * height + i] = floatPixels[i * 3 + c]
+ // We need to wrap the constructor that may fail
+ do {
+ let tensor = try MLMultiArray(
+ shape: [1, 3, NSNumber(value: height), NSNumber(value: width)],
+ dataType: .float32
+ )
+ for i in 0 ..< floatPixels.count {
+ tensor[i] = NSNumber(value: floatPixels[i])
}
+ return tensor
}
-
- let multiArray = try? MLMultiArray(
- shape: [1, 3, NSNumber(value: height), NSNumber(value: width)],
- dataType: .float32
- )
- for i in 0 ..< tensor.count {
- multiArray?[i] = NSNumber(value: tensor[i])
+ catch {
+ return nil
}
- return multiArray
}
-
}
// MARK: - Feature Providers
+/// Provides features for text input to a machine learning model, handling padding and attention mask generation.
class TextInput: MLFeatureProvider {
var inputIDs: [Int]
var sequenceLength: Int
var paddingID: Int
+ /// Initializes a new instance for providing text input features.
+ /// - Parameters:
+ /// - inputIDs: Array of integer IDs representing the encoded text.
+ /// - sequenceLength: The fixed length to which the input sequence should be padded.
+ /// - paddingID: The integer ID used for padding shorter sequences. Defaults to 0.
init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) {
self.inputIDs = inputIDs
self.sequenceLength = sequenceLength
@@ -355,8 +430,9 @@ class TextInput: MLFeatureProvider {
return Set(["input_ids", "attention_mask"])
}
- // The model expects the input IDs to be an array of integers
- // of length `sequenceLength`, padded with `paddingID` if necessary
+ /// Returns the feature value for the specified feature name.
+ /// - Parameter featureName: The name of the feature for which the value is requested.
+ /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature.
func featureValue(for featureName: String) -> MLFeatureValue? {
switch featureName {
case "input_ids", "attention_mask":
@@ -366,6 +442,9 @@ class TextInput: MLFeatureProvider {
}
}
+ /// Creates the feature value for input IDs or attention mask based on the specified feature name.
+ /// - Parameter featureName: The name of the feature.
+ /// - Returns: An `MLFeatureValue` if the array can be created, otherwise nil.
private func createFeatureValue(for featureName: String) -> MLFeatureValue? {
let count = min(inputIDs.count, sequenceLength)
let totalElements = sequenceLength
@@ -394,9 +473,13 @@ class TextInput: MLFeatureProvider {
}
}
+/// Provides a precomputed feature for image inputs to a machine learning model.
class ImageInput: MLFeatureProvider {
var precomputedFeature: MLFeatureValue
+ /// Initializes a new instance with a precomputed feature.
+ /// - Parameter precomputedFeature: The `MLFeatureValue` containing the precomputed feature data.
+ /// - Throws: An error if the precomputed feature is not valid for the model.
init(precomputedFeature: MLFeatureValue) throws {
self.precomputedFeature = precomputedFeature
}
@@ -405,8 +488,9 @@ class ImageInput: MLFeatureProvider {
return Set(["images"])
}
- // The model expects the input IDs to be an array of integers
- // of length `sequenceLength`, padded with `paddingID` if necessary
+ /// Returns the feature value for the specified feature name.
+ /// - Parameter featureName: The name of the feature for which the value is requested.
+ /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature.
func featureValue(for featureName: String) -> MLFeatureValue? {
switch featureName {
case "images":
From f8654b50204c1c5878be68cf3d735b82ba19a9a5 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 15:23:26 -0700
Subject: [PATCH 25/40] Improve: Image pre-processing in Swift
---
swift/Encoders.swift | 33 ++++++++++++++++++---------------
1 file changed, 18 insertions(+), 15 deletions(-)
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
index a1c3ce2..509ad11 100644
--- a/swift/Encoders.swift
+++ b/swift/Encoders.swift
@@ -283,34 +283,35 @@ class ImageProcessor {
/// Initializes an `ImageProcessor` with specific configuration.
/// - Parameter configPath: The path to the configuration file specifying image size, mean, and std.
init(configPath: String) throws {
- let configDict = try readConfig(fromPath: configPath)
- guard let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] else {
- throw EncoderError.loadingError("Image encoder configuration is missing.")
+ var configDict = try readConfig(fromPath: configPath)
+ if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] {
+ configDict = imageEncoderConfig
}
- guard let imageSize = imageEncoderConfig["imageSize"] as? Int else {
+ let config = Config(configDict)
+ guard let imageSize = config.imageSize?.value as? Int else {
throw EncoderError.invalidInput("Invalid or missing image size.")
}
self.imageSize = imageSize
- guard let meanArray = imageEncoderConfig["normalizationMeans"] as? [Any],
- let stdArray = imageEncoderConfig["normalizationDeviations"] as? [Any]
+ guard let meanArray = config.normalizationMeans?.value as? [Any],
+ let stdArray = config.normalizationDeviations?.value as? [Any]
else {
throw EncoderError.invalidInput("Normalization means or deviations are missing.")
}
self.mean = try meanArray.compactMap({
- guard let floatValue = $0 as? Float else {
+ guard let doubleValue = $0 as? Double else {
throw EncoderError.invalidInput("Normalization means should be an array of floats.")
}
- return floatValue
+ return Float(doubleValue)
})
self.std = try stdArray.compactMap({
- guard let floatValue = $0 as? Float else {
+ guard let doubleValue = $0 as? Double else {
throw EncoderError.invalidInput("Normalization deviations should be an array of floats.")
}
- return floatValue
+ return Float(doubleValue)
})
// Check if the arrays have 3 values for the 3 channels
@@ -383,11 +384,13 @@ class ImageProcessor {
else { return nil }
context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
- var floatPixels = [Float](repeating: 0, count: width * height * 3)
- for c in 0 ..< 3 {
- for i in 0 ..< (width * height) {
- floatPixels[i * 3 + c] = (Float(pixelData[i * 4 + c]) / 255.0 - mean[c]) / std[c]
- }
+ // While normalizing the pixels, let's also transpose them from HWC to CHW
+ let channelSize = width * height
+ var floatPixels = [Float](repeating: 0, count: channelSize * 3)
+ for i in 0 ..< channelSize {
+ floatPixels[channelSize * 0 + i] = (Float(pixelData[i * 4 + 0]) / 255.0 - mean[0]) / std[0]
+ floatPixels[channelSize * 1 + i] = (Float(pixelData[i * 4 + 1]) / 255.0 - mean[1]) / std[1]
+ floatPixels[channelSize * 2 + i] = (Float(pixelData[i * 4 + 2]) / 255.0 - mean[2]) / std[2]
}
// We need to wrap the constructor that may fail
From 37d7f52b863bebf2cae27001e2f53c0cbb860191 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 22:40:01 +0000
Subject: [PATCH 26/40] Improve: Hide temporary files
---
.gitignore | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index fd8d9d0..fc16361 100755
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,7 @@ test
src/__pycache__
src/test.py
build/
-package-lock.json
\ No newline at end of file
+package-lock.json
+
+dictionary*
+vocab*
\ No newline at end of file
From 67b083f09bf98675396f1b7cd462a93e18019986 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 23:32:22 +0000
Subject: [PATCH 27/40] Improve: Pretty-print benchmarks
---
python/scripts/bench_encoders.py | 52 ++++++++++++++++++++++++--------
1 file changed, 40 insertions(+), 12 deletions(-)
diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py
index 6b59d05..a8adb91 100644
--- a/python/scripts/bench_encoders.py
+++ b/python/scripts/bench_encoders.py
@@ -22,13 +22,13 @@
from dataclasses import dataclass
from typing import List, Tuple, Literal, Callable, Generator
import re
+import argparse
-import fire
import requests
from PIL import Image
import pandas as pd
-from uform import get_model, get_model_onnx, Modality
+from uform import get_model, Modality, ExecutionProviderError
# Define global constants for the hardware availability
torch_available = False
@@ -195,27 +195,55 @@ def run(model_name: str, device: str, backend_name: str):
), partial(run, model_name, device, backend_name)
-def main(filter: str = None):
+def main(filter_out: str = None):
results = []
- filter_pattern = re.compile(filter) if filter else None
+ filter_pattern = re.compile(filter_out) if filter_out else None
for specs, func in yield_benchmarks():
if filter_pattern and (
- not filter_pattern.search(specs.model_name)
- and not filter_pattern.search(specs.backend_name)
- and not filter_pattern.search(specs.device_name)
+ filter_pattern.search(specs.model_name)
+ or filter_pattern.search(specs.backend_name)
+ or filter_pattern.search(specs.device_name)
):
continue
- print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend")
- result = func()
- results.append(result)
+ try:
+ print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend")
+ result = func()
+ results.append(result)
+ except ExecutionProviderError as e:
+ print(f"- skipping missing backend")
+ print(e)
results = sorted(results, key=lambda x: x.model_name)
results = [x.__dict__ for x in results]
df = pd.DataFrame(results)
- print(df.to_markdown())
+ df.columns = [
+ "Model Name",
+ "Device",
+ "Backend",
+ "Images Preprocessed/s",
+ "Images Encoded/s",
+ "Texts Preprocessed/s",
+ "Texts Encoded/s",
+ ]
+
+ def inverse(x):
+ return 1 / x if x != 0 else 0
+
+ # Apply number formatting directly in the DataFrame
+ formatted_df = df.copy()
+ formatted_df["Images Preprocessed/s"] = df["Images Preprocessed/s"].map(inverse).map("{:,.2f}".format)
+ formatted_df["Images Encoded/s"] = df["Images Encoded/s"].map(inverse).map("{:,.2f}".format)
+ formatted_df["Texts Preprocessed/s"] = df["Texts Preprocessed/s"].map(inverse).map("{:,.2f}".format)
+ formatted_df["Texts Encoded/s"] = df["Texts Encoded/s"].map(inverse).map("{:,.2f}".format)
+
+ # Convert formatted DataFrame to Markdown
+ print(formatted_df.to_markdown())
if __name__ == "__main__":
- fire.Fire(main)
+ argparse = argparse.ArgumentParser()
+ argparse.add_argument("--filter-out", type=str, default=None)
+ args = argparse.parse_args()
+ main(filter_out=args.filter_out)
From 8e38b2e8005728e92768fb26cea76a2689542203 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 23:35:26 +0000
Subject: [PATCH 28/40] Make: Add development dependencies
---
CONTRIBUTING.md | 17 ++++++++++++++---
pyproject.toml | 1 +
2 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ceafee9..65e0b26 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -7,12 +7,11 @@ We welcome contributions to UForm!
Before submitting any changes, please make sure that the tests pass.
```sh
-pip install -e . # For core dependencies
-
+pip install -e ".[dev]" # For development dependencies
pip install -e ".[torch]" # For PyTorch
pip install -e ".[onnx]" # For ONNX on CPU
pip install -e ".[onnx-gpu]" # For ONNX on GPU, available for some platforms
-pip install -e ".[torch,onnx]" # For PyTorch and ONNX Python tests
+pip install -e ".[torch,onnx,onnx-gpu,dev]" # For all
pytest python/scripts/ -s -x -Wd -v
pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch
@@ -53,3 +52,15 @@ Before submitting any changes, please make sure that the tests pass.
npm install
npm run test
```
+
+## Benchmarking
+
+If you want to double-check how fast the models run on your hardware, you can clone the repository and repeat the benchmarks locally.
+The following command excludes the PyTorch backend, CUDA-capable devices, and all the `-base` and `-large` models, running only the ONNX benchmarks on the CPU.
+
+```sh
+git clone https://github.com/unum-cloud/uform --depth 1 # Clone the repository
+cd uform && pip install -e ".[torch,onnx,onnx-gpu,dev]" # Install all dependencies
+python python/scripts/bench_encoders.py --filter-out "torch|cuda|base|large"
+```
+
diff --git a/pyproject.toml b/pyproject.toml
index 1a84808..fef02d3 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ uform-chat = "uform.chat:main"
torch = ["torch>=1.13.1", "torchvision", "transformers>=4.36.2"]
onnx = ["onnx>=1.15.0", "onnxruntime>=1.17.1", "numpy"]
onnx-gpu = ["onnx>=1.15.0", "onnxruntime-gpu>=1.17.1", "numpy"]
+dev = ["pytest", "pandas"]
[project.urls]
"Homepage" = "https://github.com/unum-cloud/uform"
From 96df21d3a33f703a641596fdb80a6cca9cbac15b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 23:52:56 +0000
Subject: [PATCH 29/40] Improve: Reduce warnings
---
python/uform/__init__.py | 11 +++++++++--
python/uform/onnx_encoders.py | 10 +++++++---
2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 2be45ed..8f0a30b 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,9 +1,10 @@
-from json import load
from os.path import join, exists
from typing import Dict, Optional, Tuple, Literal, Union, Callable
from enum import Enum
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, utils
+
+from uform.onnx_encoders import ExecutionProviderError
class Modality(Enum):
@@ -44,6 +45,9 @@ def get_checkpoint(
config_names = ["torch_config.json", "config.json"]
tokenizer_names = ["tokenizer.json"]
+ old_progress_behavior = utils.are_progress_bars_disabled()
+ utils.disable_progress_bars()
+
# The download stats depend on the number of times the `config.json` is pulled
# https://huggingface.co/docs/hub/models-download-stats
model_path = snapshot_download(
@@ -52,6 +56,9 @@ def get_checkpoint(
allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names,
)
+ if old_progress_behavior:
+ utils.enable_progress_bars()
+
# Find the first name in `config_names` that is present
config_path = None
for config_name in config_names:
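
The patch above silences Hugging Face Hub progress bars around `snapshot_download` and then restores the previous setting. The same pattern can be wrapped into a small context manager; a sketch assuming only the public `huggingface_hub.utils` helpers:

```python
from contextlib import contextmanager
from huggingface_hub import utils

@contextmanager
def quiet_progress_bars():
    """Disable Hub progress bars, restoring the previous behavior afterwards."""
    was_disabled = utils.are_progress_bars_disabled()
    utils.disable_progress_bars()
    try:
        yield
    finally:
        if not was_disabled:
            utils.enable_progress_bars()

# Usage: any `snapshot_download(...)` call inside the block stays silent
# with quiet_progress_bars():
#     snapshot_download(repo_id="unum-cloud/uform3-image-text-english-small")
```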
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
index 0b88473..d2668b9 100644
--- a/python/uform/onnx_encoders.py
+++ b/python/uform/onnx_encoders.py
@@ -39,7 +39,7 @@ def available_providers(device: Optional[str]) -> Tuple[str, ...]:
raise ExecutionProviderError(
f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}"
)
- return gpu_providers
+ return [x for x in gpu_providers if x in available]
# If a CPU is requested, but no CPU providers are available, raise an error
if device == "cpu":
@@ -47,7 +47,7 @@ def available_providers(device: Optional[str]) -> Tuple[str, ...]:
raise ExecutionProviderError(
f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. Currently installed: {available}"
)
- return cpu_providers
+ return [x for x in cpu_providers if x in available]
if device not in available:
available_providers = ", ".join(available)
@@ -128,7 +128,11 @@ def encode(
input_ids = x
features, embeddings = self.text_encoder_session.run(
- None, {"input_ids": input_ids, "attention_mask": attention_mask}
+ None,
+ {
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ },
)
return_features = return_features if return_features is not None else self.return_features
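
The provider fix above narrows the returned list to the intersection of the preferred providers and whatever ONNX Runtime reports as actually installed. A minimal self-contained sketch of that intersection logic; the preference list and the locally redefined exception are illustrative, not the library's exact API:

```python
from typing import Sequence, Tuple
import onnxruntime as ort

class ExecutionProviderError(Exception):
    """Raised when none of the requested ONNX execution providers is installed."""

def pick_providers(preferred: Sequence[str]) -> Tuple[str, ...]:
    available = ort.get_available_providers()
    matching = tuple(x for x in preferred if x in available)
    if not matching:
        raise ExecutionProviderError(
            f"None of {preferred} is installed; available providers: {available}"
        )
    return matching

# Example preference order for a GPU-first setup (an assumption, not the library's list)
print(pick_providers(["CUDAExecutionProvider", "CPUExecutionProvider"]))
```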
From 91c86a1cb62c0d5f6b2573014401167e307f86db Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Tue, 23 Apr 2024 23:53:29 +0000
Subject: [PATCH 30/40] Improve: Move inputs to same device as model
---
python/uform/torch_encoders.py | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py
index 1120926..ed413a8 100644
--- a/python/uform/torch_encoders.py
+++ b/python/uform/torch_encoders.py
@@ -266,6 +266,11 @@ def forward(
# If no attention mask is provided - create one with all ones
attention_mask = torch.ones_like(x)
+ # If the model is on the GPU and the input matrices are not, shift them there
+ if next(self.parameters()).device.type == "cuda" and x.device.type != "cuda":
+ x = x.cuda()
+ attention_mask = attention_mask.cuda()
+
features = self.forward_features(x, attention_mask)
embeddings = self.forward_embedding(features, attention_mask)
@@ -368,6 +373,11 @@ def forward_embedding(self, x: Tensor) -> Tensor:
def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
if isinstance(x, dict):
x = x["images"]
+
+ # If the model is on the GPU and the input matrices are not, shift them there
+ if next(self.parameters()).device.type == "cuda" and x.device.type != "cuda":
+ x = x.cuda()
+
features = self.forward_features(x)
embeddings = self.forward_embedding(features)
return_features = return_features if return_features is not None else self.return_features
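
Both encoders now check whether their parameters live on a CUDA device and move incoming tensors accordingly. A more general helper that derives the target device from the module itself could look like this; it is a sketch, not the library's API:

```python
import torch
from torch import nn, Tensor

def move_to_model_device(module: nn.Module, *tensors: Tensor) -> tuple:
    """Move input tensors to the device of the module's first parameter."""
    device = next(module.parameters()).device
    return tuple(t.to(device) for t in tensors)

# Usage sketch with a toy module and a random input
encoder = nn.Linear(4, 2).cuda() if torch.cuda.is_available() else nn.Linear(4, 2)
x = torch.randn(1, 4)
(x,) = move_to_model_device(encoder, x)
print(x.device)
```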
From 6d5f1ce739178f91774cda22b662b4395916bb62 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 05:17:47 +0000
Subject: [PATCH 31/40] Docs: Reorganize
---
BENCHMARKS.md | 160 ++++++++
README.md | 313 +--------------
python/README.md | 51 +++
python/scripts/bench_encoders.py | 57 ++-
python/scripts/export_decoders.ipynb | 567 +--------------------------
python/uform/torch_encoders.py | 5 +-
6 files changed, 269 insertions(+), 884 deletions(-)
create mode 100644 BENCHMARKS.md
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
new file mode 100644
index 0000000..ef78990
--- /dev/null
+++ b/BENCHMARKS.md
@@ -0,0 +1,160 @@
+# UForm Model Benchmarks
+
+## Accuracy
+
+### Embedding Models
+
+Few retrieval benchmarks exist for multimodal embeddings.
+The most famous ones for English are "MS-COCO" and "Flickr30k".
+Evaluating the `uform-vl-english` model, one can expect the following numbers for search quality.
+
+| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 |
+| :------- | ---------: | ---------: | ----------: |
+| Flickr | 0.727 | 0.915 | 0.949 |
+| MS-COCO¹ | 0.510 | 0.761 | 0.838 |
+
+For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository².
+Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model.
+
+| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
+| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: |
+| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
+| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
+| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
+| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
+| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
+| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
+
+
+
+All languages.
+
+
+| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
+| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: |
+| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
+| Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M |
+| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
+| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
+| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
+| German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M |
+| Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M |
+| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
+| Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M |
+| Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M |
+| Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M |
+| Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M |
+| Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M |
+| Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M |
+| Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M |
+| Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M |
+| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
+| Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M |
+| Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M |
+| Ukrainian 🇺🇦          | 26.0         | __30.6__     | 49.9         | __56.7__     | 60.9          | __68.1__     | 41 M     |
+| Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M |
+| | | | | | | | |
+| Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - |
+| Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - |
+| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - |
+| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - |
+
+
+
+### Generative Models
+
+| Model | LLM Size | SQA | MME | MMBench | Average¹ |
+| :------------------- | -------: | ---: | -----: | ------: | -------: |
+| UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 |
+| MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 |
+| LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 |
+
+For captioning evaluation we measure CLIPScore and RefCLIPScore³.
+
+| Model | Size | Caption Length | CLIPScore | RefCLIPScore |
+| :---------------------------------- | ---: | -------------: | --------: | -----------: |
+| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 |
+| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 |
+| |
+| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 |
+| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 |
+| |
+| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 |
+| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 |
+| |
+| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 |
+| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 |
+
+Results for VQAv2 evaluation.
+
+| Model | Size | Accuracy |
+| :------------------------- | ---: | -------: |
+| `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 |
+| `unum-cloud/uform-gen` | 1.5B | 66.5 |
+
+
+
+> ¹ Train split was in training data.
+> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
+> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model.
+
+## Speed
+
+UForm comes pre-packaged with speed benchmarks for the models.
+
+```bash
+$ python python/scripts/bench_encoders.py --help
+usage: bench_encoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE]
+
+options:
+ -h, --help show this help message and exit
+ --filter-out FILTER_OUT
+ Filter out models, backends, or devices with a Regular Expression.
+ --batch-size BATCH_SIZE
+ Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
+```
+
+Running that script with a fairly small batch size of 50 on an Nvidia H100 GPU, one can expect the following throughput.
+
+| Model Name | Device | Backend | Images Preprocessed/s | Images Encoded/s | Texts Preprocessed/s | Texts Encoded/s |
+| :--------------------------------------------- | :----- | :------ | --------------------: | ---------------: | -------------------: | --------------: |
+| unum-cloud/uform3-image-text-english-base | cpu | torch | 23.03 | 76.57 | 15,978.03 | 562.28 |
+| unum-cloud/uform3-image-text-english-base | cpu | onnx | 23.11 | 77.75 | 13,880.27 | 1,067.40 |
+| unum-cloud/uform3-image-text-english-base | cuda | torch | 22.87 | 1,060.40 | 12,348.94 | 13,242.83 |
+| unum-cloud/uform3-image-text-english-large | cpu | torch | 22.41 | 10.84 | 13,350.45 | 145.12 |
+| unum-cloud/uform3-image-text-english-large | cpu | onnx | 23.13 | 19.60 | 18,031.85 | 960.09 |
+| unum-cloud/uform3-image-text-english-large | cuda | torch | 22.78 | 244.86 | 13,226.40 | 10,204.04 |
+| unum-cloud/uform3-image-text-english-small | cpu | torch | 20.08 | 71.68 | 12,147.05 | 249.63 |
+| unum-cloud/uform3-image-text-english-small | cpu | onnx | 22.84 | 195.27 | 13,636.99 | 1,385.25 |
+| unum-cloud/uform3-image-text-english-small | cuda | torch | 22.63 | 2,662.16 | 14,731.18 | 14,694.87 |
+| unum-cloud/uform3-image-text-multilingual-base | cpu | torch | 22.98 | 64.28 | 10,129.27 | 209.76 |
+| unum-cloud/uform3-image-text-multilingual-base | cpu | onnx | 23.06 | 66.81 | 8,963.13 | 1,104.32 |
+| unum-cloud/uform3-image-text-multilingual-base | cuda | torch | 22.88 | 1,051.95 | 15,639.72 | 12,416.12 |
+
+If you are interested in performance numbers on consumer-grade hardware, compared to third-party models, here are some rough estimates.
+On Nvidia RTX 3090:
+
+| Model | Multilingual | Speed | Speedup |
+| :----------------------------------------------- | -----------: | ---------------------: | ---------: |
+| `bert-base-uncased` | No | 1'612 sequences/second | |
+| `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 |
+| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 |
+| `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ |
+
+On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
+
+| Model | Size | Speed | Speedup |
+| :---------------------------------- | ---: | ------------------: | --------: |
+| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | |
+| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | |
+| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ |
+
+Given the small size of the model, it also works well on mobile devices.
+On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
+
+| Device | Speed | Device TDP | Efficiency |
+| :--------------------- | ------------------: | ---------: | ----------------: |
+| Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule |
+| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule |
+| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule |
+| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule |
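
The efficiency column is just throughput divided by the power ceiling: since a watt is a joule per second, roughly 140 tokens/s over 350 W gives about 0.40 tokens per joule. A tiny sketch of that arithmetic:

```python
def tokens_per_joule(tokens_per_second: float, device_tdp_watts: float) -> float:
    # 1 W = 1 J/s, so dividing tokens/s by watts yields tokens per joule
    return tokens_per_second / device_tdp_watts

print(f"{tokens_per_joule(140, 350):.2f}")  # ~0.40 for the RTX 3090 row above
print(f"{tokens_per_joule(38, 36):.2f}")    # ~1.06 for the M2 Max unplugged row
```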
diff --git a/README.md b/README.md
index ee62beb..b7fd6ca 100755
--- a/README.md
+++ b/README.md
@@ -24,16 +24,20 @@ Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
ONNX • CoreML • PyTorch
-Python • JavaScript • Swift
+Python
+ •
+JavaScript
+ •
+Swift
---
-![](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true)
+![UForm Chat Preview](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true)
Welcome to UForm, a __multimodal__ AI library that's as versatile as it is efficient.
UForm [tiny embedding models](#encoder) will help you understand and search visual and textual content across various languages.
-UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are also capable of image captioning and Visual Question Answering (VQA).
+UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are great for fast image captioning and Visual Question Answering (VQA).
With compact __custom pre-trained transformer models__, this can run anywhere from your server farm down to your smartphone.
## Features
@@ -42,13 +46,15 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr
- __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors.
- __Portable__: Models come with native ONNX support, making them easy to deploy on any platform.
- __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall.
-- __Multilingual__: Trained on a balanced dataset, the recall is great across over [20 languages](#evaluation).
+- __Multilingual__: Trained on a balanced dataset, the recall is great across over 20 languages.
[usearch]: https://github.com/unum-cloud/usearch
[matryoshka]: https://arxiv.org/abs/2205.13147
## Models
+For accuracy and speed benchmarks refer to the [evaluation page](https://github.com/unum-cloud/uform/blob/main/BENCHMARKS.md).
+
### Embedding Models
| Model | Parameters | Languages | Architecture |
@@ -75,74 +81,7 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr
[model-g2]: https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/
[model-g1]: https://huggingface.co/unum-cloud/uform-gen/
-## Producing Embeddings
-
-Add UForm to your dependencies list, or just install it locally:
-
-```bash
-pip install uform
-```
-
-Then, you can use the following code to get embeddings for text and images.
-You can do that either with the PyTorch reference model or the lighter cross-platform ONNX weights.
-
-```python
-import uform
-from PIL import Image
-
-# If you want to use the PyTorch model
-model, processor = uform.get_model('unum-cloud/uform-vl-english-large') # Just English
-model, processor = uform.get_model('unum-cloud/uform-vl-multilingual-v2') # 21 Languages
-
-# If you want to use the light-weight portable ONNX model
-# Available combinations: cpu & fp32, gpu & fp32, gpu & fp16
-# Check out Unum's Hugging Face space for more details: https://huggingface.co/unum-cloud
-model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-small', 'cpu', 'fp32')
-model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-large', 'gpu', 'fp16')
-
-text = 'a small red panda in a zoo'
-image = Image.open('red_panda.jpg')
-
-image_data = processor.preprocess_image(image)
-text_data = processor.preprocess_text(text)
-
-image_features, image_embedding = model.encode_image(image_data, return_features=True)
-text_features, text_embedding = model.encode_text(text_data, return_features=True)
-```
-
-To search for similar items, the embeddings can be compared using cosine similarity.
-The resulting value will fall within the range of `-1` to `1`, where `1` indicates a high likelihood of a match.
-PyTorch provides a built-in function for calculating cosine similarity, while for ONNX, you can use NumPy.
-
-```python
-import torch.nn.functional as F
-
-similarity = F.cosine_similarity(image_embedding, text_embedding)
-```
-
-ONNX has no such function, but you can calculate the cosine similarity using [SimSIMD](https://github.com/ashvardanian/simsimd) or manually, with NumPy:
-
-```python
-import numpy as np
-
-image_embedding = image_embedding / np.linalg.norm(image_embedding, keepdims=True, axis=1)
-text_embedding = text_embedding / np.linalg.norm(text_embedding, keepdims=True, axis=1)
-similarity = (image_embedding * text_embedding).sum(axis=1)
-```
-
-### Reranking
-
-Once the list of nearest neighbors (best matches) is obtained, the joint multimodal embeddings, created from both text and image features, can be used to better rerank (reorder) the list.
-The model can calculate a "matching score" that falls within the range of `[0, 1]`, where `1` indicates a high likelihood of a match.
-
-```python
-score, joint_embedding = model.encode_multimodal(
- image_features=image_features,
- text_features=text_features,
- attention_mask=text_data['attention_mask'],
- return_scores=True,
-)
-```
+## Features and Recommendations
### Down-casting, Quantization, Matryoshka, and Slicing
@@ -154,7 +93,7 @@ Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is
```python
import numpy as np
-f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
+f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False)
f16_embedding: np.ndarray = f32_embedding.astype(np.float16)
i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8)
b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8))
@@ -165,7 +104,7 @@ Alternative approach to quantization is to use the Matryoshka embeddings, where
```python
import numpy as np
-large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
+large_embedding: np.ndarray = model.encode_text(text_data, return_features=False)
small_embedding: np.ndarray = large_embedding[:, :256]
tiny_embedding: np.ndarray = large_embedding[:, :64]
```
@@ -220,92 +159,6 @@ You can pick one of many supported [ONNX execution providers][onnx-providers], w
[onnx-providers]: https://onnxruntime.ai/docs/execution-providers/
----
-
-The configuration process may include a few additional steps, depending on the environment.
-When using the CUDA and TensorRT backends with CUDA 12 or newer make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
-
-```sh
-wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-sudo dpkg -i cuda-keyring_1.1-1_all.deb
-sudo apt-get update
-sudo apt-get -y install cuda-toolkit-12
-pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
-export CUDA_PATH="/usr/local/cuda-12/bin"
-export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}"
-export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
-pytest python/scripts/ -s -x -Wd -v -k onnx
-```
-
-[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu
-
-## Chat, Image Captioning and Question Answering
-
-UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library.
-Those models can be used to caption images or power multimodal chat experiences.
-
-```python
-from transformers import AutoModel, AutoProcessor
-
-model = AutoModel.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True)
-processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True)
-
-prompt = 'Question or Instruction'
-image = Image.open('image.jpg')
-
-inputs = processor(text=[prompt], images=[image], return_tensors='pt')
-
-with torch.inference_mode():
- output = model.generate(
- **inputs,
- do_sample=False,
- use_cache=True,
- max_new_tokens=256,
- eos_token_id=151645,
- pad_token_id=processor.tokenizer.pad_token_id
- )
-prompt_len = inputs['input_ids'].shape[1]
-decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
-```
-
-You can check examples of different prompts in our [demo space](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
-
-
-### Image Captioning and Question Answering
-
-__It is the instruction for the first version of UForm-Gen model. We highly recommend you use the new model, instructions for which you can find above.__
-
-
-The generative model can be used to caption images, summarize their content, or answer questions about them.
-The exact behavior is controlled by prompts.
-
-```python
-from uform.torch_decoders import VLMForCausalLM, VLMProcessor
-
-model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen')
-processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen')
-
-# [cap] Narrate the contents of the image with precision.
-# [cap] Summarize the visual content of the image.
-# [vqa] What is the main subject of the image?
-prompt = '[cap] Summarize the visual content of the image.'
-image = Image.open('zebra.jpg')
-
-inputs = processor(texts=[prompt], images=[image], return_tensors='pt')
-with torch.inference_mode():
- output = model.generate(
- **inputs,
- do_sample=False,
- use_cache=True,
- max_new_tokens=128,
- eos_token_id=32001,
- pad_token_id=processor.tokenizer.pad_token_id
- )
-
-prompt_len = inputs['input_ids'].shape[1]
-decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
-```
-
### Multimodal Chat in CLI
The generative models can be used for chat-like experiences in the command line.
@@ -319,143 +172,3 @@ $ uform-chat --model unum-cloud/uform-gen2-dpo \
> --device="cuda:0" \
> --fp16
```
-
-## Evaluation
-
-### Embedding Models
-
-Few retrieval benchmarks exist for multimodal embeddings.
-The most famous ones for English are "MS-COCO" and "Flickr30k".
-Evaluating `uform-vl-english` model, one can expect the following numbers for search quality.
-
-| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 |
-| :------- | ---------: | ---------: | ----------: |
-| Flickr | 0.727 | 0.915 | 0.949 |
-| MS-COCO¹ | 0.510 | 0.761 | 0.838 |
-
-
-For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository².
-Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model.
-
-| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
-| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: |
-| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
-| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
-| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
-| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
-| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
-| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
-
-
-
-All languages.
-
-
-| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
-| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: |
-| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
-| Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M |
-| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
-| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
-| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
-| German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M |
-| Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M |
-| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
-| Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M |
-| Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M |
-| Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M |
-| Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M |
-| Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M |
-| Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M |
-| Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M |
-| Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M |
-| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
-| Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M |
-| Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M |
-| Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M |
-| Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M |
-| | | | | | | | |
-| Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - |
-| Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - |
-| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - |
-| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - |
-
-
-
-### Generative Models
-
-| Model | LLM Size | SQA | MME | MMBench | Average¹ |
-| :------------------- | -------: | ---: | -----: | ------: | -------: |
-| UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 |
-| MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 |
-| LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 |
-
-For captioning evaluation we measure CLIPScore and RefCLIPScore³.
-
-| Model | Size | Caption Length | CLIPScore | RefCLIPScore |
-| :---------------------------------- | ---: | -------------: | --------: | -----------: |
-| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 |
-| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 |
-| |
-| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 |
-| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 |
-| |
-| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 |
-| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 |
-| |
-| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 |
-| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 |
-
-Results for VQAv2 evaluation.
-
-| Model | Size | Accuracy |
-| :------------------------- | ---: | -------: |
-| `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 |
-| `unum-cloud/uform-gen` | 1.5B | 66.5 |
-
-
-
-> ¹ Train split was in training data.
-> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
-> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model.
-
-## Speed
-
-On Nvidia RTX 3090, the following performance is expected on text encoding.
-
-| Model | Multilingual | Speed | Speedup |
-| :---------------------------------------- | -----------: | ---------------------: | ---------: |
-| `bert-base-uncased` | No | 1'612 sequences/second | |
-| `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 |
-| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 |
-| `unum-cloud/uform-vl-multilingual-v2` | __Yes__ | 6'809 sequences/second | __x 4.22__ |
-
-On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
-
-| Model | Size | Speed | Speedup |
-| :---------------------------------- | ---: | ------------------: | --------: |
-| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | |
-| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | |
-| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ |
-
-Given the small size of the model it also work well on mobile devices.
-On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
-
-| Device | Speed | Device TDP | Efficiency |
-| :--------------------- | ------------------: | ---------: | ----------------: |
-| Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule |
-| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule |
-| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule |
-| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule |
-
-> [!WARNING]
-> The above numbers are for reference only and are not guaranteed to be accurate.
-
-## License
-
-All models come under the same license as the code - Apache 2.0.
-
-
-TODO:
-
-- [ ] Download the image if a URL is provided
\ No newline at end of file
diff --git a/python/README.md b/python/README.md
index aec9de8..2340e15 100644
--- a/python/README.md
+++ b/python/README.md
@@ -50,6 +50,38 @@ text_features, text_embedding = model_text.encode(text_data, return_features=Tru
### Generative Models
+UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library.
+Those models can be used to caption images or power multimodal chat experiences.
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+
+prompt = 'Question or Instruction'
+image = Image.open('image.jpg')
+
+inputs = processor(text=[prompt], images=[image], return_tensors='pt')
+
+with torch.inference_mode():
+ output = model.generate(
+ **inputs,
+ do_sample=False,
+ use_cache=True,
+ max_new_tokens=256,
+ eos_token_id=151645,
+ pad_token_id=processor.tokenizer.pad_token_id
+ )
+prompt_len = inputs['input_ids'].shape[1]
+decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+```
+
+You can check examples of different prompts in our demo spaces:
+
+- for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
+- for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo)
+
## Technical Details
### Down-casting, Quantization, Matryoshka, and Slicing
@@ -122,3 +154,22 @@ encoder_image = nn.DataParallel(encoder_image)
_, res = encoder_image(images, 0)
```
+
+### ONNX and CUDA
+
+The configuration process may include a few additional steps, depending on the environment.
+When using the CUDA and TensorRT backends with CUDA 12 or newer, make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
+
+```sh
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install cuda-toolkit-12
+pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+export CUDA_PATH="/usr/local/cuda-12/bin"
+export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}"
+export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+pytest python/scripts/ -s -x -Wd -v -k onnx
+```
+
+[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu
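
After installing `onnxruntime-gpu` from the custom index, it may be worth verifying that the GPU providers are actually visible to the runtime before running the tests; a quick sanity check:

```python
import onnxruntime as ort

providers = ort.get_available_providers()
print(providers)
assert "CUDAExecutionProvider" in providers, "CUDA provider missing - check the onnxruntime-gpu install"
```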
diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py
index a8adb91..b237126 100644
--- a/python/scripts/bench_encoders.py
+++ b/python/scripts/bench_encoders.py
@@ -68,10 +68,14 @@ class BenchmarkResult:
duration_text_embedding: float = 0
-def duration(callable):
+def duration(callable, synchronize=False):
"""Profile the duration of a callable and return the duration and the result."""
+ if synchronize and torch_available and cuda_available:
+ torch.cuda.synchronize() # Wait for CUDA operations to complete
start = perf_counter()
result = callable()
+ if synchronize and torch_available and cuda_available:
+ torch.cuda.synchronize() # Ensure all CUDA kernels have finished
stop = perf_counter()
return stop - start, result
@@ -96,13 +100,20 @@ def get_captioned_images() -> List[Tuple[Image.Image, str]]:
return list(zip(images, captions))
-def yield_benchmarks() -> Generator[Tuple[BenchmarkResult, Callable], None, None]:
+def yield_benchmarks(batch_size: int) -> Generator[Tuple[BenchmarkResult, Callable], None, None]:
"""Yields callable benchmarks for all supported backends of the given model."""
# Pull the content and artificially grow the batch size
images, captions = zip(*get_captioned_images())
- images *= 10
- captions *= 10
+
+ if len(images) < batch_size:
+ import math
+
+ multiplier = int(math.ceil(batch_size / len(images)))
+ images *= multiplier
+ captions *= multiplier
+ images = images[:batch_size]
+ captions = captions[:batch_size]
def run(model_name: str, device: str, backend_name: str):
result = BenchmarkResult(
@@ -115,6 +126,7 @@ def run(model_name: str, device: str, backend_name: str):
duration_text_embedding=0,
)
+ sync = backend_name == "torch"
processors, models = get_model(
model_name,
device=device,
@@ -130,7 +142,7 @@ def run(model_name: str, device: str, backend_name: str):
# Image preprocessing
total_duration = 0
total_iterations = 0
- while total_duration < 10:
+ while total_duration < 10 and total_iterations < 100:
seconds, _ = duration(lambda: processor_image(images))
total_duration += seconds
total_iterations += len(images)
@@ -140,9 +152,9 @@ def run(model_name: str, device: str, backend_name: str):
# Image embedding
total_duration = 0
total_iterations = 0
- while total_duration < 10:
+ while total_duration < 10 and total_iterations < 100:
images_data = processor_image(images)
- seconds, _ = duration(lambda: model_image.encode(images_data))
+ seconds, _ = duration(lambda: model_image.encode(images_data), synchronize=sync)
total_duration += seconds
total_iterations += len(images)
duration_per_iteration = total_duration / total_iterations
@@ -151,7 +163,7 @@ def run(model_name: str, device: str, backend_name: str):
# Text preprocessing
total_duration = 0
total_iterations = 0
- while total_duration < 10:
+ while total_duration < 10 and total_iterations < 100:
seconds, _ = duration(lambda: processor_text(captions))
total_duration += seconds
total_iterations += len(captions)
@@ -161,9 +173,9 @@ def run(model_name: str, device: str, backend_name: str):
# Text embedding
total_duration = 0
total_iterations = 0
- while total_duration < 10:
+ while total_duration < 10 and total_iterations < 100:
texts_data = processor_text(captions)
- seconds, _ = duration(lambda: model_text.encode(texts_data))
+ seconds, _ = duration(lambda: model_text.encode(texts_data), synchronize=sync)
total_duration += seconds
total_iterations += len(captions)
duration_per_iteration = total_duration / total_iterations
@@ -195,10 +207,10 @@ def run(model_name: str, device: str, backend_name: str):
), partial(run, model_name, device, backend_name)
-def main(filter_out: str = None):
+def main(filter_out: str = None, batch_size: int = 10):
results = []
filter_pattern = re.compile(filter_out) if filter_out else None
- for specs, func in yield_benchmarks():
+ for specs, func in yield_benchmarks(batch_size=batch_size):
if filter_pattern and (
filter_pattern.search(specs.model_name)
or filter_pattern.search(specs.backend_name)
@@ -243,7 +255,20 @@ def inverse(x):
if __name__ == "__main__":
- argparse = argparse.ArgumentParser()
- argparse.add_argument("--filter-out", type=str, default=None)
- args = argparse.parse_args()
- main(filter_out=args.filter_out)
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--filter-out",
+ type=str,
+ default=None,
+ help="Filter out models, backends, or devices with a Regular Expression.",
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=10,
+ help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
+ )
+ args = parser.parse_args()
+
+ main(filter_out=args.filter_out, batch_size=args.batch_size)
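
CUDA kernels launch asynchronously, so naive wall-clock timing can undercount GPU work; that is why the `duration` helper above calls `torch.cuda.synchronize()` around the measured region for the PyTorch backend. A standalone sketch of that timing pattern:

```python
from time import perf_counter
import torch

def timed(fn, synchronize: bool = False):
    """Return (seconds, result), synchronizing CUDA before and after if requested."""
    if synchronize and torch.cuda.is_available():
        torch.cuda.synchronize()  # Flush pending kernels before starting the clock
    start = perf_counter()
    result = fn()
    if synchronize and torch.cuda.is_available():
        torch.cuda.synchronize()  # Make sure the measured work has actually finished
    return perf_counter() - start, result

seconds, _ = timed(lambda: torch.randn(1024, 1024) @ torch.randn(1024, 1024), synchronize=True)
print(f"{seconds:.4f}s")
```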
diff --git a/python/scripts/export_decoders.ipynb b/python/scripts/export_decoders.ipynb
index 3aededb..26e463b 100644
--- a/python/scripts/export_decoders.ipynb
+++ b/python/scripts/export_decoders.ipynb
@@ -44,8 +44,8 @@
"from PIL import Image\n",
"from transformers import AutoModel, AutoProcessor\n",
"\n",
- "model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)\n",
- "processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)\n",
+ "model = AutoModel.from_pretrained(model_name, trust_remote_code=True)\n",
+ "processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\n",
"\n",
"prompt = 'Describe the picture'\n",
"image = Image.open('../../assets/unum.png')\n",
@@ -65,569 +65,6 @@
"\n",
"print(decoded_text)"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
- "for name, module in model.named_children():\n",
- " print(f\"First layer of module: {name}\")\n",
- " break # We break after the first layer"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## CoreML"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import coremltools as ct\n",
- "import torch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "precision = ct.precision.FLOAT32"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n",
- "\n",
- "```python\n",
- " image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
- " text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
- " text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
- "```\n",
- "\n",
- "That, however, will only work for batch-size one. To support larger batches, we need to override the input shapes.\n",
- "\n",
- "```python\n",
- " ct.RangeDim(lower_bound=25, upper_bound=100, default=45)\n",
- "```"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def generalize_first_dimensions(input_shape, upper_bound=64):\n",
- " if upper_bound == 1:\n",
- " return input_shape\n",
- " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n",
- " return input_shape\n",
- "\n",
- "generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
- "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
- "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
- "text_features = ct.TensorType(name=\"features\")\n",
- "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
- "image_features = ct.TensorType(name=\"features\")\n",
- "image_embeddings = ct.TensorType(name=\"embeddings\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module = model.image_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
- "traced_script_module"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "coreml_model = ct.convert(\n",
- " traced_script_module, source=\"pytorch\",\n",
- " inputs=[image_input], outputs=[image_features, image_embeddings],\n",
- " convert_to='mlprogram', compute_precision=precision)\n",
- "\n",
- "coreml_model.author = 'Unum Cloud'\n",
- "coreml_model.license = 'Apache 2.0'\n",
- "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module = model.text_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
- "traced_script_module"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "coreml_model = ct.convert(\n",
- " traced_script_module, source=\"pytorch\",\n",
- " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
- " convert_to='mlprogram', compute_precision=precision)\n",
- "\n",
- "coreml_model.author = 'Unum Cloud'\n",
- "coreml_model.license = 'Apache 2.0'\n",
- "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# PyTorch\n",
- "\n",
- "Let's ensure:\n",
- "\n",
- "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n",
- "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n",
- "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import torch\n",
- "from safetensors import safe_open\n",
- "from safetensors.torch import save_file"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.image_encoder.eval()\n",
- "model.image_encoder.to(dtype=torch.bfloat16)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.text_encoder.eval()\n",
- "model.text_encoder.to(dtype=torch.bfloat16)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
- "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
- "\n",
- "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ONNX"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install onnx onnxconverter-common"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from torch.onnx import export as onnx_export\n",
- "import torch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module = model.text_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "module.to(dtype=torch.float32)\n",
- "\n",
- "onnx_export(\n",
- " module,\n",
- " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
- " os.path.join(output_directory, \"text_encoder.onnx\"), \n",
- " export_params=True,\n",
- " opset_version=15,\n",
- " do_constant_folding=True,\n",
- " input_names = ['input_ids', 'attention_mask'], \n",
- " output_names = ['features', 'embeddings'],\n",
- " dynamic_axes={\n",
- " 'input_ids' : {0 : 'batch_size'}, \n",
- " 'attention_mask' : {0 : 'batch_size'}, \n",
- " 'features' : {0 : 'batch_size'}, \n",
- " 'embeddings' : {0 : 'batch_size'}})"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now repeat the same for images."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module = model.image_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "module.to(dtype=torch.float32)\n",
- "\n",
- "torch.onnx.export(\n",
- " module,\n",
- " image_data, \n",
- " os.path.join(output_directory, \"image_encoder.onnx\"), \n",
- " export_params=True,\n",
- " opset_version=15,\n",
- " do_constant_folding=True,\n",
- " input_names = ['input'], \n",
- " output_names = ['features', 'embeddings'],\n",
- " dynamic_axes={\n",
- " 'input' : {0 : 'batch_size'},\n",
- " 'features' : {0 : 'batch_size'},\n",
- " 'embeddings' : {0 : 'batch_size'}})"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Quantizing to `float16`\n",
- "\n",
- "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import onnx\n",
- "from onnxconverter_common import float16"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
- "module = onnx.load(module_path)\n",
- "module_fp16 = float16.convert_float_to_float16(module)\n",
- "onnx.save(module_fp16, module_path)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
- "module = onnx.load(module_path)\n",
- "module_fp16 = float16.convert_float_to_float16(module)\n",
- "onnx.save(module_fp16, module_path)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Quantizing to `uint8`\n",
- "\n",
- "We can further quantize the model into `uint8` using ONNX quantization tools.\n",
- "The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from onnxruntime.quantization import quantize_dynamic, QuantType"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
- "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
- "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's make sure that all the text inputs are integers of identical type - `int32`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import onnx\n",
- "import os\n",
- "from onnx import helper\n",
- "\n",
- "# Load the ONNX model\n",
- "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
- "module = onnx.load(module_path)\n",
- "\n",
- "# Get the module's graph\n",
- "graph = module.graph\n",
- "\n",
- "# Iterate through the inputs and update the data type of `input_ids`\n",
- "for input_tensor in graph.input:\n",
- " # Check if this is the tensor we want to change\n",
- " if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n",
- " # Get the tensor type information\n",
- " tensor_type = input_tensor.type.tensor_type\n",
- " # Set the element type to INT32 (int32's enum value in onnx is 6)\n",
- " tensor_type.elem_type = onnx.TensorProto.INT32\n",
- "\n",
- "# Optionally, check that the module is still valid\n",
- "onnx.checker.check_model(module)\n",
- "\n",
- "# Save the modified module\n",
- "onnx.save(module, module_path)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can use the following function to print and validate the input and output types of the ONNX model files."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def print_model_inputs_and_outputs(onnx_model_path):\n",
- " model = onnx.load(onnx_model_path)\n",
- "\n",
- " # Get the model's graph\n",
- " graph = model.graph\n",
- "\n",
- " # Print input information\n",
- " print(\"Model Inputs:\")\n",
- " for input_tensor in graph.input:\n",
- " tensor_type = input_tensor.type.tensor_type\n",
- " # Get the element type (data type)\n",
- " elem_type = tensor_type.elem_type\n",
- " # Convert numeric type to readable format\n",
- " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
- " # Get tensor shape\n",
- " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
- " print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n",
- "\n",
- " # Print output information similarly if needed\n",
- " print(\"\\nModel Outputs:\")\n",
- " for output_tensor in graph.output:\n",
- " tensor_type = output_tensor.type.tensor_type\n",
- " elem_type = tensor_type.elem_type\n",
- " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
- " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
- " print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's check that the runtime can actually load those models."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import onnxruntime as ort\n",
- "session_options = ort.SessionOptions()\n",
- "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n",
- "session = ort.InferenceSession(module_path, sess_options=session_options)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n",
- "session = ort.InferenceSession(module_path, sess_options=session_options)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Upload to Hugging Face"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n",
- "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n",
- "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n",
- "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n",
- "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
- "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py
index ed413a8..c149088 100644
--- a/python/uform/torch_encoders.py
+++ b/python/uform/torch_encoders.py
@@ -231,7 +231,6 @@ def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
return x[:, 0]
attn_mask = attn_mask.unsqueeze(2).type_as(x)
-
return (x * attn_mask).sum(dim=1) / attn_mask.sum(dim=1)
def get_attention_mask(self, attn_mask: Tensor, dtype: torch.dtype) -> Tensor:
@@ -370,7 +369,7 @@ def forward_embedding(self, x: Tensor) -> Tensor:
return self.embedding_projection(x)
- def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
+ def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor:
if isinstance(x, dict):
x = x["images"]
@@ -385,7 +384,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return features, embeddings
return embeddings
- def encode(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
+ def encode(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor:
result = self.forward(x, return_features)
if isinstance(result, tuple):
return result[0].detach(), result[1].detach()
From 1f556b867ad80460f36df7fbc2f500e0d5785951 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 05:54:47 +0000
Subject: [PATCH 32/40] Improve: Extend benchmarks
---
BENCHMARKS.md | 33 +++++++++++++++++------
python/scripts/bench_decoders.py | 45 +++++++++++++++++++++++++++++++-
python/uform/chat.py | 12 ++++-----
3 files changed, 75 insertions(+), 15 deletions(-)
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index ef78990..9b0fa1d 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -100,6 +100,8 @@ Results for VQAv2 evaluation.
## Speed
+### Embedding Models
+
UForm comes pre-packaged with speed benchmarks for the models.
```bash
@@ -141,14 +143,6 @@ On Nvidia RTX 3090:
| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 |
| `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ |
-On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
-
-| Model | Size | Speed | Speedup |
-| :---------------------------------- | ---: | ------------------: | --------: |
-| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | |
-| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | |
-| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ |
-
Given the small size of the model it also works well on mobile devices.
On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
@@ -158,3 +152,26 @@ On Apple M2 Arm chips the energy efficiency of inference can exceed that of the
| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule |
| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule |
| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule |
+
+### Generative Models
+
+```bash
+$ python python/scripts/bench_decoders.py --help
+usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE]
+
+options:
+ -h, --help show this help message and exit
+ --filter-out FILTER_OUT
+ Filter out models, backends, or devices with a Regular Expression.
+ --batch-size BATCH_SIZE
+ Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
+```
+
+On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
+
+| Model | Size | Speed | Speedup |
+| :---------------------------------- | ---: | ------------------: | --------: |
+| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | |
+| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | |
+| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ |
+
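+A rough way to reproduce numbers of this kind is to time `generate` with a wall clock.
+The sketch below is only an illustration: it assumes a Transformers-style `model` and `processor` are already loaded, and it is not the exact logic of `bench_decoders.py`.
+
+```python
+from time import perf_counter
+
+def tokens_per_second(model, processor, prompt, image, max_new_tokens=128) -> float:
+    # Hypothetical helper: times greedy decoding and divides the generated token count by the elapsed time
+    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(model.device)
+    start = perf_counter()
+    output = model.generate(**inputs, do_sample=False, use_cache=True, max_new_tokens=max_new_tokens)
+    elapsed = perf_counter() - start
+    generated = output.shape[1] - inputs["input_ids"].shape[1]
+    return generated / elapsed
+```
+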
diff --git a/python/scripts/bench_decoders.py b/python/scripts/bench_decoders.py
index d98c130..4241ee6 100644
--- a/python/scripts/bench_decoders.py
+++ b/python/scripts/bench_decoders.py
@@ -2,6 +2,7 @@
from time import perf_counter
from dataclasses import dataclass
from typing import List
+import argparse
import requests
import torch
@@ -11,6 +12,8 @@
InstructBlipForConditionalGeneration,
InstructBlipProcessor,
LlavaForConditionalGeneration,
+ AutoModel,
+ AutoProcessor,
)
from uform.torch_decoders import VLMForCausalLM, VLMProcessor
@@ -57,6 +60,7 @@ def caption(model, processor, prompt: str, image: Image.Image) -> str:
def duration(callable):
+ """Profile the duration of a callable and return the duration and the result."""
start = perf_counter()
result = callable()
stop = perf_counter()
@@ -86,7 +90,8 @@ def caption_image(image, model=model, processor=processor, prompt=prompt):
print(f"Throughput: {total_length/total_duration:.2f} tokens/s")
-if __name__ == "__main__":
+def main(filter_out: str = None, batch_size: int = 10):
+
image_urls = [
"https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
"https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
@@ -103,12 +108,30 @@ def caption_image(image, model=model, processor=processor, prompt=prompt):
"a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
]
+ print("UForm-Gen2")
+ bench_captions(
+ model=AutoModel.from_pretrained(
+ "unum-cloud/uform-gen2-dpo",
+ trust_remote_code=True,
+ torch_dtype=dtype,
+ low_cpu_mem_usage=low_cpu_mem_usage,
+ ignore_mismatched_sizes=True,
+ ).to(device),
+ processor=AutoProcessor.from_pretrained(
+ "unum-cloud/uform-gen2-dpo",
+ trust_remote_code=True,
+ ),
+ prompt="Describe the picture in great detail",
+ images=images,
+ )
+
print("UForm-Gen")
bench_captions(
model=VLMForCausalLM.from_pretrained(
"unum-cloud/uform-gen",
torch_dtype=dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
+ ignore_mismatched_sizes=True,
).to(device),
processor=VLMProcessor.from_pretrained(
"unum-cloud/uform-gen",
@@ -144,3 +167,23 @@ def caption_image(image, model=model, processor=processor, prompt=prompt):
prompt="Summarize the visual content of the image.",
images=images,
)
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--filter-out",
+ type=str,
+ default=None,
+ help="Filter out models, backends, or devices with a Regular Expression.",
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=10,
+ help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
+ )
+ args = parser.parse_args()
+
+ main(filter_out=args.filter_out, batch_size=args.batch_size)
diff --git a/python/uform/chat.py b/python/uform/chat.py
index c9f8dc3..7bb1737 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -13,10 +13,10 @@
def parse_args():
parser = ArgumentParser(description="Chat with UForm generative model")
- parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat")
- parser.add_argument("--image", type=str, help="", required=True)
- parser.add_argument("--device", type=str, required=True)
- parser.add_argument("--fp16", action="store_true")
+ parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path")
+ parser.add_argument("--image", type=str, required=True, help="Path to image or URL")
+ parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`")
+ parser.add_argument("--fp16", action="store_true", help="Use half-precision math for faster inference")
return parser.parse_args()
@@ -95,16 +95,16 @@ def run_chat(opts, model, processor):
def main():
try:
opts = parse_args()
-
+ processor = VLMProcessor.from_pretrained(opts.model)
model = (
VLMForCausalLM.from_pretrained(
opts.model,
torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32,
+ ignore_mismatched_sizes=True,
)
.eval()
.to(opts.device)
)
- processor = VLMProcessor.from_pretrained(opts.model)
run_chat(opts, model, processor)
From 47b7a49b7e2b436b1a42a3aa5ad7bba179bc0680 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 16:05:22 +0000
Subject: [PATCH 33/40] Docs: Add examples
---
README.md | 165 +++++++++++++++++++++++++++++++++++++++++------
python/README.md | 83 ++++++------------------
2 files changed, 163 insertions(+), 85 deletions(-)
diff --git a/README.md b/README.md
index b7fd6ca..4c66d29 100755
--- a/README.md
+++ b/README.md
@@ -57,31 +57,156 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github.
### Embedding Models
-| Model | Parameters | Languages | Architecture |
-| :-------------------------------------------------- | ---------: | --------: | -------------------------------------------: |
-| [`uform3-image-text-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers |
-| [`uform3-image-text-english-base`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers |
-| [`uform3-image-text-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers |
-| [`uform3-image-text-multilingual-base`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers |
-
-[model-e-l]: https://huggingface.co/unum-cloud/uform-vl-english-large/
-[model-e]: https://huggingface.co/unum-cloud/uform-vl-english/
-[model-e-s]: https://huggingface.co/unum-cloud/uform-vl-english-small/
-[model-m]: https://huggingface.co/unum-cloud/uform-vl-multilingual/
-[model-m-v2]: https://huggingface.co/unum-cloud/uform-vl-multilingual-v2/
+
### Generative Models
-| Model | Parameters | Purpose | Architecture |
-| :--------------------------------- | ---------: | --------------------------: | ---------------------: |
-| [`uform-gen2-dpo`][model-g2] 🆕 | 1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
-| [`uform-gen2-qwen-500m`][model-g2] | 1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
-| [`uform-gen`][model-g1] | 1.5B | Image Captioning, VQA | llama-1.3B, ViT-B/16 |
+
+
+
+ Model |
+ Parameters |
+ Purpose |
+ Architecture |
+
+
+
+
+ uform-gen2-dpo 🆕 |
+ 1.2B |
+ Chat, Image Captioning, VQA |
+ qwen1.5-0.5B, ViT-H/14 |
+
+
+ uform-gen2-qwen-500m |
+ 1.2B |
+ Chat, Image Captioning, VQA |
+ qwen1.5-0.5B, ViT-H/14 |
+
+
+ uform-gen |
+ 1.5B |
+ Image Captioning, VQA |
+ llama-1.3B, ViT-B/16 |
+
+
+
+
+## Quick Start Examples
-[model-g2]: https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/
-[model-g1]: https://huggingface.co/unum-cloud/uform-gen/
+### Embedding Models
+
+First, `pip install uform`.
+Then, load the model:
+
+```py
+from uform import get_model, Modality
+
+processors, models = get_model('unum-cloud/uform3-image-text-english-small')
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+```
+
+Embed images:
+
+```py
+import requests
+from io import BytesIO
+from PIL import Image
+
+image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content))
+image_data = processor_image(image)
+image_features, image_embedding = model_image.encode(image_data, return_features=True)
+```
+
+Embed queries:
+
+```py
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+text_data = processor_text(text)
+text_features, text_embedding = model_text.encode(text_data, return_features=True)
+```
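+
+To compare the two embeddings, cosine similarity is the usual choice.
+A minimal sketch, assuming the Torch backend is used and both embeddings come back as `torch.Tensor` rows:
+
+```py
+import torch.nn.functional as F
+
+similarity = F.cosine_similarity(image_embedding, text_embedding)
+print(similarity.item())  # Higher means the caption matches the image better
+```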
+
+For more details check out:
+
+- Python docs on embedding models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#embedding-models)
+- JavaScript docs on embedding models in [javascript/README.md](https://github.com/unum-cloud/uform/blob/main/javascript/README.md#embedding-models)
+- Swift docs on embedding models in [swift/README.md](https://github.com/unum-cloud/uform/blob/main/swift/README.md#embedding-models)
+
+### Generative Models
+
+The generative models are natively compatible with the Hugging Face `transformers` library:
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+
+prompt = 'Question or Instruction'
+image = Image.open('image.jpg')
+
+inputs = processor(text=[prompt], images=[image], return_tensors='pt')
+
+with torch.inference_mode():
+ output = model.generate(
+ **inputs,
+ do_sample=False,
+ use_cache=True,
+ max_new_tokens=256,
+ eos_token_id=151645,
+ pad_token_id=processor.tokenizer.pad_token_id
+ )
+prompt_len = inputs['input_ids'].shape[1]
+decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+```
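+
+For interactive use you may prefer to stream tokens as they are produced.
+A sketch reusing the `model`, `processor`, and `inputs` from above with the `TextStreamer` utility from `transformers` (decoding settings are illustrative):
+
+```python
+from transformers import TextStreamer
+
+streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+with torch.inference_mode():
+    model.generate(
+        **inputs,
+        do_sample=False,
+        use_cache=True,
+        max_new_tokens=256,
+        eos_token_id=151645,
+        pad_token_id=processor.tokenizer.pad_token_id,
+        streamer=streamer,  # Prints decoded tokens to stdout as they arrive
+    )
+```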
+
+For more details check out:
+
+- Python docs on generative models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#generative-models)
+- JavaScript docs on generative models 🔜
+- Swift docs on generative models 🔜
-## Features and Recommendations
+## Technical Details
### Down-casting, Quantization, Matryoshka, and Slicing
diff --git a/python/README.md b/python/README.md
index 2340e15..621bee0 100644
--- a/python/README.md
+++ b/python/README.md
@@ -20,13 +20,11 @@ pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests
### Embeddings
+Load the model:
+
```py
from uform import get_model, Modality
-import requests
-from io import BytesIO
-from PIL import Image
-
model_name = 'unum-cloud/uform3-image-text-english-small'
modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER]
processors, models = get_model(model_name, modalities=modalities)
@@ -35,16 +33,26 @@ model_text = models[Modality.TEXT_ENCODER]
model_image = models[Modality.IMAGE_ENCODER]
processor_text = processors[Modality.TEXT_ENCODER]
processor_image = processors[Modality.IMAGE_ENCODER]
+```
+
+Embed images:
+
+```py
+import requests
+from io import BytesIO
+from PIL import Image
-# Download the image
-text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
image_url = Image.open(BytesIO(requests.get(image_url).content))
-
-# The actual inference
image_data = processor_image(image)
-text_data = processor_text(text)
image_features, image_embedding = model_image.encode(image_data, return_features=True)
+```
+
+Embed queries:
+
+```py
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+text_data = processor_text(text)
text_features, text_embedding = model_text.encode(text_data, return_features=True)
```
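+
+If you do not need the intermediate features, the embeddings can be requested alone.
+A small sketch, assuming the Torch backend:
+
+```py
+text_embedding = model_text.encode(text_data, return_features=False)
+image_embedding = model_image.encode(image_data, return_features=False)
+```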
@@ -77,68 +85,13 @@ prompt_len = inputs['input_ids'].shape[1]
decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
```
-You can check examples of different prompts in our demo spaces:
+You can find examples of different prompts in our demo Gradio Spaces on Hugging Face:
- for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
- for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo)
## Technical Details
-### Down-casting, Quantization, Matryoshka, and Slicing
-
-Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall.
-Switching from `f32` to `f16` is recommended in almost all cases, unless you are running on very old hardware without half-precision support.
-Switching to `i8` with linear scaling is also possible, but will be noticeable in the recall on larger collections with millions of searchable entries.
-Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is to quantize them into single-bit representations for faster search.
-
-```python
-import numpy as np
-
-f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
-f16_embedding: np.ndarray = f32_embedding.astype(np.float16)
-i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8)
-b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8))
-```
-
-Alternative approach to quantization is to use the Matryoshka embeddings, where the embeddings are sliced into smaller parts, and the search is performed in a hierarchical manner.
-
-```python
-import numpy as np
-
-large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
-small_embedding: np.ndarray = large_embedding[:, :256]
-tiny_embedding: np.ndarray = large_embedding[:, :64]
-```
-
-Both approaches are natively supported by the [USearch][github-usearch] vector-search engine and the [SimSIMD][github-simsimd] numerics libraries.
-When dealing with small collections (up to millions of entries) and looking for low-latency cosine distance calculations, you can [achieve 5x-2500x performance improvement][report-simsimd] over Torch, NumPy, SciPy, and vanilla Python using SimSIMD.
-
-```python
-from simsimd import cosine, hamming
-
-distance: float = cosine(f32_embedding, f32_embedding) # 32x SciPy performance on Apple M2 CPU
-distance: float = cosine(f16_embedding, f16_embedding) # 79x SciPy performance on Apple M2 CPU
-distance: float = cosine(i8_embedding, i8_embedding) # 133x SciPy performance on Apple M2 CPU
-distance: float = hamming(b1_embedding, b1_embedding) # 17x SciPy performance on Apple M2 CPU
-```
-
-Similarly, when dealing with large collections (up to billions of entries per server) and looking for high-throughput search, you can [achieve 100x performance improvement][report-usearch] over FAISS and other vector-search solutions using USearch.
-Here are a couple of examples:
-
-```python
-from usearch.index import Index
-
-f32_index = Index(ndim=64, metric='cos', dtype='f32') # for Matryoshka embeddings
-f16_index = Index(ndim=64, metric='cos', dtype='f16') # for Matryoshka embeddings
-i8_index = Index(ndim=256, metric='cos', dtype='i8') # for quantized embeddings
-b1_index = Index(ndim=768, metric='hamming', dtype='b1') # for binary embeddings
-```
-
-[github-usearch]: https://github.com/unum-cloud/usearch
-[github-simsimd]: https://github.com/ashvardanian/simsimd
-[report-usearch]: https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel
-[report-simsimd]: https://ashvardanian.com/posts/python-c-assembly-comparison/
-
### Multi-GPU Parallelism
To achieve higher throughput, you can launch UForm on multiple GPUs.
From ebd7f66ef5f3711c2a2b6f4916f5d9fe7293d271 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 20:25:32 +0000
Subject: [PATCH 34/40] Improve: Refresh CLI for new models
---
python/uform/chat.py | 46 +++++++++++++++++++-------------------------
1 file changed, 20 insertions(+), 26 deletions(-)
diff --git a/python/uform/chat.py b/python/uform/chat.py
index 7bb1737..b9e4423 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -3,11 +3,7 @@
import requests
import torch
from PIL import Image
-from transformers import TextStreamer
-
-from uform.torch_decoders import VLMForCausalLM, VLMProcessor
-
-EOS_TOKEN = 32001
+from transformers import TextStreamer, AutoModel, AutoProcessor
def parse_args():
@@ -30,22 +26,18 @@ def run_chat(opts, model, processor):
messages = [{"role": "system", "content": "You are a helpful assistant."}]
is_first_message = True
+
if opts.image.startswith("http"):
- image = (
- processor.image_processor(
- Image.open(requests.get(opts.image, stream=True).raw),
- )
- .unsqueeze(0)
- .to(torch.bfloat16 if opts.fp16 else torch.float32)
- .to(opts.device)
- )
+ image = Image.open(requests.get(opts.image, stream=True).raw)
else:
- image = (
- processor.image_processor(Image.open(opts.image))
- .unsqueeze(0)
- .to(torch.bfloat16 if opts.fp16 else torch.float32)
- .to(opts.device)
- )
+ image = Image.open(opts.image)
+
+ image = (
+ processor.feature_extractor(image) #
+ .unsqueeze(0)
+ .to(torch.bfloat16 if opts.fp16 else torch.float32)
+ .to(opts.device)
+ )
while True:
if messages[-1]["role"] in ("system", "assistant"):
@@ -68,7 +60,7 @@ def run_chat(opts, model, processor):
1,
input_ids.shape[1] + processor.num_image_latents - 1,
).to(opts.device)
- x = {
+ inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"images": image,
@@ -76,18 +68,19 @@ def run_chat(opts, model, processor):
print("Assistant: ", end="")
with torch.inference_mode():
- y = model.generate(
- **x,
+ output = model.generate(
+ **inputs,
do_sample=False,
use_cache=True,
max_new_tokens=1024,
- eos_token_id=EOS_TOKEN,
+ eos_token_id=151645,
pad_token_id=processor.tokenizer.pad_token_id,
streamer=streamer,
)
print()
- message = processor.batch_decode(y[:, x["input_ids"].shape[1] : -1])[0]
+ prompt_len = inputs["input_ids"].shape[1]
+ message = processor.batch_decode(output[:, prompt_len:-1])[0]
messages.append({"role": "assistant", "content": message})
@@ -95,12 +88,13 @@ def run_chat(opts, model, processor):
def main():
try:
opts = parse_args()
- processor = VLMProcessor.from_pretrained(opts.model)
+ processor = AutoProcessor.from_pretrained(opts.model, trust_remote_code=True)
model = (
- VLMForCausalLM.from_pretrained(
+ AutoModel.from_pretrained(
opts.model,
torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32,
ignore_mismatched_sizes=True,
+ trust_remote_code=True,
)
.eval()
.to(opts.device)
From d00204f817748aaa204975862f9912101ab956f6 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 20:51:50 +0000
Subject: [PATCH 35/40] Docs: Reference for Py and Swift
---
.github/workflows/release.yml | 8 +++-
BENCHMARKS.md | 20 ++++-----
README.md | 34 +++++++--------
docs/_static/custom.js | 2 +-
docs/benchmarks.rst | 5 +++
docs/conf.py | 7 ++-
docs/contributing.rst | 5 +++
docs/index.rst | 24 ++++++++---
docs/javascript/index.rst | 9 ++++
docs/javascript/reference.rst.txt | 18 ++++++++
docs/python/index.rst | 11 +++++
docs/python/reference.rst | 42 ++++++++++++++++++
docs/reference.rst | 6 ---
docs/swift/index.rst | 6 +++
javascript/encoders.mjs | 72 ++++++++++++++++++++++++++++---
python/uform/__init__.py | 37 +++++++++++++---
16 files changed, 251 insertions(+), 55 deletions(-)
create mode 100644 docs/benchmarks.rst
create mode 100644 docs/contributing.rst
create mode 100644 docs/javascript/index.rst
create mode 100644 docs/javascript/reference.rst.txt
create mode 100644 docs/python/index.rst
create mode 100644 docs/python/reference.rst
delete mode 100644 docs/reference.rst
create mode 100644 docs/swift/index.rst
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4170c99..512b641 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -113,10 +113,14 @@ jobs:
uses: actions/checkout@v4
with:
ref: "main"
+ - name: Install dependencies
+ run: |
+ sudo apt update &&
+ sudo apt install -y doxygen graphviz dia git &&
+ pip install sphinx==5.3.0 sphinx-js==3.2.1 breathe==4.35.0 furo==2023.3.27 m2r2==0.3.3.post2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery==4.1 &&
+ npm install -g jsdoc
- name: Setup GitHub Pages
uses: actions/configure-pages@v2
- - name: Install dependencies
- run: sudo apt update && sudo apt install -y doxygen graphviz dia git && pip install sphinx==7.1.2 breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery toml
- name: Install UForm from PyPi
run: pip install uform
- name: Build documentation
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index 9b0fa1d..aa61535 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -8,10 +8,10 @@ Few retrieval benchmarks exist for multimodal embeddings.
The most famous ones for English are "MS-COCO" and "Flickr30k".
Evaluating `uform-vl-english` model, one can expect the following numbers for search quality.
-| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 |
-| :------- | ---------: | ---------: | ----------: |
-| Flickr | 0.727 | 0.915 | 0.949 |
-| MS-COCO¹ | 0.510 | 0.761 | 0.838 |
+| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 |
+| :-------- | ---------: | ---------: | ----------: |
+| Flickr | 0.727 | 0.915 | 0.949 |
+| MS-COCO ¹ | 0.510 | 0.761 | 0.838 |
For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository².
Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model.
@@ -26,9 +26,7 @@ Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the f
| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
-
-All languages.
-
+All languages:
| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: |
@@ -59,8 +57,6 @@ Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the f
| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - |
| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - |
-
-
### Generative Models
| Model | LLM Size | SQA | MME | MMBench | Average¹ |
@@ -75,13 +71,13 @@ For captioning evaluation we measure CLIPScore and RefCLIPScore³.
| :---------------------------------- | ---: | -------------: | --------: | -----------: |
| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 |
| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 |
-| |
+| | | | | |
| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 |
| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 |
-| |
+| | | | | |
| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 |
| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 |
-| |
+| | | | | |
| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 |
| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 |
diff --git a/README.md b/README.md
index 4c66d29..8484b0f 100755
--- a/README.md
+++ b/README.md
@@ -57,9 +57,9 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github.
### Embedding Models
-
+
-
+
Model |
Parameters |
Languages |
@@ -68,25 +68,25 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github.
- uform3-image-text-english-large 🆕 |
- 365M |
+ uform3-image-text-english-large 🆕 |
+ 365 M |
1 |
12 layer BERT, ViT-L/14 |
- uform3-image-text-english-base |
- 143M |
+ uform3-image-text-english-base |
+ 143 M |
1 |
4 layer BERT, ViT-B/16 |
- uform3-image-text-english-small 🆕 |
- 79M |
+ uform3-image-text-english-small 🆕 |
+ 79 M |
1 |
4 layer BERT, ViT-S/16 |
- uform3-image-text-multilingual-base |
+ uform3-image-text-multilingual-base |
206M |
21 |
12 layer BERT, ViT-B/16 |
@@ -96,9 +96,9 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github.
### Generative Models
-
+
-
+
Model |
Parameters |
Purpose |
@@ -107,20 +107,20 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github.
- uform-gen2-dpo 🆕 |
- 1.2B |
+ uform-gen2-dpo 🆕 |
+ 1.2 B |
Chat, Image Captioning, VQA |
qwen1.5-0.5B, ViT-H/14 |
- uform-gen2-qwen-500m |
- 1.2B |
+ uform-gen2-qwen-500m |
+ 1.2 B |
Chat, Image Captioning, VQA |
qwen1.5-0.5B, ViT-H/14 |
- uform-gen |
- 1.5B |
+ uform-gen ⚠️ |
+ 1.5 B |
Image Captioning, VQA |
llama-1.3B, ViT-B/16 |
diff --git a/docs/_static/custom.js b/docs/_static/custom.js
index b909a1d..3dd0974 100644
--- a/docs/_static/custom.js
+++ b/docs/_static/custom.js
@@ -3,5 +3,5 @@ $(document).ready(function () {
`
- $(".sidebar-brand-text").html("Unum · UForm
$(VERSION)" + github_logo)
+ $(".sidebar-brand-text").html("Unum · UForm
2.1.1" + github_logo)
})
diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
new file mode 100644
index 0000000..7683788
--- /dev/null
+++ b/docs/benchmarks.rst
@@ -0,0 +1,5 @@
+====================
+Benchmarks
+====================
+
+.. mdinclude:: ../BENCHMARKS.md
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index acc061e..f9061f5 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -5,12 +5,11 @@
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-import toml
project = "Unum · UForm"
copyright = "2023, Unum"
author = "Unum"
-release = toml.load("../pyproject.toml")["project"]["version"]
+release = open("../VERSION", "r").read().strip()
with open("_static/custom.js", "r+") as js:
content = js.read()
js.seek(0)
@@ -24,6 +23,7 @@
"breathe",
"m2r2",
"sphinx.ext.autodoc",
+ "sphinx_js",
"sphinx.ext.autosummary",
"sphinx.ext.intersphinx",
"sphinx.ext.napoleon",
@@ -44,6 +44,9 @@
html_static_path = ["_static"]
html_css_files = ["custom.css"]
html_js_files = ["custom.js"]
+html_baseurl = "/docs/uform/"
breathe_projects = {"UForm": "../build/xml"}
breathe_default_project = "UForm"
+
+js_source_path = "../javascript/"
diff --git a/docs/contributing.rst b/docs/contributing.rst
new file mode 100644
index 0000000..48893cf
--- /dev/null
+++ b/docs/contributing.rst
@@ -0,0 +1,5 @@
+====================
+Contributing
+====================
+
+.. mdinclude:: ../CONTRIBUTING.md
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
index 162bbee..d3da0ec 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,11 +1,25 @@
-==========
+====================
Overview
-==========
+====================
.. mdinclude:: ../README.md
-.. toctree::
+.. toctree::
:hidden:
+ :caption: �
+
+ python/index
+ javascript/index
+ swift/index
+
+.. toctree::
+ :hidden:
+ :caption: �
+
+ contributing
+ benchmarks
+
+.. toctree::
+ :hidden:
+ :caption: �
- self
- reference
genindex
diff --git a/docs/javascript/index.rst b/docs/javascript/index.rst
new file mode 100644
index 0000000..771081c
--- /dev/null
+++ b/docs/javascript/index.rst
@@ -0,0 +1,9 @@
+====================
+JavaScript SDK
+====================
+
+
+.. mdinclude:: ../../javascript/README.md
+
+.. toctree::
+ :hidden:
diff --git a/docs/javascript/reference.rst.txt b/docs/javascript/reference.rst.txt
new file mode 100644
index 0000000..356176a
--- /dev/null
+++ b/docs/javascript/reference.rst.txt
@@ -0,0 +1,18 @@
+API Reference
+====================
+
+====================
+Encoders
+====================
+
+.. js:autoclass:: ../javascript/encoders.TextProcessor
+ :members:
+
+.. js:autoclass:: ../javascript/encoders.ImageProcessor
+ :members:
+
+.. js:autoclass:: ../javascript/encoders.TextEncoder
+ :members:
+
+.. js:autoclass:: ../javascript/encoders.ImageEncoder
+ :members:
diff --git a/docs/python/index.rst b/docs/python/index.rst
new file mode 100644
index 0000000..5f870d1
--- /dev/null
+++ b/docs/python/index.rst
@@ -0,0 +1,11 @@
+====================
+Python SDK
+====================
+
+
+.. mdinclude:: ../../python/README.md
+
+.. toctree::
+ :hidden:
+
+ reference
\ No newline at end of file
diff --git a/docs/python/reference.rst b/docs/python/reference.rst
new file mode 100644
index 0000000..d580583
--- /dev/null
+++ b/docs/python/reference.rst
@@ -0,0 +1,42 @@
+API Reference
+====================
+
+====================
+Root
+====================
+
+.. automodule:: uform
+ :members:
+ :undoc-members:
+
+====================
+Torch Encoders
+====================
+
+.. automodule:: uform.torch_encoders
+ :members:
+ :undoc-members:
+
+====================
+Torch Processors
+====================
+
+.. automodule:: uform.torch_processors
+ :members:
+ :undoc-members:
+
+====================
+ONNX Encoders
+====================
+
+.. automodule:: uform.onnx_encoders
+ :members:
+ :undoc-members:
+
+====================
+NumPy Processors
+====================
+
+.. automodule:: uform.numpy_processors
+ :members:
+ :undoc-members:
diff --git a/docs/reference.rst b/docs/reference.rst
deleted file mode 100644
index 5828f41..0000000
--- a/docs/reference.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-API Reference
-==============
-
-.. automodule:: uform
- :members:
- :undoc-members:
diff --git a/docs/swift/index.rst b/docs/swift/index.rst
new file mode 100644
index 0000000..5f2e213
--- /dev/null
+++ b/docs/swift/index.rst
@@ -0,0 +1,6 @@
+====================
+Swift SDK
+====================
+
+
+.. mdinclude:: ../../swift/README.md
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index a37b326..81af1ae 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -3,10 +3,17 @@ import { InferenceSession, Tensor } from 'onnxruntime-node';
import { PreTrainedTokenizer } from '@xenova/transformers';
import sharp from 'sharp';
-import { getModel, Modality } from "./hub.mjs";
-
+/**
+ * A processor for text data that prepares input for the text encoder model.
+ */
class TextProcessor {
+ /**
+ * Constructs a new TextProcessor instance.
+ *
+ * @param {string} configPath - The path to the configuration file for the text encoder.
+ * @param {string} tokenizerPath - The path to the tokenizer configuration file.
+ */
constructor(configPath, tokenizerPath) {
this.configPath = configPath;
this.tokenizerPath = tokenizerPath;
@@ -16,6 +23,9 @@ class TextProcessor {
this.tokenizer = null;
}
+ /**
+ * Initializes the TextProcessor by loading configurations and setting up the tokenizer.
+ */
async init() {
var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' }));
if (config.text_encoder !== undefined) {
@@ -31,6 +41,12 @@ class TextProcessor {
this.tokenizer.pad_token_id = this.padTokenIdx;
}
+ /**
+ * Processes a list of text strings into model-ready format, including padding and attention masks.
+ *
+ * @param {Array} texts - An array of text strings to process.
+ * @return {Object} The processed texts as model input features.
+ */
async process(texts) {
const encoded = await this.tokenizer(texts, {
@@ -48,17 +64,31 @@ class TextProcessor {
}
}
+/**
+ * An encoder for text data that uses a pre-trained model to encode text.
+ */
class TextEncoder {
- constructor(modelPath, processor = null) {
+ /**
+ * Constructs a new TextEncoder instance.
+ *
+ * @param {string} modelPath - The path to the pre-trained ONNX model.
+ */
+ constructor(modelPath) {
this.modelPath = modelPath;
this.session = null;
}
+ /**
+ * Initializes the ONNX session with the pre-trained model.
+ */
async init() {
this.session = await InferenceSession.create(this.modelPath);
}
+ /**
+ * Releases the ONNX session resources.
+ */
async dispose() {
if (this.session) {
await this.session.release();
@@ -66,6 +96,12 @@ class TextEncoder {
}
}
+ /**
+ * Encodes the input data using the pre-trained model.
+ *
+ * @param {Object} inputs - The input data containing input_ids and attention_mask.
+ * @return {Object} The encoded outputs from the model.
+ */
async encode(inputs) {
if (!this.session) {
throw new Error("Session is not initialized.");
@@ -109,12 +145,17 @@ class TextEncoder {
}
-
+/**
+ * A processor for image data that prepares images for the image encoder model.
+ */
class ImageProcessor {
constructor(configPath) {
this.configPath = configPath;
}
+ /**
+ * Initializes the ImageProcessor by loading configuration settings for image preprocessing.
+ */
async init() {
var config = JSON.parse(readFileSync(this.configPath, 'utf8'));
if (config.image_encoder !== undefined) {
@@ -128,6 +169,12 @@ class ImageProcessor {
this.imageMean = new Float32Array(this.normalizationMeans);
this.imageStd = new Float32Array(this.normalizationDeviations);
}
+ /**
+ * Processes raw image data into a model-ready format, including resizing, cropping, and normalizing.
+ *
+ * @param {Buffer|Array} images - A single image or an array of images to process.
+ * @return {Array} The processed image data as an array of Float32Arrays.
+ */
async process(images) {
const processSingle = async (image) => {
let img = sharp(image).toColorspace('srgb');
@@ -174,16 +221,25 @@ class ImageProcessor {
}
}
+/**
+ * An encoder for image data that uses a pre-trained model to encode images.
+ */
class ImageEncoder {
constructor(modelPath, processor) {
this.modelPath = modelPath;
this.imageSize = processor.imageSize;
}
+ /**
+ * Initializes the ONNX session with the pre-trained model.
+ */
async init() {
this.session = await InferenceSession.create(this.modelPath);
}
+ /**
+ * Releases the ONNX session resources.
+ */
async dispose() {
if (this.session) {
await this.session.release();
@@ -191,6 +247,12 @@ class ImageEncoder {
}
}
+ /**
+ * Encodes the processed image data using the pre-trained model.
+ *
+ * @param {Float32Array|Array} images - The processed image data.
+ * @return {Object} The encoded outputs from the model.
+ */
async encode(images) {
if (!this.session) {
throw new Error("Session is not initialized.");
@@ -220,7 +282,7 @@ class ImageEncoder {
let dims;
if (Array.isArray(images)) {
- // Assuming each images in the array is a Float32Array representing an image already processed to a fixed size.
+ // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size.
const arrays = images.map(ensureFloat32Array);
imagesData = concatFloat32Arrays(arrays);
const numImages = arrays.length;
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 8f0a30b..99d13c1 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -14,7 +14,7 @@ class Modality(Enum):
TEXT_DECODER = "text_decoder"
-def normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]:
+def _normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]:
if modalities is None:
return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER)
@@ -36,7 +36,7 @@ def get_checkpoint(
:return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path
"""
- modalities = normalize_modalities(modalities)
+ modalities = _normalize_modalities(modalities)
# It is not recommended to use `.pth` extension when checkpointing models
# because it collides with Python path (`.pth`) configuration files.
@@ -98,10 +98,19 @@ def get_model_torch(
device: Literal["cpu", "cuda"] = "cpu",
modalities: Optional[Tuple[Union[str, Modality]]] = None,
) -> Tuple[Dict[Modality, Callable], Dict]:
+ """
+ Fetches and constructs a PyTorch model with its processors based on provided modalities.
+
+ :param model_name: The identifier of the model on the Hugging Face Hub.
+ :param token: Optional API token for authenticated access to the model.
+ :param device: The device to load the model onto ('cpu' or 'cuda').
+ :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder).
+ :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
+ """
from uform.torch_encoders import TextEncoder, ImageEncoder
from uform.torch_processors import TextProcessor, ImageProcessor
- modalities = normalize_modalities(modalities)
+ modalities = _normalize_modalities(modalities)
config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt")
result_processors = {}
@@ -131,10 +140,19 @@ def get_model_onnx(
token: Optional[str] = None,
modalities: Optional[Tuple[str]] = None,
):
+ """
+ Fetches and constructs an ONNX model with its processors based on provided modalities.
+
+ :param model_name: The identifier of the model on the Hugging Face Hub.
+ :param device: The device on which the model will operate ('cpu' or 'cuda').
+ :param token: Optional API token for authenticated access to the model.
+ :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder).
+ :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
+ """
from uform.onnx_encoders import TextEncoder, ImageEncoder
from uform.numpy_processors import TextProcessor, ImageProcessor
- modalities = normalize_modalities(modalities)
+ modalities = _normalize_modalities(modalities)
config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx")
result_processors = {}
@@ -163,7 +181,16 @@ def get_model(
modalities: Optional[Tuple[str, Modality]] = None, # all by default
token: Optional[str] = None, # optional HuggingFace Hub token for private models
) -> Tuple[Dict[Modality, Callable], Dict]:
-
+ """
+ Fetches a model and its processors from the Hugging Face Hub, using either the ONNX or Torch backend.
+
+ :param model_name: The identifier of the model on the Hugging Face Hub.
+ :param device: The device to load the model onto ('cpu' or 'cuda').
+ :param backend: The backend framework to use ('onnx' or 'torch').
+ :param modalities: A tuple specifying the types of model components to fetch.
+ :param token: Optional API token for authenticated access to the model.
+ :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
+ """
if backend == "onnx":
return get_model_onnx(model_name, device=device, token=token, modalities=modalities)
elif backend == "torch":
From c6f773c8249b8126e64393fff14d59c201d18b0b Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 23:50:47 +0100
Subject: [PATCH 36/40] Docs: Typo
Co-authored-by: Joshua Lochner
---
javascript/encoders_test.js | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
index a0a70b2..1785703 100644
--- a/javascript/encoders_test.js
+++ b/javascript/encoders_test.js
@@ -101,7 +101,7 @@ async function tryImageEncoderForwardPass(modelId) {
}
function cosineSimilarity(vecA, vecB) {
- // We may be receiving a complex tesnor type, so let's check if it
+ // We may be receiving a complex tensor type, so let's check if it
// has an array member named `data`.
if (vecA.data) {
vecA = vecA.data;
From 6d4b6149700a45fb117c841f1a92f6497cdc46a4 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 23:51:37 +0100
Subject: [PATCH 37/40] Improve: Backend-agnostic `.data` extraction in JS
Co-authored-by: Joshua Lochner
---
javascript/encoders_test.js | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
index 1785703..30ea96a 100644
--- a/javascript/encoders_test.js
+++ b/javascript/encoders_test.js
@@ -180,8 +180,8 @@ async function tryCrossReferencingImageAndText(modelId) {
const textEmbedding = await textEncoder.encode(processedText);
const imageEmbedding = await imageEncoder.encode(processedImage);
- textEmbeddings.push(new Float32Array(textEmbedding.embeddings.cpuData));
- imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.cpuData));
+ textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data));
+ imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data));
// Print-based debugging at its best :)
// console.log(`Text: ${text}, Image: ${imageUrl}`);
From cf2516045a0e5c9f199c39907f000a2fb8c49fcb Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 23:54:17 +0100
Subject: [PATCH 38/40] Fix: `add_special_tokens` argument in JS
Co-authored-by: Joshua Lochner
---
javascript/encoders.mjs | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
index 81af1ae..3c41636 100644
--- a/javascript/encoders.mjs
+++ b/javascript/encoders.mjs
@@ -50,8 +50,7 @@ class TextProcessor {
async process(texts) {
const encoded = await this.tokenizer(texts, {
- addSpecialTokens: true,
- returnAttentionMask: true,
+ add_special_tokens: true,
padding: 'max_length',
max_length: this.maxSeqLen,
truncation: true,
From 917a4a868e9450597f546f33c0a4e26083b83498 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 24 Apr 2024 23:59:16 +0000
Subject: [PATCH 39/40] Improve: Multi-GPU support in Py
---
python/README.md | 28 ++++-
python/scripts/test_encoders.py | 175 +++++++++++++++++++------------
python/uform/__init__.py | 10 +-
python/uform/numpy_processors.py | 6 +-
python/uform/onnx_encoders.py | 4 +-
python/uform/shared.py | 26 +++++
python/uform/torch_encoders.py | 20 ++--
python/uform/torch_processors.py | 6 +-
8 files changed, 182 insertions(+), 93 deletions(-)
create mode 100644 python/uform/shared.py
diff --git a/python/README.md b/python/README.md
index 621bee0..dd7611d 100644
--- a/python/README.md
+++ b/python/README.md
@@ -99,13 +99,33 @@ For that pick the encoder of the model you want to run in parallel, and wrap it
```python
from uform import get_model, Modality
+import torch.nn as nn
-encoders, processors = uform.get_model('unum-cloud/uform-vl-english-small', backend='torch', device='gpu')
+processors, models = get_model('unum-cloud/uform-vl-english-small', backend='torch')
-encoder_image = encoders[Modality.IMAGE_ENCODER]
-encoder_image = nn.DataParallel(encoder_image)
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
-_, res = encoder_image(images, 0)
+model_text.return_features = False
+model_image.return_features = False
+model_text_parallel = nn.DataParallel(model_text)
+model_image_parallel = nn.DataParallel(model_image)
+```
+
+Since we are now dealing with the PyTorch wrapper, use the `forward` method (instead of `encode`) to get the embeddings, and call `.detach().cpu().numpy()` to bring the results back as NumPy arrays.
+
+```python
+from typing import List
+
+from PIL.Image import Image
+
+def get_image_embedding(images: List[Image]):
+ preprocessed = processor_image(images)
+ embedding = model_image_parallel.forward(preprocessed)
+ return embedding.detach().cpu().numpy()
+
+def get_text_embedding(texts: List[str]):
+ preprocessed = processor_text(texts)
+ embedding = model_text_parallel.forward(preprocessed)
+ return embedding.detach().cpu().numpy()
```
### ONNX and CUDA
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
index 274ed6c..20caed2 100644
--- a/python/scripts/test_encoders.py
+++ b/python/scripts/test_encoders.py
@@ -1,3 +1,4 @@
+from functools import wraps
from typing import Tuple
import requests
from io import BytesIO
@@ -7,7 +8,7 @@
import numpy as np
from PIL import Image
-from uform import Modality, get_model, get_model_onnx
+from uform import Modality, get_model, ExecutionProviderError
# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
try:
@@ -49,6 +50,21 @@
token = file.read().strip()
+def skip_on(exception, reason="No good reason :)"):
+ def decorator_func(f):
+ @wraps(f)
+ def wrapper(*args, **kwargs):
+ try:
+ # Try to run the test
+ return f(*args, **kwargs)
+ except exception:
+ pytest.skip(reason)
+
+ return wrapper
+
+ return decorator_func
+
+
def cosine_similarity(x, y) -> float:
if not isinstance(x, np.ndarray):
x = x.detach().numpy()
@@ -61,7 +77,7 @@ def cosine_similarity(x, y) -> float:
return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
-def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding):
+def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding, batch_size_multiple: int = 1):
"""Test if the embeddings of text and image are semantically similar
using a small set of example text-image pairs."""
@@ -80,30 +96,27 @@ def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embed
"https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
"https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
]
+ assert len(texts) == len(image_urls), "Number of texts and images should be the same."
- text_embeddings = []
- image_embeddings = []
-
- for text, image_url in zip(texts, image_urls):
- # Download and open the image
- response = requests.get(image_url)
- image = Image.open(BytesIO(response.content))
+ images = [Image.open(BytesIO(requests.get(image_url).content)) for image_url in image_urls]
+ count_pairs = len(texts)
- # Get embeddings
- text_embedding = text_to_embedding(text)
- image_embedding = image_to_embedding(image)
+ # Ensure we have a sufficiently large batch
+ texts = texts * batch_size_multiple
+ images = images * batch_size_multiple
- text_embeddings.append(text_embedding)
- image_embeddings.append(image_embedding)
+ # Compute the embedding in a batch fashion
+ text_embeddings = text_to_embedding(texts)
+ image_embeddings = image_to_embedding(images)
# Evaluate cosine similarity
- for i in range(len(texts)):
+ for i in range(count_pairs):
pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i])
other_text_similarities = [
- cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(len(texts)) if j != i
+ cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(count_pairs) if j != i
]
other_image_similarities = [
- cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(len(texts)) if j != i
+ cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(count_pairs) if j != i
]
assert pair_similarity > max(
@@ -171,79 +184,109 @@ def test_torch_many_embeddings(model_name: str, batch_size: int):
@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
@pytest.mark.parametrize("model_name", onnx_models)
@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+@skip_on(ExecutionProviderError, reason="Missing execution provider")
def test_onnx_one_embedding(model_name: str, device: str):
- from uform.onnx_encoders import ExecutionProviderError
-
- try:
-
- processors, models = get_model(model_name, token=token, device=device, backend="onnx")
- model_text = models[Modality.TEXT_ENCODER]
- model_image = models[Modality.IMAGE_ENCODER]
- processor_text = processors[Modality.TEXT_ENCODER]
- processor_image = processors[Modality.IMAGE_ENCODER]
-
- text = "a small red panda in a zoo"
- image_path = "assets/unum.png"
+ processors, models = get_model(model_name, token=token, device=device, backend="onnx")
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
- image = Image.open(image_path)
- image_data = processor_image(image)
- text_data = processor_text(text)
+ text = "a small red panda in a zoo"
+ image_path = "assets/unum.png"
- image_features, image_embedding = model_image.encode(image_data)
- text_features, text_embedding = model_text.encode(text_data)
+ image = Image.open(image_path)
+ image_data = processor_image(image)
+ text_data = processor_text(text)
- assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
- assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
+ image_features, image_embedding = model_image.encode(image_data)
+ text_features, text_embedding = model_text.encode(text_data)
- # Nested fucntions are easier to debug, than lambdas
- def get_image_embedding(image_data):
- features, embedding = model_image.encode(processor_image(image_data))
- return embedding
+ assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
+ assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
- def get_text_embedding(text_data):
- features, embedding = model_text.encode(processor_text(text_data))
- return embedding
+ # Nested functions are easier to debug than lambdas
+ def get_image_embedding(image_data):
+ features, embedding = model_image.encode(processor_image(image_data))
+ return embedding
- # Test if the model outputs actually make sense
- cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding)
+ def get_text_embedding(text_data):
+ features, embedding = model_text.encode(processor_text(text_data))
+ return embedding
- except ExecutionProviderError as e:
- pytest.skip(f"Execution provider error: {e}")
+ # Test if the model outputs actually make sense
+ cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding)
@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
@pytest.mark.parametrize("model_name", onnx_models)
@pytest.mark.parametrize("batch_size", [1, 2])
@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+@skip_on(ExecutionProviderError, reason="Missing execution provider")
def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
- from uform.onnx_encoders import ExecutionProviderError
+ processors, models = get_model(model_name, token=token, device=device, backend="onnx")
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
- try:
+ texts = ["a small red panda in a zoo"] * batch_size
+ image_paths = ["assets/unum.png"] * batch_size
+
+ images = [Image.open(path) for path in image_paths]
+ image_data = processor_image(images)
+ text_data = processor_text(texts)
- processors, models = get_model(model_name, token=token, device=device, backend="onnx")
- model_text = models[Modality.TEXT_ENCODER]
- model_image = models[Modality.IMAGE_ENCODER]
- processor_text = processors[Modality.TEXT_ENCODER]
- processor_image = processors[Modality.IMAGE_ENCODER]
+ image_embeddings = model_image.encode(image_data, return_features=False)
+ text_embeddings = model_text.encode(text_data, return_features=False)
- texts = ["a small red panda in a zoo"] * batch_size
- image_paths = ["assets/unum.png"] * batch_size
+ assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
+ assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
- images = [Image.open(path) for path in image_paths]
- image_data = processor_image(images)
- text_data = processor_text(texts)
- image_embeddings = model_image.encode(image_data, return_features=False)
- text_embeddings = model_text.encode(text_data, return_features=False)
+@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
+@pytest.mark.parametrize("model_name", torch_models[:1])
+def test_torch_multi_gpu(model_name: str):
- assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
- assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
+ count_cuda_devices = torch.cuda.device_count()
+ if count_cuda_devices < 2:
+ pytest.skip("Not enough CUDA devices to run multi-GPU test")
- except ExecutionProviderError as e:
- pytest.skip(f"Execution provider error: {e}")
+ processors, models = get_model(model_name, token=token, backend="torch", device="cuda")
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
+ import torch.nn as nn
+
+ model_text.return_features = False
+ model_image.return_features = False
+ model_text_parallel = nn.DataParallel(model_text)
+ model_image_parallel = nn.DataParallel(model_image)
+
+ # Nested functions are easier to debug than lambdas
+ def get_image_embedding(image_data):
+ preprocessed = processor_image(image_data)
+ embedding = model_image_parallel.forward(preprocessed)
+ return embedding.detach().cpu().numpy()
+
+ def get_text_embedding(text_data):
+ preprocessed = processor_text(text_data)
+ embedding = model_text_parallel.forward(preprocessed)
+ return embedding.detach().cpu().numpy()
+
+ # Test if the model outputs actually make sense
+ cross_references_image_and_text_embeddings(
+ get_text_embedding,
+ get_image_embedding,
+ batch_size_multiple=count_cuda_devices,
+ )
if __name__ == "__main__":
- pytest.main(["-s", "-x", __file__])
+ # If you want to run this test file individually, you can do so by running:
+ # pytest.main(["-s", "-x", __file__])
+ pass
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 99d13c1..7af8b75 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,17 +1,9 @@
from os.path import join, exists
from typing import Dict, Optional, Tuple, Literal, Union, Callable
-from enum import Enum
from huggingface_hub import snapshot_download, utils
-from uform.onnx_encoders import ExecutionProviderError
-
-
-class Modality(Enum):
- TEXT_ENCODER = "text_encoder"
- IMAGE_ENCODER = "image_encoder"
- VIDEO_ENCODER = "video_encoder"
- TEXT_DECODER = "text_decoder"
+from uform.shared import ExecutionProviderError, Modality
def _normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]:
diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py
index 3782c26..166ecf4 100644
--- a/python/uform/numpy_processors.py
+++ b/python/uform/numpy_processors.py
@@ -6,6 +6,8 @@
from tokenizers import Tokenizer
import numpy as np
+from uform.shared import read_config
+
class TextProcessor:
def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
@@ -14,7 +16,7 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
:param tokenizer_path: path to tokenizer file
"""
- config = json.load(open(config_path, "r"))
+ config = read_config(config_path)
if "text_encoder" in config:
config = config["text_encoder"]
@@ -60,7 +62,7 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None):
:param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
"""
- config = json.load(open(config_path, "r"))
+ config = read_config(config_path)
if "image_encoder" in config:
config = config["image_encoder"]
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
index d2668b9..b9c4cc4 100644
--- a/python/uform/onnx_encoders.py
+++ b/python/uform/onnx_encoders.py
@@ -5,9 +5,7 @@
import onnxruntime as ort
from numpy import ndarray
-
-class ExecutionProviderError(Exception):
- """Exception raised when a requested execution provider is not available."""
+from uform.shared import ExecutionProviderError
def available_providers(device: Optional[str]) -> Tuple[str, ...]:
diff --git a/python/uform/shared.py b/python/uform/shared.py
new file mode 100644
index 0000000..37d256b
--- /dev/null
+++ b/python/uform/shared.py
@@ -0,0 +1,26 @@
+from enum import Enum
+from typing import Union
+from os import PathLike
+import json
+
+
+class Modality(Enum):
+ TEXT_ENCODER = "text_encoder"
+ IMAGE_ENCODER = "image_encoder"
+ VIDEO_ENCODER = "video_encoder"
+ TEXT_DECODER = "text_decoder"
+
+
+class ExecutionProviderError(Exception):
+ """Exception raised when a requested execution provider is not available."""
+
+
+ConfigOrPath = Union[PathLike, str, object]
+
+
+def read_config(path_or_object: ConfigOrPath) -> object:
+ if isinstance(path_or_object, (PathLike, str)):
+ with open(path_or_object, "r") as f:
+ return json.load(f)
+ else:
+ return path_or_object
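`read_config` is what lets the processors and encoders in the rest of this patch accept either a path to a JSON config or an already-parsed dictionary. A short sketch of both call styles; the path and dictionary below are illustrative:

```python
from uform.shared import read_config

# Path-like input: the JSON file is opened and parsed.
config = read_config("models/config.json")  # illustrative path

# Already-parsed input: returned as-is, with no file I/O.
inline = {"image_encoder": {"dim": 256}}  # illustrative structure
assert read_config(inline) is inline
```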
diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py
index c149088..89f6631 100644
--- a/python/uform/torch_encoders.py
+++ b/python/uform/torch_encoders.py
@@ -3,7 +3,6 @@
from dataclasses import dataclass
from os import PathLike
from typing import Dict, Optional, Union, Mapping, Any, Tuple
-import json
import torch
import torch.nn as nn
@@ -11,6 +10,15 @@
from torch import Tensor
from PIL.Image import Image
+from uform.shared import read_config
+
+
+def _is_on_gpu(model: nn.Module) -> bool:
+ try:
+ return next(model.parameters()).device.type == "cuda"
+ except StopIteration:
+ return False
+
@dataclass(eq=False)
class Attention(nn.Module):
@@ -266,7 +274,7 @@ def forward(
attention_mask = torch.ones_like(x)
# If the model is on the GPU and the input matrices are not, shift them there
- if next(self.parameters()).device.type == "cuda" and x.device.type != "cuda":
+ if _is_on_gpu(self) and not x.is_cuda:
x = x.cuda()
attention_mask = attention_mask.cuda()
@@ -298,8 +306,7 @@ def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike,
:param config: the configuration dictionary or path to the JSON configuration file
:param model: the model state dictionary or path to the `.pt` model file
"""
- if isinstance(config, (PathLike, str)):
- config = json.load(open(config, "r"))
+ config = read_config(config)
if "text_encoder" in config:
config = config["text_encoder"]
@@ -374,7 +381,7 @@ def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None
x = x["images"]
# If the model is on the GPU and the input matrices are not, shift them there
- if next(self.parameters()).device.type == "cuda" and x.device.type != "cuda":
+ if _is_on_gpu(self) and not x.is_cuda:
x = x.cuda()
features = self.forward_features(x)
@@ -401,8 +408,7 @@ def from_pretrained(
:param config: the configuration dictionary or path to the JSON configuration file
:param model: the model state dictionary or path to the `.pt` model file
"""
- if isinstance(config, (PathLike, str)):
- config = json.load(open(config, "r"))
+ config = read_config(config)
if "image_encoder" in config:
config = config["image_encoder"]
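`_is_on_gpu` only refactors how the device check is phrased; the behavior stays "if the encoder's parameters live on CUDA and the input tensor does not, move the input there". A standalone sketch of the same guard, with a toy module standing in for an encoder:

```python
import torch
import torch.nn as nn


def _is_on_gpu(model: nn.Module) -> bool:
    try:
        return next(model.parameters()).device.type == "cuda"
    except StopIteration:  # parameter-free module
        return False


encoder = nn.Linear(4, 4)  # toy stand-in for a real encoder
x = torch.randn(2, 4)      # CPU input tensor

if _is_on_gpu(encoder) and not x.is_cuda:
    x = x.cuda()  # only taken when the encoder has been moved to CUDA
```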
diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py
index b61b224..79c7e87 100644
--- a/python/uform/torch_processors.py
+++ b/python/uform/torch_processors.py
@@ -15,6 +15,8 @@
ToTensor,
)
+from uform.shared import read_config
+
# lambda is not pickle-able
def convert_to_rgb(image):
@@ -28,7 +30,7 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
:param tokenizer_path: path to tokenizer file
"""
- config = json.load(open(config_path, "r"))
+ config = read_config(config_path)
if "text_encoder" in config:
config = config["text_encoder"]
@@ -75,7 +77,7 @@ def __init__(self, config_path: PathLike):
:param config: model config
"""
- config = json.load(open(config_path, "r"))
+ config = read_config(config_path)
if "image_encoder" in config:
config = config["image_encoder"]
From f195b667a49c0be802780f4075bc842dd932408a Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Thu, 25 Apr 2024 01:08:24 +0000
Subject: [PATCH 40/40] Add: Parallel decoding bench
---
.vscode/launch.json | 14 +++++++-
BENCHMARKS.md | 23 ++++++++----
python/scripts/bench_decoders.py | 61 ++++++++++++++++++++++----------
3 files changed, 72 insertions(+), 26 deletions(-)
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 3343a11..92a1844 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -5,12 +5,24 @@
"version": "0.2.0",
"configurations": [
{
- "name": "Python Debugger: Current File with Arguments",
+ "name": "Python Debugger",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
},
+ {
+ "name": "PyTest Debugger",
+ "type": "debugpy",
+ "request": "launch",
+ "program": "pytest",
+ "console": "integratedTerminal",
+ "args": [
+ "${file}",
+ "-s",
+ "-x",
+ ],
+ },
{
"name": "NodeJS Debugger",
"type": "node-terminal",
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
index aa61535..07ff0bb 100644
--- a/BENCHMARKS.md
+++ b/BENCHMARKS.md
@@ -157,17 +157,26 @@ usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE
options:
-h, --help show this help message and exit
- --filter-out FILTER_OUT
- Filter out models, backends, or devices with a Regular Expression.
--batch-size BATCH_SIZE
Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
+ --max-length MAX_LENGTH
+ Maximum length of the generated text in tokens.
```
+On Nvidia H100 GPU, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
+
+| Model | Size | Decoding Speed | Decoding Parallel Streams |
+| :---------------------------------- | ----: | -------------: | ---------------------------: |
+| `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 141 tokens/s | ~ 4 K tokens/s (32 streams) |
+| `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 211 tokens/s | ~ 2 K tokens/s (32 streams) |
+| `unum-cloud/uform-gen` | 1.5 B | ~ 252 tokens/s | ~ 3 K tokens/s (128 streams) |
+| `unum-cloud/uform-gen2-dpo` | 1.2 B | ~ 372 tokens/s | ~ 10 K tokens/s (64 streams) |
+
On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
-| Model | Size | Speed | Speedup |
-| :---------------------------------- | ---: | ------------------: | --------: |
-| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | |
-| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | |
-| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ |
+| Model | Size | Decoding Speed | Speedup |
+| :---------------------------------- | ----: | -------------: | --------: |
+| `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 40 tokens/s | |
+| `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 40 tokens/s | |
+| `unum-cloud/uform-gen` | 1.5 B | ~ 140 tokens/s | __x 3.5__ |
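For the parallel columns, per-stream throughput is simply the aggregate rate divided by the stream count; for example, checking the `uform-gen2-dpo` row of the H100 table:

```python
# Per-stream decoding rate implied by the H100 table above.
aggregate_tokens_per_s = 10_000  # ~10 K tokens/s
streams = 64
print(f"{aggregate_tokens_per_s / streams:.0f} tokens/s per stream")  # ~156
```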
diff --git a/python/scripts/bench_decoders.py b/python/scripts/bench_decoders.py
index 4241ee6..0842ba9 100644
--- a/python/scripts/bench_decoders.py
+++ b/python/scripts/bench_decoders.py
@@ -34,8 +34,16 @@ class BenchmarkResult:
duration_text_embedding: float
-def caption(model, processor, prompt: str, image: Image.Image) -> str:
- inputs = processor(prompt, image, return_tensors="pt")
+def caption(model, processor, prompt: str, image: Image.Image, max_length: int, batch_size: int) -> List[str]:
+ # Processors disagree on positional order: some expect (prompt, image), others (image, prompt)
+ prompt = [prompt] * batch_size
+ image = [image] * batch_size
+ try:
+ inputs = processor(prompt, image, return_tensors="pt")
+ except ValueError:
+ inputs = processor(image, prompt, return_tensors="pt")
+
+ # Downcast and move to device
for possible_key in ["images", "pixel_values"]:
if possible_key not in inputs:
continue
@@ -47,16 +55,16 @@ def caption(model, processor, prompt: str, image: Image.Image) -> str:
**inputs,
do_sample=False,
# use_cache=True,
- max_new_tokens=128,
+ max_new_tokens=max_length,
eos_token_id=32001,
pad_token_id=processor.tokenizer.pad_token_id,
)
prompt_len = inputs["input_ids"].shape[1]
- decoded_text = processor.batch_decode(
+ decoded_texts = processor.batch_decode(
output[:, prompt_len:],
skip_special_tokens=True,
- )[0].strip()
- return decoded_text
+ )
+ return decoded_texts
def duration(callable):
@@ -72,25 +80,34 @@ def bench_captions(
processor,
prompt: str,
images: List[Image.Image],
+ max_length: int = 256,
+ batch_size: int = 10,
) -> List[str]:
total_duration = 0
total_length = 0
model = torch.compile(model)
- def caption_image(image, model=model, processor=processor, prompt=prompt):
- return caption(model=model, processor=processor, prompt=prompt, image=image)
+ def caption_image(image):
+ return caption(
+ model=model,
+ processor=processor,
+ prompt=prompt,
+ image=image,
+ max_length=max_length,
+ batch_size=batch_size,
+ )
for image in images:
- seconds, text = duration(partial(caption_image, image=image))
+ seconds, captions = duration(partial(caption_image, image=image))
total_duration += seconds
- total_length += len(text)
+ total_length += len(captions.strip()) if isinstance(captions, str) else sum(len(t.strip()) for t in captions)
del model
del processor
print(f"Throughput: {total_length/total_duration:.2f} tokens/s")
-def main(filter_out: str = None, batch_size: int = 10):
+def main(batch_size: int = 10, max_length: int = 256):
image_urls = [
"https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
@@ -123,6 +140,8 @@ def main(filter_out: str = None, batch_size: int = 10):
),
prompt="Describe the picture in great detail",
images=images,
+ batch_size=batch_size,
+ max_length=max_length,
)
print("UForm-Gen")
@@ -138,6 +157,8 @@ def main(filter_out: str = None, batch_size: int = 10):
),
prompt="[cap] Summarize the visual content of the image.",
images=images,
+ batch_size=batch_size,
+ max_length=max_length,
)
print("LLaVA")
@@ -152,6 +173,8 @@ def main(filter_out: str = None, batch_size: int = 10):
),
prompt="USER: \nWhat are these?\nASSISTANT:",
images=images,
+ batch_size=batch_size,
+ max_length=max_length,
)
print("InstructBLIP")
@@ -166,24 +189,26 @@ def main(filter_out: str = None, batch_size: int = 10):
),
prompt="Summarize the visual content of the image.",
images=images,
+ batch_size=batch_size,
+ max_length=max_length,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument(
- "--filter-out",
- type=str,
- default=None,
- help="Filter out models, backends, or devices with a Regular Expression.",
- )
parser.add_argument(
"--batch-size",
type=int,
default=10,
help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
)
+ parser.add_argument(
+ "--max-length",
+ type=int,
+ default=256,
+ help="Maximum length of the generated text in tokens.",
+ )
args = parser.parse_args()
- main(filter_out=args.filter_out, batch_size=args.batch_size)
+ main(batch_size=args.batch_size, max_length=args.max_length)
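The reworked `caption` batches the prompt and image and tolerates processors that disagree on positional argument order. A minimal sketch of that fallback in isolation, where `processor` stands in for any Hugging Face-style processor and the helper name is hypothetical:

```python
from typing import Any
from PIL import Image


def build_inputs(processor: Any, prompt: str, image: Image.Image, batch_size: int):
    """Mirror the batching and argument-order fallback used by `caption` above."""
    prompts = [prompt] * batch_size
    images = [image] * batch_size
    try:
        # Some processors expect (text, images)...
        return processor(prompts, images, return_tensors="pt")
    except ValueError:
        # ...others expect (images, text).
        return processor(images, prompts, return_tensors="pt")
```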