From 2246f13ab3fabf2acc048892e9d95305b05df0dc Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 16 Apr 2024 02:36:57 +0000 Subject: [PATCH 01/40] Improve: Fetch modalities separately --- .gitignore | 3 +- python/uform/__init__.py | 78 ++++++++++++++++++++----- python/uform/onnx_models.py | 13 +++-- python/uform/preprocessing.py | 105 ---------------------------------- python/uform/torch_models.py | 5 +- 5 files changed, 78 insertions(+), 126 deletions(-) delete mode 100644 python/uform/preprocessing.py diff --git a/.gitignore b/.gitignore index af7d4af..fbc703a 100755 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ package-lock.json *.onnx __pycache__ .build -.swiftpm \ No newline at end of file +.swiftpm +node_modules \ No newline at end of file diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 1ecb242..f1bca3a 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,30 +1,80 @@ from json import load -from os.path import join +from os.path import join, exists from typing import Mapping, Optional, Tuple +from enum import Enum from huggingface_hub import snapshot_download -def get_checkpoint(model_name: str, token: str) -> Tuple[str, Mapping, str]: - import torch - - model_path = snapshot_download(repo_id=model_name, token=token) - config_path = join(model_path, "torch_config.json") +class Modality(Enum): + TEXT = "text" + IMAGE = "image" - state = torch.load(join(model_path, "torch_weight.pt")) - return config_path, state, join(model_path, "tokenizer.json") +def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str]) -> Tuple[str, Mapping, str]: + import torch -def get_model(model_name: str, token: Optional[str] = None): - from uform.torch_models import VLM + # It is not recommended to use `.pth` extension when checkpointing models + # because it collides with Python path (`.pth`) configuration files. + merged_model_names = ["torch_weight.pt", "weights.pt", "model.pt"] + separate_modality_names = [str(x) + ".pt" for x in modalities] + config_names = ["config.json", "torch_config.json"] + tokenizer_names = ["tokenizer.json"] + + # The download stats depend on the number of times the `config.json` is pulled + # https://huggingface.co/docs/hub/models-download-stats + model_path = snapshot_download( + repo_id=model_name, + token=token, + allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names, + ) + + # Find the first name in `config_names` that is present + config_path = None + for config_name in config_names: + if exists(join(model_path, config_name)): + config_path = join(model_path, config_name) + break + + # Same for the tokenizer + tokenizer_path = None + for tokenizer_name in tokenizer_names: + if exists(join(model_path, tokenizer_name)): + tokenizer_path = join(model_path, tokenizer_name) + break + + # Ideally, we want to separately fetch all the models. + # If those aren't available, aggregate separate modalities and merge them. 
+ state = None + for file_name in merged_model_names: + if exists(join(model_path, file_name)): + state = torch.load(join(model_path, file_name)) + break + + if state is None: + state = {} + for file_name in separate_modality_names: + if exists(join(model_path, file_name)): + modality_name, _, _ = file_name.partition(".") + property_name = modality_name + "_encoder" + state[property_name] = torch.load(join(model_path, file_name)) + + return config_path, state, tokenizer_path + + +def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None): + from uform.torch_models import TextVisualEncoder from uform.torch_preprocessor import TorchProcessor - config_path, state, tokenizer_path = get_checkpoint(model_name, token) + if modalities is None: + modalities = (Modality.TEXT, Modality.IMAGE) + + config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities) with open(config_path) as f: config = load(f) - model = VLM(config, tokenizer_path) + model = TextVisualEncoder(config, tokenizer_path) model.image_encoder.load_state_dict(state["image_encoder"]) model.text_encoder.load_state_dict(state["text_encoder"]) processor = TorchProcessor(config, tokenizer_path) @@ -33,7 +83,7 @@ def get_model(model_name: str, token: Optional[str] = None): def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None): - from uform.onnx_models import VLM_ONNX + from uform.onnx_models import TextVisualEncoder from uform.numpy_preprocessor import NumPyProcessor assert device in ( @@ -53,7 +103,7 @@ def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str with open(join(model_path, "config.json")) as f: config = load(f) - model = VLM_ONNX(model_path, config, device, dtype) + model = TextVisualEncoder(model_path, config, device, dtype) processor = NumPyProcessor(config, join(model_path, "tokenizer.json")) return model, processor diff --git a/python/uform/onnx_models.py b/python/uform/onnx_models.py index 8e2a87a..68255de 100644 --- a/python/uform/onnx_models.py +++ b/python/uform/onnx_models.py @@ -23,7 +23,7 @@ def available_providers(device: str) -> Tuple[str, ...]: return cpu_providers -class VisualEncoderONNX: +class VisualEncoder: def __init__(self, model_path: str, device: str): """ :param model_path: Path to onnx model @@ -43,7 +43,7 @@ def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: return self.session.run(None, {"images": images}) -class TextEncoderONNX: +class TextEncoder: def __init__(self, text_encoder_path: str, reranker_path: str, device: str): """ :param text_encoder_path: Path to onnx of text encoder @@ -82,7 +82,7 @@ def forward_multimodal( ) -class VLM_ONNX: +class TextVisualEncoder: def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str): assert device in ( "cpu", @@ -103,13 +103,13 @@ def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str): self._text_encoder_dim = config["text_encoder"]["dim"] self._image_encoder_dim = config["image_encoder"]["dim"] - self.text_encoder = TextEncoderONNX( + self.text_encoder = TextEncoder( join(checkpoint_path, f"text_encoder.onnx"), join(checkpoint_path, f"reranker.onnx"), device, ) - self.image_encoder = VisualEncoderONNX(join(checkpoint_path, f"image_encoder.onnx"), device) + self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device) def encode_image( self, @@ -229,3 +229,6 @@ def embedding_dim(self) -> int: def multimodal_embedding_dim(self) -> int: 
"""Dimensionality of multimodal joint embedding.""" return self._text_encoder_dim + + +VLM_ONNX = TextVisualEncoder # legacy diff --git a/python/uform/preprocessing.py b/python/uform/preprocessing.py deleted file mode 100644 index d3d833e..0000000 --- a/python/uform/preprocessing.py +++ /dev/null @@ -1,105 +0,0 @@ -from os import PathLike -from typing import Dict, List, Union - -import torch -from PIL import Image -from tokenizers import Tokenizer -from torch import Tensor -from torchvision.transforms import (CenterCrop, Compose, InterpolationMode, - Normalize, Resize, ToTensor) - - -# lambda is not pickable -def convert_to_rgb(image): - return image.convert("RGB") - - -class Processor: - def __init__(self, config: Dict, tokenizer_path: PathLike, tensor_type: str = "pt"): - """ - :param config: model config - :param tokenizer_path: path to tokenizer file - :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) - """ - - assert tensor_type in ("pt", "np"), "`tensor_type` must be either `pt` or `np`" - - self._image_size = config["image_encoder"]["image_size"] - self._max_seq_len = config["text_encoder"]["max_position_embeddings"] - self._tokenizer = Tokenizer.from_file(tokenizer_path) - self._tokenizer.no_padding() - self._pad_token_idx = config["text_encoder"]["padding_idx"] - - self.tensor_type = tensor_type - - self._image_transform = Compose( - [ - Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), - convert_to_rgb, - CenterCrop(self._image_size), - ToTensor(), - Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ], - ) - - def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: - """Transforms one or more strings into dictionary with tokenized strings and attention masks. - - :param texts: text of list of texts to tokenizer - """ - if isinstance(texts, str): - texts = [texts] - - input_ids = torch.full( - (len(texts), self._max_seq_len), - fill_value=self._pad_token_idx, - dtype=torch.int64, - ) - - attention_mask = torch.zeros( - len(texts), - self._max_seq_len, - dtype=torch.int32, - ) - encoded = self._tokenizer.encode_batch(texts) - - for i, seq in enumerate(encoded): - seq_len = min(len(seq), self._max_seq_len) - input_ids[i, :seq_len] = torch.LongTensor( - seq.ids[: self._max_seq_len], - ) - attention_mask[i, :seq_len] = 1 - - if self.tensor_type == "np": - return { - "input_ids": input_ids.numpy(), - "attention_mask": attention_mask.numpy(), - } - - return {"input_ids": input_ids, "attention_mask": attention_mask} - - def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor: - """Transforms one or more Pillow images into Torch Tensors. 
- - :param images: image or list of images to preprocess - """ - - if isinstance(images, list): - batch_images = torch.empty( - (len(images), 3, self._image_size, self._image_size), - dtype=torch.float32, - ) - - for i, image in enumerate(images): - batch_images[i] = self._image_transform(image) - - else: - batch_images = self._image_transform(images).unsqueeze(0) - - if self.tensor_type == "np": - return batch_images.numpy() - - return batch_images diff --git a/python/uform/torch_models.py b/python/uform/torch_models.py index ab86622..c4f0bcb 100644 --- a/python/uform/torch_models.py +++ b/python/uform/torch_models.py @@ -353,7 +353,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return embeddings -class VLM(nn.Module): +class TextVisualEncoder(nn.Module): """ Vision-Language Model for Multimodal embeddings. """ @@ -503,3 +503,6 @@ def embedding_dim(self) -> int: def multimodal_embedding_dim(self) -> int: """Dimensionality of multimodal joint embedding.""" return self.text_encoder.dim + + +VLM = TextVisualEncoder # legacy From b310e908e2e9fbd9be58fb9e36527ee767e16600 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 16 Apr 2024 02:55:30 +0000 Subject: [PATCH 02/40] Fix: Compatibility with older models --- python/scripts/test_embeddings.py | 19 +++++++++++++++---- python/uform/__init__.py | 12 ++++++------ python/uform/torch_models.py | 3 ++- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_embeddings.py index d71bf0b..9cdd4c5 100644 --- a/python/scripts/test_embeddings.py +++ b/python/scripts/test_embeddings.py @@ -1,4 +1,5 @@ from typing import Tuple +import os import pytest from PIL import Image @@ -21,6 +22,7 @@ onnx_available = False torch_models = [ + "unum-cloud/uform-vl2-english-small", "unum-cloud/uform-vl-english", "unum-cloud/uform-vl-multilingual-v2", ] @@ -34,11 +36,20 @@ ("unum-cloud/uform-vl-english-large", "gpu", "fp16"), ] +# Let's check if the HuggingFace Hub API token is set in the environment variable. +# If it's not there, check if the `.hf_token` file is present in the current working directory. 
+token = os.getenv("HUGGINGFACE_HUB_TOKEN", None) +if token is None: + token_path = "./.hf_token" + if os.path.exists(token_path): + with open(token_path, "r") as file: + token = file.read().strip() + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) def test_torch_one_embedding(model_name: str): - model, processor = uform.get_model(model_name) + model, processor = uform.get_model(model_name, token=token) text = "a small red panda in a zoo" image_path = "assets/unum.png" @@ -67,7 +78,7 @@ def test_torch_one_embedding(model_name: str): @pytest.mark.parametrize("model_name", torch_models) @pytest.mark.parametrize("batch_size", [1, 2]) def test_torch_many_embeddings(model_name: str, batch_size: int): - model, processor = uform.get_model(model_name) + model, processor = uform.get_model(model_name, token=token) texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size @@ -90,7 +101,7 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): try: - model, processor = uform.get_model_onnx(*model_specs) + model, processor = uform.get_model_onnx(*model_specs, token=token) text = "a small red panda in a zoo" image_path = "assets/unum.png" @@ -126,7 +137,7 @@ def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int try: - model, processor = uform.get_model_onnx(*model_specs) + model, processor = uform.get_model_onnx(*model_specs, token=token) texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size diff --git a/python/uform/__init__.py b/python/uform/__init__.py index f1bca3a..1d2d41f 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -11,14 +11,14 @@ class Modality(Enum): IMAGE = "image" -def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str]) -> Tuple[str, Mapping, str]: +def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]: import torch # It is not recommended to use `.pth` extension when checkpointing models # because it collides with Python path (`.pth`) configuration files. 
- merged_model_names = ["torch_weight.pt", "weights.pt", "model.pt"] - separate_modality_names = [str(x) + ".pt" for x in modalities] - config_names = ["config.json", "torch_config.json"] + merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"] + separate_modality_names = [(x.value if isinstance(x, Modality) else x) + ".pt" for x in modalities] + config_names = ["torch_config.json", "config.json"] tokenizer_names = ["tokenizer.json"] # The download stats depend on the number of times the `config.json` is pulled @@ -75,8 +75,8 @@ def get_model(model_name: str, token: Optional[str] = None, modalities: Optional config = load(f) model = TextVisualEncoder(config, tokenizer_path) - model.image_encoder.load_state_dict(state["image_encoder"]) - model.text_encoder.load_state_dict(state["text_encoder"]) + model.image_encoder.load_state_dict(state.get("image_encoder", None)) + model.text_encoder.load_state_dict(state.get("text_encoder", None)) processor = TorchProcessor(config, tokenizer_path) return model.eval(), processor diff --git a/python/uform/torch_models.py b/python/uform/torch_models.py index c4f0bcb..4339765 100644 --- a/python/uform/torch_models.py +++ b/python/uform/torch_models.py @@ -364,8 +364,9 @@ def __init__(self, config: Dict, tokenizer_path: PathLike): """ super().__init__() - self._embedding_dim = config["text_encoder"]["embedding_dim"] + config["text_encoder"].pop("tokenizer_class", None) + self._embedding_dim = config["text_encoder"]["embedding_dim"] self.text_encoder = TextEncoder(**config["text_encoder"]) self.image_encoder = VisualEncoder(**config["image_encoder"]) From a2f77d280df72ba39b5530b40dd1eee09a7538e7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 16 Apr 2024 05:03:06 +0000 Subject: [PATCH 03/40] Make: Rename files --- python/uform/__init__.py | 8 ++-- ...py_preprocessor.py => numpy_processors.py} | 0 .../{onnx_models.py => onnx_encoders.py} | 0 .../uform/{gen_model.py => torch_decoders.py} | 37 ++++++++----------- .../{torch_models.py => torch_encoders.py} | 0 ...ch_preprocessor.py => torch_processors.py} | 0 6 files changed, 19 insertions(+), 26 deletions(-) rename python/uform/{numpy_preprocessor.py => numpy_processors.py} (100%) rename python/uform/{onnx_models.py => onnx_encoders.py} (100%) rename python/uform/{gen_model.py => torch_decoders.py} (94%) rename python/uform/{torch_models.py => torch_encoders.py} (100%) rename python/uform/{torch_preprocessor.py => torch_processors.py} (100%) diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 1d2d41f..cdb1250 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -63,8 +63,8 @@ def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None): - from uform.torch_models import TextVisualEncoder - from uform.torch_preprocessor import TorchProcessor + from python.uform.torch_encoders import TextVisualEncoder + from python.uform.torch_processors import TorchProcessor if modalities is None: modalities = (Modality.TEXT, Modality.IMAGE) @@ -83,8 +83,8 @@ def get_model(model_name: str, token: Optional[str] = None, modalities: Optional def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None): - from uform.onnx_models import TextVisualEncoder - from uform.numpy_preprocessor import NumPyProcessor + from python.uform.onnx_encoders import TextVisualEncoder + from 
python.uform.numpy_processors import NumPyProcessor assert device in ( "cpu", diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_processors.py similarity index 100% rename from python/uform/numpy_preprocessor.py rename to python/uform/numpy_processors.py diff --git a/python/uform/onnx_models.py b/python/uform/onnx_encoders.py similarity index 100% rename from python/uform/onnx_models.py rename to python/uform/onnx_encoders.py diff --git a/python/uform/gen_model.py b/python/uform/torch_decoders.py similarity index 94% rename from python/uform/gen_model.py rename to python/uform/torch_decoders.py index 35faae1..79b058d 100644 --- a/python/uform/gen_model.py +++ b/python/uform/torch_decoders.py @@ -3,19 +3,24 @@ import torch import torch.nn.functional as F from torch import nn -from torchvision.transforms import (CenterCrop, Compose, InterpolationMode, - Normalize, RandomResizedCrop, Resize, - ToTensor) +from torchvision.transforms import ( + CenterCrop, + Compose, + InterpolationMode, + Normalize, + RandomResizedCrop, + Resize, + ToTensor, +) from transformers import AutoConfig, AutoTokenizer from transformers.configuration_utils import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.modeling_utils import PreTrainedModel -from transformers.models.auto.modeling_auto import (AutoModel, - AutoModelForCausalLM) +from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM from transformers.processing_utils import ProcessorMixin from transformers.tokenization_utils_base import BatchEncoding -from uform.models import VisualEncoder +from uform.torch_encoders import VisualEncoder IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -213,21 +218,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[dict, Tuple, CausalLMOutputWithPast]: - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError( @@ -248,11 +245,7 @@ def forward( ) if position_ids is None: - seq_length = ( - inputs_embeds.shape[1] - if inputs_embeds is not None - else input_ids.shape[1] - ) + seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1] past_key_values_length = 0 if past_key_values is not None: diff --git a/python/uform/torch_models.py b/python/uform/torch_encoders.py similarity index 100% rename from python/uform/torch_models.py rename to python/uform/torch_encoders.py diff --git a/python/uform/torch_preprocessor.py b/python/uform/torch_processors.py similarity index 100% rename from python/uform/torch_preprocessor.py rename to python/uform/torch_processors.py From acbb77ad87a32f8d2fdc8154ea149bbbccdd2a6f Mon Sep 17 00:00:00 2001 From: Ash Vardanian 
<1983160+ashvardanian@users.noreply.github.com> Date: Tue, 16 Apr 2024 22:23:24 +0000 Subject: [PATCH 04/40] Add: Placeholder for JavaScript SDK --- .gitignore | 9 +- README.md | 8 +- javascript/README.md | 10 + package.json | 11 + python/scripts/bench.py | 2 +- python/scripts/export.ipynb | 666 ------------------ python/scripts/export_encoders.ipynb | 436 ++++++++++++ .../{test_generative.py => test_decoders.py} | 0 .../{test_embeddings.py => test_encoders.py} | 2 +- python/uform/__init__.py | 4 +- python/uform/chat.py | 2 +- python/uform/gen_model.py | 1 + swift/EmbeddingsTests.swift | 6 +- swift/README.md | 44 ++ 14 files changed, 522 insertions(+), 679 deletions(-) create mode 100644 javascript/README.md create mode 100644 package.json delete mode 100644 python/scripts/export.ipynb create mode 100644 python/scripts/export_encoders.ipynb rename python/scripts/{test_generative.py => test_decoders.py} (100%) rename python/scripts/{test_embeddings.py => test_encoders.py} (99%) create mode 100644 python/uform/gen_model.py create mode 100644 swift/README.md diff --git a/.gitignore b/.gitignore index fbc703a..4db8e17 100755 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,13 @@ test build/ package-lock.json *.egg-info -*.onnx __pycache__ .build .swiftpm -node_modules \ No newline at end of file +.hf_token +node_modules + +# Tensors & ML Model +*.onnx +*.pt +*.safetensors diff --git a/README.md b/README.md index 031c484..32957e7 100755 --- a/README.md +++ b/README.md @@ -20,9 +20,11 @@ For Content Understanding and Generation

Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips
+Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX
+ONNX • CoreML • PyTorch
+Python • JavaScript • Swift

--- @@ -279,7 +281,7 @@ The generative model can be used to caption images, summarize their content, or The exact behavior is controlled by prompts. ```python -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen') processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen') diff --git a/javascript/README.md b/javascript/README.md new file mode 100644 index 0000000..5626d39 --- /dev/null +++ b/javascript/README.md @@ -0,0 +1,10 @@ +# UForm for JavaScript + + + +```bash +pnpm add uform +npm add uform +yarn add uform +``` + diff --git a/package.json b/package.json new file mode 100644 index 0000000..7331231 --- /dev/null +++ b/package.json @@ -0,0 +1,11 @@ +{ + "name": "uform", + "private": true, + "version": "2.0.2", + "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", + "dependencies": { + "@huggingface/hub": "^0.14.8", + "@xenova/transformers": "^2.17.0", + "onnxruntime-web": "^1.17.3" + } +} diff --git a/python/scripts/bench.py b/python/scripts/bench.py index 49c7004..8bcaf37 100644 --- a/python/scripts/bench.py +++ b/python/scripts/bench.py @@ -13,7 +13,7 @@ ) from uform import get_model -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor dtype = torch.bfloat16 low_cpu_mem_usage = False diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb deleted file mode 100644 index ce8cf10..0000000 --- a/python/scripts/export.ipynb +++ /dev/null @@ -1,666 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scripts for Exporting PyTorch Models to ONNX and CoreML" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install --upgrade \"uform[torch]\" coremltools" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n", - " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n", - " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n", - " warn(f\"Failed to load image Python extension: {e}\")\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fadffc0299c04e249fd4f7a5b40ba0af", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 5 files: 0%| | 0/5 [00:00 MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n", - "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n", - "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n", - "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n" - ] - } - ], - "source": [ - "coreml_model = ct.convert(\n", - " traced_script_module, source=\"pytorch\",\n", - " inputs=[image_input], outputs=[image_features, image_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", - 
"\n", - "coreml_model.author = 'Unum Cloud'\n", - "coreml_model.license = 'Apache 2.0'\n", - "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "TextEncoder(\n", - " original_name=TextEncoder\n", - " (word_embeddings): Embedding(original_name=Embedding)\n", - " (position_embeddings): Embedding(original_name=Embedding)\n", - " (layer_norm): LayerNorm(original_name=LayerNorm)\n", - " (dropout): Dropout(original_name=Dropout)\n", - " (blocks): ModuleList(\n", - " original_name=ModuleList\n", - " (0): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (1): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (2): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", - " (crossattn): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (3): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", - " (crossattn): Attention(\n", - " original_name=Attention\n", - " (query): 
Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " )\n", - " (embedding_projection): Linear(original_name=Linear)\n", - " (matching_head): Linear(original_name=Linear)\n", - " (context_projection): Linear(original_name=Linear)\n", - ")" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "module = model.text_encoder\n", - "module.eval()\n", - "module.return_features = True\n", - "\n", - "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", - "traced_script_module" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Tuple detected at graph output. This will be flattened in the converted model.\n", - "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00 MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n", - "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n", - "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n", - "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n" - ] - } - ], - "source": [ - "coreml_model = ct.convert(\n", - " traced_script_module, source=\"pytorch\",\n", - " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", - "\n", - "coreml_model.author = 'Unum Cloud'\n", - "coreml_model.license = 'Apache 2.0'\n", - "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb new file mode 100644 index 0000000..369c938 --- /dev/null +++ b/python/scripts/export_encoders.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scripts for Exporting PyTorch Models to ONNX and CoreML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade \"uform[torch]\" coremltools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uform\n", + "from PIL import Image\n", + "\n", + "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n", + "text = 'a small red panda in a zoo'\n", + "image = 
Image.open('../../assets/unum.png')\n", + "\n", + "image_data = processor.preprocess_image(image)\n", + "text_data = processor.preprocess_text(text)\n", + "\n", + "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n", + "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.text_encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.image_encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n", + "for name, module in model.image_encoder.named_children():\n", + " print(f\"First layer of image_encoder: {name}\")\n", + " break # We break after the first layer\n", + "\n", + "for name, module in model.text_encoder.named_children():\n", + " print(f\"First layer of text_encoder: {name}\")\n", + " break # We break after the first layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CoreML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import coremltools as ct\n", + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n", + "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n", + "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n", + "text_features = ct.TensorType(name=\"features\")\n", + "text_embeddings = ct.TensorType(name=\"embeddings\")\n", + "image_features = ct.TensorType(name=\"features\")\n", + "image_embeddings = ct.TensorType(name=\"embeddings\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coreml_model = ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[image_input], outputs=[image_features, image_embeddings],\n", + " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coreml_model = 
ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", + " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch\n", + "\n", + "Let's ensure that the input layers and the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from safetensors import safe_open\n", + "from safetensors.torch import save_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.image_encoder.eval()\n", + "model.image_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.image_encoder.state_dict(), 'image.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(model.image_encoder.state_dict(), \"image.safetensors\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.text_encoder.eval()\n", + "model.text_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.text_encoder.state_dict(), 'text.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(model.text_encoder.state_dict(), \"text.safetensors\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n", + "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install onnx onnxconverter-common" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.onnx import export as onnx_export" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can't immediately export 
to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "onnx_export(\n", + " module,\n", + " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", + " \"text.onnx\", \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input_ids', 'attention_mask'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input_ids' : {0 : 'batch_size'}, \n", + " 'attention_mask' : {0 : 'batch_size'}, \n", + " 'features' : {0 : 'batch_size'}, \n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "from onnxconverter_common import float16\n", + "\n", + "module = onnx.load(\"text.onnx\")\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, \"text.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now repeat the same for images." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "torch.onnx.export(\n", + " module,\n", + " image_data, \n", + " \"image.onnx\", \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input' : {0 : 'batch_size'},\n", + " 'features' : {0 : 'batch_size'},\n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "from onnxconverter_common import float16\n", + "\n", + "module = onnx.load(\"image.onnx\")\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, \"image.onnx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py similarity index 100% rename from python/scripts/test_generative.py rename to python/scripts/test_decoders.py diff --git 
a/python/scripts/test_embeddings.py b/python/scripts/test_encoders.py similarity index 99% rename from python/scripts/test_embeddings.py rename to python/scripts/test_encoders.py index 9cdd4c5..e7541c1 100644 --- a/python/scripts/test_embeddings.py +++ b/python/scripts/test_encoders.py @@ -22,7 +22,7 @@ onnx_available = False torch_models = [ - "unum-cloud/uform-vl2-english-small", + "unum-cloud/uform2-vl-english-small", "unum-cloud/uform-vl-english", "unum-cloud/uform-vl-multilingual-v2", ] diff --git a/python/uform/__init__.py b/python/uform/__init__.py index cdb1250..f5a15c2 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -7,8 +7,8 @@ class Modality(Enum): - TEXT = "text" - IMAGE = "image" + TEXT_ENCODER = "text_encoder" + IMAGE_ENCODER = "image_encoder" def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]: diff --git a/python/uform/chat.py b/python/uform/chat.py index 5ef44b7..c9f8dc3 100644 --- a/python/uform/chat.py +++ b/python/uform/chat.py @@ -5,7 +5,7 @@ from PIL import Image from transformers import TextStreamer -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor EOS_TOKEN = 32001 diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py new file mode 100644 index 0000000..6792120 --- /dev/null +++ b/python/uform/gen_model.py @@ -0,0 +1 @@ +from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path diff --git a/swift/EmbeddingsTests.swift b/swift/EmbeddingsTests.swift index 5efb87f..889cdb6 100644 --- a/swift/EmbeddingsTests.swift +++ b/swift/EmbeddingsTests.swift @@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase { let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: "unum-cloud/uform2-vl-english-small", hubApi: api ) @@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase { // A better option is to fetch directly from HuggingFace, similar to how users would do that: let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: "unum-cloud/uform2-vl-english-small", hubApi: api ) let imageModel = try await ImageEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: "unum-cloud/uform2-vl-english-small", hubApi: api ) diff --git a/swift/README.md b/swift/README.md new file mode 100644 index 0000000..1eebf29 --- /dev/null +++ b/swift/README.md @@ -0,0 +1,44 @@ +# UForm for Swift + +UForm offers first-party support for Swift. +To get started, add UForm to your project using Swift Package Manager. + +```bash +swift package init --type executable +swift package add uform +``` + +Then, import UForm in your Swift code: + +```swift +import UForm +``` + +## Embeddings + +### Text Embeddings + +```swift +let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie." 
+let textEmbedding: Embedding = try textModel.forward(with: text) +let textVector: [Float32] = textEmbedding.asFloats() +``` + +### Image Embeddings + +```swift +let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true" +guard let url = URL(string: imageURL), + let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil), + let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) { + throw Exception("Could not load image from URL: \(imageURL)") +} + +var imageEmbedding: Embedding = try imageModel.forward(with: cgImage) +var imageVector: [Float32] = embedding.asFloats() +``` + + +### Computing Distances \ No newline at end of file From 2351fe9f810d06bfb411566de518ced415c64634 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 16 Apr 2024 15:26:45 -0700 Subject: [PATCH 05/40] Docs: Improve export process --- python/scripts/export.ipynb | 629 ++++++++---------------------------- 1 file changed, 134 insertions(+), 495 deletions(-) diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb index ce8cf10..7afa4cc 100644 --- a/python/scripts/export.ipynb +++ b/python/scripts/export.ipynb @@ -18,52 +18,25 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n", - " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n", - " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n", - " warn(f\"Failed to load image Python extension: {e}\")\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fadffc0299c04e249fd4f7a5b40ba0af", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 5 files: 0%| | 0/5 [00:00 MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n", - "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n", - "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n", - "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n" - ] - } - ], + "outputs": [], "source": [ "coreml_model = ct.convert(\n", " traced_script_module, source=\"pytorch\",\n", " inputs=[image_input], outputs=[image_features, image_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", + " convert_to='mlprogram', compute_precision=precision)\n", "\n", "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" + "coreml_model.save(os.path.join(output_directory, model_name + \"-image.mlpackage\"))" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "TextEncoder(\n", - " 
original_name=TextEncoder\n", - " (word_embeddings): Embedding(original_name=Embedding)\n", - " (position_embeddings): Embedding(original_name=Embedding)\n", - " (layer_norm): LayerNorm(original_name=LayerNorm)\n", - " (dropout): Dropout(original_name=Dropout)\n", - " (blocks): ModuleList(\n", - " original_name=ModuleList\n", - " (0): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (1): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (2): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", - " (crossattn): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (3): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", - " (crossattn): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): 
Dropout(original_name=Dropout)\n", - " )\n", - " )\n", - " (embedding_projection): Linear(original_name=Linear)\n", - " (matching_head): Linear(original_name=Linear)\n", - " (context_projection): Linear(original_name=Linear)\n", - ")" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "module = model.text_encoder\n", "module.eval()\n", @@ -606,40 +265,20 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Tuple detected at graph output. This will be flattened in the converted model.\n", - "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00 MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n", - "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n", - "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n", - "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n" - ] - } - ], + "outputs": [], "source": [ "coreml_model = ct.convert(\n", " traced_script_module, source=\"pytorch\",\n", " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", + " convert_to='mlprogram', compute_precision=precision)\n", "\n", "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" + "coreml_model.save(os.path.join(output_directory, model_name + \"-text.mlpackage\"))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 94ebd6e1571d29b82fe0730e2122495f937ff07b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 17 Apr 2024 15:32:49 -0700 Subject: [PATCH 06/40] Break: Deprecate old ONNX structure --- .gitignore | 1 + .vscode/settings.json | 7 + Package.swift | 4 +- pyproject.toml | 3 +- python/scripts/export_encoders.ipynb | 201 +++++++++++++----- python/scripts/test_encoders.py | 115 ++++++++-- python/uform/__init__.py | 116 +++++----- python/uform/numpy_processors.py | 4 +- python/uform/onnx_encoders.py | 151 ++++++------- python/uform/torch_encoders.py | 41 +++- python/uform/torch_processors.py | 6 +- swift/{Embeddings.swift => Encoders.swift} | 11 + ...eddingsTests.swift => EncodersTests.swift} | 6 +- swift/README.md | 4 +- 14 files changed, 430 insertions(+), 240 deletions(-) rename swift/{Embeddings.swift => Encoders.swift} (98%) rename swift/{EmbeddingsTests.swift => EncodersTests.swift} (97%) diff --git a/.gitignore b/.gitignore index 4db8e17..f4fa33b 100755 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ node_modules *.onnx *.pt *.safetensors +*.mlpackage diff --git a/.vscode/settings.json b/.vscode/settings.json index a6cceb8..5052dea 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,8 +1,10 @@ { "cSpell.words": [ "arange", + "astype", "CFURL", "coreml", + "crossattn", "cumsum", "dtype", "embs", @@ -25,12 +27,17 @@ "pretrained", "probs", "pypi", + "pytest", + "randn", "rerank", "reranker", "reranking", + "sandbeach", "sess", "SIMD", "softmax", + "Tensorrt", + "torchvision", "transfromers", "uform", "unimodal", diff --git 
a/Package.swift b/Package.swift index 6ac8372..b3b9ffd 100644 --- a/Package.swift +++ b/Package.swift @@ -29,13 +29,13 @@ let package = Package( .product(name: "Transformers", package: "swift-transformers") ], path: "swift", - exclude: ["EmbeddingsTests.swift"] + exclude: ["EncodersTests.swift"] ), .testTarget( name: "UFormTests", dependencies: ["UForm"], path: "swift", - sources: ["EmbeddingsTests.swift"] + sources: ["EncodersTests.swift"] ), ] ) diff --git a/pyproject.toml b/pyproject.toml index 10f7a9b..1a84808 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,8 @@ classifiers = [ dependencies = [ "huggingface_hub>=0.16.4", "tokenizers>=0.13.3", - "pillow" + "pillow", + "simsimd", ] description = "Pocket-Sized Multimodal AI for Content Understanding and Generation" maintainers = [ diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb index df57858..c7a94e0 100644 --- a/python/scripts/export_encoders.ipynb +++ b/python/scripts/export_encoders.ipynb @@ -4,7 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Scripts for Exporting PyTorch Models to ONNX and CoreML" + "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n", + "\n", + "Depending on the backend, we prefer different qunatization schemes.\n", + "\n", + "- For ONNX we use `int8` quantization.\n", + "- For PyTorch we use `bfloat16` quantization.\n", + "- For CoreML we use `float32` representation." ] }, { @@ -181,12 +187,12 @@ "coreml_model = ct.convert(\n", " traced_script_module, source=\"pytorch\",\n", " inputs=[image_input], outputs=[image_features, image_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision)\n", + " convert_to='mlprogram', compute_precision=precision)\n", "\n", "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" + "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))" ] }, { @@ -217,7 +223,7 @@ "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" + "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))" ] }, { @@ -260,7 +266,7 @@ "metadata": {}, "outputs": [], "source": [ - "torch.save(model.image_encoder.state_dict(), 'image.pt')" + "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))" ] }, { @@ -269,7 +275,7 @@ "metadata": {}, "outputs": [], "source": [ - "save_file(model.image_encoder.state_dict(), \"image.safetensors\")" + "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))" ] }, { @@ -288,7 +294,7 @@ "metadata": {}, "outputs": [], "source": [ - "torch.save(model.text_encoder.state_dict(), 'text.pt')" + "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))" ] }, { @@ -297,7 +303,7 @@ "metadata": {}, "outputs": [], "source": [ - "save_file(model.text_encoder.state_dict(), \"text.safetensors\")" + "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))" ] }, { @@ -312,26 +318,6 @@ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" ] }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n", - "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n", - "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -354,7 +340,8 @@ "metadata": {}, "outputs": [], "source": [ - "from torch.onnx import export as onnx_export" + "from torch.onnx import export as onnx_export\n", + "import torch" ] }, { @@ -378,7 +365,7 @@ "onnx_export(\n", " module,\n", " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", - " \"text.onnx\", \n", + " os.path.join(output_directory, \"text_encoder.onnx\"), \n", " export_params=True,\n", " opset_version=15,\n", " do_constant_folding=True,\n", @@ -391,27 +378,6 @@ " 'embeddings' : {0 : 'batch_size'}})" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import onnx\n", - "from onnxconverter_common import float16\n", - "\n", - "module = onnx.load(\"text.onnx\")\n", - "module_fp16 = float16.convert_float_to_float16(module)\n", - "onnx.save(module_fp16, \"text.onnx\")" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -433,7 +399,7 @@ "torch.onnx.export(\n", " module,\n", " image_data, \n", - " \"image.onnx\", \n", + " os.path.join(output_directory, \"image_encoder.onnx\"), \n", " export_params=True,\n", " opset_version=15,\n", " do_constant_folding=True,\n", @@ -445,6 +411,15 @@ " 'embeddings' : {0 : 'batch_size'}})" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quantizing to `float16`\n", + "\n", + "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -452,11 +427,115 @@ "outputs": [], "source": [ "import onnx\n", - "from onnxconverter_common import float16\n", - "\n", - "module = onnx.load(\"image.onnx\")\n", + "from onnxconverter_common import float16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, module_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", "module_fp16 = float16.convert_float_to_float16(module)\n", - "onnx.save(module_fp16, \"image.onnx\")" + "onnx.save(module_fp16, module_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quantizing to `uint8`\n", + "\n", + "We can further quantize the model into `uint8` using ONNX quantization tools.\n", + "The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from onnxruntime.quantization import quantize_dynamic, QuantType" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check that the runtime can actually load those models." 
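Beyond merely constructing the sessions in the following cells, a quick smoke-test can confirm that the quantized graphs still execute. A rough sketch to run after the text-encoder session below is created; the input names match the export cells above, while the batch shape, sequence length, and `int64` dtype are assumptions that may need adjusting:

```python
import numpy as np

# Dummy batch for the text encoder; shapes are illustrative, and the int64 dtype
# follows the typical PyTorch/tokenizers export -- adjust if the graph expects int32.
dummy_ids = np.zeros((1, 77), dtype=np.int64)
dummy_mask = np.ones((1, 77), dtype=np.int64)

# `session` is the text-encoder session constructed in the cells below.
features, embeddings = session.run(None, {"input_ids": dummy_ids, "attention_mask": dummy_mask})
print(features.shape, embeddings.shape)
```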
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "session_options = ort.SessionOptions()\n", + "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "session = ort.InferenceSession(module_path, sess_options=session_options)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "session = ort.InferenceSession(module_path, sess_options=session_options)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Upload to Hugging Face" ] }, { @@ -465,8 +544,12 @@ "metadata": {}, "outputs": [], "source": [ - "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n", - "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx" + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt" ] } ], @@ -486,7 +569,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index e7541c1..a58544d 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -1,8 +1,12 @@ from typing import Tuple +import requests +from io import BytesIO import os import pytest +import numpy as np from PIL import Image + import uform # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed @@ -22,18 +26,13 @@ onnx_available = False torch_models = [ - "unum-cloud/uform2-vl-english-small", + "unum-cloud/uform3-image-text-english-small", "unum-cloud/uform-vl-english", "unum-cloud/uform-vl-multilingual-v2", ] -onnx_models_and_providers = [ - ("unum-cloud/uform-vl-english-small", "cpu", "fp32"), - ("unum-cloud/uform-vl-english-large", "cpu", "fp32"), - ("unum-cloud/uform-vl-english-small", "gpu", "fp32"), - ("unum-cloud/uform-vl-english-large", "gpu", "fp32"), - ("unum-cloud/uform-vl-english-small", "gpu", "fp16"), - ("unum-cloud/uform-vl-english-large", "gpu", "fp16"), +onnx_models = [ + "unum-cloud/uform3-image-text-english-small", ] # Let's check if the HuggingFace Hub API token is set in the environment variable. 
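For reference, the token lookup the tests rely on amounts to roughly the following sketch. The `HF_TOKEN` environment-variable name is an assumption; only the `.hf_token` file is visible elsewhere in the repository (in `.gitignore` and the Swift tests):

```python
import os


def read_hf_token(path: str = ".hf_token"):
    # Prefer the environment variable, then fall back to a local dot-file.
    token = os.environ.get("HF_TOKEN")  # the variable name is an assumption
    if token:
        return token.strip()
    if os.path.exists(path):
        with open(path) as file:
            return file.read().strip()
    return None
```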
@@ -46,6 +45,71 @@ token = file.read().strip() +def cosine_similarity(x, y) -> float: + if not isinstance(x, np.ndarray): + x = x.detach().numpy() + if not isinstance(y, np.ndarray): + y = y.detach().numpy() + + # Unlike NumPy, SimSIMD can properly deal with integer types + x = x.astype(np.float32).flatten() + y = y.astype(np.float32).flatten() + return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) + + +def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding): + """Test if the embeddings of text and image are semantically similar + using a small set of example text-image pairs.""" + + texts = [ + "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", + "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", + "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", + "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", + ] + + image_urls = [ + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", + ] + + text_embeddings = [] + image_embeddings = [] + + for text, image_url in zip(texts, image_urls): + # Download and open the image + response = requests.get(image_url) + image = Image.open(BytesIO(response.content)) + + # Get embeddings + text_embedding = text_to_embedding(text) + image_embedding = image_to_embedding(image) + + text_embeddings.append(text_embedding) + image_embeddings.append(image_embedding) + + # Evaluate cosine similarity + for i in range(len(texts)): + pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i]) + other_text_similarities = [ + cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(len(texts)) if j != i + ] + other_image_similarities = [ + cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(len(texts)) if j != i + ] + + assert pair_similarity > max( + other_text_similarities + ), "Text should be more similar to its corresponding image than to other images." + assert pair_similarity > max( + other_image_similarities + ), "Image should be more similar to its corresponding text than to other texts." 
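The comment in `cosine_similarity` above alludes to SimSIMD, which this patch also adds to the project dependencies, so the same measure could be computed with it directly. A sketch, assuming the `simsimd.cosine` kernel, which returns a cosine *distance* (1 − similarity):

```python
import numpy as np
import simsimd


def cosine_similarity_simd(x, y) -> float:
    if not isinstance(x, np.ndarray):
        x = x.detach().numpy()
    if not isinstance(y, np.ndarray):
        y = y.detach().numpy()
    # SimSIMD accepts integer and low-precision vectors directly;
    # `cosine` returns a distance, so flip it back into a similarity.
    return 1.0 - float(simsimd.cosine(x.flatten(), y.flatten()))
```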
+ + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) def test_torch_one_embedding(model_name: str): @@ -73,6 +137,12 @@ def test_torch_one_embedding(model_name: str): assert score.shape[0] == 1, "Matching score batch size is not 1" assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + lambda text: model.encode_text(processor.preprocess_text(text)), + lambda image: model.encode_image(processor.preprocess_image(image)), + ) + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) @@ -94,14 +164,15 @@ def test_torch_many_embeddings(model_name: str, batch_size: int): @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") -@pytest.mark.parametrize("model_specs", onnx_models_and_providers) -def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): +@pytest.mark.parametrize("model_name", onnx_models) +@pytest.mark.parametrize("device", ["CPUExecutionProvider"]) +def test_onnx_one_embedding(model_name: str, device: str): - from uform.onnx_models import ExecutionProviderError + from uform.onnx_encoders import ExecutionProviderError try: - model, processor = uform.get_model_onnx(*model_specs, token=token) + model, processor = uform.get_model_onnx(model_name, token=token, device=device) text = "a small red panda in a zoo" image_path = "assets/unum.png" @@ -115,29 +186,27 @@ def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" - score, joint_embedding = model.encode_multimodal( - image_features=image_features, - text_features=text_features, - attention_mask=text_data["attention_mask"], - return_scores=True, + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + lambda text: model.encode_text(processor.preprocess_text(text)), + lambda image: model.encode_image(processor.preprocess_image(image)), ) - assert score.shape[0] == 1, "Matching score batch size is not 1" - assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" except ExecutionProviderError as e: pytest.skip(f"Execution provider error: {e}") @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") -@pytest.mark.parametrize("model_specs", onnx_models_and_providers) +@pytest.mark.parametrize("model_name", onnx_models) @pytest.mark.parametrize("batch_size", [1, 2]) -def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int): +@pytest.mark.parametrize("device", ["CPUExecutionProvider"]) +def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): - from uform.onnx_models import ExecutionProviderError + from uform.onnx_encoders import ExecutionProviderError try: - model, processor = uform.get_model_onnx(*model_specs, token=token) + model, processor = uform.get_model_onnx(model_name, token=token, device=device) texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size diff --git a/python/uform/__init__.py b/python/uform/__init__.py index f5a15c2..44fce13 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,6 +1,6 @@ from json import load from os.path import join, exists -from typing import Mapping, Optional, Tuple +from 
typing import Dict, Optional, Tuple, Literal from enum import Enum from huggingface_hub import snapshot_download @@ -9,15 +9,38 @@ class Modality(Enum): TEXT_ENCODER = "text_encoder" IMAGE_ENCODER = "image_encoder" + VIDEO_ENCODER = "video_encoder" + TEXT_DECODER = "text_decoder" -def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]: - import torch +def normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: + if modalities is None: + return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER) + + return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities) + + +def get_checkpoint( + model_name: str, + modalities: Tuple[str, Modality], + token: Optional[str] = None, + format: Literal[".pt", ".onnx"] = ".pt", +) -> Tuple[str, Dict[Modality, str], Optional[str]]: + """Downloads a model checkpoint from the Hugging Face Hub. + + :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small` + :param token: The Hugging Face API token, if required + :param modalities: The modalities to download, like `("text_encoder", "image_encoder")` + :param format: The format of the model checkpoint, either `.pt` or `.onnx` + :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path + """ + + modalities = normalize_modalities(modalities) # It is not recommended to use `.pth` extension when checkpointing models # because it collides with Python path (`.pth`) configuration files. - merged_model_names = ["torch_weight.pt", "weight.pt", "model.pt"] - separate_modality_names = [(x.value if isinstance(x, Modality) else x) + ".pt" for x in modalities] + merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]] + separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities] config_names = ["torch_config.json", "config.json"] tokenizer_names = ["tokenizer.json"] @@ -45,65 +68,58 @@ def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, # Ideally, we want to separately fetch all the models. # If those aren't available, aggregate separate modalities and merge them. 
- state = None + modality_paths = None for file_name in merged_model_names: if exists(join(model_path, file_name)): - state = torch.load(join(model_path, file_name)) + modality_paths = join(model_path, file_name) break - if state is None: - state = {} - for file_name in separate_modality_names: - if exists(join(model_path, file_name)): - modality_name, _, _ = file_name.partition(".") - property_name = modality_name + "_encoder" - state[property_name] = torch.load(join(model_path, file_name)) + if modality_paths is None: + modality_paths = {} + for separate_modality_name in separate_modality_names: + if exists(join(model_path, separate_modality_name)): + modality_name, _, _ = separate_modality_name.partition(".") + modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name) - return config_path, state, tokenizer_path + return config_path, modality_paths, tokenizer_path -def get_model(model_name: str, token: Optional[str] = None, modalities: Optional[Tuple[str]] = None): - from python.uform.torch_encoders import TextVisualEncoder - from python.uform.torch_processors import TorchProcessor +def get_model( + model_name: str, + *, + token: Optional[str] = None, + modalities: Optional[Tuple[str]] = None, +): + from uform.torch_encoders import TextVisualEncoder + from uform.torch_processors import TorchProcessor - if modalities is None: - modalities = (Modality.TEXT, Modality.IMAGE) - - config_path, state, tokenizer_path = get_checkpoint(model_name, token, modalities) - - with open(config_path) as f: - config = load(f) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".pt") + modality_paths = ( + {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths + ) - model = TextVisualEncoder(config, tokenizer_path) - model.image_encoder.load_state_dict(state.get("image_encoder", None)) - model.text_encoder.load_state_dict(state.get("text_encoder", None)) - processor = TorchProcessor(config, tokenizer_path) + model = TextVisualEncoder(config_path, modality_paths) + processor = TorchProcessor(config_path, tokenizer_path) return model.eval(), processor -def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None): - from python.uform.onnx_encoders import TextVisualEncoder - from python.uform.numpy_processors import NumPyProcessor +def get_model_onnx( + model_name: str, + *, + device: Literal["cpu", "cuda"] = "cpu", + token: Optional[str] = None, + modalities: Optional[Tuple[str]] = None, +): + from uform.onnx_encoders import TextVisualEncoder + from uform.numpy_processors import NumPyProcessor - assert device in ( - "cpu", - "gpu", - ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`" - assert dtype in ( - "fp32", - "fp16", - ), f"Invalid `dtype`: {dtype}. 
Must be either `fp32` or `fp16` (only for gpu)" - assert ( - device == "cpu" and dtype == "fp32" - ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported" - - model_path = snapshot_download(repo_id=f"{model_name}-{device}-{dtype}", token=token) - - with open(join(model_path, "config.json")) as f: - config = load(f) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".onnx") + modality_paths = ( + {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths + ) - model = TextVisualEncoder(model_path, config, device, dtype) - processor = NumPyProcessor(config, join(model_path, "tokenizer.json")) + model = TextVisualEncoder(config_path, modality_paths, device=device) + processor = NumPyProcessor(config_path, tokenizer_path) return model, processor diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index a556db4..d300504 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -1,5 +1,6 @@ from os import PathLike from typing import Dict, List, Union +import json from PIL.Image import Image, BICUBIC from tokenizers import Tokenizer @@ -7,13 +8,14 @@ class NumPyProcessor: - def __init__(self, config: Dict, tokenizer_path: PathLike): + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to tokenizer file :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ + config = json.load(open(config_path, "r")) self._image_size = config["image_encoder"]["image_size"] self._max_seq_len = config["text_encoder"]["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index 68255de..8201693 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -1,5 +1,6 @@ -from os.path import join -from typing import Dict, Optional, Tuple, Union +from os import PathLike +from typing import Dict, Optional, Tuple, Union, Literal +import json import onnxruntime as ort from numpy import ndarray @@ -9,18 +10,52 @@ class ExecutionProviderError(Exception): """Exception raised when a requested execution provider is not available.""" -def available_providers(device: str) -> Tuple[str, ...]: +def available_providers(device: Optional[str]) -> Tuple[str, ...]: + """Returns a tuple of available execution providers based on the requested device. + https://onnxruntime.ai/docs/execution-providers/ + + :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name. + :return: Tuple of available execution providers. + :raises ExecutionProviderError: If the requested device is not available. 
+ """ + gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider") cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider") available = ort.get_available_providers() - if device == "gpu": + + # If no target device is specified, let's sort all the available ones with respect to our preference + if device is None: + preferences = gpu_providers + cpu_providers + filtered_preferences = tuple(provider for provider in preferences if provider in available) + if len(filtered_preferences): + return filtered_preferences + if len(available): + return available + raise ExecutionProviderError("No execution providers are available") + + # If a GPU is requested, but no GPU providers are available, raise an error + if device == "gpu" or device == "cuda": if all(provider not in available for provider in gpu_providers): raise ExecutionProviderError( f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}" ) return gpu_providers - return cpu_providers + # If a CPU is requested, but no CPU providers are available, raise an error + if device == "cpu": + if all(provider not in available for provider in cpu_providers): + raise ExecutionProviderError( + f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. Currently installed: {available}" + ) + return cpu_providers + + if device not in available: + available_providers = ", ".join(available) + raise ExecutionProviderError( + f"Execution provider {device} is not available. Currently installed: {available_providers}" + ) + + return (device,) class VisualEncoder: @@ -40,11 +75,11 @@ def __init__(self, model_path: str, device: str): ) def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: - return self.session.run(None, {"images": images}) + return self.session.run(None, {"input": images}) class TextEncoder: - def __init__(self, text_encoder_path: str, reranker_path: str, device: str): + def __init__(self, text_encoder_path: str, device: str): """ :param text_encoder_path: Path to onnx of text encoder :param reranker_path: Path to onnx of reranker @@ -60,56 +95,35 @@ def __init__(self, text_encoder_path: str, reranker_path: str, device: str): providers=available_providers(device), ) - self.reranker_session = ort.InferenceSession( - reranker_path, - sess_options=session_options, - providers=available_providers(device), - ) - def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]: return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask}) - def forward_multimodal( - self, text_features: ndarray, attention_mask: ndarray, image_features: ndarray - ) -> Tuple[ndarray, ndarray]: - return self.reranker_session.run( - None, - { - "text_features": text_features, - "attention_mask": attention_mask, - "image_features": image_features, - }, - ) - class TextVisualEncoder: - def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str): - assert device in ( - "cpu", - "gpu", - ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`" - assert dtype in ( - "fp32", - "fp16", - ), f"Invalid `dtype`: {dtype}. 
Must be either `fp32` or `fp16` (only for gpu)" - assert ( - device == "cpu" and dtype == "fp32" - ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported" - + def __init__( + self, + config_path: PathLike, + modality_paths: Union[Dict[str, PathLike], PathLike] = None, + *, + device: Literal["cpu", "cuda"] = "cpu", + ): + """Initializes the model with the configuration and pre-trained weights. + + :param config_path: Path to the JSON model configuration file + :param modality_paths: Dictionary with paths to different modalities, + or a single path to the model checkpoint + """ self.device = device - self.dtype = dtype + config = json.load(open(config_path, "r")) self._embedding_dim = config["text_encoder"]["embedding_dim"] self._text_encoder_dim = config["text_encoder"]["dim"] self._image_encoder_dim = config["image_encoder"]["dim"] - self.text_encoder = TextEncoder( - join(checkpoint_path, f"text_encoder.onnx"), - join(checkpoint_path, f"reranker.onnx"), - device, - ) - - self.image_encoder = VisualEncoder(join(checkpoint_path, f"image_encoder.onnx"), device) + text_encoder_path = modality_paths.get("text_encoder", None) + image_encoder_path = modality_paths.get("image_encoder", None) + self.text_encoder = TextEncoder(text_encoder_path, device) if text_encoder_path else None + self.image_encoder = VisualEncoder(image_encoder_path, device) if image_encoder_path else None def encode_image( self, @@ -147,51 +161,6 @@ def encode_text( return embeddings - def encode_multimodal( - self, - image: Optional[ndarray] = None, - text: Dict[str, ndarray] = None, - image_features: Optional[ndarray] = None, - text_features: Optional[ndarray] = None, - attention_mask: Optional[ndarray] = None, - return_scores: bool = False, - ) -> Union[ndarray, Tuple[ndarray, ndarray]]: - """Passes preprocessed texts (or precomputed texts features) and - preprocessed images (or precomputed images features) through multimodal encoded to produce matching scores and optionally multimodal joint embeddings. 
- - :param image: Preprocessed images - :param text: Preprocessed texts - :param image_features: Precomputed images features - :param text_features: Precomputed text features - :param attention_mask: Attention masks, not required if pass `text` instead of text_features - """ - - assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None" - assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None" - - if text_features is not None: - assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`" - - if image_features is None: - image_features = self.image_encoder(image) - - if text_features is None: - text_features = self.text_encoder( - text["input_ids"], - text["attention_mask"], - ) - - matching_scores, embeddings = self.text_encoder.forward_multimodal( - text_features, - attention_mask if attention_mask is not None else text["attention_mask"], - image_features, - ) - - if return_scores: - return matching_scores, embeddings - - return embeddings - def forward( self, images: ndarray, diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index 4339765..2a0a0c9 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from os import PathLike from typing import Dict, Optional, Tuple, Union +import json import torch import torch.nn as nn @@ -358,17 +359,45 @@ class TextVisualEncoder(nn.Module): Vision-Language Model for Multimodal embeddings. """ - def __init__(self, config: Dict, tokenizer_path: PathLike): - """ - :param config: Model config + def __init__( + self, + config_path: PathLike, + modality_paths: Union[Dict[str, PathLike], PathLike] = None, + ): + """Initializes the model with the configuration and pre-trained weights. + + :param config_path: Path to the JSON model configuration file + :param modality_paths: Dictionary with paths to different modalities, + or a single path to the model checkpoint """ super().__init__() - config["text_encoder"].pop("tokenizer_class", None) + config = json.load(open(config_path, "r")) self._embedding_dim = config["text_encoder"]["embedding_dim"] - self.text_encoder = TextEncoder(**config["text_encoder"]) - self.image_encoder = VisualEncoder(**config["image_encoder"]) + + # Both `text_encoder` and `image_encoder` are data-classes, so we must strip + # all the non-member attributes before initializing the classes. 
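The filtering that the following lines perform with `__dataclass_fields__` can equivalently be written with the public `dataclasses.fields` helper; a small sketch (the helper name is illustrative, not part of the codebase):

```python
from dataclasses import fields


def dataclass_kwargs(cls, section: dict) -> dict:
    # Drop config keys that the dataclass does not declare as fields.
    allowed = {field.name for field in fields(cls)}
    return {key: value for key, value in section.items() if key in allowed}


# e.g. TextEncoder(**dataclass_kwargs(TextEncoder, config["text_encoder"]))
```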
+ text_fields = TextEncoder.__dataclass_fields__ + image_fields = VisualEncoder.__dataclass_fields__ + text_encoder_attrs = {k: v for k, v in config["text_encoder"].items() if k in text_fields} + image_encoder_attrs = {k: v for k, v in config["image_encoder"].items() if k in image_fields} + self.text_encoder = TextEncoder(**text_encoder_attrs) + self.image_encoder = VisualEncoder(**image_encoder_attrs) + + # Load pre-trained weights + if modality_paths is not None: + if isinstance(modality_paths, Union[PathLike, str]): + state = torch.load(modality_paths) + self.text_encoder.load_state_dict(state["text_encoder"]) + self.image_encoder.load_state_dict(state["image_encoder"]) + else: + text_encoder_path = modality_paths.get("text_encoder", None) + image_encoder_path = modality_paths.get("image_encoder", None) + if text_encoder_path: + self.text_encoder.load_state_dict(torch.load(text_encoder_path)) + if image_encoder_path: + self.image_encoder.load_state_dict(torch.load(image_encoder_path)) def encode_image( self, diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py index 8bdc70b..b435efb 100644 --- a/python/uform/torch_processors.py +++ b/python/uform/torch_processors.py @@ -1,5 +1,6 @@ from os import PathLike from typing import Dict, List, Union +import json import torch from PIL.Image import Image @@ -15,19 +16,20 @@ ) -# lambda is not pickable +# lambda is not pickle-able def convert_to_rgb(image): return image.convert("RGB") class TorchProcessor: - def __init__(self, config: Dict, tokenizer_path: PathLike): + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to tokenizer file :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ + config = json.load(open(config_path, "r")) self._image_size = config["image_encoder"]["image_size"] self._max_seq_len = config["text_encoder"]["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) diff --git a/swift/Embeddings.swift b/swift/Encoders.swift similarity index 98% rename from swift/Embeddings.swift rename to swift/Encoders.swift index 6d973ac..bc78433 100644 --- a/swift/Embeddings.swift +++ b/swift/Encoders.swift @@ -11,6 +11,17 @@ import Foundation import Hub // `Config` import Tokenizers // `AutoTokenizer` + +enum EncoderError: Error { + case configLoadingError(String) + case modelLoadingError(String) + case unsupportedDataType + case invalidInput + case unsupportedShapeConstraint + case modelPredictionFailed(String) +} + + public enum Embedding { case i32s([Int32]) case f16s([Float16]) diff --git a/swift/EmbeddingsTests.swift b/swift/EncodersTests.swift similarity index 97% rename from swift/EmbeddingsTests.swift rename to swift/EncodersTests.swift index 889cdb6..caab363 100644 --- a/swift/EmbeddingsTests.swift +++ b/swift/EncodersTests.swift @@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase { let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) @@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase { // A better option is to fetch directly from HuggingFace, similar to how users would do that: let api = HubApi(hfToken: "xxx") let textModel = try await TextEncoder( - modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) let imageModel = try await ImageEncoder( - 
modelName: "unum-cloud/uform2-vl-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) diff --git a/swift/README.md b/swift/README.md index 1eebf29..66b531f 100644 --- a/swift/README.md +++ b/swift/README.md @@ -19,7 +19,7 @@ import UForm ### Text Embeddings ```swift -let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small") let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie." let textEmbedding: Embedding = try textModel.forward(with: text) let textVector: [Float32] = textEmbedding.asFloats() @@ -28,7 +28,7 @@ let textVector: [Float32] = textEmbedding.asFloats() ### Image Embeddings ```swift -let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small") +let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform3-image-text-english-small") let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true" guard let url = URL(string: imageURL), let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil), From 479ae61d53bf88c0f871765ef2011292986548a8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 17 Apr 2024 16:01:11 -0700 Subject: [PATCH 07/40] Improve: Support different models with Swift --- .vscode/settings.json | 3 ++- CONTRIBUTING.md | 7 +++++++ swift/Encoders.swift | 24 ++++++++++++---------- swift/EncodersTests.swift | 42 ++++++++++++++++++++++++++++++--------- 4 files changed, 56 insertions(+), 20 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 5052dea..3a060e1 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -42,7 +42,8 @@ "uform", "unimodal", "unsqueeze", - "Vardanian" + "Vardanian", + "whitespaces" ], "[python]": { "editor.defaultFormatter": "ms-python.black-formatter" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 181d9e2..37bc541 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,6 +20,13 @@ pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loa ## Swift +To build and test the Swift package, use the following command: + +```bash +swift build +swift test +``` + Swift formatting is enforced with `swift-format` default utility from Apple. To install and run it on all the files in the project, use the following command: diff --git a/swift/Encoders.swift b/swift/Encoders.swift index bc78433..44c6e71 100644 --- a/swift/Encoders.swift +++ b/swift/Encoders.swift @@ -11,7 +11,6 @@ import Foundation import Hub // `Config` import Tokenizers // `AutoTokenizer` - enum EncoderError: Error { case configLoadingError(String) case modelLoadingError(String) @@ -21,7 +20,6 @@ enum EncoderError: Error { case modelPredictionFailed(String) } - public enum Embedding { case i32s([Int32]) case f16s([Float16]) @@ -116,16 +114,22 @@ public class TextEncoder { let finalConfigPath = configPath ?? modelPath + "/config.json" let finalTokenizerPath = tokenizerPath ?? 
modelPath + "/tokenizer.json" self.model = try readModel(fromPath: modelPath) - self.processor = try TextProcessor(configPath: finalConfigPath, tokenizerPath: finalTokenizerPath, model: self.model) + self.processor = try TextProcessor( + configPath: finalConfigPath, + tokenizerPath: finalTokenizerPath, + model: self.model + ) } - public init(modelName: String, hubApi: HubApi = .shared) async throws { let repo = Hub.Repo(id: modelName) - let modelURL = try await hubApi.snapshot(from: repo, matching: ["text.mlpackage/*", "config.json", "tokenizer.json"]) + let modelURL = try await hubApi.snapshot( + from: repo, + matching: ["text_encoder.mlpackage/*", "config.json", "tokenizer.json"] + ) let configPath = modelURL.appendingPathComponent("config.json").path let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path - self.model = try readModel(fromURL: modelURL.appendingPathComponent("text.mlpackage", isDirectory: true)) + self.model = try readModel(fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true)) self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model) } @@ -158,12 +162,12 @@ public class ImageEncoder { public init(modelName: String, hubApi: HubApi = .shared) async throws { let repo = Hub.Repo(id: modelName) - let modelURL = try await hubApi.snapshot(from: repo, matching: ["image.mlpackage/*", "config.json"]) + let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"]) let configPath = modelURL.appendingPathComponent("config.json").path - self.model = try readModel(fromURL: modelURL.appendingPathComponent("image.mlpackage", isDirectory: true)) + self.model = try readModel(fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true)) self.processor = try ImageProcessor(configPath: configPath) } - + public func forward(with image: CGImage) throws -> Embedding { let inputFeatureProvider = try self.processor.preprocess(image) let prediction = try self.model.prediction(from: inputFeatureProvider) @@ -240,7 +244,7 @@ class ImageProcessor { if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] { configDict = imageEncoderConfig } - + let config = Config(configDict) self.imageSize = config.imageSize!.intValue! } diff --git a/swift/EncodersTests.swift b/swift/EncodersTests.swift index caab363..0096d62 100644 --- a/swift/EncodersTests.swift +++ b/swift/EncodersTests.swift @@ -1,11 +1,23 @@ import CoreGraphics +import Hub import ImageIO import UForm -import Hub import XCTest final class TokenizerTests: XCTestCase { + var hfToken: String? + + override func setUp() { + super.setUp() + // Attempt to load the Hugging Face token from the `.hf_token` file in the current directory + let fileURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath).appendingPathComponent(".hf_token") + if let token = try? 
String(contentsOf: fileURL, encoding: .utf8).trimmingCharacters(in: .whitespacesAndNewlines) + { + hfToken = token + } + } + func cosineSimilarity(between vectorA: [T], and vectorB: [T]) -> T { guard vectorA.count == vectorB.count else { fatalError("Vectors must be of the same length.") @@ -23,9 +35,9 @@ final class TokenizerTests: XCTestCase { return dotProduct / (magnitudeA * magnitudeB) } - func testTextEmbeddings() async throws { + func testTextEmbeddings(forModel modelName: String) async throws { - let api = HubApi(hfToken: "xxx") + let api = HubApi(hfToken: hfToken) let textModel = try await TextEncoder( modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api @@ -60,29 +72,35 @@ final class TokenizerTests: XCTestCase { ) } - func testImageEmbeddings() async throws { + func testTextEmbeddings() async throws { + for model in ["unum-cloud/uform3-image-text-english-small"] { + try await testTextEmbeddings(forModel: model) + } + } + + func testImageEmbeddings(forModel modelName: String) async throws { // One option is to use a local model repository. // // let root = "uform/" // let textModel = try TextEncoder( - // modelPath: root + "uform-vl-english-large-text.mlpackage", + // modelPath: root + "uform-vl-english-large-text_encoder.mlpackage", // configPath: root + "uform-vl-english-large-text.json", // tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json" // ) // let imageModel = try ImageEncoder( - // modelPath: root + "uform-vl-english-large-image.mlpackage", + // modelPath: root + "uform-vl-english-large-image_encoder.mlpackage", // configPath: root + "uform-vl-english-large-image.json" // ) // // A better option is to fetch directly from HuggingFace, similar to how users would do that: - let api = HubApi(hfToken: "xxx") + let api = HubApi(hfToken: hfToken) let textModel = try await TextEncoder( - modelName: "unum-cloud/uform3-image-text-english-small", + modelName: modelName, hubApi: api ) let imageModel = try await ImageEncoder( - modelName: "unum-cloud/uform3-image-text-english-small", + modelName: modelName, hubApi: api ) @@ -143,4 +161,10 @@ final class TokenizerTests: XCTestCase { } } + func testImageEmbeddings() async throws { + for model in ["unum-cloud/uform3-image-text-english-small"] { + try await testImageEmbeddings(forModel: model) + } + } + } From 45479bdbef457abad69753f3da2876be907898c6 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 18 Apr 2024 04:53:53 +0000 Subject: [PATCH 08/40] Add: JavaScript library placeholder --- .gitignore | 6 +- .vscode/launch.json | 10 + CONTRIBUTING.md | 9 + javascript/embeddings.mts | 55 ++++ tsconfig.json | 8 + yarn.lock | 594 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 681 insertions(+), 1 deletion(-) create mode 100644 javascript/embeddings.mts create mode 100644 tsconfig.json create mode 100644 yarn.lock diff --git a/.gitignore b/.gitignore index 4db8e17..af057d5 100755 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,13 @@ __pycache__ .build .swiftpm .hf_token -node_modules # Tensors & ML Model *.onnx *.pt *.safetensors + +# NodeJS +node_modules +node_build +yarn-error.log diff --git a/.vscode/launch.json b/.vscode/launch.json index 59eb78c..305841e 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,6 +10,16 @@ "request": "launch", "program": "${file}", "console": "integratedTerminal", + }, + { + "name": "NodeJS Debugger", + "type": "node", + "request": "launch", + "program": "${workspaceFolder}/javascript/embeddings.ts", + 
"preLaunchTask": "tsc: build - tsconfig.json", + "outFiles": [ + "${workspaceFolder}/node_build/**/*.js" + ] } ] } \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 181d9e2..cff4e0f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -30,3 +30,12 @@ swift-format . -i -r The style is controlled by the `.swift-format` JSON file in the root of the repository. As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings. + +## JavaScript + +Before submitting any changes, please make sure that the tests pass. + +```sh +npm install +npm run test +``` diff --git a/javascript/embeddings.mts b/javascript/embeddings.mts new file mode 100644 index 0000000..6a34344 --- /dev/null +++ b/javascript/embeddings.mts @@ -0,0 +1,55 @@ +import * as ort from 'onnxruntime-web'; +import { AutoTokenizer, PreTrainedTokenizer } from '@xenova/transformers'; + +type ModelConfig = { + modelPath: string; + tokenizerPath: string; +}; + +class TextEncoder { + private session: ort.InferenceSession; + private tokenizer: PreTrainedTokenizer; + + constructor(private config: ModelConfig) {} + + async init(): Promise { + this.tokenizer = await AutoTokenizer.from_pretrained(this.config.tokenizerPath); + this.session = await ort.InferenceSession.create(this.config.modelPath); + } + + async forward(text: string): Promise<{ features: Uint8Array, embeddings: Uint8Array }> { + // Tokenization + const { input_ids } = await this.tokenizer(text); + const tensorInputIds = new ort.Tensor('float32', Float32Array.from(input_ids), [1, input_ids.length]); + const tensorAttentionMask = new ort.Tensor('float32', new Float32Array(input_ids.length).fill(1), [1, input_ids.length]); + + // Model inference + const feeds = { input_ids: tensorInputIds, attention_mask: tensorAttentionMask }; + const results = await this.session.run(feeds); + + // Assume output tensors are in results['features'] and results['embeddings'] + const features = results['features'].data as Uint8Array! + const embeddings = results['embeddings'].data as Uint8Array! + + return { features, embeddings }; + } +} + +// Usage +async function main() { + const textEncoder = new TextEncoder({ + modelPath: './text_encoder.onnx', + tokenizerPath: 'Xenova/bert-base-uncased' + }); + + await textEncoder.init(); + const result = await textEncoder.forward('I love transformers!'); + console.log('Features:', result.features); + console.log('Embeddings:', result.embeddings); +} + +main(); + + + + diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..a77b46b --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,8 @@ +{ + "compilerOptions": { + "target": "ES5", + "module": "CommonJS", + "outDir": "node_build", + "sourceMap": true + } +} \ No newline at end of file diff --git a/yarn.lock b/yarn.lock new file mode 100644 index 0000000..5ab5bbe --- /dev/null +++ b/yarn.lock @@ -0,0 +1,594 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +"@huggingface/hub@^0.14.8": + version "0.14.8" + resolved "https://registry.npmjs.org/@huggingface/hub/-/hub-0.14.8.tgz" + integrity sha512-vdJRham99E5Uzsc4rO0gTz0ykafmx6V78pgPpJ7LGz5X+P2exe/izPFndqczAzy8jVWN55Jjtnuqg+Y0zrjc+Q== + dependencies: + hash-wasm "^4.9.0" + +"@huggingface/jinja@^0.2.2": + version "0.2.2" + resolved "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz" + integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA== + +"@protobufjs/aspromise@^1.1.1", "@protobufjs/aspromise@^1.1.2": + version "1.1.2" + resolved "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz" + integrity sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ== + +"@protobufjs/base64@^1.1.2": + version "1.1.2" + resolved "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz" + integrity sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg== + +"@protobufjs/codegen@^2.0.4": + version "2.0.4" + resolved "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz" + integrity sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg== + +"@protobufjs/eventemitter@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz" + integrity sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q== + +"@protobufjs/fetch@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz" + integrity sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ== + dependencies: + "@protobufjs/aspromise" "^1.1.1" + "@protobufjs/inquire" "^1.1.0" + +"@protobufjs/float@^1.0.2": + version "1.0.2" + resolved "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz" + integrity sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ== + +"@protobufjs/inquire@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz" + integrity sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q== + +"@protobufjs/path@^1.1.2": + version "1.1.2" + resolved "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz" + integrity sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA== + +"@protobufjs/pool@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz" + integrity sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw== + +"@protobufjs/utf8@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz" + integrity sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw== + +"@types/long@^4.0.1": + version "4.0.2" + resolved "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz" + integrity sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA== + +"@types/node@>=13.7.0": + version "20.12.7" + resolved "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz" + integrity sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg== + dependencies: + undici-types "~5.26.4" + +"@xenova/transformers@^2.17.0": + version "2.17.0" + 
resolved "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.0.tgz" + integrity sha512-usmDut7hwnrc4EqP59cboYqE6C8up63SqMy3E9RjG9nCsOhrsLndEU7DMu+bZ9R+HcAI8jRGabTIxH+B6agBVA== + dependencies: + "@huggingface/jinja" "^0.2.2" + onnxruntime-web "1.14.0" + sharp "^0.32.0" + optionalDependencies: + onnxruntime-node "1.14.0" + +b4a@^1.6.4: + version "1.6.6" + resolved "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz" + integrity sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg== + +bare-events@^2.0.0, bare-events@^2.2.0: + version "2.2.2" + resolved "https://registry.npmjs.org/bare-events/-/bare-events-2.2.2.tgz" + integrity sha512-h7z00dWdG0PYOQEvChhOSWvOfkIKsdZGkWr083FgN/HyoQuebSew/cgirYqh9SCuy/hRvxc5Vy6Fw8xAmYHLkQ== + +bare-fs@^2.1.1: + version "2.2.3" + resolved "https://registry.npmjs.org/bare-fs/-/bare-fs-2.2.3.tgz" + integrity sha512-amG72llr9pstfXOBOHve1WjiuKKAMnebcmMbPWDZ7BCevAoJLpugjuAPRsDINEyjT0a6tbaVx3DctkXIRbLuJw== + dependencies: + bare-events "^2.0.0" + bare-path "^2.0.0" + streamx "^2.13.0" + +bare-os@^2.1.0: + version "2.2.1" + resolved "https://registry.npmjs.org/bare-os/-/bare-os-2.2.1.tgz" + integrity sha512-OwPyHgBBMkhC29Hl3O4/YfxW9n7mdTr2+SsO29XBWKKJsbgj3mnorDB80r5TiCQgQstgE5ga1qNYrpes6NvX2w== + +bare-path@^2.0.0, bare-path@^2.1.0: + version "2.1.1" + resolved "https://registry.npmjs.org/bare-path/-/bare-path-2.1.1.tgz" + integrity sha512-OHM+iwRDRMDBsSW7kl3dO62JyHdBKO3B25FB9vNQBPcGHMo4+eA8Yj41Lfbk3pS/seDY+siNge0LdRTulAau/A== + dependencies: + bare-os "^2.1.0" + +base64-js@^1.3.1: + version "1.5.1" + resolved "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz" + integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA== + +bl@^4.0.3: + version "4.1.0" + resolved "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz" + integrity sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w== + dependencies: + buffer "^5.5.0" + inherits "^2.0.4" + readable-stream "^3.4.0" + +buffer@^5.5.0: + version "5.7.1" + resolved "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz" + integrity sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ== + dependencies: + base64-js "^1.3.1" + ieee754 "^1.1.13" + +chownr@^1.1.1: + version "1.1.4" + resolved "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz" + integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg== + +color-convert@^2.0.1: + version "2.0.1" + resolved "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz" + integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ== + dependencies: + color-name "~1.1.4" + +color-name@^1.0.0, color-name@~1.1.4: + version "1.1.4" + resolved "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz" + integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA== + +color-string@^1.9.0: + version "1.9.1" + resolved "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz" + integrity sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg== + dependencies: + color-name "^1.0.0" + simple-swizzle "^0.2.2" + +color@^4.2.3: + version "4.2.3" + resolved "https://registry.npmjs.org/color/-/color-4.2.3.tgz" + integrity sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A== + 
dependencies: + color-convert "^2.0.1" + color-string "^1.9.0" + +decompress-response@^6.0.0: + version "6.0.0" + resolved "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz" + integrity sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ== + dependencies: + mimic-response "^3.1.0" + +deep-extend@^0.6.0: + version "0.6.0" + resolved "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz" + integrity sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA== + +detect-libc@^2.0.0, detect-libc@^2.0.2: + version "2.0.3" + resolved "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.3.tgz" + integrity sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw== + +end-of-stream@^1.1.0, end-of-stream@^1.4.1: + version "1.4.4" + resolved "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz" + integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q== + dependencies: + once "^1.4.0" + +expand-template@^2.0.3: + version "2.0.3" + resolved "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz" + integrity sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg== + +fast-fifo@^1.1.0, fast-fifo@^1.2.0: + version "1.3.2" + resolved "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz" + integrity sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ== + +flatbuffers@^1.12.0: + version "1.12.0" + resolved "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.12.0.tgz" + integrity sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ== + +fs-constants@^1.0.0: + version "1.0.0" + resolved "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz" + integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow== + +github-from-package@0.0.0: + version "0.0.0" + resolved "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz" + integrity sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw== + +guid-typescript@^1.0.9: + version "1.0.9" + resolved "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz" + integrity sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ== + +hash-wasm@^4.9.0: + version "4.11.0" + resolved "https://registry.npmjs.org/hash-wasm/-/hash-wasm-4.11.0.tgz" + integrity sha512-HVusNXlVqHe0fzIzdQOGolnFN6mX/fqcrSAOcTBXdvzrXVHwTz11vXeKRmkR5gTuwVpvHZEIyKoePDvuAR+XwQ== + +ieee754@^1.1.13: + version "1.2.1" + resolved "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz" + integrity sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA== + +inherits@^2.0.3, inherits@^2.0.4: + version "2.0.4" + resolved "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz" + integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== + +ini@~1.3.0: + version "1.3.8" + resolved "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz" + integrity sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew== + +is-arrayish@^0.3.1: + version "0.3.2" + resolved "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz" + integrity 
sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ== + +long@^4.0.0: + version "4.0.0" + resolved "https://registry.npmjs.org/long/-/long-4.0.0.tgz" + integrity sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA== + +long@^5.0.0: + version "5.2.3" + resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz" + integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q== + +long@^5.2.3: + version "5.2.3" + resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz" + integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q== + +lru-cache@^6.0.0: + version "6.0.0" + resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz" + integrity sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA== + dependencies: + yallist "^4.0.0" + +mimic-response@^3.1.0: + version "3.1.0" + resolved "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz" + integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ== + +minimist@^1.2.0, minimist@^1.2.3: + version "1.2.8" + resolved "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz" + integrity sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA== + +mkdirp-classic@^0.5.2, mkdirp-classic@^0.5.3: + version "0.5.3" + resolved "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz" + integrity sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A== + +napi-build-utils@^1.0.1: + version "1.0.2" + resolved "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz" + integrity sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg== + +node-abi@^3.3.0: + version "3.57.0" + resolved "https://registry.npmjs.org/node-abi/-/node-abi-3.57.0.tgz" + integrity sha512-Dp+A9JWxRaKuHP35H77I4kCKesDy5HUDEmScia2FyncMTOXASMyg251F5PhFoDA5uqBrDDffiLpbqnrZmNXW+g== + dependencies: + semver "^7.3.5" + +node-addon-api@^6.1.0: + version "6.1.0" + resolved "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz" + integrity sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA== + +once@^1.3.1, once@^1.4.0: + version "1.4.0" + resolved "https://registry.npmjs.org/once/-/once-1.4.0.tgz" + integrity sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w== + dependencies: + wrappy "1" + +onnx-proto@^4.0.4: + version "4.0.4" + resolved "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz" + integrity sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA== + dependencies: + protobufjs "^6.8.8" + +onnxruntime-common@~1.14.0: + version "1.14.0" + resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz" + integrity sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew== + +onnxruntime-common@1.17.3: + version "1.17.3" + resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.17.3.tgz" + integrity sha512-IkbaDelNVX8cBfHFgsNADRIq2TlXMFWW+nG55mwWvQT4i0NZb32Jf35Pf6h9yjrnK78RjcnlNYaI37w394ovMw== + +onnxruntime-node@1.14.0: + version "1.14.0" + resolved "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz" + 
integrity sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w== + dependencies: + onnxruntime-common "~1.14.0" + +onnxruntime-web@^1.17.3: + version "1.17.3" + resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.17.3.tgz" + integrity sha512-MSDrNUWgc1biP0YzY488OJ9n/jTMS9EXysgm9Aw4CUj2A836ALbO2J1sgzguWJeVUHTlM6p7tRzo8IGAgaXWKw== + dependencies: + flatbuffers "^1.12.0" + guid-typescript "^1.0.9" + long "^5.2.3" + onnxruntime-common "1.17.3" + platform "^1.3.6" + protobufjs "^7.2.4" + +onnxruntime-web@1.14.0: + version "1.14.0" + resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz" + integrity sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw== + dependencies: + flatbuffers "^1.12.0" + guid-typescript "^1.0.9" + long "^4.0.0" + onnx-proto "^4.0.4" + onnxruntime-common "~1.14.0" + platform "^1.3.6" + +platform@^1.3.6: + version "1.3.6" + resolved "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz" + integrity sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg== + +prebuild-install@^7.1.1: + version "7.1.2" + resolved "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.2.tgz" + integrity sha512-UnNke3IQb6sgarcZIDU3gbMeTp/9SSU1DAIkil7PrqG1vZlBtY5msYccSKSHDqa3hNg436IXK+SNImReuA1wEQ== + dependencies: + detect-libc "^2.0.0" + expand-template "^2.0.3" + github-from-package "0.0.0" + minimist "^1.2.3" + mkdirp-classic "^0.5.3" + napi-build-utils "^1.0.1" + node-abi "^3.3.0" + pump "^3.0.0" + rc "^1.2.7" + simple-get "^4.0.0" + tar-fs "^2.0.0" + tunnel-agent "^0.6.0" + +protobufjs@^6.8.8: + version "6.11.4" + resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.4.tgz" + integrity sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw== + dependencies: + "@protobufjs/aspromise" "^1.1.2" + "@protobufjs/base64" "^1.1.2" + "@protobufjs/codegen" "^2.0.4" + "@protobufjs/eventemitter" "^1.1.0" + "@protobufjs/fetch" "^1.1.0" + "@protobufjs/float" "^1.0.2" + "@protobufjs/inquire" "^1.1.0" + "@protobufjs/path" "^1.1.2" + "@protobufjs/pool" "^1.1.0" + "@protobufjs/utf8" "^1.1.0" + "@types/long" "^4.0.1" + "@types/node" ">=13.7.0" + long "^4.0.0" + +protobufjs@^7.2.4: + version "7.2.6" + resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.6.tgz" + integrity sha512-dgJaEDDL6x8ASUZ1YqWciTRrdOuYNzoOf27oHNfdyvKqHr5i0FV7FSLU+aIeFjyFgVxrpTOtQUi0BLLBymZaBw== + dependencies: + "@protobufjs/aspromise" "^1.1.2" + "@protobufjs/base64" "^1.1.2" + "@protobufjs/codegen" "^2.0.4" + "@protobufjs/eventemitter" "^1.1.0" + "@protobufjs/fetch" "^1.1.0" + "@protobufjs/float" "^1.0.2" + "@protobufjs/inquire" "^1.1.0" + "@protobufjs/path" "^1.1.2" + "@protobufjs/pool" "^1.1.0" + "@protobufjs/utf8" "^1.1.0" + "@types/node" ">=13.7.0" + long "^5.0.0" + +pump@^3.0.0: + version "3.0.0" + resolved "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz" + integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww== + dependencies: + end-of-stream "^1.1.0" + once "^1.3.1" + +queue-tick@^1.0.1: + version "1.0.1" + resolved "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz" + integrity sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag== + +rc@^1.2.7: + version "1.2.8" + resolved "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz" + integrity 
sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw== + dependencies: + deep-extend "^0.6.0" + ini "~1.3.0" + minimist "^1.2.0" + strip-json-comments "~2.0.1" + +readable-stream@^3.1.1, readable-stream@^3.4.0: + version "3.6.2" + resolved "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz" + integrity sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA== + dependencies: + inherits "^2.0.3" + string_decoder "^1.1.1" + util-deprecate "^1.0.1" + +safe-buffer@^5.0.1, safe-buffer@~5.2.0: + version "5.2.1" + resolved "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz" + integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== + +semver@^7.3.5, semver@^7.5.4: + version "7.6.0" + resolved "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz" + integrity sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg== + dependencies: + lru-cache "^6.0.0" + +sharp@^0.32.0: + version "0.32.6" + resolved "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz" + integrity sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w== + dependencies: + color "^4.2.3" + detect-libc "^2.0.2" + node-addon-api "^6.1.0" + prebuild-install "^7.1.1" + semver "^7.5.4" + simple-get "^4.0.1" + tar-fs "^3.0.4" + tunnel-agent "^0.6.0" + +simple-concat@^1.0.0: + version "1.0.1" + resolved "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz" + integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q== + +simple-get@^4.0.0, simple-get@^4.0.1: + version "4.0.1" + resolved "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz" + integrity sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA== + dependencies: + decompress-response "^6.0.0" + once "^1.3.1" + simple-concat "^1.0.0" + +simple-swizzle@^0.2.2: + version "0.2.2" + resolved "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz" + integrity sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg== + dependencies: + is-arrayish "^0.3.1" + +streamx@^2.13.0, streamx@^2.15.0: + version "2.16.1" + resolved "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz" + integrity sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ== + dependencies: + fast-fifo "^1.1.0" + queue-tick "^1.0.1" + optionalDependencies: + bare-events "^2.2.0" + +string_decoder@^1.1.1: + version "1.3.0" + resolved "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz" + integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA== + dependencies: + safe-buffer "~5.2.0" + +strip-json-comments@~2.0.1: + version "2.0.1" + resolved "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz" + integrity sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ== + +tar-fs@^2.0.0: + version "2.1.1" + resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz" + integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng== + dependencies: + chownr "^1.1.1" + mkdirp-classic "^0.5.2" + pump "^3.0.0" + tar-stream "^2.1.4" + +tar-fs@^3.0.4: + version "3.0.5" + resolved 
"https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.5.tgz" + integrity sha512-JOgGAmZyMgbqpLwct7ZV8VzkEB6pxXFBVErLtb+XCOqzc6w1xiWKI9GVd6bwk68EX7eJ4DWmfXVmq8K2ziZTGg== + dependencies: + pump "^3.0.0" + tar-stream "^3.1.5" + optionalDependencies: + bare-fs "^2.1.1" + bare-path "^2.1.0" + +tar-stream@^2.1.4: + version "2.2.0" + resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz" + integrity sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ== + dependencies: + bl "^4.0.3" + end-of-stream "^1.4.1" + fs-constants "^1.0.0" + inherits "^2.0.3" + readable-stream "^3.1.1" + +tar-stream@^3.1.5: + version "3.1.7" + resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz" + integrity sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ== + dependencies: + b4a "^1.6.4" + fast-fifo "^1.2.0" + streamx "^2.15.0" + +tunnel-agent@^0.6.0: + version "0.6.0" + resolved "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz" + integrity sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w== + dependencies: + safe-buffer "^5.0.1" + +undici-types@~5.26.4: + version "5.26.5" + resolved "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz" + integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA== + +util-deprecate@^1.0.1: + version "1.0.2" + resolved "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz" + integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw== + +wrappy@1: + version "1.0.2" + resolved "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz" + integrity sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ== + +yallist@^4.0.0: + version "4.0.0" + resolved "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz" + integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A== From 2f814135423b1c01a1793461f1c15e1c9328a0b0 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 18 Apr 2024 20:15:00 +0000 Subject: [PATCH 09/40] Make: Consistent naming between Python and TS --- javascript/embeddings.mts | 55 ------------------------- javascript/encoders.mts | 74 ++++++++++++++++++++++++++++++++++ javascript/encoders_test.ts | 39 ++++++++++++++++++ package.json | 36 ++++++++++++----- python/uform/__init__.py | 8 ++-- python/uform/onnx_encoders.py | 8 ++-- python/uform/torch_decoders.py | 4 +- python/uform/torch_encoders.py | 14 +++---- tsconfig.json | 20 +++++++-- 9 files changed, 172 insertions(+), 86 deletions(-) delete mode 100644 javascript/embeddings.mts create mode 100644 javascript/encoders.mts create mode 100644 javascript/encoders_test.ts diff --git a/javascript/embeddings.mts b/javascript/embeddings.mts deleted file mode 100644 index 6a34344..0000000 --- a/javascript/embeddings.mts +++ /dev/null @@ -1,55 +0,0 @@ -import * as ort from 'onnxruntime-web'; -import { AutoTokenizer, PreTrainedTokenizer } from '@xenova/transformers'; - -type ModelConfig = { - modelPath: string; - tokenizerPath: string; -}; - -class TextEncoder { - private session: ort.InferenceSession; - private tokenizer: PreTrainedTokenizer; - - constructor(private config: ModelConfig) {} - - async init(): Promise { - this.tokenizer = await AutoTokenizer.from_pretrained(this.config.tokenizerPath); - this.session = 
await ort.InferenceSession.create(this.config.modelPath); - } - - async forward(text: string): Promise<{ features: Uint8Array, embeddings: Uint8Array }> { - // Tokenization - const { input_ids } = await this.tokenizer(text); - const tensorInputIds = new ort.Tensor('float32', Float32Array.from(input_ids), [1, input_ids.length]); - const tensorAttentionMask = new ort.Tensor('float32', new Float32Array(input_ids.length).fill(1), [1, input_ids.length]); - - // Model inference - const feeds = { input_ids: tensorInputIds, attention_mask: tensorAttentionMask }; - const results = await this.session.run(feeds); - - // Assume output tensors are in results['features'] and results['embeddings'] - const features = results['features'].data as Uint8Array! - const embeddings = results['embeddings'].data as Uint8Array! - - return { features, embeddings }; - } -} - -// Usage -async function main() { - const textEncoder = new TextEncoder({ - modelPath: './text_encoder.onnx', - tokenizerPath: 'Xenova/bert-base-uncased' - }); - - await textEncoder.init(); - const result = await textEncoder.forward('I love transformers!'); - console.log('Features:', result.features); - console.log('Embeddings:', result.embeddings); -} - -main(); - - - - diff --git a/javascript/encoders.mts b/javascript/encoders.mts new file mode 100644 index 0000000..cc3b754 --- /dev/null +++ b/javascript/encoders.mts @@ -0,0 +1,74 @@ +import { downloadFile, listFiles, RepoDesignation, Credentials } from "@huggingface/hub"; + +export enum Modality { + TextEncoder = "text_encoder", + ImageEncoder = "image_encoder", + VideoEncoder = "video_encoder", + TextDecoder = "text_decoder", +} + +function isModality(key: any): key is keyof typeof Modality { + return Object.keys(Modality).includes(key); +} + +function normalizeModalities(modalities: Array): Array { + return modalities.map(x => { + if (typeof x === "string") { + if (isModality(Modality[x as keyof typeof Modality])) { + return Modality[x as keyof typeof Modality]; + } else { + throw new Error(`Invalid modality: ${x}`); + } + } + return x; + }); +} + +export async function getCheckpoint( + modelId: string, + modalities: Array, + token: string | null = null, + format: '.pt' | '.onnx' = '.onnx' +): Promise<{ configPath: string | null, modalityPaths: Record | null, tokenizerPath: string | null }> { + modalities = normalizeModalities(modalities); + + const configNames = ['config.json']; + const tokenizerNames = ['tokenizer.json']; + const modelFileNames = modalities.map(modality => `${modality}${format}`); + const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames]; + + const repo: RepoDesignation = { type: "model", name: modelId }; + const credentials: Credentials | undefined = token ? 
{ accessToken: token } : undefined; + + let configPath: string | null = null; + let tokenizerPath: string | null = null; + const modalityPaths: Record = {}; + + // List files and directly process + const fileIterator = listFiles({ repo, recursive: true, credentials }); + for await (const file of fileIterator) { + const fileName = file.path.split('/').pop(); + if (fileName && allowedPatterns.includes(fileName)) { + const filePath = file.path; + if (configNames.includes(fileName)) { + configPath = filePath; + } else if (tokenizerNames.includes(fileName)) { + tokenizerPath = filePath; + } else { + const modalityName = fileName.split('.')[0]; + modalityPaths[modalityName] = filePath; + } + + // Download the file + const response = await downloadFile({ repo, path: filePath, credentials }); + if (response) { + // Handle file response, save locally or process in-memory as needed + // Example: Save to a local file or process the file contents + console.log(`Downloaded ${fileName} successfully.`); + } + } + } + + return { configPath, modalityPaths, tokenizerPath }; +} + diff --git a/javascript/encoders_test.ts b/javascript/encoders_test.ts new file mode 100644 index 0000000..b5b9f04 --- /dev/null +++ b/javascript/encoders_test.ts @@ -0,0 +1,39 @@ +import { getCheckpoint } from "./encoders.mts"; +import { Modality } from "./encoders.mts"; + +// Simple function to assert conditions +function assert(condition: boolean, message: string) { + if (!condition) { + throw new Error(message); + } +} + +// Test case for getCheckpoint function +async function testGetCheckpoint() { + console.log("Test getCheckpoint: Start"); + + try { + const modelId = 'uform3-image-text-english-small'; // Example model ID + const token = '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'; // Example token + const modalities = [Modality.TextEncoder, Modality.ImageEncoder]; + + const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + modelId, + modalities, + token, + '.onnx' + ); + + // Asserts to check if the paths are not null (indicating successful file retrieval) + assert(configPath !== null, "Config path should not be null"); + assert(modalityPaths !== null, "Modality paths should not be null"); + assert(tokenizerPath !== null, "Tokenizer path should not be null"); + + console.log("Test getCheckpoint: Success"); + } catch (error) { + console.error("Test getCheckpoint: Failed", error); + } +} + +// Run the test +testGetCheckpoint(); diff --git a/package.json b/package.json index 7331231..5bf593e 100644 --- a/package.json +++ b/package.json @@ -1,11 +1,27 @@ { - "name": "uform", - "private": true, - "version": "2.0.2", - "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", - "dependencies": { - "@huggingface/hub": "^0.14.8", - "@xenova/transformers": "^2.17.0", - "onnxruntime-web": "^1.17.3" - } -} + "name": "uform", + "private": true, + "version": "2.0.2", + "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", + "dependencies": { + "@huggingface/hub": "^0.14.8", + "@xenova/transformers": "^2.17.0", + "onnxruntime-web": "^1.17.3" + }, + "devDependencies": { + "typescript": "^4.0.5", + "ts-node": "^9.0.0", + "@types/node": "^14.14.7" + }, + "scripts": { + "build": "tsc", + "test": "ts-node javascript/encoders_test.ts" + }, + "main": "node_build/encoders.js", + "directories": { + "doc": "docs" + }, + "keywords": [], + "author": "", + "license": "ISC" +} \ No newline at end of file diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 
44fce13..74d5ee9 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -90,7 +90,7 @@ def get_model( token: Optional[str] = None, modalities: Optional[Tuple[str]] = None, ): - from uform.torch_encoders import TextVisualEncoder + from uform.torch_encoders import TextImageEncoder from uform.torch_processors import TorchProcessor config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".pt") @@ -98,7 +98,7 @@ def get_model( {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths ) - model = TextVisualEncoder(config_path, modality_paths) + model = TextImageEncoder(config_path, modality_paths) processor = TorchProcessor(config_path, tokenizer_path) return model.eval(), processor @@ -111,7 +111,7 @@ def get_model_onnx( token: Optional[str] = None, modalities: Optional[Tuple[str]] = None, ): - from uform.onnx_encoders import TextVisualEncoder + from uform.onnx_encoders import TextImageEncoder from uform.numpy_processors import NumPyProcessor config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".onnx") @@ -119,7 +119,7 @@ def get_model_onnx( {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths ) - model = TextVisualEncoder(config_path, modality_paths, device=device) + model = TextImageEncoder(config_path, modality_paths, device=device) processor = NumPyProcessor(config_path, tokenizer_path) return model, processor diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index 8201693..95a0f73 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -58,7 +58,7 @@ def available_providers(device: Optional[str]) -> Tuple[str, ...]: return (device,) -class VisualEncoder: +class ImageEncoder: def __init__(self, model_path: str, device: str): """ :param model_path: Path to onnx model @@ -99,7 +99,7 @@ def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask}) -class TextVisualEncoder: +class TextImageEncoder: def __init__( self, config_path: PathLike, @@ -123,7 +123,7 @@ def __init__( text_encoder_path = modality_paths.get("text_encoder", None) image_encoder_path = modality_paths.get("image_encoder", None) self.text_encoder = TextEncoder(text_encoder_path, device) if text_encoder_path else None - self.image_encoder = VisualEncoder(image_encoder_path, device) if image_encoder_path else None + self.image_encoder = ImageEncoder(image_encoder_path, device) if image_encoder_path else None def encode_image( self, @@ -200,4 +200,4 @@ def multimodal_embedding_dim(self) -> int: return self._text_encoder_dim -VLM_ONNX = TextVisualEncoder # legacy +VLM_ONNX = TextImageEncoder # legacy diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py index 79b058d..db60d63 100644 --- a/python/uform/torch_decoders.py +++ b/python/uform/torch_decoders.py @@ -20,7 +20,7 @@ from transformers.processing_utils import ProcessorMixin from transformers.tokenization_utils_base import BatchEncoding -from uform.torch_encoders import VisualEncoder +from uform.torch_encoders import ImageEncoder IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) @@ -143,7 +143,7 @@ def __init__(self, config: VLMConfig): self.text_config.vocab_size += 3 self.text_decoder = 
AutoModelForCausalLM.from_config(self.text_config) - self.image_encoder = VisualEncoder( + self.image_encoder = ImageEncoder( self.config.image_encoder_hidden_size, self.config.image_encoder_patch_size, self.config.image_size, diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index 2a0a0c9..f122606 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -133,7 +133,7 @@ def forward( @dataclass(eq=False) -class VisualEncoderBlock(nn.Module): +class ImageEncoderBlock(nn.Module): dim: int num_heads: int @@ -293,7 +293,7 @@ def forward( @dataclass(eq=False) -class VisualEncoder(nn.Module): +class ImageEncoder(nn.Module): dim: int patch_size: int image_size: int @@ -315,7 +315,7 @@ def __post_init__(self): self.reg_token = nn.Parameter(torch.zeros(1, self.num_reg_tokens, self.dim)) self.blocks = nn.Sequential( - *[VisualEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)], + *[ImageEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)], ) self.norm = nn.LayerNorm(self.dim, eps=1e-6) @@ -354,7 +354,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return embeddings -class TextVisualEncoder(nn.Module): +class TextImageEncoder(nn.Module): """ Vision-Language Model for Multimodal embeddings. """ @@ -379,11 +379,11 @@ def __init__( # Both `text_encoder` and `image_encoder` are data-classes, so we must strip # all the non-member attributes before initializing the classes. text_fields = TextEncoder.__dataclass_fields__ - image_fields = VisualEncoder.__dataclass_fields__ + image_fields = ImageEncoder.__dataclass_fields__ text_encoder_attrs = {k: v for k, v in config["text_encoder"].items() if k in text_fields} image_encoder_attrs = {k: v for k, v in config["image_encoder"].items() if k in image_fields} self.text_encoder = TextEncoder(**text_encoder_attrs) - self.image_encoder = VisualEncoder(**image_encoder_attrs) + self.image_encoder = ImageEncoder(**image_encoder_attrs) # Load pre-trained weights if modality_paths is not None: @@ -535,4 +535,4 @@ def multimodal_embedding_dim(self) -> int: return self.text_encoder.dim -VLM = TextVisualEncoder # legacy +VLM = TextImageEncoder # legacy diff --git a/tsconfig.json b/tsconfig.json index a77b46b..a489f33 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,8 +1,20 @@ { "compilerOptions": { - "target": "ES5", "module": "CommonJS", - "outDir": "node_build", - "sourceMap": true - } + "target": "ES2018", + "esModuleInterop": true, + "moduleResolution": "node", + "baseUrl": ".", + "outDir": "dist", + "allowImportingTsExtensions": true, + "paths": { + "*": [ + "node_modules/*", + "javascript/*" + ] + } + }, + "include": [ + "javascript/**/*" + ] } \ No newline at end of file From eb88296f5397ce5e47b668c3652dcd3f875c20aa Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 18 Apr 2024 23:16:59 +0000 Subject: [PATCH 10/40] Improve: Separate text and image processors --- python/uform/numpy_processors.py | 32 +++++++++++++++----- python/uform/torch_processors.py | 50 ++++++++++++++++++++------------ 2 files changed, 56 insertions(+), 26 deletions(-) diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index d300504..afda329 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -7,25 +7,20 @@ import numpy as np -class NumPyProcessor: +class TextProcessor: def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param 
config: model config :param tokenizer_path: path to tokenizer file - :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ config = json.load(open(config_path, "r")) - self._image_size = config["image_encoder"]["image_size"] self._max_seq_len = config["text_encoder"]["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) self._tokenizer.no_padding() self._pad_token_idx = config["text_encoder"]["padding_idx"] - self.image_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)[None, None] - self.image_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)[None, None] - - def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]: + def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]: """Transforms one or more strings into dictionary with tokenized strings and attention masks. :param texts: text of list of texts to tokenizer @@ -53,7 +48,28 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray] return {"input_ids": input_ids, "attention_mask": attention_mask} - def preprocess_image(self, images: Union[Image, List[Image]]) -> np.ndarray: + +class ImageProcessor: + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): + """ + :param config: model config + :param tokenizer_path: path to tokenizer file + :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) + """ + + config = json.load(open(config_path, "r")) + self._image_size = config["image_encoder"]["image_size"] + self._normalization_means = config["image_encoder"]["normalization_means"] + self._normalization_deviations = config["image_encoder"]["normalization_deviations"] + + assert isinstance(self._image_size, int) and self._image_size > 0 + assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) + assert len(self._normalization_means) == len(self._normalization_deviations) == 3 + + self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None] + self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None] + + def __call__(self, images: Union[Image, List[Image]]) -> np.ndarray: """Transforms one or more Pillow images into Torch Tensors. 
:param images: image or list of images to preprocess diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py index b435efb..340b117 100644 --- a/python/uform/torch_processors.py +++ b/python/uform/torch_processors.py @@ -21,35 +21,20 @@ def convert_to_rgb(image): return image.convert("RGB") -class TorchProcessor: +class TextProcessor: def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to tokenizer file - :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ config = json.load(open(config_path, "r")) - self._image_size = config["image_encoder"]["image_size"] self._max_seq_len = config["text_encoder"]["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) self._tokenizer.no_padding() self._pad_token_idx = config["text_encoder"]["padding_idx"] - self._image_transform = Compose( - [ - Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), - convert_to_rgb, - CenterCrop(self._image_size), - ToTensor(), - Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ], - ) - - def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: + def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: """Transforms one or more strings into dictionary with tokenized strings and attention masks. :param texts: text of list of texts to tokenizer @@ -79,7 +64,36 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: return {"input_ids": input_ids, "attention_mask": attention_mask} - def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor: + +class ImageProcessor: + def __init__(self, config_path: PathLike): + """ + :param config: model config + """ + + config = json.load(open(config_path, "r")) + self._image_size = config["image_encoder"]["image_size"] + self._normalization_means = config["image_encoder"]["normalization_means"] + self._normalization_deviations = config["image_encoder"]["normalization_deviations"] + + assert isinstance(self._image_size, int) and self._image_size > 0 + assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) + assert len(self._normalization_means) == len(self._normalization_deviations) == 3 + + self._image_transform = Compose( + [ + Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), + convert_to_rgb, + CenterCrop(self._image_size), + ToTensor(), + Normalize( + mean=tuple(self._normalization_means), + std=tuple(self._normalization_deviations), + ), + ], + ) + + def __call__(self, images: Union[Image, List[Image]]) -> Tensor: """Transforms one or more Pillow images into Torch Tensors. 
:param images: image or list of images to preprocess From a391b6d79595a97e89843d64a24fa86119e68356 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 18 Apr 2024 23:17:17 +0000 Subject: [PATCH 11/40] Make: Deprecate TypeScript for JavaScript --- CONTRIBUTING.md | 14 +++++++ javascript/encoders.mjs | 38 +++++++++++++++++++ javascript/encoders.mts | 74 ------------------------------------ javascript/encoders_test.js | 75 +++++++++++++++++++++++++++++++++++++ javascript/encoders_test.ts | 39 ------------------- javascript/hub.mjs | 68 +++++++++++++++++++++++++++++++++ package.json | 22 ++++++----- tsconfig.json | 20 ---------- 8 files changed, 208 insertions(+), 142 deletions(-) create mode 100644 javascript/encoders.mjs delete mode 100644 javascript/encoders.mts create mode 100644 javascript/encoders_test.js delete mode 100644 javascript/encoders_test.ts create mode 100644 javascript/hub.mjs delete mode 100644 tsconfig.json diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bf4f409..bcf6d91 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,9 +40,23 @@ As there is no standard for Swift formatting, even Apple's own `swift-format` to ## JavaScript +For rapid development you can avoid the TypeScript precompilation step: + +```sh +npm install -g ts-node +ts-node javascript/embeddings.mts +``` + Before submitting any changes, please make sure that the tests pass. ```sh npm install +npm run build npm run test ``` + +``` +tsc +node node_build/embeddings.mjs +``` + diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs new file mode 100644 index 0000000..0107764 --- /dev/null +++ b/javascript/encoders.mjs @@ -0,0 +1,38 @@ +import { readFileSync } from 'fs'; +import { InferenceSession } from 'onnxruntime-web'; + +import { getCheckpoint, Modality } from "./hub.mjs"; + +import { AutoTokenizer } from '@xenova/transformers'; + + +class TextProcessor { + + async init(configPath, tokenizerPath) { + const config = JSON.parse(readFileSync(configPath, { encoding: 'utf8' })); + this.maxSeqLen = config.text_encoder.max_position_embeddings; + this.padTokenIdx = config.text_encoder.padding_idx; + this.tokenizer = await AutoTokenizer.from_pretrained(tokenizerPath); + } + + async processTexts(texts) { + if (typeof texts === 'string') { + texts = [texts]; + } + + const encoded = await this.tokenizer.encodeBatch(texts, { + addSpecialTokens: true, + returnAttentionMask: true, + padding: 'max_length', + max_length: this.maxSeqLen, + truncation: true, + return_tensors: 'np' + }); + + const inputIds = encoded.map(e => e.input_ids); + const attentionMask = encoded.map(e => e.attention_mask); + return { inputIds, attentionMask }; + } +} + +export { TextProcessor }; diff --git a/javascript/encoders.mts b/javascript/encoders.mts deleted file mode 100644 index cc3b754..0000000 --- a/javascript/encoders.mts +++ /dev/null @@ -1,74 +0,0 @@ -import { downloadFile, listFiles, RepoDesignation, Credentials } from "@huggingface/hub"; - -export enum Modality { - TextEncoder = "text_encoder", - ImageEncoder = "image_encoder", - VideoEncoder = "video_encoder", - TextDecoder = "text_decoder", -} - -function isModality(key: any): key is keyof typeof Modality { - return Object.keys(Modality).includes(key); -} - -function normalizeModalities(modalities: Array): Array { - return modalities.map(x => { - if (typeof x === "string") { - if (isModality(Modality[x as keyof typeof Modality])) { - return Modality[x as keyof typeof Modality]; - } else { - throw new Error(`Invalid 
modality: ${x}`); - } - } - return x; - }); -} - -export async function getCheckpoint( - modelId: string, - modalities: Array, - token: string | null = null, - format: '.pt' | '.onnx' = '.onnx' -): Promise<{ configPath: string | null, modalityPaths: Record | null, tokenizerPath: string | null }> { - modalities = normalizeModalities(modalities); - - const configNames = ['config.json']; - const tokenizerNames = ['tokenizer.json']; - const modelFileNames = modalities.map(modality => `${modality}${format}`); - const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames]; - - const repo: RepoDesignation = { type: "model", name: modelId }; - const credentials: Credentials | undefined = token ? { accessToken: token } : undefined; - - let configPath: string | null = null; - let tokenizerPath: string | null = null; - const modalityPaths: Record = {}; - - // List files and directly process - const fileIterator = listFiles({ repo, recursive: true, credentials }); - for await (const file of fileIterator) { - const fileName = file.path.split('/').pop(); - if (fileName && allowedPatterns.includes(fileName)) { - const filePath = file.path; - if (configNames.includes(fileName)) { - configPath = filePath; - } else if (tokenizerNames.includes(fileName)) { - tokenizerPath = filePath; - } else { - const modalityName = fileName.split('.')[0]; - modalityPaths[modalityName] = filePath; - } - - // Download the file - const response = await downloadFile({ repo, path: filePath, credentials }); - if (response) { - // Handle file response, save locally or process in-memory as needed - // Example: Save to a local file or process the file contents - console.log(`Downloaded ${fileName} successfully.`); - } - } - } - - return { configPath, modalityPaths, tokenizerPath }; -} - diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js new file mode 100644 index 0000000..b3bad21 --- /dev/null +++ b/javascript/encoders_test.js @@ -0,0 +1,75 @@ +import { existsSync } from 'fs'; + +import { getCheckpoint, Modality } from "./hub.mjs"; +import { TextProcessor } from "./encoders.mjs"; + +function assert(condition, message) { + if (!condition) { + throw new Error(message); + } +} + +async function testGetCheckpoint() { + console.log("Test getCheckpoint: Start"); + + try { + const modelId = 'unum-cloud/uform3-image-text-english-small'; + const token = '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'; + const modalities = [Modality.TextEncoder, Modality.ImageEncoder]; + + const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + modelId, + modalities, + token, + '.onnx' + ); + + assert(configPath !== null, "Config path should not be null"); + assert(modalityPaths !== null, "Modality paths should not be null"); + assert(tokenizerPath !== null, "Tokenizer path should not be null"); + + // Check if the file actually exists + assert(existsSync(configPath), `Config file should exist at ${configPath}`); + assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`); + for (const modalityPath of Object.values(modalityPaths)) { + assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`); + } + + console.log("Test getCheckpoint: Success"); + } catch (error) { + console.error("Test getCheckpoint: Failed", error); + } +} + +async function testTextEncoder() { + console.log("Test TextEncoder: Start"); + + try { + const modelId = 'unum-cloud/uform3-image-text-english-small'; + const token = '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'; + const modalities = 
[Modality.TextEncoder, Modality.ImageEncoder]; + + const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + modelId, + modalities, + token, + '.onnx' + ); + + assert(configPath !== null, "Config path should not be null"); + assert(modalityPaths !== null, "Modality paths should not be null"); + assert(tokenizerPath !== null, "Tokenizer path should not be null"); + + const textProcessor = new TextProcessor(); + await textProcessor.init(configPath, tokenizerPath); + const processedTexts = await textProcessor.processTexts(["Hello, world!", "Another example text."]); + console.log(processedTexts); + + console.log("Test getCheckpoint: Success"); + } catch (error) { + console.error("Test getCheckpoint: Failed", error); + } +} + +testGetCheckpoint(); +testTextEncoder(); diff --git a/javascript/encoders_test.ts b/javascript/encoders_test.ts deleted file mode 100644 index b5b9f04..0000000 --- a/javascript/encoders_test.ts +++ /dev/null @@ -1,39 +0,0 @@ -import { getCheckpoint } from "./encoders.mts"; -import { Modality } from "./encoders.mts"; - -// Simple function to assert conditions -function assert(condition: boolean, message: string) { - if (!condition) { - throw new Error(message); - } -} - -// Test case for getCheckpoint function -async function testGetCheckpoint() { - console.log("Test getCheckpoint: Start"); - - try { - const modelId = 'uform3-image-text-english-small'; // Example model ID - const token = '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'; // Example token - const modalities = [Modality.TextEncoder, Modality.ImageEncoder]; - - const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( - modelId, - modalities, - token, - '.onnx' - ); - - // Asserts to check if the paths are not null (indicating successful file retrieval) - assert(configPath !== null, "Config path should not be null"); - assert(modalityPaths !== null, "Modality paths should not be null"); - assert(tokenizerPath !== null, "Tokenizer path should not be null"); - - console.log("Test getCheckpoint: Success"); - } catch (error) { - console.error("Test getCheckpoint: Failed", error); - } -} - -// Run the test -testGetCheckpoint(); diff --git a/javascript/hub.mjs b/javascript/hub.mjs new file mode 100644 index 0000000..99ebfee --- /dev/null +++ b/javascript/hub.mjs @@ -0,0 +1,68 @@ +import { downloadFile, listFiles } from "@huggingface/hub"; + +const Modality = { + TextEncoder: "text_encoder", + ImageEncoder: "image_encoder", + VideoEncoder: "video_encoder", + TextDecoder: "text_decoder", +}; + +function isModality(value) { + return Object.values(Modality).includes(value); +} + +function normalizeModalities(modalities) { + return modalities.map(x => { + if (typeof x === "string") { + if (isModality(x)) { + return x; + } else { + throw new Error(`Invalid modality: ${x}`); + } + } + return x; + }); +} + +async function getCheckpoint( + modelId, modalities, token = null, format = '.onnx', +) { + modalities = normalizeModalities(modalities); + + const configNames = ['config.json']; + const tokenizerNames = ['tokenizer.json']; + const modelFileNames = modalities.map(modality => `${modality}${format}`); + const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames]; + + const repo = { type: "model", name: modelId }; + const credentials = token ? 
{ accessToken: token } : undefined; + + let configPath = null; + let tokenizerPath = null; + const modalityPaths = {}; + + const fileIterator = listFiles({ repo, recursive: true, credentials }); + for await (const file of fileIterator) { + const fileName = file.path.split('/').pop(); + if (fileName && allowedPatterns.includes(fileName)) { + const filePath = file.path; + if (configNames.includes(fileName)) { + configPath = filePath; + } else if (tokenizerNames.includes(fileName)) { + tokenizerPath = filePath; + } else { + const modalityName = fileName.split('.')[0]; + modalityPaths[modalityName] = filePath; + } + + const response = await downloadFile({ repo, path: filePath, credentials }); + if (response) { + console.log(`Downloaded ${fileName} successfully to ${response.json()}`); + } + } + } + + return { configPath, modalityPaths, tokenizerPath }; +} + +export { getCheckpoint, Modality }; diff --git a/package.json b/package.json index 5bf593e..a25922f 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,6 @@ { "name": "uform", + "type": "module", "private": true, "version": "2.0.2", "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", @@ -9,19 +10,22 @@ "onnxruntime-web": "^1.17.3" }, "devDependencies": { - "typescript": "^4.0.5", - "ts-node": "^9.0.0", - "@types/node": "^14.14.7" + "nodemon": "^2.0.15" }, "scripts": { - "build": "tsc", - "test": "ts-node javascript/encoders_test.ts" + "start": "node javascript/encoders.mjs", + "test": "node javascript/encoders_test.js" }, - "main": "node_build/encoders.js", + "main": "javascript/encoders.mjs", "directories": { "doc": "docs" }, - "keywords": [], - "author": "", - "license": "ISC" + "keywords": [ + "AI", + "multimodal", + "content generation", + "huggingface" + ], + "author": "Ash Vardanian, Unum Cloud", + "license": "Apache-2.0" } \ No newline at end of file diff --git a/tsconfig.json b/tsconfig.json deleted file mode 100644 index a489f33..0000000 --- a/tsconfig.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "compilerOptions": { - "module": "CommonJS", - "target": "ES2018", - "esModuleInterop": true, - "moduleResolution": "node", - "baseUrl": ".", - "outDir": "dist", - "allowImportingTsExtensions": true, - "paths": { - "*": [ - "node_modules/*", - "javascript/*" - ] - } - }, - "include": [ - "javascript/**/*" - ] -} \ No newline at end of file From 50c71c80741962baac11752333964438a1a1a87e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 19 Apr 2024 02:37:25 +0000 Subject: [PATCH 12/40] Add: Text processor for JS --- .vscode/launch.json | 8 +--- javascript/encoders.mjs | 93 ++++++++++++++++++++++++++++++------- javascript/encoders_test.js | 13 ++++-- javascript/hub.mjs | 50 +++++++++++++++++--- package.json | 3 +- 5 files changed, 132 insertions(+), 35 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 305841e..3343a11 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -13,13 +13,9 @@ }, { "name": "NodeJS Debugger", - "type": "node", + "type": "node-terminal", "request": "launch", - "program": "${workspaceFolder}/javascript/embeddings.ts", - "preLaunchTask": "tsc: build - tsconfig.json", - "outFiles": [ - "${workspaceFolder}/node_build/**/*.js" - ] + "command": "npm run test", } ] } \ No newline at end of file diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index 0107764..bd04ee6 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -1,38 +1,97 @@ import { readFileSync } from 'fs'; -import { 
InferenceSession } from 'onnxruntime-web'; - +import { InferenceSession, Tensor } from 'onnxruntime-node'; import { getCheckpoint, Modality } from "./hub.mjs"; - -import { AutoTokenizer } from '@xenova/transformers'; +import { PreTrainedTokenizer } from '@xenova/transformers'; class TextProcessor { - async init(configPath, tokenizerPath) { - const config = JSON.parse(readFileSync(configPath, { encoding: 'utf8' })); + constructor(configPath, tokenizerPath) { + this.configPath = configPath; + this.tokenizerPath = tokenizerPath; + + this.maxSeqLen = 0; + this.padTokenIdx = 0; + this.tokenizer = null; + } + + async init() { + const config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' })); this.maxSeqLen = config.text_encoder.max_position_embeddings; this.padTokenIdx = config.text_encoder.padding_idx; - this.tokenizer = await AutoTokenizer.from_pretrained(tokenizerPath); + + const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' })); + this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config.text_encoder); + this.tokenizer.model_max_length = this.maxSeqLen; + this.tokenizer.pad_token_id = this.padTokenIdx; } - async processTexts(texts) { - if (typeof texts === 'string') { - texts = [texts]; - } + async process(texts) { - const encoded = await this.tokenizer.encodeBatch(texts, { + const encoded = await this.tokenizer(texts, { addSpecialTokens: true, returnAttentionMask: true, padding: 'max_length', max_length: this.maxSeqLen, truncation: true, - return_tensors: 'np' }); - const inputIds = encoded.map(e => e.input_ids); - const attentionMask = encoded.map(e => e.attention_mask); - return { inputIds, attentionMask }; + return { + 'input_ids': encoded.input_ids, + 'attention_mask': encoded.attention_mask, + }; + } +} + +class TextEncoder { + + constructor(configPath, modelPath, tokenizerPath) { + this.configPath = configPath; + this.modelPath = modelPath; + this.tokenizerPath = tokenizerPath; + + this.session = null; + } + + async init() { + this.session = await InferenceSession.create(this.modelPath); } + + async forward(inputs) { + // Helper function to convert any BigInt64Array or other numeric arrays to Int32Array + function convertToCompatibleInt32(data) { + if (data instanceof Int32Array) { + return data; // Already the correct type + } else if (data instanceof BigInt64Array) { + // Convert BigInt64Array to Int32Array, ensuring values are within range + return new Int32Array(data.map(bigInt => { + if (bigInt > 2147483647n || bigInt < -2147483648n) { + throw new Error("Value out of range for Int32."); + } + return Number(bigInt); // Convert BigInt to Number and store in Int32Array + })); + } else if (Array.isArray(data) || data instanceof Uint32Array) { + // Convert other numeric array types to Int32Array + return new Int32Array(data.map(Number)); + } + throw new Error("Unsupported data type for tensor conversion."); + } + + // Prepare the tensor data using the helper function + const inputIDsData = convertToCompatibleInt32(inputs.input_ids.data); + const attentionMaskData = convertToCompatibleInt32(inputs.attention_mask.data); + + // Create ONNX Tensors as int32 + const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims); + const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims); + + // Run the model inference + return this.session.run({ + input_ids: inputIDs, + attention_mask: attentionMask, + }); + } + } -export { TextProcessor }; +export { TextProcessor, TextEncoder }; diff --git 
a/javascript/encoders_test.js b/javascript/encoders_test.js index b3bad21..1a09cac 100644 --- a/javascript/encoders_test.js +++ b/javascript/encoders_test.js @@ -1,7 +1,7 @@ import { existsSync } from 'fs'; import { getCheckpoint, Modality } from "./hub.mjs"; -import { TextProcessor } from "./encoders.mjs"; +import { TextProcessor, TextEncoder } from "./encoders.mjs"; function assert(condition, message) { if (!condition) { @@ -60,11 +60,16 @@ async function testTextEncoder() { assert(modalityPaths !== null, "Modality paths should not be null"); assert(tokenizerPath !== null, "Tokenizer path should not be null"); - const textProcessor = new TextProcessor(); - await textProcessor.init(configPath, tokenizerPath); - const processedTexts = await textProcessor.processTexts(["Hello, world!", "Another example text."]); + const textProcessor = new TextProcessor(configPath, tokenizerPath); + await textProcessor.init(); + const processedTexts = await textProcessor.process("Hello, world!"); console.log(processedTexts); + const textEncoder = new TextEncoder(configPath, modalityPaths.text_encoder, tokenizerPath); + await textEncoder.init(); + const output = await textEncoder.forward(processedTexts); + console.log(output); + console.log("Test getCheckpoint: Success"); } catch (error) { console.error("Test getCheckpoint: Failed", error); diff --git a/javascript/hub.mjs b/javascript/hub.mjs index 99ebfee..ad534f3 100644 --- a/javascript/hub.mjs +++ b/javascript/hub.mjs @@ -1,3 +1,6 @@ +import { join } from "path" +import { createWriteStream, existsSync, mkdirSync, writeFileSync } from "fs"; + import { downloadFile, listFiles } from "@huggingface/hub"; const Modality = { @@ -24,9 +27,13 @@ function normalizeModalities(modalities) { }); } -async function getCheckpoint( - modelId, modalities, token = null, format = '.onnx', -) { +async function ensureDirectoryExists(dirPath) { + if (!existsSync(dirPath)) { + mkdirSync(dirPath, { recursive: true }); + } +} + +async function getCheckpoint(modelId, modalities, token = null, format = '.onnx', saveDir = './models') { modalities = normalizeModalities(modalities); const configNames = ['config.json']; @@ -40,24 +47,53 @@ async function getCheckpoint( let configPath = null; let tokenizerPath = null; const modalityPaths = {}; + const modelSaveDir = join(saveDir, modelId); + + await ensureDirectoryExists(modelSaveDir); const fileIterator = listFiles({ repo, recursive: true, credentials }); for await (const file of fileIterator) { const fileName = file.path.split('/').pop(); if (fileName && allowedPatterns.includes(fileName)) { const filePath = file.path; + const savePath = join(modelSaveDir, fileName); + if (configNames.includes(fileName)) { - configPath = filePath; + configPath = savePath; } else if (tokenizerNames.includes(fileName)) { - tokenizerPath = filePath; + tokenizerPath = savePath; } else { const modalityName = fileName.split('.')[0]; - modalityPaths[modalityName] = filePath; + modalityPaths[modalityName] = savePath; } const response = await downloadFile({ repo, path: filePath, credentials }); if (response) { - console.log(`Downloaded ${fileName} successfully to ${response.json()}`); + // HuggingFace might be defining the `env.localModelPath` variable + // to store the downloaded files in a local directory. + // Let's check if the file is there. 
+ // const localPath = join(env.localModelPath, repo, filePath); + // if (existsSync(localPath)) { + // console.log(`File already exists locally at ${localPath}`); + // } + + if (response.body && response.body.pipe) { + const fileStream = createWriteStream(savePath); + response.body.pipe(fileStream); + await new Promise((resolve, reject) => { + fileStream.on('finish', resolve); + fileStream.on('error', reject); + }); + } else if (response.arrayBuffer) { + // Handle non-streamable response for environments like Node.js + const buffer = await response.arrayBuffer(); + writeFileSync(savePath, Buffer.from(buffer)); + } else { + console.error('Unexpected response type'); + } + console.log(`Downloaded ${fileName} successfully to ${savePath}`); + } else { + console.log('No response received for the file download request.'); } } } diff --git a/package.json b/package.json index a25922f..9be073f 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,7 @@ "dependencies": { "@huggingface/hub": "^0.14.8", "@xenova/transformers": "^2.17.0", + "onnxruntime-node": "^1.17.0", "onnxruntime-web": "^1.17.3" }, "devDependencies": { @@ -28,4 +29,4 @@ ], "author": "Ash Vardanian, Unum Cloud", "license": "Apache-2.0" -} \ No newline at end of file +} From 19c0c30718b53e81267757e2a3bbf9fe8e7dec9c Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 19 Apr 2024 02:38:57 +0000 Subject: [PATCH 13/40] Fix: Mismatch in the input types for text --- javascript/encoders.mjs | 32 +++++----- python/scripts/export_encoders.ipynb | 90 +++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 16 deletions(-) diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index bd04ee6..c55d2d1 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -58,34 +58,36 @@ class TextEncoder { } async forward(inputs) { - // Helper function to convert any BigInt64Array or other numeric arrays to Int32Array - function convertToCompatibleInt32(data) { + // Helper function to convert BigInt64Array to Int32Array or validate Int32Array + function ensureInt32Array(data) { if (data instanceof Int32Array) { - return data; // Already the correct type - } else if (data instanceof BigInt64Array) { - // Convert BigInt64Array to Int32Array, ensuring values are within range - return new Int32Array(data.map(bigInt => { + return data; // Use as is if already Int32Array + } + if (data instanceof BigInt64Array) { + // Convert BigInt64Array to Int32Array, ensuring all values are in range + return new Int32Array(Array.from(data).map(bigInt => { if (bigInt > 2147483647n || bigInt < -2147483648n) { throw new Error("Value out of range for Int32."); } - return Number(bigInt); // Convert BigInt to Number and store in Int32Array + return Number(bigInt); // Convert BigInt to Number })); - } else if (Array.isArray(data) || data instanceof Uint32Array) { - // Convert other numeric array types to Int32Array - return new Int32Array(data.map(Number)); + } + // Additional case: handle conversion from generic Arrays or other typed arrays to Int32Array + if (Array.isArray(data) || data instanceof Uint32Array || data instanceof Uint8Array) { + return new Int32Array(data); // Convert directly } throw new Error("Unsupported data type for tensor conversion."); } - // Prepare the tensor data using the helper function - const inputIDsData = convertToCompatibleInt32(inputs.input_ids.data); - const attentionMaskData = convertToCompatibleInt32(inputs.attention_mask.data); + // Prepare tensor data + const 
inputIDsData = ensureInt32Array(inputs.input_ids.data); + const attentionMaskData = ensureInt32Array(inputs.attention_mask.data); - // Create ONNX Tensors as int32 + // Create ONNX Tensors as 'int32' const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims); const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims); - // Run the model inference + // Run model inference return this.session.run({ input_ids: inputIDs, attention_mask: attentionMask, diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb index c7a94e0..a8d2ac3 100644 --- a/python/scripts/export_encoders.ipynb +++ b/python/scripts/export_encoders.ipynb @@ -493,6 +493,87 @@ "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's make sure that all the text inputs are integers of identical type - `int32`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "import os\n", + "from onnx import helper\n", + "\n", + "# Load the ONNX model\n", + "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", + "\n", + "# Get the module's graph\n", + "graph = module.graph\n", + "\n", + "# Iterate through the inputs and update the data type of `input_ids`\n", + "for input_tensor in graph.input:\n", + " # Check if this is the tensor we want to change\n", + " if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n", + " # Get the tensor type information\n", + " tensor_type = input_tensor.type.tensor_type\n", + " # Set the element type to INT32 (int32's enum value in onnx is 6)\n", + " tensor_type.elem_type = onnx.TensorProto.INT32\n", + "\n", + "# Optionally, check that the module is still valid\n", + "onnx.checker.check_model(module)\n", + "\n", + "# Save the modified module\n", + "onnx.save(module, module_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the following function to print and validate the input and output types of the ONNX model files." 
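Before that, a minimal inference-side sketch of the matching cast: tokenizers typically emit `int64`, so the arrays are converted to `int32` right before `session.run`. The token ids below are made up, the model path is an assumption, and the output unpacking assumes the `features`/`embeddings` outputs used elsewhere in this series.

```python
# Minimal sketch: cast int64 tokenizer outputs to int32 before feeding
# them to the converted text encoder.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("text_encoder.onnx")

# Stand-in token ids and attention mask, as a tokenizer would produce them.
input_ids = np.array([[2, 1037, 2235, 2417, 23462, 3]], dtype=np.int64)
attention_mask = np.ones_like(input_ids)

features, embeddings = session.run(None, {
    "input_ids": input_ids.astype(np.int32),
    "attention_mask": attention_mask.astype(np.int32),
})
print(features.shape, embeddings.shape)
```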
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_model_inputs_and_outputs(onnx_model_path):\n", + " model = onnx.load(onnx_model_path)\n", + "\n", + " # Get the model's graph\n", + " graph = model.graph\n", + "\n", + " # Print input information\n", + " print(\"Model Inputs:\")\n", + " for input_tensor in graph.input:\n", + " tensor_type = input_tensor.type.tensor_type\n", + " # Get the element type (data type)\n", + " elem_type = tensor_type.elem_type\n", + " # Convert numeric type to readable format\n", + " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n", + " # Get tensor shape\n", + " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n", + " print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n", + "\n", + " # Print output information similarly if needed\n", + " print(\"\\nModel Outputs:\")\n", + " for output_tensor in graph.output:\n", + " tensor_type = output_tensor.type.tensor_type\n", + " elem_type = tensor_type.elem_type\n", + " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n", + " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n", + " print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -551,6 +632,13 @@ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n", "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -569,7 +657,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.11.5" } }, "nbformat": 4, From 7ac33bd836627ae0b9788b2ffe241e826e9ddd32 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 19 Apr 2024 04:59:05 +0000 Subject: [PATCH 14/40] Fix: Passing tests in JavaScript --- javascript/encoders.mjs | 161 +++++++++++++++++++++++++++++++++--- javascript/encoders_test.js | 33 +++++--- 2 files changed, 174 insertions(+), 20 deletions(-) diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index c55d2d1..ae9dbec 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -1,8 +1,9 @@ import { readFileSync } from 'fs'; import { InferenceSession, Tensor } from 'onnxruntime-node'; -import { getCheckpoint, Modality } from "./hub.mjs"; import { PreTrainedTokenizer } from '@xenova/transformers'; +import sharp from 'sharp'; +import { getCheckpoint, Modality } from "./hub.mjs"; class TextProcessor { @@ -16,12 +17,16 @@ class TextProcessor { } async init() { - const config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' })); - this.maxSeqLen = config.text_encoder.max_position_embeddings; - this.padTokenIdx = config.text_encoder.padding_idx; + var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' })); + if (config.text_encoder !== undefined) { + config = config.text_encoder; + } + + this.maxSeqLen = config.max_position_embeddings; + this.padTokenIdx = config.padding_idx; const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' })); - this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config.text_encoder); + this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config); this.tokenizer.model_max_length = this.maxSeqLen; 
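On the Python side, the same padding and truncation setup can be expressed with the `tokenizers` package; a small sketch, where the max length and padding index are stand-ins for the values read from `config.json`.

```python
# Sketch: configure padding and truncation to match the model config.
from tokenizers import Tokenizer

max_seq_len = 64     # stand-in for config["text_encoder"]["max_position_embeddings"]
pad_token_idx = 0    # stand-in for config["text_encoder"]["padding_idx"]

tokenizer = Tokenizer.from_file("tokenizer.json")
tokenizer.enable_padding(pad_id=pad_token_idx, length=max_seq_len)
tokenizer.enable_truncation(max_length=max_seq_len)

encoding = tokenizer.encode("Hello, world!")
print(encoding.ids[:8], encoding.attention_mask[:8])
```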
this.tokenizer.pad_token_id = this.padTokenIdx; } @@ -45,11 +50,8 @@ class TextProcessor { class TextEncoder { - constructor(configPath, modelPath, tokenizerPath) { - this.configPath = configPath; + constructor(modelPath, processor = null) { this.modelPath = modelPath; - this.tokenizerPath = tokenizerPath; - this.session = null; } @@ -57,7 +59,18 @@ class TextEncoder { this.session = await InferenceSession.create(this.modelPath); } + async dispose() { + if (this.session) { + await this.session.release(); + this.session = null; + } + } + async forward(inputs) { + if (!this.session) { + throw new Error("Session is not initialized."); + } + // Helper function to convert BigInt64Array to Int32Array or validate Int32Array function ensureInt32Array(data) { if (data instanceof Int32Array) { @@ -96,4 +109,132 @@ class TextEncoder { } -export { TextProcessor, TextEncoder }; + +class ImageProcessor { + constructor(configPath) { + this.configPath = configPath; + } + + async init() { + var config = JSON.parse(readFileSync(this.configPath, 'utf8')); + if (config.image_encoder !== undefined) { + config = config.image_encoder; + } + + this.imageSize = config.image_size; + this.normalizationMeans = config.normalization_means; + this.normalizationDeviations = config.normalization_deviations; + + this.imageMean = new Float32Array(this.normalizationMeans).fill(0); + this.imageStd = new Float32Array(this.normalizationDeviations).fill(0); + } + async process(images) { + const processSingle = async (image) => { + let img = sharp(image); + const metadata = await img.metadata(); + const scale = this.imageSize / Math.min(metadata.width, metadata.height); + const scaledWidth = parseInt(metadata.width * scale); + const scaledHeight = parseInt(metadata.height * scale); + img = img.resize({ + width: scaledWidth, + height: scaledHeight, + fit: sharp.fit.cover, + position: sharp.strategy.entropy + }).extract({ + left: Math.max(0, (scaledWidth - this.imageSize) / 2), + top: Math.max(0, (scaledHeight - this.imageSize) / 2), + width: this.imageSize, + height: this.imageSize + }).removeAlpha(); + + let buffer = await img.raw().toBuffer(); + let array = new Float32Array(buffer); + + return array.map((value, index) => { + const channel = index % 3; + return (value / 255.0 - this.normalizationMeans[channel]) / this.normalizationDeviations[channel]; + }); + }; + + if (Array.isArray(images)) { + return Promise.all(images.map(img => processSingle(img))); + } else { + return [await processSingle(images)]; + } + } +} + +class ImageEncoder { + constructor(modelPath, processor) { + this.modelPath = modelPath; + this.imageSize = processor.imageSize; + } + + async init() { + this.session = await InferenceSession.create(this.modelPath); + } + + async dispose() { + if (this.session) { + await this.session.release(); + this.session = null; + } + } + + async forward(inputs) { + if (!this.session) { + throw new Error("Session is not initialized."); + } + + // Helper function to ensure data is a Float32Array. + const ensureFloat32Array = (data) => { + if (!(data instanceof Float32Array)) { + throw new Error("Unsupported data type for tensor conversion."); + } + return data; + }; + + // Helper function to concatenate multiple Float32Arrays into a single Float32Array. 
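For reference, a Pillow/NumPy sketch of the same preprocessing steps (shortest-side resize, center crop, per-channel normalization, CHW layout); the image size and normalization constants below are placeholders for the values in the `image_encoder` section of `config.json`.

```python
# Sketch: preprocess an image into a normalized NCHW float32 batch.
import numpy as np
from PIL import Image

image_size = 224                                            # placeholder for config["image_size"]
means = np.array([0.485, 0.456, 0.406], dtype=np.float32)   # placeholder normalization_means
stds = np.array([0.229, 0.224, 0.225], dtype=np.float32)    # placeholder normalization_deviations

def preprocess(path):
    img = Image.open(path).convert("RGB")
    scale = image_size / min(img.size)                      # resize the shorter side
    img = img.resize((round(img.width * scale), round(img.height * scale)))
    left = (img.width - image_size) // 2                    # center crop a square
    top = (img.height - image_size) // 2
    img = img.crop((left, top, left + image_size, top + image_size))
    array = np.asarray(img, dtype=np.float32) / 255.0       # HWC in [0, 1]
    array = (array - means) / stds                          # per-channel normalization
    return array.transpose(2, 0, 1)                         # CHW

batch = np.stack([preprocess("assets/unum.png")])           # NCHW batch of one
print(batch.shape)
```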
+ const concatFloat32Arrays = (arrays) => { + const totalLength = arrays.reduce((acc, val) => acc + val.length, 0); + const result = new Float32Array(totalLength); + let offset = 0; + for (let arr of arrays) { + result.set(arr, offset); + offset += arr.length; + } + return result; + }; + + let inputData; + let dims; + + if (Array.isArray(inputs)) { + // Assuming each input in the array is a Float32Array representing an image already processed to a fixed size. + const arrays = inputs.map(ensureFloat32Array); + inputData = concatFloat32Arrays(arrays); + const numImages = arrays.length; + const numChannels = 3; + const height = this.imageSize; + const width = this.imageSize; + dims = [numImages, numChannels, height, width]; + } else { + // Single image input, which is already a Float32Array. + inputData = ensureFloat32Array(inputs); + const numChannels = 3; + const height = this.imageSize; + const width = this.imageSize; + dims = [1, numChannels, height, width]; + } + + // Create ONNX Tensor + const inputTensor = new Tensor('float32', inputData, dims); + + // Run model inference + return this.session.run({ + input: inputTensor, + }); + } +} + +export { TextProcessor, TextEncoder, ImageProcessor, ImageEncoder }; diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js index 1a09cac..fba11f4 100644 --- a/javascript/encoders_test.js +++ b/javascript/encoders_test.js @@ -1,7 +1,7 @@ import { existsSync } from 'fs'; import { getCheckpoint, Modality } from "./hub.mjs"; -import { TextProcessor, TextEncoder } from "./encoders.mjs"; +import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs"; function assert(condition, message) { if (!condition) { @@ -41,8 +41,10 @@ async function testGetCheckpoint() { } } -async function testTextEncoder() { - console.log("Test TextEncoder: Start"); +async function testEncoders() { + console.log("Test testEncoders: Start"); + let textEncoder = null; + let imageEncoder = null; try { const modelId = 'unum-cloud/uform3-image-text-english-small'; @@ -63,18 +65,29 @@ async function testTextEncoder() { const textProcessor = new TextProcessor(configPath, tokenizerPath); await textProcessor.init(); const processedTexts = await textProcessor.process("Hello, world!"); - console.log(processedTexts); - const textEncoder = new TextEncoder(configPath, modalityPaths.text_encoder, tokenizerPath); + textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); await textEncoder.init(); - const output = await textEncoder.forward(processedTexts); - console.log(output); + const textOutput = await textEncoder.forward(processedTexts); + console.log(textOutput.embeddings.dims); - console.log("Test getCheckpoint: Success"); + const imageProcessor = new ImageProcessor(configPath); + await imageProcessor.init(); + const processedImages = await imageProcessor.process("assets/unum.png"); + + imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); + await imageEncoder.init(); + const imageOutput = await imageEncoder.forward(processedImages); + console.log(imageOutput.embeddings.dims); + + console.log("Test testEncoders: Success"); } catch (error) { - console.error("Test getCheckpoint: Failed", error); + console.error("Test testEncoders: Failed", error); + } finally { + await textEncoder.dispose(); + await imageEncoder.dispose(); } } testGetCheckpoint(); -testTextEncoder(); +testEncoders(); From 4f1568fb799b3d150e459a633ad8705efd0ca089 Mon Sep 17 00:00:00 2001 From: Ash Vardanian 
<1983160+ashvardanian@users.noreply.github.com> Date: Sat, 20 Apr 2024 00:47:58 +0000 Subject: [PATCH 15/40] Fix: Rename image inputs --- python/scripts/export_encoders.ipynb | 20 +++++++------------- swift/Encoders.swift | 4 ++-- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb index a8d2ac3..029e60a 100644 --- a/python/scripts/export_encoders.ipynb +++ b/python/scripts/export_encoders.ipynb @@ -8,7 +8,7 @@ "\n", "Depending on the backend, we prefer different qunatization schemes.\n", "\n", - "- For ONNX we use `int8` quantization.\n", + "- For ONNX we use `uint8` quantization.\n", "- For PyTorch we use `bfloat16` quantization.\n", "- For CoreML we use `float32` representation." ] @@ -19,6 +19,7 @@ "metadata": {}, "outputs": [], "source": [ + "!pip uninstall -y uform\n", "!pip install --upgrade \"uform[torch]\" coremltools" ] }, @@ -42,7 +43,7 @@ "import uform\n", "from PIL import Image\n", "\n", - "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n", + "model, processor = uform.get_model('unum-cloud/' + model_name)\n", "text = 'a small red panda in a zoo'\n", "image = Image.open('../../assets/unum.png')\n", "\n", @@ -122,7 +123,7 @@ "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n", "\n", "```python\n", - " image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n", + " image_input = ct.TensorType(name=\"images\", shape=image_data.shape)\n", " text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n", " text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n", "```\n", @@ -155,7 +156,7 @@ "metadata": {}, "outputs": [], "source": [ - "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n", + "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n", "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n", "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n", "text_features = ct.TensorType(name=\"features\")\n", @@ -403,10 +404,10 @@ " export_params=True,\n", " opset_version=15,\n", " do_constant_folding=True,\n", - " input_names = ['input'], \n", + " input_names = ['images'], \n", " output_names = ['features', 'embeddings'],\n", " dynamic_axes={\n", - " 'input' : {0 : 'batch_size'},\n", + " 'images' : {0 : 'batch_size'},\n", " 'features' : {0 : 'batch_size'},\n", " 'embeddings' : {0 : 'batch_size'}})" ] @@ -632,13 +633,6 @@ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n", "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/swift/Encoders.swift b/swift/Encoders.swift index 44c6e71..3582e91 100644 --- a/swift/Encoders.swift +++ b/swift/Encoders.swift @@ -402,14 +402,14 @@ class ImageInput: MLFeatureProvider { } var featureNames: Set { - return Set(["input"]) + return Set(["images"]) } // The model 
expects the input IDs to be an array of integers // of length `sequenceLength`, padded with `paddingID` if necessary func featureValue(for featureName: String) -> MLFeatureValue? { switch featureName { - case "input": + case "images": return precomputedFeature default: return nil From cccfc620d7d143b642a84b96326d5db49f679ebf Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 20 Apr 2024 01:04:21 +0000 Subject: [PATCH 16/40] Improve: Separate encoders & processors --- javascript/encoders.mjs | 20 +-- python/scripts/test_encoders.py | 89 +++++++----- python/uform/__init__.py | 62 +++++--- python/uform/numpy_processors.py | 18 ++- python/uform/onnx_encoders.py | 123 ++-------------- python/uform/torch_decoders.py | 18 ++- python/uform/torch_encoders.py | 241 +++++-------------------------- python/uform/torch_processors.py | 22 ++- 8 files changed, 194 insertions(+), 399 deletions(-) diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index ae9dbec..7a287cc 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -181,7 +181,7 @@ class ImageEncoder { } } - async forward(inputs) { + async forward(images) { if (!this.session) { throw new Error("Session is not initialized."); } @@ -206,21 +206,21 @@ class ImageEncoder { return result; }; - let inputData; + let imagesData; let dims; - if (Array.isArray(inputs)) { - // Assuming each input in the array is a Float32Array representing an image already processed to a fixed size. - const arrays = inputs.map(ensureFloat32Array); - inputData = concatFloat32Arrays(arrays); + if (Array.isArray(images)) { + // Assuming each images in the array is a Float32Array representing an image already processed to a fixed size. + const arrays = images.map(ensureFloat32Array); + imagesData = concatFloat32Arrays(arrays); const numImages = arrays.length; const numChannels = 3; const height = this.imageSize; const width = this.imageSize; dims = [numImages, numChannels, height, width]; } else { - // Single image input, which is already a Float32Array. - inputData = ensureFloat32Array(inputs); + // Single image images, which is already a Float32Array. 
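A preprocessed NCHW batch like the one sketched earlier can be fed straight to the exported ONNX image encoder; a short sketch, where the `images` input name and the `features`/`embeddings` outputs follow the export cells in this series, and the file path is an assumption.

```python
# Sketch: run the exported image encoder on an NCHW float32 batch.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("image_encoder.onnx")

batch = np.random.rand(2, 3, 224, 224).astype(np.float32)   # stand-in for preprocessed images
features, embeddings = session.run(None, {"images": batch})
print(features.shape, embeddings.shape)
```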
+ imagesData = ensureFloat32Array(images); const numChannels = 3; const height = this.imageSize; const width = this.imageSize; @@ -228,11 +228,11 @@ class ImageEncoder { } // Create ONNX Tensor - const inputTensor = new Tensor('float32', inputData, dims); + const imagesTensor = new Tensor('float32', imagesData, dims); // Run model inference return this.session.run({ - input: inputTensor, + images: imagesTensor, }); } } diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index a58544d..d26e4f2 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -7,7 +7,7 @@ import numpy as np from PIL import Image -import uform +from uform import Modality, get_model, get_model_onnx # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed try: @@ -27,12 +27,16 @@ torch_models = [ "unum-cloud/uform3-image-text-english-small", - "unum-cloud/uform-vl-english", - "unum-cloud/uform-vl-multilingual-v2", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", ] onnx_models = [ "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", ] # Let's check if the HuggingFace Hub API token is set in the environment variable. @@ -113,34 +117,29 @@ def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embed @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) def test_torch_one_embedding(model_name: str): - model, processor = uform.get_model(model_name, token=token) + processors, models = get_model(model_name, token=token) + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + text = "a small red panda in a zoo" image_path = "assets/unum.png" image = Image.open(image_path) - image_data = processor.preprocess_image(image) - text_data = processor.preprocess_text(text) + image_data = processor_image(image) + text_data = processor_text(text) - image_features, image_embedding = model.encode_image(image_data, return_features=True) - text_features, text_embedding = model.encode_text(text_data, return_features=True) + image_features, image_embedding = model_image.forward(image_data, return_features=True) + text_features, text_embedding = model_text.forward(text_data, return_features=True) assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" - # Test reranking - score, joint_embedding = model.encode_multimodal( - image_features=image_features, - text_features=text_features, - attention_mask=text_data["attention_mask"], - return_scores=True, - ) - assert score.shape[0] == 1, "Matching score batch size is not 1" - assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" - # Test if the model outputs actually make sense cross_references_image_and_text_embeddings( - lambda text: model.encode_text(processor.preprocess_text(text)), - lambda image: model.encode_image(processor.preprocess_image(image)), + lambda text: model_text(processor_text(text)), + lambda image: model_image(processor_image(image)), ) @@ -148,16 +147,22 @@ def test_torch_one_embedding(model_name: 
str): @pytest.mark.parametrize("model_name", torch_models) @pytest.mark.parametrize("batch_size", [1, 2]) def test_torch_many_embeddings(model_name: str, batch_size: int): - model, processor = uform.get_model(model_name, token=token) + + processors, models = get_model(model_name, token=token) + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size images = [Image.open(path) for path in image_paths] - image_data = processor.preprocess_image(images) - text_data = processor.preprocess_text(texts) + image_data = processor_image(images) + text_data = processor_text(texts) - image_embeddings = model.encode_image(image_data, return_features=False) - text_embeddings = model.encode_text(text_data, return_features=False) + image_embeddings = model_image.forward(image_data, return_features=False) + text_embeddings = model_text.forward(text_data, return_features=False) assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" @@ -172,24 +177,29 @@ def test_onnx_one_embedding(model_name: str, device: str): try: - model, processor = uform.get_model_onnx(model_name, token=token, device=device) + processors, models = get_model_onnx(model_name, token=token, device=device) + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + text = "a small red panda in a zoo" image_path = "assets/unum.png" image = Image.open(image_path) - image_data = processor.preprocess_image(image) - text_data = processor.preprocess_text(text) + image_data = processor_image(image) + text_data = processor_text(text) - image_features, image_embedding = model.encode_image(image_data, return_features=True) - text_features, text_embedding = model.encode_text(text_data, return_features=True) + image_features, image_embedding = model_image(image_data) + text_features, text_embedding = model_text(text_data) assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" # Test if the model outputs actually make sense cross_references_image_and_text_embeddings( - lambda text: model.encode_text(processor.preprocess_text(text)), - lambda image: model.encode_image(processor.preprocess_image(image)), + lambda text: model_text(processor_text(text)), + lambda image: model_image(processor_image(image)), ) except ExecutionProviderError as e: @@ -206,16 +216,21 @@ def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): try: - model, processor = uform.get_model_onnx(model_name, token=token, device=device) + processors, models = get_model_onnx(model_name, token=token, device=device) + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + texts = ["a small red panda in a zoo"] * batch_size image_paths = ["assets/unum.png"] * batch_size images = [Image.open(path) for path in image_paths] - image_data = processor.preprocess_image(images) - text_data = processor.preprocess_text(texts) + image_data = processor_image(images) + 
text_data = processor_text(texts) - image_embeddings = model.encode_image(image_data, return_features=False) - text_embeddings = model.encode_text(text_data, return_features=False) + image_embeddings = model_image(image_data, return_features=False) + text_embeddings = model_text(text_data, return_features=False) assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 74d5ee9..841440f 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,6 +1,6 @@ from json import load from os.path import join, exists -from typing import Dict, Optional, Tuple, Literal +from typing import Dict, Optional, Tuple, Literal, Union, Callable from enum import Enum from huggingface_hub import snapshot_download @@ -88,20 +88,30 @@ def get_model( model_name: str, *, token: Optional[str] = None, - modalities: Optional[Tuple[str]] = None, -): - from uform.torch_encoders import TextImageEncoder - from uform.torch_processors import TorchProcessor + modalities: Optional[Tuple[Union[str, Modality]]] = None, +) -> Tuple[Dict[Modality, Callable], Dict]: + from uform.torch_encoders import TextEncoder, ImageEncoder + from uform.torch_processors import TextProcessor, ImageProcessor - config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".pt") - modality_paths = ( - {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths - ) + modalities = normalize_modalities(modalities) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt") + + result_processors = {} + result_models = {} - model = TextImageEncoder(config_path, modality_paths) - processor = TorchProcessor(config_path, tokenizer_path) + if Modality.TEXT_ENCODER in modalities: + processor = TextProcessor(config_path, tokenizer_path) + encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER)).eval() + result_processors[Modality.TEXT_ENCODER] = processor + result_models[Modality.TEXT_ENCODER] = encoder - return model.eval(), processor + if Modality.IMAGE_ENCODER in modalities: + processor = ImageProcessor(config_path) + encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER)).eval() + result_processors[Modality.IMAGE_ENCODER] = processor + result_models[Modality.IMAGE_ENCODER] = encoder + + return result_processors, result_models def get_model_onnx( @@ -111,15 +121,25 @@ def get_model_onnx( token: Optional[str] = None, modalities: Optional[Tuple[str]] = None, ): - from uform.onnx_encoders import TextImageEncoder - from uform.numpy_processors import NumPyProcessor + from uform.onnx_encoders import TextEncoder, ImageEncoder + from uform.numpy_processors import TextProcessor, ImageProcessor - config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, token, modalities, format=".onnx") - modality_paths = ( - {k.value: v for k, v in modality_paths.items()} if isinstance(modality_paths, dict) else modality_paths - ) + modalities = normalize_modalities(modalities) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx") + + result_processors = {} + result_models = {} + + if Modality.TEXT_ENCODER in modalities: + processor = TextProcessor(config_path, tokenizer_path) + encoder = 
TextEncoder(modality_paths.get(Modality.TEXT_ENCODER), device=device) + result_processors[Modality.TEXT_ENCODER] = processor + result_models[Modality.TEXT_ENCODER] = encoder - model = TextImageEncoder(config_path, modality_paths, device=device) - processor = NumPyProcessor(config_path, tokenizer_path) + if Modality.IMAGE_ENCODER in modalities: + processor = ImageProcessor(config_path) + encoder = ImageEncoder(modality_paths.get(Modality.IMAGE_ENCODER), device=device) + result_processors[Modality.IMAGE_ENCODER] = processor + result_models[Modality.IMAGE_ENCODER] = encoder - return model, processor + return result_processors, result_models diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index afda329..a5faca2 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -15,10 +15,13 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ config = json.load(open(config_path, "r")) - self._max_seq_len = config["text_encoder"]["max_position_embeddings"] + if "text_encoder" in config: + config = config["text_encoder"] + + self._max_seq_len = config["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) self._tokenizer.no_padding() - self._pad_token_idx = config["text_encoder"]["padding_idx"] + self._pad_token_idx = config["padding_idx"] def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]: """Transforms one or more strings into dictionary with tokenized strings and attention masks. @@ -50,7 +53,7 @@ def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]: class ImageProcessor: - def __init__(self, config_path: PathLike, tokenizer_path: PathLike): + def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None): """ :param config: model config :param tokenizer_path: path to tokenizer file @@ -58,9 +61,12 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ config = json.load(open(config_path, "r")) - self._image_size = config["image_encoder"]["image_size"] - self._normalization_means = config["image_encoder"]["normalization_means"] - self._normalization_deviations = config["image_encoder"]["normalization_deviations"] + if "image_encoder" in config: + config = config["image_encoder"] + + self._image_size = config["image_size"] + self._normalization_means = config["normalization_means"] + self._normalization_deviations = config["normalization_deviations"] assert isinstance(self._image_size, int) and self._image_size > 0 assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index 95a0f73..9f63fa4 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -59,7 +59,12 @@ def available_providers(device: Optional[str]) -> Tuple[str, ...]: class ImageEncoder: - def __init__(self, model_path: str, device: str): + def __init__( + self, + model_path: str, + *, + device: Literal["cpu", "cuda"] = "cpu", + ): """ :param model_path: Path to onnx model :param device: Device name, either cpu or gpu @@ -75,14 +80,18 @@ def __init__(self, model_path: str, device: str): ) def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: - return self.session.run(None, {"input": images}) + return self.session.run(None, {"images": images}) class TextEncoder: - def __init__(self, text_encoder_path: str, device: str): + def __init__( + self, + model_path: str, + *, + device: Literal["cpu", "cuda"] = 
"cpu", + ): """ :param text_encoder_path: Path to onnx of text encoder - :param reranker_path: Path to onnx of reranker :param device: Device name, either cpu or gpu """ @@ -90,114 +99,10 @@ def __init__(self, text_encoder_path: str, device: str): session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL self.text_encoder_session = ort.InferenceSession( - text_encoder_path, + model_path, sess_options=session_options, providers=available_providers(device), ) def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]: return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask}) - - -class TextImageEncoder: - def __init__( - self, - config_path: PathLike, - modality_paths: Union[Dict[str, PathLike], PathLike] = None, - *, - device: Literal["cpu", "cuda"] = "cpu", - ): - """Initializes the model with the configuration and pre-trained weights. - - :param config_path: Path to the JSON model configuration file - :param modality_paths: Dictionary with paths to different modalities, - or a single path to the model checkpoint - """ - self.device = device - - config = json.load(open(config_path, "r")) - self._embedding_dim = config["text_encoder"]["embedding_dim"] - self._text_encoder_dim = config["text_encoder"]["dim"] - self._image_encoder_dim = config["image_encoder"]["dim"] - - text_encoder_path = modality_paths.get("text_encoder", None) - image_encoder_path = modality_paths.get("image_encoder", None) - self.text_encoder = TextEncoder(text_encoder_path, device) if text_encoder_path else None - self.image_encoder = ImageEncoder(image_encoder_path, device) if image_encoder_path else None - - def encode_image( - self, - images: ndarray, - return_features: bool = False, - ) -> Union[ndarray, Tuple[ndarray, ndarray]]: - """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings. - - :param images: Preprocessed image - :param return_features: Whether to return images features or return only embeddings - """ - - features, embeddings = self.image_encoder(images) - - if return_features: - return features, embeddings - - return embeddings - - def encode_text( - self, - texts: Dict[str, ndarray], - return_features: bool = False, - ) -> Union[ndarray, Tuple[ndarray, ndarray]]: - """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings. 
- - :param texts: Dictionary with tokenized texts and attention masks - :param return_features: Whether to return texts features or return only embeddings - """ - - features, embeddings = self.text_encoder(**texts) - - if return_features: - return features, embeddings - - return embeddings - - def forward( - self, - images: ndarray, - texts: Dict[str, ndarray], - ) -> Union[ndarray, ndarray]: - """Inference forward method - - :param images: Preprocessed images - :param texts: Preprocessed texts - :return: embeddings for images and texts - """ - _, image_embeddings = self.image_encoder(images) - _, text_embeddings = self.text_encoder(texts) - return image_embeddings, text_embeddings - - @property - def text_features_dim(self) -> int: - """Dimensionality of the text encoder features.""" - - return self._text_encoder_dim - - @property - def image_features_dim(self) -> int: - """Dimensionality of the image encoder features.""" - - return self._image_encoder_dim - - @property - def embedding_dim(self) -> int: - """Dimensionality of shared space embedding.""" - - return self._embedding_dim - - @property - def multimodal_embedding_dim(self) -> int: - """Dimensionality of multimodal joint embedding.""" - return self._text_encoder_dim - - -VLM_ONNX = TextImageEncoder # legacy diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py index db60d63..475f5b0 100644 --- a/python/uform/torch_decoders.py +++ b/python/uform/torch_decoders.py @@ -153,7 +153,7 @@ def __init__(self, config: VLMConfig): self.config.image_encoder_pooling, ) - # replace models' layerscales because `transformers` automatically renames keys in state_dict + # replace models' layerscales because `transformers` automatically renames keys in `state_dict` for i in range(len(self.image_encoder.blocks)): self.image_encoder.blocks[i].ls1 = LayerScale( self.image_encoder.blocks[i].ls1.dim, @@ -218,6 +218,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[dict, Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -403,7 +404,10 @@ def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs): ) encoding = BatchEncoding( - data={"input_ids": input_ids, "attention_mask": attention_mask}, + data={ + "input_ids": input_ids, + "attention_mask": attention_mask, + }, ) if images is not None: @@ -449,7 +453,15 @@ def from_pretrained( revision: str = "main", **kwargs, ): - config = AutoConfig.from_pretrained(pretrained_model_name_or_path) + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + revision=revision, + token=token, + **kwargs, + ) return cls(config) diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index f122606..8ac7c36 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -1,12 +1,15 @@ +from __future__ import annotations + from dataclasses import dataclass from os import PathLike -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union, Callable import json import torch import torch.nn as nn import torch.nn.functional as F from torch import Tensor +from PIL.Image import Image @dataclass(eq=False) @@ -220,30 +223,9 @@ def 
forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: return x - def forward_multimodal( - self, - x: Tensor, - attn_mask: Tensor, - context: Tensor, - ) -> Tensor: - context = self.context_projection(context) - expanded_attn_mask = self.get_attention_mask(attn_mask, x.dtype) - for block in self.blocks: - if block.cross_attention: - x = block(x, expanded_attn_mask, context) - - return self.pool_features(x, attn_mask) - def forward_embedding(self, x: Tensor, attn_mask: Tensor) -> Tensor: return self.embedding_projection(self.pool_features(x, attn_mask)) - def forward_matching(self, x: Tensor) -> Tensor: - logits = self.matching_head(x) - if self.head_one_neuron: - return torch.sigmoid(logits)[:, 0] - - return F.softmax(logits, dim=1)[:, 1] - def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: if self.pooling == "cls": return x[:, 0] @@ -291,6 +273,22 @@ def forward( return features, embeddings return embeddings + @staticmethod + def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> TextEncoder: + if isinstance(config, (PathLike, str)): + config = json.load(open(config, "r")) + if "text_encoder" in config: + config = config["text_encoder"] + + # We must strip all the non-member attributes before initializing the classes. + text_fields = TextEncoder.__dataclass_fields__ + config = {k: v for k, v in config.items() if k in text_fields} + + state = torch.load(model_path) + encoder = TextEncoder(**config) + encoder.load_state_dict(state) + return encoder + @dataclass(eq=False) class ImageEncoder(nn.Module): @@ -322,19 +320,16 @@ def __post_init__(self): self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False) self.return_features = False - def forward_features(self, x: Tensor) -> Tensor: + def forward_features(self, x: Union[Tensor, dict]) -> Tensor: x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1) x = x + self.pos_embed - special_tokens = [self.cls_token.expand(x.shape[0], -1, -1)] if self.num_reg_tokens > 0: special_tokens.append(self.reg_token.expand(x.shape[0], -1, -1)) x = torch.cat(special_tokens + [x], dim=1) - x = self.blocks(x) - return self.norm(x) def forward_embedding(self, x: Tensor) -> Tensor: @@ -346,6 +341,8 @@ def forward_embedding(self, x: Tensor) -> Tensor: return self.embedding_projection(x) def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: + if isinstance(x, dict): + x = x["images"] features = self.forward_features(x) embeddings = self.forward_embedding(features) return_features = return_features if return_features is not None else self.return_features @@ -353,186 +350,18 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return features, embeddings return embeddings + @staticmethod + def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> ImageEncoder: + if isinstance(config, (PathLike, str)): + config = json.load(open(config, "r")) + if "image_encoder" in config: + config = config["image_encoder"] -class TextImageEncoder(nn.Module): - """ - Vision-Language Model for Multimodal embeddings. - """ - - def __init__( - self, - config_path: PathLike, - modality_paths: Union[Dict[str, PathLike], PathLike] = None, - ): - """Initializes the model with the configuration and pre-trained weights. 
- - :param config_path: Path to the JSON model configuration file - :param modality_paths: Dictionary with paths to different modalities, - or a single path to the model checkpoint - """ - - super().__init__() - - config = json.load(open(config_path, "r")) - self._embedding_dim = config["text_encoder"]["embedding_dim"] - - # Both `text_encoder` and `image_encoder` are data-classes, so we must strip - # all the non-member attributes before initializing the classes. - text_fields = TextEncoder.__dataclass_fields__ + # We must strip all the non-member attributes before initializing the classes. image_fields = ImageEncoder.__dataclass_fields__ - text_encoder_attrs = {k: v for k, v in config["text_encoder"].items() if k in text_fields} - image_encoder_attrs = {k: v for k, v in config["image_encoder"].items() if k in image_fields} - self.text_encoder = TextEncoder(**text_encoder_attrs) - self.image_encoder = ImageEncoder(**image_encoder_attrs) - - # Load pre-trained weights - if modality_paths is not None: - if isinstance(modality_paths, Union[PathLike, str]): - state = torch.load(modality_paths) - self.text_encoder.load_state_dict(state["text_encoder"]) - self.image_encoder.load_state_dict(state["image_encoder"]) - else: - text_encoder_path = modality_paths.get("text_encoder", None) - image_encoder_path = modality_paths.get("image_encoder", None) - if text_encoder_path: - self.text_encoder.load_state_dict(torch.load(text_encoder_path)) - if image_encoder_path: - self.image_encoder.load_state_dict(torch.load(image_encoder_path)) - - def encode_image( - self, - images: Tensor, - return_features: bool = False, - ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings. - - :param images: Preprocessed image - :param return_features: Whether to return images features or return only embeddings - """ - - features = self.image_encoder.forward_features(images) - embeddings = self.image_encoder.forward_embedding(features) - - if return_features: - return features, embeddings - - return embeddings - - def encode_text( - self, - texts: Dict[str, Tensor], - return_features: bool = False, - ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings. - - :param texts: Dictionary with tokenized texts and attention masks - :param return_features: Whether to return texts features or return only embeddings - """ - - features = self.text_encoder.forward_features( - texts["input_ids"], - texts["attention_mask"], - ) - embeddings = self.text_encoder.forward_embedding( - features, - texts["attention_mask"], - ) - - if return_features: - return features, embeddings - - return embeddings - - def encode_multimodal( - self, - image: Optional[Tensor] = None, - text: Optional[Dict] = None, - image_features: Optional[Tensor] = None, - text_features: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - return_scores: bool = False, - ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Passes preprocessed texts (or precomputed texts features) and - preprocessed images (or precomputed images features) through multimodal encoded to produce multimodal joint embeddings. 
- - :param image: Preprocessed images - :param text: Preprocessed texts - :param image_features: Precomputed images features - :param text_features: Precomputed text features - :param attention_mask: Attention masks, not required if pass `text` instead of text_features - """ - - assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None" - assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None" - - if text_features is not None: - assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`" - - if image_features is None: - image_features = self.image_encoder.forward_features(image) - - if text_features is None: - text_features = self.text_encoder.forward_features( - text["input_ids"], - text["attention_mask"], - ) - - embeddings = self.text_encoder.forward_multimodal( - text_features, - attention_mask if attention_mask is not None else text["attention_mask"], - image_features, - ) - - if return_scores: - return self.get_matching_scores(embeddings), embeddings - - return embeddings - - def get_matching_scores(self, embeddings: Tensor) -> Tensor: - """Computes the probability that there is a match between images and texts based on their multimodal embeddings - - :param embeddings: multimodal joint embeddings - """ - - return self.text_encoder.forward_matching(embeddings) - - def forward( - self, - images: Tensor, - texts: Dict[str, Tensor], - ) -> Union[Tensor, Tensor]: - """Inference forward method - - :param images: Preprocessed images - :param texts: Preprocessed texts - :return: embeddings for images and texts - """ - _, image_embeddings = self.image_encoder(images) - _, text_embeddings = self.text_encoder(texts) - return image_embeddings, text_embeddings - - @property - def text_features_dim(self) -> int: - """Dimensionality of the text encoder features.""" - - return self.text_encoder.dim - - @property - def image_features_dim(self) -> int: - """Dimensionality of the image encoder features.""" - - return self.image_encoder.dim - - @property - def embedding_dim(self) -> int: - """Dimensionality of shared space embedding.""" - - return self._embedding_dim - - @property - def multimodal_embedding_dim(self) -> int: - """Dimensionality of multimodal joint embedding.""" - return self.text_encoder.dim - + config = {k: v for k, v in config.items() if k in image_fields} -VLM = TextImageEncoder # legacy + state = torch.load(model_path) + encoder = ImageEncoder(**config) + encoder.load_state_dict(state) + return encoder diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py index 340b117..32697ca 100644 --- a/python/uform/torch_processors.py +++ b/python/uform/torch_processors.py @@ -29,15 +29,19 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ config = json.load(open(config_path, "r")) - self._max_seq_len = config["text_encoder"]["max_position_embeddings"] + if "text_encoder" in config: + config = config["text_encoder"] + + self._max_seq_len = config["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) self._tokenizer.no_padding() - self._pad_token_idx = config["text_encoder"]["padding_idx"] + self._pad_token_idx = config["padding_idx"] def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: """Transforms one or more strings into dictionary with tokenized strings and attention masks. 
:param texts: text of list of texts to tokenizer + :return: dictionary with tokenized strings and attention masks as values """ if isinstance(texts, str): texts = [texts] @@ -72,9 +76,12 @@ def __init__(self, config_path: PathLike): """ config = json.load(open(config_path, "r")) - self._image_size = config["image_encoder"]["image_size"] - self._normalization_means = config["image_encoder"]["normalization_means"] - self._normalization_deviations = config["image_encoder"]["normalization_deviations"] + if "image_encoder" in config: + config = config["image_encoder"] + + self._image_size = config["image_size"] + self._normalization_means = config["normalization_means"] + self._normalization_deviations = config["normalization_deviations"] assert isinstance(self._image_size, int) and self._image_size > 0 assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) @@ -93,10 +100,11 @@ def __init__(self, config_path: PathLike): ], ) - def __call__(self, images: Union[Image, List[Image]]) -> Tensor: + def __call__(self, images: Union[Image, List[Image]]) -> Dict[str, Tensor]: """Transforms one or more Pillow images into Torch Tensors. :param images: image or list of images to preprocess + :return: dictionary with float-represented images in tensors as values """ if isinstance(images, list): @@ -111,4 +119,4 @@ def __call__(self, images: Union[Image, List[Image]]) -> Tensor: else: batch_images = self._image_transform(images).unsqueeze(0) - return batch_images + return {"images": batch_images} From b7905197387b8f6f2b91667a47146068ea21d4e6 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Fri, 19 Apr 2024 21:26:39 -0700 Subject: [PATCH 17/40] Improve: PAss tests for small models --- .vscode/settings.json | 7 +- python/scripts/export_encoders.ipynb | 130 ++++++++++++++++----------- python/scripts/test_encoders.py | 16 ++-- python/uform/numpy_processors.py | 2 +- python/uform/onnx_encoders.py | 37 +++++++- python/uform/torch_encoders.py | 39 ++++++-- 6 files changed, 156 insertions(+), 75 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 3a060e1..3275f93 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -21,7 +21,9 @@ "ndarray", "numpy", "ONNX", + "onnxconverter", "onnxruntime", + "opset", "packbits", "preprocess", "pretrained", @@ -48,5 +50,8 @@ "[python]": { "editor.defaultFormatter": "ms-python.black-formatter" }, - "python.formatting.provider": "none" + "python.formatting.provider": "none", + "window.autoDetectColorScheme": true, + "workbench.colorTheme": "Default Dark+", + "workbench.preferredDarkColorTheme": "Default Dark+" } \ No newline at end of file diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb index 029e60a..a8b868d 100644 --- a/python/scripts/export_encoders.ipynb +++ b/python/scripts/export_encoders.ipynb @@ -19,7 +19,6 @@ "metadata": {}, "outputs": [], "source": [ - "!pip uninstall -y uform\n", "!pip install --upgrade \"uform[torch]\" coremltools" ] }, @@ -30,8 +29,13 @@ "outputs": [], "source": [ "import os\n", - "model_name = \"uform-vl-english-small\"\n", - "output_directory = \"../../\"" + "\n", + "working_directory = \"../..\"\n", + "model_name = \"uform3-image-text-english-small\"\n", + "model_directory = os.path.join(working_directory, \"models\", model_name)\n", + "model_weights_path = os.path.join(model_directory, \"torch_weight.pt\")\n", + "config_path = os.path.join(model_directory, 
\"config.json\")\n", + "tokenizer_path = os.path.join(model_directory, \"tokenizer.json\")" ] }, { @@ -40,20 +44,20 @@ "metadata": {}, "outputs": [], "source": [ - "import uform\n", - "from PIL import Image\n", - "\n", - "model, processor = uform.get_model('unum-cloud/' + model_name)\n", - "text = 'a small red panda in a zoo'\n", - "image = Image.open('../../assets/unum.png')\n", - "\n", - "image_data = processor.preprocess_image(image)\n", - "text_data = processor.preprocess_text(text)\n", - "\n", - "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n", - "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "import torch\n", "\n", - "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + "state_dict = torch.load(model_weights_path)\n", + "list(state_dict.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from uform.torch_encoders import ImageEncoder, TextEncoder\n", + "from uform.torch_processors import ImageProcessor, TextProcessor" ] }, { @@ -62,7 +66,9 @@ "metadata": {}, "outputs": [], "source": [ - "model.text_encoder" + "image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)\n", + "text_encoder = TextEncoder.from_pretrained(config_path, state_dict)\n", + "image_encoder, text_encoder" ] }, { @@ -71,7 +77,9 @@ "metadata": {}, "outputs": [], "source": [ - "model.image_encoder" + "text_processor = TextProcessor(config_path, tokenizer_path)\n", + "image_processor = ImageProcessor(config_path)\n", + "text_processor, image_processor" ] }, { @@ -80,14 +88,19 @@ "metadata": {}, "outputs": [], "source": [ - "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n", - "for name, module in model.image_encoder.named_children():\n", - " print(f\"First layer of image_encoder: {name}\")\n", - " break # We break after the first layer\n", + "import uform\n", + "from PIL import Image\n", "\n", - "for name, module in model.text_encoder.named_children():\n", - " print(f\"First layer of text_encoder: {name}\")\n", - " break # We break after the first layer" + "text = 'a small red panda in a zoo'\n", + "image = Image.open('../../assets/unum.png')\n", + "\n", + "text_data = text_processor(text)\n", + "image_data = image_processor(image)\n", + "\n", + "image_features, image_embedding = image_encoder.forward(image_data, return_features=True)\n", + "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" ] }, { @@ -147,7 +160,7 @@ " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n", " return input_shape\n", "\n", - "generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)" + "generalize_first_dimensions(image_data[\"images\"].shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)" ] }, { @@ -156,7 +169,7 @@ "metadata": {}, "outputs": [], "source": [ - "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n", + "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data[\"images\"].shape, 1))\n", "text_input = 
ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n", "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n", "text_features = ct.TensorType(name=\"features\")\n", @@ -171,11 +184,11 @@ "metadata": {}, "outputs": [], "source": [ - "module = model.image_encoder\n", + "module = image_encoder\n", "module.eval()\n", "module.return_features = True\n", "\n", - "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n", + "traced_script_module = torch.jit.trace(module, example_inputs=image_data[\"images\"])\n", "traced_script_module" ] }, @@ -193,7 +206,7 @@ "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))" + "coreml_model.save(os.path.join(model_directory, \"image_encoder.mlpackage\"))" ] }, { @@ -202,7 +215,7 @@ "metadata": {}, "outputs": [], "source": [ - "module = model.text_encoder\n", + "module = text_encoder\n", "module.eval()\n", "module.return_features = True\n", "\n", @@ -224,7 +237,7 @@ "coreml_model.author = 'Unum Cloud'\n", "coreml_model.license = 'Apache 2.0'\n", "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))" + "coreml_model.save(os.path.join(model_directory, \"text_encoder.mlpackage\"))" ] }, { @@ -257,8 +270,8 @@ "metadata": {}, "outputs": [], "source": [ - "model.image_encoder.eval()\n", - "model.image_encoder.to(dtype=torch.bfloat16)" + "image_encoder.eval()\n", + "image_encoder.to(dtype=torch.bfloat16)" ] }, { @@ -267,7 +280,7 @@ "metadata": {}, "outputs": [], "source": [ - "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))" + "torch.save(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.pt\"))" ] }, { @@ -276,7 +289,7 @@ "metadata": {}, "outputs": [], "source": [ - "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))" + "save_file(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.safetensors\"))" ] }, { @@ -285,8 +298,8 @@ "metadata": {}, "outputs": [], "source": [ - "model.text_encoder.eval()\n", - "model.text_encoder.to(dtype=torch.bfloat16)" + "text_encoder.eval()\n", + "text_encoder.to(dtype=torch.bfloat16)" ] }, { @@ -295,7 +308,7 @@ "metadata": {}, "outputs": [], "source": [ - "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))" + "torch.save(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.pt\"))" ] }, { @@ -304,7 +317,7 @@ "metadata": {}, "outputs": [], "source": [ - "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))" + "save_file(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.safetensors\"))" ] }, { @@ -313,8 +326,8 @@ "metadata": {}, "outputs": [], "source": [ - "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n", - "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "image_features, image_embedding = image_encoder.forward(image_data[\"images\"].to(dtype=torch.bfloat16), 
return_features=True)\n", + "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n", "\n", "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" ] @@ -358,7 +371,7 @@ "metadata": {}, "outputs": [], "source": [ - "module = model.text_encoder\n", + "module = text_encoder\n", "module.eval()\n", "module.return_features = True\n", "module.to(dtype=torch.float32)\n", @@ -366,7 +379,7 @@ "onnx_export(\n", " module,\n", " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", - " os.path.join(output_directory, \"text_encoder.onnx\"), \n", + " os.path.join(model_directory, \"text_encoder.onnx\"), \n", " export_params=True,\n", " opset_version=15,\n", " do_constant_folding=True,\n", @@ -392,15 +405,15 @@ "metadata": {}, "outputs": [], "source": [ - "module = model.image_encoder\n", + "module = image_encoder\n", "module.eval()\n", "module.return_features = True\n", "module.to(dtype=torch.float32)\n", "\n", "torch.onnx.export(\n", " module,\n", - " image_data, \n", - " os.path.join(output_directory, \"image_encoder.onnx\"), \n", + " image_data[\"images\"], \n", + " os.path.join(model_directory, \"image_encoder.onnx\"), \n", " export_params=True,\n", " opset_version=15,\n", " do_constant_folding=True,\n", @@ -437,7 +450,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", "module = onnx.load(module_path)\n", "module_fp16 = float16.convert_float_to_float16(module)\n", "onnx.save(module_fp16, module_path)" @@ -449,7 +462,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", "module = onnx.load(module_path)\n", "module_fp16 = float16.convert_float_to_float16(module)\n", "onnx.save(module_fp16, module_path)" @@ -480,7 +493,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" ] }, @@ -490,7 +503,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" ] }, @@ -512,7 +525,7 @@ "from onnx import helper\n", "\n", "# Load the ONNX model\n", - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", "module = onnx.load(module_path)\n", "\n", "# Get the module's graph\n", @@ -599,7 +612,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", "session = ort.InferenceSession(module_path, sess_options=session_options)" ] }, @@ -609,7 +622,7 @@ "metadata": {}, "outputs": [], "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", "session = ort.InferenceSession(module_path, sess_options=session_options)" ] }, @@ -620,6 +633,15 @@ "# Upload to Hugging Face" ] }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/ . --exclude=\"torch_weight.pt\"" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index d26e4f2..bd26690 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -27,16 +27,16 @@ torch_models = [ "unum-cloud/uform3-image-text-english-small", - "unum-cloud/uform3-image-text-english-base", - "unum-cloud/uform3-image-text-english-large", - "unum-cloud/uform3-image-text-multilingual-base", + # "unum-cloud/uform3-image-text-english-base", + # "unum-cloud/uform3-image-text-english-large", + # "unum-cloud/uform3-image-text-multilingual-base", ] onnx_models = [ "unum-cloud/uform3-image-text-english-small", - "unum-cloud/uform3-image-text-english-base", - "unum-cloud/uform3-image-text-english-large", - "unum-cloud/uform3-image-text-multilingual-base", + # "unum-cloud/uform3-image-text-english-base", + # "unum-cloud/uform3-image-text-english-large", + # "unum-cloud/uform3-image-text-multilingual-base", ] # Let's check if the HuggingFace Hub API token is set in the environment variable. @@ -198,8 +198,8 @@ def test_onnx_one_embedding(model_name: str, device: str): # Test if the model outputs actually make sense cross_references_image_and_text_embeddings( - lambda text: model_text(processor_text(text)), - lambda image: model_image(processor_image(image)), + lambda text: model_text(processor_text(text))[1], + lambda image: model_image(processor_image(image))[1], ) except ExecutionProviderError as e: diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index a5faca2..027bc0d 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -34,7 +34,7 @@ def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]: input_ids = np.full( (len(texts), self._max_seq_len), fill_value=self._pad_token_idx, - dtype=np.int64, + dtype=np.int32, ) attention_mask = np.zeros( diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index 9f63fa4..a6f27d3 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -64,6 +64,7 @@ def __init__( model_path: str, *, device: Literal["cpu", "cuda"] = "cpu", + return_features: bool = True, ): """ :param model_path: Path to onnx model @@ -73,14 +74,21 @@ def __init__( session_options = ort.SessionOptions() session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + self.return_features = return_features self.session = ort.InferenceSession( model_path, sess_options=session_options, providers=available_providers(device), ) - def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: - return self.session.run(None, {"images": images}) + def __call__( + self, images: ndarray, return_features: Optional[bool] = None + ) -> Union[ndarray, Tuple[ndarray, ndarray]]: + features, embeddings = self.session.run(None, {"images": images}) + return_features = return_features if return_features is not None else self.return_features + if return_features: + return features, embeddings + return embeddings class TextEncoder: @@ -89,6 +97,7 @@ def __init__( model_path: str, *, device: Literal["cpu", "cuda"] = "cpu", + return_features: bool = True, ): """ :param text_encoder_path: Path to onnx of text encoder @@ -98,11 +107,31 @@ def __init__( session_options = 
ort.SessionOptions() session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + self.return_features = return_features self.text_encoder_session = ort.InferenceSession( model_path, sess_options=session_options, providers=available_providers(device), ) - def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]: - return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask}) + def __call__( + self, + x: Union[ndarray, dict], + attention_mask: Optional[ndarray] = None, + return_features: Optional[bool] = None, + ) -> Union[ndarray, Tuple[ndarray, ndarray]]: + if isinstance(x, dict): + assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" + attention_mask = x["attention_mask"] + input_ids = x["input_ids"] + else: + input_ids = x + + features, embeddings = self.text_encoder_session.run( + None, {"input_ids": input_ids, "attention_mask": attention_mask} + ) + + return_features = return_features if return_features is not None else self.return_features + if return_features: + return features, embeddings + return embeddings diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index 8ac7c36..0504a74 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from os import PathLike -from typing import Dict, Optional, Tuple, Union, Callable +from typing import Dict, Optional, Union, Mapping, Any import json import torch @@ -274,7 +274,12 @@ def forward( return embeddings @staticmethod - def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> TextEncoder: + def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> TextEncoder: + """Load the image encoder from the given configuration and model path. + + :param config: the configuration dictionary or path to the JSON configuration file + :param model: the model state dictionary or path to the `.pt` model file + """ if isinstance(config, (PathLike, str)): config = json.load(open(config, "r")) if "text_encoder" in config: @@ -283,9 +288,15 @@ def from_pretrained(config: Union[PathLike, str, object], model_path: Union[Path # We must strip all the non-member attributes before initializing the classes. text_fields = TextEncoder.__dataclass_fields__ config = {k: v for k, v in config.items() if k in text_fields} - - state = torch.load(model_path) encoder = TextEncoder(**config) + + # Load from disk + if isinstance(model, (PathLike, str)): + state = torch.load(model) + else: + state = model + if "text_encoder" in state: + state = state["text_encoder"] encoder.load_state_dict(state) return encoder @@ -351,7 +362,15 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return embeddings @staticmethod - def from_pretrained(config: Union[PathLike, str, object], model_path: Union[PathLike, str]) -> ImageEncoder: + def from_pretrained( + config: Union[PathLike, str, object], + model: Union[PathLike, str, Mapping[str, Any]], + ) -> ImageEncoder: + """Load the image encoder from the given configuration and model path. 
+ + :param config: the configuration dictionary or path to the JSON configuration file + :param model: the model state dictionary or path to the `.pt` model file + """ if isinstance(config, (PathLike, str)): config = json.load(open(config, "r")) if "image_encoder" in config: @@ -360,8 +379,14 @@ def from_pretrained(config: Union[PathLike, str, object], model_path: Union[Path # We must strip all the non-member attributes before initializing the classes. image_fields = ImageEncoder.__dataclass_fields__ config = {k: v for k, v in config.items() if k in image_fields} - - state = torch.load(model_path) encoder = ImageEncoder(**config) + + # Load from disk + if isinstance(model, (PathLike, str)): + state = torch.load(model) + else: + state = model + if "image_encoder" in state: + state = state["image_encoder"] encoder.load_state_dict(state) return encoder From 605bfc8cf4a9164051cd63003176f9db690d79e1 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sat, 20 Apr 2024 22:18:22 -0700 Subject: [PATCH 18/40] Improve: Test more models --- CONTRIBUTING.md | 7 ------- Package.resolved | 2 +- Package.swift | 2 +- python/scripts/test_encoders.py | 12 ++++++------ swift/EncodersTests.swift | 14 ++++++++++++-- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bcf6d91..ceafee9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -51,12 +51,5 @@ Before submitting any changes, please make sure that the tests pass. ```sh npm install -npm run build npm run test ``` - -``` -tsc -node node_build/embeddings.mjs -``` - diff --git a/Package.resolved b/Package.resolved index fe63c94..6e3b1f7 100644 --- a/Package.resolved +++ b/Package.resolved @@ -14,7 +14,7 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/ashvardanian/swift-transformers", "state" : { - "revision" : "9ef46a51eca46978b62773f8887926dfe72b0ab4" + "revision" : "89fb5d97e1df347f9f588f62fc538dcad6fdb16c" } } ], diff --git a/Package.swift b/Package.swift index b3b9ffd..c2f7fe7 100644 --- a/Package.swift +++ b/Package.swift @@ -19,7 +19,7 @@ let package = Package( dependencies: [ .package( url: "https://github.com/ashvardanian/swift-transformers", - revision: "9ef46a51eca46978b62773f8887926dfe72b0ab4" + revision: "89fb5d97e1df347f9f588f62fc538dcad6fdb16c" ) ], targets: [ diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index bd26690..29c5119 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -27,16 +27,16 @@ torch_models = [ "unum-cloud/uform3-image-text-english-small", - # "unum-cloud/uform3-image-text-english-base", - # "unum-cloud/uform3-image-text-english-large", - # "unum-cloud/uform3-image-text-multilingual-base", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", ] onnx_models = [ "unum-cloud/uform3-image-text-english-small", - # "unum-cloud/uform3-image-text-english-base", - # "unum-cloud/uform3-image-text-english-large", - # "unum-cloud/uform3-image-text-multilingual-base", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", ] # Let's check if the HuggingFace Hub API token is set in the environment variable. 
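The refactored `from_pretrained` methods above accept either a checkpoint path or an already-loaded state dictionary, and unwrap nested `text_encoder` / `image_encoder` sections on their own. A minimal sketch of that calling convention — the local paths below are illustrative assumptions, while the class names and call signatures follow the diffs and the export notebook in this series:

```python
# Sketch only: mirrors the `from_pretrained` refactor in `torch_encoders.py`
# and the usage in `export_encoders.ipynb`; the local file layout is assumed.
import torch

from uform.torch_encoders import ImageEncoder, TextEncoder
from uform.torch_processors import ImageProcessor, TextProcessor

model_directory = "models/uform3-image-text-english-small"  # assumed local layout
config_path = f"{model_directory}/config.json"
tokenizer_path = f"{model_directory}/tokenizer.json"
weights_path = f"{model_directory}/torch_weight.pt"

# Either a path or an in-memory state dict works; nested "text_encoder" /
# "image_encoder" sections are stripped automatically before loading.
state_dict = torch.load(weights_path)
text_encoder = TextEncoder.from_pretrained(config_path, state_dict)
image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)

text_processor = TextProcessor(config_path, tokenizer_path)
image_processor = ImageProcessor(config_path)

text_data = text_processor("a small red panda in a zoo")
text_features, text_embedding = text_encoder.forward(text_data, return_features=True)
```

The same pair of constructions is what the re-enabled tests below exercise for each of the four published checkpoints.
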
diff --git a/swift/EncodersTests.swift b/swift/EncodersTests.swift index 0096d62..5816446 100644 --- a/swift/EncodersTests.swift +++ b/swift/EncodersTests.swift @@ -73,7 +73,12 @@ final class TokenizerTests: XCTestCase { } func testTextEmbeddings() async throws { - for model in ["unum-cloud/uform3-image-text-english-small"] { + for model in [ + "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", + ] { try await testTextEmbeddings(forModel: model) } } @@ -162,7 +167,12 @@ final class TokenizerTests: XCTestCase { } func testImageEmbeddings() async throws { - for model in ["unum-cloud/uform3-image-text-english-small"] { + for model in [ + "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", + ] { try await testImageEmbeddings(forModel: model) } } From 0c2aa2828693edd6002c34cdd87f43e196c7775a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 21 Apr 2024 05:19:23 +0000 Subject: [PATCH 19/40] Improve: Test many models in JS --- javascript/encoders_test.js | 159 ++++++++++++++++++++++-------------- 1 file changed, 98 insertions(+), 61 deletions(-) diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js index fba11f4..f50d3b6 100644 --- a/javascript/encoders_test.js +++ b/javascript/encoders_test.js @@ -1,4 +1,6 @@ -import { existsSync } from 'fs'; +import { existsSync, readFileSync } from 'fs'; +import { fileURLToPath } from 'url'; +import path from 'path'; import { getCheckpoint, Modality } from "./hub.mjs"; import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs"; @@ -9,83 +11,118 @@ function assert(condition, message) { } } +// Check if the HuggingFace Hub API token is set in the environment variable. 
+let hf_token = process.env.HUGGINGFACE_HUB_TOKEN; +if (!hf_token) { + const dirname = path.dirname(fileURLToPath(import.meta.url)); + const tokenPath = path.join(dirname, '../', '.hf_token'); + if (existsSync(tokenPath)) { + hf_token = readFileSync(tokenPath, 'utf8').trim(); + } +} + +async function tryGettingCheckpoint(modelId, modalities) { + const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + modelId, + modalities, + hf_token, + '.onnx' + ); + + assert(configPath !== null, "Config path should not be null"); + assert(modalityPaths !== null, "Modality paths should not be null"); + assert(tokenizerPath !== null, "Tokenizer path should not be null"); + + // Check if the file actually exists + assert(existsSync(configPath), `Config file should exist at ${configPath}`); + assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`); + for (const modalityPath of Object.values(modalityPaths)) { + assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`); + } +} + async function testGetCheckpoint() { - console.log("Test getCheckpoint: Start"); + console.log("- `testGetCheckpoint`: Start"); try { - const modelId = 'unum-cloud/uform3-image-text-english-small'; - const token = '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'; const modalities = [Modality.TextEncoder, Modality.ImageEncoder]; - const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( - modelId, - modalities, - token, - '.onnx' - ); - - assert(configPath !== null, "Config path should not be null"); - assert(modalityPaths !== null, "Modality paths should not be null"); - assert(tokenizerPath !== null, "Tokenizer path should not be null"); - - // Check if the file actually exists - assert(existsSync(configPath), `Config file should exist at ${configPath}`); - assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`); - for (const modalityPath of Object.values(modalityPaths)) { - assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`); + for (const modelId of [ + 'unum-cloud/uform3-image-text-english-small', + 'unum-cloud/uform3-image-text-english-base', + 'unum-cloud/uform3-image-text-english-large', + 'unum-cloud/uform3-image-text-multilingual-base', + ]) { + await tryGettingCheckpoint(modelId, modalities, hf_token); } - console.log("Test getCheckpoint: Success"); + console.log("- `testGetCheckpoint`: Success"); } catch (error) { - console.error("Test getCheckpoint: Failed", error); + console.error("- `testGetCheckpoint`: Failed", error); } } -async function testEncoders() { - console.log("Test testEncoders: Start"); - let textEncoder = null; - let imageEncoder = null; - - try { - const modelId = 'unum-cloud/uform3-image-text-english-small'; - const token = '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'; - const modalities = [Modality.TextEncoder, Modality.ImageEncoder]; - - const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( - modelId, - modalities, - token, - '.onnx' - ); - - assert(configPath !== null, "Config path should not be null"); - assert(modalityPaths !== null, "Modality paths should not be null"); - assert(tokenizerPath !== null, "Tokenizer path should not be null"); +async function tryTextEncoderForwardPass(modelId) { + const modalities = [Modality.TextEncoder]; + const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + modelId, + modalities, + hf_token, + '.onnx' + ); + + const textProcessor = new TextProcessor(configPath, tokenizerPath); + await 
textProcessor.init(); + const processedTexts = await textProcessor.process("Hello, world!"); + + const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); + await textEncoder.init(); + const textOutput = await textEncoder.forward(processedTexts); + assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); + + await textEncoder.dispose(); +} - const textProcessor = new TextProcessor(configPath, tokenizerPath); - await textProcessor.init(); - const processedTexts = await textProcessor.process("Hello, world!"); +async function tryImageEncoderForwardPass(modelId) { + const modalities = [Modality.ImageEncoder]; + const { configPath, modalityPaths } = await getCheckpoint( + modelId, + modalities, + hf_token, + '.onnx' + ); + + const imageProcessor = new ImageProcessor(configPath); + await imageProcessor.init(); + const processedImages = await imageProcessor.process("assets/unum.png"); + + const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); + await imageEncoder.init(); + const imageOutput = await imageEncoder.forward(processedImages); + assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); + + await imageEncoder.dispose(); +} - textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); - await textEncoder.init(); - const textOutput = await textEncoder.forward(processedTexts); - console.log(textOutput.embeddings.dims); +async function testEncoders() { + console.log("- `testEncoders`: Start"); - const imageProcessor = new ImageProcessor(configPath); - await imageProcessor.init(); - const processedImages = await imageProcessor.process("assets/unum.png"); + try { - imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); - await imageEncoder.init(); - const imageOutput = await imageEncoder.forward(processedImages); - console.log(imageOutput.embeddings.dims); + // Go through the bi-modal models + for (const modelId of [ + 'unum-cloud/uform3-image-text-english-small', + 'unum-cloud/uform3-image-text-english-base', + 'unum-cloud/uform3-image-text-english-large', + 'unum-cloud/uform3-image-text-multilingual-base', + ]) { + await tryTextEncoderForwardPass(modelId, hf_token); + await tryImageEncoderForwardPass(modelId, hf_token); + } - console.log("Test testEncoders: Success"); + console.log("- `testEncoders`: Success"); } catch (error) { - console.error("Test testEncoders: Failed", error); - } finally { - await textEncoder.dispose(); - await imageEncoder.dispose(); + console.error("- `testEncoders`: Failed", error); } } From 766963caaa840b230324ccafdf0a02b0aaeaa3e7 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Sun, 21 Apr 2024 05:44:48 +0000 Subject: [PATCH 20/40] Add: Text and image cross-referencing in JS --- javascript/encoders.mjs | 4 +- javascript/encoders_test.js | 110 ++++++++++++++++++++++++++++++-- python/scripts/test_encoders.py | 2 +- swift/EncodersTests.swift | 5 +- 4 files changed, 111 insertions(+), 10 deletions(-) diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index 7a287cc..7ebaeb9 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -141,8 +141,8 @@ class ImageProcessor { fit: sharp.fit.cover, position: sharp.strategy.entropy }).extract({ - left: Math.max(0, (scaledWidth - this.imageSize) / 2), - top: Math.max(0, (scaledHeight - this.imageSize) / 2), + left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)), + top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 
2)), width: this.imageSize, height: this.imageSize }).removeAlpha(); diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js index f50d3b6..28538ee 100644 --- a/javascript/encoders_test.js +++ b/javascript/encoders_test.js @@ -1,16 +1,12 @@ import { existsSync, readFileSync } from 'fs'; import { fileURLToPath } from 'url'; import path from 'path'; +import assert from 'assert'; +import fetch from 'node-fetch'; import { getCheckpoint, Modality } from "./hub.mjs"; import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs"; -function assert(condition, message) { - if (!condition) { - throw new Error(message); - } -} - // Check if the HuggingFace Hub API token is set in the environment variable. let hf_token = process.env.HUGGINGFACE_HUB_TOKEN; if (!hf_token) { @@ -104,6 +100,107 @@ async function tryImageEncoderForwardPass(modelId) { await imageEncoder.dispose(); } +function cosineSimilarity(vecA, vecB) { + // We may be receiving a complex tesnor type, so let's check if it + // has an array member named `data`. + if (vecA.data) { + vecA = vecA.data; + } + if (vecB.data) { + vecB = vecB.data; + } + + let dotProduct = 0.0; + let normA = 0.0; + let normB = 0.0; + for (let i = 0; i < vecA.length; i++) { + dotProduct += vecA[i] * 1.0 * vecB[i]; + normA += vecA[i] * 1.0 * vecA[i]; + normB += vecB[i] * 1.0 * vecB[i]; + } + if (normA === 0 || normB === 0) { + return 0; + } else { + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); + } +} + +async function fetchImage(url) { + const response = await fetch(url); + const arrayBuffer = await response.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + return buffer; +} + +async function tryCrossReferencingImageAndText(modelId) { + + const modalities = [Modality.ImageEncoder, Modality.TextEncoder]; + const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + modelId, + modalities, + hf_token, + '.onnx' + ); + + const imageProcessor = new ImageProcessor(configPath); + await imageProcessor.init(); + const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); + await imageEncoder.init(); + const textProcessor = new TextProcessor(configPath, tokenizerPath); + await textProcessor.init(); + const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); + await textEncoder.init(); + + const texts = [ + "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", + "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", + "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", + "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", + ]; + const imageUrls = [ + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", + 
"https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", + ]; + + const textEmbeddings = []; + const imageEmbeddings = []; + + for (let i = 0; i < texts.length; i++) { + const text = texts[i]; + const imageUrl = imageUrls[i]; + const imageBuffer = await fetchImage(imageUrl); + + const processedText = await textProcessor.process(text); + const processedImage = await imageProcessor.process(imageBuffer); + + const textEmbedding = await textEncoder.forward(processedText); + const imageEmbedding = await imageEncoder.forward(processedImage); + + textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data)); + imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data)); + console.log(`Text: ${text}, Image: ${imageUrl}, Similarity: ${cosineSimilarity(textEmbedding.embeddings, imageEmbedding.embeddings)}`); + } + + for (let i = 0; i < texts.length; i++) { + const pairSimilarity = cosineSimilarity(textEmbeddings[i], imageEmbeddings[i]); + const otherTextSimilarities = textEmbeddings.map((te, idx) => idx === i ? -Infinity : cosineSimilarity(te, imageEmbeddings[i])); + const otherImageSimilarities = imageEmbeddings.map((ie, idx) => idx === i ? -Infinity : cosineSimilarity(textEmbeddings[i], ie)); + + const maxOtherTextSimilarity = Math.max(...otherTextSimilarities); + const maxOtherImageSimilarity = Math.max(...otherImageSimilarities); + + assert(pairSimilarity > maxOtherTextSimilarity, "Text should be more similar to its corresponding image than to other images."); + assert(pairSimilarity > maxOtherImageSimilarity, "Image should be more similar to its corresponding text than to other texts."); + } + + await textEncoder.dispose(); + await imageEncoder.dispose(); +} + async function testEncoders() { console.log("- `testEncoders`: Start"); @@ -118,6 +215,7 @@ async function testEncoders() { ]) { await tryTextEncoderForwardPass(modelId, hf_token); await tryImageEncoderForwardPass(modelId, hf_token); + await tryCrossReferencingImageAndText(modelId, hf_token); } console.log("- `testEncoders`: Success"); diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index d26e4f2..fd78e54 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -68,7 +68,7 @@ def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embed texts = [ "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", - "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", 
"The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", ] diff --git a/swift/EncodersTests.swift b/swift/EncodersTests.swift index 0096d62..839a916 100644 --- a/swift/EncodersTests.swift +++ b/swift/EncodersTests.swift @@ -16,6 +16,9 @@ final class TokenizerTests: XCTestCase { { hfToken = token } + + hfToken = hfToken ?? ProcessInfo.processInfo.environment["HF_TOKEN"] + hfToken = hfToken ?? "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" } func cosineSimilarity(between vectorA: [T], and vectorB: [T]) -> T { @@ -107,7 +110,7 @@ final class TokenizerTests: XCTestCase { let texts = [ "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", - "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", ] From 6b3f8cd351f534b84f7712cc9638c8adef90bcd8 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:16:24 +0000 Subject: [PATCH 21/40] Add: Initial decoder exporters --- python/scripts/export_decoders.ipynb | 654 +++++++++++++++++++++++++++ 1 file changed, 654 insertions(+) create mode 100644 python/scripts/export_decoders.ipynb diff --git a/python/scripts/export_decoders.ipynb b/python/scripts/export_decoders.ipynb new file mode 100644 index 0000000..3aededb --- /dev/null +++ b/python/scripts/export_decoders.ipynb @@ -0,0 +1,654 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n", + "\n", + "Depending on the backend, we prefer different qunatization schemes.\n", + "\n", + "- For ONNX we use `uint8` quantization.\n", + "- For PyTorch we use `bfloat16` quantization.\n", + "- For CoreML we use `float32` representation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade \"uform[torch]\" coremltools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "model_name = \"unum-cloud/uform-gen2-dpo\"\n", + "output_directory = \"../../\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import uform\n", + "from PIL import Image\n", + "from transformers import AutoModel, AutoProcessor\n", + "\n", + "model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)\n", + "processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)\n", + "\n", + "prompt = 'Describe the picture'\n", + "image = Image.open('../../assets/unum.png')\n", + "inputs = processor(text=[prompt], images=[image], return_tensors='pt')\n", + "\n", + "with torch.inference_mode():\n", + " output = model.generate(\n", + " **inputs,\n", + " do_sample=False,\n", + " use_cache=True,\n", + " max_new_tokens=256,\n", + " eos_token_id=151645,\n", + " pad_token_id=processor.tokenizer.pad_token_id\n", + " )\n", + "prompt_len = inputs['input_ids'].shape[1]\n", + "decoded_text = processor.batch_decode(output[:, prompt_len:])[0]\n", + "\n", + "print(decoded_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n", + "for name, module in model.named_children():\n", + " print(f\"First layer of module: {name}\")\n", + " break # We break after the first layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CoreML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import coremltools as ct\n", + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "precision = ct.precision.FLOAT32" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n", + "\n", + "```python\n", + " image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n", + " text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n", + " text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n", + "```\n", + "\n", + "That, however, will only work for batch-size one. 
To support larger batches, we need to override the input shapes.\n", + "\n", + "```python\n", + " ct.RangeDim(lower_bound=25, upper_bound=100, default=45)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generalize_first_dimensions(input_shape, upper_bound=64):\n", + " if upper_bound == 1:\n", + " return input_shape\n", + " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n", + " return input_shape\n", + "\n", + "generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n", + "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n", + "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n", + "text_features = ct.TensorType(name=\"features\")\n", + "text_embeddings = ct.TensorType(name=\"embeddings\")\n", + "image_features = ct.TensorType(name=\"features\")\n", + "image_embeddings = ct.TensorType(name=\"embeddings\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coreml_model = ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[image_input], outputs=[image_features, image_embeddings],\n", + " convert_to='mlprogram', compute_precision=precision)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coreml_model = ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", + " convert_to='mlprogram', compute_precision=precision)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch\n", + "\n", + "Let's ensure:\n", + "\n", + "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, 
and outputs are `embeddings` and `features`.\n", + "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n", + "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from safetensors import safe_open\n", + "from safetensors.torch import save_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.image_encoder.eval()\n", + "model.image_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.text_encoder.eval()\n", + "model.text_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n", + "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install onnx onnxconverter-common" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.onnx import export as onnx_export\n", + "import torch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "onnx_export(\n", + " module,\n", + " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", + " os.path.join(output_directory, \"text_encoder.onnx\"), \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input_ids', 'attention_mask'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input_ids' : {0 : 'batch_size'}, \n", + " 'attention_mask' : {0 : 'batch_size'}, \n", + " 'features' : {0 : 'batch_size'}, \n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now repeat the same for images." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "torch.onnx.export(\n", + " module,\n", + " image_data, \n", + " os.path.join(output_directory, \"image_encoder.onnx\"), \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input' : {0 : 'batch_size'},\n", + " 'features' : {0 : 'batch_size'},\n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quantizing to `float16`\n", + "\n", + "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "from onnxconverter_common import float16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, module_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, module_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quantizing to `uint8`\n", + "\n", + "We can further quantize the model into `uint8` using ONNX quantization tools.\n", + "The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from onnxruntime.quantization import quantize_dynamic, QuantType" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's make sure that all the text inputs are integers of identical type - `int32`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "import os\n", + "from onnx import helper\n", + "\n", + "# Load the ONNX model\n", + "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", + "\n", + "# Get the module's graph\n", + "graph = module.graph\n", + "\n", + "# Iterate through the inputs and update the data type of `input_ids`\n", + "for input_tensor in graph.input:\n", + " # Check if this is the tensor we want to change\n", + " if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n", + " # Get the tensor type information\n", + " tensor_type = input_tensor.type.tensor_type\n", + " # Set the element type to INT32 (int32's enum value in onnx is 6)\n", + " tensor_type.elem_type = onnx.TensorProto.INT32\n", + "\n", + "# Optionally, check that the module is still valid\n", + "onnx.checker.check_model(module)\n", + "\n", + "# Save the modified module\n", + "onnx.save(module, module_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the following function to print and validate the input and output types of the ONNX model files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_model_inputs_and_outputs(onnx_model_path):\n", + " model = onnx.load(onnx_model_path)\n", + "\n", + " # Get the model's graph\n", + " graph = model.graph\n", + "\n", + " # Print input information\n", + " print(\"Model Inputs:\")\n", + " for input_tensor in graph.input:\n", + " tensor_type = input_tensor.type.tensor_type\n", + " # Get the element type (data type)\n", + " elem_type = tensor_type.elem_type\n", + " # Convert numeric type to readable format\n", + " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n", + " # Get tensor shape\n", + " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n", + " print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n", + "\n", + " # Print output information similarly if needed\n", + " print(\"\\nModel Outputs:\")\n", + " for output_tensor in graph.output:\n", + " tensor_type = output_tensor.type.tensor_type\n", + " elem_type = tensor_type.elem_type\n", + " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n", + " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n", + " print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check that the runtime can actually load those models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "session_options = ort.SessionOptions()\n", + "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", + "session = ort.InferenceSession(module_path, sess_options=session_options)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", + "session = ort.InferenceSession(module_path, sess_options=session_options)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Upload to Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 4c1ac18d06a4d16e1b36b0cd85a202fe36d1a781 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 01:08:29 +0000 Subject: [PATCH 22/40] Fix: Transposing channels in JS --- javascript/encoders.mjs | 32 +++++++++++++++++++++----------- javascript/encoders_test.js | 19 ++++++++++++------- package.json | 1 + python/scripts/test_encoders.py | 14 ++++++++++---- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index 7ebaeb9..6c24b5a 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -125,21 +125,22 @@ class ImageProcessor { this.normalizationMeans = config.normalization_means; this.normalizationDeviations = config.normalization_deviations; - this.imageMean = new Float32Array(this.normalizationMeans).fill(0); - this.imageStd = new Float32Array(this.normalizationDeviations).fill(0); + this.imageMean = new Float32Array(this.normalizationMeans); + this.imageStd = new Float32Array(this.normalizationDeviations); } async process(images) { const processSingle = async (image) => { - let img = sharp(image); + let img = sharp(image).toColorspace('srgb'); const metadata = await img.metadata(); const scale = this.imageSize / 
Math.min(metadata.width, metadata.height); - const scaledWidth = parseInt(metadata.width * scale); - const scaledHeight = parseInt(metadata.height * scale); + const scaledWidth = Math.ceil(metadata.width * scale); + const scaledHeight = Math.ceil(metadata.height * scale); img = img.resize({ width: scaledWidth, height: scaledHeight, fit: sharp.fit.cover, - position: sharp.strategy.entropy + position: sharp.strategy.entropy, + options: sharp.interpolators.bicubic }).extract({ left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)), top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)), @@ -148,12 +149,21 @@ class ImageProcessor { }).removeAlpha(); let buffer = await img.raw().toBuffer(); - let array = new Float32Array(buffer); + let array = new Float32Array(buffer.length); + + // When we export into the `array`, we reorder the dimensions of the tensor + // from HWC to CHW, and normalize the pixel values. + let channelSize = this.imageSize * this.imageSize; + for (let i = 0; i < this.imageSize * this.imageSize; i++) { + let r = buffer[i * 3]; + let g = buffer[i * 3 + 1]; + let b = buffer[i * 3 + 2]; + array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0]; + array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1]; + array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2]; + } - return array.map((value, index) => { - const channel = index % 3; - return (value / 255.0 - this.normalizationMeans[channel]) / this.normalizationDeviations[channel]; - }); + return array; }; if (Array.isArray(images)) { diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js index 28538ee..f45ff4c 100644 --- a/javascript/encoders_test.js +++ b/javascript/encoders_test.js @@ -69,7 +69,7 @@ async function tryTextEncoderForwardPass(modelId) { const textProcessor = new TextProcessor(configPath, tokenizerPath); await textProcessor.init(); - const processedTexts = await textProcessor.process("Hello, world!"); + const processedTexts = await textProcessor.process("a small red panda in a zoo"); const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); await textEncoder.init(); @@ -180,9 +180,14 @@ async function tryCrossReferencingImageAndText(modelId) { const textEmbedding = await textEncoder.forward(processedText); const imageEmbedding = await imageEncoder.forward(processedImage); - textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data)); - imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data)); - console.log(`Text: ${text}, Image: ${imageUrl}, Similarity: ${cosineSimilarity(textEmbedding.embeddings, imageEmbedding.embeddings)}`); + textEmbeddings.push(new Float32Array(textEmbedding.embeddings.cpuData)); + imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.cpuData)); + + // Print-based debugging at its best :) + // console.log(`Text: ${text}, Image: ${imageUrl}`); + // console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`); + // console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`); + console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`) } for (let i = 0; i < texts.length; i++) { @@ -209,9 +214,9 @@ async function testEncoders() { // Go through the bi-modal models for (const modelId of [ 'unum-cloud/uform3-image-text-english-small', - 'unum-cloud/uform3-image-text-english-base', - 'unum-cloud/uform3-image-text-english-large', - 'unum-cloud/uform3-image-text-multilingual-base', + // 
'unum-cloud/uform3-image-text-english-base', + // 'unum-cloud/uform3-image-text-english-large', + // 'unum-cloud/uform3-image-text-multilingual-base', ]) { await tryTextEncoderForwardPass(modelId, hf_token); await tryImageEncoderForwardPass(modelId, hf_token); diff --git a/package.json b/package.json index 9be073f..948550b 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,7 @@ "dependencies": { "@huggingface/hub": "^0.14.8", "@xenova/transformers": "^2.17.0", + "node-fetch": "^3.3.2", "onnxruntime-node": "^1.17.0", "onnxruntime-web": "^1.17.3" }, diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index ed8dab5..7046217 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -196,11 +196,17 @@ def test_onnx_one_embedding(model_name: str, device: str): assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" + # Nested fucntions are easier to debug, than lambdas + def get_image_embedding(image_data): + features, embedding = model_image(processor_image(image_data)) + return embedding + + def get_text_embedding(text_data): + features, embedding = model_text(processor_text(text_data)) + return embedding + # Test if the model outputs actually make sense - cross_references_image_and_text_embeddings( - lambda text: model_text(processor_text(text))[1], - lambda image: model_image(processor_image(image))[1], - ) + cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding) except ExecutionProviderError as e: pytest.skip(f"Execution provider error: {e}") From 9bf5fe319d2c32f75bfda1a7a45b86c060b7f0f2 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 04:18:33 +0000 Subject: [PATCH 23/40] Improve: Uniform APIs across JS, Py, and Swift --- README.md | 52 ++--- javascript/README.md | 63 ++++- javascript/encoders.mjs | 6 +- javascript/encoders_test.js | 18 +- javascript/hub.mjs | 4 +- python/README.md | 124 ++++++++++ .../scripts/{bench.py => bench_decoders.py} | 45 +--- python/scripts/bench_encoders.py | 221 ++++++++++++++++++ python/scripts/test_encoders.py | 28 +-- python/uform/__init__.py | 26 ++- python/uform/numpy_processors.py | 8 +- python/uform/onnx_encoders.py | 4 +- python/uform/torch_encoders.py | 25 +- python/uform/torch_processors.py | 6 +- swift/Encoders.swift | 12 +- swift/EncodersTests.swift | 6 +- swift/README.md | 37 ++- 17 files changed, 564 insertions(+), 121 deletions(-) create mode 100644 python/README.md rename python/scripts/{bench.py => bench_decoders.py} (80%) create mode 100644 python/scripts/bench_encoders.py diff --git a/README.md b/README.md index 32957e7..ee62beb 100755 --- a/README.md +++ b/README.md @@ -51,13 +51,12 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr ### Embedding Models -| Model | Parameters | Languages | Architecture | -| :--------------------------------------- | ---------: | --------: | -------------------------------------------: | -| [`uform-vl-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers | -| [`uform-vl-english`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers | -| [`uform-vl-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers | -| [`uform-vl-multilingual-v2`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers | -| [`uform-vl-multilingual`][model-m] | 206M | 12 | 
8 text layers, ViT-B/16, 4 multimodal layers | +| Model | Parameters | Languages | Architecture | +| :-------------------------------------------------- | ---------: | --------: | -------------------------------------------: | +| [`uform3-image-text-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers | +| [`uform3-image-text-english-base`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers | +| [`uform3-image-text-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers | +| [`uform3-image-text-multilingual-base`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers | [model-e-l]: https://huggingface.co/unum-cloud/uform-vl-english-large/ [model-e]: https://huggingface.co/unum-cloud/uform-vl-english/ @@ -307,34 +306,18 @@ prompt_len = inputs['input_ids'].shape[1] decoded_text = processor.batch_decode(output[:, prompt_len:])[0] ``` -### Multimodal Chat +### Multimodal Chat in CLI -The generative models can be used for chat-like experiences, where the user can provide both text and images as input. -To use that feature, you can start with the following CLI command: +The generative models can be used for chat-like experiences in the command line. +For that, you can use the `uform-chat` CLI tool, which is available in the UForm package. ```bash -uform-chat --model unum-cloud/uform-gen-chat --image=zebra.jpg -uform-chat --model unum-cloud/uform-gen-chat \ - --image="https://bit.ly/3tIVg9M" \ - --device="cuda:0" \ - --fp16 -``` - -### Multi-GPU - -To achieve higher throughput, you can launch UForm on multiple GPUs. -For that pick the encoder of the model you want to run in parallel (`text_encoder` or `image_encoder`), and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`). - -```python -import uform - -model, processor = uform.get_model('unum-cloud/uform-vl-english') -model_image = nn.DataParallel(model.image_encoder) - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -model_image.to(device) - -_, res = model_image(images, 0) +$ pip install uform +$ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg +$ uform-chat --model unum-cloud/uform-gen2-dpo \ +> --image="https://bit.ly/3tIVg9M" \ +> --device="cuda:0" \ +> --fp16 ``` ## Evaluation @@ -471,3 +454,8 @@ On Apple M2 Arm chips the energy efficiency of inference can exceed that of the ## License All models come under the same license as the code - Apache 2.0. + + +TODO: + +- [ ] Download the image if a URL is provided \ No newline at end of file diff --git a/javascript/README.md b/javascript/README.md index 5626d39..0ef5c54 100644 --- a/javascript/README.md +++ b/javascript/README.md @@ -1,10 +1,67 @@ # UForm for JavaScript +UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications. +Built around ONNX, the SDK is supposed to work with most runtimes and almost any hardware. +## Installation + +There are several ways to install the UForm JavaScript SDK from NPM. 
```bash -pnpm add uform -npm add uform -yarn add uform +pnpm add uform +npm add uform +yarn add uform +``` + +## Quick Start + +### Embeddings + +```js +import { getModel, Modality } from 'uform'; +import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from 'uform'; + +const { configPath, modalityPaths, tokenizerPath } = await getModel({ + modelId: 'unum-cloud/uform3-image-text-english-small', + modalities: [Modality.TextEncoder, Modality.ImageEncoder], + token: null, // Optional Hugging Face token for private models + saveDir: null, // Optional directory to save the model to +}); + +const textProcessor = new TextProcessor(configPath, tokenizerPath); +await textProcessor.init(); +const processedTexts = await textProcessor.process("a small red panda in a zoo"); + +const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); +await textEncoder.init(); +const textOutput = await textEncoder.encode(processedTexts); +assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); +await textEncoder.dispose(); + +const imageProcessor = new ImageProcessor(configPath); +await imageProcessor.init(); +const processedImages = await imageProcessor.process("path/to/image.png"); + +const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); +await imageEncoder.init(); +const imageOutput = await imageEncoder.encode(processedImages); +assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); ``` +The `textOutput` and `imageOutput` would contain `features` and `embeddings` properties, which are the same as the `features` and `embeddings` properties in the Python SDK. +The embeddings can later be compared using the cosine similarity or other distance metrics. + +### Generative Models + +Coming soon ... + +## Technical Details + +### Faster Search + +Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall. +Independent of the quantization level, native JavaScript functionality may be too slow for large-scale search. +In such cases, consider using [USearch][github-usearch] or [SimSimD][github-simsimd]. 
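+
+As a rough sketch of the down-casting idea (not part of the UForm JavaScript API), a unit-normalized `Float32Array` embedding can be linearly scaled into an `Int8Array`, mirroring the `i8` example from the Python README:
+
+```js
+// Hypothetical helper: scale a unit-normalized Float32Array into an Int8Array.
+// Assumes every component lies in [-1, 1], which holds for L2-normalized embeddings.
+function quantizeToInt8(embedding) {
+    const quantized = new Int8Array(embedding.length);
+    for (let i = 0; i < embedding.length; i++) {
+        quantized[i] = Math.max(-127, Math.min(127, Math.round(embedding[i] * 127)));
+    }
+    return quantized;
+}
+
+// Example: quantize the text embedding produced above (raw data is exposed via `.cpuData`).
+const i8Embedding = quantizeToInt8(new Float32Array(textOutput.embeddings.cpuData));
+```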
+ +[github-usearch]: https://github.com/unum-cloud/usearch +[github-simsimd]: https://github.com/ashvardanian/simsimd diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index 6c24b5a..a37b326 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -3,7 +3,7 @@ import { InferenceSession, Tensor } from 'onnxruntime-node'; import { PreTrainedTokenizer } from '@xenova/transformers'; import sharp from 'sharp'; -import { getCheckpoint, Modality } from "./hub.mjs"; +import { getModel, Modality } from "./hub.mjs"; class TextProcessor { @@ -66,7 +66,7 @@ class TextEncoder { } } - async forward(inputs) { + async encode(inputs) { if (!this.session) { throw new Error("Session is not initialized."); } @@ -191,7 +191,7 @@ class ImageEncoder { } } - async forward(images) { + async encode(images) { if (!this.session) { throw new Error("Session is not initialized."); } diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js index f45ff4c..a0a70b2 100644 --- a/javascript/encoders_test.js +++ b/javascript/encoders_test.js @@ -4,7 +4,7 @@ import path from 'path'; import assert from 'assert'; import fetch from 'node-fetch'; -import { getCheckpoint, Modality } from "./hub.mjs"; +import { getModel, Modality } from "./hub.mjs"; import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs"; // Check if the HuggingFace Hub API token is set in the environment variable. @@ -18,7 +18,7 @@ if (!hf_token) { } async function tryGettingCheckpoint(modelId, modalities) { - const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + const { configPath, modalityPaths, tokenizerPath } = await getModel( modelId, modalities, hf_token, @@ -60,7 +60,7 @@ async function testGetCheckpoint() { async function tryTextEncoderForwardPass(modelId) { const modalities = [Modality.TextEncoder]; - const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + const { configPath, modalityPaths, tokenizerPath } = await getModel( modelId, modalities, hf_token, @@ -73,7 +73,7 @@ async function tryTextEncoderForwardPass(modelId) { const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); await textEncoder.init(); - const textOutput = await textEncoder.forward(processedTexts); + const textOutput = await textEncoder.encode(processedTexts); assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); await textEncoder.dispose(); @@ -81,7 +81,7 @@ async function tryTextEncoderForwardPass(modelId) { async function tryImageEncoderForwardPass(modelId) { const modalities = [Modality.ImageEncoder]; - const { configPath, modalityPaths } = await getCheckpoint( + const { configPath, modalityPaths } = await getModel( modelId, modalities, hf_token, @@ -94,7 +94,7 @@ async function tryImageEncoderForwardPass(modelId) { const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); await imageEncoder.init(); - const imageOutput = await imageEncoder.forward(processedImages); + const imageOutput = await imageEncoder.encode(processedImages); assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); await imageEncoder.dispose(); @@ -135,7 +135,7 @@ async function fetchImage(url) { async function tryCrossReferencingImageAndText(modelId) { const modalities = [Modality.ImageEncoder, Modality.TextEncoder]; - const { configPath, modalityPaths, tokenizerPath } = await getCheckpoint( + const { configPath, modalityPaths, tokenizerPath } = await getModel( modelId, modalities, hf_token, @@ -177,8 +177,8 @@ async 
function tryCrossReferencingImageAndText(modelId) { const processedText = await textProcessor.process(text); const processedImage = await imageProcessor.process(imageBuffer); - const textEmbedding = await textEncoder.forward(processedText); - const imageEmbedding = await imageEncoder.forward(processedImage); + const textEmbedding = await textEncoder.encode(processedText); + const imageEmbedding = await imageEncoder.encode(processedImage); textEmbeddings.push(new Float32Array(textEmbedding.embeddings.cpuData)); imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.cpuData)); diff --git a/javascript/hub.mjs b/javascript/hub.mjs index ad534f3..a59fb73 100644 --- a/javascript/hub.mjs +++ b/javascript/hub.mjs @@ -33,7 +33,7 @@ async function ensureDirectoryExists(dirPath) { } } -async function getCheckpoint(modelId, modalities, token = null, format = '.onnx', saveDir = './models') { +async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') { modalities = normalizeModalities(modalities); const configNames = ['config.json']; @@ -101,4 +101,4 @@ async function getCheckpoint(modelId, modalities, token = null, format = '.onnx' return { configPath, modalityPaths, tokenizerPath }; } -export { getCheckpoint, Modality }; +export { getModel, Modality }; diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..aec9de8 --- /dev/null +++ b/python/README.md @@ -0,0 +1,124 @@ +# UForm Python SDK + +UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your Python applications. +The SDK doesn't require any deep learning knowledge, PyTorch, or CUDA installation, and can run on almost any hardware. + +## Installation + +There are several ways to install the UForm Python SDK, depending on the backend you want to use. +PyTorch is by far the heaviest, but the most capable. +ONNX is a lightweight alternative that can run on any CPU, and on some GPUs. 
+
+```bash
+pip install "uform[torch]" # For PyTorch
+pip install "uform[onnx]" # For ONNX on CPU
+pip install "uform[onnx-gpu]" # For ONNX on GPU, available for some platforms
+pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests
+```
+
+## Quick Start
+
+### Embeddings
+
+```py
+from uform import get_model, Modality
+
+import requests
+from io import BytesIO
+from PIL import Image
+
+model_name = 'unum-cloud/uform3-image-text-english-small'
+modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER]
+processors, models = get_model(model_name, modalities=modalities)
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+
+# Define the caption and download the image
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content))
+
+# The actual inference
+image_data = processor_image(image)
+text_data = processor_text(text)
+image_features, image_embedding = model_image.encode(image_data, return_features=True)
+text_features, text_embedding = model_text.encode(text_data, return_features=True)
+```
+
+### Generative Models
+
+Coming soon ...
+
+## Technical Details
+
+### Down-casting, Quantization, Matryoshka, and Slicing
+
+Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall.
+Switching from `f32` to `f16` is recommended in almost all cases, unless you are running on very old hardware without half-precision support.
+Switching to `i8` with linear scaling is also possible, but the impact on recall becomes noticeable on larger collections with millions of searchable entries.
+Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is to quantize them into single-bit representations for faster search.
+
+```python
+import numpy as np
+
+f32_embedding: np.ndarray = model_text.encode(text_data, return_features=False).detach().cpu().numpy()
+f16_embedding: np.ndarray = f32_embedding.astype(np.float16)
+i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8)
+b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8))
+```
+
+An alternative approach to quantization is to use Matryoshka embeddings, where the embeddings are sliced into smaller parts, and the search is performed in a hierarchical manner.
+
+```python
+import numpy as np
+
+large_embedding: np.ndarray = model_text.encode(text_data, return_features=False).detach().cpu().numpy()
+small_embedding: np.ndarray = large_embedding[:, :256]
+tiny_embedding: np.ndarray = large_embedding[:, :64]
+```
+
+Both approaches are natively supported by the [USearch][github-usearch] vector-search engine and the [SimSIMD][github-simsimd] numerics libraries.
+When dealing with small collections (up to millions of entries) and looking for low-latency cosine distance calculations, you can [achieve 5x-2500x performance improvement][report-simsimd] over Torch, NumPy, SciPy, and vanilla Python using SimSIMD.
+ +```python +from simsimd import cosine, hamming + +distance: float = cosine(f32_embedding, f32_embedding) # 32x SciPy performance on Apple M2 CPU +distance: float = cosine(f16_embedding, f16_embedding) # 79x SciPy performance on Apple M2 CPU +distance: float = cosine(i8_embedding, i8_embedding) # 133x SciPy performance on Apple M2 CPU +distance: float = hamming(b1_embedding, b1_embedding) # 17x SciPy performance on Apple M2 CPU +``` + +Similarly, when dealing with large collections (up to billions of entries per server) and looking for high-throughput search, you can [achieve 100x performance improvement][report-usearch] over FAISS and other vector-search solutions using USearch. +Here are a couple of examples: + +```python +from usearch.index import Index + +f32_index = Index(ndim=64, metric='cos', dtype='f32') # for Matryoshka embeddings +f16_index = Index(ndim=64, metric='cos', dtype='f16') # for Matryoshka embeddings +i8_index = Index(ndim=256, metric='cos', dtype='i8') # for quantized embeddings +b1_index = Index(ndim=768, metric='hamming', dtype='b1') # for binary embeddings +``` + +[github-usearch]: https://github.com/unum-cloud/usearch +[github-simsimd]: https://github.com/ashvardanian/simsimd +[report-usearch]: https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel +[report-simsimd]: https://ashvardanian.com/posts/python-c-assembly-comparison/ + +### Multi-GPU Parallelism + +To achieve higher throughput, you can launch UForm on multiple GPUs. +For that pick the encoder of the model you want to run in parallel, and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`). + +```python +from uform import get_model, Modality + +encoders, processors = uform.get_model('unum-cloud/uform-vl-english-small', backend='torch', device='gpu') + +encoder_image = encoders[Modality.IMAGE_ENCODER] +encoder_image = nn.DataParallel(encoder_image) + +_, res = encoder_image(images, 0) +``` diff --git a/python/scripts/bench.py b/python/scripts/bench_decoders.py similarity index 80% rename from python/scripts/bench.py rename to python/scripts/bench_decoders.py index 8bcaf37..d98c130 100644 --- a/python/scripts/bench.py +++ b/python/scripts/bench_decoders.py @@ -1,5 +1,6 @@ from functools import partial from time import perf_counter +from dataclasses import dataclass from typing import List import requests @@ -12,7 +13,6 @@ LlavaForConditionalGeneration, ) -from uform import get_model from uform.torch_decoders import VLMForCausalLM, VLMProcessor dtype = torch.bfloat16 @@ -20,6 +20,17 @@ device = "cuda:0" +@dataclass +class BenchmarkResult: + model_name: str + device_name: str + backend_name: str + duration_image_preprocessing: float + duration_image_embedding: float + duration_text_preprocessing: float + duration_text_embedding: float + + def caption(model, processor, prompt: str, image: Image.Image) -> str: inputs = processor(prompt, image, return_tensors="pt") for possible_key in ["images", "pixel_values"]: @@ -75,30 +86,6 @@ def caption_image(image, model=model, processor=processor, prompt=prompt): print(f"Throughput: {total_length/total_duration:.2f} tokens/s") -def bench_image_embeddings(model, images): - total_duration = 0 - total_embeddings = 0 - images *= 10 - while total_duration < 10: - seconds, embeddings = duration(lambda: model.encode_image(processor.preprocess_image(images))) - total_duration += seconds - total_embeddings += len(embeddings) - - print(f"Throughput: {total_embeddings/total_duration:.2f} images/s") - - -def bench_text_embeddings(model, texts): - 
total_duration = 0 - total_embeddings = 0 - texts *= 10 - while total_duration < 10: - seconds, embeddings = duration(lambda: model.encode_text(processor.preprocess_text(texts))) - total_duration += seconds - total_embeddings += len(embeddings) - - print(f"Throughput: {total_embeddings/total_duration:.2f} queries/s") - - if __name__ == "__main__": image_urls = [ "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", @@ -157,11 +144,3 @@ def bench_text_embeddings(model, texts): prompt="Summarize the visual content of the image.", images=images, ) - - print("UForm-English") - bench_image_embeddings(get_model("unum-cloud/uform-vl-english"), images) - bench_text_embeddings(get_model("unum-cloud/uform-vl-english"), captions) - - print("UForm-Multilingual") - bench_image_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), images) - bench_text_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), captions) diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py new file mode 100644 index 0000000..6b59d05 --- /dev/null +++ b/python/scripts/bench_encoders.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +This script provides the throughput of UForm multimodal embedding models. + +The output of the script will cover: + - Time to preprocess an image, and throughput in images/s. + - Time to tokenize the text, and throughput in queries/s. + - Time to encode the image, and throughput in images/s. + - Time to encode the text, and throughput in queries/s. + - Share of time spent on each part of the pipeline. + +Those numbers are presented for every model, device (cpu or gpu), backend (torch or onnx), +and precision (float32 or bfloat16), producing a pretty comprehensive benchmark. + +Before running the script - install all available packages via `pip install -e ".[torch,onnx,onnx-gpu]"`. +Before printing the numbers, a warm-up is performed to ensure the model is loaded and the cache is filled. 
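+To narrow down the run, pass an optional `--filter` regular expression, matched against the model name, backend, and device names, e.g. `--filter "small|onnx"`.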
+""" + +from functools import partial +from time import perf_counter +from dataclasses import dataclass +from typing import List, Tuple, Literal, Callable, Generator +import re + +import fire +import requests +from PIL import Image +import pandas as pd + +from uform import get_model, get_model_onnx, Modality + +# Define global constants for the hardware availability +torch_available = False +try: + import torch + + torch_available = True +except ImportError: + pass +onnx_available = False +try: + import onnx + + onnx_available = True +except ImportError: + pass +cuda_available = False +try: + if torch_available: + cuda_available = torch.cuda.is_available() + elif onnx_available: + import onnxruntime + + cuda_available = onnxruntime.get_device() == "GPU" +except ImportError: + pass + + +@dataclass +class BenchmarkResult: + model_name: str + device_name: Literal["cpu", "cuda"] = "cpu" + backend_name: Literal["torch", "onnx"] = "torch" + duration_image_preprocessing: float = 0 + duration_image_embedding: float = 0 + duration_text_preprocessing: float = 0 + duration_text_embedding: float = 0 + + +def duration(callable): + """Profile the duration of a callable and return the duration and the result.""" + start = perf_counter() + result = callable() + stop = perf_counter() + return stop - start, result + + +def get_captioned_images() -> List[Tuple[Image.Image, str]]: + """Get a list of pre-downloaded and decoded images and their captions.""" + image_urls = [ + "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + ] + images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls] + captions = [ + "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field", + "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta", + "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank", + "asian girl sleeping in a bed. 
top down view", + "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it", + ] + return list(zip(images, captions)) + + +def yield_benchmarks() -> Generator[Tuple[BenchmarkResult, Callable], None, None]: + """Yields callable benchmarks for all supported backends of the given model.""" + + # Pull the content and artificially grow the batch size + images, captions = zip(*get_captioned_images()) + images *= 10 + captions *= 10 + + def run(model_name: str, device: str, backend_name: str): + result = BenchmarkResult( + model_name=model_name, + backend_name=backend_name, + device_name=device, + duration_image_preprocessing=0, + duration_image_embedding=0, + duration_text_preprocessing=0, + duration_text_embedding=0, + ) + + processors, models = get_model( + model_name, + device=device, + modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER], + backend=backend_name, + ) + + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + + # Image preprocessing + total_duration = 0 + total_iterations = 0 + while total_duration < 10: + seconds, _ = duration(lambda: processor_image(images)) + total_duration += seconds + total_iterations += len(images) + duration_per_iteration = total_duration / total_iterations + result.duration_image_preprocessing = duration_per_iteration + + # Image embedding + total_duration = 0 + total_iterations = 0 + while total_duration < 10: + images_data = processor_image(images) + seconds, _ = duration(lambda: model_image.encode(images_data)) + total_duration += seconds + total_iterations += len(images) + duration_per_iteration = total_duration / total_iterations + result.duration_image_embedding = duration_per_iteration + + # Text preprocessing + total_duration = 0 + total_iterations = 0 + while total_duration < 10: + seconds, _ = duration(lambda: processor_text(captions)) + total_duration += seconds + total_iterations += len(captions) + duration_per_iteration = total_duration / total_iterations + result.duration_text_preprocessing = duration_per_iteration + + # Text embedding + total_duration = 0 + total_iterations = 0 + while total_duration < 10: + texts_data = processor_text(captions) + seconds, _ = duration(lambda: model_text.encode(texts_data)) + total_duration += seconds + total_iterations += len(captions) + duration_per_iteration = total_duration / total_iterations + result.duration_text_embedding = duration_per_iteration + + return result + + devices = ["cpu"] + if cuda_available: + devices.append("cuda") + backends = [] + if torch_available: + backends.append("torch") + if onnx_available: + backends.append("onnx") + + for device in devices: + for backend_name in backends: + for model_name in [ + "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", + ]: + yield BenchmarkResult( + model_name=model_name, + device_name=device, + backend_name=backend_name, + ), partial(run, model_name, device, backend_name) + + +def main(filter: str = None): + results = [] + filter_pattern = re.compile(filter) if filter else None + for specs, func in yield_benchmarks(): + if filter_pattern and ( + not filter_pattern.search(specs.model_name) + and not filter_pattern.search(specs.backend_name) + and not filter_pattern.search(specs.device_name) + ): 
+ continue + + print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend") + result = func() + results.append(result) + + results = sorted(results, key=lambda x: x.model_name) + results = [x.__dict__ for x in results] + + df = pd.DataFrame(results) + print(df.to_markdown()) + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index 7046217..274ed6c 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -117,7 +117,7 @@ def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embed @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) def test_torch_one_embedding(model_name: str): - processors, models = get_model(model_name, token=token) + processors, models = get_model(model_name, token=token, backend="torch") model_text = models[Modality.TEXT_ENCODER] model_image = models[Modality.IMAGE_ENCODER] processor_text = processors[Modality.TEXT_ENCODER] @@ -130,8 +130,8 @@ def test_torch_one_embedding(model_name: str): image_data = processor_image(image) text_data = processor_text(text) - image_features, image_embedding = model_image.forward(image_data, return_features=True) - text_features, text_embedding = model_text.forward(text_data, return_features=True) + image_features, image_embedding = model_image.encode(image_data, return_features=True) + text_features, text_embedding = model_text.encode(text_data, return_features=True) assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" @@ -148,7 +148,7 @@ def test_torch_one_embedding(model_name: str): @pytest.mark.parametrize("batch_size", [1, 2]) def test_torch_many_embeddings(model_name: str, batch_size: int): - processors, models = get_model(model_name, token=token) + processors, models = get_model(model_name, token=token, backend="torch") model_text = models[Modality.TEXT_ENCODER] model_image = models[Modality.IMAGE_ENCODER] processor_text = processors[Modality.TEXT_ENCODER] @@ -161,8 +161,8 @@ def test_torch_many_embeddings(model_name: str, batch_size: int): image_data = processor_image(images) text_data = processor_text(texts) - image_embeddings = model_image.forward(image_data, return_features=False) - text_embeddings = model_text.forward(text_data, return_features=False) + image_embeddings = model_image.encode(image_data, return_features=False) + text_embeddings = model_text.encode(text_data, return_features=False) assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" @@ -177,7 +177,7 @@ def test_onnx_one_embedding(model_name: str, device: str): try: - processors, models = get_model_onnx(model_name, token=token, device=device) + processors, models = get_model(model_name, token=token, device=device, backend="onnx") model_text = models[Modality.TEXT_ENCODER] model_image = models[Modality.IMAGE_ENCODER] processor_text = processors[Modality.TEXT_ENCODER] @@ -190,19 +190,19 @@ def test_onnx_one_embedding(model_name: str, device: str): image_data = processor_image(image) text_data = processor_text(text) - image_features, image_embedding = model_image(image_data) - text_features, text_embedding = model_text(text_data) + image_features, image_embedding = model_image.encode(image_data) + text_features, text_embedding 
= model_text.encode(text_data) assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" # Nested fucntions are easier to debug, than lambdas def get_image_embedding(image_data): - features, embedding = model_image(processor_image(image_data)) + features, embedding = model_image.encode(processor_image(image_data)) return embedding def get_text_embedding(text_data): - features, embedding = model_text(processor_text(text_data)) + features, embedding = model_text.encode(processor_text(text_data)) return embedding # Test if the model outputs actually make sense @@ -222,7 +222,7 @@ def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): try: - processors, models = get_model_onnx(model_name, token=token, device=device) + processors, models = get_model(model_name, token=token, device=device, backend="onnx") model_text = models[Modality.TEXT_ENCODER] model_image = models[Modality.IMAGE_ENCODER] processor_text = processors[Modality.TEXT_ENCODER] @@ -235,8 +235,8 @@ def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): image_data = processor_image(images) text_data = processor_text(texts) - image_embeddings = model_image(image_data, return_features=False) - text_embeddings = model_text(text_data, return_features=False) + image_embeddings = model_image.encode(image_data, return_features=False) + text_embeddings = model_text.encode(text_data, return_features=False) assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 841440f..2be45ed 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -84,10 +84,11 @@ def get_checkpoint( return config_path, modality_paths, tokenizer_path -def get_model( +def get_model_torch( model_name: str, *, token: Optional[str] = None, + device: Literal["cpu", "cuda"] = "cpu", modalities: Optional[Tuple[Union[str, Modality]]] = None, ) -> Tuple[Dict[Modality, Callable], Dict]: from uform.torch_encoders import TextEncoder, ImageEncoder @@ -101,13 +102,15 @@ def get_model( if Modality.TEXT_ENCODER in modalities: processor = TextProcessor(config_path, tokenizer_path) - encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER)).eval() + encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER)) + encoder = encoder.eval().to(device) result_processors[Modality.TEXT_ENCODER] = processor result_models[Modality.TEXT_ENCODER] = encoder if Modality.IMAGE_ENCODER in modalities: processor = ImageProcessor(config_path) - encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER)).eval() + encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER)) + encoder = encoder.eval().to(device) result_processors[Modality.IMAGE_ENCODER] = processor result_models[Modality.IMAGE_ENCODER] = encoder @@ -143,3 +146,20 @@ def get_model_onnx( result_models[Modality.IMAGE_ENCODER] = encoder return result_processors, result_models + + +def get_model( + model_name: str, + *, + device: Literal["cpu", "cuda"] = "cpu", # change this if you have a GPU + backend: Literal["onnx", "torch"] = "onnx", # lighter = better + modalities: Optional[Tuple[str, Modality]] = None, # all by default + token: Optional[str] = None, # optional HuggingFace Hub token for 
private models +) -> Tuple[Dict[Modality, Callable], Dict]: + + if backend == "onnx": + return get_model_onnx(model_name, device=device, token=token, modalities=modalities) + elif backend == "torch": + return get_model_torch(model_name, device=device, token=token, modalities=modalities) + else: + raise ValueError(f"Unknown backend: {backend}") diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index 027bc0d..3782c26 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -1,5 +1,5 @@ from os import PathLike -from typing import Dict, List, Union +from typing import Dict, List, Union, Sequence import json from PIL.Image import Image, BICUBIC @@ -23,7 +23,7 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike): self._tokenizer.no_padding() self._pad_token_idx = config["padding_idx"] - def __call__(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]: + def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]: """Transforms one or more strings into dictionary with tokenized strings and attention masks. :param texts: text of list of texts to tokenizer @@ -75,13 +75,13 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None): self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None] self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None] - def __call__(self, images: Union[Image, List[Image]]) -> np.ndarray: + def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray: """Transforms one or more Pillow images into Torch Tensors. :param images: image or list of images to preprocess """ - if isinstance(images, list): + if isinstance(images, Sequence): batch_images = np.empty( (len(images), 3, self._image_size, self._image_size), dtype=np.float32, diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index a6f27d3..0b88473 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -81,7 +81,7 @@ def __init__( providers=available_providers(device), ) - def __call__( + def encode( self, images: ndarray, return_features: Optional[bool] = None ) -> Union[ndarray, Tuple[ndarray, ndarray]]: features, embeddings = self.session.run(None, {"images": images}) @@ -114,7 +114,7 @@ def __init__( providers=available_providers(device), ) - def __call__( + def encode( self, x: Union[ndarray, dict], attention_mask: Optional[ndarray] = None, diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index 0504a74..1120926 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from os import PathLike -from typing import Dict, Optional, Union, Mapping, Any +from typing import Dict, Optional, Union, Mapping, Any, Tuple import json import torch @@ -256,7 +256,8 @@ def forward( x: Union[Tensor, dict], attention_mask: Optional[Tensor] = None, return_features: Optional[bool] = None, - ) -> Tensor: + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + if isinstance(x, dict): assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" attention_mask = x["attention_mask"] @@ -273,6 +274,19 @@ def forward( return features, embeddings return embeddings + def encode( + self, + x: Union[Tensor, dict], + attention_mask: Optional[Tensor] = None, + return_features: Optional[bool] = None, + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + + result = self.forward(x, attention_mask, 
return_features) + if isinstance(result, tuple): + return result[0].detach(), result[1].detach() + else: + return result.detach() + @staticmethod def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> TextEncoder: """Load the image encoder from the given configuration and model path. @@ -361,6 +375,13 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return features, embeddings return embeddings + def encode(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: + result = self.forward(x, return_features) + if isinstance(result, tuple): + return result[0].detach(), result[1].detach() + else: + return result.detach() + @staticmethod def from_pretrained( config: Union[PathLike, str, object], diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py index 32697ca..b61b224 100644 --- a/python/uform/torch_processors.py +++ b/python/uform/torch_processors.py @@ -1,5 +1,5 @@ from os import PathLike -from typing import Dict, List, Union +from typing import Dict, List, Union, Sequence import json import torch @@ -100,14 +100,14 @@ def __init__(self, config_path: PathLike): ], ) - def __call__(self, images: Union[Image, List[Image]]) -> Dict[str, Tensor]: + def __call__(self, images: Union[Image, Sequence[Image]]) -> Dict[str, Tensor]: """Transforms one or more Pillow images into Torch Tensors. :param images: image or list of images to preprocess :return: dictionary with float-represented images in tensors as values """ - if isinstance(images, list): + if isinstance(images, Sequence): batch_images = torch.empty( (len(images), 3, self._image_size, self._image_size), dtype=torch.float32, diff --git a/swift/Encoders.swift b/swift/Encoders.swift index 3582e91..2f1e7c1 100644 --- a/swift/Encoders.swift +++ b/swift/Encoders.swift @@ -129,11 +129,13 @@ public class TextEncoder { ) let configPath = modelURL.appendingPathComponent("config.json").path let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path - self.model = try readModel(fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true)) + self.model = try readModel( + fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true) + ) self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model) } - public func forward(with text: String) throws -> Embedding { + public func encode(with text: String) throws -> Embedding { let inputFeatureProvider = try self.processor.preprocess(text) let prediction = try self.model.prediction(from: inputFeatureProvider) guard let predictionFeature = prediction.featureValue(for: "embeddings"), @@ -164,11 +166,13 @@ public class ImageEncoder { let repo = Hub.Repo(id: modelName) let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"]) let configPath = modelURL.appendingPathComponent("config.json").path - self.model = try readModel(fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true)) + self.model = try readModel( + fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true) + ) self.processor = try ImageProcessor(configPath: configPath) } - public func forward(with image: CGImage) throws -> Embedding { + public func encode(with image: CGImage) throws -> Embedding { let inputFeatureProvider = try self.processor.preprocess(image) let prediction = try self.model.prediction(from: inputFeatureProvider) 
guard let predictionFeature = prediction.featureValue(for: "embeddings"), diff --git a/swift/EncodersTests.swift b/swift/EncodersTests.swift index 0395a29..645d531 100644 --- a/swift/EncodersTests.swift +++ b/swift/EncodersTests.swift @@ -55,7 +55,7 @@ final class TokenizerTests: XCTestCase { var textEmbeddings: [[Float32]] = [] for text in texts { - let embedding: [Float32] = try textModel.forward(with: text).asFloats() + let embedding: [Float32] = try textModel.encode(text).asFloats() textEmbeddings.append(embedding) } @@ -141,9 +141,9 @@ final class TokenizerTests: XCTestCase { ) } - let textEmbedding: [Float32] = try textModel.forward(with: text).asFloats() + let textEmbedding: [Float32] = try textModel.encode(text).asFloats() textEmbeddings.append(textEmbedding) - let imageEmbedding: [Float32] = try imageModel.forward(with: cgImage).asFloats() + let imageEmbedding: [Float32] = try imageModel.encode(cgImage).asFloats() imageEmbeddings.append(imageEmbedding) } diff --git a/swift/README.md b/swift/README.md index 66b531f..8fa0eb8 100644 --- a/swift/README.md +++ b/swift/README.md @@ -1,4 +1,4 @@ -# UForm for Swift +# UForm Swift SDK UForm offers first-party support for Swift. To get started, add UForm to your project using Swift Package Manager. @@ -21,7 +21,7 @@ import UForm ```swift let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small") let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie." -let textEmbedding: Embedding = try textModel.forward(with: text) +let textEmbedding: Embedding = try textModel.encode(text) let textVector: [Float32] = textEmbedding.asFloats() ``` @@ -36,9 +36,38 @@ guard let url = URL(string: imageURL), throw Exception("Could not load image from URL: \(imageURL)") } -var imageEmbedding: Embedding = try imageModel.forward(with: cgImage) +var imageEmbedding: Embedding = try imageModel.encode(cgImage) var imageVector: [Float32] = embedding.asFloats() ``` +### Computing Distances -### Computing Distances \ No newline at end of file +There are several ways to compute distances between embeddings, once you have them. +Naive Swift code might look like this: + +```swift +func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 { + let dotProduct = zip(a, b).map(*).reduce(0, +) + let normA = sqrt(a.map { $0 * $0 }.reduce(0, +)) + let normB = sqrt(b.map { $0 * $0 }.reduce(0, +)) + return dotProduct / (normA * normB) +} +``` + +A faster way to compute distances is to use the Accelerate framework: + +```swift +import Accelerate + +func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 { + var result: Float32 = 0 + var aNorm: Float32 = 0 + var bNorm: Float32 = 0 + vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count)) + vDSP_svesq(a, 1, &aNorm, vDSP_Length(a.count)) + vDSP_svesq(b, 1, &bNorm, vDSP_Length(b.count)) + return result / sqrt(aNorm * bNorm) +} +``` + +An even faster approach would be to use USearch or SimSIMD, that work not only for `Float32` and `Float64`, but also for `Float16`, `Int8`, and binary embeddings. 
From 3e1e57664a71290a57ff8e115d8cfea2fa6c501e Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 11:56:40 -0700 Subject: [PATCH 24/40] Improve: Error handling in Swift --- swift/Encoders.swift | 264 ++++++++++++++++++++++++++++--------------- 1 file changed, 174 insertions(+), 90 deletions(-) diff --git a/swift/Encoders.swift b/swift/Encoders.swift index 3582e91..17da36a 100644 --- a/swift/Encoders.swift +++ b/swift/Encoders.swift @@ -11,21 +11,25 @@ import Foundation import Hub // `Config` import Tokenizers // `AutoTokenizer` +/// Defines custom errors related to the encoder's functionality. enum EncoderError: Error { - case configLoadingError(String) - case modelLoadingError(String) - case unsupportedDataType - case invalidInput - case unsupportedShapeConstraint + case downloadError(String) + case loadingError(String) + case invalidInput(String) case modelPredictionFailed(String) + case unknownError(String) } +/// Represents different types of embeddings as arrays of different numeric types. public enum Embedding { case i32s([Int32]) case f16s([Float16]) case f32s([Float32]) case f64s([Float64]) + /// Initializes an embedding from a `MLMultiArray`. + /// - Parameter multiArray: The MLMultiArray to convert into an Embedding. + /// - Returns: nil if the data type is unsupported. init?(from multiArray: MLMultiArray) { switch multiArray.dataType { case .float64: @@ -65,51 +69,57 @@ public enum Embedding { ) ) @unknown default: - return nil // return nil for unsupported data types + return nil } } + /// Converts the embedding to an array of `Float`. public func asFloats() -> [Float] { switch self { - case .f32s(let array): - return array - case .i32s(let array): - return array.map { Float($0) } - case .f16s(let array): - return array.map { Float($0) } - case .f64s(let array): - return array.map { Float($0) } + case .f32s(let array): return array + case .i32s(let array): return array.map(Float.init) + case .f16s(let array): return array.map(Float.init) + case .f64s(let array): return array.map(Float.init) } } } -// MARK: - Helpers - +/// Provides methods for reading and handling configurations and models. +/// - Parameter path: The file path where the configuration file is located. +/// - Returns: A dictionary containing the configuration data. func readConfig(fromPath path: String) throws -> [String: Any] { - // If it's not an absolute path, let's assume it's a path relative to the current working directory let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path let data = try Data(contentsOf: URL(fileURLWithPath: absPath)) return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any] } +/// Compiles and loads a machine learning model from a URL. +/// - Parameter modelURL: The URL where the model package is located. +/// - Returns: An instance of `MLModel`. func readModel(fromURL modelURL: URL) throws -> MLModel { let compiledModelURL = try MLModel.compileModel(at: modelURL) return try MLModel(contentsOf: compiledModelURL) } +/// Loads a machine learning model from a local file path. +/// - Parameter path: The file path where the model file is located. +/// - Returns: An instance of `MLModel`. func readModel(fromPath path: String) throws -> MLModel { - // If it's not an absolute path, let's assume it's a path relative to the current working directory let absPath = path.hasPrefix("/") ? 
path : FileManager.default.currentDirectoryPath + "/" + path let modelURL = URL(fileURLWithPath: absPath, isDirectory: true) return try readModel(fromURL: modelURL) } -// MARK: - Encoders - +/// Encodes text input into embeddings using a machine learning model. public class TextEncoder { let model: MLModel let processor: TextProcessor + /// Initializes a `TextEncoder` using paths for the model and configuration. + /// - Parameters: + /// - modelPath: The path to the directory containing the machine learning model. + /// - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory. + /// - tokenizerPath: Optional. The path to the tokenizer file. Defaults to tokenizer.json in the model directory. public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws { let finalConfigPath = configPath ?? modelPath + "/config.json" let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json" @@ -121,6 +131,10 @@ public class TextEncoder { ) } + /// Initializes a `TextEncoder` using a model name and an API for fetching models. + /// - Parameters: + /// - modelName: The identifier for the model repository. + /// - hubApi: The API object to interact with the model hub. Defaults to a shared instance. public init(modelName: String, hubApi: HubApi = .shared) async throws { let repo = Hub.Repo(id: modelName) let modelURL = try await hubApi.snapshot( @@ -129,57 +143,68 @@ public class TextEncoder { ) let configPath = modelURL.appendingPathComponent("config.json").path let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path - self.model = try readModel(fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true)) + self.model = try readModel( + fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true) + ) self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model) } + /// Processes text and returns embeddings. Throws an error if processing fails. + /// - Parameter text: The text input to encode. + /// - Returns: An `Embedding` object containing the model output. public func forward(with text: String) throws -> Embedding { let inputFeatureProvider = try self.processor.preprocess(text) - let prediction = try self.model.prediction(from: inputFeatureProvider) - guard let predictionFeature = prediction.featureValue(for: "embeddings"), + guard let prediction = try? self.model.prediction(from: inputFeatureProvider), + let predictionFeature = prediction.featureValue(for: "embeddings"), let output = predictionFeature.multiArrayValue, let embedding = Embedding(from: output) else { - throw NSError( - domain: "TextEncoder", - code: 0, - userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."] - ) + throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.") } return embedding } } +/// Encodes image input into embeddings using a machine learning model. public class ImageEncoder { let model: MLModel let processor: ImageProcessor + /// Initializes an `ImageEncoder` using a path for the model and optionally a configuration file. + /// - Parameters: + /// - modelPath: The path to the directory containing the machine learning model. + /// - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory. public init(modelPath: String, configPath: String? = nil) throws { let finalConfigPath = configPath ?? 
modelPath + "/config.json" self.model = try readModel(fromPath: modelPath) self.processor = try ImageProcessor(configPath: finalConfigPath) } + /// Initializes an `ImageEncoder` using a model name and an API for fetching models. + /// - Parameters: + /// - modelName: The identifier for the model repository. + /// - hubApi: The API object to interact with the model hub. Defaults to a shared instance. public init(modelName: String, hubApi: HubApi = .shared) async throws { let repo = Hub.Repo(id: modelName) let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"]) let configPath = modelURL.appendingPathComponent("config.json").path - self.model = try readModel(fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true)) + self.model = try readModel( + fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true) + ) self.processor = try ImageProcessor(configPath: configPath) } + /// Processes an image and returns embeddings. Throws an error if processing fails. + /// - Parameter image: The `CGImage` to encode. + /// - Returns: An `Embedding` object containing the model output. public func forward(with image: CGImage) throws -> Embedding { let inputFeatureProvider = try self.processor.preprocess(image) - let prediction = try self.model.prediction(from: inputFeatureProvider) - guard let predictionFeature = prediction.featureValue(for: "embeddings"), + guard let prediction = try? self.model.prediction(from: inputFeatureProvider), + let predictionFeature = prediction.featureValue(for: "embeddings"), let output = predictionFeature.multiArrayValue, let embedding = Embedding(from: output) else { - throw NSError( - domain: "ImageEncoder", - code: 0, - userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."] - ) + throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.") } return embedding } @@ -187,11 +212,18 @@ public class ImageEncoder { // MARK: - Processors +/// Handles the preprocessing of text data to be used by a machine learning model. class TextProcessor { let tokenizer: Tokenizer let minContextLength: Int let maxContextLength: Int + /// Initializes a `TextProcessor` with specific configuration. + /// - Parameters: + /// - configPath: The path to the configuration file specifying tokenizer and model configurations. + /// - tokenizerPath: The path to the tokenizer configuration. + /// - model: The machine learning model to be used with this processor. + /// - Throws: An error if the configuration is invalid or missing necessary components. public init(configPath: String, tokenizerPath: String, model: MLModel) throws { var configDict = try readConfig(fromPath: configPath) let tokenizerDict = try readConfig(fromPath: tokenizerPath) @@ -201,60 +233,101 @@ class TextProcessor { configDict = textEncoderConfig // Use the specific 'text_encoder' configuration } + // Initialize the tokenizer with its configuration. let config = Config(configDict) let tokenizerData = Config(tokenizerDict) self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData) - let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"] - guard let shapeConstraint = inputDescription?.multiArrayConstraint?.shapeConstraint else { - fatalError("Cannot obtain shape information") + // Extract the model's input shape constraints. 
+ guard let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"], + let multiArrayConstraint = inputDescription.multiArrayConstraint + else { + throw EncoderError.invalidInput("Cannot obtain shape information from the model.") } + // Determine the context length constraints based on the model's input shape constraint. + let shapeConstraint = multiArrayConstraint.shapeConstraint switch shapeConstraint.type { case .enumerated: minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue maxContextLength = minContextLength case .range: - let range = inputDescription?.multiArrayConstraint?.shapeConstraint.sizeRangeForDimension[1] as? NSRange - minContextLength = range?.location ?? 1 - maxContextLength = range?.length ?? 128 + guard let range = shapeConstraint.sizeRangeForDimension[1] as? NSRange else { + throw EncoderError.unknownError("Model input shape has a range constraint that cannot be interpreted.") + } + minContextLength = range.location + maxContextLength = range.length case .unspecified: - minContextLength = 128 - maxContextLength = 128 + throw EncoderError.unknownError("Model input shape is unspecified.") @unknown default: - minContextLength = 128 - maxContextLength = 128 + throw EncoderError.unknownError("Unknown model input shape constraint type.") } } + /// Preprocesses a string of text into a format suitable for model prediction. + /// - Parameter text: The text to preprocess. + /// - Returns: A `MLFeatureProvider` containing the processed text ready for the model. + /// - Throws: An error if the text encoding fails. public func preprocess(_ text: String) throws -> MLFeatureProvider { let inputIDs = self.tokenizer.encode(text: text) return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength) } } +/// Handles the preprocessing of image data to be used by a machine learning model. class ImageProcessor { let imageSize: Int - let mean: [Float] = [0.485, 0.456, 0.406] // Common mean values for normalization - let std: [Float] = [0.229, 0.224, 0.225] // Common std values for normalization + let mean: [Float] + let std: [Float] + /// Initializes an `ImageProcessor` with specific configuration. + /// - Parameter configPath: The path to the configuration file specifying image size, mean, and std. init(configPath: String) throws { - var configDict = try readConfig(fromPath: configPath) - // Check if there's a specific 'image_encoder' configuration within the main configuration - if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] { - configDict = imageEncoderConfig + let configDict = try readConfig(fromPath: configPath) + guard let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] else { + throw EncoderError.loadingError("Image encoder configuration is missing.") } - let config = Config(configDict) - self.imageSize = config.imageSize!.intValue! + guard let imageSize = imageEncoderConfig["imageSize"] as? Int else { + throw EncoderError.invalidInput("Invalid or missing image size.") + } + self.imageSize = imageSize + + guard let meanArray = imageEncoderConfig["normalizationMeans"] as? [Any], + let stdArray = imageEncoderConfig["normalizationDeviations"] as? [Any] + else { + throw EncoderError.invalidInput("Normalization means or deviations are missing.") + } + + self.mean = try meanArray.compactMap({ + guard let floatValue = $0 as? 
Float else { + throw EncoderError.invalidInput("Normalization means should be an array of floats.") + } + return floatValue + }) + + self.std = try stdArray.compactMap({ + guard let floatValue = $0 as? Float else { + throw EncoderError.invalidInput("Normalization deviations should be an array of floats.") + } + return floatValue + }) + + // Check if the arrays have 3 values for the 3 channels + if self.mean.count != 3 || self.std.count != 3 { + throw EncoderError.invalidInput("Normalization means should contain 3 values.") + } } + /// Preprocesses a `CGImage` into a format suitable for model prediction. + /// - Parameter cgImage: The image to preprocess. + /// - Returns: An `MLFeatureProvider` containing the preprocessed image data. func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider { - // Populate a tensor of size 3 x `imageSize` x `imageSize`, - // by resizing the image, then performing a center crop. - // Then normalize with the `mean` and `std` and export as a provider. - let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize)! - let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)! + guard let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize), + let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std) + else { + throw EncoderError.invalidInput("Image preprocessing failed.") + } let featureValue = MLFeatureValue(multiArray: normalized) return try ImageInput(precomputedFeature: featureValue) } @@ -263,7 +336,6 @@ class ImageProcessor { let originalWidth = CGFloat(image.width) let originalHeight = CGFloat(image.height) - // Calculate new size preserving the aspect ratio let widthRatio = CGFloat(imageSize) / originalWidth let heightRatio = CGFloat(imageSize) / originalHeight let scaleFactor = max(widthRatio, heightRatio) @@ -271,7 +343,6 @@ class ImageProcessor { let scaledWidth = originalWidth * scaleFactor let scaledHeight = originalHeight * scaleFactor - // Calculate the crop rectangle let dx = (scaledWidth - CGFloat(imageSize)) / 2.0 let dy = (scaledHeight - CGFloat(imageSize)) / 2.0 guard @@ -299,18 +370,19 @@ class ImageProcessor { // Prepare the bitmap context for drawing the image. 
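        // The context renders the image into an RGBA8 buffer with premultiplied alpha,
        // which is why `pixelData` reserves `width * height * 4` bytes (4 channels per pixel).
        // Only the three color channels are used afterwards, when the pixels are normalized
        // with the per-channel `mean` and `std` values.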
var pixelData = [UInt8](repeating: 0, count: width * height * 4) let colorSpace = CGColorSpaceCreateDeviceRGB() - let context = CGContext( - data: &pixelData, - width: width, - height: height, - bitsPerComponent: 8, - bytesPerRow: 4 * width, - space: colorSpace, - bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue - ) - context?.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) + guard + let context = CGContext( + data: &pixelData, + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: 4 * width, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue + ) + else { return nil } + context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) - // Normalize the pixel data var floatPixels = [Float](repeating: 0, count: width * height * 3) for c in 0 ..< 3 { for i in 0 ..< (width * height) { @@ -318,33 +390,36 @@ class ImageProcessor { } } - // Create the tensor array - var tensor = [Float](repeating: 0, count: 3 * width * height) - for i in 0 ..< (width * height) { - for c in 0 ..< 3 { - tensor[c * width * height + i] = floatPixels[i * 3 + c] + // We need to wrap the constructor that may fail + do { + let tensor = try MLMultiArray( + shape: [1, 3, NSNumber(value: height), NSNumber(value: width)], + dataType: .float32 + ) + for i in 0 ..< floatPixels.count { + tensor[i] = NSNumber(value: floatPixels[i]) } + return tensor } - - let multiArray = try? MLMultiArray( - shape: [1, 3, NSNumber(value: height), NSNumber(value: width)], - dataType: .float32 - ) - for i in 0 ..< tensor.count { - multiArray?[i] = NSNumber(value: tensor[i]) + catch { + return nil } - return multiArray } - } // MARK: - Feature Providers +/// Provides features for text input to a machine learning model, handling padding and attention mask generation. class TextInput: MLFeatureProvider { var inputIDs: [Int] var sequenceLength: Int var paddingID: Int + /// Initializes a new instance for providing text input features. + /// - Parameters: + /// - inputIDs: Array of integer IDs representing the encoded text. + /// - sequenceLength: The fixed length to which the input sequence should be padded. + /// - paddingID: The integer ID used for padding shorter sequences. Defaults to 0. init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) { self.inputIDs = inputIDs self.sequenceLength = sequenceLength @@ -355,8 +430,9 @@ class TextInput: MLFeatureProvider { return Set(["input_ids", "attention_mask"]) } - // The model expects the input IDs to be an array of integers - // of length `sequenceLength`, padded with `paddingID` if necessary + /// Returns the feature value for the specified feature name. + /// - Parameter featureName: The name of the feature for which the value is requested. + /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature. func featureValue(for featureName: String) -> MLFeatureValue? { switch featureName { case "input_ids", "attention_mask": @@ -366,6 +442,9 @@ class TextInput: MLFeatureProvider { } } + /// Creates the feature value for input IDs or attention mask based on the specified feature name. + /// - Parameter featureName: The name of the feature. + /// - Returns: An `MLFeatureValue` if the array can be created, otherwise nil. private func createFeatureValue(for featureName: String) -> MLFeatureValue? 
{ let count = min(inputIDs.count, sequenceLength) let totalElements = sequenceLength @@ -394,9 +473,13 @@ class TextInput: MLFeatureProvider { } } +/// Provides a precomputed feature for image inputs to a machine learning model. class ImageInput: MLFeatureProvider { var precomputedFeature: MLFeatureValue + /// Initializes a new instance with a precomputed feature. + /// - Parameter precomputedFeature: The `MLFeatureValue` containing the precomputed feature data. + /// - Throws: An error if the precomputed feature is not valid for the model. init(precomputedFeature: MLFeatureValue) throws { self.precomputedFeature = precomputedFeature } @@ -405,8 +488,9 @@ class ImageInput: MLFeatureProvider { return Set(["images"]) } - // The model expects the input IDs to be an array of integers - // of length `sequenceLength`, padded with `paddingID` if necessary + /// Returns the feature value for the specified feature name. + /// - Parameter featureName: The name of the feature for which the value is requested. + /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature. func featureValue(for featureName: String) -> MLFeatureValue? { switch featureName { case "images": From f8654b50204c1c5878be68cf3d735b82ba19a9a5 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 15:23:26 -0700 Subject: [PATCH 25/40] Improve: Image pre-processing in Swift --- swift/Encoders.swift | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/swift/Encoders.swift b/swift/Encoders.swift index a1c3ce2..509ad11 100644 --- a/swift/Encoders.swift +++ b/swift/Encoders.swift @@ -283,34 +283,35 @@ class ImageProcessor { /// Initializes an `ImageProcessor` with specific configuration. /// - Parameter configPath: The path to the configuration file specifying image size, mean, and std. init(configPath: String) throws { - let configDict = try readConfig(fromPath: configPath) - guard let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] else { - throw EncoderError.loadingError("Image encoder configuration is missing.") + var configDict = try readConfig(fromPath: configPath) + if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] { + configDict = imageEncoderConfig } - guard let imageSize = imageEncoderConfig["imageSize"] as? Int else { + let config = Config(configDict) + guard let imageSize = config.imageSize?.value as? Int else { throw EncoderError.invalidInput("Invalid or missing image size.") } self.imageSize = imageSize - guard let meanArray = imageEncoderConfig["normalizationMeans"] as? [Any], - let stdArray = imageEncoderConfig["normalizationDeviations"] as? [Any] + guard let meanArray = config.normalizationMeans?.value as? [Any], + let stdArray = config.normalizationDeviations?.value as? [Any] else { throw EncoderError.invalidInput("Normalization means or deviations are missing.") } self.mean = try meanArray.compactMap({ - guard let floatValue = $0 as? Float else { + guard let doubleValue = $0 as? Double else { throw EncoderError.invalidInput("Normalization means should be an array of floats.") } - return floatValue + return Float(doubleValue) }) self.std = try stdArray.compactMap({ - guard let floatValue = $0 as? Float else { + guard let doubleValue = $0 as? 
Double else { throw EncoderError.invalidInput("Normalization deviations should be an array of floats.") } - return floatValue + return Float(doubleValue) }) // Check if the arrays have 3 values for the 3 channels @@ -383,11 +384,13 @@ class ImageProcessor { else { return nil } context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) - var floatPixels = [Float](repeating: 0, count: width * height * 3) - for c in 0 ..< 3 { - for i in 0 ..< (width * height) { - floatPixels[i * 3 + c] = (Float(pixelData[i * 4 + c]) / 255.0 - mean[c]) / std[c] - } + // While normalizing the pixels, let's also transpose them from HWC to CHW + let channelSize = width * height + var floatPixels = [Float](repeating: 0, count: channelSize * 3) + for i in 0 ..< channelSize { + floatPixels[channelSize * 0 + i] = (Float(pixelData[i * 4 + 0]) / 255.0 - mean[0]) / std[0] + floatPixels[channelSize * 1 + i] = (Float(pixelData[i * 4 + 1]) / 255.0 - mean[1]) / std[1] + floatPixels[channelSize * 2 + i] = (Float(pixelData[i * 4 + 2]) / 255.0 - mean[2]) / std[2] } // We need to wrap the constructor that may fail From 37d7f52b863bebf2cae27001e2f53c0cbb860191 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 22:40:01 +0000 Subject: [PATCH 26/40] Improve: Hide temporary files --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fd8d9d0..fc16361 100755 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,7 @@ test src/__pycache__ src/test.py build/ -package-lock.json \ No newline at end of file +package-lock.json + +dictionary* +vocab* \ No newline at end of file From 67b083f09bf98675396f1b7cd462a93e18019986 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:32:22 +0000 Subject: [PATCH 27/40] Improve: Pretty-print benchmarks --- python/scripts/bench_encoders.py | 52 ++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py index 6b59d05..a8adb91 100644 --- a/python/scripts/bench_encoders.py +++ b/python/scripts/bench_encoders.py @@ -22,13 +22,13 @@ from dataclasses import dataclass from typing import List, Tuple, Literal, Callable, Generator import re +import argparse -import fire import requests from PIL import Image import pandas as pd -from uform import get_model, get_model_onnx, Modality +from uform import get_model, Modality, ExecutionProviderError # Define global constants for the hardware availability torch_available = False @@ -195,27 +195,55 @@ def run(model_name: str, device: str, backend_name: str): ), partial(run, model_name, device, backend_name) -def main(filter: str = None): +def main(filter_out: str = None): results = [] - filter_pattern = re.compile(filter) if filter else None + filter_pattern = re.compile(filter_out) if filter_out else None for specs, func in yield_benchmarks(): if filter_pattern and ( - not filter_pattern.search(specs.model_name) - and not filter_pattern.search(specs.backend_name) - and not filter_pattern.search(specs.device_name) + filter_pattern.search(specs.model_name) + or filter_pattern.search(specs.backend_name) + or filter_pattern.search(specs.device_name) ): continue - print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend") - result = func() - results.append(result) + try: + print(f"Running `{specs.model_name}` on `{specs.device_name}` 
using `{specs.backend_name}` backend") + result = func() + results.append(result) + except ExecutionProviderError as e: + print(f"- skipping missing backend") + print(e) results = sorted(results, key=lambda x: x.model_name) results = [x.__dict__ for x in results] df = pd.DataFrame(results) - print(df.to_markdown()) + df.columns = [ + "Model Name", + "Device", + "Backend", + "Images Preprocessed/s", + "Images Encoded/s", + "Texts Preprocessed/s", + "Texts Encoded/s", + ] + + def inverse(x): + return 1 / x if x != 0 else 0 + + # Apply number formatting directly in the DataFrame + formatted_df = df.copy() + formatted_df["Images Preprocessed/s"] = df["Images Preprocessed/s"].map(inverse).map("{:,.2f}".format) + formatted_df["Images Encoded/s"] = df["Images Encoded/s"].map(inverse).map("{:,.2f}".format) + formatted_df["Texts Preprocessed/s"] = df["Texts Preprocessed/s"].map(inverse).map("{:,.2f}".format) + formatted_df["Texts Encoded/s"] = df["Texts Encoded/s"].map(inverse).map("{:,.2f}".format) + + # Convert formatted DataFrame to Markdown + print(formatted_df.to_markdown()) if __name__ == "__main__": - fire.Fire(main) + argparse = argparse.ArgumentParser() + argparse.add_argument("--filter-out", type=str, default=None) + args = argparse.parse_args() + main(filter_out=args.filter_out) From 8e38b2e8005728e92768fb26cea76a2689542203 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:35:26 +0000 Subject: [PATCH 28/40] Make: Add development dependencies --- CONTRIBUTING.md | 17 ++++++++++++++--- pyproject.toml | 1 + 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ceafee9..65e0b26 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,12 +7,11 @@ We welcome contributions to UForm! Before submitting any changes, please make sure that the tests pass. ```sh -pip install -e . # For core dependencies - +pip install -e ".[dev]" # For development dependencies pip install -e ".[torch]" # For PyTorch pip install -e ".[onnx]" # For ONNX on CPU pip install -e ".[onnx-gpu]" # For ONNX on GPU, available for some platforms -pip install -e ".[torch,onnx]" # For PyTorch and ONNX Python tests +pip install -e ".[torch,onnx,onnx-gpu,dev]" # For all pytest python/scripts/ -s -x -Wd -v pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch @@ -53,3 +52,15 @@ Before submitting any changes, please make sure that the tests pass. npm install npm run test ``` + +## Benchmarking + +If you want to double check, how fast the model may work on your hardware, you can clone the library and repeat the benchmarks locally. +The following benchmark will exclude PyTorch backend, CUDA-capable devices, and all the `-base` and `-large` models, running only the ONNX benchmarks on the CPU. 
+ +```sh +git clone https://github.com/unum-cloud/uform --depth 1 # Clone the repository +cd uform && pip install -e ".[torch,onnx,onnx-gpu,dev]" # Install all dependencies +python python/scripts/bench_encoders.py --filter-out "torch|cuda|base|large" +``` + diff --git a/pyproject.toml b/pyproject.toml index 1a84808..fef02d3 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ uform-chat = "uform.chat:main" torch = ["torch>=1.13.1", "torchvision", "transformers>=4.36.2"] onnx = ["onnx>=1.15.0", "onnxruntime>=1.17.1", "numpy"] onnx-gpu = ["onnx>=1.15.0", "onnxruntime-gpu>=1.17.1", "numpy"] +dev = ["pytest", "pandas"] [project.urls] "Homepage" = "https://github.com/unum-cloud/uform" From 96df21d3a33f703a641596fdb80a6cca9cbac15b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:52:56 +0000 Subject: [PATCH 29/40] Improve: Reduce warnings --- python/uform/__init__.py | 11 +++++++++-- python/uform/onnx_encoders.py | 10 +++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 2be45ed..8f0a30b 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,9 +1,10 @@ -from json import load from os.path import join, exists from typing import Dict, Optional, Tuple, Literal, Union, Callable from enum import Enum -from huggingface_hub import snapshot_download +from huggingface_hub import snapshot_download, utils + +from uform.onnx_encoders import ExecutionProviderError class Modality(Enum): @@ -44,6 +45,9 @@ def get_checkpoint( config_names = ["torch_config.json", "config.json"] tokenizer_names = ["tokenizer.json"] + old_progress_behavior = utils.are_progress_bars_disabled() + utils.disable_progress_bars() + # The download stats depend on the number of times the `config.json` is pulled # https://huggingface.co/docs/hub/models-download-stats model_path = snapshot_download( @@ -52,6 +56,9 @@ def get_checkpoint( allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names, ) + if old_progress_behavior: + utils.enable_progress_bars() + # Find the first name in `config_names` that is present config_path = None for config_name in config_names: diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index 0b88473..d2668b9 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -39,7 +39,7 @@ def available_providers(device: Optional[str]) -> Tuple[str, ...]: raise ExecutionProviderError( f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}" ) - return gpu_providers + return [x for x in gpu_providers if x in available] # If a CPU is requested, but no CPU providers are available, raise an error if device == "cpu": @@ -47,7 +47,7 @@ def available_providers(device: Optional[str]) -> Tuple[str, ...]: raise ExecutionProviderError( f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. 
Currently installed: {available}" ) - return cpu_providers + return [x for x in cpu_providers if x in available] if device not in available: available_providers = ", ".join(available) @@ -128,7 +128,11 @@ def encode( input_ids = x features, embeddings = self.text_encoder_session.run( - None, {"input_ids": input_ids, "attention_mask": attention_mask} + None, + { + "input_ids": input_ids, + "attention_mask": attention_mask, + }, ) return_features = return_features if return_features is not None else self.return_features From 91c86a1cb62c0d5f6b2573014401167e307f86db Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 23 Apr 2024 23:53:29 +0000 Subject: [PATCH 30/40] Improve: Move inputs to same device as model --- python/uform/torch_encoders.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index 1120926..ed413a8 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -266,6 +266,11 @@ def forward( # If no attention mask is provided - create one with all ones attention_mask = torch.ones_like(x) + # If the model is on the GPU and the input matrices are not, shift them there + if next(self.parameters()).device.type == "cuda" and x.device.type != "cuda": + x = x.cuda() + attention_mask = attention_mask.cuda() + features = self.forward_features(x, attention_mask) embeddings = self.forward_embedding(features, attention_mask) @@ -368,6 +373,11 @@ def forward_embedding(self, x: Tensor) -> Tensor: def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: if isinstance(x, dict): x = x["images"] + + # If the model is on the GPU and the input matrices are not, shift them there + if next(self.parameters()).device.type == "cuda" and x.device.type != "cuda": + x = x.cuda() + features = self.forward_features(x) embeddings = self.forward_embedding(features) return_features = return_features if return_features is not None else self.return_features From 6d5f1ce739178f91774cda22b662b4395916bb62 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 05:17:47 +0000 Subject: [PATCH 31/40] Docs: Reorganize --- BENCHMARKS.md | 160 ++++++++ README.md | 313 +-------------- python/README.md | 51 +++ python/scripts/bench_encoders.py | 57 ++- python/scripts/export_decoders.ipynb | 567 +-------------------------- python/uform/torch_encoders.py | 5 +- 6 files changed, 269 insertions(+), 884 deletions(-) create mode 100644 BENCHMARKS.md diff --git a/BENCHMARKS.md b/BENCHMARKS.md new file mode 100644 index 0000000..ef78990 --- /dev/null +++ b/BENCHMARKS.md @@ -0,0 +1,160 @@ +# UForm Model Benchmarks + +## Accuracy + +### Embedding Models + +Few retrieval benchmarks exist for multimodal embeddings. +The most famous ones for English are "MS-COCO" and "Flickr30k". +Evaluating `uform-vl-english` model, one can expect the following numbers for search quality. + +| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 | +| :------- | ---------: | ---------: | ----------: | +| Flickr | 0.727 | 0.915 | 0.949 | +| MS-COCO¹ | 0.510 | 0.761 | 0.838 | + +For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository². +Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model. 
+ +| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | +| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: | +| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | +| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | +| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | +| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | +| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | +| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | + + +
+All languages. +
+ +| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | +| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: | +| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | +| Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M | +| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | +| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | +| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | +| German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M | +| Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M | +| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | +| Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M | +| Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M | +| Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M | +| Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M | +| Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M | +| Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M | +| Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M | +| Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M | +| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | +| Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M | +| Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M | +| Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M | +| Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M | +| | | | | | | | | +| Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - | +| Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - | +| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - | +| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - | + +
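
The Recall@K numbers in the tables above can be reproduced with a few lines of NumPy once both modalities are embedded.
The snippet below is only a minimal sketch under simplifying assumptions, not the exact evaluation script: the names are illustrative, and `text_embeddings` / `image_embeddings` are assumed to be row-aligned, L2-normalized matrices where caption `i` describes image `i`.

```python
import numpy as np


def recall_at_k(text_embeddings: np.ndarray, image_embeddings: np.ndarray, k: int) -> float:
    """Fraction of captions whose paired image appears among the top-`k` retrieved images."""
    # Cosine similarity of every caption against every image;
    # both matrices are assumed to be L2-normalized row-wise.
    similarities = text_embeddings @ image_embeddings.T
    # Indices of the `k` highest-scoring images for each caption.
    top_k = np.argsort(-similarities, axis=1)[:, :k]
    # Caption `i` counts as a hit if image `i` is among its top-`k` results.
    hits = (top_k == np.arange(len(text_embeddings))[:, None]).any(axis=1)
    return float(hits.mean())


# recall_at_k(text_embeddings, image_embeddings, 1)   # Recall @ 1
# recall_at_k(text_embeddings, image_embeddings, 10)  # Recall @ 10
```

Datasets like MS-COCO pair several captions with each image, so the real evaluation maps each caption to its image ID rather than relying on row alignment.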
+ +### Generative Models + +| Model | LLM Size | SQA | MME | MMBench | Average¹ | +| :------------------- | -------: | ---: | -----: | ------: | -------: | +| UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 | +| MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 | +| LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 | + +For captioning evaluation we measure CLIPScore and RefCLIPScore³. + +| Model | Size | Caption Length | CLIPScore | RefCLIPScore | +| :---------------------------------- | ---: | -------------: | --------: | -----------: | +| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 | +| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 | +| | +| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 | +| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 | +| | +| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 | +| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 | +| | +| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 | +| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 | + +Results for VQAv2 evaluation. + +| Model | Size | Accuracy | +| :------------------------- | ---: | -------: | +| `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 | +| `unum-cloud/uform-gen` | 1.5B | 66.5 | + +
+ +> ¹ Train split was in training data.
+> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
+> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model. + +## Speed + +UForm comes pre-packaged with speed benchmarks for the models. + +```bash +$ python python/scripts/bench_encoders.py --help +usage: bench_encoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE] + +options: + -h, --help show this help message and exit + --filter-out FILTER_OUT + Filter out models, backends, or devices with a Regular Expression. + --batch-size BATCH_SIZE + Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. +``` + +Running that script for a fairly small batch size of 50 on an Nvidia H100 GPU and + +| Model Name | Device | Backend | Images Preprocessed/s | Images Encoded/s | Texts Preprocessed/s | Texts Encoded/s | +| :--------------------------------------------- | :----- | :------ | --------------------: | :--------------- | :------------------- | :-------------- | +| unum-cloud/uform3-image-text-english-base | cpu | torch | 23.03 | 76.57 | 15,978.03 | 562.28 | +| unum-cloud/uform3-image-text-english-base | cpu | onnx | 23.11 | 77.75 | 13,880.27 | 1,067.40 | +| unum-cloud/uform3-image-text-english-base | cuda | torch | 22.87 | 1,060.40 | 12,348.94 | 13,242.83 | +| unum-cloud/uform3-image-text-english-large | cpu | torch | 22.41 | 10.84 | 13,350.45 | 145.12 | +| unum-cloud/uform3-image-text-english-large | cpu | onnx | 23.13 | 19.60 | 18,031.85 | 960.09 | +| unum-cloud/uform3-image-text-english-large | cuda | torch | 22.78 | 244.86 | 13,226.40 | 10,204.04 | +| unum-cloud/uform3-image-text-english-small | cpu | torch | 20.08 | 71.68 | 12,147.05 | 249.63 | +| unum-cloud/uform3-image-text-english-small | cpu | onnx | 22.84 | 195.27 | 13,636.99 | 1,385.25 | +| unum-cloud/uform3-image-text-english-small | cuda | torch | 22.63 | 2,662.16 | 14,731.18 | 14,694.87 | +| unum-cloud/uform3-image-text-multilingual-base | cpu | torch | 22.98 | 64.28 | 10,129.27 | 209.76 | +| unum-cloud/uform3-image-text-multilingual-base | cpu | onnx | 23.06 | 66.81 | 8,963.13 | 1,104.32 | +| unum-cloud/uform3-image-text-multilingual-base | cuda | torch | 22.88 | 1,051.95 | 15,639.72 | 12,416.12 | + +If you are interested in performance numbers on consumer grade hardware, compared to third-party models, here are some rough estimates. +On Nvidia RTX 3090: + +| Model | Multilingual | Speed | Speedup | +| :----------------------------------------------- | -----------: | ---------------------: | ---------: | +| `bert-base-uncased` | No | 1'612 sequences/second | | +| `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 | +| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 | +| `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ | + +On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. + +| Model | Size | Speed | Speedup | +| :---------------------------------- | ---: | ------------------: | --------: | +| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | | +| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | | +| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ | + +Given the small size of the model it also work well on mobile devices. +On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards. 
+ +| Device | Speed | Device TDP | Efficiency | +| :--------------------- | ------------------: | ---------: | ----------------: | +| Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule | +| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule | +| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule | +| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule | diff --git a/README.md b/README.md index ee62beb..b7fd6ca 100755 --- a/README.md +++ b/README.md @@ -24,16 +24,20 @@ Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
ONNX • CoreML • PyTorch
-Python • JavaScript • Swift +Python + • +JavaScript + • +Swift

--- -![](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true) +![UForm Chat Preview](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true) Welcome to UForm, a __multimodal__ AI library that's as versatile as it is efficient. UForm [tiny embedding models](#encoder) will help you understand and search visual and textual content across various languages. -UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are also capable of image captioning and Visual Question Answering (VQA). +UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are great for fast image captioning and Visual Question Answering (VQA). With compact __custom pre-trained transformer models__, this can run anywhere from your server farm down to your smartphone. ## Features @@ -42,13 +46,15 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr - __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors. - __Portable__: Models come with native ONNX support, making them easy to deploy on any platform. - __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall. -- __Multilingual__: Trained on a balanced dataset, the recall is great across over [20 languages](#evaluation). +- __Multilingual__: Trained on a balanced dataset, the recall is great across over 20 languages. [usearch]: https://github.com/unum-cloud/usearch [matryoshka]: https://arxiv.org/abs/2205.13147 ## Models +For accuracy and speed benchmarks refer to the [evaluation page](https://github.com/unum-cloud/uform/blob/main/BENCHMARKS.md). + ### Embedding Models | Model | Parameters | Languages | Architecture | @@ -75,74 +81,7 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr [model-g2]: https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/ [model-g1]: https://huggingface.co/unum-cloud/uform-gen/ -## Producing Embeddings - -Add UForm to your dependencies list, or just install it locally: - -```bash -pip install uform -``` - -Then, you can use the following code to get embeddings for text and images. -You can do that either with the PyTorch reference model or the lighter cross-platform ONNX weights. - -```python -import uform -from PIL import Image - -# If you want to use the PyTorch model -model, processor = uform.get_model('unum-cloud/uform-vl-english-large') # Just English -model, processor = uform.get_model('unum-cloud/uform-vl-multilingual-v2') # 21 Languages - -# If you want to use the light-weight portable ONNX model -# Available combinations: cpu & fp32, gpu & fp32, gpu & fp16 -# Check out Unum's Hugging Face space for more details: https://huggingface.co/unum-cloud -model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-small', 'cpu', 'fp32') -model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-large', 'gpu', 'fp16') - -text = 'a small red panda in a zoo' -image = Image.open('red_panda.jpg') - -image_data = processor.preprocess_image(image) -text_data = processor.preprocess_text(text) - -image_features, image_embedding = model.encode_image(image_data, return_features=True) -text_features, text_embedding = model.encode_text(text_data, return_features=True) -``` - -To search for similar items, the embeddings can be compared using cosine similarity. 
-The resulting value will fall within the range of `-1` to `1`, where `1` indicates a high likelihood of a match. -PyTorch provides a built-in function for calculating cosine similarity, while for ONNX, you can use NumPy. - -```python -import torch.nn.functional as F - -similarity = F.cosine_similarity(image_embedding, text_embedding) -``` - -ONNX has no such function, but you can calculate the cosine similarity using [SimSIMD](https://github.com/ashvardanian/simsimd) or manually, with NumPy: - -```python -import numpy as np - -image_embedding = image_embedding / np.linalg.norm(image_embedding, keepdims=True, axis=1) -text_embedding = text_embedding / np.linalg.norm(text_embedding, keepdims=True, axis=1) -similarity = (image_embedding * text_embedding).sum(axis=1) -``` - -### Reranking - -Once the list of nearest neighbors (best matches) is obtained, the joint multimodal embeddings, created from both text and image features, can be used to better rerank (reorder) the list. -The model can calculate a "matching score" that falls within the range of `[0, 1]`, where `1` indicates a high likelihood of a match. - -```python -score, joint_embedding = model.encode_multimodal( - image_features=image_features, - text_features=text_features, - attention_mask=text_data['attention_mask'], - return_scores=True, -) -``` +## Features and Recommendations ### Down-casting, Quantization, Matryoshka, and Slicing @@ -154,7 +93,7 @@ Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is ```python import numpy as np -f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy() +f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False) f16_embedding: np.ndarray = f32_embedding.astype(np.float16) i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8) b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8)) @@ -165,7 +104,7 @@ Alternative approach to quantization is to use the Matryoshka embeddings, where ```python import numpy as np -large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy() +large_embedding: np.ndarray = model.encode_text(text_data, return_features=False) small_embedding: np.ndarray = large_embedding[:, :256] tiny_embedding: np.ndarray = large_embedding[:, :64] ``` @@ -220,92 +159,6 @@ You can pick one of many supported [ONNX execution providers][onnx-providers], w [onnx-providers]: https://onnxruntime.ai/docs/execution-providers/ ---- - -The configuration process may include a few additional steps, depending on the environment. -When using the CUDA and TensorRT backends with CUDA 12 or newer make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository. 
- -```sh -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo apt-get update -sudo apt-get -y install cuda-toolkit-12 -pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ -export CUDA_PATH="/usr/local/cuda-12/bin" -export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}" -export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" -pytest python/scripts/ -s -x -Wd -v -k onnx -``` - -[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu - -## Chat, Image Captioning and Question Answering - -UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library. -Those models can be used to caption images or power multimodal chat experiences. - -```python -from transformers import AutoModel, AutoProcessor - -model = AutoModel.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True) -processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True) - -prompt = 'Question or Instruction' -image = Image.open('image.jpg') - -inputs = processor(text=[prompt], images=[image], return_tensors='pt') - -with torch.inference_mode(): - output = model.generate( - **inputs, - do_sample=False, - use_cache=True, - max_new_tokens=256, - eos_token_id=151645, - pad_token_id=processor.tokenizer.pad_token_id - ) -prompt_len = inputs['input_ids'].shape[1] -decoded_text = processor.batch_decode(output[:, prompt_len:])[0] -``` - -You can check examples of different prompts in our [demo space](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo) - - -### Image Captioning and Question Answering - -__It is the instruction for the first version of UForm-Gen model. We highly recommend you use the new model, instructions for which you can find above.__ - - -The generative model can be used to caption images, summarize their content, or answer questions about them. -The exact behavior is controlled by prompts. - -```python -from uform.torch_decoders import VLMForCausalLM, VLMProcessor - -model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen') -processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen') - -# [cap] Narrate the contents of the image with precision. -# [cap] Summarize the visual content of the image. -# [vqa] What is the main subject of the image? -prompt = '[cap] Summarize the visual content of the image.' -image = Image.open('zebra.jpg') - -inputs = processor(texts=[prompt], images=[image], return_tensors='pt') -with torch.inference_mode(): - output = model.generate( - **inputs, - do_sample=False, - use_cache=True, - max_new_tokens=128, - eos_token_id=32001, - pad_token_id=processor.tokenizer.pad_token_id - ) - -prompt_len = inputs['input_ids'].shape[1] -decoded_text = processor.batch_decode(output[:, prompt_len:])[0] -``` - ### Multimodal Chat in CLI The generative models can be used for chat-like experiences in the command line. @@ -319,143 +172,3 @@ $ uform-chat --model unum-cloud/uform-gen2-dpo \ > --device="cuda:0" \ > --fp16 ``` - -## Evaluation - -### Embedding Models - -Few retrieval benchmarks exist for multimodal embeddings. -The most famous ones for English are "MS-COCO" and "Flickr30k". 
-Evaluating `uform-vl-english` model, one can expect the following numbers for search quality. - -| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 | -| :------- | ---------: | ---------: | ----------: | -| Flickr | 0.727 | 0.915 | 0.949 | -| MS-COCO¹ | 0.510 | 0.761 | 0.838 | - - -For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository². -Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model. - -| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | -| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: | -| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | -| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | -| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | -| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | -| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | -| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | - - -
-All languages. -
- -| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | -| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: | -| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | -| Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M | -| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | -| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | -| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | -| German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M | -| Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M | -| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | -| Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M | -| Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M | -| Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M | -| Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M | -| Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M | -| Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M | -| Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M | -| Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M | -| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | -| Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M | -| Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M | -| Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M | -| Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M | -| | | | | | | | | -| Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - | -| Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - | -| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - | -| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - | - -
- -### Generative Models - -| Model | LLM Size | SQA | MME | MMBench | Average¹ | -| :------------------- | -------: | ---: | -----: | ------: | -------: | -| UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 | -| MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 | -| LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 | - -For captioning evaluation we measure CLIPScore and RefCLIPScore³. - -| Model | Size | Caption Length | CLIPScore | RefCLIPScore | -| :---------------------------------- | ---: | -------------: | --------: | -----------: | -| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 | -| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 | -| | -| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 | -| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 | -| | -| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 | -| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 | -| | -| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 | -| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 | - -Results for VQAv2 evaluation. - -| Model | Size | Accuracy | -| :------------------------- | ---: | -------: | -| `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 | -| `unum-cloud/uform-gen` | 1.5B | 66.5 | - -
- -> ¹ Train split was in training data.
-> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
-> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model. - -## Speed - -On Nvidia RTX 3090, the following performance is expected on text encoding. - -| Model | Multilingual | Speed | Speedup | -| :---------------------------------------- | -----------: | ---------------------: | ---------: | -| `bert-base-uncased` | No | 1'612 sequences/second | | -| `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 | -| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 | -| `unum-cloud/uform-vl-multilingual-v2` | __Yes__ | 6'809 sequences/second | __x 4.22__ | - -On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. - -| Model | Size | Speed | Speedup | -| :---------------------------------- | ---: | ------------------: | --------: | -| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | | -| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | | -| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ | - -Given the small size of the model it also work well on mobile devices. -On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards. - -| Device | Speed | Device TDP | Efficiency | -| :--------------------- | ------------------: | ---------: | ----------------: | -| Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule | -| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule | -| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule | -| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule | - -> [!WARNING] -> The above numbers are for reference only and are not guaranteed to be accurate. - -## License - -All models come under the same license as the code - Apache 2.0. - - -TODO: - -- [ ] Download the image if a URL is provided \ No newline at end of file diff --git a/python/README.md b/python/README.md index aec9de8..2340e15 100644 --- a/python/README.md +++ b/python/README.md @@ -50,6 +50,38 @@ text_features, text_embedding = model_text.encode(text_data, return_features=Tru ### Generative Models +UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library. +Those models can be used to caption images or power multimodal chat experiences. 
+ +```python +from transformers import AutoModel, AutoProcessor + +model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) +processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) + +prompt = 'Question or Instruction' +image = Image.open('image.jpg') + +inputs = processor(text=[prompt], images=[image], return_tensors='pt') + +with torch.inference_mode(): + output = model.generate( + **inputs, + do_sample=False, + use_cache=True, + max_new_tokens=256, + eos_token_id=151645, + pad_token_id=processor.tokenizer.pad_token_id + ) +prompt_len = inputs['input_ids'].shape[1] +decoded_text = processor.batch_decode(output[:, prompt_len:])[0] +``` + +You can check examples of different prompts in our demo spaces: + +- for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo) +- for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo) + ## Technical Details ### Down-casting, Quantization, Matryoshka, and Slicing @@ -122,3 +154,22 @@ encoder_image = nn.DataParallel(encoder_image) _, res = encoder_image(images, 0) ``` + +### ONNX and CUDA + +The configuration process may include a few additional steps, depending on the environment. +When using the CUDA and TensorRT backends with CUDA 12 or newer make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository. + +```sh +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install cuda-toolkit-12 +pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ +export CUDA_PATH="/usr/local/cuda-12/bin" +export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}" +export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" +pytest python/scripts/ -s -x -Wd -v -k onnx +``` + +[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py index a8adb91..b237126 100644 --- a/python/scripts/bench_encoders.py +++ b/python/scripts/bench_encoders.py @@ -68,10 +68,14 @@ class BenchmarkResult: duration_text_embedding: float = 0 -def duration(callable): +def duration(callable, synchronize=False): """Profile the duration of a callable and return the duration and the result.""" + if synchronize and torch_available and cuda_available: + torch.cuda.synchronize() # Wait for CUDA operations to complete start = perf_counter() result = callable() + if synchronize and torch_available and cuda_available: + torch.cuda.synchronize() # Ensure all CUDA kernels have finished stop = perf_counter() return stop - start, result @@ -96,13 +100,20 @@ def get_captioned_images() -> List[Tuple[Image.Image, str]]: return list(zip(images, captions)) -def yield_benchmarks() -> Generator[Tuple[BenchmarkResult, Callable], None, None]: +def yield_benchmarks(batch_size: int) -> Generator[Tuple[BenchmarkResult, Callable], None, None]: """Yields callable benchmarks for all supported backends of the given model.""" # Pull the content and artificially grow the batch size images, captions = zip(*get_captioned_images()) - images *= 10 - captions *= 10 + + if len(images) < batch_size: + import math + + multiplier = 
int(math.ceil(batch_size / len(images))) + images *= multiplier + captions *= multiplier + images = images[:batch_size] + captions = captions[:batch_size] def run(model_name: str, device: str, backend_name: str): result = BenchmarkResult( @@ -115,6 +126,7 @@ def run(model_name: str, device: str, backend_name: str): duration_text_embedding=0, ) + sync = backend_name == "torch" processors, models = get_model( model_name, device=device, @@ -130,7 +142,7 @@ def run(model_name: str, device: str, backend_name: str): # Image preprocessing total_duration = 0 total_iterations = 0 - while total_duration < 10: + while total_duration < 10 and total_iterations < 100: seconds, _ = duration(lambda: processor_image(images)) total_duration += seconds total_iterations += len(images) @@ -140,9 +152,9 @@ def run(model_name: str, device: str, backend_name: str): # Image embedding total_duration = 0 total_iterations = 0 - while total_duration < 10: + while total_duration < 10 and total_iterations < 100: images_data = processor_image(images) - seconds, _ = duration(lambda: model_image.encode(images_data)) + seconds, _ = duration(lambda: model_image.encode(images_data), synchronize=sync) total_duration += seconds total_iterations += len(images) duration_per_iteration = total_duration / total_iterations @@ -151,7 +163,7 @@ def run(model_name: str, device: str, backend_name: str): # Text preprocessing total_duration = 0 total_iterations = 0 - while total_duration < 10: + while total_duration < 10 and total_iterations < 100: seconds, _ = duration(lambda: processor_text(captions)) total_duration += seconds total_iterations += len(captions) @@ -161,9 +173,9 @@ def run(model_name: str, device: str, backend_name: str): # Text embedding total_duration = 0 total_iterations = 0 - while total_duration < 10: + while total_duration < 10 and total_iterations < 100: texts_data = processor_text(captions) - seconds, _ = duration(lambda: model_text.encode(texts_data)) + seconds, _ = duration(lambda: model_text.encode(texts_data), synchronize=sync) total_duration += seconds total_iterations += len(captions) duration_per_iteration = total_duration / total_iterations @@ -195,10 +207,10 @@ def run(model_name: str, device: str, backend_name: str): ), partial(run, model_name, device, backend_name) -def main(filter_out: str = None): +def main(filter_out: str = None, batch_size: int = 10): results = [] filter_pattern = re.compile(filter_out) if filter_out else None - for specs, func in yield_benchmarks(): + for specs, func in yield_benchmarks(batch_size=batch_size): if filter_pattern and ( filter_pattern.search(specs.model_name) or filter_pattern.search(specs.backend_name) @@ -243,7 +255,20 @@ def inverse(x): if __name__ == "__main__": - argparse = argparse.ArgumentParser() - argparse.add_argument("--filter-out", type=str, default=None) - args = argparse.parse_args() - main(filter_out=args.filter_out) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--filter-out", + type=str, + default=None, + help="Filter out models, backends, or devices with a Regular Expression.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=10, + help="Batch size for the benchmark. Batch size 1 measures latency. 
Large batch sizes may not fit on every GPU.", + ) + args = parser.parse_args() + + main(filter_out=args.filter_out, batch_size=args.batch_size) diff --git a/python/scripts/export_decoders.ipynb b/python/scripts/export_decoders.ipynb index 3aededb..26e463b 100644 --- a/python/scripts/export_decoders.ipynb +++ b/python/scripts/export_decoders.ipynb @@ -44,8 +44,8 @@ "from PIL import Image\n", "from transformers import AutoModel, AutoProcessor\n", "\n", - "model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)\n", - "processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)\n", + "model = AutoModel.from_pretrained(model_name, trust_remote_code=True)\n", + "processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\n", "\n", "prompt = 'Describe the picture'\n", "image = Image.open('../../assets/unum.png')\n", @@ -65,569 +65,6 @@ "\n", "print(decoded_text)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n", - "for name, module in model.named_children():\n", - " print(f\"First layer of module: {name}\")\n", - " break # We break after the first layer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## CoreML" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import coremltools as ct\n", - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "precision = ct.precision.FLOAT32" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n", - "\n", - "```python\n", - " image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n", - " text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n", - " text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n", - "```\n", - "\n", - "That, however, will only work for batch-size one. 
To support larger batches, we need to override the input shapes.\n", - "\n", - "```python\n", - " ct.RangeDim(lower_bound=25, upper_bound=100, default=45)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generalize_first_dimensions(input_shape, upper_bound=64):\n", - " if upper_bound == 1:\n", - " return input_shape\n", - " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n", - " return input_shape\n", - "\n", - "generalize_first_dimensions(image_data.shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n", - "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n", - "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n", - "text_features = ct.TensorType(name=\"features\")\n", - "text_embeddings = ct.TensorType(name=\"embeddings\")\n", - "image_features = ct.TensorType(name=\"features\")\n", - "image_embeddings = ct.TensorType(name=\"embeddings\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module = model.image_encoder\n", - "module.eval()\n", - "module.return_features = True\n", - "\n", - "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n", - "traced_script_module" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "coreml_model = ct.convert(\n", - " traced_script_module, source=\"pytorch\",\n", - " inputs=[image_input], outputs=[image_features, image_embeddings],\n", - " convert_to='mlprogram', compute_precision=precision)\n", - "\n", - "coreml_model.author = 'Unum Cloud'\n", - "coreml_model.license = 'Apache 2.0'\n", - "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(os.path.join(output_directory, \"image_encoder.mlpackage\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module = model.text_encoder\n", - "module.eval()\n", - "module.return_features = True\n", - "\n", - "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", - "traced_script_module" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "coreml_model = ct.convert(\n", - " traced_script_module, source=\"pytorch\",\n", - " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", - " convert_to='mlprogram', compute_precision=precision)\n", - "\n", - "coreml_model.author = 'Unum Cloud'\n", - "coreml_model.license = 'Apache 2.0'\n", - "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(os.path.join(output_directory, \"text_encoder.mlpackage\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PyTorch\n", - "\n", - "Let's ensure:\n", - "\n", - "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, 
and outputs are `embeddings` and `features`.\n", - "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n", - "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "from safetensors import safe_open\n", - "from safetensors.torch import save_file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.image_encoder.eval()\n", - "model.image_encoder.to(dtype=torch.bfloat16)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "torch.save(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.pt\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "save_file(model.image_encoder.state_dict(), os.path.join(output_directory, \"image_encoder.safetensors\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.text_encoder.eval()\n", - "model.text_encoder.to(dtype=torch.bfloat16)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "torch.save(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.pt\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "save_file(model.text_encoder.state_dict(), os.path.join(output_directory, \"text_encoder.safetensors\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n", - "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", - "\n", - "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ONNX" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install onnx onnxconverter-common" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from torch.onnx import export as onnx_export\n", - "import torch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module = model.text_encoder\n", - "module.eval()\n", - "module.return_features = True\n", - "module.to(dtype=torch.float32)\n", - "\n", - "onnx_export(\n", - " module,\n", - " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", - " os.path.join(output_directory, \"text_encoder.onnx\"), \n", - " export_params=True,\n", - " opset_version=15,\n", - " do_constant_folding=True,\n", - " input_names = ['input_ids', 'attention_mask'], \n", - " output_names = ['features', 'embeddings'],\n", - " dynamic_axes={\n", - " 'input_ids' : {0 : 'batch_size'}, \n", - " 'attention_mask' : {0 : 'batch_size'}, \n", - " 'features' : {0 : 'batch_size'}, \n", - " 'embeddings' : {0 : 'batch_size'}})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now repeat the same for images." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module = model.image_encoder\n", - "module.eval()\n", - "module.return_features = True\n", - "module.to(dtype=torch.float32)\n", - "\n", - "torch.onnx.export(\n", - " module,\n", - " image_data, \n", - " os.path.join(output_directory, \"image_encoder.onnx\"), \n", - " export_params=True,\n", - " opset_version=15,\n", - " do_constant_folding=True,\n", - " input_names = ['input'], \n", - " output_names = ['features', 'embeddings'],\n", - " dynamic_axes={\n", - " 'input' : {0 : 'batch_size'},\n", - " 'features' : {0 : 'batch_size'},\n", - " 'embeddings' : {0 : 'batch_size'}})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Quantizing to `float16`\n", - "\n", - "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import onnx\n", - "from onnxconverter_common import float16" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", - "module = onnx.load(module_path)\n", - "module_fp16 = float16.convert_float_to_float16(module)\n", - "onnx.save(module_fp16, module_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", - "module = onnx.load(module_path)\n", - "module_fp16 = float16.convert_float_to_float16(module)\n", - "onnx.save(module_fp16, module_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Quantizing to `uint8`\n", - "\n", - "We can further quantize the model into `uint8` using ONNX quantization tools.\n", - "The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from onnxruntime.quantization import quantize_dynamic, QuantType" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", - "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", - "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's make sure that all the text inputs are integers of identical type - `int32`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import onnx\n", - "import os\n", - "from onnx import helper\n", - "\n", - "# Load the ONNX model\n", - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", - "module = onnx.load(module_path)\n", - "\n", - "# Get the module's graph\n", - "graph = module.graph\n", - "\n", - "# Iterate through the inputs and update the data type of `input_ids`\n", - "for input_tensor in graph.input:\n", - " # Check if this is the tensor we want to change\n", - " if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n", - " # Get the tensor type information\n", - " tensor_type = input_tensor.type.tensor_type\n", - " # Set the element type to INT32 (int32's enum value in onnx is 6)\n", - " tensor_type.elem_type = onnx.TensorProto.INT32\n", - "\n", - "# Optionally, check that the module is still valid\n", - "onnx.checker.check_model(module)\n", - "\n", - "# Save the modified module\n", - "onnx.save(module, module_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the following function to print and validate the input and output types of the ONNX model files." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def print_model_inputs_and_outputs(onnx_model_path):\n", - " model = onnx.load(onnx_model_path)\n", - "\n", - " # Get the model's graph\n", - " graph = model.graph\n", - "\n", - " # Print input information\n", - " print(\"Model Inputs:\")\n", - " for input_tensor in graph.input:\n", - " tensor_type = input_tensor.type.tensor_type\n", - " # Get the element type (data type)\n", - " elem_type = tensor_type.elem_type\n", - " # Convert numeric type to readable format\n", - " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n", - " # Get tensor shape\n", - " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n", - " print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n", - "\n", - " # Print output information similarly if needed\n", - " print(\"\\nModel Outputs:\")\n", - " for output_tensor in graph.output:\n", - " tensor_type = output_tensor.type.tensor_type\n", - " elem_type = tensor_type.elem_type\n", - " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n", - " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n", - " print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check that the runtime can actually load those models." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import onnxruntime as ort\n", - "session_options = ort.SessionOptions()\n", - "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module_path = os.path.join(output_directory, \"text_encoder.onnx\")\n", - "session = ort.InferenceSession(module_path, sess_options=session_options)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "module_path = os.path.join(output_directory, \"image_encoder.onnx\")\n", - "session = ort.InferenceSession(module_path, sess_options=session_options)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Upload to Hugging Face" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n", - "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n", - "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n", - "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n", - "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n", - "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index ed413a8..c149088 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -231,7 +231,6 @@ def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: return x[:, 0] attn_mask = attn_mask.unsqueeze(2).type_as(x) - return (x * attn_mask).sum(dim=1) / attn_mask.sum(dim=1) def get_attention_mask(self, attn_mask: Tensor, dtype: torch.dtype) -> Tensor: @@ -370,7 +369,7 @@ def forward_embedding(self, x: Tensor) -> Tensor: return self.embedding_projection(x) - def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: + def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor: if isinstance(x, dict): x = x["images"] @@ -385,7 +384,7 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return features, embeddings return embeddings - def encode(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: + def encode(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor: result = self.forward(x, return_features) if isinstance(result, tuple): return result[0].detach(), result[1].detach() From 1f556b867ad80460f36df7fbc2f500e0d5785951 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 05:54:47 +0000 Subject: [PATCH 32/40] Improve: Extend benchmarks --- BENCHMARKS.md | 33 +++++++++++++++++------ python/scripts/bench_decoders.py | 45 +++++++++++++++++++++++++++++++- python/uform/chat.py | 12 ++++----- 3 files changed, 75 insertions(+), 15 deletions(-) diff --git 
a/BENCHMARKS.md b/BENCHMARKS.md index ef78990..9b0fa1d 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -100,6 +100,8 @@ Results for VQAv2 evaluation. ## Speed +### Embedding Models + UForm comes pre-packaged with speed benchmarks for the models. ```bash @@ -141,14 +143,6 @@ On Nvidia RTX 3090: | `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 | | `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ | -On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. - -| Model | Size | Speed | Speedup | -| :---------------------------------- | ---: | ------------------: | --------: | -| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | | -| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | | -| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ | - Given the small size of the model it also work well on mobile devices. On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards. @@ -158,3 +152,26 @@ On Apple M2 Arm chips the energy efficiency of inference can exceed that of the | Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule | | Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule | | Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule | + +### Generative Models + +```bash +$ python python/scripts/bench_decoders.py --help +usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE] + +options: + -h, --help show this help message and exit + --filter-out FILTER_OUT + Filter out models, backends, or devices with a Regular Expression. + --batch-size BATCH_SIZE + Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. +``` + +On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. 
+ +| Model | Size | Speed | Speedup | +| :---------------------------------- | ---: | ------------------: | --------: | +| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | | +| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | | +| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ | + diff --git a/python/scripts/bench_decoders.py b/python/scripts/bench_decoders.py index d98c130..4241ee6 100644 --- a/python/scripts/bench_decoders.py +++ b/python/scripts/bench_decoders.py @@ -2,6 +2,7 @@ from time import perf_counter from dataclasses import dataclass from typing import List +import argparse import requests import torch @@ -11,6 +12,8 @@ InstructBlipForConditionalGeneration, InstructBlipProcessor, LlavaForConditionalGeneration, + AutoModel, + AutoProcessor, ) from uform.torch_decoders import VLMForCausalLM, VLMProcessor @@ -57,6 +60,7 @@ def caption(model, processor, prompt: str, image: Image.Image) -> str: def duration(callable): + """Profile the duration of a callable and return the duration and the result.""" start = perf_counter() result = callable() stop = perf_counter() @@ -86,7 +90,8 @@ def caption_image(image, model=model, processor=processor, prompt=prompt): print(f"Throughput: {total_length/total_duration:.2f} tokens/s") -if __name__ == "__main__": +def main(filter_out: str = None, batch_size: int = 10): + image_urls = [ "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", @@ -103,12 +108,30 @@ def caption_image(image, model=model, processor=processor, prompt=prompt): "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it", ] + print("UForm-Gen2") + bench_captions( + model=AutoModel.from_pretrained( + "unum-cloud/uform-gen2-dpo", + trust_remote_code=True, + torch_dtype=dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ignore_mismatched_sizes=True, + ).to(device), + processor=AutoProcessor.from_pretrained( + "unum-cloud/uform-gen2-dpo", + trust_remote_code=True, + ), + prompt="Describe the picture in great detail", + images=images, + ) + print("UForm-Gen") bench_captions( model=VLMForCausalLM.from_pretrained( "unum-cloud/uform-gen", torch_dtype=dtype, low_cpu_mem_usage=low_cpu_mem_usage, + ignore_mismatched_sizes=True, ).to(device), processor=VLMProcessor.from_pretrained( "unum-cloud/uform-gen", @@ -144,3 +167,23 @@ def caption_image(image, model=model, processor=processor, prompt=prompt): prompt="Summarize the visual content of the image.", images=images, ) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + "--filter-out", + type=str, + default=None, + help="Filter out models, backends, or devices with a Regular Expression.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=10, + help="Batch size for the benchmark. Batch size 1 measures latency. 
Large batch sizes may not fit on every GPU.", + ) + args = parser.parse_args() + + main(filter_out=args.filter_out, batch_size=args.batch_size) diff --git a/python/uform/chat.py b/python/uform/chat.py index c9f8dc3..7bb1737 100644 --- a/python/uform/chat.py +++ b/python/uform/chat.py @@ -13,10 +13,10 @@ def parse_args(): parser = ArgumentParser(description="Chat with UForm generative model") - parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat") - parser.add_argument("--image", type=str, help="", required=True) - parser.add_argument("--device", type=str, required=True) - parser.add_argument("--fp16", action="store_true") + parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path") + parser.add_argument("--image", type=str, required=True, help="Path to image or URL") + parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`") + parser.add_argument("--fp16", action="store_true", help="Use half-precision math for faster inference") return parser.parse_args() @@ -95,16 +95,16 @@ def run_chat(opts, model, processor): def main(): try: opts = parse_args() - + processor = VLMProcessor.from_pretrained(opts.model) model = ( VLMForCausalLM.from_pretrained( opts.model, torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32, + ignore_mismatched_sizes=True, ) .eval() .to(opts.device) ) - processor = VLMProcessor.from_pretrained(opts.model) run_chat(opts, model, processor) From 47b7a49b7e2b436b1a42a3aa5ad7bba179bc0680 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 16:05:22 +0000 Subject: [PATCH 33/40] Docs: Add examples --- README.md | 165 +++++++++++++++++++++++++++++++++++++++++------ python/README.md | 83 ++++++------------------ 2 files changed, 163 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index b7fd6ca..4c66d29 100755 --- a/README.md +++ b/README.md @@ -57,31 +57,156 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github. ### Embedding Models -| Model | Parameters | Languages | Architecture | -| :-------------------------------------------------- | ---------: | --------: | -------------------------------------------: | -| [`uform3-image-text-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers | -| [`uform3-image-text-english-base`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers | -| [`uform3-image-text-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers | -| [`uform3-image-text-multilingual-base`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers | - -[model-e-l]: https://huggingface.co/unum-cloud/uform-vl-english-large/ -[model-e]: https://huggingface.co/unum-cloud/uform-vl-english/ -[model-e-s]: https://huggingface.co/unum-cloud/uform-vl-english-small/ -[model-m]: https://huggingface.co/unum-cloud/uform-vl-multilingual/ -[model-m-v2]: https://huggingface.co/unum-cloud/uform-vl-multilingual-v2/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelParametersLanguagesArchitecture
uform3-image-text-english-large 🆕365M112 layer BERT, ViT-L/14
uform3-image-text-english-base143M14 layer BERT, ViT-B/16
uform3-image-text-english-small 🆕79M14 layer BERT, ViT-S/16
uform3-image-text-multilingual-base206M2112 layer BERT, ViT-B/16
### Generative Models -| Model | Parameters | Purpose | Architecture | -| :--------------------------------- | ---------: | --------------------------: | ---------------------: | -| [`uform-gen2-dpo`][model-g2] 🆕 | 1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 | -| [`uform-gen2-qwen-500m`][model-g2] | 1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 | -| [`uform-gen`][model-g1] | 1.5B | Image Captioning, VQA | llama-1.3B, ViT-B/16 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelParametersPurposeArchitecture
uform-gen2-dpo 🆕1.2BChat, Image Captioning, VQAqwen1.5-0.5B, ViT-H/14
uform-gen2-qwen-500m1.2BChat, Image Captioning, VQAqwen1.5-0.5B, ViT-H/14
uform-gen1.5BImage Captioning, VQAllama-1.3B, ViT-B/16
+ +## Quick Start Examples -[model-g2]: https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/ -[model-g1]: https://huggingface.co/unum-cloud/uform-gen/ +### Embedding Models + +First, `pip install uform`. +Then, load the model: + +```py +from uform import get_model, Modality + +processors, models = get_model('unum-cloud/uform3-image-text-english-small') + +model_text = models[Modality.TEXT_ENCODER] +model_image = models[Modality.IMAGE_ENCODER] +processor_text = processors[Modality.TEXT_ENCODER] +processor_image = processors[Modality.IMAGE_ENCODER] +``` + +Embed images: + +```py +import requests +from io import BytesIO +from PIL import Image + +image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg' +image_url = Image.open(BytesIO(requests.get(image_url).content)) +image_data = processor_image(image) +image_features, image_embedding = model_image.encode(image_data, return_features=True) +``` + +Embed queries: + +```py +text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background' +text_data = processor_text(text) +text_features, text_embedding = model_text.encode(text_data, return_features=True) +``` + +For more details check out: + +- Python docs on embedding models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#embedding-models) +- JavaScript docs on embedding models in [javascript/README.md](https://github.com/unum-cloud/uform/blob/main/javascript/README.md#embedding-models) +- Swift docs on embedding models in [swift/README.md](https://github.com/unum-cloud/uform/blob/main/swift/README.md#embedding-models) + +### Generative Models + +The generative models are natively compatible with + +```python +from transformers import AutoModel, AutoProcessor + +model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) +processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) + +prompt = 'Question or Instruction' +image = Image.open('image.jpg') + +inputs = processor(text=[prompt], images=[image], return_tensors='pt') + +with torch.inference_mode(): + output = model.generate( + **inputs, + do_sample=False, + use_cache=True, + max_new_tokens=256, + eos_token_id=151645, + pad_token_id=processor.tokenizer.pad_token_id + ) +prompt_len = inputs['input_ids'].shape[1] +decoded_text = processor.batch_decode(output[:, prompt_len:])[0] +``` + +For more details check out: + +- Python docs on generative models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#generative-models) +- JavaScript docs on generative models 🔜 +- Swift docs on generative models 🔜 -## Features and Recommendations +## Technical Details ### Down-casting, Quantization, Matryoshka, and Slicing diff --git a/python/README.md b/python/README.md index 2340e15..621bee0 100644 --- a/python/README.md +++ b/python/README.md @@ -20,13 +20,11 @@ pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests ### Embeddings +Load the model: + ```py from uform import get_model, Modality -import requests -from io import BytesIO -from PIL import Image - model_name = 'unum-cloud/uform3-image-text-english-small' modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER] processors, models = get_model(model_name, modalities=modalities) @@ -35,16 +33,26 @@ model_text = models[Modality.TEXT_ENCODER] model_image = models[Modality.IMAGE_ENCODER] processor_text = 
processors[Modality.TEXT_ENCODER] processor_image = processors[Modality.IMAGE_ENCODER] +``` + +Embed images: + +```py +import requests +from io import BytesIO +from PIL import Image -# Download the image -text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background' image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg' image_url = Image.open(BytesIO(requests.get(image_url).content)) - -# The actual inference image_data = processor_image(image) -text_data = processor_text(text) image_features, image_embedding = model_image.encode(image_data, return_features=True) +``` + +Embed queries: + +```py +text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background' +text_data = processor_text(text) text_features, text_embedding = model_text.encode(text_data, return_features=True) ``` @@ -77,68 +85,13 @@ prompt_len = inputs['input_ids'].shape[1] decoded_text = processor.batch_decode(output[:, prompt_len:])[0] ``` -You can check examples of different prompts in our demo spaces: +You can check examples of different prompts in our demo Gradio spaces on HuggingFace: - for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo) - for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo) ## Technical Details -### Down-casting, Quantization, Matryoshka, and Slicing - -Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall. -Switching from `f32` to `f16` is recommended in almost all cases, unless you are running on very old hardware without half-precision support. -Switching to `i8` with linear scaling is also possible, but will be noticeable in the recall on larger collections with millions of searchable entries. -Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is to quantize them into single-bit representations for faster search. - -```python -import numpy as np - -f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy() -f16_embedding: np.ndarray = f32_embedding.astype(np.float16) -i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8) -b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8)) -``` - -Alternative approach to quantization is to use the Matryoshka embeddings, where the embeddings are sliced into smaller parts, and the search is performed in a hierarchical manner. - -```python -import numpy as np - -large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy() -small_embedding: np.ndarray = large_embedding[:, :256] -tiny_embedding: np.ndarray = large_embedding[:, :64] -``` - -Both approaches are natively supported by the [USearch][github-usearch] vector-search engine and the [SimSIMD][github-simsimd] numerics libraries. -When dealing with small collections (up to millions of entries) and looking for low-latency cosine distance calculations, you can [achieve 5x-2500x performance improvement][report-simsimd] over Torch, NumPy, SciPy, and vanilla Python using SimSIMD. 
- -```python -from simsimd import cosine, hamming - -distance: float = cosine(f32_embedding, f32_embedding) # 32x SciPy performance on Apple M2 CPU -distance: float = cosine(f16_embedding, f16_embedding) # 79x SciPy performance on Apple M2 CPU -distance: float = cosine(i8_embedding, i8_embedding) # 133x SciPy performance on Apple M2 CPU -distance: float = hamming(b1_embedding, b1_embedding) # 17x SciPy performance on Apple M2 CPU -``` - -Similarly, when dealing with large collections (up to billions of entries per server) and looking for high-throughput search, you can [achieve 100x performance improvement][report-usearch] over FAISS and other vector-search solutions using USearch. -Here are a couple of examples: - -```python -from usearch.index import Index - -f32_index = Index(ndim=64, metric='cos', dtype='f32') # for Matryoshka embeddings -f16_index = Index(ndim=64, metric='cos', dtype='f16') # for Matryoshka embeddings -i8_index = Index(ndim=256, metric='cos', dtype='i8') # for quantized embeddings -b1_index = Index(ndim=768, metric='hamming', dtype='b1') # for binary embeddings -``` - -[github-usearch]: https://github.com/unum-cloud/usearch -[github-simsimd]: https://github.com/ashvardanian/simsimd -[report-usearch]: https://www.unum.cloud/blog/2023-11-07-scaling-vector-search-with-intel -[report-simsimd]: https://ashvardanian.com/posts/python-c-assembly-comparison/ - ### Multi-GPU Parallelism To achieve higher throughput, you can launch UForm on multiple GPUs. From ebd7f66ef5f3711c2a2b6f4916f5d9fe7293d271 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 20:25:32 +0000 Subject: [PATCH 34/40] Improve: Refresh CLI for new models --- python/uform/chat.py | 46 +++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/python/uform/chat.py b/python/uform/chat.py index 7bb1737..b9e4423 100644 --- a/python/uform/chat.py +++ b/python/uform/chat.py @@ -3,11 +3,7 @@ import requests import torch from PIL import Image -from transformers import TextStreamer - -from uform.torch_decoders import VLMForCausalLM, VLMProcessor - -EOS_TOKEN = 32001 +from transformers import TextStreamer, AutoModel, AutoProcessor def parse_args(): @@ -30,22 +26,18 @@ def run_chat(opts, model, processor): messages = [{"role": "system", "content": "You are a helpful assistant."}] is_first_message = True + if opts.image.startswith("http"): - image = ( - processor.image_processor( - Image.open(requests.get(opts.image, stream=True).raw), - ) - .unsqueeze(0) - .to(torch.bfloat16 if opts.fp16 else torch.float32) - .to(opts.device) - ) + image = Image.open(requests.get(opts.image, stream=True).raw) else: - image = ( - processor.image_processor(Image.open(opts.image)) - .unsqueeze(0) - .to(torch.bfloat16 if opts.fp16 else torch.float32) - .to(opts.device) - ) + image = Image.open(opts.image) + + image = ( + processor.feature_extractor(image) # + .unsqueeze(0) + .to(torch.bfloat16 if opts.fp16 else torch.float32) + .to(opts.device) + ) while True: if messages[-1]["role"] in ("system", "assistant"): @@ -68,7 +60,7 @@ def run_chat(opts, model, processor): 1, input_ids.shape[1] + processor.num_image_latents - 1, ).to(opts.device) - x = { + inputs = { "input_ids": input_ids, "attention_mask": attention_mask, "images": image, @@ -76,18 +68,19 @@ def run_chat(opts, model, processor): print("Assistant: ", end="") with torch.inference_mode(): - y = model.generate( - **x, + output = model.generate( + **inputs, do_sample=False, 
use_cache=True, max_new_tokens=1024, - eos_token_id=EOS_TOKEN, + eos_token_id=151645, pad_token_id=processor.tokenizer.pad_token_id, streamer=streamer, ) print() - message = processor.batch_decode(y[:, x["input_ids"].shape[1] : -1])[0] + prompt_len = inputs["input_ids"].shape[1] + message = processor.batch_decode(output[:, prompt_len:-1])[0] messages.append({"role": "assistant", "content": message}) @@ -95,12 +88,13 @@ def run_chat(opts, model, processor): def main(): try: opts = parse_args() - processor = VLMProcessor.from_pretrained(opts.model) + processor = AutoProcessor.from_pretrained(opts.model, trust_remote_code=True) model = ( - VLMForCausalLM.from_pretrained( + AutoModel.from_pretrained( opts.model, torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32, ignore_mismatched_sizes=True, + trust_remote_code=True, ) .eval() .to(opts.device) From d00204f817748aaa204975862f9912101ab956f6 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 20:51:50 +0000 Subject: [PATCH 35/40] Docs: Reference for Py and Swift --- .github/workflows/release.yml | 8 +++- BENCHMARKS.md | 20 ++++----- README.md | 34 +++++++-------- docs/_static/custom.js | 2 +- docs/benchmarks.rst | 5 +++ docs/conf.py | 7 ++- docs/contributing.rst | 5 +++ docs/index.rst | 24 ++++++++--- docs/javascript/index.rst | 9 ++++ docs/javascript/reference.rst.txt | 18 ++++++++ docs/python/index.rst | 11 +++++ docs/python/reference.rst | 42 ++++++++++++++++++ docs/reference.rst | 6 --- docs/swift/index.rst | 6 +++ javascript/encoders.mjs | 72 ++++++++++++++++++++++++++++--- python/uform/__init__.py | 37 +++++++++++++--- 16 files changed, 251 insertions(+), 55 deletions(-) create mode 100644 docs/benchmarks.rst create mode 100644 docs/contributing.rst create mode 100644 docs/javascript/index.rst create mode 100644 docs/javascript/reference.rst.txt create mode 100644 docs/python/index.rst create mode 100644 docs/python/reference.rst delete mode 100644 docs/reference.rst create mode 100644 docs/swift/index.rst diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4170c99..512b641 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -113,10 +113,14 @@ jobs: uses: actions/checkout@v4 with: ref: "main" + - name: Install dependencies + run: | + sudo apt update && + sudo apt install -y doxygen graphviz dia git && + pip install sphinx==5.3.0 sphinx-js==3.2.1 breathe==4.35.0 furo==2023.3.27 m2r2==0.3.3.post2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery==4.1 && + npm install -g jsdoc - name: Setup GitHub Pages uses: actions/configure-pages@v2 - - name: Install dependencies - run: sudo apt update && sudo apt install -y doxygen graphviz dia git && pip install sphinx==7.1.2 breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery toml - name: Install UForm from PyPi run: pip install uform - name: Build documentation diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 9b0fa1d..aa61535 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -8,10 +8,10 @@ Few retrieval benchmarks exist for multimodal embeddings. The most famous ones for English are "MS-COCO" and "Flickr30k". Evaluating `uform-vl-english` model, one can expect the following numbers for search quality. 
-| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 | -| :------- | ---------: | ---------: | ----------: | -| Flickr | 0.727 | 0.915 | 0.949 | -| MS-COCO¹ | 0.510 | 0.761 | 0.838 | +| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 | +| :-------- | ---------: | ---------: | ----------: | +| Flickr | 0.727 | 0.915 | 0.949 | +| MS-COCO ¹ | 0.510 | 0.761 | 0.838 | For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository². Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model. @@ -26,9 +26,7 @@ Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the f | French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | -
-All languages. -
+All languages: | Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | | :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: | @@ -59,8 +57,6 @@ Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the f | Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - | | Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - | -
- ### Generative Models | Model | LLM Size | SQA | MME | MMBench | Average¹ | @@ -75,13 +71,13 @@ For captioning evaluation we measure CLIPScore and RefCLIPScore³. | :---------------------------------- | ---: | -------------: | --------: | -----------: | | `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 | | `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 | -| | +| | | | | | | `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 | | `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 | -| | +| | | | | | | `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 | | `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 | -| | +| | | | | | | `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 | | `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 | diff --git a/README.md b/README.md index 4c66d29..8484b0f 100755 --- a/README.md +++ b/README.md @@ -57,9 +57,9 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github. ### Embedding Models - +
- + @@ -68,25 +68,25 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github. - - + + - - + + - - + + - + @@ -96,9 +96,9 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github. ### Generative Models -
Model Parameters Languages
uform3-image-text-english-large 🆕365Muform3-image-text-english-large 🆕365 M 1 12 layer BERT, ViT-L/14
uform3-image-text-english-base143Muform3-image-text-english-base143 M 1 4 layer BERT, ViT-B/16
uform3-image-text-english-small 🆕79Muform3-image-text-english-small 🆕79 M 1 4 layer BERT, ViT-S/16
uform3-image-text-multilingual-baseuform3-image-text-multilingual-base 206M 21 12 layer BERT, ViT-B/16
+
- + @@ -107,20 +107,20 @@ For accuracy and speed benchmarks refer to the [evaluation page](https://github. - - + + - - + + - - + + diff --git a/docs/_static/custom.js b/docs/_static/custom.js index b909a1d..3dd0974 100644 --- a/docs/_static/custom.js +++ b/docs/_static/custom.js @@ -3,5 +3,5 @@ $(document).ready(function () { ` - $(".sidebar-brand-text").html("Unum · UForm
$(VERSION)" + github_logo) + $(".sidebar-brand-text").html("Unum · UForm
2.1.1" + github_logo) }) diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst new file mode 100644 index 0000000..7683788 --- /dev/null +++ b/docs/benchmarks.rst @@ -0,0 +1,5 @@ +==================== +Benchmarks +==================== + +.. mdinclude:: ../BENCHMARKS.md \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index acc061e..f9061f5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -5,12 +5,11 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -import toml project = "Unum · UForm" copyright = "2023, Unum" author = "Unum" -release = toml.load("../pyproject.toml")["project"]["version"] +release = open("../VERSION", "r").read().strip() with open("_static/custom.js", "r+") as js: content = js.read() js.seek(0) @@ -24,6 +23,7 @@ "breathe", "m2r2", "sphinx.ext.autodoc", + "sphinx_js", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", "sphinx.ext.napoleon", @@ -44,6 +44,9 @@ html_static_path = ["_static"] html_css_files = ["custom.css"] html_js_files = ["custom.js"] +html_baseurl = "/docs/uform/" breathe_projects = {"UForm": "../build/xml"} breathe_default_project = "UForm" + +js_source_path = "../javascript/" diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..48893cf --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1,5 @@ +==================== +Contributing +==================== + +.. mdinclude:: ../CONTRIBUTING.md \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 162bbee..d3da0ec 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,11 +1,25 @@ -========== +==================== Overview -========== +==================== .. mdinclude:: ../README.md -.. toctree:: +.. toctree:: :hidden: + :caption: � + + python/index + javascript/index + swift/index + +.. toctree:: + :hidden: + :caption: � + + contributing + benchmarks + +.. toctree:: + :hidden: + :caption: � - self - reference genindex diff --git a/docs/javascript/index.rst b/docs/javascript/index.rst new file mode 100644 index 0000000..771081c --- /dev/null +++ b/docs/javascript/index.rst @@ -0,0 +1,9 @@ +==================== +JavaScript SDK +==================== + + +.. mdinclude:: ../../javascript/README.md + +.. toctree:: + :hidden: diff --git a/docs/javascript/reference.rst.txt b/docs/javascript/reference.rst.txt new file mode 100644 index 0000000..356176a --- /dev/null +++ b/docs/javascript/reference.rst.txt @@ -0,0 +1,18 @@ +API Reference +==================== + +==================== +Encoders +==================== + +.. js:autoclass:: ../javascript/encoders.TextProcessor + :members: + +.. js:autoclass:: ../javascript/encoders.ImageProcessor + :members: + +.. js:autoclass:: ../javascript/encoders.TextEncoder + :members: + +.. js:autoclass:: ../javascript/encoders.ImageEncoder + :members: diff --git a/docs/python/index.rst b/docs/python/index.rst new file mode 100644 index 0000000..5f870d1 --- /dev/null +++ b/docs/python/index.rst @@ -0,0 +1,11 @@ +==================== +Python SDK +==================== + + +.. mdinclude:: ../../python/README.md + +.. toctree:: + :hidden: + + reference \ No newline at end of file diff --git a/docs/python/reference.rst b/docs/python/reference.rst new file mode 100644 index 0000000..d580583 --- /dev/null +++ b/docs/python/reference.rst @@ -0,0 +1,42 @@ +API Reference +==================== + +==================== +Root +==================== + +.. 
automodule:: uform + :members: + :undoc-members: + +==================== +Torch Encoreds +==================== + +.. automodule:: uform.torch_encoders + :members: + :undoc-members: + +==================== +Torch Processors +==================== + +.. automodule:: uform.torch_processors + :members: + :undoc-members: + +==================== +ONNX Encoders +==================== + +.. automodule:: uform.onnx_encoders + :members: + :undoc-members: + +==================== +NumPy Processors +==================== + +.. automodule:: uform.numpy_processors + :members: + :undoc-members: diff --git a/docs/reference.rst b/docs/reference.rst deleted file mode 100644 index 5828f41..0000000 --- a/docs/reference.rst +++ /dev/null @@ -1,6 +0,0 @@ -API Reference -============== - -.. automodule:: uform - :members: - :undoc-members: diff --git a/docs/swift/index.rst b/docs/swift/index.rst new file mode 100644 index 0000000..5f2e213 --- /dev/null +++ b/docs/swift/index.rst @@ -0,0 +1,6 @@ +==================== +Swift SDK +==================== + + +.. mdinclude:: ../../swift/README.md diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index a37b326..81af1ae 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -3,10 +3,17 @@ import { InferenceSession, Tensor } from 'onnxruntime-node'; import { PreTrainedTokenizer } from '@xenova/transformers'; import sharp from 'sharp'; -import { getModel, Modality } from "./hub.mjs"; - +/** + * A processor for text data that prepares input for the text encoder model. + */ class TextProcessor { + /** + * Constructs a new TextProcessor instance. + * + * @param {string} configPath - The path to the configuration file for the text encoder. + * @param {string} tokenizerPath - The path to the tokenizer configuration file. + */ constructor(configPath, tokenizerPath) { this.configPath = configPath; this.tokenizerPath = tokenizerPath; @@ -16,6 +23,9 @@ class TextProcessor { this.tokenizer = null; } + /** + * Initializes the TextProcessor by loading configurations and setting up the tokenizer. + */ async init() { var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' })); if (config.text_encoder !== undefined) { @@ -31,6 +41,12 @@ class TextProcessor { this.tokenizer.pad_token_id = this.padTokenIdx; } + /** + * Processes a list of text strings into model-ready format, including padding and attention masks. + * + * @param {Array} texts - An array of text strings to process. + * @return {Object} The processed texts as model input features. + */ async process(texts) { const encoded = await this.tokenizer(texts, { @@ -48,17 +64,31 @@ class TextProcessor { } } +/** + * An encoder for text data that uses a pre-trained model to encode text. + */ class TextEncoder { - constructor(modelPath, processor = null) { + /** + * Constructs a new TextEncoder instance. + * + * @param {string} modelPath - The path to the pre-trained ONNX model. + */ + constructor(modelPath) { this.modelPath = modelPath; this.session = null; } + /** + * Initializes the ONNX session with the pre-trained model. + */ async init() { this.session = await InferenceSession.create(this.modelPath); } + /** + * Releases the ONNX session resources. + */ async dispose() { if (this.session) { await this.session.release(); @@ -66,6 +96,12 @@ class TextEncoder { } } + /** + * Encodes the input data using the pre-trained model. + * + * @param {Object} inputs - The input data containing input_ids and attention_mask. + * @return {Object} The encoded outputs from the model. 
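+     * @throws {Error} If the encoder session has not been initialized via `init()`.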
+ */ async encode(inputs) { if (!this.session) { throw new Error("Session is not initialized."); @@ -109,12 +145,17 @@ class TextEncoder { } - +/** + * A processor for image data that prepares images for the image encoder model. + */ class ImageProcessor { constructor(configPath) { this.configPath = configPath; } + /** + * Initializes the ImageProcessor by loading configuration settings for image preprocessing. + */ async init() { var config = JSON.parse(readFileSync(this.configPath, 'utf8')); if (config.image_encoder !== undefined) { @@ -128,6 +169,12 @@ class ImageProcessor { this.imageMean = new Float32Array(this.normalizationMeans); this.imageStd = new Float32Array(this.normalizationDeviations); } + /** + * Processes raw image data into a model-ready format, including resizing, cropping, and normalizing. + * + * @param {Buffer|Array} images - A single image or an array of images to process. + * @return {Array} The processed image data as an array of Float32Arrays. + */ async process(images) { const processSingle = async (image) => { let img = sharp(image).toColorspace('srgb'); @@ -174,16 +221,25 @@ class ImageProcessor { } } +/** + * An encoder for image data that uses a pre-trained model to encode images. + */ class ImageEncoder { constructor(modelPath, processor) { this.modelPath = modelPath; this.imageSize = processor.imageSize; } + /** + * Initializes the ONNX session with the pre-trained model. + */ async init() { this.session = await InferenceSession.create(this.modelPath); } + /** + * Releases the ONNX session resources. + */ async dispose() { if (this.session) { await this.session.release(); @@ -191,6 +247,12 @@ class ImageEncoder { } } + /** + * Encodes the processed image data using the pre-trained model. + * + * @param {Float32Array|Array} images - The processed image data. + * @return {Object} The encoded outputs from the model. + */ async encode(images) { if (!this.session) { throw new Error("Session is not initialized."); @@ -220,7 +282,7 @@ class ImageEncoder { let dims; if (Array.isArray(images)) { - // Assuming each images in the array is a Float32Array representing an image already processed to a fixed size. + // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size. const arrays = images.map(ensureFloat32Array); imagesData = concatFloat32Arrays(arrays); const numImages = arrays.length; diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 8f0a30b..99d13c1 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -14,7 +14,7 @@ class Modality(Enum): TEXT_DECODER = "text_decoder" -def normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: +def _normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: if modalities is None: return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER) @@ -36,7 +36,7 @@ def get_checkpoint( :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path """ - modalities = normalize_modalities(modalities) + modalities = _normalize_modalities(modalities) # It is not recommended to use `.pth` extension when checkpointing models # because it collides with Python path (`.pth`) configuration files. 
@@ -98,10 +98,19 @@ def get_model_torch( device: Literal["cpu", "cuda"] = "cpu", modalities: Optional[Tuple[Union[str, Modality]]] = None, ) -> Tuple[Dict[Modality, Callable], Dict]: + """ + Fetches and constructs a PyTorch model with its processors based on provided modalities. + + :param model_name: The identifier of the model on the Hugging Face Hub. + :param token: Optional API token for authenticated access to the model. + :param device: The device to load the model onto ('cpu' or 'cuda'). + :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder). + :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. + """ from uform.torch_encoders import TextEncoder, ImageEncoder from uform.torch_processors import TextProcessor, ImageProcessor - modalities = normalize_modalities(modalities) + modalities = _normalize_modalities(modalities) config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt") result_processors = {} @@ -131,10 +140,19 @@ def get_model_onnx( token: Optional[str] = None, modalities: Optional[Tuple[str]] = None, ): + """ + Fetches and constructs an ONNX model with its processors based on provided modalities. + + :param model_name: The identifier of the model on the Hugging Face Hub. + :param device: The device on which the model will operate ('cpu' or 'cuda'). + :param token: Optional API token for authenticated access to the model. + :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder). + :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. + """ from uform.onnx_encoders import TextEncoder, ImageEncoder from uform.numpy_processors import TextProcessor, ImageProcessor - modalities = normalize_modalities(modalities) + modalities = _normalize_modalities(modalities) config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx") result_processors = {} @@ -163,7 +181,16 @@ def get_model( modalities: Optional[Tuple[str, Modality]] = None, # all by default token: Optional[str] = None, # optional HuggingFace Hub token for private models ) -> Tuple[Dict[Modality, Callable], Dict]: - + """ + Fetches a model and its processors from the Hugging Face Hub, using either the ONNX or Torch backend. + + :param model_name: The identifier of the model on the Hugging Face Hub. + :param device: The device to load the model onto ('cpu' or 'cuda'). + :param backend: The backend framework to use ('onnx' or 'torch'). + :param modalities: A tuple specifying the types of model components to fetch. + :param token: Optional API token for authenticated access to the model. + :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. 
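+
+    Example::
+
+        processors, models = get_model('unum-cloud/uform3-image-text-english-small', backend='onnx')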
+ """ if backend == "onnx": return get_model_onnx(model_name, device=device, token=token, modalities=modalities) elif backend == "torch": From c6f773c8249b8126e64393fff14d59c201d18b0b Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 23:50:47 +0100 Subject: [PATCH 36/40] Docs: Typo Co-authored-by: Joshua Lochner --- javascript/encoders_test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js index a0a70b2..1785703 100644 --- a/javascript/encoders_test.js +++ b/javascript/encoders_test.js @@ -101,7 +101,7 @@ async function tryImageEncoderForwardPass(modelId) { } function cosineSimilarity(vecA, vecB) { - // We may be receiving a complex tesnor type, so let's check if it + // We may be receiving a complex tensor type, so let's check if it // has an array member named `data`. if (vecA.data) { vecA = vecA.data; From 6d4b6149700a45fb117c841f1a92f6497cdc46a4 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 23:51:37 +0100 Subject: [PATCH 37/40] Improve: Backend-agnostic `.data` extraction in JS Co-authored-by: Joshua Lochner --- javascript/encoders_test.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js index 1785703..30ea96a 100644 --- a/javascript/encoders_test.js +++ b/javascript/encoders_test.js @@ -180,8 +180,8 @@ async function tryCrossReferencingImageAndText(modelId) { const textEmbedding = await textEncoder.encode(processedText); const imageEmbedding = await imageEncoder.encode(processedImage); - textEmbeddings.push(new Float32Array(textEmbedding.embeddings.cpuData)); - imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.cpuData)); + textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data)); + imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data)); // Print-based debugging at its best :) // console.log(`Text: ${text}, Image: ${imageUrl}`); From cf2516045a0e5c9f199c39907f000a2fb8c49fcb Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 23:54:17 +0100 Subject: [PATCH 38/40] Fix: `add_special_tokens` argument in JS Co-authored-by: Joshua Lochner --- javascript/encoders.mjs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs index 81af1ae..3c41636 100644 --- a/javascript/encoders.mjs +++ b/javascript/encoders.mjs @@ -50,8 +50,7 @@ class TextProcessor { async process(texts) { const encoded = await this.tokenizer(texts, { - addSpecialTokens: true, - returnAttentionMask: true, + add_special_tokens: true, padding: 'max_length', max_length: this.maxSeqLen, truncation: true, From 917a4a868e9450597f546f33c0a4e26083b83498 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 24 Apr 2024 23:59:16 +0000 Subject: [PATCH 39/40] Improve: Multi-GPU support in Py --- python/README.md | 28 ++++- python/scripts/test_encoders.py | 175 +++++++++++++++++++------------ python/uform/__init__.py | 10 +- python/uform/numpy_processors.py | 6 +- python/uform/onnx_encoders.py | 4 +- python/uform/shared.py | 26 +++++ python/uform/torch_encoders.py | 20 ++-- python/uform/torch_processors.py | 6 +- 8 files changed, 182 insertions(+), 93 deletions(-) create mode 100644 python/uform/shared.py diff --git a/python/README.md 
b/python/README.md index 621bee0..dd7611d 100644 --- a/python/README.md +++ b/python/README.md @@ -99,13 +99,33 @@ For that pick the encoder of the model you want to run in parallel, and wrap it ```python from uform import get_model, Modality +import torch.nn as nn -encoders, processors = uform.get_model('unum-cloud/uform-vl-english-small', backend='torch', device='gpu') +encoders, processors = uform.get_model('unum-cloud/uform-vl-english-small', backend='torch') -encoder_image = encoders[Modality.IMAGE_ENCODER] -encoder_image = nn.DataParallel(encoder_image) +model_text = models[Modality.TEXT_ENCODER] +model_image = models[Modality.IMAGE_ENCODER] +processor_text = processors[Modality.TEXT_ENCODER] +processor_image = processors[Modality.IMAGE_ENCODER] -_, res = encoder_image(images, 0) +model_text.return_features = False +model_image.return_features = False +model_text_parallel = nn.DataParallel(model_text) +model_image_parallel = nn.DataParallel(model_image) +``` + +Since we are now dealing with the PyTorch wrapper, make sure to use the `forward` method (instead of `encode`) to get the embeddings, and the `.detach().cpu().numpy()` sequence to bring the data back to more Pythonic NumPy arrays. + +```python +def get_image_embedding(images: List[Image]): + preprocessed = processor_image(images) + embedding = model_image_parallel.forward(preprocessed) + return embedding.detach().cpu().numpy() + +def get_text_embedding(texts: List[str]): + preprocessed = processor_text(texts) + embedding = model_text_parallel.forward(preprocessed) + return embedding.detach().cpu().numpy() ``` ### ONNX and CUDA diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py index 274ed6c..20caed2 100644 --- a/python/scripts/test_encoders.py +++ b/python/scripts/test_encoders.py @@ -1,3 +1,4 @@ +from functools import wraps from typing import Tuple import requests from io import BytesIO @@ -7,7 +8,7 @@ import numpy as np from PIL import Image -from uform import Modality, get_model, get_model_onnx +from uform import Modality, get_model, ExecutionProviderError # PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed try: @@ -49,6 +50,21 @@ token = file.read().strip() +def skip_on(exception, reason="No good reason :)"): + def decorator_func(f): + @wraps(f) + def wrapper(*args, **kwargs): + try: + # Try to run the test + return f(*args, **kwargs) + except exception: + pytest.skip(reason) + + return wrapper + + return decorator_func + + def cosine_similarity(x, y) -> float: if not isinstance(x, np.ndarray): x = x.detach().numpy() @@ -61,7 +77,7 @@ def cosine_similarity(x, y) -> float: return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) -def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding): +def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding, batch_size_multiple: int = 1): """Test if the embeddings of text and image are semantically similar using a small set of example text-image pairs.""" @@ -80,30 +96,27 @@ def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embed "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", ] + assert len(texts) == len(image_urls), "Number of texts and images should be the same." 
- text_embeddings = [] - image_embeddings = [] - - for text, image_url in zip(texts, image_urls): - # Download and open the image - response = requests.get(image_url) - image = Image.open(BytesIO(response.content)) + images = [Image.open(BytesIO(requests.get(image_url).content)) for image_url in image_urls] + count_pairs = len(texts) - # Get embeddings - text_embedding = text_to_embedding(text) - image_embedding = image_to_embedding(image) + # Ensure we have a sufficiently large batch + texts = texts * batch_size_multiple + images = images * batch_size_multiple - text_embeddings.append(text_embedding) - image_embeddings.append(image_embedding) + # Compute the embedding in a batch fashion + text_embeddings = text_to_embedding(texts) + image_embeddings = image_to_embedding(images) # Evaluate cosine similarity - for i in range(len(texts)): + for i in range(count_pairs): pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i]) other_text_similarities = [ - cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(len(texts)) if j != i + cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(count_pairs) if j != i ] other_image_similarities = [ - cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(len(texts)) if j != i + cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(count_pairs) if j != i ] assert pair_similarity > max( @@ -171,79 +184,109 @@ def test_torch_many_embeddings(model_name: str, batch_size: int): @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") @pytest.mark.parametrize("model_name", onnx_models) @pytest.mark.parametrize("device", ["CPUExecutionProvider"]) +@skip_on(ExecutionProviderError, reason="Missing execution provider") def test_onnx_one_embedding(model_name: str, device: str): - from uform.onnx_encoders import ExecutionProviderError - - try: - - processors, models = get_model(model_name, token=token, device=device, backend="onnx") - model_text = models[Modality.TEXT_ENCODER] - model_image = models[Modality.IMAGE_ENCODER] - processor_text = processors[Modality.TEXT_ENCODER] - processor_image = processors[Modality.IMAGE_ENCODER] - - text = "a small red panda in a zoo" - image_path = "assets/unum.png" + processors, models = get_model(model_name, token=token, device=device, backend="onnx") + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] - image = Image.open(image_path) - image_data = processor_image(image) - text_data = processor_text(text) + text = "a small red panda in a zoo" + image_path = "assets/unum.png" - image_features, image_embedding = model_image.encode(image_data) - text_features, text_embedding = model_text.encode(text_data) + image = Image.open(image_path) + image_data = processor_image(image) + text_data = processor_text(text) - assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" - assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" + image_features, image_embedding = model_image.encode(image_data) + text_features, text_embedding = model_text.encode(text_data) - # Nested fucntions are easier to debug, than lambdas - def get_image_embedding(image_data): - features, embedding = model_image.encode(processor_image(image_data)) - return embedding + assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" + assert 
text_embedding.shape[0] == 1, "Text embedding batch size is not 1" - def get_text_embedding(text_data): - features, embedding = model_text.encode(processor_text(text_data)) - return embedding + # Nested fucntions are easier to debug, than lambdas + def get_image_embedding(image_data): + features, embedding = model_image.encode(processor_image(image_data)) + return embedding - # Test if the model outputs actually make sense - cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding) + def get_text_embedding(text_data): + features, embedding = model_text.encode(processor_text(text_data)) + return embedding - except ExecutionProviderError as e: - pytest.skip(f"Execution provider error: {e}") + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding) @pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") @pytest.mark.parametrize("model_name", onnx_models) @pytest.mark.parametrize("batch_size", [1, 2]) @pytest.mark.parametrize("device", ["CPUExecutionProvider"]) +@skip_on(ExecutionProviderError, reason="Missing execution provider") def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str): - from uform.onnx_encoders import ExecutionProviderError + processors, models = get_model(model_name, token=token, device=device, backend="onnx") + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] - try: + texts = ["a small red panda in a zoo"] * batch_size + image_paths = ["assets/unum.png"] * batch_size + + images = [Image.open(path) for path in image_paths] + image_data = processor_image(images) + text_data = processor_text(texts) - processors, models = get_model(model_name, token=token, device=device, backend="onnx") - model_text = models[Modality.TEXT_ENCODER] - model_image = models[Modality.IMAGE_ENCODER] - processor_text = processors[Modality.TEXT_ENCODER] - processor_image = processors[Modality.IMAGE_ENCODER] + image_embeddings = model_image.encode(image_data, return_features=False) + text_embeddings = model_text.encode(text_data, return_features=False) - texts = ["a small red panda in a zoo"] * batch_size - image_paths = ["assets/unum.png"] * batch_size + assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" + assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" - images = [Image.open(path) for path in image_paths] - image_data = processor_image(images) - text_data = processor_text(texts) - image_embeddings = model_image.encode(image_data, return_features=False) - text_embeddings = model_text.encode(text_data, return_features=False) +@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") +@pytest.mark.parametrize("model_name", torch_models[:1]) +def test_torch_multi_gpu(model_name: str): - assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" - assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" + count_cuda_devices = torch.cuda.device_count() + if count_cuda_devices < 2: + pytest.skip("Not enough CUDA devices to run multi-GPU test") - except ExecutionProviderError as e: - pytest.skip(f"Execution provider error: {e}") + processors, models = get_model(model_name, token=token, backend="torch", device="cuda") + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] 
+ processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + + import torch.nn as nn + + model_text.return_features = False + model_image.return_features = False + model_text_parallel = nn.DataParallel(model_text) + model_image_parallel = nn.DataParallel(model_image) + + # Nested fucntions are easier to debug, than lambdas + def get_image_embedding(image_data): + preprocessed = processor_image(image_data) + embedding = model_image_parallel.forward(preprocessed) + return embedding.detach().cpu().numpy() + + def get_text_embedding(text_data): + preprocessed = processor_text(text_data) + embedding = model_text_parallel.forward(preprocessed) + return embedding.detach().cpu().numpy() + + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + get_text_embedding, + get_image_embedding, + batch_size_multiple=count_cuda_devices, + ) if __name__ == "__main__": - pytest.main(["-s", "-x", __file__]) + # If you want to run this test file individually, you can do so by running: + # pytest.main(["-s", "-x", __file__]) + pass diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 99d13c1..7af8b75 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,17 +1,9 @@ from os.path import join, exists from typing import Dict, Optional, Tuple, Literal, Union, Callable -from enum import Enum from huggingface_hub import snapshot_download, utils -from uform.onnx_encoders import ExecutionProviderError - - -class Modality(Enum): - TEXT_ENCODER = "text_encoder" - IMAGE_ENCODER = "image_encoder" - VIDEO_ENCODER = "video_encoder" - TEXT_DECODER = "text_decoder" +from uform.shared import ExecutionProviderError, Modality def _normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: diff --git a/python/uform/numpy_processors.py b/python/uform/numpy_processors.py index 3782c26..166ecf4 100644 --- a/python/uform/numpy_processors.py +++ b/python/uform/numpy_processors.py @@ -6,6 +6,8 @@ from tokenizers import Tokenizer import numpy as np +from uform.shared import read_config + class TextProcessor: def __init__(self, config_path: PathLike, tokenizer_path: PathLike): @@ -14,7 +16,7 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike): :param tokenizer_path: path to tokenizer file """ - config = json.load(open(config_path, "r")) + config = read_config(config_path) if "text_encoder" in config: config = config["text_encoder"] @@ -60,7 +62,7 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None): :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ - config = json.load(open(config_path, "r")) + config = read_config(config_path) if "image_encoder" in config: config = config["image_encoder"] diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py index d2668b9..b9c4cc4 100644 --- a/python/uform/onnx_encoders.py +++ b/python/uform/onnx_encoders.py @@ -5,9 +5,7 @@ import onnxruntime as ort from numpy import ndarray - -class ExecutionProviderError(Exception): - """Exception raised when a requested execution provider is not available.""" +from uform.shared import ExecutionProviderError def available_providers(device: Optional[str]) -> Tuple[str, ...]: diff --git a/python/uform/shared.py b/python/uform/shared.py new file mode 100644 index 0000000..37d256b --- /dev/null +++ b/python/uform/shared.py @@ -0,0 +1,26 @@ +from enum import Enum +from typing import Union +from os import PathLike +import json + + +class 
Modality(Enum): + TEXT_ENCODER = "text_encoder" + IMAGE_ENCODER = "image_encoder" + VIDEO_ENCODER = "video_encoder" + TEXT_DECODER = "text_decoder" + + +class ExecutionProviderError(Exception): + """Exception raised when a requested execution provider is not available.""" + + +ConfigOrPath = Union[PathLike, str, object] + + +def read_config(path_or_object: ConfigOrPath) -> object: + if isinstance(path_or_object, (PathLike, str)): + with open(path_or_object, "r") as f: + return json.load(f) + else: + return path_or_object diff --git a/python/uform/torch_encoders.py b/python/uform/torch_encoders.py index c149088..89f6631 100644 --- a/python/uform/torch_encoders.py +++ b/python/uform/torch_encoders.py @@ -3,7 +3,6 @@ from dataclasses import dataclass from os import PathLike from typing import Dict, Optional, Union, Mapping, Any, Tuple -import json import torch import torch.nn as nn @@ -11,6 +10,15 @@ from torch import Tensor from PIL.Image import Image +from uform.shared import read_config + + +def _is_on_gpu(model: nn.Module) -> bool: + try: + return next(model.parameters()).device.type == "cuda" + except StopIteration: + return False + @dataclass(eq=False) class Attention(nn.Module): @@ -266,7 +274,7 @@ def forward( attention_mask = torch.ones_like(x) # If the model is on the GPU and the input matrices are not, shift them there - if next(self.parameters()).device.type == "cuda" and x.device.type != "cuda": + if _is_on_gpu(self) and not x.is_cuda: x = x.cuda() attention_mask = attention_mask.cuda() @@ -298,8 +306,7 @@ def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, :param config: the configuration dictionary or path to the JSON configuration file :param model: the model state dictionary or path to the `.pt` model file """ - if isinstance(config, (PathLike, str)): - config = json.load(open(config, "r")) + config = read_config(config) if "text_encoder" in config: config = config["text_encoder"] @@ -374,7 +381,7 @@ def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None x = x["images"] # If the model is on the GPU and the input matrices are not, shift them there - if next(self.parameters()).device.type == "cuda" and x.device.type != "cuda": + if _is_on_gpu(self) and not x.is_cuda: x = x.cuda() features = self.forward_features(x) @@ -401,8 +408,7 @@ def from_pretrained( :param config: the configuration dictionary or path to the JSON configuration file :param model: the model state dictionary or path to the `.pt` model file """ - if isinstance(config, (PathLike, str)): - config = json.load(open(config, "r")) + config = read_config(config) if "image_encoder" in config: config = config["image_encoder"] diff --git a/python/uform/torch_processors.py b/python/uform/torch_processors.py index b61b224..79c7e87 100644 --- a/python/uform/torch_processors.py +++ b/python/uform/torch_processors.py @@ -15,6 +15,8 @@ ToTensor, ) +from uform.shared import read_config + # lambda is not pickle-able def convert_to_rgb(image): @@ -28,7 +30,7 @@ def __init__(self, config_path: PathLike, tokenizer_path: PathLike): :param tokenizer_path: path to tokenizer file """ - config = json.load(open(config_path, "r")) + config = read_config(config_path) if "text_encoder" in config: config = config["text_encoder"] @@ -75,7 +77,7 @@ def __init__(self, config_path: PathLike): :param config: model config """ - config = json.load(open(config_path, "r")) + config = read_config(config_path) if "image_encoder" in config: config = config["image_encoder"] From 
f195b667a49c0be802780f4075bc842dd932408a Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Thu, 25 Apr 2024 01:08:24 +0000 Subject: [PATCH 40/40] Add: Parallel decoding bench --- .vscode/launch.json | 14 +++++++- BENCHMARKS.md | 23 ++++++++---- python/scripts/bench_decoders.py | 61 ++++++++++++++++++++++---------- 3 files changed, 72 insertions(+), 26 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 3343a11..92a1844 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,12 +5,24 @@ "version": "0.2.0", "configurations": [ { - "name": "Python Debugger: Current File with Arguments", + "name": "Python Debugger", "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", }, + { + "name": "PyTest Debugger", + "type": "debugpy", + "request": "launch", + "program": "pytest", + "console": "integratedTerminal", + "args": [ + "${file}", + "-s", + "-x", + ], + }, { "name": "NodeJS Debugger", "type": "node-terminal", diff --git a/BENCHMARKS.md b/BENCHMARKS.md index aa61535..07ff0bb 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -157,17 +157,26 @@ usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE options: -h, --help show this help message and exit - --filter-out FILTER_OUT - Filter out models, backends, or devices with a Regular Expression. --batch-size BATCH_SIZE Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. + --max-length MAX_LENGTH + Maximum length of the generated text in tokens. ``` +On Nvidia H100 GPU, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. + +| Model | Size | Decoding Speed | Decoding Parallel Streams | +| :---------------------------------- | ----: | -------------: | ---------------------------: | +| `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 141 tokens/s | ~ 4 K tokens/s (32 streams) | +| `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 211 tokens/s | ~ 2 K tokens/s (32 streams) | +| `unum-cloud/uform-gen` | 1.5 B | ~ 252 tokens/s | ~ 3 K tokens/s (128 streams) | +| `unum-cloud/uform-gen2-dpo` | 1.2 B | ~ 372 tokens/s | ~ 10 K tokens/s (64 streams) | + On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. 
-| Model | Size | Speed | Speedup | -| :---------------------------------- | ---: | ------------------: | --------: | -| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | | -| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | | -| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ | +| Model | Size | Decoding Speed | Speedup | +| :---------------------------------- | ----: | -------------: | --------: | +| `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 40 tokens/s | | +| `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 40 tokens/s | | +| `unum-cloud/uform-gen` | 1.5 B | ~ 140 tokens/s | __x 3.5__ | diff --git a/python/scripts/bench_decoders.py b/python/scripts/bench_decoders.py index 4241ee6..0842ba9 100644 --- a/python/scripts/bench_decoders.py +++ b/python/scripts/bench_decoders.py @@ -34,8 +34,16 @@ class BenchmarkResult: duration_text_embedding: float -def caption(model, processor, prompt: str, image: Image.Image) -> str: - inputs = processor(prompt, image, return_tensors="pt") +def caption(model, processor, prompt: str, image: Image.Image, max_length: int, batch_size: int) -> List[str]: + # BLIP models require the prompt to be the first argument + prompt = [prompt] * batch_size + image = [image] * batch_size + try: + inputs = processor(prompt, image, return_tensors="pt") + except ValueError: + inputs = processor(image, prompt, return_tensors="pt") + + # Downcast and move to device for possible_key in ["images", "pixel_values"]: if possible_key not in inputs: continue @@ -47,16 +55,16 @@ def caption(model, processor, prompt: str, image: Image.Image) -> str: **inputs, do_sample=False, # use_cache=True, - max_new_tokens=128, + max_new_tokens=max_length, eos_token_id=32001, pad_token_id=processor.tokenizer.pad_token_id, ) prompt_len = inputs["input_ids"].shape[1] - decoded_text = processor.batch_decode( + decoded_texts = processor.batch_decode( output[:, prompt_len:], skip_special_tokens=True, - )[0].strip() - return decoded_text + ) + return decoded_texts def duration(callable): @@ -72,25 +80,34 @@ def bench_captions( processor, prompt: str, images: List[Image.Image], + max_length: int = 256, + batch_size: int = 10, ) -> List[str]: total_duration = 0 total_length = 0 model = torch.compile(model) - def caption_image(image, model=model, processor=processor, prompt=prompt): - return caption(model=model, processor=processor, prompt=prompt, image=image) + def caption_image(image): + return caption( + model=model, + processor=processor, + prompt=prompt, + image=image, + max_length=max_length, + batch_size=batch_size, + ) for image in images: - seconds, text = duration(partial(caption_image, image=image)) + seconds, captions = duration(partial(caption_image, image=image)) total_duration += seconds - total_length += len(text) + total_length += len(captions.strip()) if isinstance(captions, str) else sum(len(t.strip()) for t in captions) del model del processor print(f"Throughput: {total_length/total_duration:.2f} tokens/s") -def main(filter_out: str = None, batch_size: int = 10): +def main(batch_size: int = 10, max_length: int = 256): image_urls = [ "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", @@ -123,6 +140,8 @@ def main(filter_out: str = None, batch_size: int = 10): ), prompt="Describe the picture in great detail", images=images, + batch_size=batch_size, + max_length=max_length, ) print("UForm-Gen") @@ -138,6 +157,8 @@ def main(filter_out: str = None, 
batch_size: int = 10): ), prompt="[cap] Summarize the visual content of the image.", images=images, + batch_size=batch_size, + max_length=max_length, ) print("LLaVA") @@ -152,6 +173,8 @@ def main(filter_out: str = None, batch_size: int = 10): ), prompt="USER: \nWhat are these?\nASSISTANT:", images=images, + batch_size=batch_size, + max_length=max_length, ) print("InstructBLIP") @@ -166,24 +189,26 @@ def main(filter_out: str = None, batch_size: int = 10): ), prompt="Summarize the visual content of the image.", images=images, + batch_size=batch_size, + max_length=max_length, ) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--filter-out", - type=str, - default=None, - help="Filter out models, backends, or devices with a Regular Expression.", - ) parser.add_argument( "--batch-size", type=int, default=10, help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.", ) + parser.add_argument( + "--max-length", + type=int, + default=256, + help="Maximum length of the generated text in tokens.", + ) args = parser.parse_args() - main(batch_size=args.batch_size, max_length=args.max_length)
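To sanity-check the throughput figures in BENCHMARKS.md outside of `bench_decoders.py`, here is a minimal timing sketch. It assumes a Hugging Face checkpoint from the tables above and a processor that accepts `text`/`images` keyword arguments — the benchmark itself has to try both positional orders, so treat the exact call signature as an assumption rather than a guarantee.

```python
from time import perf_counter

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

# Assumption: the checkpoint ships its model and processor via `trust_remote_code`,
# as the UForm-Gen checkpoints on the Hugging Face Hub do.
model_id = "unum-cloud/uform-gen2-dpo"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

prompt = "Describe the picture in great detail"
image = Image.open("assets/unum.png")
inputs = processor(text=[prompt], images=[image], return_tensors="pt")

start = perf_counter()
with torch.inference_mode():
    output = model.generate(
        **inputs,
        do_sample=False,  # greedy decoding, as in the benchmark
        max_new_tokens=256,
        pad_token_id=processor.tokenizer.pad_token_id,
    )
elapsed = perf_counter() - start

# Count only the newly generated tokens, excluding the prompt.
generated_tokens = output.shape[1] - inputs["input_ids"].shape[1]
print(f"Throughput: {generated_tokens / elapsed:.2f} tokens/s")
```

For GPU measurements, move both the model and the inputs to the device before starting the timer; otherwise host-to-device transfers dominate the measurement.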
| Model                  | Parameters | Purpose                     | Architecture           |
| :--------------------- | ---------: | :-------------------------- | :--------------------- |
| `uform-gen2-dpo` 🆕    |      1.2 B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
| `uform-gen2-qwen-500m` |      1.2 B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
| `uform-gen` ⚠️         |      1.5 B | Image Captioning, VQA       | llama-1.3B, ViT-B/16   |