From 6b1867511a23f2cf41b6a05a7bc1d23f3e0921e6 Mon Sep 17 00:00:00 2001 From: Dickson Neoh Date: Tue, 29 Oct 2024 23:37:45 +0800 Subject: [PATCH] Add Phi 3.5 Vision Model (#43) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * initial implementation of phi35 * fix bug with molmo naming prefix * rename nb * Bump version: 0.1.2 → 0.1.3 --- nbs/molmo.ipynb | 98 ++++++++++ nbs/{vllm.ipynb => phi35.ipynb} | 332 ++++++++++++++++---------------- pyproject.toml | 4 +- xinfer/__init__.py | 2 +- xinfer/vllm/__init__.py | 1 + xinfer/vllm/molmo.py | 3 +- xinfer/vllm/phi35vision.py | 63 ++++++ 7 files changed, 328 insertions(+), 175 deletions(-) create mode 100644 nbs/molmo.ipynb rename nbs/{vllm.ipynb => phi35.ipynb} (88%) create mode 100644 xinfer/vllm/phi35vision.py diff --git a/nbs/molmo.ipynb b/nbs/molmo.ipynb new file mode 100644 index 0000000..7a562db --- /dev/null +++ b/nbs/molmo.ipynb @@ -0,0 +1,98 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import xinfer\n", + "\n", + "xinfer.list_models(interactive=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "model = xinfer.create_model(\n", + " \"vllm/allenai/Molmo-7B-D-0924\",\n", + " device=\"cuda\",\n", + " dtype=\"float16\",\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image1 = \"https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg\"\n", + "prompt1 = \"Describe this image.\"\n", + "\n", + "image2 = \"https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg\"\n", + "prompt2 = \"Create a pun based on the image.\"\n", + "\n", + "model.infer_batch([image1, image2], [prompt1, prompt2])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.print_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.infer(image1, prompt1, max_tokens=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.launch_gradio()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "xinfer", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/vllm.ipynb b/nbs/phi35.ipynb similarity index 88% rename from nbs/vllm.ipynb rename to nbs/phi35.ipynb index 30f3e8a..2ec5b96 100644 --- a/nbs/vllm.ipynb +++ b/nbs/phi35.ipynb @@ -1,5 +1,76 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from vllm import LLM, SamplingParams\n", + "from vllm.assets.image import ImageAsset\n", + "\n", + "def run_phi3v(question: str):\n", + " prompt = f\"<|user|>\\n<|image_1|>\\n{question}<|end|>\\n<|assistant|>\\n\"\n", + "\n", + " llm = LLM(\n", + " model=\"microsoft/Phi-3.5-vision-instruct\",\n", + " trust_remote_code=True,\n", + " max_model_len=4096,\n", + " max_num_seqs=2,\n", + " 
mm_processor_kwargs={\"num_crops\": 16},\n", + " )\n", + " stop_token_ids = None\n", + " return llm, prompt, stop_token_ids\n", + "\n", + "def get_multi_modal_input():\n", + " image = ImageAsset(\"cherry_blossom\").pil_image.convert(\"RGB\")\n", + " img_question = \"What is the content of this image?\"\n", + "\n", + " return {\n", + " \"data\": image,\n", + " \"question\": img_question,\n", + " }\n", + "\n", + "def main(num_prompts=4):\n", + " mm_input = get_multi_modal_input()\n", + " data = mm_input[\"data\"]\n", + " question = mm_input[\"question\"]\n", + "\n", + " llm, prompt, stop_token_ids = run_phi3v(question)\n", + "\n", + " sampling_params = SamplingParams(temperature=0.2,\n", + " max_tokens=64,\n", + " stop_token_ids=stop_token_ids)\n", + "\n", + " if num_prompts == 1:\n", + " # Single inference\n", + " inputs = {\n", + " \"prompt\": prompt,\n", + " \"multi_modal_data\": {\n", + " \"image\": data\n", + " },\n", + " }\n", + " else:\n", + " # Batch inference\n", + " inputs = [{\n", + " \"prompt\": prompt,\n", + " \"multi_modal_data\": {\n", + " \"image\": data\n", + " },\n", + " } for _ in range(num_prompts)]\n", + "\n", + " outputs = llm.generate(inputs, sampling_params=sampling_params)\n", + "\n", + " for i, o in enumerate(outputs):\n", + " generated_text = o.outputs[0].text\n", + " print(f\"Output {i + 1}:\")\n", + " print(generated_text)\n", + " print()\n", + "\n", + "if __name__ == \"__main__\":\n", + " main(num_prompts=4) # Change this number to adjust the batch size" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -9,9 +80,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "2024-10-24 00:47:00,503\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + "\u001b[32m2024-10-29 23:03:33.654\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.core\u001b[0m:\u001b[36mlist_models\u001b[0m:\u001b[36m55\u001b[0m - \u001b[1mShowing interactive table in Jupyter Notebook. Type in the search bar to filter the models.\u001b[0m\n" ] }, { @@ -168,7 +237,7 @@ { "data": { "text/html": [ - "\n", + "
\n", "\n", " \n", " \n", @@ -280,12 +349,12 @@ "\n" ], "text/plain": [ - " Implementation Model ID \\\n", - "0 timm bat_resnext26ts.ch_in1k \n", - "1 timm beit_base_patch16_224.in22k_ft_in22k_in1k \n", - "2 timm beit_base_patch16_384.in22k_ft_in22k_in1k \n", - "3 timm beit_large_patch16_224.in22k_ft_in22k_in1k \n", - "4 timm beit_large_patch16_384.in22k_ft_in22k_in1k \n", - "... ... ... \n", - "1258 transformers vikhyatk/moondream2 \n", - "1259 transformers sashakunitsyn/vlrm-blip2-opt-2.7b \n", - "1260 vllm allenai/Molmo-7B-D-0924 \n", - "1261 vllm allenai/Molmo-7B-O-0924 \n", - "1262 vllm allenai/Molmo-72B-0924 \n", + " Implementation Model ID \\\n", + "0 timm timm/bat_resnext26ts.ch_in1k \n", + "1 timm timm/beit_base_patch16_224.in22k_ft_in22k_in1k \n", + "2 timm timm/beit_base_patch16_384.in22k_ft_in22k_in1k \n", + "3 timm timm/beit_large_patch16_224.in22k_ft_in22k_in1k \n", + "4 timm timm/beit_large_patch16_384.in22k_ft_in22k_in1k \n", + "... ... ... \n", + "1281 ultralytics ultralytics/yolov11n \n", + "1282 vllm vllm/allenai/Molmo-7B-D-0924 \n", + "1283 vllm vllm/allenai/Molmo-7B-O-0924 \n", + "1284 vllm vllm/allenai/Molmo-72B-0924 \n", + "1285 vllm vllm/microsoft/Phi-3.5-vision-instruct \n", "\n", " Input --> Output \n", "0 image --> categories \n", @@ -317,13 +386,13 @@ "3 image --> categories \n", "4 image --> categories \n", "... ... \n", - "1258 image-text --> text \n", - "1259 image-text --> text \n", - "1260 image-text --> text \n", - "1261 image-text --> text \n", - "1262 image-text --> text \n", + "1281 image --> boxes \n", + "1282 image-text --> text \n", + "1283 image-text --> text \n", + "1284 image-text --> text \n", + "1285 image-text --> text \n", "\n", - "[1263 rows x 3 columns]" + "[1286 rows x 3 columns]" ] }, "execution_count": 1, @@ -346,95 +415,86 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-10-24 00:47:01.184\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m23\u001b[0m - \u001b[1mModel: allenai/Molmo-7B-D-0924\u001b[0m\n", - "\u001b[32m2024-10-24 00:47:01.185\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n", - "\u001b[32m2024-10-24 00:47:01.185\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1mDtype: float16\u001b[0m\n" + "\u001b[32m2024-10-29 23:03:33.746\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m63\u001b[0m - \u001b[1mModel: vllm/microsoft/Phi-3.5-vision-instruct\u001b[0m\n", + "\u001b[32m2024-10-29 23:03:33.748\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m64\u001b[0m - \u001b[1mDevice: cuda\u001b[0m\n", + "\u001b[32m2024-10-29 23:03:33.748\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxinfer.models\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m65\u001b[0m - \u001b[1mDtype: float16\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "INFO 10-24 00:47:01 config.py:1664] Downcasting torch.float32 to torch.float16.\n", - "INFO 10-24 00:47:04 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='/home/dnth/Desktop/cv-docker-images/image_captioning/molmo/molmo_7b_d_0924', speculative_config=None, tokenizer='/home/dnth/Desktop/cv-docker-images/image_captioning/molmo/molmo_7b_d_0924', skip_tokenizer_init=False, 
tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/home/dnth/Desktop/cv-docker-images/image_captioning/molmo/molmo_7b_d_0924, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, mm_processor_kwargs=None)\n", - "INFO 10-24 00:47:04 model_runner.py:1056] Starting to load model /home/dnth/Desktop/cv-docker-images/image_captioning/molmo/molmo_7b_d_0924...\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. You can run `pip install flash-attn` to use flash-attention backend.\n", - "WARNING 10-24 00:47:04 utils.py:513] Current `vllm-flash-attn` has a bug inside vision module, so we use xformers backend instead. 
You can run `pip install flash-attn` to use flash-attention backend.\n" + "INFO 10-29 23:03:35 config.py:107] Replacing legacy 'type' key with 'rope_type'\n", + "WARNING 10-29 23:03:35 config.py:114] Replacing legacy rope_type 'su' with 'longrope'\n", + "WARNING 10-29 23:03:36 config.py:1668] Casting torch.bfloat16 to torch.float16.\n", + "INFO 10-29 23:03:39 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='microsoft/Phi-3.5-vision-instruct', speculative_config=None, tokenizer='microsoft/Phi-3.5-vision-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=microsoft/Phi-3.5-vision-instruct, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, mm_processor_kwargs={'num_crops': 16})\n", + "INFO 10-29 23:03:41 selector.py:247] Cannot use FlashAttention-2 backend due to sliding window.\n", + "INFO 10-29 23:03:41 selector.py:115] Using XFormers backend.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00 Model Info \n", - "╭───────────────────────────┬─────────────────────────╮\n", - "│ Attribute Value │\n", - "├───────────────────────────┼─────────────────────────┤\n", - "│ Model ID allenai/Molmo-7B-D-0924 │\n", - "│ Device cuda │\n", - "│ Dtype torch.float16 │\n", - "│ Number of Inferences 2 │\n", - "│ Total Inference Time (ms) 1093.0564 │\n", - "│ Average Latency (ms) 546.5282 │\n", - "╰───────────────────────────┴─────────────────────────╯\n", - "\n" - ], - "text/plain": [ - "\u001b[3m Model Info \u001b[0m\n", - "╭───────────────────────────┬─────────────────────────╮\n", - "│\u001b[1m \u001b[0m\u001b[1mAttribute \u001b[0m\u001b[1m \u001b[0m│\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0m│\n", - "├───────────────────────────┼─────────────────────────┤\n", - "│\u001b[36m \u001b[0m\u001b[36mModel ID \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mallenai/Molmo-7B-D-0924\u001b[0m\u001b[35m \u001b[0m│\n", - "│\u001b[36m \u001b[0m\u001b[36mDevice \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mcuda \u001b[0m\u001b[35m \u001b[0m│\n", - "│\u001b[36m \u001b[0m\u001b[36mDtype \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mtorch.float16 \u001b[0m\u001b[35m \u001b[0m│\n", - "│\u001b[36m \u001b[0m\u001b[36mNumber of Inferences \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m│\n", - "│\u001b[36m \u001b[0m\u001b[36mTotal Inference Time (ms)\u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m1093.0564 \u001b[0m\u001b[35m \u001b[0m│\n", - "│\u001b[36m \u001b[0m\u001b[36mAverage Latency (ms) \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35m546.5282 \u001b[0m\u001b[35m \u001b[0m│\n", - 
"╰───────────────────────────┴─────────────────────────╯\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "model.print_stats()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 1.81it/s, est. speed input: 1775.51 toks/s, output: 18.10 toks/s]\n" + "prompt_token_ids (old) [1, 32010, 29871, 13, 29966, 29989, 3027, 29918, 29896, 29989, 29958, 13, 26270, 683, 445, 1967, 29889, 32007, 29871, 13, 32001]\n", + "prompt_token_ids (old) [1, 32010, 29871, 13, 29966, 29989, 3027, 29918, 29896, 29989, 29958, 13, 26270, 683, 445, 1967, 29889, 32007, 29871, 13, 32001]\n" ] }, { - "data": { - "text/plain": [ - "'This image features a highly detailed cartoon character in the'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.infer(image1, prompt1, max_tokens=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "* Running on local URL: http://127.0.0.1:7860\n", - "\n", - "To create a public link, set `share=True` in `launch()`.\n" + "Processed prompts: 100%|██████████| 2/2 [00:01<00:00, 1.74it/s, est. speed input: 3367.01 toks/s, output: 27.84 toks/s]\n" ] }, { "data": { - "text/html": [ - "
" - ], "text/plain": [ - "" + "['The image shows an individual with a blurred face, wearing a green',\n", + " 'The image shows a middle-aged woman with shoulder-length hair and glasses']" ] }, + "execution_count": 4, "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 1.33it/s, est. speed input: 1597.76 toks/s, output: 21.27 toks/s]\n" - ] + "output_type": "execute_result" } ], "source": [ - "model.launch_gradio()" + "model.infer_batch([image, image], [prompt, prompt])" ] }, { diff --git a/pyproject.toml b/pyproject.toml index ed0f48e..e33e5cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "xinfer" -version = "0.1.2" +version = "0.1.3" dynamic = [ "dependencies", ] @@ -48,7 +48,7 @@ universal = true [tool.bumpversion] -current_version = "0.1.2" +current_version = "0.1.3" commit = true tag = true diff --git a/xinfer/__init__.py b/xinfer/__init__.py index 330a5e0..c4e54a2 100644 --- a/xinfer/__init__.py +++ b/xinfer/__init__.py @@ -2,7 +2,7 @@ __author__ = """Dickson Neoh""" __email__ = "dickson.neoh@gmail.com" -__version__ = "0.1.2" +__version__ = "0.1.3" from .core import create_model, list_models from .model_registry import ModelInputOutput, register_model diff --git a/xinfer/vllm/__init__.py b/xinfer/vllm/__init__.py index bf9c18d..23cb120 100644 --- a/xinfer/vllm/__init__.py +++ b/xinfer/vllm/__init__.py @@ -1 +1,2 @@ from .molmo import Molmo +from .phi35vision import Phi35Vision diff --git a/xinfer/vllm/molmo.py b/xinfer/vllm/molmo.py index 9fbd565..1bb2101 100644 --- a/xinfer/vllm/molmo.py +++ b/xinfer/vllm/molmo.py @@ -26,9 +26,10 @@ def __init__( def load_model(self, **kwargs): self.model = LLM( - model=self.model_id, + model=self.model_id.replace("vllm/", ""), trust_remote_code=True, dtype=self.dtype, + max_model_len=4096, **kwargs, ) diff --git a/xinfer/vllm/phi35vision.py b/xinfer/vllm/phi35vision.py new file mode 100644 index 0000000..aa6b70d --- /dev/null +++ b/xinfer/vllm/phi35vision.py @@ -0,0 +1,63 @@ +from vllm import LLM, SamplingParams + +from ..model_registry import ModelInputOutput, register_model +from ..models import BaseModel, track_inference + + +@register_model( + "vllm/microsoft/Phi-3.5-vision-instruct", + "vllm", + ModelInputOutput.IMAGE_TEXT_TO_TEXT, +) +class Phi35Vision(BaseModel): + def __init__( + self, + model_id: str, + device: str = "cpu", + dtype: str = "float32", + **kwargs, + ): + super().__init__(model_id, device, dtype) + self.load_model(**kwargs) + + def load_model(self, **kwargs): + self.model = LLM( + model=self.model_id.replace("vllm/", ""), + trust_remote_code=True, + dtype=self.dtype, + max_model_len=4096, + max_num_seqs=2, + mm_processor_kwargs={"num_crops": 16}, + **kwargs, + ) + + @track_inference + def infer_batch(self, images: list[str], prompts: list[str], **sampling_kwargs): + images = self.parse_images(images) + + sampling_params = SamplingParams(**sampling_kwargs) + batch_inputs = [ + { + "prompt": f"<|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n", + "multi_modal_data": {"image": image}, + } + for image, prompt in zip(images, prompts) + ] + + results = self.model.generate(batch_inputs, sampling_params) + + return [output.outputs[0].text.strip() for output in results] + + @track_inference + def infer(self, image: str, prompt: str, **sampling_kwargs): + image = self.parse_images(image) + + inputs = { + "prompt": f"<|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n", + 
"multi_modal_data": {"image": image}, + } + + sampling_params = SamplingParams(**sampling_kwargs) + outputs = self.model.generate(inputs, sampling_params) + generated_text = outputs[0].outputs[0].text.strip() + return generated_text