Add Phi 3.5 Vision Model (#43)
* initial implementation of phi35

* fix bug with molmo naming prefix

* rename nb

* Bump version: 0.1.2 → 0.1.3
dnth authored Oct 29, 2024
1 parent ae9964b commit 6b18675
Showing 7 changed files with 328 additions and 175 deletions.
98 changes: 98 additions & 0 deletions nbs/molmo.ipynb
@@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import xinfer\n",
"\n",
"xinfer.list_models(interactive=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"model = xinfer.create_model(\n",
" \"vllm/allenai/Molmo-7B-D-0924\",\n",
" device=\"cuda\",\n",
" dtype=\"float16\",\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image1 = \"https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg\"\n",
"prompt1 = \"Describe this image.\"\n",
"\n",
"image2 = \"https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg\"\n",
"prompt2 = \"Create a pun based on the image.\"\n",
"\n",
"model.infer_batch([image1, image2], [prompt1, prompt2])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.print_stats()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.infer(image1, prompt1, max_tokens=10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.launch_gradio()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "xinfer",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
332 changes: 161 additions & 171 deletions nbs/vllm.ipynb → nbs/phi35.ipynb

Large diffs are not rendered by default.
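The renamed notebook's cells are not rendered above, so the following is only a rough sketch of how the newly registered model would likely be exercised, mirroring nbs/molmo.ipynb; the image URLs and prompts are placeholders rather than the notebook's actual contents:

import xinfer

# Placeholder sketch mirroring nbs/molmo.ipynb, not the actual phi35.ipynb cells.
model = xinfer.create_model(
    "vllm/microsoft/Phi-3.5-vision-instruct",  # model ID registered in this commit
    device="cuda",
    dtype="float16",
)

image1 = "https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg"
prompt1 = "Describe this image."

image2 = "https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg"
prompt2 = "Create a pun based on the image."

# Batch inference and stats, following the same API surface shown in molmo.ipynb.
model.infer_batch([image1, image2], [prompt1, prompt2])
model.print_stats()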

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "xinfer"
-version = "0.1.2"
+version = "0.1.3"
 dynamic = [
     "dependencies",
 ]
@@ -48,7 +48,7 @@ universal = true


 [tool.bumpversion]
-current_version = "0.1.2"
+current_version = "0.1.3"
 commit = true
 tag = true

2 changes: 1 addition & 1 deletion xinfer/__init__.py
@@ -2,7 +2,7 @@

 __author__ = """Dickson Neoh"""
 __email__ = "[email protected]"
-__version__ = "0.1.2"
+__version__ = "0.1.3"

 from .core import create_model, list_models
 from .model_registry import ModelInputOutput, register_model
1 change: 1 addition & 0 deletions xinfer/vllm/__init__.py
@@ -1 +1,2 @@
 from .molmo import Molmo
+from .phi35vision import Phi35Vision
3 changes: 2 additions & 1 deletion xinfer/vllm/molmo.py
@@ -26,9 +26,10 @@ def __init__(

     def load_model(self, **kwargs):
         self.model = LLM(
-            model=self.model_id,
+            model=self.model_id.replace("vllm/", ""),
             trust_remote_code=True,
             dtype=self.dtype,
+            max_model_len=4096,
             **kwargs,
         )

63 changes: 63 additions & 0 deletions xinfer/vllm/phi35vision.py
@@ -0,0 +1,63 @@
from vllm import LLM, SamplingParams

from ..model_registry import ModelInputOutput, register_model
from ..models import BaseModel, track_inference


@register_model(
"vllm/microsoft/Phi-3.5-vision-instruct",
"vllm",
ModelInputOutput.IMAGE_TEXT_TO_TEXT,
)
class Phi35Vision(BaseModel):
def __init__(
self,
model_id: str,
device: str = "cpu",
dtype: str = "float32",
**kwargs,
):
super().__init__(model_id, device, dtype)
self.load_model(**kwargs)

def load_model(self, **kwargs):
self.model = LLM(
model=self.model_id.replace("vllm/", ""),
trust_remote_code=True,
dtype=self.dtype,
max_model_len=4096,
max_num_seqs=2,
mm_processor_kwargs={"num_crops": 16},
**kwargs,
)

@track_inference
def infer_batch(self, images: list[str], prompts: list[str], **sampling_kwargs):
images = self.parse_images(images)

sampling_params = SamplingParams(**sampling_kwargs)
batch_inputs = [
{
"prompt": f"<|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n",
"multi_modal_data": {"image": image},
}
for image, prompt in zip(images, prompts)
]

results = self.model.generate(batch_inputs, sampling_params)

return [output.outputs[0].text.strip() for output in results]

@track_inference
def infer(self, image: str, prompt: str, **sampling_kwargs):
image = self.parse_images(image)

inputs = {
"prompt": f"<|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n",
"multi_modal_data": {"image": image},
}

sampling_params = SamplingParams(**sampling_kwargs)
outputs = self.model.generate(inputs, sampling_params)
generated_text = outputs[0].outputs[0].text.strip()
return generated_text
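Both infer and infer_batch forward **sampling_kwargs straight into vllm.SamplingParams, so generation settings can be supplied on the xinfer call itself. A minimal usage sketch; the keyword values below are illustrative and assume a CUDA device, not anything specified by this commit:

import xinfer

model = xinfer.create_model(
    "vllm/microsoft/Phi-3.5-vision-instruct",
    device="cuda",
    dtype="float16",
)

image = "https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg"

# max_tokens and temperature are standard SamplingParams fields in vLLM.
result = model.infer(image, "Describe this image.", max_tokens=64, temperature=0.2)
print(result)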
