Add Phi 3.5 Vision Model (#43)
* initial implementation of phi35

* fix bug with molmo naming prefix

* rename nb

* Bump version: 0.1.2 → 0.1.3
dnth authored Oct 29, 2024
1 parent ae9964b commit 6b18675
Showing 7 changed files with 328 additions and 175 deletions.
98 changes: 98 additions & 0 deletions nbs/molmo.ipynb
@@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import xinfer\n",
"\n",
"xinfer.list_models(interactive=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"model = xinfer.create_model(\n",
" \"vllm/allenai/Molmo-7B-D-0924\",\n",
" device=\"cuda\",\n",
" dtype=\"float16\",\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image1 = \"https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg\"\n",
"prompt1 = \"Describe this image.\"\n",
"\n",
"image2 = \"https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg\"\n",
"prompt2 = \"Create a pun based on the image.\"\n",
"\n",
"model.infer_batch([image1, image2], [prompt1, prompt2])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.print_stats()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.infer(image1, prompt1, max_tokens=10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.launch_gradio()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "xinfer",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
332 changes: 161 additions & 171 deletions nbs/vllm.ipynb → nbs/phi35.ipynb

Large diffs are not rendered by default.
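The renamed notebook's cells are not rendered above, so the following is only a rough sketch of how the newly registered model would likely be exercised, mirroring nbs/molmo.ipynb; the image URLs and prompts are placeholders rather than the notebook's actual contents:

import xinfer

# Placeholder sketch mirroring nbs/molmo.ipynb, not the actual phi35.ipynb cells.
model = xinfer.create_model(
    "vllm/microsoft/Phi-3.5-vision-instruct",  # model ID registered in this commit
    device="cuda",
    dtype="float16",
)

image1 = "https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg"
prompt1 = "Describe this image."

image2 = "https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg"
prompt2 = "Create a pun based on the image."

# Batch inference and stats, following the same API surface shown in molmo.ipynb.
model.infer_batch([image1, image2], [prompt1, prompt2])
model.print_stats()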

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "xinfer"
-version = "0.1.2"
+version = "0.1.3"
 dynamic = [
     "dependencies",
 ]
@@ -48,7 +48,7 @@ universal = true


 [tool.bumpversion]
-current_version = "0.1.2"
+current_version = "0.1.3"
 commit = true
 tag = true

2 changes: 1 addition & 1 deletion xinfer/__init__.py
@@ -2,7 +2,7 @@

 __author__ = """Dickson Neoh"""
 __email__ = "[email protected]"
-__version__ = "0.1.2"
+__version__ = "0.1.3"

 from .core import create_model, list_models
 from .model_registry import ModelInputOutput, register_model
1 change: 1 addition & 0 deletions xinfer/vllm/__init__.py
@@ -1 +1,2 @@
 from .molmo import Molmo
+from .phi35vision import Phi35Vision
3 changes: 2 additions & 1 deletion xinfer/vllm/molmo.py
@@ -26,9 +26,10 @@ def __init__(

     def load_model(self, **kwargs):
         self.model = LLM(
-            model=self.model_id,
+            model=self.model_id.replace("vllm/", ""),
             trust_remote_code=True,
             dtype=self.dtype,
+            max_model_len=4096,
             **kwargs,
         )

63 changes: 63 additions & 0 deletions xinfer/vllm/phi35vision.py
@@ -0,0 +1,63 @@
from vllm import LLM, SamplingParams

from ..model_registry import ModelInputOutput, register_model
from ..models import BaseModel, track_inference


@register_model(
"vllm/microsoft/Phi-3.5-vision-instruct",
"vllm",
ModelInputOutput.IMAGE_TEXT_TO_TEXT,
)
class Phi35Vision(BaseModel):
def __init__(
self,
model_id: str,
device: str = "cpu",
dtype: str = "float32",
**kwargs,
):
super().__init__(model_id, device, dtype)
self.load_model(**kwargs)

def load_model(self, **kwargs):
self.model = LLM(
model=self.model_id.replace("vllm/", ""),
trust_remote_code=True,
dtype=self.dtype,
max_model_len=4096,
max_num_seqs=2,
mm_processor_kwargs={"num_crops": 16},
**kwargs,
)

@track_inference
def infer_batch(self, images: list[str], prompts: list[str], **sampling_kwargs):
images = self.parse_images(images)

sampling_params = SamplingParams(**sampling_kwargs)
batch_inputs = [
{
"prompt": f"<|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n",
"multi_modal_data": {"image": image},
}
for image, prompt in zip(images, prompts)
]

results = self.model.generate(batch_inputs, sampling_params)

return [output.outputs[0].text.strip() for output in results]

@track_inference
def infer(self, image: str, prompt: str, **sampling_kwargs):
image = self.parse_images(image)

inputs = {
"prompt": f"<|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n",
"multi_modal_data": {"image": image},
}

sampling_params = SamplingParams(**sampling_kwargs)
outputs = self.model.generate(inputs, sampling_params)
generated_text = outputs[0].outputs[0].text.strip()
return generated_text
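Both infer and infer_batch forward **sampling_kwargs straight into vllm.SamplingParams, so generation settings can be supplied on the xinfer call itself. A minimal usage sketch; the keyword values below are illustrative and assume a CUDA device, not anything specified by this commit:

import xinfer

model = xinfer.create_model(
    "vllm/microsoft/Phi-3.5-vision-instruct",
    device="cuda",
    dtype="float16",
)

image = "https://raw.githubusercontent.com/vikhyat/moondream/main/assets/demo-1.jpg"

# max_tokens and temperature are standard SamplingParams fields in vLLM.
result = model.infer(image, "Describe this image.", max_tokens=64, temperature=0.2)
print(result)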
