Commit

add moondream vision service

mcvella committed Mar 11, 2024
1 parent 0956b37 commit e3546f8
Showing 7 changed files with 249 additions and 2 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -158,3 +158,8 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
viam-env/
module.tar.gz
.installed
test.py
skiing.jpg
74 changes: 72 additions & 2 deletions README.md
@@ -1,2 +1,72 @@
# moondream-vision
A Viam vision service that provides image classifications with the moondream tiny vision model
# moondream modular vision service

This module implements the [rdk vision API](https://github.com/rdk/vision-api) in a viam-labs:vision:moondream model.

This model leverages the [Moondream tiny vision language model](https://github.com/vikhyat/moondream) to allow for image classification and querying.

The Moondream model and inference run locally, so inference speed is highly dependent on hardware.

## Build and Run

To use this module, follow these instructions to [add a module from the Viam Registry](https://docs.viam.com/registry/configure/#add-a-modular-resource-from-the-viam-registry) and select the `viam-labs:vision:moondream` model from the [viam-labs moondream-vision module](https://app.viam.com/module/viam-labs/moondream-vision).

## Configure your vision service

> [!NOTE]
> Before configuring your vision service, you must [create a machine](https://docs.viam.com/manage/fleet/machines/#add-a-new-machine).

Navigate to the **Config** tab of your robot’s page in [the Viam app](https://app.viam.com/).
Click on the **Service** subtab and click **Create service**.
Select the `vision` type, then select the `viam-labs:vision:moondream` model.
Enter a name for your vision service and click **Create**.

On the new service panel, copy and paste the following attribute template into your vision service's **Attributes** box:

```json
{
"revision": "<optional model revision>"
}
```

> [!NOTE]
> For more information, see [Configure a Robot](https://docs.viam.com/manage/configuration/).

### Attributes

The following attributes are available for the `viam-labs:vision:moondream` model:

| Name | Type | Inclusion | Description |
| ---- | ---- | --------- | ----------- |
| `revision` | string | Optional | Moondream model revision to load. Defaults to `"2024-03-06"`. |

### Example Configurations

```json
{
"revision": "2024-03-06"
}
```

## API

The moondream resource provides the following methods from Viam's built-in [rdk:service:vision API](https://python.viam.dev/autoapi/viam/services/vision/client/index.html):

### get_classifications(image=*binary*, count)

### get_classifications_from_camera(camera_name=*string*, count)

Note: to use this method, any camera you reference must be listed in the `depends_on` array of the service configuration, for example:

```json
"depends_on": [
"cam"
]
```
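Putting this together, a full service entry might look like the sketch below (the names `moondream` and `cam` are placeholders for your own service and camera names):

```json
{
  "name": "moondream",
  "type": "vision",
  "namespace": "rdk",
  "model": "viam-labs:vision:moondream",
  "attributes": {
    "revision": "2024-03-06"
  },
  "depends_on": [
    "cam"
  ]
}
```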

By default, the Moondream model is asked the question "describe this image".
To ask a different question about the image, pass it as the extra parameter "question".
For example:

```python
moondream.get_classifications(image, 1, extra={"question": "what is the person wearing?"})
```
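The service returns the model's answer as the `class_name` of a single classification with confidence 1, so extracting the text is straightforward. A minimal sketch, where the `Classification` dataclass stands in for the objects the real service returns (the real ones also expose `class_name` and `confidence` fields):

```python
from dataclasses import dataclass
from typing import List, Optional

# Stand-in for the Classification objects the vision service returns
@dataclass
class Classification:
    class_name: str
    confidence: float

def answer_text(classifications: List[Classification]) -> Optional[str]:
    """Pull the model's answer out of a classification list (None if empty)."""
    if not classifications:
        return None
    return classifications[0].class_name

sample = [Classification("a person skiing down a snowy slope", 1.0)]
print(answer_text(sample))  # -> a person skiing down a snowy slope
```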
5 changes: 5 additions & 0 deletions requirements.txt
@@ -0,0 +1,5 @@
viam-sdk
transformers
timm
einops
pillow
20 changes: 20 additions & 0 deletions run.sh
@@ -0,0 +1,20 @@
#!/bin/bash
cd "$(dirname "$0")"

if [ -f .installed ]
then
    source viam-env/bin/activate
else
    python3 -m pip install --user virtualenv --break-system-packages
    python3 -m venv viam-env
    source viam-env/bin/activate
    pip3 install --upgrade -r requirements.txt
    if [ $? -eq 0 ]
    then
        touch .installed
    fi
fi

# Be sure to use `exec` so that termination signals reach the python process,
# or handle forwarding termination signals manually
exec python3 -m src "$@"
10 changes: 10 additions & 0 deletions src/__init__.py
@@ -0,0 +1,10 @@
"""
This file registers the model with the Python SDK.
"""

from viam.services.vision import Vision
from viam.resource.registry import Registry, ResourceCreatorRegistration

from .moondream import moondream

Registry.register_resource_creator(Vision.SUBTYPE, moondream.MODEL, ResourceCreatorRegistration(moondream.new, moondream.validate))
17 changes: 17 additions & 0 deletions src/__main__.py
@@ -0,0 +1,17 @@
import asyncio
import sys

from viam.module.module import Module
from viam.services.vision import Vision
from .moondream import moondream

async def main():
    """This function creates and starts a new module, after adding all desired resources.
    Resources must be pre-registered. For an example, see the `__init__.py` file.
    """
    module = Module.from_args()
    module.add_model_from_registry(Vision.SUBTYPE, moondream.MODEL)
    await module.start()

if __name__ == "__main__":
    asyncio.run(main())
120 changes: 120 additions & 0 deletions src/moondream.py
@@ -0,0 +1,120 @@
from typing import Any, ClassVar, Dict, Final, List, Mapping, Optional, Sequence, Tuple, Union, cast
from typing_extensions import Self

from PIL import Image

from viam.media.video import RawImage
from viam.proto.common import PointCloudObject
from viam.proto.service.vision import Classification, Detection
from viam.resource.types import RESOURCE_NAMESPACE_RDK, RESOURCE_TYPE_SERVICE, Subtype
from viam.utils import ValueTypes


from viam.module.types import Reconfigurable
from viam.proto.app.robot import ComponentConfig
from viam.proto.common import ResourceName, Vector3
from viam.resource.base import ResourceBase
from viam.resource.types import Model, ModelFamily

from viam.services.vision import Vision
from viam.components.camera import Camera
from viam.logging import getLogger

from transformers import AutoModelForCausalLM, AutoTokenizer

LOGGER = getLogger(__name__)

class moondream(Vision, Reconfigurable):
    """
    Vision represents a Vision service.
    """

    MODEL: ClassVar[Model] = Model(ModelFamily("viam-labs", "vision"), "moondream")

    model: AutoModelForCausalLM
    tokenizer: AutoTokenizer

    # Constructor
    @classmethod
    def new(cls, config: ComponentConfig, dependencies: Mapping[ResourceName, ResourceBase]) -> Self:
        my_class = cls(config.name)
        my_class.reconfigure(config, dependencies)
        return my_class

    # Validates JSON configuration
    @classmethod
    def validate(cls, config: ComponentConfig):
        return

    # Handles attribute reconfiguration
    def reconfigure(self, config: ComponentConfig, dependencies: Mapping[ResourceName, ResourceBase]):
        self.DEPS = dependencies
        model_id = "vikhyatk/moondream2"
        revision = config.attributes.fields["revision"].string_value or "2024-03-06"
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id, trust_remote_code=True, revision=revision
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
        return

    # Not implemented, use the classification methods instead
    async def get_detections_from_camera(
        self, camera_name: str, *, extra: Optional[Mapping[str, Any]] = None, timeout: Optional[float] = None
    ) -> List[Detection]:
        raise NotImplementedError

    # Not implemented, use the classification methods instead
    async def get_detections(
        self,
        image: Union[Image.Image, RawImage],
        *,
        extra: Optional[Mapping[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> List[Detection]:
        raise NotImplementedError

    async def get_classifications_from_camera(
        self,
        camera_name: str,
        count: int,
        *,
        extra: Optional[Mapping[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> List[Classification]:
        actual_cam = self.DEPS[Camera.get_resource_name(camera_name)]
        cam = cast(Camera, actual_cam)
        cam_image = await cam.get_image(mime_type="image/jpeg")
        return await self.get_classifications(cam_image, count, extra=extra)

    async def get_classifications(
        self,
        image: Union[Image.Image, RawImage],
        count: int,
        *,
        extra: Optional[Mapping[str, Any]] = None,
        timeout: Optional[float] = None,
    ) -> List[Classification]:
        classifications = []
        # Ask the default question unless one was passed in via extra
        question = "describe this image"
        if extra is not None and extra.get("question") is not None:
            question = extra["question"]
        enc_image = self.model.encode_image(image)
        result = self.model.answer_question(enc_image, question, self.tokenizer)
        classifications.append(Classification(class_name=result, confidence=1))
        return classifications

    # Not implemented
    async def get_object_point_clouds(
        self, camera_name: str, *, extra: Optional[Mapping[str, Any]] = None, timeout: Optional[float] = None
    ) -> List[PointCloudObject]:
        raise NotImplementedError

    async def do_command(self, command: Mapping[str, ValueTypes], *, timeout: Optional[float] = None) -> Mapping[str, ValueTypes]:
        return
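The question handling in `get_classifications` above amounts to a small fallback: use the caller's `extra["question"]` when present, otherwise the default prompt. Isolated as a helper for clarity (the function name is ours, not part of the module):

```python
DEFAULT_QUESTION = "describe this image"

def resolve_question(extra):
    """Mirror the service's fallback: prefer extra["question"], else the default prompt."""
    if extra is not None and extra.get("question") is not None:
        return extra["question"]
    return DEFAULT_QUESTION

print(resolve_question(None))                                         # -> describe this image
print(resolve_question({"question": "what is the person wearing?"}))  # -> what is the person wearing?
```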
