diff --git a/florence2-sam2/index.html b/florence2-sam2/index.html index db7b1090..132eb3c8 100644 --- a/florence2-sam2/index.html +++ b/florence2-sam2/index.html @@ -411,6 +411,24 @@ + + +
  • + + + fine_tune + + + +
  • + +
  • + + + load_base + + +
  • @@ -667,6 +685,24 @@ + + +
  • + + + fine_tune + + + +
  • + +
  • + + + load_base + + +
  • @@ -961,6 +997,40 @@

    + + +

    + fine_tune(checkpoint) + +#

    + + +
    + +

    Load the fine-tuned Florence-2 model.

    + +
    + + + +
    + + +

    + load_base() + +#

    + + +
    + +

    Load the base Florence-2 model.

    + +
    + +
    + diff --git a/objects.inv b/objects.inv index d119075f..3c78b8c2 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/search/search_index.json b/search/search_index.json index 0c13bdfa..eae1d345 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Vision Agent Tools Documentation","text":"

    This repository contains tools that solve vision problems. These tools can be used in conjunction with the vision-agent.

    "},{"location":"clip_media_sim/","title":"CLIPMediaSim","text":""},{"location":"clip_media_sim/#video-similarity","title":"Video similarity","text":"
    import cv2\nfrom PIL import Image\n\nfrom vision_agent_tools.models.clip_media_sim import CLIPMediaSim\n\n# Path to your target image\nimage_path = \"path/to/your/image.jpg\"\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the image\ntarget_image = Image.open(image_path)\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nfps = cap.get(cv2.CAP_PROP_FPS)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Calculate video timestamps\nvideo_time = len(frames) / fps\n\n# Create the CLIPMediaSim instance\nclip_media_sim = CLIPMediaSim()\n\n# Run video similarity against the target image\nresults = clip_media_sim(video=frames, target_image=target_image)\n\n# The results should be a list of [index_of_frame, confidence_score] where the\n# video is similar to the target image.\n\n# To find the time at which a given frame happens, you can do the following\n\ntime_per_frame = video_time / len(frames)\n\ntimestamp = results[0][0] * time_per_frame\n\nprint(\"Similarity detection complete!\")\n

    You can also run similarity against a target text doing the following:

    results = clip_media_sim(video=frames, target_text=\"a turtle holding the earth\")\n
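    The call also accepts a thresh argument (documented below) to filter out low-similarity frames; a small sketch, with the 0.5 value chosen arbitrarily:
    results = clip_media_sim(video=frames, target_image=target_image, thresh=0.5)\n# Only frames whose similarity to the target exceeds the threshold are returned\n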
    "},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim","title":"CLIPMediaSim","text":"

    Bases: BaseMLModel

    A class that receives a video and a target image or text and returns the frames that are most similar to the target.

    "},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__call__","title":"__call__(video, target_image=None, target_text=None, thresh=0.3)","text":"

    Receives a video and a target image or text and returns the frames that are most similar to the target.

    Parameters:

    Name Type Description Default video ndarray

    The input video to be processed.

    required target_image Image | None

    The target image to compare the video frames with.

    None target_text str | None

    The target text to compare the video frames with.

    None thresh float

    The threshold to filter the results. Defaults to 0.3.

    0.3"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__init__","title":"__init__(device='cuda')","text":"

    Initializes the CLIPMediaSim object with a pre-trained CLIP model.

    "},{"location":"controlnet_aux/","title":"Controlnet-Aux","text":""},{"location":"controlnet_aux/#pose-detector","title":"Pose Detector","text":"
    from PIL import Image\nfrom vision_agent_tools.models.controlnet_aux import Image2Pose\n\n# Path to your test image\ntest_image_path = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image_path)\n# Create the Image2Pose instance\nimage_2_pose = Image2Pose()\n\n# Run pose detection and get the results\nresults = image_2_pose(image)\n\n# Optional: Save the result image (assuming results is a PIL Image)\n# results.save(\"result.png\")\n\nprint(\"Pose detection complete!\")\n
    Pose Detection Result"},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose","title":"Image2Pose","text":"

    A class that simplifies human pose detection using a pre-trained Openpose model.

    This class provides a convenient way to run pose detection on images using a pre-trained Openpose model from the controlnet_aux library. It takes a PIL Image object as input and returns the predicted pose information.

    "},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__call__","title":"__call__(image)","text":"

    Performs pose detection on a PIL image and returns the results.

    This method takes a PIL Image object as input and runs the loaded Openpose detector on it. The predicted pose information is then resized to match the original image size and returned.

    Parameters:

    Name Type Description Default image Image

    The input image for pose detection.

    required

    Returns:

    Type Description Image

    PIL.Image: The image with the predicted pose information (format might vary depending on the specific OpenposeDetector implementation).

    "},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__init__","title":"__init__()","text":"

    Initializes the Image2Pose object with a pre-trained Openpose detector.

    This method loads a pre-trained Openpose model from the specified model hub (\"lllyasviel/Annotators\" in this case). The loaded detector is stored as an attribute for future use.

    "},{"location":"depth_anything_v2/","title":"Depth-Anything-V2","text":"

    This example demonstrates using the Depth-Anything-V2 tool for depth estimation on images.

    from PIL import Image\nfrom vision_agent_tools.models.depth_anything_v2 import DepthAnythingV2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the depth map estimation model.\ndepth_estimate = DepthAnythingV2()\n\n# Run the inference\nresults = depth_estimate(image)\n\n# Let's print the obtained depth map\nprint(results.map)\n
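    As a small follow-up to the example above, the grayscale flag documented below returns the depth map normalized to [0, 255] as uint8; a minimal sketch reusing the loaded image:
    gray_results = depth_estimate(image, grayscale=True)\n\n# Per the documentation, the map is normalized to [0, 255] and converted to uint8\nprint(gray_results.map.dtype)\n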
    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2","title":"DepthAnythingV2","text":"

    Bases: BaseMLModel

    Model for depth estimation using the Depth-Anything-V2 model from the paper Depth Anything V2.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__call__","title":"__call__(image, grayscale=False)","text":"

    Depth-Anything-V2 is a highly practical solution for robust monocular depth estimation.

    Parameters:

    Name Type Description Default image Union[str, Image, ndarray]

    The input image for depth estimation. Can be a file path, a PIL Image, or a NumPy array.

    required grayscale bool

    Whether to return the depth map as a grayscale image. If True, the depth map will be normalized to the range [0, 255] and converted to uint8. Defaults to False.

    False

    Returns:

    Name Type Description DepthMap DepthMap

    An object type containing a numpy array with the HxW depth map of the image.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__init__","title":"__init__()","text":"

    Initializes the Depth-Anything-V2 model.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthMap","title":"DepthMap","text":"

    Bases: BaseModel

    Represents the depth map of an image.

    Attributes:

    Name Type Description map Any

    HxW raw depth map of the image.

    "},{"location":"florence2-qa/","title":"FlorenceQA","text":"

    This example demonstrates using the Florence2-QA tool to answer questions about images.

    NOTE: The FlorenceQA model can only be used in GPU environments.

    from PIL import Image\nfrom vision_agent_tools.models.florence2_qa import FlorenceQA\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image and initialize the FlorenceQA model\nimage = Image.open(test_image)\nrun_florence_qa = FlorenceQA()\n\n# Time to put FlorenceQA to work! Let's pose a question about the image\nanswer = run_florence_qa(image, question=\"Is there a dog in the image?\")\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA","title":"FlorenceQA","text":"

    Bases: BaseMLModel

    FlorenceQA is a tool that combines the Florence-2 and Roberta QA models to answer questions about images.

    NOTE: The Florence-2 model can only be used in GPU environments.

    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__call__","title":"__call__(image, question)","text":"

    FlorenceQA model answers questions about images.

    Parameters:

    Name Type Description Default image Image

    The image to be analyzed.

    required question str

    The question to be answered.

    required

    Returns:

    Name Type Description str dict[str, Any]

    The answer to the question.

    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__init__","title":"__init__()","text":"

    Initializes the FlorenceQA model.

    "},{"location":"florence2-sam2/","title":"Florence2Sam2","text":"

    This tool uses Florence2 and the SAM-2 model to perform text-prompted instance segmentation on image or video inputs.

    import cv2\n\nfrom vision_agent_tools.models.florence2_sam2 import Florence2SAM2\n\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Create the Florence2SAM2 instance\nflorence2_sam2 = Florence2SAM2()\n\n# Segment all the instances of the prompt \"ball\" for all video frames\nresults = florence2_sam2(prompt=\"ball\", video=frames)\n\n# Returns a list of lists where the outer list represents the frames and the inner\n# list contains all the predictions per frame. The annotation ID can be used\n# to track the same object across different frames. For example:\n# [\n#     [\n#         {\n#             \"id\": 0,\n#             \"mask\": rle,\n#             \"label\": \"ball\",\n#             \"bbox\": [x_min, y_min, x_max, y_max]\n#         }\n#     ],\n#     [\n#         {\n#             \"id\": 0,\n#             \"mask\": rle,\n#             \"label\": \"ball\",\n#             \"bbox\": [x_min, y_min, x_max, y_max]\n#         }\n#     ]\n# ]\n\nprint(\"Instance segmentation complete!\")\n

    You can also run it on images and additionally get bounding boxes, as follows:

    results = florence2_sam2(prompt=\"ball\", images=[image])\n
    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2","title":"Florence2SAM2","text":"

    Bases: BaseMLModel

    A class that receives a video or images plus a text prompt, and returns the instance segmentation for each frame based on the input.

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__call__","title":"__call__(prompt, images=None, video=None, *, chunk_length_frames=20, iou_threshold=0.6, nms_threshold=0.3)","text":"

    The Florence2Sam2 model finds objects in images and tracks objects in a video.

    Parameters:

    Name Type Description Default prompt str

    The text input that complements the media to find or track objects.

    required images list[Image] | None

    The images to be analyzed.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None chunk_length_frames int | None

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    20 iou_threshold float

    The IoU threshold value used to compare last_predictions and new_predictions objects.

    0.6 nms_threshold float

    The non-maximum suppression threshold value used to filter the Florence2 predictions.

    0.3

    Returns:

    Type Description list[list[dict[str, Any]]]

    list[list[dict[str, Any]]]: A list where each item represents a frame's predictions. [[{ \"id\": 0, \"mask\": rle, \"label\": \"car\", \"bbox\": [0.1, 0.2, 0.3, 0.4] }]]

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__init__","title":"__init__(model_config=Florence2SAM2Config())","text":"

    Initializes the Florence2SAM2 object with a pre-trained Florence2 model and a SAM2 model.

    "},{"location":"florence2/","title":"Florence-2","text":"

    This example demonstrates using the Florence2 tool to interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.

    from PIL import Image\nfrom vision_agent_tools.shared_types import PromptTask\nfrom vision_agent_tools.models.florence2 import Florence2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Choose the task that you are planning to use\ntask_prompt = PromptTask.CAPTION\n\n# Load the image and initialize the Florence2 model\nimage = Image.open(test_image)\nmodel = Florence2()\n\n# Time to put Florence2 to work! Let's see what it finds...\nresults = model(images=[image], task=task_prompt)\n\n# Print the output result\nprint(f\"The image contains: {results[0]}\")\n
    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2","title":"Florence2","text":"

    Bases: BaseMLModel

    Florence2 model. It supports both zero-shot and fine-tuned settings. For zero-shot we use Florence-2-large; for fine-tuning we use Florence-2-base-ft. This model can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__call__","title":"__call__(task, prompt='', images=None, video=None, *, batch_size=5, nms_threshold=0.3, chunk_length_frames=None)","text":"

    Performs inference on the Florence-2 model based on the provided task, images or video, and prompt.

    Parameters:

    Name Type Description Default task PromptTask

    The task to be performed on the images or video.

    required prompt Optional[str]

    The text input that complements the prompt task.

    '' images list[Image] | None

    A list of images for the model to process. None if using video.

    None video VideoNumpy | None

    A NumPy representation of the video for inference. None if using images.

    None batch_size int

    The batch size used for processing multiple images or video frames.

    5 nms_threshold float

    The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).

    0.3 chunk_length_frames int | None

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    None

    Returns:

    Name Type Description Florence2ResponseType Florence2ResponseType

    The output of the Florence-2 model based on the task and prompt.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__init__","title":"__init__(model_config=Florence2Config())","text":"

    Initializes the Florence2 model.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.fine_tune","title":"fine_tune(checkpoint)","text":"

    Load the fine-tuned Florence-2 model.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.load_base","title":"load_base()","text":"

    Load the base Florence-2 model.

    "},{"location":"flux1/","title":"Flux1","text":"

    This example demonstrates using the Flux1 model to perform tasks such as image generation and mask inpainting based on text prompts.

    "},{"location":"flux1/#parameters","title":"Parameters","text":""},{"location":"flux1/#flux1config","title":"Flux1Config","text":"

    Below is an example of how to create and use a Flux1Config object:

    from vision_agent_tools.models.flux1 import Flux1Config\n\nconfig = Flux1Config(\n    height=512,\n    width=512,\n    num_inference_steps=28,\n    guidance_scale=3.5,\n    num_images_per_prompt=1,\n    max_sequence_length=512,\n    seed=42\n)\n
    "},{"location":"flux1/#perform-image-generation","title":"Perform image generation","text":"
    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# To perform image generation (config is the Flux1Config instance created above)\nflux1 = Flux1()\n\ngenerated_images = flux1(\n    task=Flux1Task.IMAGE_GENERATION,  # Image Generation Task\n    prompt=\"A purple car in a futuristic cityscape\",\n    config=config\n)\ngenerated_images[0].save(\"generated_car.png\")\n
    "},{"location":"flux1/#perform-mask-inpainting","title":"Perform mask inpainting","text":"

    To perform mask inpainting, both the original image and the mask image need to be provided. These images must have the same dimensions. The mask should clearly delineate the areas that you want to modify in the original image. Additionally, the inpainting process includes a strength parameter, which controls the intensity of the modifications applied to the masked areas.

    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have a cat image named \"cat_image.jpg\" that you want to use for mask inpainting\nimage_to_edit = Image.open(\"path/to/your/cat_image.jpg\").convert(\"RGB\")  # Image to inpaint\n\n# Make sure to provide a mask image with the same dimensions, delineating the cat\nmask_image = Image.open(\"path/to/your/mask.png\")  # Mask image indicating areas to change\n\n# Set a new prompt for inpainting\ninpainting_prompt = \"A cute dog\"\n\n# To perform image mask inpainting\nflux1 = Flux1()\n\ninpainted_images = flux1(\n    task=Flux1Task.MASK_INPAINTING,  # Image Mask Inpainting Task\n    prompt=inpainting_prompt,\n    image=image_to_edit,\n    mask_image=mask_image,\n    config=config\n)\n\ninpainted_images[0].save(\"inpainted_dog_over_cat.png\")\n
    "},{"location":"flux1/#perform-image-to-image-generation","title":"Perform image-to-image generation","text":"

    To perform image-to-image generation, you need to provide an original image along with a text prompt describing the desired modifications. The original image serves as the base, and the model will generate a new image based on the prompt.

    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have an original image named \"original_image.jpg\" that you want to use for image-to-image generation\noriginal_image = Image.open(\"path/to/your/original_image.jpg\").convert(\"RGB\")  # Original image\n\n# Set a new prompt for image-to-image generation\nimage_to_image_prompt = \"A sunny beach with palm trees\"\n\n# To perform image-to-image generation\nflux1 = Flux1()\n\ngenerated_images = flux1(\n    task=Flux1Task.IMAGE_TO_IMAGE,  # Image-to-Image Generation Task\n    prompt=image_to_image_prompt,\n    image=original_image,\n    config=config\n)\n\ngenerated_images[0].save(\"generated_beach.png\")\n
    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1","title":"Flux1","text":"

    Bases: BaseMLModel

    Tool for image generation using the pre-trained Flux1 model. This tool takes a prompt as input and generates an image using the Flux1 model.

    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__call__","title":"__call__(prompt=Field(max_length=512), task=Flux1Task.IMAGE_GENERATION, config=Flux1Config(), image=None, mask_image=None)","text":"

    Performs image generation, mask inpainting, or image-to-image generation using the Flux1 model.

    Parameters:

    Name Type Description Default - prompt (str

    The text prompt describing the desired modifications.

    required - task (Flux1Task

    The task to perform using the model: image generation (\"generation\"), mask inpainting (\"inpainting\"), or image-to-image generation (\"img2img\").

    required - config (Flux1Config required - image (Image.Image

    The original image to be modified.

    required - mask_image (Image.Image

    The mask image indicating areas to be inpainted.

    required

    Returns:

    Type Description List[Image] | None

    List[Image.Image]: The list of generated image(s) if successful; None if an error occurred.

    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__init__","title":"__init__(hf_model='black-forest-labs/FLUX.1-schnell', dtype=torch.bfloat16, enable_sequential_cpu_offload=True)","text":"

    Initializes the Flux1 image generation tool. Loads the pre-trained Flux1 model from HuggingFace and sets model configurations.

    Parameters:

    Name Type Description Default - task (Flux1Task

    The task to perform using the model: either image generation (\"generation\") or mask inpainting (\"inpainting\").

    required - model_config

    The configuration for the model, hf_model, and device.

    required - dtype (torch.dtype

    The data type to use for the model.

    required - enable_sequential_cpu_offload (bool

    Whether to enable sequential CPU offload.

    required"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1Config","title":"Flux1Config","text":"

    Bases: BaseModel

    Configuration for the Flux1 model.

    "},{"location":"internlm_xcomposer2/","title":"InternLM-XComposer-2.5","text":"

    This example demonstrates how to use the InternLM-XComposer-2.5 tool to answer questions about images or videos.

    NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.

    import cv2\nimport numpy as np\n\nfrom vision_agent_tools.models.internlm_xcomposer2 import InternLMXComposer2\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the InternLMXComposer2 model\nrun_inference = InternLMXComposer2()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put InternLMXComposer2 to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2","title":"InternLMXComposer2","text":"

    Bases: BaseMLModel

    InternLM-XComposer-2.5 is a tool that excels in various text-image comprehension and composition applications, achieving GPT-4V level capabilities.

    NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.

    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__call__","title":"__call__(prompt, image=None, video=None, frames=MAX_NUMBER_OF_FRAMES, chunk_length=None)","text":"

    InternLMXComposer2 model answers questions about a video or image.

    Parameters:

    Name Type Description Default prompt str

    The prompt with the question to be answered.

    required image Image | None

    The image to be analyzed.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None frames int

    The number of frames to be used from the video.

    MAX_NUMBER_OF_FRAMES chunk_length int

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    None

    Returns:

    Type Description list[str]

    list[str]: The answers to the prompt.

    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__init__","title":"__init__()","text":"

    Initializes the InternLMXComposer2.5 model.

    "},{"location":"nsfw_classification/","title":"NSFW (Not Safe for Work) classification","text":"

    This example demonstrates using the Not Safe for Work classification tool.

    from PIL import Image\nfrom vision_agent_tools.models.nsfw_classification import NSFWClassification\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the NSFW model.\nnsfw_classification = NSFWClassification()\n\n# Run the inference\nresults = nsfw_classification(image)\n\n# Let's print the predicted label\nprint(results.label)\n
    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification","title":"NSFWClassification","text":"

    Bases: BaseMLModel

    The primary intended use of this model is for the classification of NSFW (Not Safe for Work) images.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__call__","title":"__call__(image)","text":"

    Performs the NSFW inference on an image using the NSFWClassification model.

    Parameters:

    Name Type Description Default image Image

    The input image for object detection.

    required

    Returns:

    Name Type Description NSFWInferenceData NSFWInferenceData

    The inference result from the NSFWClassification model. label (str): The label for the unsafe content detected in the image. score (float): The score for the unsafe content detected in the image.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__init__","title":"__init__()","text":"

    Initializes the NSFW (Not Safe for Work) classification tool.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWInferenceData","title":"NSFWInferenceData","text":"

    Bases: BaseModel

    Represents an inference result from the NSFWClassification model.

    Attributes:

    Name Type Description label str

    The predicted label for the image.

    score float

    The confidence score associated with the prediction (between 0 and 1).

    "},{"location":"nshot_counting/","title":"LOCA (Low-shot Object Counting network with iterative prototype Adaptation).","text":"

    This example demonstrates how to use the NShot LOCA tool for object counting in images.

    from PIL import Image\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the counting model and choose the image output size you expect.\nobject_counting = NShotCounting(zero_shot=False, img_size=512)\n\n# Run the inference\nresults = object_counting(image, bbox=[12, 34, 56, 78])\n\n# Let's find out how many objects were found in total\nprint(f\"Found a total count of {results.count} objects on the image!\")\n
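    The counter also supports the documented zero_shot mode, where no exemplar bounding box is needed; a minimal sketch reusing the image loaded above:
    # Zero-shot counting: no bbox exemplar is passed\nzero_shot_counter = NShotCounting(zero_shot=True, img_size=512)\nzero_shot_results = zero_shot_counter(image)\nprint(f\"Zero-shot count: {zero_shot_results.count}\")\n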
    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.CountingDetection","title":"CountingDetection","text":"

    Bases: BaseModel

    Represents an inference result from the LOCA model.

    Attributes:

    Name Type Description count int

    The predicted number of detected objects.

    masks list[Any]

    A list of numpy arrays representing the masks of the detected objects in the image.

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting","title":"NShotCounting","text":"

    Bases: BaseMLModel

    Model for object counting using the zeroshot and n-shot versions of the LOCA model from the paper A Low-Shot Object Counting Network With Iterative Prototype Adaptation .

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__call__","title":"__call__(image, bbox=None)","text":"

    LOCA injects shape and appearance information into object queries to precisely count objects of various sizes in densely and sparsely populated scenarios. It also extends to a zeroshot scenario and achieves excellent localization and count errors across the entire low-shot spectrum.

    Parameters:

    Name Type Description Default image Image

    The input image for object detection.

    required bbox BoundingBox

    A list of four ints representing the bounding box coordinates (xmin, ymin, xmax, ymax) of the detected query in the image.

    None

    Returns:

    Name Type Description CountingDetection CountingDetection

    An object type containing: - The count of the objects found similar to the bbox query. - A list of numpy arrays representing the masks of the objects found.

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__init__","title":"__init__(zero_shot=True, img_size=512)","text":"

    Initializes the LOCA model.

    Parameters:

    Name Type Description Default img_size int

    Size of the input image.

    512"},{"location":"owlv2/","title":"OWLv2 Open-World Localization","text":"

    This example demonstrates using the Owlv2 tool for object detection in images based on text prompts.

    from PIL import Image\nfrom vision_agent_tools.models.owlv2 import Owlv2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# What are you looking for? Write your detective prompts here!\nprompts = [\"a photo of a cat\", \"a photo of a dog\"]\n\n# Load the image and create your Owlv2 detective tool\nimage = Image.open(test_image)\nowlv2 = Owlv2()\n\n# Time to put Owlv2 to work! Let's see what it finds...\nresults = owlv2(prompts=prompts, images=[image])[0]\n\n# Did Owlv2 sniff out any objects? Let's see the results!\nif results:\n    for detection in results:\n        print(f\"Found it! It looks like a {detection['label']} with a confidence of {detection['score']:.2f}.\")\n        print(f\"Here's where it's hiding: {detection['bbox']}\")\nelse:\n    print(\"Hmm, Owlv2 couldn't find anything this time. Maybe try a different prompt?\")\n
    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2","title":"Owlv2","text":"

    Bases: BaseMLModel

    Tool for object detection using the pre-trained Owlv2 model from Transformers.

    This tool takes images and a prompt as input, performs object detection using the Owlv2 model, and returns a list of objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding a threshold.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__call__","title":"__call__(prompts, images=None, video=None, *, batch_size=1, nms_threshold=0.3, confidence=0.1)","text":"

    Performs object detection on images using the Owlv2 model.

    Parameters:

    Name Type Description Default prompts list[str]

    The prompt to be used for object detection.

    required images list[Image] | None

    The images to be analyzed.

    None video VideoNumpy[uint8] | None

    A numpy array containing the different images, representing the video.

    None batch_size int

    The batch size used for processing multiple images or video frames.

    1 nms_threshold float

    The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).

    0.3 confidence float

    Confidence threshold for model predictions.

    0.1

    Returns:

    Type Description list[ODWithScoreResponse]

    list[ODWithScoreResponse]: A list of ODWithScoreResponse objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding the threshold. The item will be None if no objects are detected above the confidence threshold for a specific image/frame.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__init__","title":"__init__(model_config=OWLV2Config())","text":"

    Loads the pre-trained Owlv2 processor and model from Transformers.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS","title":"Owlv2ProcessorWithNMS","text":"

    Bases: Owlv2Processor

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS.post_process_object_detection_with_nms","title":"post_process_object_detection_with_nms(outputs, *, threshold=0.1, nms_threshold=0.3, target_sizes=None)","text":"

    Converts the raw output of [OwlViTForObjectDetection] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.

    Parameters:

    Name Type Description Default outputs OwlViTObjectDetectionOutput

    Raw outputs of the model.

    required threshold float

    Score threshold to keep object detection predictions.

    0.1 nms_threshold float

    IoU threshold to filter overlapping objects from the raw detections.

    0.3 target_sizes TensorType | list[Tuple] | None

    Tensor of shape (batch_size, 2) or list of tuples (Tuple[int, int]) containing the target size (height, width) of each image in the batch. If unset, predictions will not be resized.

    None

    Returns: list[dict]: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model.
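    A minimal sketch of how this post-processing could be wired into the standard transformers OWLv2 flow; the checkpoint name and the loading pattern are assumptions here rather than something this documentation prescribes:
    import torch\nfrom PIL import Image\nfrom transformers import Owlv2ForObjectDetection\nfrom vision_agent_tools.models.owlv2 import Owlv2ProcessorWithNMS\n\n# Assumed checkpoint name; any OWLv2 checkpoint should follow the same pattern\ncheckpoint = \"google/owlv2-base-patch16-ensemble\"\nprocessor = Owlv2ProcessorWithNMS.from_pretrained(checkpoint)\nmodel = Owlv2ForObjectDetection.from_pretrained(checkpoint)\n\nimage = Image.open(\"path/to/your/image.jpg\")\ninputs = processor(text=[[\"a photo of a cat\"]], images=image, return_tensors=\"pt\")\nwith torch.no_grad():\n    outputs = model(**inputs)\n\n# Convert the raw outputs into final boxes, applying NMS on top of the score threshold\ndetections = processor.post_process_object_detection_with_nms(\n    outputs,\n    threshold=0.1,\n    nms_threshold=0.3,\n    target_sizes=torch.tensor([image.size[::-1]]),  # (height, width)\n)\nprint(detections[0][\"scores\"], detections[0][\"labels\"], detections[0][\"boxes\"])\n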

    "},{"location":"qr_reader/","title":"QR Reader","text":"

    Tool for detecting QR codes in images.

    from PIL import Image, ImageDraw\n\nfrom vision_agent_tools.models.qr_reader import QRReader\n\n# Open the image containing the QR code\nimage = Image.open(\"sample_qr_image.jpeg\")\n\n# Create a QR code reader object\nqr_reader = QRReader()\n\n# Detect QR codes in the image\ndetections = qr_reader(image)\n\n\nif detections:\n\n    detection = detections[0]\n    draw = ImageDraw.Draw(image)\n\n    # Print the detected text\n    print(f\"Decoded Text: {detection.text}\")\n\n    # Draw the bounding box\n    x_min, y_min, x_max, y_max = (\n        int(detection.bbox[0]),\n        int(detection.bbox[1]),\n        int(detection.bbox[2]),\n        int(detection.bbox[3]),\n    )\n    draw.rectangle(((x_min, y_min), (x_max, y_max)), outline=\"red\", width=2)\n\n    # Draw the text on top of the image\n    draw.text((x_min + 10, y_min - 10), detection.text, fill=\"blue\", anchor=\"mm\")\n    image.show()\nelse:\n    print(\"No QR codes detected in the image.\")\n
    Displaying the Detection Result"},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRCodeDetection","title":"QRCodeDetection","text":"

    Bases: BaseModel

    Represents a detected QR code.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader","title":"QRReader","text":"

    Bases: BaseMLModel

    This tool utilizes the qreader library to detect QR codes within an input image. It returns a list of QRCodeDetection objects for each detected QR code, containing the decoded text, confidence score, polygon coordinates, bounding box, and center point.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader.__call__","title":"__call__(image)","text":"

    Detects QR codes in an image.

    Parameters:

    Name Type Description Default image Image

    The input image for QR code detection.

    required

    Returns:

    Type Description list[QRCodeDetection]

    list[QRCodeDetection]: A list of QRCodeDetection objects containing information about each detected QR code, or an empty list if none are found.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader.__init__","title":"__init__()","text":"

    Initializes the QR code reader tool.

    Loads the QReader instance for QR code detection.

    "},{"location":"qwen2_vl/","title":"Qwen2-VL","text":"

    This example demonstrates how to use the Qwen2-VL model to answer questions about images or videos.

    NOTE: The Qwen2-VL model should be used in GPU environments.

    import cv2\nimport numpy as np\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n    frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the Qwen2VL model\nrun_inference = Qwen2VL()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put Qwen2VL to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
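    Per the images parameter documented below, the same model can also answer questions about still images; a minimal sketch (the image path is a placeholder):
    from PIL import Image\n\nimage = Image.open(\"path/to/your/image.jpg\")\nanswer = run_inference(images=[image], prompt=\"Describe this image in detail\")\nprint(answer)\n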
    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL","title":"Qwen2VL","text":"

    Bases: BaseMLModel

    Qwen2-VL is a model that is capable of accurately identifying and comprehending the content within images, regardless of their clarity, resolution, or extreme aspect ratios.

    NOTE: The Qwen2-VL model should be used in GPU environments.

    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__call__","title":"__call__(prompt=None, images=None, video=None, frames=MAX_NUMBER_OF_FRAMES)","text":"

    Qwen2-VL model answers questions about a video or image.

    Parameters:

    Name Type Description Default prompt str

    The prompt with the question to be answered.

    None images list[Image]

    A list of images for the model to process. None if using video.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None frames int

    The number of frames to be used from the video.

    MAX_NUMBER_OF_FRAMES

    Returns:

    Type Description list[str]

    list[str]: The answers to the prompt.

    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__init__","title":"__init__(model_config=None)","text":"

    Initializes the Qwen2-VL model.

    "},{"location":"shared_model_manager/","title":"Shared Model Manager","text":"

    The SharedModelManager class is designed to manage and facilitate the use of machine learning models across different devices, such as CPUs and GPUs, within an asynchronous environment. It ensures safe and efficient execution of these models, particularly in scenarios where GPU resources need to be shared exclusively among multiple models. The manager coordinates access to the shared GPU, preventing conflicts when multiple models require it. Models are only loaded into memory when needed using the fetch_model function.

    The usage example demonstrates adding models and then using them with their respective functionalities.

    \u26a0\ufe0f \u2755: We should ALWAYS add model instances to the pool on CPU. This avoids overwhelming GPU memory; the model pool will automatically move a model to the GPU when it is fetched.

    from PIL import Image\n\n# NOTE: the import paths for OWLV2Config and Device are assumed here\nfrom vision_agent_tools.models.qr_reader import QRReader\nfrom vision_agent_tools.models.owlv2 import Owlv2, OWLV2Config\nfrom vision_agent_tools.shared_types import Device\nfrom vision_agent_tools.tools.shared_model_manager import SharedModelManager\n\nmodel_pool = SharedModelManager()\n\n# Add model instances to the pool\nmodel_pool.add(QRReader())\nmodel_pool.add(Owlv2(model_config=OWLV2Config(device=Device.CPU)))\n\n# Use QRReader model\nasync def use_qr_reader():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    qr_reader = await model_pool.fetch_model(QRReader.__name__)\n    detections = qr_reader(image)\n    # Process detections ...\n\n# Use Owlv2 model\nasync def use_owlv2():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    owlv2 = await model_pool.fetch_model(Owlv2.__name__)\n    prompts = [\"a photo of a cat\", \"a photo of a dog\"]\n    results = owlv2(prompts=prompts, images=[image])\n    # Process results ...\n
    "},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager","title":"SharedModelManager","text":""},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.add","title":"add(model)","text":"

    Adds a model to the pool with a device preference.

    Parameters:

    Name Type Description Default model Basetool

    The model instance to be added to the pool; it should implement the BaseTool interface.

    required device Device

    The preferred device for the model.

    required

    Returns:

    Name Type Description str str

    The model ID to be used for fetching the model.

    "},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.fetch_model","title":"fetch_model(model_id)","text":"

    Retrieves a model from the pool for safe execution.

    Parameters:

    Name Type Description Default model_id str

    Id to access the model in the pool.

    required

    Returns:

    Name Type Description Any BaseTool

    The retrieved model instance.

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Vision Agent Tools Documentation","text":"

    This repository contains tools that solve vision problems. These tools can be used in conjunction with the vision-agent.

    "},{"location":"clip_media_sim/","title":"CLIPMediaSim","text":""},{"location":"clip_media_sim/#video-similarity","title":"Video similarity","text":"
    import cv2\nfrom PIL import Image\n\nfrom vision_agent_tools.models.clip_media_sim import CLIPMediaSim\n\n# Path to your target image\nimage_path = \"path/to/your/image.jpg\"\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the image\ntarget_image = Image.open(image_path)\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nfps = cap.get(cv2.CAP_PROP_FPS)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Calculate video timestamps\nvideo_time = len(frames) / fps\n\n# Create the CLIPMediaSim instance\nclip_media_sim = CLIPMediaSim()\n\n# Run video similarity against the target image\nresults = clip_media_sim(video=frames, target_image=target_image)\n\n# The results should be a list of [index_of_frame, confidence_score] where the\n# video is similar to the target image.\n\n# To find the time at which a given frame happens, you can do the following\n\ntime_per_frame = video_time / len(frames)\n\ntimestamp = results[0][0] * time_per_frame\n\nprint(\"Similarity detection complete!\")\n

    You can also run similarity against a target text doing the following:

    results = clip_media_sim(video=frames, target_text=\"a turtle holding the earth\")\n
    "},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim","title":"CLIPMediaSim","text":"

    Bases: BaseMLModel

    A class that receives a video and a target image or text and returns the frames that are most similar to the target.

    "},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__call__","title":"__call__(video, target_image=None, target_text=None, thresh=0.3)","text":"

    Receives a video and a target image or text and returns the frames that are most similar to the target.

    Parameters:

    Name Type Description Default video ndarray

    The input video to be processed.

    required target_image Image | None

    The target image to compare the video frames with.

    None target_text str | None

    The target text to compare the video frames with.

    None thresh float

    The threshold to filter the results. Defaults to 0.3.

    0.3"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__init__","title":"__init__(device='cuda')","text":"

    Initializes the CLIPMediaSim object with a pre-trained CLIP model.

    "},{"location":"controlnet_aux/","title":"Controlnet-Aux","text":""},{"location":"controlnet_aux/#pose-detector","title":"Pose Detector","text":"
    from PIL import Image\nfrom vision_agent_tools.models.controlnet_aux import Image2Pose\n\n# Path to your test image\ntest_image_path = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image_path)\n# Create the Image2Pose instance\nimage_2_pose = Image2Pose()\n\n# Run pose detection and get the results\nresults = image_2_pose(image)\n\n# Optional: Save the result image (assuming results is a PIL Image)\n# results.save(\"result.png\")\n\nprint(\"Pose detection complete!\")\n
    Pose Detection Result"},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose","title":"Image2Pose","text":"

    A class that simplifies human pose detection using a pre-trained Openpose model.

    This class provides a convenient way to run pose detection on images using a pre-trained Openpose model from the controlnet_aux library. It takes a PIL Image object as input and returns the predicted pose information.

    "},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__call__","title":"__call__(image)","text":"

    Performs pose detection on a PIL image and returns the results.

    This method takes a PIL Image object as input and runs the loaded Openpose detector on it. The predicted pose information is then resized to match the original image size and returned.

    Parameters:

    Name Type Description Default image Image

    The input image for pose detection.

    required

    Returns:

    Type Description Image

    PIL.Image: The image with the predicted pose information (format might vary depending on the specific OpenposeDetector implementation).

    "},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__init__","title":"__init__()","text":"

    Initializes the Image2Pose object with a pre-trained Openpose detector.

    This method loads a pre-trained Openpose model from the specified model hub (\"lllyasviel/Annotators\" in this case). The loaded detector is stored as an attribute for future use.

    "},{"location":"depth_anything_v2/","title":"Depth-Anything-V2","text":"

    This example demonstrates using the Depth-Anything-V2 tool for depth estimation on images.

    from PIL import Image\nfrom vision_agent_tools.models.depth_anything_v2 import DepthAnythingV2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the depth map estimation model.\ndepth_estimate = DepthAnythingV2()\n\n# Run the inference\nresults = depth_estimate(image)\n\n# Let's print the obtained depth map\nprint(results.map)\n
    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2","title":"DepthAnythingV2","text":"

    Bases: BaseMLModel

    Model for depth estimation using the Depth-Anything-V2 model from the paper Depth Anything V2.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__call__","title":"__call__(image, grayscale=False)","text":"

    Depth-Anything-V2 is a highly practical solution for robust monocular depth estimation.

    Parameters:

    Name Type Description Default image Union[str, Image, ndarray]

    The input image for depth estimation. Can be a file path, a PIL Image, or a NumPy array.

    required grayscale bool

    Whether to return the depth map as a grayscale image. If True, the depth map will be normalized to the range [0, 255] and converted to uint8. Defaults to False.

    False

    Returns:

    Name Type Description DepthMap DepthMap

    An object type containing a numpy array with the HxW depth map of the image.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__init__","title":"__init__()","text":"

    Initializes the Depth-Anything-V2 model.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthMap","title":"DepthMap","text":"

    Bases: BaseModel

    Represents the depth map of an image.

    Attributes:

    Name Type Description map Any

    HxW raw depth map of the image.

    "},{"location":"florence2-qa/","title":"FlorenceQA","text":"

    This example demonstrates using the Florence2-QA tool to answer questions about images.

    NOTE: The FlorenceQA model can only be used in GPU environments.

    from PIL import Image\nfrom vision_agent_tools.models.florence2_qa import FlorenceQA\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image and initialize the FlorenceQA model\nimage = Image.open(test_image)\nrun_florence_qa = FlorenceQA()\n\n# Time to put FlorenceQA to work! Let's pose a question about the image\nanswer = run_florence_qa(image, question=\"Is there a dog in the image?\")\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA","title":"FlorenceQA","text":"

    Bases: BaseMLModel

    FlorenceQA is a tool that combines the Florence-2 and Roberta QA models to answer questions about images.

    NOTE: The Florence-2 model can only be used in GPU environments.

    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__call__","title":"__call__(image, question)","text":"

    FlorenceQA model answers questions about images.

    Parameters:

    Name Type Description Default image Image

    The image to be analyzed.

    required question str

    The question to be answered.

    required

    Returns:

    Name Type Description str dict[str, Any]

    The answer to the question.

    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__init__","title":"__init__()","text":"

    Initializes the FlorenceQA model.

    "},{"location":"florence2-sam2/","title":"Florence2Sam2","text":"

    This tool uses Florence2 and the SAM-2 model to perform text-prompted instance segmentation on image or video inputs.

    import cv2\n\nfrom vision_agent_tools.models.florence2_sam2 import Florence2SAM2\n\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Create the Florence2SAM2 instance\nflorence2_sam2 = Florence2SAM2()\n\n# Segment all the instances of the prompt \"ball\" for all video frames\nresults = florence2_sam2(prompt=\"ball\", video=frames)\n\n# Returns a list of lists where the outer list represents the frames and the inner\n# list contains all the predictions per frame. The annotation ID can be used\n# to track the same object across different frames. For example:\n# [\n#     [\n#         {\n#             \"id\": 0,\n#             \"mask\": rle,\n#             \"label\": \"ball\",\n#             \"bbox\": [x_min, y_min, x_max, y_max]\n#         }\n#     ],\n#     [\n#         {\n#             \"id\": 0,\n#             \"mask\": rle,\n#             \"label\": \"ball\",\n#             \"bbox\": [x_min, y_min, x_max, y_max]\n#         }\n#     ]\n# ]\n\nprint(\"Instance segmentation complete!\")\n

    You can also run it on images and additionally get bounding boxes, as follows:

    results = florence2_sam2(prompt=\"ball\", images=[image])\n
    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2","title":"Florence2SAM2","text":"

    Bases: BaseMLModel

    A class that receives a video or images plus a text prompt, and returns the instance segmentation for each frame based on the input.

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__call__","title":"__call__(prompt, images=None, video=None, *, chunk_length_frames=20, iou_threshold=0.6, nms_threshold=0.3)","text":"

    The Florence2Sam2 model finds objects in images and tracks objects in a video.

    Parameters:

    Name Type Description Default prompt str

    The text input that complements the media to find or track objects.

    required images list[Image] | None

    The images to be analyzed.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None chunk_length_frames int | None

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    20 iou_threshold float

    The IoU threshold value used to compare last_predictions and new_predictions objects.

    0.6 nms_threshold float

    The non-maximum suppression threshold value used to filter the Florence2 predictions.

    0.3

    Returns:

    Type Description list[list[dict[str, Any]]]

    list[list[dict[str, Any]]]: A list where each item represents a frame's predictions. [[{ \"id\": 0, \"mask\": rle, \"label\": \"car\", \"bbox\": [0.1, 0.2, 0.3, 0.4] }]]

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__init__","title":"__init__(model_config=Florence2SAM2Config())","text":"

    Initializes the Florence2SAM2 object with a pre-trained Florence2 model and a SAM2 model.

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.fine_tune","title":"fine_tune(checkpoint)","text":"

    Load the fine-tuned Florence-2 model.

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.load_base","title":"load_base()","text":"

    Load the base Florence-2 model.

    "},{"location":"florence2/","title":"Florence-2","text":"

    This example demonstrates using the Florence2 tool to interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.

    from PIL import Image\nfrom vision_agent_tools.shared_types import PromptTask\nfrom vision_agent_tools.models.florence2 import Florence2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Choose the task that you are planning to use\ntask_prompt = PromptTask.CAPTION\n\n# Load the image and initialize the Florence2 model\nimage = Image.open(test_image)\nmodel = Florence2()\n\n# Time to put Florence2 to work! Let's see what it finds...\nresults = model(images=[image], task=task_prompt)\n\n# Print the output result\nprint(f\"The image contains: {results[0]}\")\n
    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2","title":"Florence2","text":"

    Bases: BaseMLModel

    Florence2 model. It supports both zero-shot and fine-tuned settings. For zero-shot we use Florence-2-large; for fine-tuning we use Florence-2-base-ft. This model can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__call__","title":"__call__(task, prompt='', images=None, video=None, *, batch_size=5, nms_threshold=0.3, chunk_length_frames=None)","text":"

    Performs inference on the Florence-2 model based on the provided task, images or video, and prompt.

    Parameters:

    Name Type Description Default task PromptTask

    The task to be performed on the images or video.

    required prompt Optional[str]

    The text input that complements the prompt task.

    '' images list[Image] | None

    A list of images for the model to process. None if using video.

    None video VideoNumpy | None

    A NumPy representation of the video for inference. None if using images.

    None batch_size int

    The batch size used for processing multiple images or video frames.

    5 nms_threshold float

    The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).

    0.3 chunk_length_frames int | None

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    None

    Returns:

    Name Type Description Florence2ResponseType Florence2ResponseType

    The output of the Florence-2 model based on the task and prompt.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__init__","title":"__init__(model_config=Florence2Config())","text":"

    Initializes the Florence2 model.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.fine_tune","title":"fine_tune(checkpoint)","text":"

    Load the fine-tuned Florence-2 model.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.load_base","title":"load_base()","text":"

    Load the base Florence-2 model.
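    Florence2 exposes the same fine_tune/load_base pair shown for Florence2SAM2 above; a short sketch reusing the variables from the Florence-2 example (the checkpoint path is a placeholder):
    model.fine_tune(\"path/to/your/florence2_checkpoint\")  # switch to the fine-tuned weights\nresults = model(images=[image], task=task_prompt)\nmodel.load_base()  # restore the base Florence-2 model\n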

    "},{"location":"flux1/","title":"Flux1","text":"

    This example demonstrates using the Flux1 model to perform tasks such as image generation and mask inpainting based on text prompts.

    "},{"location":"flux1/#parameters","title":"Parameters","text":""},{"location":"flux1/#flux1config","title":"Flux1Config","text":"

    Below is an example of how to create and use a Flux1Config object:

    from vision_agent_tools.models.flux1 import Flux1Config\n\nconfig = Flux1Config(\n    height=512,\n    width=512,\n    num_inference_steps=28,\n    guidance_scale=3.5,\n    num_images_per_prompt=1,\n    max_sequence_length=512,\n    seed=42\n)\n
    "},{"location":"flux1/#perform-image-generation","title":"Perform image generation","text":"
    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# To perform image generation\nflux1 = Flux1()\n\n# config is the Flux1Config instance created in the previous example\ngenerated_image = flux1(\n    task=Flux1Task.IMAGE_GENERATION,  # Image Generation Task\n    prompt=\"A purple car in a futuristic cityscape\",\n    config=config\n)\ngenerated_image.save(\"generated_car.png\")\n
    "},{"location":"flux1/#perform-mask-inpainting","title":"Perform mask inpainting","text":"

    To perform mask inpainting, both the original image and the mask image need to be provided. These images must have the same dimensions. The mask should clearly delineate the areas that you want to modify in the original image. Additionally, the inpainting process includes a strength parameter, which controls the intensity of the modifications applied to the masked areas.

    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have a cat image named \"cat_image.jpg\" that you want to use for mask inpainting\nimage_to_edit = Image.open(\"path/to/your/cat_image.jpg\").convert(\"RGB\")  # Image to inpaint\n\n# Make sure to provide a mask image with the same dimensions, delineating the cat\nmask_image = Image.open(\"path/to/your/mask.png\")  # Mask image indicating areas to change\n\n# Set a new prompt for inpainting\ninpainting_prompt = \"A cute dog\"\n\n# To perform image mask inpainting\nflux1 = Flux1()\n\n# config is the Flux1Config instance created in the earlier example\ninpainted_image = flux1(\n    task=Flux1Task.MASK_INPAINTING,  # Image Mask Inpainting Task\n    prompt=inpainting_prompt,\n    image=image_to_edit,\n    mask_image=mask_image,\n    config=config\n)\n\ninpainted_image.save(\"inpainted_dog_over_cat.png\")\n
    "},{"location":"flux1/#perform-image-to-image-generation","title":"Perform image-to-image generation","text":"

    To perform image-to-image generation, you need to provide an original image along with a text prompt describing the desired modifications. The original image serves as the base, and the model will generate a new image based on the prompt.

    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have an original image named \"original_image.jpg\" that you want to use for image-to-image generation\noriginal_image = Image.open(\"path/to/your/original_image.jpg\").convert(\"RGB\")  # Original image\n\n# Set a new prompt for image-to-image generation\nimage_to_image_prompt = \"A sunny beach with palm trees\"\n\n# To perform image-to-image generation\nflux1 = Flux1()\n\n# config is the Flux1Config instance created in the earlier example\ngenerated_image = flux1(\n    task=Flux1Task.IMAGE_TO_IMAGE,  # Image-to-Image Generation Task\n    prompt=image_to_image_prompt,\n    image=original_image,\n    config=config\n)\n\ngenerated_image.save(\"generated_beach.png\")\n
    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1","title":"Flux1","text":"

    Bases: BaseMLModel

    Tool for image generation using the pre-trained Flux1 model. This tool takes a prompt as input and generates an image using the Flux1 model.

    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__call__","title":"__call__(prompt=Field(max_length=512), task=Flux1Task.IMAGE_GENERATION, config=Flux1Config(), image=None, mask_image=None)","text":"

    Performs image generation, mask inpainting, or image-to-image generation using the Flux1 model, depending on the selected task.

    Parameters:

    Name Type Description Default prompt str

    The text prompt describing the desired modifications.

    required task Flux1Task

    The task to perform using the model: image generation (\"generation\"), mask inpainting (\"inpainting\"), or image-to-image generation (\"img2img\").

    required config Flux1Config

    The configuration for the Flux1 model.

    required image Image.Image

    The original image to be modified.

    required mask_image Image.Image

    The mask image indicating areas to be inpainted.

    required

    Returns:

    Type Description List[Image] | None

    List[Image.Image]: The list of generated image(s) if successful; None if an error occurred.

    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__init__","title":"__init__(hf_model='black-forest-labs/FLUX.1-schnell', dtype=torch.bfloat16, enable_sequential_cpu_offload=True)","text":"

    Initializes the Flux1 image generation tool. Loads the pre-trained Flux1 model from HuggingFace and sets model configurations.

    Parameters:

    Name Type Description Default task Flux1Task

    The task to perform using the model: either image generation (\"generation\") or mask inpainting (\"inpainting\").

    required model_config

    The configuration for the model, hf_model, and device.

    required dtype torch.dtype

    The data type to use for the model.

    required enable_sequential_cpu_offload bool

    Whether to enable sequential CPU offload.

    required"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1Config","title":"Flux1Config","text":"

    Bases: BaseModel

    Configuration for the Flux1 model.
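
    Putting the constructor and the configuration together, here is a minimal sketch; the constructor arguments simply repeat the defaults shown in the __init__ signature above, and the config values mirror the earlier Flux1Config example.

    import torch\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Config, Flux1Task\n\n# Explicitly passing the documented defaults\nflux1 = Flux1(\n    hf_model=\"black-forest-labs/FLUX.1-schnell\",\n    dtype=torch.bfloat16,\n    enable_sequential_cpu_offload=True,\n)\n\nconfig = Flux1Config(height=512, width=512, num_inference_steps=28)\nresult = flux1(task=Flux1Task.IMAGE_GENERATION, prompt=\"A purple car\", config=config)\n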

    "},{"location":"internlm_xcomposer2/","title":"InternLM-XComposer-2.5","text":"

    This example demonstrates how to use the InternLM-XComposer-2.5 tool to answer questions about images or videos.

    NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.

    import cv2\nimport numpy as np\n\nfrom vision_agent_tools.models.internlm_xcomposer2 import InternLMXComposer2\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Stack the frames into a single numpy array (the model expects a VideoNumpy)\nframes = np.stack(frames, axis=0)\n\n# Initialize the InternLMXComposer2 model\nrun_inference = InternLMXComposer2()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put InternLMXComposer2 to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2","title":"InternLMXComposer2","text":"

    Bases: BaseMLModel

    InternLM-XComposer-2.5 is a tool that excels in various text-image comprehension and composition applications, achieving GPT-4V level capabilities.

    NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.

    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__call__","title":"__call__(prompt, image=None, video=None, frames=MAX_NUMBER_OF_FRAMES, chunk_length=None)","text":"

    InternLMXComposer2 model answers questions about a video or image.

    Parameters:

    Name Type Description Default prompt str

    The prompt with the question to be answered.

    required image Image | None

    The image to be analyzed.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None frames int

    The number of frames to be used from the video.

    MAX_NUMBER_OF_FRAMES chunk_length int

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    None

    Returns:

    Type Description list[str]

    list[str]: The answers to the prompt.

    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__init__","title":"__init__()","text":"

    Initializes the InternLMXComposer2.5 model.

    "},{"location":"nsfw_classification/","title":"NSFW (Not Safe for Work) classification","text":"

    This example demonstrates using the Not Safe for Work classification tool.

    from PIL import Image\n\nfrom vision_agent_tools.models.nsfw_classification import NSFWClassification\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the NSFW model.\nnsfw_classification = NSFWClassification()\n\n# Run the inference\nresults = nsfw_classification(image)\n\n# Let's print the predicted label\nprint(results.label)\n
    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification","title":"NSFWClassification","text":"

    Bases: BaseMLModel

    The primary intended use of this model is for the classification of NSFW (Not Safe for Work) images.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__call__","title":"__call__(image)","text":"

    Performs the NSFW inference on an image using the NSFWClassification model.

    Parameters:

    Name Type Description Default image Image

    The input image for object detection.

    required

    Returns:

    Name Type Description NSFWInferenceData NSFWInferenceData

    The inference result from the NSFWClassification model. label (str): The label for the unsafe content detected in the image. score (float): The score for the unsafe content detected in the image.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__init__","title":"__init__()","text":"

    Initializes the NSFW (Not Safe for Work) classification tool.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWInferenceData","title":"NSFWInferenceData","text":"

    Bases: BaseModel

    Represents an inference result from the NSFWClassification model.

    Attributes:

    Name Type Description label str

    The predicted label for the image.

    score float

    The confidence score associated with the prediction (between 0 and 1).
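
    For completeness, a short sketch of reading both attributes from a result, mirroring the example above.

    from PIL import Image\nfrom vision_agent_tools.models.nsfw_classification import NSFWClassification\n\nimage = Image.open(\"path/to/your/image.jpg\")\nresult = NSFWClassification()(image)\n\n# label is the predicted class, score its confidence between 0 and 1\nprint(f\"label={result.label}, score={result.score:.2f}\")\n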

    "},{"location":"nshot_counting/","title":"LOCA (Low-shot Object Counting network with iterative prototype Adaptation).","text":"

    This example demonstrates how to use the NShot LOCA tool for object counting in images.

    from PIL import Image\n\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the counting model and choose the image output size you expect.\nobject_counting = NShotCounting(zero_shot=False, img_size=512)\n\n# Run the inference\nresults = object_counting(image, bbox=[12, 34, 56, 78])\n\n# Let's find out how many objects were found in total\nprint(f\"Found a total count of {results.count} objects in the image!\")\n
    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.CountingDetection","title":"CountingDetection","text":"

    Bases: BaseModel

    Represents an inference result from the LOCA model.

    Attributes:

    Name Type Description count int

    The predicted number of detected objects.

    masks list[Any]

    A list of numpy arrays representing the masks of the detected objects in the image.
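
    A short sketch of consuming a CountingDetection result in the zero-shot setting (no bbox query); the image path is a placeholder.

    from PIL import Image\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\nimage = Image.open(\"path/to/your/image.jpg\")\ncounter = NShotCounting(zero_shot=True, img_size=512)\nresults = counter(image)\n\nprint(f\"count={results.count}\")\nprint(f\"number of masks={len(results.masks)}\")\n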

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting","title":"NShotCounting","text":"

    Bases: BaseMLModel

    Model for object counting using the zero-shot and n-shot versions of the LOCA model from the paper A Low-Shot Object Counting Network With Iterative Prototype Adaptation.

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__call__","title":"__call__(image, bbox=None)","text":"

    LOCA injects shape and appearance information into object queries to precisely count objects of various sizes in densely and sparsely populated scenarios. It also extends to a zero-shot scenario, achieving excellent localization and count errors across the entire low-shot spectrum.

    Parameters:

    Name Type Description Default image Image

    The input image for object detection.

    required bbox BoundingBox

    A list of four ints representing the bounding box coordinates (xmin, ymin, xmax, ymax) of the detected query in the image.

    None

    Returns:

    Name Type Description CountingDetection CountingDetection

    An object type containing: - The count of the objects found similar to the bbox query. - A list of numpy arrays representing the masks of the objects found.

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__init__","title":"__init__(zero_shot=True, img_size=512)","text":"

    Initializes the LOCA model.

    Parameters:

    Name Type Description Default img_size int

    Size of the input image.

    512"},{"location":"owlv2/","title":"OWLv2 Open-World Localization","text":"

    This example demonstrates using the Owlv2 tool for object detection in images based on text prompts.

    from PIL import Image\n\nfrom vision_agent_tools.models.owlv2 import Owlv2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# What are you looking for? Write your detective prompts here!\nprompts = [\"a photo of a cat\", \"a photo of a dog\"]\n\n# Load the image and create your Owlv2 detective tool\nimage = Image.open(test_image)\nowlv2 = Owlv2()\n\n# Time to put Owlv2 to work! Let's see what it finds...\nresults = owlv2(prompts=prompts, images=[image])[0]\n\n# Did Owlv2 sniff out any objects? Let's see the results!\nif results:\n    for detection in results:\n        print(f\"Found it! It looks like a {detection['label']} with a confidence of {detection['score']:.2f}.\")\n        print(f\"Here's where it's hiding: {detection['bbox']}\")\nelse:\n    print(\"Hmm, Owlv2 couldn't find anything this time. Maybe try a different prompt?\")\n
    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2","title":"Owlv2","text":"

    Bases: BaseMLModel

    Tool for object detection using the pre-trained Owlv2 model from Transformers.

    This tool takes images and a prompt as input, performs object detection using the Owlv2 model, and returns a list of objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding a threshold.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__call__","title":"__call__(prompts, images=None, video=None, *, batch_size=1, nms_threshold=0.3, confidence=0.1)","text":"

    Performs object detection on images using the Owlv2 model.

    Parameters:

    Name Type Description Default prompts list[str]

    The prompts to be used for object detection.

    required images list[Image] | None

    The images to be analyzed.

    None video VideoNumpy[uint8] | None

    A numpy array containing the different images, representing the video.

    None batch_size int

    The batch size used for processing multiple images or video frames.

    1 nms_threshold float

    The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).

    0.3 confidence float

    Confidence threshold for model predictions.

    0.1

    Returns:

    Type Description list[ODWithScoreResponse]

    list[ODWithScoreResponse]: A list of ODWithScoreResponse objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding the threshold. The item will be None if no objects are detected above the confidence threshold for a specific image/frame.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__init__","title":"__init__(model_config=OWLV2Config())","text":"

    Loads the pre-trained Owlv2 processor and model from Transformers.
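
    A minimal initialization sketch; passing an explicit OWLV2Config mirrors the Shared Model Manager example later in these docs, and the OWLV2Config and Device import paths are assumptions.

    from vision_agent_tools.models.owlv2 import Owlv2, OWLV2Config  # OWLV2Config path is an assumption\nfrom vision_agent_tools.shared_types import Device  # Device path is an assumption\n\n# Default configuration\nowlv2 = Owlv2()\n\n# Or pin the model to the CPU explicitly\nowlv2_cpu = Owlv2(model_config=OWLV2Config(device=Device.CPU))\n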

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS","title":"Owlv2ProcessorWithNMS","text":"

    Bases: Owlv2Processor

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS.post_process_object_detection_with_nms","title":"post_process_object_detection_with_nms(outputs, *, threshold=0.1, nms_threshold=0.3, target_sizes=None)","text":"

    Converts the raw output of [OwlViTForObjectDetection] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.

    Parameters:

    Name Type Description Default outputs OwlViTObjectDetectionOutput

    Raw outputs of the model.

    required threshold float

    Score threshold to keep object detection predictions.

    0.1 nms_threshold float

    IoU threshold to filter overlapping objects from the raw detections.

    0.3 target_sizes TensorType | list[Tuple] | None

    Tensor of shape (batch_size, 2) or list of tuples (Tuple[int, int]) containing the target size (height, width) of each image in the batch. If unset, predictions will not be resized.

    None

    Returns: list[dict]: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model.
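
    Given the return shape described above, a downstream consumer might look like the following sketch; the results value is a hypothetical stand-in for the method's output.

    # Hypothetical output: one image with a single detection\nresults = [{\"scores\": [0.91], \"labels\": [0], \"boxes\": [[10.0, 20.0, 110.0, 220.0]]}]\n\nfor image_result in results:\n    for score, label, box in zip(\n        image_result[\"scores\"], image_result[\"labels\"], image_result[\"boxes\"]\n    ):\n        print(f\"label={label} score={score:.2f} box={box}\")\n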

    "},{"location":"qr_reader/","title":"QR Reader","text":"

    Tool for detecting QR codes in images.

    from PIL import Image, ImageDraw\n\nfrom vision_agent_tools.models.qr_reader import QRReader\n\n# Open the image containing the QR code\nimage = Image.open(\"sample_qr_image.jpeg\")\n\n# Create a QR code reader object\nqr_reader = QRReader()\n\n# Detect QR codes in the image\ndetections = qr_reader(image)\n\n\nif detections:\n\n    detection = detections[0]\n    draw = ImageDraw.Draw(image)\n\n    # Print the detected text\n    print(f\"Decoded Text: {detection.text}\")\n\n    # Draw the bounding box\n    x_min, y_min, x_max, y_max = (\n        int(detection.bbox[0]),\n        int(detection.bbox[1]),\n        int(detection.bbox[2]),\n        int(detection.bbox[3]),\n    )\n    draw.rectangle(((x_min, y_min), (x_max, y_max)), outline=\"red\", width=2)\n\n    # Draw the text on top of the image\n    draw.text((x_min + 10, y_min - 10), detection.text, fill=\"blue\", anchor=\"mm\")\n    image.show()\nelse:\n    print(\"No QR codes detected in the image.\")\n
    Displaying the Detection Result"},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRCodeDetection","title":"QRCodeDetection","text":"

    Bases: BaseModel

    Represents a detected QR code.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader","title":"QRReader","text":"

    Bases: BaseMLModel

    This tool utilizes the qreader library to detect QR codes within an input image. It returns a list of QRCodeDetection objects for each detected QR code, containing the decoded text, confidence score, polygon coordinates, bounding box, and center point.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader.__call__","title":"__call__(image)","text":"

    Detects QR codes in an image.

    Parameters:

    Name Type Description Default image Image

    The input image for QR code detection.

    required

    Returns:

    Type Description list[QRCodeDetection]

    list[QRCodeDetection]: A list of QRCodeDetection objects containing information about each detected QR code, or an empty list if none are found.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader.__init__","title":"__init__()","text":"

    Initializes the QR code reader tool.

    Loads the QReader instance for QR code detection.

    "},{"location":"qwen2_vl/","title":"Qwen2-VL","text":"

    This example demonstrates how to use the Qwen2-VL model to answer questions about images or videos.

    NOTE: The Qwen2-VL model should be used in GPU environments.

    import cv2\nimport numpy as np\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n    frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the Qwen2VL model\nrun_inference = Qwen2VL()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put Qwen2VL to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL","title":"Qwen2VL","text":"

    Bases: BaseMLModel

    Qwen2-VL is a model that is capable of accurately identifying and comprehending the content within images, regardless of their clarity, resolution, or extreme aspect ratios.

    NOTE: The Qwen2-VL model should be used in GPU environments.

    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__call__","title":"__call__(prompt=None, images=None, video=None, frames=MAX_NUMBER_OF_FRAMES)","text":"

    Qwen2-VL model answers questions about a video or image.

    Parameters:

    Name Type Description Default prompt str

    The prompt with the question to be answered.

    None images list[Image]

    A list of images for the model to process. None if using video.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None frames int

    The number of frames to be used from the video.

    MAX_NUMBER_OF_FRAMES

    Returns:

    Type Description list[str]

    list[str]: The answers to the prompt.
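
    Besides video, the same call accepts a list of images; here is a minimal sketch with a placeholder image path.

    from PIL import Image\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\nimage = Image.open(\"path/to/your/image.jpg\")\n\nrun_inference = Qwen2VL()\nanswer = run_inference(images=[image], prompt=\"Describe this image in detail\")\nprint(answer)\n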

    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__init__","title":"__init__(model_config=None)","text":"

    Initializes the Qwen2-VL model.

    "},{"location":"shared_model_manager/","title":"Shared Model Manager","text":"

    The SharedModelManager class is designed to manage and facilitate the use of machine learning models across different devices, such as CPUs and GPUs, within an asynchronous environment. It ensures safe and efficient execution of these models, particularly in scenarios where GPU resources need to be shared exclusively among multiple models. The manager coordinates access to the shared GPU, preventing conflicts when multiple models require it. Models are only loaded into memory when needed using the fetch_model function.

    The usage example demonstrates adding models and then using them with their respective functionalities.

    \u26a0\ufe0f \u2755: We should ALWAYS add model instances to the pool on CPU. This avoids overwhelming GPU memory; the model pool will automatically move a model to the GPU when it is fetched.

    from PIL import Image\n\nfrom vision_agent_tools.models.owlv2 import Owlv2, OWLV2Config\nfrom vision_agent_tools.models.qr_reader import QRReader\nfrom vision_agent_tools.shared_types import Device  # adjust if Device lives elsewhere in your installation\nfrom vision_agent_tools.tools.shared_model_manager import SharedModelManager\n\nmodel_pool = SharedModelManager()\n\n# Add model instances to the pool\nmodel_pool.add(QRReader())\nmodel_pool.add(Owlv2(model_config=OWLV2Config(device=Device.CPU)))\n\n# Use QRReader model\nasync def use_qr_reader():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    qr_reader = await model_pool.fetch_model(QRReader.__name__)\n    detections = qr_reader(image)\n    # Process detections ...\n\n# Use Owlv2 model\nasync def use_owlv2():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    owlv2 = await model_pool.fetch_model(Owlv2.__name__)\n    prompts = [\"a photo of a cat\", \"a photo of a dog\"]\n    results = owlv2(prompts=prompts, images=[image])\n    # Process results ...\n
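
    The coroutines above still need an event loop to run; one way to drive them, assuming the functions are defined exactly as in the example, is shown below.

    import asyncio\n\nasync def main():\n    await use_qr_reader()\n    await use_owlv2()\n\nasyncio.run(main())\n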
    "},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager","title":"SharedModelManager","text":""},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.add","title":"add(model)","text":"

    Adds a model to the pool with a device preference.

    Parameters:

    Name Type Description Default model BaseTool

    The model instance to be added to the pool; it should implement the BaseTool interface.

    required device Device

    The preferred device for the model.

    required

    Returns:

    Name Type Description str str

    The model ID to be used for fetching the model.
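
    Since add returns the model ID, that ID can be passed straight to fetch_model instead of hard-coding the class name; a small sketch under that assumption.

    import asyncio\n\nfrom vision_agent_tools.models.qr_reader import QRReader\nfrom vision_agent_tools.tools.shared_model_manager import SharedModelManager\n\nmodel_pool = SharedModelManager()\nmodel_id = model_pool.add(QRReader())\n\nasync def run():\n    qr_reader = await model_pool.fetch_model(model_id)\n    # ...use qr_reader as in the example above...\n\nasyncio.run(run())\n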

    "},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.fetch_model","title":"fetch_model(model_id)","text":"

    Retrieves a model from the pool for safe execution.

    Parameters:

    Name Type Description Default model_id str

    Id to access the model in the pool.

    required

    Returns:

    Name Type Description Any BaseTool

    The retrieved model instance.

    "}]} \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 23098322..e74d7be6 100644 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ