diff --git a/florence2-sam2/index.html b/florence2-sam2/index.html index db7b1090..132eb3c8 100644 --- a/florence2-sam2/index.html +++ b/florence2-sam2/index.html @@ -411,6 +411,24 @@ + + +
fine_tune(checkpoint)
+
+#Load the fine-tuned Florence-2 model.
+ +load_base()
+
+#Load the base Florence-2 model.
+ +This repository contains tools that solve vision problems. These tools can be used in conjunction with the vision-agent.
"},{"location":"clip_media_sim/","title":"CLIPMediaSim","text":""},{"location":"clip_media_sim/#video-similarity","title":"Video similarity","text":"import cv2\nfrom PIL import Image\n\nfrom vision_agent_tools.models.clip_media_sim import CLIPMediaSim\n\n# Path to your target image\nimage_path = \"path/to/your/image.jpg\"\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the image\ntarget_image = Image.open(image_path)\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nfps = cap.get(cv2.CAP_PROP_FPS)\nframes = []\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n frames.append(frame)\ncap.release()\n\n# Calculate video timestamps\nvideo_time = len(frames) / fps\n\n# Create the CLIPMediaSim instance\nclip_media_sim = CLIPMediaSim()\n\n# Run video similarity against the target image\nresults = clip_media_sim(video=frames, target_image=target_image)\n\n# The results should be a list of [index_of_frame, confidence_score] where the\n# video is similar to the target image.\n\n# To find the time at which a given frame happens, you can do the following\n\ntime_per_frame = video_time / len(frames)\n\ntimestamp = results[0][0] * time_per_frame\n\nprint(\"Similarity detection complete!\")\n
You can also run similarity against a target text doing the following:
results = clip_media_sim(video=frames, target_text=\"a turtle holding the earth\")\n
"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim","title":"CLIPMediaSim
","text":" Bases: BaseMLModel
A class that receives a video and a target image or text and returns the frames that are most similar to the target.
"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__call__","title":"__call__(video, target_image=None, target_text=None, thresh=0.3)
","text":"Receives a video and a target image or text and returns the frames that are most similar to the target.
Parameters:
Name Type Description Defaultvideo
ndarray
The input video to be processed.
requiredtarget_image
Image | None
The target image to compare the video frames with.
None
target_text
str | None
The target text to compare the video frames with.
None
thresh
float
The threshold to filter the results. Defaults to 0.3.
0.3
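As a hedged illustration of the thresh parameter above, the sketch below reuses the clip_media_sim, frames and target_image objects from the video-similarity example (the 0.5 value is only an illustration):
# Keep only frames whose similarity to the target exceeds 0.5 instead of the 0.3 default.
strict_results = clip_media_sim(video=frames, target_image=target_image, thresh=0.5)

# Each item is [frame_index, confidence_score]; sort by confidence to inspect the best match first.
best_first = sorted(strict_results, key=lambda item: item[1], reverse=True)
if best_first:
    frame_index, score = best_first[0]
    print(f"Best match: frame {int(frame_index)} with score {score:.3f}")
else:
    print("No frame passed the 0.5 threshold.")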
"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__init__","title":"__init__(device='cuda')
","text":"Initializes the CLIPMediaSim object with a pre-trained CLIP model.
"},{"location":"controlnet_aux/","title":"Controlnet-Aux","text":""},{"location":"controlnet_aux/#pose-detector","title":"Pose Detector","text":"from PIL import Image\nfrom vision_agent_tools.models.controlnet_aux import Image2Pose\n\n# Path to your test image\ntest_image_path = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image_path)\n# Create the Image2Pose instance\nimage_2_pose = Image2Pose()\n\n# Run pose detection and get the results\nresults = image_2_pose(image)\n\n# Optional: Save the result image (assuming results is a PIL Image)\n# results.save(\"result.png\")\n\nprint(\"Pose detection complete!\")\n
Pose Detection Result"},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose","title":"Image2Pose
","text":"A class that simplifies human pose detection using a pre-trained Openpose model.
This class provides a convenient way to run pose detection on images using a pre-trained Openpose model from the controlnet_aux
library. It takes a PIL Image object as input and returns the predicted pose information.
__call__(image)
","text":"Performs pose detection on a PIL image and returns the results.
This method takes a PIL Image object as input and runs the loaded Openpose detector on it. The predicted pose information is then resized to match the original image size and returned.
Parameters:
Name Type Description Defaultimage
Image
The input image for pose detection.
requiredReturns:
Type DescriptionImage
PIL.Image: The image with the predicted pose information (format might vary depending on the specific OpenposeDetector implementation).
"},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__init__","title":"__init__()
","text":"Initializes the Image2Pose object with a pre-trained Openpose detector.
This method loads a pre-trained Openpose model from the specified model hub (\"lllyasviel/Annotators\" in this case). The loaded detector is stored as an attribute for future use.
"},{"location":"depth_anything_v2/","title":"Depth-Anything-V2","text":"This example demonstrates using the Depth-Anything-V2 tool for depth estimation on images.
from PIL import Image\nfrom vision_agent_tools.models.depth_anything_v2 import DepthAnythingV2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the depth map estimation model.\ndepth_estimate = DepthAnythingV2()\n\n# Run the inference\nresults = depth_estimate(image)\n\n# Let's print the obtained depth map\nprint(results.map)\n
"},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2","title":"DepthAnythingV2
","text":" Bases: BaseMLModel
Model for depth estimation using the Depth-Anything-V2 model from the paper Depth Anything V2.
"},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__call__","title":"__call__(image, grayscale=False)
","text":"Depth-Anything-V2 is a highly practical solution for robust monocular depth estimation.
Parameters:
Name Type Description Defaultimage
Union[str, Image, ndarray]
The input image for depth estimation. Can be a file path, a PIL Image, or a NumPy array.
requiredgrayscale
bool
Whether to return the depth map as a grayscale image. If True, the depth map will be normalized to the range [0, 255] and converted to uint8. Defaults to False.
False
Returns:
Name Type DescriptionDepthMap
DepthMap
An object type containing a numpy array with the HxW depth map of the image.
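A small sketch of the grayscale option documented above; the uint8 normalization comes from the parameter description, while the output filename is only an illustration:
from PIL import Image
from vision_agent_tools.models.depth_anything_v2 import DepthAnythingV2

depth_estimate = DepthAnythingV2()
image = Image.open("path/to/your/image.jpg")

# With grayscale=True the HxW map is normalized to [0, 255] and returned as uint8,
# so it can be saved directly as an 8-bit image.
results = depth_estimate(image, grayscale=True)
Image.fromarray(results.map).save("depth_map.png")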
"},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__init__","title":"__init__()
","text":"Initializes the Depth-Anything-V2 model.
"},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthMap","title":"DepthMap
","text":" Bases: BaseModel
Represents the depth map of an image.
Attributes:
Name Type Descriptionmap
Any
HxW raw depth map of the image.
"},{"location":"florence2-qa/","title":"FlorenceQA","text":"This example demonstrates using the Florence2-QA tool to to answer questions about images.
NOTE: The FlorenceQA model can only be used in GPU environments.
from PIL import Image\nfrom vision_agent_tools.models.florence2_qa import FlorenceQA\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image and initialize the FlorenceQA model\nimage = Image.open(test_image)\nrun_florence_qa = FlorenceQA()\n\n# Time to put FlorenceQA to work! Let's pose a question about the image\nanswer = run_florence_qa(image, question=\"Is there a dog in the image?\")\n\n# Print the output answer\nprint(answer)\n
"},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA","title":"FlorenceQA
","text":" Bases: BaseMLModel
FlorenceQA is a tool that combines the Florence-2 and Roberta QA models to answer questions about images.
NOTE: The Florence-2 model can only be used in GPU environments.
"},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__call__","title":"__call__(image, question)
","text":"FlorenceQA model answers questions about images.
Parameters:
Name Type Description Defaultimage
Image
The image to be analyzed.
requiredquestion
str
The question to be answered.
requiredReturns:
Name Type Descriptionstr
dict[str, Any]
The answer to the question.
"},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__init__","title":"__init__()
","text":"Initializes the FlorenceQA model.
"},{"location":"florence2-sam2/","title":"Florence2Sam2","text":"This tool uses Florence2 and the SAM-2 model to do text to instance segmentation on image or video inputs.
import cv2\n\nfrom vision_agent_tools.models.florence2_sam2 import Florence2SAM2\n\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n frames.append(frame)\ncap.release()\n\n# Create the Florence2SAM2 instance\nflorence2_sam2 = Florence2SAM2()\n\n# segment all the instances of the prompt \"ball\" for all video frames\nresults = florence2_sam2(prompt=\"ball\", video=frames)\n\n# Returns a list of list where the first list represents the frames and the inner\n# list contains all the predictions per frame. The annotation ID can be used\n# to track the same object across different frames. For example:\n[\n [\n {\n \"id\": 0\n \"mask\": rle\n \"label\": \"ball\"\n \"bbox\": [x_min, y_min, x_max, y_max]\n }\n ],\n [\n {\n \"id\": 0\n \"mask\": rle\n \"label\": \"ball\"\n \"bbox\": [x_min, y_min, x_max, y_max]\n }\n ]\n]\n\nprint(\"Instance segmentation complete!\")\n
You can also run instance segmentation on images, additionally getting bounding boxes, by doing the following:
results = florence2_sam2(prompt=\"ball\", images=[image])\n
"},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2","title":"Florence2SAM2
","text":" Bases: BaseMLModel
A class that receives a video or images, a text prompt and returns the instance segmentation based on the input for each frame.
"},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__call__","title":"__call__(prompt, images=None, video=None, *, chunk_length_frames=20, iou_threshold=0.6, nms_threshold=0.3)
","text":"Florence2Sam2 model find objects in images and track objects in a video.
Parameters:
Name Type Description Defaultprompt
str
The text input that complements the media to find or track objects.
requiredimages
list[Image] | None
The images to be analyzed.
None
video
VideoNumpy | None
A numpy array containing the different images, representing the video.
None
chunk_length_frames
int | None
The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.
20
iou_threshold
float
The IoU threshold value used to compare last_predictions and new_predictions objects.
0.6
nms_threshold
float
The non-maximum suppression threshold value used to filter the Florence2 predictions.
0.3
Returns:
Type Descriptionlist[list[dict[str, Any]]]
list[list[dict[str, Any]]]: A list where each item represents one frame's predictions. [[{ \"id\": 0, \"mask\": rle, \"label\": \"car\", \"bbox\": [0.1, 0.2, 0.3, 0.4] }]]
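A short sketch of consuming the return structure documented above, assuming results comes from the video call shown earlier on this page:
# results[frame_idx] holds the list of predictions for that frame.
for frame_idx, predictions in enumerate(results):
    for pred in predictions:
        x_min, y_min, x_max, y_max = pred["bbox"]
        # The same "id" refers to the same tracked object across frames.
        print(f"frame {frame_idx}: object {pred['id']} ({pred['label']}) at "
              f"[{x_min}, {y_min}, {x_max}, {y_max}]")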
"},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__init__","title":"__init__(model_config=Florence2SAM2Config())
","text":"Initializes the Florence2SAM2 object with a pre-trained Florence2 model and a SAM2 model.
"},{"location":"florence2/","title":"Florence-2","text":"This example demonstrates using the Florence2 tool to interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.
from PIL import Image\nfrom vision_agent_tools.shared_types import PromptTask\nfrom vision_agent_tools.models.florence2 import Florence2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Choose the task that you are planning to use\ntask_prompt = PromptTask.CAPTION\n\n# Load the image and initialize the Florence2 model\nimage = Image.open(test_image)\nmodel = Florence2()\n\n# Time to put Florence2 to work! Let's see what it finds...\nresults = model(images=[image], task=task_prompt)\n\n# Print the output result\nprint(f\"The image contains: {results[0]}\")\n
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2","title":"Florence2
","text":" Bases: BaseMLModel
Florence2 model. It supports both zero-shot and fine-tuned settings. For the zero-shot setting we use Florence-2-large; for fine-tuning we use Florence-2-base-ft. This model can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__call__","title":"__call__(task, prompt='', images=None, video=None, *, batch_size=5, nms_threshold=0.3, chunk_length_frames=None)
","text":"Performs inference on the Florence-2 model based on the provided task, images or video, and prompt.
Parameters:
Name Type Description Defaulttask
PromptTask
The task to be performed on the images or video.
requiredprompt
Optional[str]
The text input that complements the prompt task.
''
images
list[Image] | None
A list of images for the model to process. None if using video.
None
video
VideoNumpy | None
A NumPy representation of the video for inference. None if using images.
None
batch_size
int
The batch size used for processing multiple images or video frames.
5
nms_threshold
float
The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).
0.3
chunk_length_frames
int | None
The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.
None
Returns:
Name Type DescriptionFlorence2ResponseType
Florence2ResponseType
The output of the Florence-2 model based on the task and prompt.
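A hedged sketch of video inference with the parameters documented above; PromptTask.CAPTION comes from the earlier example and the frame-loading code mirrors the other examples in these docs:
import cv2
import numpy as np
from vision_agent_tools.shared_types import PromptTask
from vision_agent_tools.models.florence2 import Florence2

# Load the video into a single numpy array (VideoNumpy)
cap = cv2.VideoCapture("path/to/your/video.mp4")
frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frames.append(frame)
cap.release()
video = np.stack(frames, axis=0)

model = Florence2()
# Caption the clip, processing 5 frames per batch and 20 frames per chunk.
results = model(task=PromptTask.CAPTION, video=video, batch_size=5, chunk_length_frames=20)
print(results)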
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__init__","title":"__init__(model_config=Florence2Config())
","text":"Initializes the Florence2 model.
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.fine_tune","title":"fine_tune(checkpoint)
","text":"Load the fine-tuned Florence-2 model.
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.load_base","title":"load_base()
","text":"Load the base Florence-2 model.
"},{"location":"flux1/","title":"Flux1","text":"This example demonstrates using the Flux1 model to perform tasks such as image generation and mask inpainting based on text prompts.
"},{"location":"flux1/#parameters","title":"Parameters","text":"Flux1Config
class allows you to configure the parameters for the Flux1 model. Below is an example of how to create and use a Flux1Config
object:
from vision_agent_tools.models.flux1 import Flux1Config\n\nconfig = Flux1Config(\n height=512,\n width=512,\n num_inference_steps=28,\n guidance_scale=3.5,\n num_images_per_prompt=1,\n max_sequence_length=512,\n seed=42\n)\n
strength (float, optional, defaults to 0.6): Indicates the extent to transform the reference image
. Must be between 0 and 1. A value of 1 essentially ignores image
.import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# To perform image generation\nflux1 = Flux1()\n\ngenerated_image = flux1(\n task=Flux1Task.IMAGE_GENERATION, # Image Generation Task\n prompt=\"A purple car in a futuristic cityscape\",\n config=config\n)\ngenerated_image.save(\"generated_car.png\")\n
"},{"location":"flux1/#perform-mask-inpainting","title":"Perform mask inpainting","text":"To perform mask inpainting, both the original image and the mask image need to be provided. These images have the same dimensions. The mask should clearly delineate the areas that you want to modify in the original image. Additionally, the inpainting process includes a strength parameter, which controls the intensity of the modifications applied to the masked areas.
import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have a cat image named \"cat_image.jpg\" that you want to use for mask inpainting\nimage_to_edit = Image.open(\"path/to/your/cat_image.jpg\").convert(\"RGB\") # Image to inpaint\n\n# Make sure to provide a mask image with the same dimensions, delineating the cat\nmask_image = Image.open(\"path/to/your/mask.png\") # Mask image indicating areas to change\n\n# Set a new prompt for inpainting\ninpainting_prompt = \"A cute dog\"\n\n# To perform image mask inpainting\nflux1 = Flux1()\n\ninpainted_image = flux1(\n task=Flux1Task.MASK_INPAINTING, # Image Mask Inpainting Task\n prompt=inpainting_prompt,\n image=image_to_edit,\n mask_image=mask_image,\n config=config\n)\n\ninpainted_image.save(\"inpainted_dog_over_cat.png\")\n
"},{"location":"flux1/#perform-image-to-image-generation","title":"Perform image-to-image generation","text":"To perform image-to-image generation, you need to provide an original image along with a text prompt describing the desired modifications. The original image serves as the base, and the model will generate a new image based on the prompt.
import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have an original image named \"original_image.jpg\" that you want to use for image-to-image generation\noriginal_image = Image.open(\"path/to/your/original_image.jpg\").convert(\"RGB\") # Original image\n\n# Set a new prompt for image-to-image generation\nimage_to_image_prompt = \"A sunny beach with palm trees\"\n\n# To perform image-to-image generation\nflux1 = Flux1()\n\ngenerated_image = flux1(\n task=Flux1Task.IMAGE_TO_IMAGE, # Image-to-Image Generation Task\n prompt=image_to_image_prompt,\n image=original_image,\n config=config\n)\n\ngenerated_image.save(\"generated_beach.png\")\n
"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1","title":"Flux1
","text":" Bases: BaseMLModel
Tool for image generation using the pre-trained Flux1 model. This tool takes a prompt as input and generates an image using the Flux1 model.
"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__call__","title":"__call__(prompt=Field(max_length=512), task=Flux1Task.IMAGE_GENERATION, config=Flux1Config(), image=None, mask_image=None)
","text":"Performs object detection on an image using the Flux1 model.
Parameters:
Name Type Description Default-
prompt (str
The text prompt describing the desired modifications.
required-
task (Flux1Task
The task to perform using the model: - image generation - \"generation\", - mask inpainting - \"inpainting\", - image-to-image generation - \"img2img\".
required-
config (Flux1Config
int
, optional): The height in pixels of the generated image. This is set to 512 by default.int
, optional): The width in pixels of the generated image. This is set to 512 by default.int
, optional, defaults to 28):float
, optional, defaults to 3.5): Guidance scale as defined in Classifier-Free Diffusion Guidance. A higher guidance scale encourages the model to generate images that are closely linked to the text prompt
, usually at the expense of lower image quality.int
, optional, defaults to 1): The number of images to generate per prompt.int
defaults to 512): Maximum sequence length to use with the prompt
.int
, optional): The seed to use for the random number generator, to make generation deterministic. If not provided, a random seed is used.float
, optional, defaults to 0.6): Indicates extent to transform the reference image
. Must be between 0 and 1. A value of 1 essentially ignores image
.-
image (Image.Image
The original image to be modified.
required-
mask_image (Image.Image
The mask image indicating areas to be inpainted.
requiredReturns:
Type DescriptionList[Image] | None
List[Image.Image]: The list of generated image(s) if successful; None if an error occurred.
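Since __call__ is documented to return a list of images (or None on error), a defensive sketch for consuming the result could look like this, reusing flux1 and config from the examples above:
images = flux1(
    task=Flux1Task.IMAGE_GENERATION,
    prompt="A purple car in a futuristic cityscape",
    config=config,
)
if images:  # None means an error occurred
    for i, img in enumerate(images):
        img.save(f"generated_car_{i}.png")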
"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__init__","title":"__init__(hf_model='black-forest-labs/FLUX.1-schnell', dtype=torch.bfloat16, enable_sequential_cpu_offload=True)
","text":"Initializes the Flux1 image generation tool. Loads the pre-trained Flux1 model from HuggingFace and sets model configurations.
Parameters:
Name Type Description Default-
task (Flux1Task
The task to perform using the model: either image generation (\"generation\") or mask inpainting (\"inpainting\").
required-
model_config
The configuration for the model, hf_model, and device.
required-
dtype (torch.dtype
The data type to use for the model.
required-
enable_sequential_cpu_offload (bool
Whether to enable sequential CPU offload.
required"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1Config","title":"Flux1Config
","text":" Bases: BaseModel
Configuration for the Flux1 model.
"},{"location":"internlm_xcomposer2/","title":"InternLM-XComposer-2.5","text":"This example demonstrates how to use the InternLM-XComposer-2.5 tool to to answer questions about images or videos.
NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.
import cv2\nimport numpy as np\n\nfrom vision_agent_tools.models.internlm_xcomposer2 import InternLMXComposer2\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the InternLMXComposer2 model\nrun_inference = InternLMXComposer2()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put InternLMXComposer2 to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
"},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2","title":"InternLMXComposer2
","text":" Bases: BaseMLModel
InternLM-XComposer-2.5 is a tool that excels in various text-image comprehension and composition applications, achieving GPT-4V level capabilities.
NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.
"},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__call__","title":"__call__(prompt, image=None, video=None, frames=MAX_NUMBER_OF_FRAMES, chunk_length=None)
","text":"InternLMXComposer2 model answers questions about a video or image.
Parameters:
Name Type Description Defaultprompt
str
The prompt with the question to be answered.
requiredimage
Image | None
The image to be analyzed.
None
video
VideoNumpy | None
A numpy array containing the different images, representing the video.
None
frames
int
The number of frames to be used from the video.
MAX_NUMBER_OF_FRAMES
chunk_length
int
The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.
None
Returns:
Type Descriptionlist[str]
list[str]: The answers to the prompt.
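A minimal sketch of the single-image path using the image parameter documented above (the question and path are placeholders):
from PIL import Image
from vision_agent_tools.models.internlm_xcomposer2 import InternLMXComposer2

run_inference = InternLMXComposer2()
image = Image.open("path/to/your/image.jpg")

# Ask a question about a single image instead of a video.
answer = run_inference(prompt="What objects are visible in this image?", image=image)
print(answer)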
"},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__init__","title":"__init__()
","text":"Initializes the InternLMXComposer2.5 model.
"},{"location":"nsfw_classification/","title":"NSFW (Not Safe for Work) classification","text":"This example demonstrates using the Not Safe for Work classification tool.
from PIL import Image\nfrom vision_agent_tools.models.nsfw_classification import NSFWClassification\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the NSFW model.\nnsfw_classification = NSFWClassification()\n\n# Run the inference\nresults = nsfw_classification(image)\n\n# Let's print the predicted label\nprint(results.label)\n
"},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification","title":"NSFWClassification
","text":" Bases: BaseMLModel
The primary intended use of this model is for the classification of NSFW (Not Safe for Work) images.
"},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__call__","title":"__call__(image)
","text":"Performs the NSFW inference on an image using the NSFWClassification model.
Parameters:
Name Type Description Defaultimage
Image
The input image for object detection.
requiredReturns:
Name Type DescriptionNSFWInferenceData
NSFWInferenceData
The inference result from the NSFWClassification model. label (str): The label for the unsafe content detected in the image. score (float):The score for the unsafe content detected in the image.
"},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__init__","title":"__init__()
","text":"Initializes the NSFW (Not Safe for Work) classification tool.
"},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWInferenceData","title":"NSFWInferenceData
","text":" Bases: BaseModel
Represents an inference result from the NSFWClassification model.
Attributes:
Name Type Descriptionlabel
str
The predicted label for the image.
score
float
The confidence score associated with the prediction (between 0 and 1).
"},{"location":"nshot_counting/","title":"LOCA (Low-shot Object Counting network with iterative prototype Adaptation).","text":"This example demonstrates how to use the NShot LOCA tool for object counting in images.
from PIL import Image\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the counting model and choose the image output size you expect.\nobject_counting = NShotCounting(zero_shot=False, img_size=512)\n\n# Run the inference\nresults = object_counting(image, bbox=[12, 34, 56, 78])\n\n# Let's find out how many objects were found in total\nprint(f\"Found a total count of {results.count} objects on the image!\")\n
"},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.CountingDetection","title":"CountingDetection
","text":" Bases: BaseModel
Represents an inference result from the LOCA model.
Attributes:
Name Type Descriptioncount
int
The predicted number of detected objects.
masks
list[Any]
A list of numpy arrays representing the masks of the detected objects in the image.
"},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting","title":"NShotCounting
","text":" Bases: BaseMLModel
Model for object counting using the zeroshot and n-shot versions of the LOCA model from the paper A Low-Shot Object Counting Network With Iterative Prototype Adaptation .
"},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__call__","title":"__call__(image, bbox=None)
","text":"LOCA injects shape and appearance information into object queries to precisely count objects of various sizes in densely and sparsely populated scenarios. It also extends to a zeroshot scenario and achieves excellent localization and count errors across the entire low-shot spectrum.
Parameters:
Name Type Description Defaultimage
Image
The input image for object detection.
requiredbbox
BoundingBox
A list of four ints representing the bounding box coordinates (xmin, ymin, xmax, ymax) of the detected query in the image.
None
Returns:
Name Type DescriptionCountingDetection
CountingDetection
An object type containing: - The count of the objects found similar to the bbox query. - A list of numpy arrays representing the masks of the objects found.
"},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__init__","title":"__init__(zero_shot=True, img_size=512)
","text":"Initializes the LOCA model.
Parameters:
Name Type Description Defaultimg_size
int
Size of the input image.
512
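A small sketch of the zero-shot variant mentioned above; zero_shot=True and bbox=None are the documented defaults, so no exemplar box is passed:
from PIL import Image
from vision_agent_tools.models.nshot_counting import NShotCounting

# No bbox query is needed in the zero-shot setting.
zero_shot_counter = NShotCounting(zero_shot=True, img_size=512)
image = Image.open("path/to/your/image.jpg")

results = zero_shot_counter(image)
print(f"Found a total count of {results.count} objects on the image!")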
"},{"location":"owlv2/","title":"OWLv2 Open-World Localization","text":"This example demonstrates using the Owlv2 tool for object detection in images based on text prompts.
from PIL import Image\nfrom vision_agent_tools.models.owlv2 import Owlv2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# What are you looking for? Write your detective prompts here!\nprompts = [\"a photo of a cat\", \"a photo of a dog\"]\n\n# Load the image and create your Owlv2 detective tool\nimage = Image.open(test_image)\nowlv2 = Owlv2()\n\n# Time to put Owlv2 to work! Let's see what it finds...\nresults = owlv2(prompts=prompts, images=[image])[0]\n\n# Did Owlv2 sniff out any objects? Let's see the results!\nif results:\n for detection in results:\n print(f\"Found it! It looks like a {detection['label']} with a confidence of {detection['score']:.2f}.\")\n print(f\"Here's where it's hiding: {detection['bbox']}\")\nelse:\n print(\"Hmm, Owlv2 couldn't find anything this time. Maybe try a different prompt?\")\n
"},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2","title":"Owlv2
","text":" Bases: BaseMLModel
Tool for object detection using the pre-trained Owlv2 model from Transformers.
This tool takes images and a prompt as input, performs object detection using the Owlv2 model, and returns a list of objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding a threshold.
"},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__call__","title":"__call__(prompts, images=None, video=None, *, batch_size=1, nms_threshold=0.3, confidence=0.1)
","text":"Performs object detection on images using the Owlv2 model.
Parameters:
Name Type Description Defaultprompts
list[str]
The prompt to be used for object detection.
requiredimages
list[Image] | None
The images to be analyzed.
None
video
VideoNumpy[uint8] | None
A numpy array containing the different images, representing the video.
None
batch_size
int
The batch size used for processing multiple images or video frames.
1
nms_threshold
float
The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).
0.3
confidence
float
Confidence threshold for model predictions.
0.1
Returns:
Type Descriptionlist[ODWithScoreResponse]
list[ODWithScoreResponse]: A list of ODWithScoreResponse
objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding the threshold. The item will be None if no objects are detected above the confidence threshold for a specific image/frame.
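A hedged sketch of the video path with the parameters documented above; the frame loading mirrors the other examples in these docs and the prompt list is illustrative:
import cv2
import numpy as np
from vision_agent_tools.models.owlv2 import Owlv2

cap = cv2.VideoCapture("path/to/your/video.mp4")
frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frames.append(frame)
cap.release()
video = np.stack(frames, axis=0).astype(np.uint8)  # VideoNumpy[uint8]

owlv2 = Owlv2()
results = owlv2(prompts=["a ball"], video=video, batch_size=1, nms_threshold=0.3, confidence=0.2)

# One entry per frame; an entry is None when nothing clears the confidence threshold.
for frame_idx, detections in enumerate(results):
    if not detections:
        continue
    for detection in detections:
        print(f"frame {frame_idx}: {detection['label']} ({detection['score']:.2f}) at {detection['bbox']}")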
__init__(model_config=OWLV2Config())
","text":"Loads the pre-trained Owlv2 processor and model from Transformers.
"},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS","title":"Owlv2ProcessorWithNMS
","text":" Bases: Owlv2Processor
post_process_object_detection_with_nms(outputs, *, threshold=0.1, nms_threshold=0.3, target_sizes=None)
","text":"Converts the raw output of [OwlViTForObjectDetection
] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
Parameters:
Name Type Description Defaultoutputs
OwlViTObjectDetectionOutput
Raw outputs of the model.
requiredthreshold
float
Score threshold to keep object detection predictions.
0.1
nms_threshold
float
IoU threshold to filter overlapping objects from the raw detections.
0.3
target_sizes
TensorType | list[Tuple] | None
Tensor of shape (batch_size, 2)
or list of tuples (Tuple[int, int]
) containing the target size (height, width)
of each image in the batch. If unset, predictions will not be resized.
None
Returns: list[dict]
: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model.
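A heavily hedged sketch of wiring this post-processing method up, assuming Owlv2ProcessorWithNMS inherits from_pretrained from the standard transformers Owlv2Processor and is paired with the usual Owlv2ForObjectDetection checkpoint (neither pairing is stated in this documentation):
import torch
from PIL import Image
from transformers import Owlv2ForObjectDetection
from vision_agent_tools.models.owlv2 import Owlv2ProcessorWithNMS

checkpoint = "google/owlv2-base-patch16-ensemble"  # assumed checkpoint
processor = Owlv2ProcessorWithNMS.from_pretrained(checkpoint)
model = Owlv2ForObjectDetection.from_pretrained(checkpoint)

image = Image.open("path/to/your/image.jpg")
inputs = processor(text=[["a photo of a cat"]], images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# target_sizes is (height, width) per image, as described above.
results = processor.post_process_object_detection_with_nms(
    outputs, threshold=0.1, nms_threshold=0.3,
    target_sizes=torch.tensor([image.size[::-1]]),
)
print(results[0]["scores"], results[0]["labels"], results[0]["boxes"])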
Tool for detecting QR codes in images.
from PIL import Image, ImageDraw\n\nfrom vision_agent_tools.models.qr_reader import QRReader\n\n# Open the image containing the QR code\nimage = Image.open(\"sample_qr_image.jpeg\")\n\n# Create a QR code reader object\nqr_reader = QRReader()\n\n# Detect QR codes in the image\ndetections = qr_reader(image)\n\n\nif detections:\n\n detection = detections[0]\n draw = ImageDraw.Draw(image)\n\n # Print the detected text\n print(f\"Decoded Text: {detection.text}\")\n\n # Draw the bounding box\n x_min, y_min, x_max, y_max = (\n int(detection.bbox[0]),\n int(detection.bbox[1]),\n int(detection.bbox[2]),\n int(detection.bbox[3]),\n )\n draw.rectangle(((x_min, y_min), (x_max, y_max)), outline=\"red\", width=2)\n\n # Draw the text on top of the image\n draw.text((x_min + 10, y_min - 10), detection.text, fill=\"blue\", anchor=\"mm\")\n image.show()\nelse:\n print(\"No QR codes detected in the image.\")\n
Displaying the Detection Result"},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRCodeDetection","title":"QRCodeDetection
","text":" Bases: BaseModel
Represents a detected QR code.
"},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader","title":"QRReader
","text":" Bases: BaseMLModel
This tool utilizes the qreader
library to detect QR codes within an input image. It returns a list of QRCodeDetection
objects for each detected QR code, containing the decoded text, confidence score, polygon coordinates, bounding box, and center point.
__call__(image)
","text":"Detects QR codes in an image.
Parameters:
Name Type Description Defaultimage
Image
The input image for QR code detection.
requiredReturns:
Type Descriptionlist[QRCodeDetection]
list[QRCodeDetection]: A list of QRCodeDetection
objects containing information about each detected QR code, or an empty list if none are found.
__init__()
","text":"Initializes the QR code reader tool.
Loads the QReader
instance for QR code detection.
This example demonstrates how to use the Qwen2-VL model to answer questions about images or videos.
NOTE: The Qwen2-VL model should be used in GPU environments.
import cv2\nimport numpy as np\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the Qwen2VL model\nrun_inference = Qwen2VL()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put Qwen2VL to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
"},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL","title":"Qwen2VL
","text":" Bases: BaseMLModel
Qwen2-VL is a model that is capable of accurately identifying and comprehending the content within images, regardless of their clarity, resolution, or extreme aspect ratios.
NOTE: The Qwen2-VL model should be used in GPU environments.
"},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__call__","title":"__call__(prompt=None, images=None, video=None, frames=MAX_NUMBER_OF_FRAMES)
","text":"Qwen2-VL model answers questions about a video or image.
Parameters:
Name Type Description Defaultprompt
str
The prompt with the question to be answered.
None
images
list[Image]
A list of images for the model to process. None if using video.
None
video
VideoNumpy | None
A numpy array containing the different images, representing the video.
None
frames
int
The number of frames to be used from the video.
MAX_NUMBER_OF_FRAMES
Returns:
Type Descriptionlist[str]
list[str]: The answers to the prompt.
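A minimal sketch of the image path using the images parameter documented above (the path and question are placeholders):
from PIL import Image
from vision_agent_tools.models.qwen2_vl import Qwen2VL

run_inference = Qwen2VL()
image = Image.open("path/to/your/image.jpg")

# Ask about a list of images instead of a video.
answer = run_inference(prompt="Describe this image in detail", images=[image])
print(answer)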
"},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__init__","title":"__init__(model_config=None)
","text":"Initializes the Qwen2-VL model.
"},{"location":"shared_model_manager/","title":"Shared Model Manager","text":"The SharedModelManager
class is designed to manage and facilitate the use of machine learning models across different devices, such as CPUs and GPUs, within an asynchronous environment. It ensures safe and efficient execution of these models, particularly in scenarios where GPU resources need to be shared exclusively among multiple models. The manager coordinates access to the shared GPU, preventing conflicts when multiple models require it. Models are only loaded into memory when needed using the fetch_model
function.
add()
: Registers a machine learning model class with the manager. The actual model instance is not loaded at this point.fetch_model()
: Retrieves the previously added model class and creates (loads) the actual model instance. This function uses PyTorch's to()
 interface to handle device (CPU/GPU) allocation based on availability.
\u26a0\ufe0f \u2755: We should ALWAYS add model instance on CPU to the pool. This avoids overwhelming the GPU memory, and model pool will automatically put it in GPU when the model is fetched..
model_pool = SharedModelManager()\n\n# Add models instance to the pool\nmodel_pool.add(QRReader())\nmodel_pool.add(Owlv2(model_config=OWLV2Config(device=Device.CPU)))\n\n# Read image\nimage = Image.open(\"path/to/your/image.jpg\")\n\n# Use QRReader model\nasync def use_qr_reader():\n # Read image\n image = Image.open(\"path/to/your/image.jpg\")\n\n qr_reader = await model_pool.fetch_model(QRReader.__name__)\n detections = qr_reader(image)\n # Process detections ...\n\n# Use Owlv2 model\nasync def use_owlv2():\n # Read image\n image = Image.open(\"path/to/your/image.jpg\")\n\n owlv2 = await model_pool.fetch_model(Owlv2.__name__)\n prompts = [\"a photo of a cat\", \"a photo of a dog\"]\n results = owlv2(image, prompts=prompts)\n # Process results ...\n
"},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager","title":"SharedModelManager
","text":""},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.add","title":"add(model)
","text":"Adds a model to the pool with a device preference.
Parameters:
Name Type Description Defaultmodel
Basetool
The modal instance to be added to the pool, it should implement the BaseTool interface.
requireddevice
Device
The preferred device for the model.
requiredReturns:
Name Type Descriptionstr
str
The model ID to be used for fetching the model.
"},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.fetch_model","title":"fetch_model(model_id)
","text":"Retrieves a model from the pool for safe execution.
Parameters:
Name Type Description Defaultmodel_id
str
Id to access the model in the pool.
requiredReturns:
Name Type DescriptionAny
BaseTool
The retrieved model instance.
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Vision Agent Tools Documentation","text":"This repository contains tools that solve vision problems. This tools can be used in conjunction with the vision-agent.
"},{"location":"clip_media_sim/","title":"CLIPMediaSim","text":""},{"location":"clip_media_sim/#video-similarity","title":"Video similarity","text":"import cv2\nfrom PIL import Image\n\nfrom vision_agent_tools.models.clip_media_sim import CLIPMediaSim\n\n# Path to your target image\nimage_path = \"path/to/your/image.jpg\"\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the image\ntarget_image = Image.open(image_path)\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nfps = cap.get(cv2.CAP_PROP_FPS)\nframes = []\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n frames.append(frame)\ncap.release()\n\n# Calculate video timestamps\nvideo_time = len(frames) / fps\n\n# Create the CLIPMediaSim instance\nclip_media_sim = CLIPMediaSim()\n\n# Run video similarity against the target image\nresults = clip_media_sim(video=frames, target_image=target_image)\n\n# The results should be a list of [index_of_frame, confidence_score] where the\n# video is similar to the target image.\n\n# To find the time at which a given frame happens, you can do the following\n\ntime_per_frame = video_time / len(frames)\n\ntimestamp = results[0][0] * time_per_frame\n\nprint(\"Similarity detection complete!\")\n
You can also run similarity against a target text doing the following:
results = clip_media_sim(video=frames, target_text=\"a turtle holding the earth\")\n
"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim","title":"CLIPMediaSim
","text":" Bases: BaseMLModel
A class that receives a video and a target image or text and returns the frames that are most similar to the target.
"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__call__","title":"__call__(video, target_image=None, target_text=None, thresh=0.3)
","text":"Receives a video and a target image or text and returns the frames that are most similar to the target.
Parameters:
Name Type Description Defaultvideo
ndarray
The input video to be processed.
requiredtarget_image
Image | None
The target image to compare the video frames with.
None
target_text
str | None
The target text to compare the video frames with.
None
thresh
float
The threshold to filter the results. Defaults to 0.3.
0.3
"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__init__","title":"__init__(device='cuda')
","text":"Initializes the CLIPMediaSim object with a pre-trained CLIP model.
"},{"location":"controlnet_aux/","title":"Controlnet-Aux","text":""},{"location":"controlnet_aux/#pose-detector","title":"Pose Detector","text":"from PIL import Image\nfrom vision_agent_tools.models.controlnet_aux import Image2Pose\n\n# Path to your test image\ntest_image_path = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image_path)\n# Create the Image2Pose instance\nimage_2_pose = Image2Pose()\n\n# Run pose detection and get the results\nresults = image_2_pose(image)\n\n# Optional: Save the result image (assuming results is a PIL Image)\n# results.save(\"result.png\")\n\nprint(\"Pose detection complete!\")\n
Pose Detection Result"},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose","title":"Image2Pose
","text":"A class that simplifies human pose detection using a pre-trained Openpose model.
This class provides a convenient way to run pose detection on images using a pre-trained Openpose model from the controlnet_aux
library. It takes a PIL Image object as input and returns the predicted pose information.
__call__(image)
","text":"Performs pose detection on a PIL image and returns the results.
This method takes a PIL Image object as input and runs the loaded Openpose detector on it. The predicted pose information is then resized to match the original image size and returned.
Parameters:
Name Type Description Defaultimage
Image
The input image for pose detection.
requiredReturns:
Type DescriptionImage
PIL.Image: The image with the predicted pose information (format might vary depending on the specific OpenposeDetector implementation).
"},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__init__","title":"__init__()
","text":"Initializes the Image2Pose object with a pre-trained Openpose detector.
This method loads a pre-trained Openpose model from the specified model hub (\"lllyasviel/Annotators\" in this case). The loaded detector is stored as an attribute for future use.
"},{"location":"depth_anything_v2/","title":"Depth-Anything-V2","text":"This example demonstrates using the Depth-Anything-V2 tool for depth estimation on images.
from vision_agent_tools.models.depth_anything_v2 import DepthAnythingV2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the depth map estimation model.\ndepth_estimate = DepthAnythingV2()\n\n# Run the inference\nresults = depth_estimate(image)\n\n# Let's print the obtained depth map\nprint(results.map)\n
"},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2","title":"DepthAnythingV2
","text":" Bases: BaseMLModel
Model for depth estimation using the Depth-Anything-V2 model from the paper Depth Anything V2.
"},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__call__","title":"__call__(image, grayscale=False)
","text":"Depth-Anything-V2 is a highly practical solution for robust monocular depth estimation.
Parameters:
Name Type Description Defaultimage
Union[str, Image, ndarray]
The input image for depth estimation. Can be a file path, a PIL Image, or a NumPy array.
requiredgrayscale
bool
Whether to return the depth map as a grayscale image. If True, the depth map will be normalized to the range [0, 255] and converted to uint8. Defaults to False.
False
Returns:
Name Type DescriptionDepthMap
DepthMap
An object type containing a numpy array with the HxW depth map of the image.
"},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__init__","title":"__init__()
","text":"Initializes the Depth-Anything-V2 model.
"},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthMap","title":"DepthMap
","text":" Bases: BaseModel
Represents the depth map of an image.
Attributes:
Name Type Descriptionmap
Any
HxW raw depth map of the image.
"},{"location":"florence2-qa/","title":"FlorenceQA","text":"This example demonstrates using the Florence2-QA tool to to answer questions about images.
NOTE: The FlorenceQA model can only be used in GPU environments.
from vision_agent_tools.models.florence2_qa import FlorenceQA\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image and create initialize the FlorenceQA model\nimage = Image.open(test_image)\nrun_florence_qa = FlorenceQA()\n\n# Time to put FlorenceQA to work! Let's pose a question about the image\nanswer = run_florence_qa(image, question=\"Is there a dog in the image?\")\n\n# Print the output answer\nprint(answer)\n
"},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA","title":"FlorenceQA
","text":" Bases: BaseMLModel
FlorenceQA is a tool that combines the Florence-2 and Roberta QA models to answer questions about images.
NOTE: The Florence-2 model can only be used in GPU environments.
"},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__call__","title":"__call__(image, question)
","text":"FlorenceQA model answers questions about images.
Parameters:
Name Type Description Defaultimage
Image
The image to be analyzed.
requiredquestion
str
The question to be answered.
requiredReturns:
Name Type Descriptionstr
dict[str, Any]
The answer to the question.
"},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__init__","title":"__init__()
","text":"Initializes the FlorenceQA model.
"},{"location":"florence2-sam2/","title":"Florence2Sam2","text":"This tool uses Florence2 and the SAM-2 model to do text to instance segmentation on image or video inputs.
import cv2\n\nfrom vision_agent_tools.models.florence2_sam2 import Florence2SAM2\n\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n frames.append(frame)\ncap.release()\n\n# Create the Florence2SAM2 instance\nflorence2_sam2 = Florence2SAM2()\n\n# segment all the instances of the prompt \"ball\" for all video frames\nresults = florence2_sam2(prompt=\"ball\", video=frames)\n\n# Returns a list of list where the first list represents the frames and the inner\n# list contains all the predictions per frame. The annotation ID can be used\n# to track the same object across different frames. For example:\n[\n [\n {\n \"id\": 0\n \"mask\": rle\n \"label\": \"ball\"\n \"bbox\": [x_min, y_min, x_max, y_max]\n }\n ],\n [\n {\n \"id\": 0\n \"mask\": rle\n \"label\": \"ball\"\n \"bbox\": [x_min, y_min, x_max, y_max]\n }\n ]\n]\n\nprint(\"Instance segmentation complete!\")\n
You can also run similarity against an image and get additionally bounding boxes doing the following:
results = florence2_sam2(image=image, prompts=[\"ball\"])\n
"},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2","title":"Florence2SAM2
","text":" Bases: BaseMLModel
A class that receives a video or images, a text prompt and returns the instance segmentation based on the input for each frame.
"},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__call__","title":"__call__(prompt, images=None, video=None, *, chunk_length_frames=20, iou_threshold=0.6, nms_threshold=0.3)
","text":"Florence2Sam2 model find objects in images and track objects in a video.
Parameters:
Name Type Description Defaultprompt
str
The text input that complements the media to find or track objects.
requiredimages
list[Image] | None
The images to be analyzed.
None
video
VideoNumpy | None
A numpy array containing the different images, representing the video.
None
chunk_length_frames
int | None
The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.
20
iou_threshold
float
The IoU threshold value used to compare last_predictions and new_predictions objects.
0.6
nms_threshold
float
The non-maximum suppression threshold value used to filter the Florence2 predictions.
0.3
Returns:
Type Descriptionlist[list[dict[str, Any]]]
list[list[dict[str, Any]]]: A list where each item represents each frames predictions. [[{ \"id\": 0, \"mask\": rle, \"label\": \"car\", \"bbox\": [0.1, 0.2, 0.3, 0.4] }]]
"},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__init__","title":"__init__(model_config=Florence2SAM2Config())
","text":"Initializes the Florence2SAM2 object with a pre-trained Florence2 model and a SAM2 model.
"},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.fine_tune","title":"fine_tune(checkpoint)
","text":"Load the fine-tuned Florence-2 model.
"},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.load_base","title":"load_base()
","text":"Load the base Florence-2 model.
"},{"location":"florence2/","title":"Florence-2","text":"This example demonstrates using the Florence2 tool to interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.
from vision_agent_tools.shared_types import PromptTask\nfrom vision_agent_tools.models.florence2 import Florence2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Choose the task that you are planning to use\ntask_prompt = PromptTask.CAPTION\n\n# Load the image and create initialize the Florence2 model\nimage = Image.open(test_image)\nmodel = Florence2()\n\n# Time to put Florence2 to work! Let's see what it finds...\nresults = model(images=[image], task=task_prompt)\n\n# Print the output result\nprint(f\"The image contains: {results[0]}\")\n
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2","title":"Florence2
","text":" Bases: BaseMLModel
Florence2 model. It supported both zero-shot and fine-tuned settings. For the zero-shot we use the Florence-2-large. For fine-tuning we use the Florence-2-base-ft. This model can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__call__","title":"__call__(task, prompt='', images=None, video=None, *, batch_size=5, nms_threshold=0.3, chunk_length_frames=None)
","text":"Performs inference on the Florence-2 model based on the provided task, images or video, and prompt.
Parameters:
Name Type Description Defaulttask
PromptTask
The task to be performed on the images or video.
requiredprompt
Optional[str]
The text input that complements the prompt task.
''
images
list[Image] | None
A list of images for the model to process. None if using video.
None
video
VideoNumpy | None
A NumPy representation of the video for inference. None if using images.
None
batch_size
int
The batch size used for processing multiple images or video frames.
5
nms_threshold
float
The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).
0.3
chunk_length_frames
int | None
The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.
None
Returns:
Name Type DescriptionFlorence2ResponseType
Florence2ResponseType
The output of the Florence-2 model based on the task and prompt.
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__init__","title":"__init__(model_config=Florence2Config())
","text":"Initializes the Florence2 model.
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.fine_tune","title":"fine_tune(checkpoint)
","text":"Load the fine-tuned Florence-2 model.
"},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.load_base","title":"load_base()
","text":"Load the base Florence-2 model.
"},{"location":"flux1/","title":"Flux1","text":"This example demonstrates using the Flux1 model to perform tasks such as image generation and mask inpainting based on text prompts.
"},{"location":"flux1/#parameters","title":"Parameters","text":"Flux1Config
class allows you to configure the parameters for the Flux1 model.Below is an example of how to create and use a Flux1Config
object:
from vision_agent_tools.models.flux1 import Flux1Config\n\nconfig = Flux1Config(\n height=512,\n width=512,\n num_inference_steps=28,\n guidance_scale=3.5,\n num_images_per_prompt=1,\n max_sequence_length=512,\n seed=42\n)\n
image
. Must be between 0 and 1. A value of 1 essentially ignores image
.import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# To perform image generation\nflux1 = Flux1()\n\ngenerated_image = flux_model(\n task=Flux1Task.IMAGE_GENERATION, # Image Generation Task\n prompt=\"A purple car in a futuristic cityscape\",\n config=config\n)\ngenerated_image.save(\"generated_car.png\")\n
"},{"location":"flux1/#perform-mask-inpainting","title":"Perform mask inpainting","text":"To perform mask inpainting, both the original image and the mask image need to be provided. These images have the same dimensions. The mask should clearly delineate the areas that you want to modify in the original image. Additionally, the inpainting process includes a strength parameter, which controls the intensity of the modifications applied to the masked areas.
import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have a cat image named \"cat_image.jpg\" that you want to use for mask inpainting\nimage_to_edit = Image.open(\"path/to/your/cat_image.jpg\").convert(\"RGB\") # Image to inpaint\n\n# Make sure to provide a mask image with the same dimensions, delineating the cat\nmask_image = Image.open(\"path/to/your/mask.png\") # Mask image indicating areas to change\n\n# Set a new prompt for inpainting\ninpainting_prompt = \"A cute dog\"\n\n# To perform image mask inpainting\nflux1 = Flux1()\n\ninpainted_image = flux_model(\n task=Flux1Task.MASK_INPAINTING, # Image Mask Inpainting Task\n prompt=inpainting_prompt,\n image=image_to_edit,\n mask_image=mask_image,\n config=config\n)\n\ninpainted_image.save(\"inpainted_dog_over_cat.png\")\n
"},{"location":"flux1/#perform-image-to-image-generation","title":"Perform image-to-image generation","text":"To perform image-to-image generation, you need to provide an original image along with a text prompt describing the desired modifications. The original image serves as the base, and the model will generate a new image based on the prompt.
import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have an original image named \"original_image.jpg\" that you want to use for image-to-image generation\noriginal_image = Image.open(\"path/to/your/original_image.jpg\").convert(\"RGB\") # Original image\n\n# Set a new prompt for image-to-image generation\nimage_to_image_prompt = \"A sunny beach with palm trees\"\n\n# To perform image-to-image generation\nflux1 = Flux1()\n\ngenerated_image = flux_model(\n task=Flux1Task.IMAGE_TO_IMAGE, # Image-to-Image Generation Task\n prompt=image_to_image_prompt,\n image=original_image,\n config=config\n)\n\ngenerated_image.save(\"generated_beach.png\")\n
"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1","title":"Flux1
","text":" Bases: BaseMLModel
Tool for object detection using the pre-trained Flux1 model. This tool takes a prompt as input and generates an image using the Flux1 model.
"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__call__","title":"__call__(prompt=Field(max_length=512), task=Flux1Task.IMAGE_GENERATION, config=Flux1Config(), image=None, mask_image=None)
","text":"Performs object detection on an image using the Flux1 model.
Parameters:
Name Type Description Default-
prompt (str
The text prompt describing the desired modifications.
required-
task (Flux1Task
The task to perform with the model: image generation (\"generation\"), mask inpainting (\"inpainting\"), or image-to-image generation (\"img2img\").
required-
config (Flux1Config
The configuration for the generation task. Fields: height (int, optional, defaults to 512): The height in pixels of the generated image. width (int, optional, defaults to 512): The width in pixels of the generated image. num_inference_steps (int, optional, defaults to 28): The number of denoising steps. guidance_scale (float, optional, defaults to 3.5): Guidance scale as defined in Classifier-Free Diffusion Guidance; a higher guidance scale encourages images that are closely linked to the text prompt, usually at the expense of lower image quality. num_images_per_prompt (int, optional, defaults to 1): The number of images to generate per prompt. max_sequence_length (int, optional, defaults to 512): Maximum sequence length to use with the prompt. seed (int, optional): The seed to use for the random number generator, to make generation deterministic; if not provided, a random seed is used. strength (float, optional, defaults to 0.6): Indicates the extent to transform the reference image; must be between 0 and 1, where a value of 1 essentially ignores the reference image.
Flux1Config()-
image (Image.Image
The original image to be modified.
required-
mask_image (Image.Image
The mask image indicating areas to be inpainted.
requiredReturns:
Type DescriptionList[Image] | None
List[Image.Image]: The list of generated image(s) if successful; None if an error occurred.
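For example, because the call is documented to return a list of images, a request for several variations per prompt can iterate over the result (a minimal sketch based on the signature above; adjust it if your version returns a single image as in the quickstart examples):
from vision_agent_tools.models.flux1 import Flux1, Flux1Config, Flux1Task\n\n# Ask for two variations of the same prompt with a fixed seed\nconfig = Flux1Config(num_images_per_prompt=2, seed=42)\n\nflux1 = Flux1()\nimages = flux1(\n    task=Flux1Task.IMAGE_GENERATION,\n    prompt=\"A purple car in a futuristic cityscape\",\n    config=config\n)\n\n# Save every generated variation (the call may return None on error)\nfor i, image in enumerate(images or []):\n    image.save(f\"generated_car_{i}.png\")\n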
"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__init__","title":"__init__(hf_model='black-forest-labs/FLUX.1-schnell', dtype=torch.bfloat16, enable_sequential_cpu_offload=True)
","text":"Initializes the Flux1 image generation tool. Loads the pre-trained Flux1 model from HuggingFace and sets model configurations.
Parameters:
Name Type Description Default-
hf_model (str
The HuggingFace model name to load.
'black-forest-labs/FLUX.1-schnell'-
dtype (torch.dtype
The data type to use for the model.
required-
enable_sequential_cpu_offload (bool
Whether to enable sequential CPU offload.
required"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1Config","title":"Flux1Config
","text":" Bases: BaseModel
Configuration for the Flux1 model.
"},{"location":"internlm_xcomposer2/","title":"InternLM-XComposer-2.5","text":"This example demonstrates how to use the InternLM-XComposer-2.5 tool to to answer questions about images or videos.
NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.
import cv2\nimport numpy as np\n\nfrom vision_agent_tools.models.internlm_xcomposer2 import InternLMXComposer2\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the InternLMXComposer2 model\nrun_inference = InternLMXComposer2()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put InternLMXComposer2 to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
"},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2","title":"InternLMXComposer2
","text":" Bases: BaseMLModel
InternLM-XComposer-2.5 is a tool that excels in various text-image comprehension and composition applications, achieving GPT-4V level capabilities.
NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.
"},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__call__","title":"__call__(prompt, image=None, video=None, frames=MAX_NUMBER_OF_FRAMES, chunk_length=None)
","text":"InternLMXComposer2 model answers questions about a video or image.
Parameters:
Name Type Description Defaultprompt
str
The prompt with the question to be answered.
requiredimage
Image | None
The image to be analyzed.
None
video
VideoNumpy | None
A numpy array containing the different images, representing the video.
None
frames
int
The number of frames to be used from the video.
MAX_NUMBER_OF_FRAMES
chunk_length
int
The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.
None
Returns:
Type Descriptionlist[str]
list[str]: The answers to the prompt.
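The same call also works on a single image instead of a video; a minimal sketch using the documented image parameter (the path and prompt are placeholders):
from PIL import Image\n\nfrom vision_agent_tools.models.internlm_xcomposer2 import InternLMXComposer2\n\n# (replace this path with your own!)\nimage = Image.open(\"path/to/your/image.jpg\")\n\nrun_inference = InternLMXComposer2()\n\n# Ask a question about a single image instead of a video\nanswer = run_inference(prompt=\"Describe this image in detail\", image=image)\nprint(answer)\n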
"},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__init__","title":"__init__()
","text":"Initializes the InternLMXComposer2.5 model.
"},{"location":"nsfw_classification/","title":"NSFW (Not Safe for Work) classification","text":"This example demonstrates using the Not Safe for Work classification tool.
from PIL import Image\n\nfrom vision_agent_tools.models.nsfw_classification import NSFWClassification\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n\n# Initialize the NSFW model.\nnsfw_classification = NSFWClassification()\n\n# Run the inference\nresults = nsfw_classification(image)\n\n# Let's print the predicted label\nprint(results.label)\n
"},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification","title":"NSFWClassification
","text":" Bases: BaseMLModel
The primary intended use of this model is for the classification of NSFW (Not Safe for Work) images.
"},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__call__","title":"__call__(image)
","text":"Performs the NSFW inference on an image using the NSFWClassification model.
Parameters:
Name Type Description Defaultimage
Image
The input image for NSFW classification.
requiredReturns:
Name Type DescriptionNSFWInferenceData
NSFWInferenceData
The inference result from the NSFWClassification model. label (str): The label for the unsafe content detected in the image. score (float): The score for the unsafe content detected in the image.
"},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__init__","title":"__init__()
","text":"Initializes the NSFW (Not Safe for Work) classification tool.
"},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWInferenceData","title":"NSFWInferenceData
","text":" Bases: BaseModel
Represents an inference result from the NSFWClassification model.
Attributes:
Name Type Descriptionlabel
str
The predicted label for the image.
score
float
The confidence score associated with the prediction (between 0 and 1).
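In practice both attributes are read straight off the returned object; a small sketch that reuses the results value from the example above (the \"nsfw\" label string and the 0.5 threshold are illustrative, not part of the documented API):
# results is the NSFWInferenceData returned by NSFWClassification.__call__ above\nprint(f\"label={results.label}, score={results.score:.2f}\")\n\n# Simple gate on the confidence score (label value and threshold are illustrative)\nif results.label == \"nsfw\" and results.score > 0.5:\n    print(\"Flagging this image for review\")\n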
"},{"location":"nshot_counting/","title":"LOCA (Low-shot Object Counting network with iterative prototype Adaptation).","text":"This example demonstrates how to use the NShot LOCA tool for object counting in images.
from PIL import Image\n\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n\n# Initialize the counting model and choose the image output size you expect.\nobject_counting = NShotCounting(zero_shot=False, img_size=512)\n\n# Run the inference\nresults = object_counting(image, bbox=[12, 34, 56, 78])\n\n# Let's find out how many objects were found in total\nprint(f\"Found a total count of {results.count} objects on the image!\")\n
"},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.CountingDetection","title":"CountingDetection
","text":" Bases: BaseModel
Represents an inference result from the LOCA model.
Attributes:
Name Type Descriptioncount
int
The predicted number of detected objects.
masks
list[Any]
A list of numpy arrays representing the masks of the detected objects in the image.
"},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting","title":"NShotCounting
","text":" Bases: BaseMLModel
Model for object counting using the zero-shot and n-shot versions of the LOCA model from the paper A Low-Shot Object Counting Network With Iterative Prototype Adaptation.
"},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__call__","title":"__call__(image, bbox=None)
","text":"LOCA injects shape and appearance information into object queries to precisely count objects of various sizes in densely and sparsely populated scenarios. It also extends to a zeroshot scenario and achieves excellent localization and count errors across the entire low-shot spectrum.
Parameters:
Name Type Description Defaultimage
Image
The input image for object counting.
requiredbbox
BoundingBox
A list of four ints representing the bounding box coordinates (xmin, ymin, xmax, ymax) of the detected query in the image.
None
Returns:
Name Type DescriptionCountingDetection
CountingDetection
An object type containing: - The count of the objects found similar to the bbox query. - A list of numpy arrays representing the masks of the objects found.
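When no bounding-box query is available, the zero-shot variant can be used instead; a minimal sketch based on the signatures documented on this page:
from PIL import Image\n\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\n# (replace this path with your own!)\nimage = Image.open(\"path/to/your/image.jpg\")\n\n# Zero-shot counting: no bbox query is passed\nzero_shot_counting = NShotCounting(zero_shot=True, img_size=512)\nresults = zero_shot_counting(image)\n\nprint(f\"Found a total count of {results.count} objects on the image!\")\n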
"},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__init__","title":"__init__(zero_shot=True, img_size=512)
","text":"Initializes the LOCA model.
Parameters:
Name Type Description Defaultzero_shot
bool
Whether to use the zero-shot version of the LOCA model.
True
img_size
int
Size of the input image.
512
"},{"location":"owlv2/","title":"OWLv2 Open-World Localization","text":"This example demonstrates using the Owlv2 tool for object detection in images based on text prompts.
from PIL import Image\n\nfrom vision_agent_tools.models.owlv2 import Owlv2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# What are you looking for? Write your detective prompts here!\nprompts = [\"a photo of a cat\", \"a photo of a dog\"]\n\n# Load the image and create your Owlv2 detective tool\nimage = Image.open(test_image)\nowlv2 = Owlv2()\n\n# Time to put Owlv2 to work! Let's see what it finds...\nresults = owlv2(prompts=prompts, images=[image])[0]\n\n# Did Owlv2 sniff out any objects? Let's see the results!\nif results:\n    for detection in results:\n        print(f\"Found it! It looks like a {detection['label']} with a confidence of {detection['score']:.2f}.\")\n        print(f\"Here's where it's hiding: {detection['bbox']}\")\nelse:\n    print(\"Hmm, Owlv2 couldn't find anything this time. Maybe try a different prompt?\")\n
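You can also run the same prompts over video frames instead of a single image; a sketch assuming a local video file, with the frames stacked into the numpy array expected by the video parameter:
import cv2\nimport numpy as np\n\nfrom vision_agent_tools.models.owlv2 import Owlv2\n\n# (replace this path with your own!)\ncap = cv2.VideoCapture(\"path/to/your/video.mp4\")\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\ncap.release()\nvideo = np.stack(frames, axis=0)\n\nowlv2 = Owlv2()\nprompts = [\"a photo of a cat\", \"a photo of a dog\"]\n\n# One result entry per frame; an entry may be None if nothing clears the confidence threshold\nresults = owlv2(prompts=prompts, video=video, batch_size=4)\n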
"},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2","title":"Owlv2
","text":" Bases: BaseMLModel
Tool for object detection using the pre-trained Owlv2 model from Transformers.
This tool takes images and a prompt as input, performs object detection using the Owlv2 model, and returns a list of objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding a threshold.
"},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__call__","title":"__call__(prompts, images=None, video=None, *, batch_size=1, nms_threshold=0.3, confidence=0.1)
","text":"Performs object detection on images using the Owlv2 model.
Parameters:
Name Type Description Defaultprompts
list[str]
The prompts to be used for object detection.
requiredimages
list[Image] | None
The images to be analyzed.
None
video
VideoNumpy[uint8] | None
A numpy array containing the different images, representing the video.
None
batch_size
int
The batch size used for processing multiple images or video frames.
1
nms_threshold
float
The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).
0.3
confidence
float
Confidence threshold for model predictions.
0.1
Returns:
Type Descriptionlist[ODWithScoreResponse]
list[ODWithScoreResponse]: A list of ODWithScoreResponse
objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding the threshold. The item will be None if no objects are detected above the confidence threshold for an specific image / frame.
__init__(model_config=OWLV2Config())
","text":"Loads the pre-trained Owlv2 processor and model from Transformers.
"},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS","title":"Owlv2ProcessorWithNMS
","text":" Bases: Owlv2Processor
post_process_object_detection_with_nms(outputs, *, threshold=0.1, nms_threshold=0.3, target_sizes=None)
","text":"Converts the raw output of [OwlViTForObjectDetection
] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
Parameters:
Name Type Description Defaultoutputs
OwlViTObjectDetectionOutput
Raw outputs of the model.
requiredthreshold
float
Score threshold to keep object detection predictions.
0.1
nms_threshold
float
IoU threshold to filter overlapping objects from the raw detections.
0.3
target_sizes
TensorType | list[Tuple] | None
Tensor of shape (batch_size, 2)
or list of tuples (Tuple[int, int]
) containing the target size (height, width)
of each image in the batch. If unset, predictions will not be resized.
None
Returns: list[dict]
: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model.
Tool for detecting QR codes in images.
from PIL import Image, ImageDraw\n\nfrom vision_agent_tools.models.qr_reader import QRReader\n\n# Open the image containing the QR code\nimage = Image.open(\"sample_qr_image.jpeg\")\n\n# Create a QR code reader object\nqr_reader = QRReader()\n\n# Detect QR codes in the image\ndetections = qr_reader(image)\n\n\nif detections:\n\n detection = detections[0]\n draw = ImageDraw.Draw(image)\n\n # Print the detected text\n print(f\"Decoded Text: {detection.text}\")\n\n # Draw the bounding box\n x_min, y_min, x_max, y_max = (\n int(detection.bbox[0]),\n int(detection.bbox[1]),\n int(detection.bbox[2]),\n int(detection.bbox[3]),\n )\n draw.rectangle(((x_min, y_min), (x_max, y_max)), outline=\"red\", width=2)\n\n # Draw the text on top of the image\n draw.text((x_min + 10, y_min - 10), detection.text, fill=\"blue\", anchor=\"mm\")\n image.show()\nelse:\n print(\"No QR codes detected in the image.\")\n
Displaying the Detection Result"},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRCodeDetection","title":"QRCodeDetection
","text":" Bases: BaseModel
Represents a detected QR code.
"},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader","title":"QRReader
","text":" Bases: BaseMLModel
This tool utilizes the qreader
library to detect QR codes within an input image. It returns a list of QRCodeDetection
objects for each detected QR code, containing the decoded text, confidence score, polygon coordinates, bounding box, and center point.
__call__(image)
","text":"Detects QR codes in an image.
Parameters:
Name Type Description Defaultimage
Image
The input image for QR code detection.
requiredReturns:
Type Descriptionlist[QRCodeDetection]
list[QRCodeDetection]: A list of QRCodeDetection
objects containing information about each detected QR code, or an empty list if none are found.
__init__()
","text":"Initializes the QR code reader tool.
Loads the QReader
instance for QR code detection.
This example demonstrates how to use the Qwen2-VL model to answer questions about images or videos.
NOTE: The Qwen2-VL model should be used in GPU environments.
import cv2\nimport numpy as np\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n ret, frame = cap.read()\n if not ret:\n break\n frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the Qwen2VL model\nrun_inference = Qwen2VL()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put Qwen2VL to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
"},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL","title":"Qwen2VL
","text":" Bases: BaseMLModel
Qwen2-VL is a model that is capable of accurately identifying and comprehending the content within images, regardless of their clarity, resolution, or extreme aspect ratios.
NOTE: The Qwen2-VL model should be used in GPU environments.
"},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__call__","title":"__call__(prompt=None, images=None, video=None, frames=MAX_NUMBER_OF_FRAMES)
","text":"Qwen2-VL model answers questions about a video or image.
Parameters:
Name Type Description Defaultprompt
str
The prompt with the question to be answered.
None
images
list[Image]
A list of images for the model to process. None if using video.
None
video
VideoNumpy | None
A numpy array containing the different images, representing the video.
None
frames
int
The number of frames to be used from the video.
MAX_NUMBER_OF_FRAMES
Returns:
Type Descriptionlist[str]
list[str]: The answers to the prompt.
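Besides video, the model also accepts a list of images; a minimal sketch using the documented images parameter (paths and prompt are placeholders):
from PIL import Image\n\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\n# (replace these paths with your own!)\nimages = [Image.open(\"path/to/your/image_1.jpg\"), Image.open(\"path/to/your/image_2.jpg\")]\n\nrun_inference = Qwen2VL()\nanswer = run_inference(prompt=\"What is shown in these images?\", images=images)\nprint(answer)\n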
"},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__init__","title":"__init__(model_config=None)
","text":"Initializes the Qwen2-VL model.
"},{"location":"shared_model_manager/","title":"Shared Model Manager","text":"The SharedModelManager
class is designed to manage and facilitate the use of machine learning models across different devices, such as CPUs and GPUs, within an asynchronous environment. It ensures safe and efficient execution of these models, particularly in scenarios where GPU resources need to be shared exclusively among multiple models. The manager coordinates access to the shared GPU, preventing conflicts when multiple models require it. Models are only loaded into memory when needed using the fetch_model
function.
add()
: Registers a machine learning model class with the manager. The actual model instance is not loaded at this point.fetch_model()
: Retrieves the previously added model class and creates (loads) the actual model instance. This function uses the PyTorch interface to handle device (CPU/GPU) allocation based on availability.
The usage example demonstrates adding models and then using them with their respective functionalities.
\u26a0\ufe0f \u2755: We should ALWAYS add model instances on CPU to the pool. This avoids overwhelming the GPU memory, and the model pool will automatically move a model to the GPU when it is fetched.
from PIL import Image\n\n# NOTE: import paths below follow this repository's docs; adjust them if your layout differs\nfrom vision_agent_tools.models.owlv2 import OWLV2Config, Owlv2\nfrom vision_agent_tools.models.qr_reader import QRReader\nfrom vision_agent_tools.shared_types import Device\nfrom vision_agent_tools.tools.shared_model_manager import SharedModelManager\n\nmodel_pool = SharedModelManager()\n\n# Add model instances to the pool (always on CPU)\nmodel_pool.add(QRReader())\nmodel_pool.add(Owlv2(model_config=OWLV2Config(device=Device.CPU)))\n\n# Use QRReader model\nasync def use_qr_reader():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    qr_reader = await model_pool.fetch_model(QRReader.__name__)\n    detections = qr_reader(image)\n    # Process detections ...\n\n# Use Owlv2 model\nasync def use_owlv2():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    owlv2 = await model_pool.fetch_model(Owlv2.__name__)\n    prompts = [\"a photo of a cat\", \"a photo of a dog\"]\n    results = owlv2(prompts=prompts, images=[image])\n    # Process results ...\n
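Because fetch_model is awaited, the helper coroutines above need an event loop; one way to drive them is sketched below (it assumes the functions defined in the snippet above):
import asyncio\n\nasync def main():\n    await use_qr_reader()\n    await use_owlv2()\n\nasyncio.run(main())\n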
"},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager","title":"SharedModelManager
","text":""},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.add","title":"add(model)
","text":"Adds a model to the pool with a device preference.
Parameters:
Name Type Description Defaultmodel
Basetool
The model instance to be added to the pool; it should implement the BaseTool interface.
requireddevice
Device
The preferred device for the model.
requiredReturns:
Name Type Descriptionstr
str
The model ID to be used for fetching the model.
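The returned ID can then be passed to fetch_model; a small sketch that assumes the model_pool and QRReader objects from the usage example on this page:
async def detect_qr_codes(image):\n    # add() returns the ID under which the instance was registered\n    model_id = model_pool.add(QRReader())\n    qr_reader = await model_pool.fetch_model(model_id)\n    return qr_reader(image)\n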
"},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.fetch_model","title":"fetch_model(model_id)
","text":"Retrieves a model from the pool for safe execution.
Parameters:
Name Type Description Defaultmodel_id
str
Id to access the model in the pool.
requiredReturns:
Name Type DescriptionAny
BaseTool
The retrieved model instance.
"}]} \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 23098322..e74d7be6 100644 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ