diff --git a/florence2-sam2/index.html b/florence2-sam2/index.html index db7b1090..132eb3c8 100644 --- a/florence2-sam2/index.html +++ b/florence2-sam2/index.html @@ -411,6 +411,24 @@ + + +
  • + + + fine_tune + + + +
  • + +
  • + + + load_base + + +
  • @@ -667,6 +685,24 @@ + + +
  • + + + fine_tune + + + +
  • + +
  • + + + load_base + + +
  • @@ -961,6 +997,40 @@

    + + +

    + fine_tune(checkpoint) + +#

    + + +
    + +

    Load the fine-tuned Florence-2 model.

    + +
    + + + +
    + + +

    + load_base() + +#

    + + +
    + +

    Load the base Florence-2 model.

    + +
    + +
    + diff --git a/objects.inv b/objects.inv index d119075f..3c78b8c2 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/search/search_index.json b/search/search_index.json index 0c13bdfa..eae1d345 100644 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Vision Agent Tools Documentation","text":"

    This repository contains tools that solve vision problems. These tools can be used in conjunction with the vision-agent.

    "},{"location":"clip_media_sim/","title":"CLIPMediaSim","text":""},{"location":"clip_media_sim/#video-similarity","title":"Video similarity","text":"
    import cv2\nfrom PIL import Image\n\nfrom vision_agent_tools.models.clip_media_sim import CLIPMediaSim\n\n# Path to your target image\nimage_path = \"path/to/your/image.jpg\"\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the image\ntarget_image = Image.open(image_path)\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nfps = cap.get(cv2.CAP_PROP_FPS)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Calculate video timestamps\nvideo_time = len(frames) / fps\n\n# Create the CLIPMediaSim instance\nclip_media_sim = CLIPMediaSim()\n\n# Run video similarity against the target image\nresults = clip_media_sim(video=frames, target_image=target_image)\n\n# The results should be a list of [index_of_frame, confidence_score] where the\n# video is similar to the target image.\n\n# To find the time at which a given frame happens, you can do the following\n\ntime_per_frame = video_time / len(frames)\n\ntimestamp = results[0][0] * time_per_frame\n\nprint(\"Similarity detection complete!\")\n

    You can also run similarity against a target text doing the following:

    results = clip_media_sim(video=frames, target_text=\"a turtle holding the earth\")\n
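    The call also accepts a thresh argument (documented below) to filter out low-similarity frames; a small sketch, with the 0.5 value chosen arbitrarily:
    results = clip_media_sim(video=frames, target_image=target_image, thresh=0.5)\n# Only frames whose similarity to the target exceeds the threshold are returned\n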
    "},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim","title":"CLIPMediaSim","text":"

    Bases: BaseMLModel

    A class that receives a video and a target image or text and returns the frames that are most similar to the target.

    "},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__call__","title":"__call__(video, target_image=None, target_text=None, thresh=0.3)","text":"

    Receives a video and a target image or text and returns the frames that are most similar to the target.

    Parameters:

    Name Type Description Default video ndarray

    The input video to be processed.

    required target_image Image | None

    The target image to compare the video frames with.

    None target_text str | None

    The target text to compare the video frames with.

    None thresh float

    The threshold to filter the results. Defaults to 0.3.

    0.3"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__init__","title":"__init__(device='cuda')","text":"

    Initializes the CLIPMediaSim object with a pre-trained CLIP model.

    "},{"location":"controlnet_aux/","title":"Controlnet-Aux","text":""},{"location":"controlnet_aux/#pose-detector","title":"Pose Detector","text":"
    from PIL import Image\nfrom vision_agent_tools.models.controlnet_aux import Image2Pose\n\n# Path to your test image\ntest_image_path = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image_path)\n# Create the Image2Pose instance\nimage_2_pose = Image2Pose()\n\n# Run pose detection and get the results\nresults = image_2_pose(image)\n\n# Optional: Save the result image (assuming results is a PIL Image)\n# results.save(\"result.png\")\n\nprint(\"Pose detection complete!\")\n
    Pose Detection Result"},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose","title":"Image2Pose","text":"

    A class that simplifies human pose detection using a pre-trained Openpose model.

    This class provides a convenient way to run pose detection on images using a pre-trained Openpose model from the controlnet_aux library. It takes a PIL Image object as input and returns the predicted pose information.

    "},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__call__","title":"__call__(image)","text":"

    Performs pose detection on a PIL image and returns the results.

    This method takes a PIL Image object as input and runs the loaded Openpose detector on it. The predicted pose information is then resized to match the original image size and returned.

    Parameters:

    Name Type Description Default image Image

    The input image for pose detection.

    required

    Returns:

    Type Description Image

    PIL.Image: The image with the predicted pose information (format might vary depending on the specific OpenposeDetector implementation).

    "},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__init__","title":"__init__()","text":"

    Initializes the Image2Pose object with a pre-trained Openpose detector.

    This method loads a pre-trained Openpose model from the specified model hub (\"lllyasviel/Annotators\" in this case). The loaded detector is stored as an attribute for future use.

    "},{"location":"depth_anything_v2/","title":"Depth-Anything-V2","text":"

    This example demonstrates using the Depth-Anything-V2 tool for depth estimation on images.

    from PIL import Image\nfrom vision_agent_tools.models.depth_anything_v2 import DepthAnythingV2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the depth map estimation model.\ndepth_estimate = DepthAnythingV2()\n\n# Run the inference\nresults = depth_estimate(image)\n\n# Let's print the obtained depth map\nprint(results.map)\n
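    As a small follow-up to the example above, the grayscale flag documented below returns the depth map normalized to [0, 255] as uint8; a minimal sketch reusing the loaded image:
    gray_results = depth_estimate(image, grayscale=True)\n\n# Per the documentation, the map is normalized to [0, 255] and converted to uint8\nprint(gray_results.map.dtype)\n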
    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2","title":"DepthAnythingV2","text":"

    Bases: BaseMLModel

    Model for depth estimation using the Depth-Anything-V2 model from the paper Depth Anything V2.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__call__","title":"__call__(image, grayscale=False)","text":"

    Depth-Anything-V2 is a highly practical solution for robust monocular depth estimation.

    Parameters:

    Name Type Description Default image Union[str, Image, ndarray]

    The input image for depth estimation. Can be a file path, a PIL Image, or a NumPy array.

    required grayscale bool

    Whether to return the depth map as a grayscale image. If True, the depth map will be normalized to the range [0, 255] and converted to uint8. Defaults to False.

    False

    Returns:

    Name Type Description DepthMap DepthMap

    An object type containing a numpy array with the HxW depth map of the image.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__init__","title":"__init__()","text":"

    Initializes the Depth-Anything-V2 model.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthMap","title":"DepthMap","text":"

    Bases: BaseModel

    Represents the depth map of an image.

    Attributes:

    Name Type Description map Any

    HxW raw depth map of the image.

    "},{"location":"florence2-qa/","title":"FlorenceQA","text":"

    This example demonstrates using the Florence2-QA tool to answer questions about images.

    NOTE: The FlorenceQA model can only be used in GPU environments.

    from PIL import Image\nfrom vision_agent_tools.models.florence2_qa import FlorenceQA\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image and initialize the FlorenceQA model\nimage = Image.open(test_image)\nrun_florence_qa = FlorenceQA()\n\n# Time to put FlorenceQA to work! Let's pose a question about the image\nanswer = run_florence_qa(image, question=\"Is there a dog in the image?\")\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA","title":"FlorenceQA","text":"

    Bases: BaseMLModel

    FlorenceQA is a tool that combines the Florence-2 and Roberta QA models to answer questions about images.

    NOTE: The Florence-2 model can only be used in GPU environments.

    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__call__","title":"__call__(image, question)","text":"

    FlorenceQA model answers questions about images.

    Parameters:

    Name Type Description Default image Image

    The image to be analyzed.

    required question str

    The question to be answered.

    required

    Returns:

    Name Type Description str dict[str, Any]

    The answer to the question.

    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__init__","title":"__init__()","text":"

    Initializes the FlorenceQA model.

    "},{"location":"florence2-sam2/","title":"Florence2Sam2","text":"

    This tool uses Florence2 and the SAM-2 model to perform text-prompted instance segmentation on image or video inputs.

    import cv2\n\nfrom vision_agent_tools.models.florence2_sam2 import Florence2SAM2\n\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Create the Florence2SAM2 instance\nflorence2_sam2 = Florence2SAM2()\n\n# Segment all the instances of the prompt \"ball\" for all video frames\nresults = florence2_sam2(prompt=\"ball\", video=frames)\n\n# Returns a list of lists where the outer list represents the frames and the inner\n# list contains all the predictions per frame. The annotation ID can be used\n# to track the same object across different frames. For example:\n# [\n#     [\n#         {\n#             \"id\": 0,\n#             \"mask\": rle,\n#             \"label\": \"ball\",\n#             \"bbox\": [x_min, y_min, x_max, y_max]\n#         }\n#     ],\n#     [\n#         {\n#             \"id\": 0,\n#             \"mask\": rle,\n#             \"label\": \"ball\",\n#             \"bbox\": [x_min, y_min, x_max, y_max]\n#         }\n#     ]\n# ]\n\nprint(\"Instance segmentation complete!\")\n

    You can also run it on images and additionally get bounding boxes, as follows:

    results = florence2_sam2(prompt=\"ball\", images=[image])\n
    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2","title":"Florence2SAM2","text":"

    Bases: BaseMLModel

    A class that receives a video or images plus a text prompt, and returns the instance segmentation for each frame based on the input.

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__call__","title":"__call__(prompt, images=None, video=None, *, chunk_length_frames=20, iou_threshold=0.6, nms_threshold=0.3)","text":"

    The Florence2Sam2 model finds objects in images and tracks objects in a video.

    Parameters:

    Name Type Description Default prompt str

    The text input that complements the media to find or track objects.

    required images list[Image] | None

    The images to be analyzed.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None chunk_length_frames int | None

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    20 iou_threshold float

    The IoU threshold value used to compare last_predictions and new_predictions objects.

    0.6 nms_threshold float

    The non-maximum suppression threshold value used to filter the Florence2 predictions.

    0.3

    Returns:

    Type Description list[list[dict[str, Any]]]

    list[list[dict[str, Any]]]: A list where each item represents a frame's predictions. [[{ \"id\": 0, \"mask\": rle, \"label\": \"car\", \"bbox\": [0.1, 0.2, 0.3, 0.4] }]]

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__init__","title":"__init__(model_config=Florence2SAM2Config())","text":"

    Initializes the Florence2SAM2 object with a pre-trained Florence2 model and a SAM2 model.

    "},{"location":"florence2/","title":"Florence-2","text":"

    This example demonstrates using the Florence2 tool to interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.

    from PIL import Image\nfrom vision_agent_tools.shared_types import PromptTask\nfrom vision_agent_tools.models.florence2 import Florence2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Choose the task that you are planning to use\ntask_prompt = PromptTask.CAPTION\n\n# Load the image and initialize the Florence2 model\nimage = Image.open(test_image)\nmodel = Florence2()\n\n# Time to put Florence2 to work! Let's see what it finds...\nresults = model(images=[image], task=task_prompt)\n\n# Print the output result\nprint(f\"The image contains: {results[0]}\")\n
    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2","title":"Florence2","text":"

    Bases: BaseMLModel

    Florence2 model. It supports both zero-shot and fine-tuned settings. For zero-shot we use Florence-2-large; for fine-tuning we use Florence-2-base-ft. This model can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__call__","title":"__call__(task, prompt='', images=None, video=None, *, batch_size=5, nms_threshold=0.3, chunk_length_frames=None)","text":"

    Performs inference on the Florence-2 model based on the provided task, images or video, and prompt.

    Parameters:

    Name Type Description Default task PromptTask

    The task to be performed on the images or video.

    required prompt Optional[str]

    The text input that complements the prompt task.

    '' images list[Image] | None

    A list of images for the model to process. None if using video.

    None video VideoNumpy | None

    A NumPy representation of the video for inference. None if using images.

    None batch_size int

    The batch size used for processing multiple images or video frames.

    5 nms_threshold float

    The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).

    0.3 chunk_length_frames int | None

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    None

    Returns:

    Name Type Description Florence2ResponseType Florence2ResponseType

    The output of the Florence-2 model based on the task and prompt.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__init__","title":"__init__(model_config=Florence2Config())","text":"

    Initializes the Florence2 model.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.fine_tune","title":"fine_tune(checkpoint)","text":"

    Load the fine-tuned Florence-2 model.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.load_base","title":"load_base()","text":"

    Load the base Florence-2 model.

    "},{"location":"flux1/","title":"Flux1","text":"

    This example demonstrates using the Flux1 model to perform tasks such as image generation and mask inpainting based on text prompts.

    "},{"location":"flux1/#parameters","title":"Parameters","text":""},{"location":"flux1/#flux1config","title":"Flux1Config","text":"

    Below is an example of how to create and use a Flux1Config object:

    from vision_agent_tools.models.flux1 import Flux1Config\n\nconfig = Flux1Config(\n    height=512,\n    width=512,\n    num_inference_steps=28,\n    guidance_scale=3.5,\n    num_images_per_prompt=1,\n    max_sequence_length=512,\n    seed=42\n)\n
    "},{"location":"flux1/#perform-image-generation","title":"Perform image generation","text":"
    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# To perform image generation (config is the Flux1Config instance created above)\nflux1 = Flux1()\n\ngenerated_images = flux1(\n    task=Flux1Task.IMAGE_GENERATION,  # Image Generation Task\n    prompt=\"A purple car in a futuristic cityscape\",\n    config=config\n)\ngenerated_images[0].save(\"generated_car.png\")\n
    "},{"location":"flux1/#perform-mask-inpainting","title":"Perform mask inpainting","text":"

    To perform mask inpainting, both the original image and the mask image need to be provided. These images must have the same dimensions. The mask should clearly delineate the areas that you want to modify in the original image. Additionally, the inpainting process includes a strength parameter, which controls the intensity of the modifications applied to the masked areas.

    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have a cat image named \"cat_image.jpg\" that you want to use for mask inpainting\nimage_to_edit = Image.open(\"path/to/your/cat_image.jpg\").convert(\"RGB\")  # Image to inpaint\n\n# Make sure to provide a mask image with the same dimensions, delineating the cat\nmask_image = Image.open(\"path/to/your/mask.png\")  # Mask image indicating areas to change\n\n# Set a new prompt for inpainting\ninpainting_prompt = \"A cute dog\"\n\n# To perform image mask inpainting\nflux1 = Flux1()\n\ninpainted_images = flux1(\n    task=Flux1Task.MASK_INPAINTING,  # Image Mask Inpainting Task\n    prompt=inpainting_prompt,\n    image=image_to_edit,\n    mask_image=mask_image,\n    config=config\n)\n\ninpainted_images[0].save(\"inpainted_dog_over_cat.png\")\n
    "},{"location":"flux1/#perform-image-to-image-generation","title":"Perform image-to-image generation","text":"

    To perform image-to-image generation, you need to provide an original image along with a text prompt describing the desired modifications. The original image serves as the base, and the model will generate a new image based on the prompt.

    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have an original image named \"original_image.jpg\" that you want to use for image-to-image generation\noriginal_image = Image.open(\"path/to/your/original_image.jpg\").convert(\"RGB\")  # Original image\n\n# Set a new prompt for image-to-image generation\nimage_to_image_prompt = \"A sunny beach with palm trees\"\n\n# To perform image-to-image generation\nflux1 = Flux1()\n\ngenerated_images = flux1(\n    task=Flux1Task.IMAGE_TO_IMAGE,  # Image-to-Image Generation Task\n    prompt=image_to_image_prompt,\n    image=original_image,\n    config=config\n)\n\ngenerated_images[0].save(\"generated_beach.png\")\n
    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1","title":"Flux1","text":"

    Bases: BaseMLModel

    Tool for image generation using the pre-trained Flux1 model. This tool takes a prompt as input and generates an image using the Flux1 model.

    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__call__","title":"__call__(prompt=Field(max_length=512), task=Flux1Task.IMAGE_GENERATION, config=Flux1Config(), image=None, mask_image=None)","text":"

    Performs image generation, mask inpainting, or image-to-image generation using the Flux1 model.

    Parameters:

    Name Type Description Default - prompt (str

    The text prompt describing the desired modifications.

    required - task (Flux1Task

    The task to perform using the model: image generation (\"generation\"), mask inpainting (\"inpainting\"), or image-to-image generation (\"img2img\").

    required - config (Flux1Config required - image (Image.Image

    The original image to be modified.

    required - mask_image (Image.Image

    The mask image indicating areas to be inpainted.

    required

    Returns:

    Type Description List[Image] | None

    List[Image.Image]: The list of generated image(s) if successful; None if an error occurred.

    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__init__","title":"__init__(hf_model='black-forest-labs/FLUX.1-schnell', dtype=torch.bfloat16, enable_sequential_cpu_offload=True)","text":"

    Initializes the Flux1 image generation tool. Loads the pre-trained Flux1 model from HuggingFace and sets model configurations.

    Parameters:

    Name Type Description Default - task (Flux1Task

    The task to perform using the model: either image generation (\"generation\") or mask inpainting (\"inpainting\").

    required - model_config

    The configuration for the model, hf_model, and device.

    required - dtype (torch.dtype

    The data type to use for the model.

    required - enable_sequential_cpu_offload (bool

    Whether to enable sequential CPU offload.

    required"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1Config","title":"Flux1Config","text":"

    Bases: BaseModel

    Configuration for the Flux1 model.

    "},{"location":"internlm_xcomposer2/","title":"InternLM-XComposer-2.5","text":"

    This example demonstrates how to use the InternLM-XComposer-2.5 tool to answer questions about images or videos.

    NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.

    import cv2\nimport numpy as np\n\nfrom vision_agent_tools.models.internlm_xcomposer2 import InternLMXComposer2\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the InternLMXComposer2 model\nrun_inference = InternLMXComposer2()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put InternLMXComposer2 to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2","title":"InternLMXComposer2","text":"

    Bases: BaseMLModel

    InternLM-XComposer-2.5 is a tool that excels in various text-image comprehension and composition applications, achieving GPT-4V level capabilities.

    NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.

    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__call__","title":"__call__(prompt, image=None, video=None, frames=MAX_NUMBER_OF_FRAMES, chunk_length=None)","text":"

    InternLMXComposer2 model answers questions about a video or image.

    Parameters:

    Name Type Description Default prompt str

    The prompt with the question to be answered.

    required image Image | None

    The image to be analyzed.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None frames int

    The number of frames to be used from the video.

    MAX_NUMBER_OF_FRAMES chunk_length int

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    None

    Returns:

    Type Description list[str]

    list[str]: The answers to the prompt.

    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__init__","title":"__init__()","text":"

    Initializes the InternLMXComposer2.5 model.

    "},{"location":"nsfw_classification/","title":"NSFW (Not Safe for Work) classification","text":"

    This example demonstrates using the Not Safe for Work classification tool.

    from PIL import Image\nfrom vision_agent_tools.models.nsfw_classification import NSFWClassification\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the NSFW model.\nnsfw_classification = NSFWClassification()\n\n# Run the inference\nresults = nsfw_classification(image)\n\n# Let's print the predicted label\nprint(results.label)\n
    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification","title":"NSFWClassification","text":"

    Bases: BaseMLModel

    The primary intended use of this model is for the classification of NSFW (Not Safe for Work) images.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__call__","title":"__call__(image)","text":"

    Performs the NSFW inference on an image using the NSFWClassification model.

    Parameters:

    Name Type Description Default image Image

    The input image for object detection.

    required

    Returns:

    Name Type Description NSFWInferenceData NSFWInferenceData

    The inference result from the NSFWClassification model. label (str): The label for the unsafe content detected in the image. score (float): The score for the unsafe content detected in the image.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__init__","title":"__init__()","text":"

    Initializes the NSFW (Not Safe for Work) classification tool.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWInferenceData","title":"NSFWInferenceData","text":"

    Bases: BaseModel

    Represents an inference result from the NSFWClassification model.

    Attributes:

    Name Type Description label str

    The predicted label for the image.

    score float

    The confidence score associated with the prediction (between 0 and 1).

    "},{"location":"nshot_counting/","title":"LOCA (Low-shot Object Counting network with iterative prototype Adaptation).","text":"

    This example demonstrates how to use the NShot LOCA tool for object counting in images.

    from PIL import Image\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the counting model and choose the image output size you expect.\nobject_counting = NShotCounting(zero_shot=False, img_size=512)\n\n# Run the inference\nresults = object_counting(image, bbox=[12, 34, 56, 78])\n\n# Let's find out how many objects were found in total\nprint(f\"Found a total count of {results.count} objects on the image!\")\n
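    The counter also supports the documented zero_shot mode, where no exemplar bounding box is needed; a minimal sketch reusing the image loaded above:
    # Zero-shot counting: no bbox exemplar is passed\nzero_shot_counter = NShotCounting(zero_shot=True, img_size=512)\nzero_shot_results = zero_shot_counter(image)\nprint(f\"Zero-shot count: {zero_shot_results.count}\")\n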
    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.CountingDetection","title":"CountingDetection","text":"

    Bases: BaseModel

    Represents an inference result from the LOCA model.

    Attributes:

    Name Type Description count int

    The predicted number of detected objects.

    masks list[Any]

    A list of numpy arrays representing the masks of the detected objects in the image.

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting","title":"NShotCounting","text":"

    Bases: BaseMLModel

    Model for object counting using the zeroshot and n-shot versions of the LOCA model from the paper A Low-Shot Object Counting Network With Iterative Prototype Adaptation .

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__call__","title":"__call__(image, bbox=None)","text":"

    LOCA injects shape and appearance information into object queries to precisely count objects of various sizes in densely and sparsely populated scenarios. It also extends to a zeroshot scenario and achieves excellent localization and count errors across the entire low-shot spectrum.

    Parameters:

    Name Type Description Default image Image

    The input image for object detection.

    required bbox BoundingBox

    A list of four ints representing the bounding box coordinates (xmin, ymin, xmax, ymax) of the detected query in the image.

    None

    Returns:

    Name Type Description CountingDetection CountingDetection

    An object type containing: - The count of the objects found similar to the bbox query. - A list of numpy arrays representing the masks of the objects found.

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__init__","title":"__init__(zero_shot=True, img_size=512)","text":"

    Initializes the LOCA model.

    Parameters:

    Name Type Description Default img_size int

    Size of the input image.

    512"},{"location":"owlv2/","title":"OWLv2 Open-World Localization","text":"

    This example demonstrates using the Owlv2 tool for object detection in images based on text prompts.

    from PIL import Image\nfrom vision_agent_tools.models.owlv2 import Owlv2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# What are you looking for? Write your detective prompts here!\nprompts = [\"a photo of a cat\", \"a photo of a dog\"]\n\n# Load the image and create your Owlv2 detective tool\nimage = Image.open(test_image)\nowlv2 = Owlv2()\n\n# Time to put Owlv2 to work! Let's see what it finds...\nresults = owlv2(prompts=prompts, images=[image])[0]\n\n# Did Owlv2 sniff out any objects? Let's see the results!\nif results:\n    for detection in results:\n        print(f\"Found it! It looks like a {detection['label']} with a confidence of {detection['score']:.2f}.\")\n        print(f\"Here's where it's hiding: {detection['bbox']}\")\nelse:\n    print(\"Hmm, Owlv2 couldn't find anything this time. Maybe try a different prompt?\")\n
    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2","title":"Owlv2","text":"

    Bases: BaseMLModel

    Tool for object detection using the pre-trained Owlv2 model from Transformers.

    This tool takes images and a prompt as input, performs object detection using the Owlv2 model, and returns a list of objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding a threshold.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__call__","title":"__call__(prompts, images=None, video=None, *, batch_size=1, nms_threshold=0.3, confidence=0.1)","text":"

    Performs object detection on images using the Owlv2 model.

    Parameters:

    Name Type Description Default prompts list[str]

    The prompt to be used for object detection.

    required images list[Image] | None

    The images to be analyzed.

    None video VideoNumpy[uint8] | None

    A numpy array containing the different images, representing the video.

    None batch_size int

    The batch size used for processing multiple images or video frames.

    1 nms_threshold float

    The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).

    0.3 confidence float

    Confidence threshold for model predictions.

    0.1

    Returns:

    Type Description list[ODWithScoreResponse]

    list[ODWithScoreResponse]: A list of ODWithScoreResponse objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding the threshold. The item will be None if no objects are detected above the confidence threshold for a specific image/frame.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__init__","title":"__init__(model_config=OWLV2Config())","text":"

    Loads the pre-trained Owlv2 processor and model from Transformers.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS","title":"Owlv2ProcessorWithNMS","text":"

    Bases: Owlv2Processor

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS.post_process_object_detection_with_nms","title":"post_process_object_detection_with_nms(outputs, *, threshold=0.1, nms_threshold=0.3, target_sizes=None)","text":"

    Converts the raw output of [OwlViTForObjectDetection] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.

    Parameters:

    Name Type Description Default outputs OwlViTObjectDetectionOutput

    Raw outputs of the model.

    required threshold float

    Score threshold to keep object detection predictions.

    0.1 nms_threshold float

    IoU threshold to filter overlapping objects from the raw detections.

    0.3 target_sizes TensorType | list[Tuple] | None

    Tensor of shape (batch_size, 2) or list of tuples (Tuple[int, int]) containing the target size (height, width) of each image in the batch. If unset, predictions will not be resized.

    None

    Returns: list[dict]: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model.
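    A minimal sketch of how this post-processing could be wired into the standard transformers OWLv2 flow; the checkpoint name and the loading pattern are assumptions here rather than something this documentation prescribes:
    import torch\nfrom PIL import Image\nfrom transformers import Owlv2ForObjectDetection\nfrom vision_agent_tools.models.owlv2 import Owlv2ProcessorWithNMS\n\n# Assumed checkpoint name; any OWLv2 checkpoint should follow the same pattern\ncheckpoint = \"google/owlv2-base-patch16-ensemble\"\nprocessor = Owlv2ProcessorWithNMS.from_pretrained(checkpoint)\nmodel = Owlv2ForObjectDetection.from_pretrained(checkpoint)\n\nimage = Image.open(\"path/to/your/image.jpg\")\ninputs = processor(text=[[\"a photo of a cat\"]], images=image, return_tensors=\"pt\")\nwith torch.no_grad():\n    outputs = model(**inputs)\n\n# Convert the raw outputs into final boxes, applying NMS on top of the score threshold\ndetections = processor.post_process_object_detection_with_nms(\n    outputs,\n    threshold=0.1,\n    nms_threshold=0.3,\n    target_sizes=torch.tensor([image.size[::-1]]),  # (height, width)\n)\nprint(detections[0][\"scores\"], detections[0][\"labels\"], detections[0][\"boxes\"])\n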

    "},{"location":"qr_reader/","title":"QR Reader","text":"

    Tool for detecting QR codes in images.

    from PIL import Image, ImageDraw\n\nfrom vision_agent_tools.models.qr_reader import QRReader\n\n# Open the image containing the QR code\nimage = Image.open(\"sample_qr_image.jpeg\")\n\n# Create a QR code reader object\nqr_reader = QRReader()\n\n# Detect QR codes in the image\ndetections = qr_reader(image)\n\n\nif detections:\n\n    detection = detections[0]\n    draw = ImageDraw.Draw(image)\n\n    # Print the detected text\n    print(f\"Decoded Text: {detection.text}\")\n\n    # Draw the bounding box\n    x_min, y_min, x_max, y_max = (\n        int(detection.bbox[0]),\n        int(detection.bbox[1]),\n        int(detection.bbox[2]),\n        int(detection.bbox[3]),\n    )\n    draw.rectangle(((x_min, y_min), (x_max, y_max)), outline=\"red\", width=2)\n\n    # Draw the text on top of the image\n    draw.text((x_min + 10, y_min - 10), detection.text, fill=\"blue\", anchor=\"mm\")\n    image.show()\nelse:\n    print(\"No QR codes detected in the image.\")\n
    Displaying the Detection Result"},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRCodeDetection","title":"QRCodeDetection","text":"

    Bases: BaseModel

    Represents a detected QR code.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader","title":"QRReader","text":"

    Bases: BaseMLModel

    This tool utilizes the qreader library to detect QR codes within an input image. It returns a list of QRCodeDetection objects for each detected QR code, containing the decoded text, confidence score, polygon coordinates, bounding box, and center point.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader.__call__","title":"__call__(image)","text":"

    Detects QR codes in an image.

    Parameters:

    Name Type Description Default image Image

    The input image for QR code detection.

    required

    Returns:

    Type Description list[QRCodeDetection]

    list[QRCodeDetection]: A list of QRCodeDetection objects containing information about each detected QR code, or an empty list if none are found.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader.__init__","title":"__init__()","text":"

    Initializes the QR code reader tool.

    Loads the QReader instance for QR code detection.

    "},{"location":"qwen2_vl/","title":"Qwen2-VL","text":"

    This example demonstrates how to use the Qwen2-VL model to answer questions about images or videos.

    NOTE: The Qwen2-VL model should be used in GPU environments.

    import cv2\nimport numpy as np\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n    frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the Qwen2VL model\nrun_inference = Qwen2VL()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put Qwen2VL to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
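    Per the images parameter documented below, the same model can also answer questions about still images; a minimal sketch (the image path is a placeholder):
    from PIL import Image\n\nimage = Image.open(\"path/to/your/image.jpg\")\nanswer = run_inference(images=[image], prompt=\"Describe this image in detail\")\nprint(answer)\n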
    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL","title":"Qwen2VL","text":"

    Bases: BaseMLModel

    Qwen2-VL is a model that is capable of accurately identifying and comprehending the content within images, regardless of their clarity, resolution, or extreme aspect ratios.

    NOTE: The Qwen2-VL model should be used in GPU environments.

    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__call__","title":"__call__(prompt=None, images=None, video=None, frames=MAX_NUMBER_OF_FRAMES)","text":"

    Qwen2-VL model answers questions about a video or image.

    Parameters:

    Name Type Description Default prompt str

    The prompt with the question to be answered.

    None images list[Image]

    A list of images for the model to process. None if using video.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None frames int

    The number of frames to be used from the video.

    MAX_NUMBER_OF_FRAMES

    Returns:

    Type Description list[str]

    list[str]: The answers to the prompt.

    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__init__","title":"__init__(model_config=None)","text":"

    Initializes the Qwen2-VL model.

    "},{"location":"shared_model_manager/","title":"Shared Model Manager","text":"

    The SharedModelManager class is designed to manage and facilitate the use of machine learning models across different devices, such as CPUs and GPUs, within an asynchronous environment. It ensures safe and efficient execution of these models, particularly in scenarios where GPU resources need to be shared exclusively among multiple models. The manager coordinates access to the shared GPU, preventing conflicts when multiple models require it. Models are only loaded into memory when needed using the fetch_model function.

    The usage example demonstrates adding models and then using them with their respective functionalities.

    \u26a0\ufe0f \u2755: We should ALWAYS add model instances to the pool on CPU. This avoids overwhelming GPU memory; the model pool will automatically move a model to the GPU when it is fetched.

    from PIL import Image\n\n# NOTE: the import paths for OWLV2Config and Device are assumed here\nfrom vision_agent_tools.models.qr_reader import QRReader\nfrom vision_agent_tools.models.owlv2 import Owlv2, OWLV2Config\nfrom vision_agent_tools.shared_types import Device\nfrom vision_agent_tools.tools.shared_model_manager import SharedModelManager\n\nmodel_pool = SharedModelManager()\n\n# Add model instances to the pool\nmodel_pool.add(QRReader())\nmodel_pool.add(Owlv2(model_config=OWLV2Config(device=Device.CPU)))\n\n# Use QRReader model\nasync def use_qr_reader():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    qr_reader = await model_pool.fetch_model(QRReader.__name__)\n    detections = qr_reader(image)\n    # Process detections ...\n\n# Use Owlv2 model\nasync def use_owlv2():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    owlv2 = await model_pool.fetch_model(Owlv2.__name__)\n    prompts = [\"a photo of a cat\", \"a photo of a dog\"]\n    results = owlv2(prompts=prompts, images=[image])\n    # Process results ...\n
    "},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager","title":"SharedModelManager","text":""},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.add","title":"add(model)","text":"

    Adds a model to the pool with a device preference.

    Parameters:

    Name Type Description Default model Basetool

    The model instance to be added to the pool; it should implement the BaseTool interface.

    required device Device

    The preferred device for the model.

    required

    Returns:

    Name Type Description str str

    The model ID to be used for fetching the model.

    "},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.fetch_model","title":"fetch_model(model_id)","text":"

    Retrieves a model from the pool for safe execution.

    Parameters:

    Name Type Description Default model_id str

    Id to access the model in the pool.

    required

    Returns:

    Name Type Description Any BaseTool

    The retrieved model instance.

    "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Vision Agent Tools Documentation","text":"

    This repository contains tools that solve vision problems. These tools can be used in conjunction with the vision-agent.

    "},{"location":"clip_media_sim/","title":"CLIPMediaSim","text":""},{"location":"clip_media_sim/#video-similarity","title":"Video similarity","text":"
    import cv2\nfrom PIL import Image\n\nfrom vision_agent_tools.models.clip_media_sim import CLIPMediaSim\n\n# Path to your target image\nimage_path = \"path/to/your/image.jpg\"\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the image\ntarget_image = Image.open(image_path)\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nfps = cap.get(cv2.CAP_PROP_FPS)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Calculate video timestamps\nvideo_time = len(frames) / fps\n\n# Create the CLIPMediaSim instance\nclip_media_sim = CLIPMediaSim()\n\n# Run video similarity against the target image\nresults = clip_media_sim(video=frames, target_image=target_image)\n\n# The results should be a list of [index_of_frame, confidence_score] where the\n# video is similar to the target image.\n\n# To find the time at which a given frame happens, you can do the following\n\ntime_per_frame = video_time / len(frames)\n\ntimestamp = results[0][0] * time_per_frame\n\nprint(\"Similarity detection complete!\")\n

    You can also run similarity against a target text doing the following:

    results = clip_media_sim(video=frames, target_text=\"a turtle holding the earth\")\n
    "},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim","title":"CLIPMediaSim","text":"

    Bases: BaseMLModel

    A class that receives a video and a target image or text and returns the frames that are most similar to the target.

    "},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__call__","title":"__call__(video, target_image=None, target_text=None, thresh=0.3)","text":"

    Receives a video and a target image or text and returns the frames that are most similar to the target.

    Parameters:

    Name Type Description Default video ndarray

    The input video to be processed.

    required target_image Image | None

    The target image to compare the video frames with.

    None target_text str | None

    The target text to compare the video frames with.

    None thresh float

    The threshold to filter the results. Defaults to 0.3.

    0.3"},{"location":"clip_media_sim/#vision_agent_tools.models.clip_media_sim.CLIPMediaSim.__init__","title":"__init__(device='cuda')","text":"

    Initializes the CLIPMediaSim object with a pre-trained CLIP model.

    "},{"location":"controlnet_aux/","title":"Controlnet-Aux","text":""},{"location":"controlnet_aux/#pose-detector","title":"Pose Detector","text":"
    from PIL import Image\nfrom vision_agent_tools.models.controlnet_aux import Image2Pose\n\n# Path to your test image\ntest_image_path = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image_path)\n# Create the Image2Pose instance\nimage_2_pose = Image2Pose()\n\n# Run pose detection and get the results\nresults = image_2_pose(image)\n\n# Optional: Save the result image (assuming results is a PIL Image)\n# results.save(\"result.png\")\n\nprint(\"Pose detection complete!\")\n
    Pose Detection Result"},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose","title":"Image2Pose","text":"

    A class that simplifies human pose detection using a pre-trained Openpose model.

    This class provides a convenient way to run pose detection on images using a pre-trained Openpose model from the controlnet_aux library. It takes a PIL Image object as input and returns the predicted pose information.

    "},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__call__","title":"__call__(image)","text":"

    Performs pose detection on a PIL image and returns the results.

    This method takes a PIL Image object as input and runs the loaded Openpose detector on it. The predicted pose information is then resized to match the original image size and returned.

    Parameters:

    Name Type Description Default image Image

    The input image for pose detection.

    required

    Returns:

    Type Description Image

    PIL.Image: The image with the predicted pose information (format might vary depending on the specific OpenposeDetector implementation).

    "},{"location":"controlnet_aux/#vision_agent_tools.models.controlnet_aux.Image2Pose.__init__","title":"__init__()","text":"

    Initializes the Image2Pose object with a pre-trained Openpose detector.

    This method loads a pre-trained Openpose model from the specified model hub (\"lllyasviel/Annotators\" in this case). The loaded detector is stored as an attribute for future use.

    "},{"location":"depth_anything_v2/","title":"Depth-Anything-V2","text":"

    This example demonstrates using the Depth-Anything-V2 tool for depth estimation on images.

    from PIL import Image\nfrom vision_agent_tools.models.depth_anything_v2 import DepthAnythingV2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the depth map estimation model.\ndepth_estimate = DepthAnythingV2()\n\n# Run the inference\nresults = depth_estimate(image)\n\n# Let's print the obtained depth map\nprint(results.map)\n
    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2","title":"DepthAnythingV2","text":"

    Bases: BaseMLModel

    Model for depth estimation using the Depth-Anything-V2 model from the paper Depth Anything V2.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__call__","title":"__call__(image, grayscale=False)","text":"

    Depth-Anything-V2 is a highly practical solution for robust monocular depth estimation.

    Parameters:

    Name Type Description Default image Union[str, Image, ndarray]

    The input image for depth estimation. Can be a file path, a PIL Image, or a NumPy array.

    required grayscale bool

    Whether to return the depth map as a grayscale image. If True, the depth map will be normalized to the range [0, 255] and converted to uint8. Defaults to False.

    False

    Returns:

    Name Type Description DepthMap DepthMap

    An object type containing a numpy array with the HxW depth map of the image.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthAnythingV2.__init__","title":"__init__()","text":"

    Initializes the Depth-Anything-V2 model.

    "},{"location":"depth_anything_v2/#vision_agent_tools.models.depth_anything_v2.DepthMap","title":"DepthMap","text":"

    Bases: BaseModel

    Represents the depth map of an image.

    Attributes:

    Name Type Description map Any

    HxW raw depth map of the image.

    "},{"location":"florence2-qa/","title":"FlorenceQA","text":"

    This example demonstrates using the Florence2-QA tool to answer questions about images.

    NOTE: The FlorenceQA model can only be used in GPU environments.

    from PIL import Image\nfrom vision_agent_tools.models.florence2_qa import FlorenceQA\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image and initialize the FlorenceQA model\nimage = Image.open(test_image)\nrun_florence_qa = FlorenceQA()\n\n# Time to put FlorenceQA to work! Let's pose a question about the image\nanswer = run_florence_qa(image, question=\"Is there a dog in the image?\")\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA","title":"FlorenceQA","text":"

    Bases: BaseMLModel

    FlorenceQA is a tool that combines the Florence-2 and Roberta QA models to answer questions about images.

    NOTE: The Florence-2 model can only be used in GPU environments.

    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__call__","title":"__call__(image, question)","text":"

    FlorenceQA model answers questions about images.

    Parameters:

    Name Type Description Default image Image

    The image to be analyzed.

    required question str

    The question to be answered.

    required

    Returns:

    Name Type Description str dict[str, Any]

    The answer to the question.

    "},{"location":"florence2-qa/#vision_agent_tools.models.florence2_qa.FlorenceQA.__init__","title":"__init__()","text":"

    Initializes the FlorenceQA model.

    "},{"location":"florence2-sam2/","title":"Florence2Sam2","text":"

    This tool uses Florence2 and the SAM-2 model to perform text-prompted instance segmentation on image or video inputs.

    import cv2\n\nfrom vision_agent_tools.models.florence2_sam2 import Florence2SAM2\n\n\n# Path to your video\nvideo_path = \"path/to/your/video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Create the Florence2SAM2 instance\nflorence2_sam2 = Florence2SAM2()\n\n# Segment all the instances of the prompt \"ball\" for all video frames\nresults = florence2_sam2(prompt=\"ball\", video=frames)\n\n# Returns a list of lists where the outer list represents the frames and the inner\n# list contains all the predictions per frame. The annotation ID can be used\n# to track the same object across different frames. For example:\n# [\n#     [\n#         {\n#             \"id\": 0,\n#             \"mask\": rle,\n#             \"label\": \"ball\",\n#             \"bbox\": [x_min, y_min, x_max, y_max]\n#         }\n#     ],\n#     [\n#         {\n#             \"id\": 0,\n#             \"mask\": rle,\n#             \"label\": \"ball\",\n#             \"bbox\": [x_min, y_min, x_max, y_max]\n#         }\n#     ]\n# ]\n\nprint(\"Instance segmentation complete!\")\n

    You can also run it on images and additionally get bounding boxes, as follows:

    results = florence2_sam2(prompt=\"ball\", images=[image])\n
    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2","title":"Florence2SAM2","text":"

    Bases: BaseMLModel

    A class that receives a video or images plus a text prompt, and returns the instance segmentation for each frame based on the input.

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__call__","title":"__call__(prompt, images=None, video=None, *, chunk_length_frames=20, iou_threshold=0.6, nms_threshold=0.3)","text":"

    The Florence2Sam2 model finds objects in images and tracks objects in a video.

    Parameters:

    Name Type Description Default prompt str

    The text input that complements the media to find or track objects.

    required images list[Image] | None

    The images to be analyzed.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None chunk_length_frames int | None

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    20 iou_threshold float

    The IoU threshold value used to compare last_predictions and new_predictions objects.

    0.6 nms_threshold float

    The non-maximum suppression threshold value used to filter the Florence2 predictions.

    0.3

    Returns:

    Type Description list[list[dict[str, Any]]]

    list[list[dict[str, Any]]]: A list where each item represents a frame's predictions. [[{ \"id\": 0, \"mask\": rle, \"label\": \"car\", \"bbox\": [0.1, 0.2, 0.3, 0.4] }]]

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.__init__","title":"__init__(model_config=Florence2SAM2Config())","text":"

    Initializes the Florence2SAM2 object with a pre-trained Florence2 model and a SAM2 model.

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.fine_tune","title":"fine_tune(checkpoint)","text":"

    Load the fine-tuned Florence-2 model.

    "},{"location":"florence2-sam2/#vision_agent_tools.models.florence2_sam2.Florence2SAM2.load_base","title":"load_base()","text":"

    Load the base Florence-2 model.

    "},{"location":"florence2/","title":"Florence-2","text":"

    This example demonstrates using the Florence2 tool to interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.

    from PIL import Image\nfrom vision_agent_tools.shared_types import PromptTask\nfrom vision_agent_tools.models.florence2 import Florence2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Choose the task that you are planning to use\ntask_prompt = PromptTask.CAPTION\n\n# Load the image and initialize the Florence2 model\nimage = Image.open(test_image)\nmodel = Florence2()\n\n# Time to put Florence2 to work! Let's see what it finds...\nresults = model(images=[image], task=task_prompt)\n\n# Print the output result\nprint(f\"The image contains: {results[0]}\")\n
    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2","title":"Florence2","text":"

    Bases: BaseMLModel

    Florence2 model. It supports both zero-shot and fine-tuned settings. For zero-shot we use Florence-2-large; for fine-tuning we use Florence-2-base-ft. This model can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__call__","title":"__call__(task, prompt='', images=None, video=None, *, batch_size=5, nms_threshold=0.3, chunk_length_frames=None)","text":"

    Performs inference on the Florence-2 model based on the provided task, images or video, and prompt.

    Parameters:

    Name Type Description Default task PromptTask

    The task to be performed on the images or video.

    required prompt Optional[str]

    The text input that complements the prompt task.

    '' images list[Image] | None

    A list of images for the model to process. None if using video.

    None video VideoNumpy | None

    A NumPy representation of the video for inference. None if using images.

    None batch_size int

    The batch size used for processing multiple images or video frames.

    5 nms_threshold float

    The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).

    0.3 chunk_length_frames int | None

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    None

    Returns:

    Name Type Description Florence2ResponseType Florence2ResponseType

    The output of the Florence-2 model based on the task and prompt.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.__init__","title":"__init__(model_config=Florence2Config())","text":"

    Initializes the Florence2 model.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.fine_tune","title":"fine_tune(checkpoint)","text":"

    Load the fine-tuned Florence-2 model.

    "},{"location":"florence2/#vision_agent_tools.models.florence2.Florence2.load_base","title":"load_base()","text":"

    Load the base Florence-2 model.
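    Florence2 exposes the same fine_tune/load_base pair shown for Florence2SAM2 above; a short sketch reusing the variables from the Florence-2 example (the checkpoint path is a placeholder):
    model.fine_tune(\"path/to/your/florence2_checkpoint\")  # switch to the fine-tuned weights\nresults = model(images=[image], task=task_prompt)\nmodel.load_base()  # restore the base Florence-2 model\n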

    "},{"location":"flux1/","title":"Flux1","text":"

    This example demonstrates using the Flux1 model to perform tasks such as image generation and mask inpainting based on text prompts.

    "},{"location":"flux1/#parameters","title":"Parameters","text":""},{"location":"flux1/#flux1config","title":"Flux1Config","text":"

    Below is an example of how to create and use a Flux1Config object:

    from vision_agent_tools.models.flux1 import Flux1Config\n\nconfig = Flux1Config(\n    height=512,\n    width=512,\n    num_inference_steps=28,\n    guidance_scale=3.5,\n    num_images_per_prompt=1,\n    max_sequence_length=512,\n    seed=42\n)\n
    "},{"location":"flux1/#perform-image-generation","title":"Perform image generation","text":"
    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# To perform image generation\nflux1 = Flux1()\n\n# config is the Flux1Config instance created in the previous example\ngenerated_image = flux1(\n    task=Flux1Task.IMAGE_GENERATION,  # Image Generation Task\n    prompt=\"A purple car in a futuristic cityscape\",\n    config=config\n)\ngenerated_image.save(\"generated_car.png\")\n
    "},{"location":"flux1/#perform-mask-inpainting","title":"Perform mask inpainting","text":"

    To perform mask inpainting, both the original image and the mask image need to be provided. These images must have the same dimensions. The mask should clearly delineate the areas that you want to modify in the original image. Additionally, the inpainting process includes a strength parameter, which controls the intensity of the modifications applied to the masked areas.

    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have a cat image named \"cat_image.jpg\" that you want to use for mask inpainting\nimage_to_edit = Image.open(\"path/to/your/cat_image.jpg\").convert(\"RGB\")  # Image to inpaint\n\n# Make sure to provide a mask image with the same dimensions, delineating the cat\nmask_image = Image.open(\"path/to/your/mask.png\")  # Mask image indicating areas to change\n\n# Set a new prompt for inpainting\ninpainting_prompt = \"A cute dog\"\n\n# To perform image mask inpainting\nflux1 = Flux1()\n\n# config is the Flux1Config instance created in the earlier example\ninpainted_image = flux1(\n    task=Flux1Task.MASK_INPAINTING,  # Image Mask Inpainting Task\n    prompt=inpainting_prompt,\n    image=image_to_edit,\n    mask_image=mask_image,\n    config=config\n)\n\ninpainted_image.save(\"inpainted_dog_over_cat.png\")\n
    "},{"location":"flux1/#perform-image-to-image-generation","title":"Perform image-to-image generation","text":"

    To perform image-to-image generation, you need to provide an original image along with a text prompt describing the desired modifications. The original image serves as the base, and the model will generate a new image based on the prompt.

    import torch\nfrom PIL import Image\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Task\n\n# You have an original image named \"original_image.jpg\" that you want to use for image-to-image generation\noriginal_image = Image.open(\"path/to/your/original_image.jpg\").convert(\"RGB\")  # Original image\n\n# Set a new prompt for image-to-image generation\nimage_to_image_prompt = \"A sunny beach with palm trees\"\n\n# To perform image-to-image generation\nflux1 = Flux1()\n\n# config is the Flux1Config instance created in the earlier example\ngenerated_image = flux1(\n    task=Flux1Task.IMAGE_TO_IMAGE,  # Image-to-Image Generation Task\n    prompt=image_to_image_prompt,\n    image=original_image,\n    config=config\n)\n\ngenerated_image.save(\"generated_beach.png\")\n
    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1","title":"Flux1","text":"

    Bases: BaseMLModel

    Tool for image generation using the pre-trained Flux1 model. This tool takes a prompt as input and generates an image using the Flux1 model.

    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__call__","title":"__call__(prompt=Field(max_length=512), task=Flux1Task.IMAGE_GENERATION, config=Flux1Config(), image=None, mask_image=None)","text":"

    Performs image generation, mask inpainting, or image-to-image generation using the Flux1 model, depending on the selected task.

    Parameters:

    Name Type Description Default prompt str

    The text prompt describing the desired modifications.

    required task Flux1Task

    The task to perform using the model: image generation (\"generation\"), mask inpainting (\"inpainting\"), or image-to-image generation (\"img2img\").

    required config Flux1Config

    The configuration for the Flux1 model.

    required image Image.Image

    The original image to be modified.

    required mask_image Image.Image

    The mask image indicating areas to be inpainted.

    required

    Returns:

    Type Description List[Image] | None

    List[Image.Image]: The list of generated image(s) if successful; None if an error occurred.

    "},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1.__init__","title":"__init__(hf_model='black-forest-labs/FLUX.1-schnell', dtype=torch.bfloat16, enable_sequential_cpu_offload=True)","text":"

    Initializes the Flux1 image generation tool. Loads the pre-trained Flux1 model from HuggingFace and sets model configurations.

    Parameters:

    Name Type Description Default task Flux1Task

    The task to perform using the model: either image generation (\"generation\") or mask inpainting (\"inpainting\").

    required model_config

    The configuration for the model, hf_model, and device.

    required dtype torch.dtype

    The data type to use for the model.

    required enable_sequential_cpu_offload bool

    Whether to enable sequential CPU offload.

    required"},{"location":"flux1/#vision_agent_tools.models.flux1.Flux1Config","title":"Flux1Config","text":"

    Bases: BaseModel

    Configuration for the Flux1 model.
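
    Putting the constructor and the configuration together, here is a minimal sketch; the constructor arguments simply repeat the defaults shown in the __init__ signature above, and the config values mirror the earlier Flux1Config example.

    import torch\nfrom vision_agent_tools.models.flux1 import Flux1, Flux1Config, Flux1Task\n\n# Explicitly passing the documented defaults\nflux1 = Flux1(\n    hf_model=\"black-forest-labs/FLUX.1-schnell\",\n    dtype=torch.bfloat16,\n    enable_sequential_cpu_offload=True,\n)\n\nconfig = Flux1Config(height=512, width=512, num_inference_steps=28)\nresult = flux1(task=Flux1Task.IMAGE_GENERATION, prompt=\"A purple car\", config=config)\n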

    "},{"location":"internlm_xcomposer2/","title":"InternLM-XComposer-2.5","text":"

    This example demonstrates how to use the InternLM-XComposer-2.5 tool to answer questions about images or videos.

    NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.

    import cv2\nimport numpy as np\n\nfrom vision_agent_tools.models.internlm_xcomposer2 import InternLMXComposer2\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frames.append(frame)\ncap.release()\n\n# Stack the frames into a single numpy array (the model expects a VideoNumpy)\nframes = np.stack(frames, axis=0)\n\n# Initialize the InternLMXComposer2 model\nrun_inference = InternLMXComposer2()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put InternLMXComposer2 to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2","title":"InternLMXComposer2","text":"

    Bases: BaseMLModel

    InternLM-XComposer-2.5 is a tool that excels in various text-image comprehension and composition applications, achieving GPT-4V level capabilities.

    NOTE: The InternLM-XComposer-2.5 model should be used in GPU environments.

    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__call__","title":"__call__(prompt, image=None, video=None, frames=MAX_NUMBER_OF_FRAMES, chunk_length=None)","text":"

    InternLMXComposer2 model answers questions about a video or image.

    Parameters:

    Name Type Description Default prompt str

    The prompt with the question to be answered.

    required image Image | None

    The image to be analyzed.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None frames int

    The number of frames to be used from the video.

    MAX_NUMBER_OF_FRAMES chunk_length int

    The number of frames for each chunk of video to analyze. The last chunk may have fewer frames.

    None

    Returns:

    Type Description list[str]

    list[str]: The answers to the prompt.

    "},{"location":"internlm_xcomposer2/#vision_agent_tools.models.internlm_xcomposer2.InternLMXComposer2.__init__","title":"__init__()","text":"

    Initializes the InternLMXComposer2.5 model.

    "},{"location":"nsfw_classification/","title":"NSFW (Not Safe for Work) classification","text":"

    This example demonstrates using the Not Safe for Work classification tool.

    from PIL import Image\n\nfrom vision_agent_tools.models.nsfw_classification import NSFWClassification\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the NSFW model.\nnsfw_classification = NSFWClassification()\n\n# Run the inference\nresults = nsfw_classification(image)\n\n# Let's print the predicted label\nprint(results.label)\n
    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification","title":"NSFWClassification","text":"

    Bases: BaseMLModel

    The primary intended use of this model is for the classification of NSFW (Not Safe for Work) images.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__call__","title":"__call__(image)","text":"

    Performs the NSFW inference on an image using the NSFWClassification model.

    Parameters:

    Name Type Description Default image Image

    The input image for object detection.

    required

    Returns:

    Name Type Description NSFWInferenceData NSFWInferenceData

    The inference result from the NSFWClassification model. label (str): The label for the unsafe content detected in the image. score (float): The score for the unsafe content detected in the image.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWClassification.__init__","title":"__init__()","text":"

    Initializes the NSFW (Not Safe for Work) classification tool.

    "},{"location":"nsfw_classification/#vision_agent_tools.models.nsfw_classification.NSFWInferenceData","title":"NSFWInferenceData","text":"

    Bases: BaseModel

    Represents an inference result from the NSFWClassification model.

    Attributes:

    Name Type Description label str

    The predicted label for the image.

    score float

    The confidence score associated with the prediction (between 0 and 1).
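
    For completeness, a short sketch of reading both attributes from a result, mirroring the example above.

    from PIL import Image\nfrom vision_agent_tools.models.nsfw_classification import NSFWClassification\n\nimage = Image.open(\"path/to/your/image.jpg\")\nresult = NSFWClassification()(image)\n\n# label is the predicted class, score its confidence between 0 and 1\nprint(f\"label={result.label}, score={result.score:.2f}\")\n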

    "},{"location":"nshot_counting/","title":"LOCA (Low-shot Object Counting network with iterative prototype Adaptation).","text":"

    This example demonstrates how to use the NShot LOCA tool for object counting in images.

    from PIL import Image\n\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# Load the image\nimage = Image.open(test_image)\n# Initialize the counting model and choose the image output size you expect.\nobject_counting = NShotCounting(zero_shot=False, img_size=512)\n\n# Run the inference\nresults = object_counting(image, bbox=[12, 34, 56, 78])\n\n# Let's find out how many objects were found in total\nprint(f\"Found a total count of {results.count} objects in the image!\")\n
    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.CountingDetection","title":"CountingDetection","text":"

    Bases: BaseModel

    Represents an inference result from the LOCA model.

    Attributes:

    Name Type Description count int

    The predicted number of detected objects.

    masks list[Any]

    A list of numpy arrays representing the masks of the detected objects in the image.
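
    A short sketch of consuming a CountingDetection result in the zero-shot setting (no bbox query); the image path is a placeholder.

    from PIL import Image\nfrom vision_agent_tools.models.nshot_counting import NShotCounting\n\nimage = Image.open(\"path/to/your/image.jpg\")\ncounter = NShotCounting(zero_shot=True, img_size=512)\nresults = counter(image)\n\nprint(f\"count={results.count}\")\nprint(f\"number of masks={len(results.masks)}\")\n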

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting","title":"NShotCounting","text":"

    Bases: BaseMLModel

    Model for object counting using the zero-shot and n-shot versions of the LOCA model from the paper A Low-Shot Object Counting Network With Iterative Prototype Adaptation.

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__call__","title":"__call__(image, bbox=None)","text":"

    LOCA injects shape and appearance information into object queries to precisely count objects of various sizes in densely and sparsely populated scenarios. It also extends to a zero-shot scenario, achieving excellent localization and count errors across the entire low-shot spectrum.

    Parameters:

    Name Type Description Default image Image

    The input image for object detection.

    required bbox BoundingBox

    A list of four ints representing the bounding box coordinates (xmin, ymin, xmax, ymax) of the detected query in the image.

    None

    Returns:

    Name Type Description CountingDetection CountingDetection

    An object type containing: - The count of the objects found similar to the bbox query. - A list of numpy arrays representing the masks of the objects found.

    "},{"location":"nshot_counting/#vision_agent_tools.models.nshot_counting.NShotCounting.__init__","title":"__init__(zero_shot=True, img_size=512)","text":"

    Initializes the LOCA model.

    Parameters:

    Name Type Description Default img_size int

    Size of the input image.

    512"},{"location":"owlv2/","title":"OWLv2 Open-World Localization","text":"

    This example demonstrates using the Owlv2 tool for object detection in images based on text prompts.

    from PIL import Image\n\nfrom vision_agent_tools.models.owlv2 import Owlv2\n\n# (replace this path with your own!)\ntest_image = \"path/to/your/image.jpg\"\n\n# What are you looking for? Write your detective prompts here!\nprompts = [\"a photo of a cat\", \"a photo of a dog\"]\n\n# Load the image and create your Owlv2 detective tool\nimage = Image.open(test_image)\nowlv2 = Owlv2()\n\n# Time to put Owlv2 to work! Let's see what it finds...\nresults = owlv2(prompts=prompts, images=[image])[0]\n\n# Did Owlv2 sniff out any objects? Let's see the results!\nif results:\n    for detection in results:\n        print(f\"Found it! It looks like a {detection['label']} with a confidence of {detection['score']:.2f}.\")\n        print(f\"Here's where it's hiding: {detection['bbox']}\")\nelse:\n    print(\"Hmm, Owlv2 couldn't find anything this time. Maybe try a different prompt?\")\n
    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2","title":"Owlv2","text":"

    Bases: BaseMLModel

    Tool for object detection using the pre-trained Owlv2 model from Transformers.

    This tool takes images and a prompt as input, performs object detection using the Owlv2 model, and returns a list of objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding a threshold.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__call__","title":"__call__(prompts, images=None, video=None, *, batch_size=1, nms_threshold=0.3, confidence=0.1)","text":"

    Performs object detection on images using the Owlv2 model.

    Parameters:

    Name Type Description Default prompts list[str]

    The prompts to be used for object detection.

    required images list[Image] | None

    The images to be analyzed.

    None video VideoNumpy[uint8] | None

    A numpy array containing the different images, representing the video.

    None batch_size int

    The batch size used for processing multiple images or video frames.

    1 nms_threshold float

    The IoU threshold value used to apply a dummy agnostic Non-Maximum Suppression (NMS).

    0.3 confidence float

    Confidence threshold for model predictions.

    0.1

    Returns:

    Type Description list[ODWithScoreResponse]

    list[ODWithScoreResponse]: A list of ODWithScoreResponse objects containing the predicted labels, confidence scores, and bounding boxes for detected objects with confidence exceeding the threshold. The item will be None if no objects are detected above the confidence threshold for a specific image/frame.

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2.__init__","title":"__init__(model_config=OWLV2Config())","text":"

    Loads the pre-trained Owlv2 processor and model from Transformers.
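
    A minimal initialization sketch; passing an explicit OWLV2Config mirrors the Shared Model Manager example later in these docs, and the OWLV2Config and Device import paths are assumptions.

    from vision_agent_tools.models.owlv2 import Owlv2, OWLV2Config  # OWLV2Config path is an assumption\nfrom vision_agent_tools.shared_types import Device  # Device path is an assumption\n\n# Default configuration\nowlv2 = Owlv2()\n\n# Or pin the model to the CPU explicitly\nowlv2_cpu = Owlv2(model_config=OWLV2Config(device=Device.CPU))\n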

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS","title":"Owlv2ProcessorWithNMS","text":"

    Bases: Owlv2Processor

    "},{"location":"owlv2/#vision_agent_tools.models.owlv2.Owlv2ProcessorWithNMS.post_process_object_detection_with_nms","title":"post_process_object_detection_with_nms(outputs, *, threshold=0.1, nms_threshold=0.3, target_sizes=None)","text":"

    Converts the raw output of [OwlViTForObjectDetection] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.

    Parameters:

    Name Type Description Default outputs OwlViTObjectDetectionOutput

    Raw outputs of the model.

    required threshold float

    Score threshold to keep object detection predictions.

    0.1 nms_threshold float

    IoU threshold to filter overlapping objects from the raw detections.

    0.3 target_sizes TensorType | list[Tuple] | None

    Tensor of shape (batch_size, 2) or list of tuples (Tuple[int, int]) containing the target size (height, width) of each image in the batch. If unset, predictions will not be resized.

    None

    Returns: list[dict]: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model.
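
    Given the return shape described above, a downstream consumer might look like the following sketch; the results value is a hypothetical stand-in for the method's output.

    # Hypothetical output: one image with a single detection\nresults = [{\"scores\": [0.91], \"labels\": [0], \"boxes\": [[10.0, 20.0, 110.0, 220.0]]}]\n\nfor image_result in results:\n    for score, label, box in zip(\n        image_result[\"scores\"], image_result[\"labels\"], image_result[\"boxes\"]\n    ):\n        print(f\"label={label} score={score:.2f} box={box}\")\n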

    "},{"location":"qr_reader/","title":"QR Reader","text":"

    Tool for detecting QR codes in images.

    from PIL import Image, ImageDraw\n\nfrom vision_agent_tools.models.qr_reader import QRReader\n\n# Open the image containing the QR code\nimage = Image.open(\"sample_qr_image.jpeg\")\n\n# Create a QR code reader object\nqr_reader = QRReader()\n\n# Detect QR codes in the image\ndetections = qr_reader(image)\n\n\nif detections:\n\n    detection = detections[0]\n    draw = ImageDraw.Draw(image)\n\n    # Print the detected text\n    print(f\"Decoded Text: {detection.text}\")\n\n    # Draw the bounding box\n    x_min, y_min, x_max, y_max = (\n        int(detection.bbox[0]),\n        int(detection.bbox[1]),\n        int(detection.bbox[2]),\n        int(detection.bbox[3]),\n    )\n    draw.rectangle(((x_min, y_min), (x_max, y_max)), outline=\"red\", width=2)\n\n    # Draw the text on top of the image\n    draw.text((x_min + 10, y_min - 10), detection.text, fill=\"blue\", anchor=\"mm\")\n    image.show()\nelse:\n    print(\"No QR codes detected in the image.\")\n
    Displaying the Detection Result"},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRCodeDetection","title":"QRCodeDetection","text":"

    Bases: BaseModel

    Represents a detected QR code.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader","title":"QRReader","text":"

    Bases: BaseMLModel

    This tool utilizes the qreader library to detect QR codes within an input image. It returns a list of QRCodeDetection objects for each detected QR code, containing the decoded text, confidence score, polygon coordinates, bounding box, and center point.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader.__call__","title":"__call__(image)","text":"

    Detects QR codes in an image.

    Parameters:

    Name Type Description Default image Image

    The input image for QR code detection.

    required

    Returns:

    Type Description list[QRCodeDetection]

    list[QRCodeDetection]: A list of QRCodeDetection objects containing information about each detected QR code, or an empty list if none are found.

    "},{"location":"qr_reader/#vision_agent_tools.models.qr_reader.QRReader.__init__","title":"__init__()","text":"

    Initializes the QR code reader tool.

    Loads the QReader instance for QR code detection.

    "},{"location":"qwen2_vl/","title":"Qwen2-VL","text":"

    This example demonstrates how to use the Qwen2-VL model to answer questions about images or videos.

    NOTE: The Qwen2-VL model should be used in GPU environments.

    import cv2\nimport numpy as np\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\n# (replace this path with your own!)\nvideo_path = \"path/to/your/my_video.mp4\"\n\n# Load the video into frames\ncap = cv2.VideoCapture(video_path)\nframes = []\nwhile cap.isOpened():\n    ret, frame = cap.read()\n    if not ret:\n        break\n    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n    frames.append(frame)\ncap.release()\nframes = np.stack(frames, axis=0)\n\n# Initialize the Qwen2VL model\nrun_inference = Qwen2VL()\nprompt = \"Here are some frames of a video. Describe this video in detail\"\n# Time to put Qwen2VL to work!\nanswer = run_inference(video=frames, prompt=prompt)\n\n# Print the output answer\nprint(answer)\n
    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL","title":"Qwen2VL","text":"

    Bases: BaseMLModel

    Qwen2-VL is a model that is capable of accurately identifying and comprehending the content within images, regardless of their clarity, resolution, or extreme aspect ratios.

    NOTE: The Qwen2-VL model should be used in GPU environments.

    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__call__","title":"__call__(prompt=None, images=None, video=None, frames=MAX_NUMBER_OF_FRAMES)","text":"

    Qwen2-VL model answers questions about a video or image.

    Parameters:

    Name Type Description Default prompt str

    The prompt with the question to be answered.

    None images list[Image]

    A list of images for the model to process. None if using video.

    None video VideoNumpy | None

    A numpy array containing the different images, representing the video.

    None frames int

    The number of frames to be used from the video.

    MAX_NUMBER_OF_FRAMES

    Returns:

    Type Description list[str]

    list[str]: The answers to the prompt.
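
    Besides video, the same call accepts a list of images; here is a minimal sketch with a placeholder image path.

    from PIL import Image\nfrom vision_agent_tools.models.qwen2_vl import Qwen2VL\n\nimage = Image.open(\"path/to/your/image.jpg\")\n\nrun_inference = Qwen2VL()\nanswer = run_inference(images=[image], prompt=\"Describe this image in detail\")\nprint(answer)\n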

    "},{"location":"qwen2_vl/#vision_agent_tools.models.qwen2_vl.Qwen2VL.__init__","title":"__init__(model_config=None)","text":"

    Initializes the Qwen2-VL model.

    "},{"location":"shared_model_manager/","title":"Shared Model Manager","text":"

    The SharedModelManager class is designed to manage and facilitate the use of machine learning models across different devices, such as CPUs and GPUs, within an asynchronous environment. It ensures safe and efficient execution of these models, particularly in scenarios where GPU resources need to be shared exclusively among multiple models. The manager coordinates access to the shared GPU, preventing conflicts when multiple models require it. Models are only loaded into memory when needed using the fetch_model function.

    The usage example demonstrates adding models and then using them with their respective functionalities.

    \u26a0\ufe0f \u2755: We should ALWAYS add model instances to the pool on CPU. This avoids overwhelming GPU memory; the model pool will automatically move a model to the GPU when it is fetched.

    from PIL import Image\n\nfrom vision_agent_tools.models.owlv2 import Owlv2, OWLV2Config\nfrom vision_agent_tools.models.qr_reader import QRReader\nfrom vision_agent_tools.shared_types import Device  # adjust if Device lives elsewhere in your installation\nfrom vision_agent_tools.tools.shared_model_manager import SharedModelManager\n\nmodel_pool = SharedModelManager()\n\n# Add model instances to the pool\nmodel_pool.add(QRReader())\nmodel_pool.add(Owlv2(model_config=OWLV2Config(device=Device.CPU)))\n\n# Use QRReader model\nasync def use_qr_reader():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    qr_reader = await model_pool.fetch_model(QRReader.__name__)\n    detections = qr_reader(image)\n    # Process detections ...\n\n# Use Owlv2 model\nasync def use_owlv2():\n    # Read image\n    image = Image.open(\"path/to/your/image.jpg\")\n\n    owlv2 = await model_pool.fetch_model(Owlv2.__name__)\n    prompts = [\"a photo of a cat\", \"a photo of a dog\"]\n    results = owlv2(prompts=prompts, images=[image])\n    # Process results ...\n
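
    The coroutines above still need an event loop to run; one way to drive them, assuming the functions are defined exactly as in the example, is shown below.

    import asyncio\n\nasync def main():\n    await use_qr_reader()\n    await use_owlv2()\n\nasyncio.run(main())\n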
    "},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager","title":"SharedModelManager","text":""},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.add","title":"add(model)","text":"

    Adds a model to the pool with a device preference.

    Parameters:

    Name Type Description Default model BaseTool

    The model instance to be added to the pool; it should implement the BaseTool interface.

    required device Device

    The preferred device for the model.

    required

    Returns:

    Name Type Description str str

    The model ID to be used for fetching the model.
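
    Since add returns the model ID, that ID can be passed straight to fetch_model instead of hard-coding the class name; a small sketch under that assumption.

    import asyncio\n\nfrom vision_agent_tools.models.qr_reader import QRReader\nfrom vision_agent_tools.tools.shared_model_manager import SharedModelManager\n\nmodel_pool = SharedModelManager()\nmodel_id = model_pool.add(QRReader())\n\nasync def run():\n    qr_reader = await model_pool.fetch_model(model_id)\n    # ...use qr_reader as in the example above...\n\nasyncio.run(run())\n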

    "},{"location":"shared_model_manager/#vision_agent_tools.tools.shared_model_manager.SharedModelManager.fetch_model","title":"fetch_model(model_id)","text":"

    Retrieves a model from the pool for safe execution.

    Parameters:

    Name Type Description Default model_id str

    Id to access the model in the pool.

    required

    Returns:

    Name Type Description Any BaseTool

    The retrieved model instance.

    "}]} \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 23098322..e74d7be6 100644 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ