From 70c286550d672de2307d6c7a05373e361737e45f Mon Sep 17 00:00:00 2001 From: Thomas Berends Date: Sat, 2 Aug 2025 13:51:10 +0200 Subject: [PATCH 1/2] docs: update detection core with tips for using Gemini integration --- supervision/detection/core.py | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index ffe5ed3fc..34c446847 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -946,6 +946,26 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio ``` !!! example "Gemini 2.0" + + ??? tip "Prompt engineering" + + From Gemini 2.0 onwards, models are further trained to detect objects in + an image and get their bounding box coordinates. The coordinates, + relative to image dimensions, scale to [0, 1000]. You need to descale + these coordinates based on your original image size. + + According to the Gemini API documentation on image prompts, when using + a single image with text, the recommended approach is to place the text + prompt after the image part in the contents array. This ordering has + been shown to produce significantly better results in practice. + + To get the best results from Google Gemini 2.0, use the following prompt. + + ``` + Detect all the cats and dogs in the image. The box_2d should be + [ymin, xmin, ymax, xmax] normalized to 0-1000. + ``` + ```python import supervision as sv @@ -983,6 +1003,11 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio including small, distant, or partially visible ones, and to return tight bounding boxes. + According to the Gemini API documentation on image prompts, when using + a single image with text, the recommended approach is to place the text + prompt after the image part in the contents array. This ordering has + been shown to produce significantly better results in practice. + ``` Carefully examine this image and detect ALL visible objects, including small, distant, or partially visible ones. @@ -1323,6 +1348,26 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio ``` !!! example "Gemini 2.0" + + ??? tip "Prompt engineering" + + From Gemini 2.0 onwards, models are further trained to detect objects in + an image and get their bounding box coordinates. The coordinates, + relative to image dimensions, scale to [0, 1000]. You need to descale + these coordinates based on your original image size. + + According to the Gemini API documentation on image prompts, when using + a single image with text, the recommended approach is to place the text + prompt after the image part in the contents array. This ordering has + been shown to produce significantly better results in practice. + + To get the best results from Google Gemini 2.0, use the following prompt. + + ``` + Detect all the cats and dogs in the image. The box_2d should be + [ymin, xmin, ymax, xmax] normalized to 0-1000. + ``` + ```python import supervision as sv @@ -1360,6 +1405,11 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio including small, distant, or partially visible ones, and to return tight bounding boxes. + According to the Gemini API documentation on image prompts, when using + a single image with text, the recommended approach is to place the text + prompt after the image part in the contents array. This ordering has + been shown to produce significantly better results in practice. + ``` Carefully examine this image and detect ALL visible objects, including small, distant, or partially visible ones. From f6c34af34f04f50359808fac08aebecb2903d337 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 2 Aug 2025 11:57:32 +0000 Subject: [PATCH 2/2] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- supervision/detection/core.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/supervision/detection/core.py b/supervision/detection/core.py index 34c446847..a6b551508 100644 --- a/supervision/detection/core.py +++ b/supervision/detection/core.py @@ -949,9 +949,9 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio ??? tip "Prompt engineering" - From Gemini 2.0 onwards, models are further trained to detect objects in - an image and get their bounding box coordinates. The coordinates, - relative to image dimensions, scale to [0, 1000]. You need to descale + From Gemini 2.0 onwards, models are further trained to detect objects in + an image and get their bounding box coordinates. The coordinates, + relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. According to the Gemini API documentation on image prompts, when using @@ -962,7 +962,7 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio To get the best results from Google Gemini 2.0, use the following prompt. ``` - Detect all the cats and dogs in the image. The box_2d should be + Detect all the cats and dogs in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000. ``` @@ -1348,12 +1348,12 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio ``` !!! example "Gemini 2.0" - + ??? tip "Prompt engineering" - From Gemini 2.0 onwards, models are further trained to detect objects in - an image and get their bounding box coordinates. The coordinates, - relative to image dimensions, scale to [0, 1000]. You need to descale + From Gemini 2.0 onwards, models are further trained to detect objects in + an image and get their bounding box coordinates. The coordinates, + relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. According to the Gemini API documentation on image prompts, when using @@ -1364,7 +1364,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio To get the best results from Google Gemini 2.0, use the following prompt. ``` - Detect all the cats and dogs in the image. The box_2d should be + Detect all the cats and dogs in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000. ```