From 70c286550d672de2307d6c7a05373e361737e45f Mon Sep 17 00:00:00 2001
From: Thomas Berends <thomasberends@hotmail.com>
Date: Sat, 2 Aug 2025 13:51:10 +0200
Subject: [PATCH 1/2] docs: update detection core with tips for using Gemini
 integration

---
 supervision/detection/core.py | 50 +++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/supervision/detection/core.py b/supervision/detection/core.py
index ffe5ed3fc..34c446847 100644
--- a/supervision/detection/core.py
+++ b/supervision/detection/core.py
@@ -946,6 +946,26 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio
             ```
 
         !!! example "Gemini 2.0"
+
+            ??? tip "Prompt engineering"
+
+                From Gemini 2.0 onwards, models are further trained to detect objects in 
+                an image and get their bounding box coordinates. The coordinates, 
+                relative to image dimensions, scale to [0, 1000]. You need to descale 
+                these coordinates based on your original image size.
+
+                According to the Gemini API documentation on image prompts, when using
+                a single image with text, the recommended approach is to place the text
+                prompt after the image part in the `contents` array. This ordering has
+                been shown to produce significantly better results in practice.
+
+                To get the best results from Google Gemini 2.0, use the following prompt.
+
+                ```
+                Detect all the cats and dogs in the image. The box_2d should be 
+                [ymin, xmin, ymax, xmax] normalized to 0-1000.
+                ```
+
             ```python
             import supervision as sv
 
@@ -983,6 +1003,11 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio
                 including small, distant, or partially visible ones, and to return
                 tight bounding boxes.
 
+                According to the Gemini API documentation on image prompts, when using
+                a single image with text, the recommended approach is to place the text
+                prompt after the image part in the `contents` array. This ordering has
+                been shown to produce significantly better results in practice.
+
                 ```
                 Carefully examine this image and detect ALL visible objects, including
                 small, distant, or partially visible ones.
@@ -1323,6 +1348,26 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
             ```
 
         !!! example "Gemini 2.0"
+        
+            ??? tip "Prompt engineering"
+
+                From Gemini 2.0 onwards, models are further trained to detect objects in 
+                an image and get their bounding box coordinates. The coordinates, 
+                relative to image dimensions, scale to [0, 1000]. You need to descale 
+                these coordinates based on your original image size.
+
+                According to the Gemini API documentation on image prompts, when using
+                a single image with text, the recommended approach is to place the text
+                prompt after the image part in the `contents` array. This ordering has
+                been shown to produce significantly better results in practice.
+
+                To get the best results from Google Gemini 2.0, use the following prompt.
+
+                ```
+                Detect all the cats and dogs in the image. The box_2d should be 
+                [ymin, xmin, ymax, xmax] normalized to 0-1000.
+                ```
+
             ```python
             import supervision as sv
 
@@ -1360,6 +1405,11 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
                 including small, distant, or partially visible ones, and to return
                 tight bounding boxes.
 
+                According to the Gemini API documentation on image prompts, when using
+                a single image with text, the recommended approach is to place the text
+                prompt after the image part in the `contents` array. This ordering has
+                been shown to produce significantly better results in practice.
+
                 ```
                 Carefully examine this image and detect ALL visible objects, including
                 small, distant, or partially visible ones.

From f6c34af34f04f50359808fac08aebecb2903d337 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 2 Aug 2025 11:57:32 +0000
Subject: [PATCH 2/2] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?=
 =?UTF-8?q?format=20pre-commit=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 supervision/detection/core.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/supervision/detection/core.py b/supervision/detection/core.py
index 34c446847..a6b551508 100644
--- a/supervision/detection/core.py
+++ b/supervision/detection/core.py
@@ -949,9 +949,9 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio
 
             ??? tip "Prompt engineering"
 
-                From Gemini 2.0 onwards, models are further trained to detect objects in 
-                an image and get their bounding box coordinates. The coordinates, 
-                relative to image dimensions, scale to [0, 1000]. You need to descale 
+                From Gemini 2.0 onwards, models are further trained to detect objects in
+                an image and return their bounding box coordinates. The coordinates are
+                normalized to [0, 1000] relative to image dimensions. You must rescale
                 these coordinates based on your original image size.
 
                 According to the Gemini API documentation on image prompts, when using
@@ -962,7 +962,7 @@ def from_lmm(cls, lmm: LMM | str, result: str | dict, **kwargs: Any) -> Detectio
                 To get the best results from Google Gemini 2.0, use the following prompt.
 
                 ```
-                Detect all the cats and dogs in the image. The box_2d should be 
+                Detect all the cats and dogs in the image. The box_2d should be
                 [ymin, xmin, ymax, xmax] normalized to 0-1000.
                 ```
 
@@ -1348,12 +1348,12 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
             ```
 
         !!! example "Gemini 2.0"
-        
+
             ??? tip "Prompt engineering"
 
-                From Gemini 2.0 onwards, models are further trained to detect objects in 
-                an image and get their bounding box coordinates. The coordinates, 
-                relative to image dimensions, scale to [0, 1000]. You need to descale 
+                From Gemini 2.0 onwards, models are further trained to detect objects in
+                an image and return their bounding box coordinates. The coordinates are
+                normalized to [0, 1000] relative to image dimensions. You must rescale
                 these coordinates based on your original image size.
 
                 According to the Gemini API documentation on image prompts, when using
@@ -1364,7 +1364,7 @@ def from_vlm(cls, vlm: VLM | str, result: str | dict, **kwargs: Any) -> Detectio
                 To get the best results from Google Gemini 2.0, use the following prompt.
 
                 ```
-                Detect all the cats and dogs in the image. The box_2d should be 
+                Detect all the cats and dogs in the image. The box_2d should be
                 [ymin, xmin, ymax, xmax] normalized to 0-1000.
                 ```