diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py
index afdd683e2312..e7c30d83f56d 100644
--- a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py
+++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py
@@ -103,7 +103,8 @@ def get_optimal_tiled_canvas(
     # Pick the resolution that required the least upscaling so that it most closely fits the image
     required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
     best_grid = possible_resolutions[np.argmin(required_scale)]
-    return best_grid
+    best_grid_row, best_grid_col = best_grid
+    return best_grid_col, best_grid_row  # reverse the order to align with the boilerplate cropping code
 
 
 @auto_docstring
diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
index 5f417075a931..a9a2311b1b05 100644
--- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
@@ -305,7 +305,8 @@ def get_optimal_tiled_canvas(
     # Pick the resolution that required the least upscaling so that it most closely fits the image
     required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
     best_grid = possible_resolutions[np.argmin(required_scale)]
-    return best_grid
+    best_grid_row, best_grid_col = best_grid
+    return best_grid_col, best_grid_row  # reverse the order to align with the boilerplate cropping code
 
 
 class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs, total=False):
diff --git a/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py b/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py
index 81a16ba39c14..c2b0c89a9a44 100644
--- a/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py
+++ b/tests/models/cohere2_vision/test_image_processing_cohere2_vision.py
@@ -190,3 +190,128 @@ def test_call_numpy_4_channels(self):
             image_std=(1.0, 1.0, 1.0, 1.0),
         ).pixel_values
         self.assertEqual(tuple(encoded_images.shape), (70, 4, 30, 30))
+
+    def test_crop_to_patches_aspect_ratio(self):
+        """Test that row/column ordering is correct when cropping non-square images to patches.
+
+        This test verifies that patches can be stitched back to reconstruct the original image,
+        which validates that the row/column ordering in get_optimal_tiled_canvas is correct.
+        If rows and columns were swapped, the image would be resized to the wrong dimensions
+        and the patches would not match the original content.
+        """
+        for image_processing_class in self.image_processor_list:
+            patch_size = 64
+            image_processor = image_processing_class(
+                do_resize=True,
+                size={"height": patch_size, "width": patch_size},
+                do_normalize=False,  # Disable normalization to preserve pixel values
+                do_rescale=False,  # Disable rescaling to preserve pixel values
+                crop_to_patches=True,
+                min_patches=1,
+                max_patches=6,  # Allow up to 6 patches to test asymmetric grids like 2x3
+            )
+
+            # Create a 2:3 aspect ratio image (2 rows x 3 columns of patches).
+            # This asymmetric grid will fail if rows and columns are swapped.
+            num_rows, num_cols = 2, 3
+            image_height = patch_size * num_rows  # 128
+            image_width = patch_size * num_cols  # 192
+
+            # Create an image with a unique color for each patch position
+            test_image = Image.new("RGB", (image_width, image_height))
+            for row in range(num_rows):
+                for col in range(num_cols):
+                    patch_idx = row * num_cols + col  # 0-5
+                    color = (patch_idx * 40 + 20, 0, 0)  # Unique red values: 20, 60, 100, 140, 180, 220
+                    for y in range(patch_size):
+                        for x in range(patch_size):
+                            test_image.putpixel(
+                                (col * patch_size + x, row * patch_size + y),
+                                color,
+                            )
+
+            # Process the image
+            result = image_processor(test_image, return_tensors="pt")
+            patches = result.pixel_values
+            num_patches_result = result.num_patches
+
+            # Should produce 7 patches (6 grid patches + 1 thumbnail)
+            self.assertEqual(num_patches_result.tolist(), [7])
+            self.assertEqual(tuple(patches.shape), (7, 3, patch_size, patch_size))
+
+            # Verify each patch has the correct color (excluding the thumbnail, which comes last).
+            # Patches should be ordered row by row: (0,0), (0,1), (0,2), (1,0), (1,1), (1,2)
+            for patch_idx in range(6):
+                expected_red = patch_idx * 40 + 20
+                actual_red = patches[patch_idx, 0, 0, 0].item()  # Red channel, top-left pixel
+                self.assertEqual(
+                    actual_red,
+                    expected_red,
+                    f"Patch {patch_idx} has the wrong color. Expected red={expected_red}, got {actual_red}. "
+                    f"This indicates the row/column ordering is incorrect.",
+                )
+
+            # Stitch the patches back together and verify against the original
+            stitched = torch.zeros(3, image_height, image_width)
+            for patch_idx in range(6):
+                row = patch_idx // num_cols
+                col = patch_idx % num_cols
+                stitched[
+                    :,
+                    row * patch_size : (row + 1) * patch_size,
+                    col * patch_size : (col + 1) * patch_size,
+                ] = patches[patch_idx]
+
+            original_tensor = torch.tensor(np.array(test_image)).permute(2, 0, 1).float()
+            self.assertTrue(
+                torch.allclose(stitched, original_tensor),
+                "Patches do not stitch back to the original image - row/column ordering may be wrong",
+            )
+
+    def test_get_number_of_image_patches_aspect_ratio(self):
+        """Test that get_number_of_image_patches returns the correct count for non-square images.
+
+        This directly tests the row/column unpacking fix by verifying that patch counts match
+        the expected grid layout. If rows and columns were swapped, the wrong grid would be
+        chosen for asymmetric images.
+        """
+        for image_processing_class in self.image_processor_list:
+            patch_size = 64
+            image_processor = image_processing_class(
+                size={"height": patch_size, "width": patch_size},
+                crop_to_patches=True,
+                min_patches=1,
+                max_patches=12,
+            )
+
+            # Test 1: Tall image (4 rows x 1 column) should give 5 patches (4 + thumbnail)
+            tall_patches = image_processor.get_number_of_image_patches(
+                height=patch_size * 4,  # 256
+                width=patch_size,  # 64
+                images_kwargs={},
+            )
+            self.assertEqual(tall_patches, 5, "Tall image (4:1) should produce 5 patches")
+
+            # Test 2: Wide image (1 row x 4 columns) should give 5 patches (4 + thumbnail)
+            wide_patches = image_processor.get_number_of_image_patches(
+                height=patch_size,  # 64
+                width=patch_size * 4,  # 256
+                images_kwargs={},
+            )
+            self.assertEqual(wide_patches, 5, "Wide image (1:4) should produce 5 patches")
+
+            # Test 3: Asymmetric image (2 rows x 3 columns) should give 7 patches
+            asym_patches = image_processor.get_number_of_image_patches(
+                height=patch_size * 2,  # 128
+                width=patch_size * 3,  # 192
+                images_kwargs={"max_patches": 6},
+            )
+            self.assertEqual(asym_patches, 7, "Asymmetric image (2:3) should produce 7 patches")
+
+            # Test 4: The opposite asymmetric image (3 rows x 2 columns) should also give 7 patches
+            asym_patches2 = image_processor.get_number_of_image_patches(
+                height=patch_size * 3,  # 192
+                width=patch_size * 2,  # 128
+                images_kwargs={"max_patches": 6},
+            )
+            self.assertEqual(asym_patches2, 7, "Asymmetric image (3:2) should produce 7 patches")