diff --git a/tests/models/internvl/test_modeling_internvl.py b/tests/models/internvl/test_modeling_internvl.py index d4a31f5951e2..9f51aa3fc325 100644 --- a/tests/models/internvl/test_modeling_internvl.py +++ b/tests/models/internvl/test_modeling_internvl.py @@ -430,7 +430,14 @@ def test_qwen2_small_model_integration_batched_generate_multi_image(self): # Check first output decoded_output = processor.decode(output[0], skip_special_tokens=True) # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232 - expected_output = "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." # fmt: skip + expected_outputs = Expectations( + { + ("xpu", 3): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature\'s peace.', + ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature\'s peace.', + ("rocm", (9, 4)): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature\'s embrace.', + } + ) # fmt: skip + expected_output = expected_outputs.get_expectation() self.assertEqual( decoded_output, expected_output, @@ -443,6 +450,7 @@ def test_qwen2_small_model_integration_batched_generate_multi_image(self): { ("xpu", 3): "user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the differences:\n\n1. **Foreground", ("cuda", 7): "user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the differences:\n\n1. **Foreground", + ("rocm", (9, 4)): "user\n\nWhat are the differences between these two images?\nassistant\nThe images show the Statue of Liberty and the Golden Gate Bridge from different angles. Here are the main differences:\n\n1. **", } ) # fmt: skip expected_output = expected_outputs.get_expectation() @@ -567,6 +575,7 @@ def test_qwen2_small_model_integration_interleaved_images_videos(self): { ("xpu", 3): "user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an", ("cuda", 7): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - The Statue of Liberty is prominently featured on an', + ("rocm", (9, 4)): 'user\n\n\nWhat are the differences between these two images?\nassistant\nThe images depict two distinct scenes:\n\n1. **Left Image:**\n - This image features the Statue of Liberty on Liberty', } ) # fmt: skip expected_output = expected_outputs.get_expectation() @@ -582,6 +591,7 @@ def test_qwen2_small_model_integration_interleaved_images_videos(self): { ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot", ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot', + ("rocm", (9, 4)): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot', } ) # fmt: skip expected_output = expected_outputs.get_expectation() @@ -593,9 +603,14 @@ def test_qwen2_small_model_integration_interleaved_images_videos(self): # Check third output decoded_output = processor.decode(output[2], skip_special_tokens=True) - expected_output = ( - "user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature's peace." - ) + expected_outputs = Expectations( + { + ("xpu", 3): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature\'s peace.', + ("cuda", 7): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature\'s peace.', + ("rocm", (9, 4)): 'user\n\nWrite a haiku for this image\nassistant\nSilky lake, \nWooden pier, \nNature\'s embrace.', + } + ) # fmt: skip + expected_output = expected_outputs.get_expectation() self.assertEqual( decoded_output, expected_output, @@ -658,7 +673,7 @@ def test_llama_small_model_integration_forward(self): ("xpu", 3): [-9.8828, -0.4954, 1.4561, -10.3438, -10.3438], ("cuda", 7): [-9.8750, -0.4861, 1.4648, -10.3359, -10.3359], ("cuda", 8): [-9.8906, -0.4995, 1.4473, -10.3359, -10.3438], - ("rocm", (9, 4)): [ -9.8828, -0.5005, 1.4697, -10.3438, -10.3438], + ("rocm", (9, 4)): [ -9.8672, -0.4888, 1.4648, -10.3281, -10.3281], ("rocm", (9, 5)): [ -9.8906, -0.4976, 1.4502, -10.3359, -10.3438], } ) # fmt: skip @@ -934,7 +949,7 @@ def test_llama_small_model_integration_interleaved_images_videos(self): ("xpu", 3): "user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **", ("cuda", 7): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **', ("cuda", 8): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no', - ("rocm", (9, 4)): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. Upon closer inspection, the differences between the two images are:\n\n1. **', + ("rocm", (9, 4)): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no', ("rocm", (9, 5)): 'user\n\n\nWhat are the difference between these two images?\nassistant\nI apologize for the confusion in my previous response. After re-examining the images, I can see that there are no', } ) # fmt: skip diff --git a/tests/models/mistral3/test_modeling_mistral3.py b/tests/models/mistral3/test_modeling_mistral3.py index 3a7f51642a7b..15109b2aec8c 100644 --- a/tests/models/mistral3/test_modeling_mistral3.py +++ b/tests/models/mistral3/test_modeling_mistral3.py @@ -355,7 +355,8 @@ def test_mistral3_integration_batched_generate(self): expected_outputs = Expectations( { ("xpu", 3): "Calm lake's mirror gleams,\nWhispering pines stand in silence,\nPath to peace begins.", - ("cuda", 8): "Wooden path to calm,\nReflections whisper secrets,\nNature's peace unfolds.", + ("cuda", (8, 0)): "Wooden path to calm,\nReflections whisper secrets,\nNature's peace unfolds.", + ("cuda", (8, 6)): "Calm waters reflect\nWooden path to distant shore\nSilence in the woods", ("rocm", (9, 5)): "Calm waters reflect\nWooden path to distant shore\nSilence in the scene" } ) # fmt: skip @@ -432,7 +433,8 @@ def test_mistral3_integration_batched_generate_multi_image(self): decoded_output = processor.decode(gen_tokens[0], skip_special_tokens=True) expected_outputs = Expectations( { - ("cuda", 8): 'Calm waters reflect\nWooden path to distant shore\nSilence in the scene', + ("cuda", 8): "Calm waters reflect\nWooden path to distant shore\nPeace in nature's hold", + ("rocm", (9, 4)): "Calm waters reflect\nWooden path to distant shore\nSilence in the pines" } ) # fmt: skip expected_output = expected_outputs.get_expectation() @@ -448,6 +450,7 @@ def test_mistral3_integration_batched_generate_multi_image(self): { ("xpu", 3): "Certainly! The images depict two iconic landmarks:\n\n1. The first image shows the Statue of Liberty in New York City.", ("cuda", 8): 'Certainly! The images depict two famous landmarks in the United States:\n\n1. The first image shows the Statue of Liberty,', + ("rocm", (9, 4)): 'Certainly! The images depict two famous landmarks in the United States:\n\n1. The first image shows the Statue of Liberty,', } ) # fmt: skip expected_output = expected_outputs.get_expectation()