Commit 2df0c32

byebye torch 2.1 (#40317)
* Bump minimum torch version to 2.2
* Remove is_torch_greater_or_equal_than_2_2
* update versions table
* Deprecate is_torch_sdpa_available (except for backward compat), remove require_torch_sdpa
1 parent c50f140 commit 2df0c32
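
The net effect for users is a higher install floor: setup.py and the internal dependency table below now pin torch>=2.2, and SDPA is assumed to work on every supported torch build. A minimal sketch of a downstream runtime guard (hypothetical, not part of this commit; it assumes the internal `deps` dict stays importable):

    import torch
    from packaging import version
    from transformers.dependency_versions_table import deps

    # Hypothetical guard for code that depends on transformers: the library now assumes torch >= 2.2.
    if version.parse(torch.__version__) < version.parse("2.2"):
        raise RuntimeError(f"transformers requires torch>=2.2, found {torch.__version__}")

    # The same floor is recorded in the internal dependency table updated in this commit.
    assert deps["torch"] == "torch>=2.2"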

60 files changed: 17 additions & 223 deletions


setup.py

Lines changed: 1 addition & 1 deletion
@@ -190,7 +190,7 @@
     "tiktoken",
     "timm<=1.0.19,!=1.0.18",
     "tokenizers>=0.21,<0.22",
-    "torch>=2.1",
+    "torch>=2.2",
     "torchaudio",
     "torchvision",
     "pyctcdecode>=0.4.0",

src/transformers/dependency_versions_table.py

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@
     "tiktoken": "tiktoken",
     "timm": "timm<=1.0.19,!=1.0.18",
     "tokenizers": "tokenizers>=0.21,<0.22",
-    "torch": "torch>=2.1",
+    "torch": "torch>=2.2",
     "torchaudio": "torchaudio",
     "torchvision": "torchvision",
     "pyctcdecode": "pyctcdecode>=0.4.0",

src/transformers/models/albert/modeling_albert.py

Lines changed: 0 additions & 10 deletions
@@ -38,7 +38,6 @@
 from ...pytorch_utils import (
     apply_chunking_to_forward,
     find_pruneable_heads_and_indices,
-    is_torch_greater_or_equal_than_2_2,
     prune_linear_layer,
 )
 from ...utils import ModelOutput, auto_docstring, logging
@@ -356,7 +355,6 @@ class AlbertSdpaAttention(AlbertAttention):
     def __init__(self, config):
         super().__init__(config)
         self.dropout_prob = config.attention_probs_dropout_prob
-        self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2

     def forward(
         self,
@@ -392,14 +390,6 @@ def forward(
             .transpose(1, 2)
         )

-        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
-        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
-        # Reference: https://github.com/pytorch/pytorch/issues/112577
-        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
-            query_layer = query_layer.contiguous()
-            key_layer = key_layer.contiguous()
-            value_layer = value_layer.contiguous()
-
         attention_output = torch.nn.functional.scaled_dot_product_attention(
             query=query_layer,
             key=key_layer,
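
The block deleted above only worked around a torch 2.1.2 bug in the CUDA memory-efficient SDPA backend with non-contiguous q/k/v and a custom attn_mask (pytorch/pytorch#112577), fixed in torch 2.2.0. With 2.2 as the minimum, the projections can be passed to SDPA exactly as the view/transpose produces them. A self-contained sketch with dummy shapes (illustration only, not the Albert module itself):

    import torch
    import torch.nn.functional as F

    batch, heads, seq, head_dim = 2, 4, 8, 16
    # view + transpose, as in the modeling code above, yields non-contiguous q/k/v
    q = torch.randn(batch, seq, heads * head_dim).view(batch, seq, heads, head_dim).transpose(1, 2)
    k = torch.randn(batch, seq, heads * head_dim).view(batch, seq, heads, head_dim).transpose(1, 2)
    v = torch.randn(batch, seq, heads * head_dim).view(batch, seq, heads, head_dim).transpose(1, 2)
    attn_mask = torch.zeros(batch, 1, seq, seq)  # additive mask, broadcast over heads

    # On torch >= 2.2 this needs no .contiguous() calls, even on CUDA with a custom mask.
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
    print(out.shape)  # torch.Size([2, 4, 8, 16])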

src/transformers/models/distilbert/modeling_distilbert.py

Lines changed: 0 additions & 10 deletions
@@ -44,7 +44,6 @@
 from ...pytorch_utils import (
     apply_chunking_to_forward,
     find_pruneable_heads_and_indices,
-    is_torch_greater_or_equal_than_2_2,
     prune_linear_layer,
 )
 from ...utils import (
@@ -338,7 +337,6 @@ class DistilBertSdpaAttention(MultiHeadSelfAttention):
     def __init__(self, config: PretrainedConfig):
         super().__init__(config=config)
         self.dropout_prob = config.attention_dropout
-        self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2

     def forward(
         self,
@@ -391,14 +389,6 @@ def unshape(x: torch.Tensor) -> torch.Tensor:
         k = shape(self.k_lin(key))  # (bs, n_heads, k_length, dim_per_head)
         v = shape(self.v_lin(value))  # (bs, n_heads, k_length, dim_per_head)

-        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
-        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
-        # Reference: https://github.com/pytorch/pytorch/issues/112577
-        if self.require_contiguous_qkv and q.device.type == "cuda" and mask is not None:
-            q = q.contiguous()
-            k = k.contiguous()
-            v = v.contiguous()
-
         attn_output = torch.nn.functional.scaled_dot_product_attention(
             q,
             k,
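
The same simplification applies to DistilBERT. Since SDPA no longer depends on the installed torch version, requesting it at load time needs no gating either; a hedged usage sketch (the checkpoint name is only an example):

    from transformers import AutoModel

    # Any SDPA-capable checkpoint works; distilbert-base-uncased is just illustrative.
    model = AutoModel.from_pretrained("distilbert-base-uncased", attn_implementation="sdpa")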

src/transformers/pytorch_utils.py

Lines changed: 1 addition & 1 deletion
@@ -38,9 +38,9 @@
 is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True)
 is_torch_greater_or_equal_than_2_4 = is_torch_greater_or_equal("2.4", accept_dev=True)
 is_torch_greater_or_equal_than_2_3 = is_torch_greater_or_equal("2.3", accept_dev=True)
-is_torch_greater_or_equal_than_2_2 = is_torch_greater_or_equal("2.2", accept_dev=True)

 # For backwards compatibility (e.g. some remote codes on Hub using those variables).
+is_torch_greater_or_equal_than_2_2 = is_torch_greater_or_equal("2.2", accept_dev=True)
 is_torch_greater_or_equal_than_2_1 = is_torch_greater_or_equal("2.1", accept_dev=True)
 is_torch_greater_or_equal_than_2_0 = is_torch_greater_or_equal("2.0", accept_dev=True)
 is_torch_greater_or_equal_than_1_13 = is_torch_greater_or_equal("1.13", accept_dev=True)
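
For code that imported the flag removed from internal use, the forward-compatible pattern is to call the helper directly; the module-level constant is kept only so existing Hub remote code keeps importing. A sketch, assuming is_torch_greater_or_equal remains exposed via transformers.utils:

    from transformers.utils import is_torch_greater_or_equal

    # Preferred replacement for the module-level is_torch_greater_or_equal_than_2_2 flag.
    if is_torch_greater_or_equal("2.2", accept_dev=True):
        print("torch is at or above the new minimum")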

src/transformers/testing_utils.py

Lines changed: 0 additions & 10 deletions
@@ -159,7 +159,6 @@
     is_torch_neuroncore_available,
     is_torch_npu_available,
     is_torch_optimi_available,
-    is_torch_sdpa_available,
     is_torch_tensorrt_fx_available,
     is_torch_tf32_available,
     is_torch_xla_available,
@@ -624,15 +623,6 @@ def require_flash_attn_3(test_case):
     return unittest.skipUnless(is_flash_attn_3_available(), "test requires Flash Attention 3")(test_case)


-def require_torch_sdpa(test_case):
-    """
-    Decorator marking a test that requires PyTorch's SDPA.
-
-    These tests are skipped when requirements are not met (torch version).
-    """
-    return unittest.skipUnless(is_torch_sdpa_available(), "test requires PyTorch SDPA")(test_case)
-
-
 def require_read_token(test_case):
     """
     A decorator that loads the HF token for tests that require to load gated models.
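
Tests that previously used the removed decorator can simply drop it, since SDPA ships with every supported torch. A test that genuinely needs a newer torch can use the existing require_torch_greater_or_equal decorator instead; a sketch of the pattern, assuming its decorator-factory form (the test itself is hypothetical):

    import unittest

    from transformers.testing_utils import require_torch, require_torch_greater_or_equal

    class ExampleSdpaTest(unittest.TestCase):  # hypothetical test, for illustration only
        @require_torch
        @require_torch_greater_or_equal("2.3")  # only for features newer than the 2.2 floor
        def test_sdpa_related_behaviour(self):
            self.assertTrue(True)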

src/transformers/utils/import_utils.py

Lines changed: 2 additions & 9 deletions
@@ -451,17 +451,10 @@ def get_torch_major_and_minor_version() -> str:


 def is_torch_sdpa_available():
+    # Mostly retained for backward compatibility in remote code, since sdpa works correctly on all torch versions >= 2.2
     if not is_torch_available() or _torch_version == "N/A":
         return False
-
-    # NOTE: MLU is OK with non-contiguous inputs.
-    if is_torch_mlu_available():
-        return True
-    # NOTE: NPU can use SDPA in Transformers with torch>=2.1.0.
-    if is_torch_npu_available():
-        return True
-    # NOTE: We require torch>=2.1.1 to avoid a numerical issue in SDPA with non-contiguous inputs: https://github.com/pytorch/pytorch/issues/112577
-    return version.parse(_torch_version) >= version.parse("2.1.1")
+    return True


 def is_torch_flex_attn_available():
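
After this change is_torch_sdpa_available is only a backward-compatibility shim: it returns False when torch is missing and True otherwise, because every supported torch (>= 2.2) ships SDPA, so the MLU/NPU and version special cases are gone. Remote code that still calls it keeps working; a sketch, assuming the function stays exported from transformers.utils:

    from transformers.utils import is_torch_sdpa_available

    # On any install satisfying the new torch>=2.2 requirement this is simply True;
    # it only reports False when torch itself is absent.
    print(is_torch_sdpa_available())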

tests/generation/test_utils.py

Lines changed: 0 additions & 2 deletions
@@ -51,7 +51,6 @@
     require_torch_gpu,
     require_torch_greater_or_equal,
     require_torch_multi_accelerator,
-    require_torch_sdpa,
     set_config_for_less_flaky_test,
     set_model_for_less_flaky_test,
     set_model_tester_for_less_flaky_test,
@@ -2366,7 +2365,6 @@ def _test_attention_implementation(self, attn_implementation):
         self.assertTrue(has_similar_generate_outputs(res_eager, res_attn, atol=1e-3, rtol=1e-3))

     @pytest.mark.generate
-    @require_torch_sdpa
     @slow
     def test_eager_matches_sdpa_generate(self):
         """Tests that generate has equivalent outputs with SDPA and eager attention implementations."""

tests/models/aimv2/test_modeling_aimv2.py

Lines changed: 0 additions & 2 deletions
@@ -28,7 +28,6 @@
     require_flash_attn,
     require_torch,
     require_torch_gpu,
-    require_torch_sdpa,
     require_vision,
     slow,
     torch_device,
@@ -563,7 +562,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self):
         )

     @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
-    @require_torch_sdpa
     def test_eager_matches_sdpa_inference(
         self,
         name,

tests/models/blip_2/test_modeling_blip_2.py

Lines changed: 0 additions & 3 deletions
@@ -31,7 +31,6 @@
     require_torch_fp16,
     require_torch_gpu,
     require_torch_multi_accelerator,
-    require_torch_sdpa,
     require_vision,
     slow,
     torch_device,
@@ -508,7 +507,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_model_get_set_embeddings(self):
         pass

-    @require_torch_sdpa
     def test_sdpa_can_dispatch_composite_models(self):
         """
         Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
@@ -945,7 +943,6 @@ def test_model_get_set_embeddings(self):
     def test_cpu_offload(self):
         pass

-    @require_torch_sdpa
     def test_sdpa_can_dispatch_composite_models(self):
         """
         Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
