 from torchao.quantization import quantize_
 from torchao.quantization.utils import compute_error
 from torchao.utils import (
+    is_cuda_version_at_least,
     is_sm_at_least_89,
     is_sm_at_least_100,
     torch_version_at_least,
@@ -50,12 +51,25 @@ def run_around_tests():
 elem_dtypes = (
     [
         # test each dtype
-        (torch.float8_e4m3fn, torch.float8_e4m3fn, torch.float8_e4m3fn),
+        (
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fn,
+        ),
         (DTYPE_FP6_E3M2, DTYPE_FP6_E3M2, DTYPE_FP6_E3M2),
         (DTYPE_FP6_E2M3, DTYPE_FP6_E2M3, DTYPE_FP6_E2M3),
-        (torch.float4_e2m1fn_x2, torch.float4_e2m1fn_x2, torch.float4_e2m1fn_x2),
-        # only test one type of mixed-dtype overrides, to save testing time
-        (torch.float8_e4m3fn, torch.float4_e2m1fn_x2, torch.float4_e2m1fn_x2),
+        (
+            torch.float4_e2m1fn_x2,
+            torch.float4_e2m1fn_x2,
+            torch.float4_e2m1fn_x2,
+        ),
+        # only test one type of mixed-dtype overrides, to save
+        # testing time
+        (
+            torch.float8_e4m3fn,
+            torch.float4_e2m1fn_x2,
+            torch.float4_e2m1fn_x2,
+        ),
     ]
     if torch_version_at_least("2.8.0")
     else [
@@ -117,6 +131,8 @@ def test_linear_eager_vs_hp(
         pytest.skip("unsupported configuration")
     elif not is_sm_at_least_100():
         pytest.skip("CUDA capability >= 10.0 required for MX dim1 cast cuda kernel")
+    elif not is_cuda_version_at_least(12, 8):
+        pytest.skip("CUDA version >= 12.8 required for MXFP8 CUDA extension")
 
     # elem_dtype is a tuple of (input, weight, gradient) dtypes.
     grad_shape = list(input_shape)
@@ -166,7 +182,12 @@ def test_linear_eager_vs_hp(
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(
-    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for mxfloat8"
+    not is_sm_at_least_100(),
+    reason="CUDA capability >= 10.0 required for mxfloat8",
+)
+@pytest.mark.skipif(
+    not is_cuda_version_at_least(12, 8),
+    reason="CUDA version >= 12.8 required for MXFP8",
 )
 @pytest.mark.parametrize(
     "recipe_name",
@@ -303,6 +324,10 @@ def test_linear_compile(
         ScaleCalculationMode.RCEIL,
     ):
         pytest.skip("unsupported configuration")
+    elif not is_sm_at_least_100():
+        pytest.skip("CUDA capability >= 10.0 required for MX dim1 cast cuda kernel")
+    elif not is_cuda_version_at_least(12, 8):
+        pytest.skip("CUDA version >= 12.8 required for MXFP8")
 
     if hp_dtype == torch.bfloat16 and recipe_name != "mxfp8_cublas":
         # TODO(future PR): properly enable float32 + bfloat16 for every
@@ -318,7 +343,8 @@ def test_linear_compile(
     ):
         # TODO(future): debug this
         pytest.skip(
-            "there are currently accuracy issues with this configuration on H100 and below"
+            "there are currently accuracy issues with this configuration "
+            "on H100 and below"
         )
 
     M, K, N = 128, 256, 512
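For reference, the new skip guards assume a helper that compares the CUDA toolkit version reported by PyTorch against a (major, minor) pair. A minimal sketch of such a helper is shown below, built on torch.version.cuda; this is an assumption about its behavior, not the actual torchao.utils.is_cuda_version_at_least implementation, which may differ.

import torch


def is_cuda_version_at_least(major: int, minor: int) -> bool:
    # Sketch only: torch.version.cuda is a string such as "12.8",
    # or None for CPU-only builds of PyTorch.
    if torch.version.cuda is None:
        return False
    cuda_major, cuda_minor = (int(part) for part in torch.version.cuda.split(".")[:2])
    # Tuple comparison handles e.g. (12, 10) >= (12, 8) correctly.
    return (cuda_major, cuda_minor) >= (major, minor)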