From 4801234281062b995828335e55c1c3e3d31334cc Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:20:13 +0800
Subject: [PATCH 1/5] add test/prototype/mx_formats/test_nvfp4_tensor.py

---
 .../prototype/mx_formats/test_nvfp4_tensor.py | 62 ++++++++++---------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
index 2f734cef2c..f0ececd1f0 100644
--- a/test/prototype/mx_formats/test_nvfp4_tensor.py
+++ b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -24,9 +24,11 @@
 from torchao.utils import (
     is_sm_at_least_100,
     torch_version_at_least,
+    get_current_accelerator_device,
 )
 
 torch.manual_seed(2)
+_DEVICE = get_current_accelerator_device()
 
 if not torch_version_at_least("2.8.0"):
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
@@ -42,12 +44,12 @@
         (torch.bfloat16, (1, 32, 64), False),
     ],
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
 def test_nvfp4_reconstruction(dtype, shape, use_per_tensor_scale):
-    x = torch.randn(shape, dtype=dtype, device="cuda")
+    x = torch.randn(shape, dtype=dtype, device=_DEVICE)
     if use_per_tensor_scale:
         tensor_amax = torch.max(torch.abs(x))
         scale = per_tensor_amax_to_scale(tensor_amax)
@@ -113,14 +115,14 @@ def assert_sqnr_gt_threshold(orig, new, threshold):
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 def test_nvfp4_swizzled_scales_construction(is_swizzled_scales, shape):
     """
     Test that NVFP4Tensor can be constructed with swizzled scales and
     that the _is_swizzled_scales flag is set correctly.
     """
-    data = torch.randn(*shape, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(*shape, device=_DEVICE, dtype=torch.bfloat16)
     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=is_swizzled_scales)
     assert tensor._is_swizzled_scales == is_swizzled_scales
@@ -146,7 +148,7 @@ def test_nvfp4_swizzled_scales_construction(is_swizzled_scales, shape):
         pytest.param(1, slice(1024, 2048), id="slice_cols[1024:2048]_quarter"),
     ],
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -164,7 +166,7 @@ def test_nvfp4_swizzled_scales_slicing(slice_dim, slice_spec):
     # For column slicing, need multiples of 64 columns for alignment
     M, K = 128, 4096
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
 
     assert tensor._is_swizzled_scales == True
@@ -240,7 +242,7 @@
         ),
     ],
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -250,7 +252,7 @@ def test_nvfp4_swizzled_scales_slicing_errors(slice_dim, slice_spec, expected_er
     """
     M, K = 256, 4096
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
 
     with pytest.raises(RuntimeError, match=expected_error):
@@ -260,7 +262,7 @@ def test_nvfp4_swizzled_scales_slicing_errors(slice_dim, slice_spec, expected_er
             _ = tensor[:, slice_spec]
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -270,7 +272,7 @@ def test_nvfp4_swizzled_scales_view_semantics():
     """
     M, K = 256, 4096
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
 
     # Test row slicing (should maintain views)
@@ -286,7 +288,7 @@
     assert full_width_slice.qdata.data_ptr() == tensor.qdata.data_ptr()
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -296,7 +298,7 @@ def test_nvfp4_swizzled_scales_serialization():
     """
     M, K = 32, 64
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
 
     # Create tensor with swizzled scales
     original_tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
@@ -327,7 +329,7 @@
     torch.testing.assert_close(original_dq, reconstructed_dq, atol=1e-6, rtol=1e-6)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -337,7 +339,7 @@ def test_nvfp4_swizzled_scales_get_scales_method():
     """
     M, K = 32, 64
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
 
     # Create tensors with both storage methods
     regular_tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=False)
@@ -354,7 +356,7 @@
     assert swizzled_scales.shape == expected_shape
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.parametrize(
     "M", [128, 256, 512, 1024, 100, 200, 384], ids=lambda m: f"M{m}"
 )
@@ -371,7 +373,7 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
     """Test that Triton and PyTorch NVFP4 quantization produce equivalent results."""
     torch.manual_seed(42)
 
-    x = torch.randn(M, N, dtype=dtype, device="cuda")
+    x = torch.randn(M, N, dtype=dtype, device=_DEVICE)
 
     per_tensor_scale = None
     if use_per_tensor_scale:
@@ -413,7 +415,7 @@
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
@@ -442,7 +444,7 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
 @torch.no_grad()
 @skip_if_rocm("ROCm float4 gemm require gfx950")
 @pytest.mark.skipif(
-    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for fp4"
+    torch.cuda.is_available() and not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for fp4"
 )
 def test_nvfp4_matmul_with_amax(
     use_gelu: bool,
@@ -454,7 +456,7 @@ def test_nvfp4_matmul_with_amax(
     shapes: tuple,
 ):
     # DYNAMIC mode requires SM100+, but WEIGHT_ONLY works on older GPUs
-    if quant_type == "dynamic" and not is_sm_at_least_100():
+    if quant_type == "dynamic" and torch.cuda.is_available() and not is_sm_at_least_100():
         pytest.skip("CUDA capability >= 10.0 required for DYNAMIC float4 gemm")
 
     if bias and inpt_dtype == torch.float32:
@@ -467,13 +469,13 @@ def test_nvfp4_matmul_with_amax(
 
     # Create activation tensor
     if use_gelu:
-        x = torch.randn(m, k, dtype=inpt_dtype, device="cuda")
+        x = torch.randn(m, k, dtype=inpt_dtype, device=_DEVICE)
         A = torch.nn.functional.gelu(x)
     else:
-        A = torch.randn(m, k, dtype=inpt_dtype, device="cuda")
+        A = torch.randn(m, k, dtype=inpt_dtype, device=_DEVICE)
 
-    B = torch.randn(n, k, dtype=inpt_dtype, device="cuda")
-    bias_tensor = torch.randn(n, dtype=inpt_dtype, device="cuda") if bias else None
+    B = torch.randn(n, k, dtype=inpt_dtype, device=_DEVICE)
+    bias_tensor = torch.randn(n, dtype=inpt_dtype, device=_DEVICE) if bias else None
 
     # Compute reference
     C_ref = F.linear(A, B, bias_tensor)
@@ -511,12 +513,12 @@
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
 def test_nvfp4_to_copy():
-    x = NVFP4Tensor.to_nvfp4(torch.randn((32, 128))).cuda()
+    x = NVFP4Tensor.to_nvfp4(torch.randn((32, 128))).to(_DEVICE)
     y = torch.ops.aten._to_copy(x, dtype=torch.bfloat16)
     assert torch.equal(x.qdata, y.qdata)
     assert torch.equal(x.scale, y.scale)
@@ -531,7 +533,7 @@ def test_nvfp4_to_copy():
     assert y.dtype == torch.bfloat16
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -551,14 +553,14 @@
 def test_scale_shape_matches_qdata(
     transpose, use_triton_kernel, is_swizzled_scales, shape
 ):
-    if use_triton_kernel and not is_sm_at_least_100():
+    if use_triton_kernel and torch.cuda.is_available() and not is_sm_at_least_100():
         pytest.skip("CUDA capability >= 10.0 required for nvfp4 triton kernel")
 
     if use_triton_kernel and not is_swizzled_scales:
         pytest.skip("triton kernel requires swizzled scales")
 
     block_size = 16
-    x_hp = torch.randn(*shape, device="cuda")
+    x_hp = torch.randn(*shape, device=_DEVICE)
     x = NVFP4Tensor.to_nvfp4(
         x_hp, is_swizzled_scales=is_swizzled_scales, use_triton_kernel=use_triton_kernel
     )
@@ -599,14 +601,14 @@
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
 @pytest.mark.parametrize("dims", ((1, 2), (2, 1), (-1, -2), (-2, -1)))
 @pytest.mark.parametrize("is_swizzled_scales", [True, False])
 def test_3d_transpose(dims, is_swizzled_scales):
-    x_hp = torch.randn(2, 128, 256, device="cuda")
+    x_hp = torch.randn(2, 128, 256, device=_DEVICE)
     x_nvfp4 = NVFP4Tensor.to_nvfp4(x_hp, is_swizzled_scales=is_swizzled_scales)
     x_hp_t = x_hp.transpose(dims[0], dims[1])
     x_nvfp4_t = x_nvfp4.transpose(dims[0], dims[1])

From a2e50c8573ffd17a573e289e1ff41dc8c1c5a887 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:26:27 +0800
Subject: [PATCH 2/5] add test/prototype/mx_formats/test_nvfp4_tensor.py

---
 test/prototype/mx_formats/test_nvfp4_tensor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
index f0ececd1f0..75bb2a9049 100644
--- a/test/prototype/mx_formats/test_nvfp4_tensor.py
+++ b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -366,7 +366,7 @@ def test_nvfp4_swizzled_scales_get_scales_method():
 )
 @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16], ids=["fp32", "bf16"])
 @pytest.mark.skipif(
-    not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
+    torch.cuda.is_available() and not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
 )
 @torch.no_grad()
 def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
@@ -444,7 +444,7 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
 @torch.no_grad()
 @skip_if_rocm("ROCm float4 gemm require gfx950")
 @pytest.mark.skipif(
-    torch.cuda.is_available() and not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for fp4"
+    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for fp4"
 )
 def test_nvfp4_matmul_with_amax(
     use_gelu: bool,

From 82c146ef6bf2c6c5f56c4e834d97ec881af46a66 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:29:33 +0800
Subject: [PATCH 3/5] add test/prototype/mx_formats/test_nvfp4_tensor.py

---
 test/prototype/mx_formats/test_nvfp4_tensor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
index 75bb2a9049..b4b59ff1d9 100644
--- a/test/prototype/mx_formats/test_nvfp4_tensor.py
+++ b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -356,7 +356,7 @@ def test_nvfp4_swizzled_scales_get_scales_method():
     assert swizzled_scales.shape == expected_shape
 
 
-@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize(
     "M", [128, 256, 512, 1024, 100, 200, 384], ids=lambda m: f"M{m}"
 )
@@ -366,7 +366,7 @@ def test_nvfp4_swizzled_scales_get_scales_method():
 )
 @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16], ids=["fp32", "bf16"])
 @pytest.mark.skipif(
-    torch.cuda.is_available() and not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
+    not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
 )
 @torch.no_grad()
 def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):

From 32e451a94dd651efd936762b2d1d5f93598b01c8 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:39:46 +0800
Subject: [PATCH 4/5] test/prototype/test_spinquant.py

---
 test/prototype/test_spinquant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/prototype/test_spinquant.py b/test/prototype/test_spinquant.py
index 03f0c34e20..fb32c83729 100644
--- a/test/prototype/test_spinquant.py
+++ b/test/prototype/test_spinquant.py
@@ -16,7 +16,7 @@ def _init_model(name="7B", device="cpu", precision=torch.bfloat16):
     return model.eval()
 
 
-_AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
+_AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) + (["xpu"] if torch.xpu.is_available() else [])
 
 
 @pytest.mark.parametrize("device", _AVAILABLE_DEVICES)

From b3ba0b5b80cbadf52aeb930bb7b59557d9f84dbd Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:43:26 +0800
Subject: [PATCH 5/5] fix format issue

---
 test/prototype/mx_formats/test_nvfp4_tensor.py | 8 ++++++--
 test/prototype/test_spinquant.py               | 6 +++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
index b4b59ff1d9..2e40d8276f 100644
--- a/test/prototype/mx_formats/test_nvfp4_tensor.py
+++ b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -22,9 +22,9 @@
 from torchao.quantization.utils import compute_error
 from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
+    get_current_accelerator_device,
     is_sm_at_least_100,
     torch_version_at_least,
-    get_current_accelerator_device,
 )
 
 torch.manual_seed(2)
@@ -456,7 +456,11 @@ def test_nvfp4_matmul_with_amax(
     shapes: tuple,
 ):
     # DYNAMIC mode requires SM100+, but WEIGHT_ONLY works on older GPUs
-    if quant_type == "dynamic" and torch.cuda.is_available() and not is_sm_at_least_100():
+    if (
+        quant_type == "dynamic"
+        and torch.cuda.is_available()
+        and not is_sm_at_least_100()
+    ):
         pytest.skip("CUDA capability >= 10.0 required for DYNAMIC float4 gemm")
 
     if bias and inpt_dtype == torch.float32:
diff --git a/test/prototype/test_spinquant.py b/test/prototype/test_spinquant.py
index fb32c83729..399bc2fa64 100644
--- a/test/prototype/test_spinquant.py
+++ b/test/prototype/test_spinquant.py
@@ -16,7 +16,11 @@ def _init_model(name="7B", device="cpu", precision=torch.bfloat16):
     return model.eval()
 
 
-_AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) + (["xpu"] if torch.xpu.is_available() else [])
+_AVAILABLE_DEVICES = (
+    ["cpu"]
+    + (["cuda"] if torch.cuda.is_available() else [])
+    + (["xpu"] if torch.xpu.is_available() else [])
+)
 
 
 @pytest.mark.parametrize("device", _AVAILABLE_DEVICES)
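
Note on the helper these patches rely on (a reviewer sketch, not part of the series): the tests replace hardcoded device="cuda" with a module-level _DEVICE taken from torchao.utils.get_current_accelerator_device(), and gate GPU-only tests on torch.accelerator.is_available(). Assuming the PyTorch 2.6+ torch.accelerator API, a minimal stand-in for such a helper could look like the following; the actual torchao implementation may differ in details and fallbacks.

# Hypothetical sketch only, not the torchao implementation.
import torch


def _get_current_accelerator_device() -> torch.device:
    # torch.accelerator (PyTorch 2.6+) abstracts over CUDA, XPU, MPS, etc.
    if hasattr(torch, "accelerator") and torch.accelerator.is_available():
        return torch.accelerator.current_accelerator()
    return torch.device("cpu")


_DEVICE = _get_current_accelerator_device()
x = torch.randn(32, 64, device=_DEVICE)  # lands on cuda, xpu, or cpu as available

With a helper along these lines, the same test body runs unchanged on CUDA and XPU machines, while the remaining torch.cuda.is_available() and is_sm_at_least_100() checks keep the SM100-specific float4 matmul paths CUDA-only.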