From 4801234281062b995828335e55c1c3e3d31334cc Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:20:13 +0800
Subject: [PATCH 1/5] add test/prototype/mx_formats/test_nvfp4_tensor.py

---
 .../prototype/mx_formats/test_nvfp4_tensor.py | 62 ++++++++++---------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
index 2f734cef2c..f0ececd1f0 100644
--- a/test/prototype/mx_formats/test_nvfp4_tensor.py
+++ b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -24,9 +24,11 @@
 from torchao.utils import (
     is_sm_at_least_100,
     torch_version_at_least,
+    get_current_accelerator_device,
 )
 
 torch.manual_seed(2)
+_DEVICE = get_current_accelerator_device()
 
 if not torch_version_at_least("2.8.0"):
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
@@ -42,12 +44,12 @@
         (torch.bfloat16, (1, 32, 64), False),
     ],
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
 def test_nvfp4_reconstruction(dtype, shape, use_per_tensor_scale):
-    x = torch.randn(shape, dtype=dtype, device="cuda")
+    x = torch.randn(shape, dtype=dtype, device=_DEVICE)
     if use_per_tensor_scale:
         tensor_amax = torch.max(torch.abs(x))
         scale = per_tensor_amax_to_scale(tensor_amax)
@@ -113,14 +115,14 @@ def assert_sqnr_gt_threshold(orig, new, threshold):
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 def test_nvfp4_swizzled_scales_construction(is_swizzled_scales, shape):
     """
     Test that NVFP4Tensor can be constructed with swizzled scales and
     that the _is_swizzled_scales flag is set correctly.
     """
-    data = torch.randn(*shape, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(*shape, device=_DEVICE, dtype=torch.bfloat16)
     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=is_swizzled_scales)
     assert tensor._is_swizzled_scales == is_swizzled_scales
@@ -146,7 +148,7 @@ def test_nvfp4_swizzled_scales_construction(is_swizzled_scales, shape):
         pytest.param(1, slice(1024, 2048), id="slice_cols[1024:2048]_quarter"),
     ],
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -164,7 +166,7 @@ def test_nvfp4_swizzled_scales_slicing(slice_dim, slice_spec):
     # For column slicing, need multiples of 64 columns for alignment
     M, K = 128, 4096
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
 
     assert tensor._is_swizzled_scales == True
@@ -240,7 +242,7 @@
         ),
     ],
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -250,7 +252,7 @@ def test_nvfp4_swizzled_scales_slicing_errors(slice_dim, slice_spec, expected_er
     """
     M, K = 256, 4096
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
 
     with pytest.raises(RuntimeError, match=expected_error):
@@ -260,7 +262,7 @@ def test_nvfp4_swizzled_scales_slicing_errors(slice_dim, slice_spec, expected_er
             _ = tensor[:, slice_spec]
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -270,7 +272,7 @@ def test_nvfp4_swizzled_scales_view_semantics():
     """
     M, K = 256, 4096
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
     tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
 
     # Test row slicing (should maintain views)
@@ -286,7 +288,7 @@
     assert full_width_slice.qdata.data_ptr() == tensor.qdata.data_ptr()
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -296,7 +298,7 @@ def test_nvfp4_swizzled_scales_serialization():
     """
     M, K = 32, 64
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
 
     # Create tensor with swizzled scales
     original_tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=True)
@@ -327,7 +329,7 @@
     torch.testing.assert_close(original_dq, reconstructed_dq, atol=1e-6, rtol=1e-6)
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -337,7 +339,7 @@ def test_nvfp4_swizzled_scales_get_scales_method():
     """
     M, K = 32, 64
 
-    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+    data = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
 
     # Create tensors with both storage methods
     regular_tensor = NVFP4Tensor.to_nvfp4(data, is_swizzled_scales=False)
@@ -354,7 +356,7 @@
     assert swizzled_scales.shape == expected_shape
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.parametrize(
     "M", [128, 256, 512, 1024, 100, 200, 384], ids=lambda m: f"M{m}"
 )
@@ -371,7 +373,7 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
     """Test that Triton and PyTorch NVFP4 quantization produce equivalent results."""
     torch.manual_seed(42)
 
-    x = torch.randn(M, N, dtype=dtype, device="cuda")
+    x = torch.randn(M, N, dtype=dtype, device=_DEVICE)
 
     per_tensor_scale = None
     if use_per_tensor_scale:
@@ -413,7 +415,7 @@
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="torch.compile requires PyTorch 2.8+"
 )
@@ -442,7 +444,7 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
 @torch.no_grad()
 @skip_if_rocm("ROCm float4 gemm require gfx950")
 @pytest.mark.skipif(
-    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for fp4"
+    torch.cuda.is_available() and not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for fp4"
 )
 def test_nvfp4_matmul_with_amax(
     use_gelu: bool,
@@ -454,7 +456,7 @@ def test_nvfp4_matmul_with_amax(
     shapes: tuple,
 ):
     # DYNAMIC mode requires SM100+, but WEIGHT_ONLY works on older GPUs
-    if quant_type == "dynamic" and not is_sm_at_least_100():
+    if quant_type == "dynamic" and torch.cuda.is_available() and not is_sm_at_least_100():
         pytest.skip("CUDA capability >= 10.0 required for DYNAMIC float4 gemm")
 
     if bias and inpt_dtype == torch.float32:
@@ -467,13 +469,13 @@ def test_nvfp4_matmul_with_amax(
 
     # Create activation tensor
     if use_gelu:
-        x = torch.randn(m, k, dtype=inpt_dtype, device="cuda")
+        x = torch.randn(m, k, dtype=inpt_dtype, device=_DEVICE)
         A = torch.nn.functional.gelu(x)
     else:
-        A = torch.randn(m, k, dtype=inpt_dtype, device="cuda")
+        A = torch.randn(m, k, dtype=inpt_dtype, device=_DEVICE)
 
-    B = torch.randn(n, k, dtype=inpt_dtype, device="cuda")
-    bias_tensor = torch.randn(n, dtype=inpt_dtype, device="cuda") if bias else None
+    B = torch.randn(n, k, dtype=inpt_dtype, device=_DEVICE)
+    bias_tensor = torch.randn(n, dtype=inpt_dtype, device=_DEVICE) if bias else None
 
     # Compute reference
     C_ref = F.linear(A, B, bias_tensor)
@@ -511,12 +513,12 @@
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
 def test_nvfp4_to_copy():
-    x = NVFP4Tensor.to_nvfp4(torch.randn((32, 128))).cuda()
+    x = NVFP4Tensor.to_nvfp4(torch.randn((32, 128))).to(_DEVICE)
     y = torch.ops.aten._to_copy(x, dtype=torch.bfloat16)
     assert torch.equal(x.qdata, y.qdata)
     assert torch.equal(x.scale, y.scale)
@@ -531,7 +533,7 @@ def test_nvfp4_to_copy():
     assert y.dtype == torch.bfloat16
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
@@ -551,14 +553,14 @@
 def test_scale_shape_matches_qdata(
     transpose, use_triton_kernel, is_swizzled_scales, shape
 ):
-    if use_triton_kernel and not is_sm_at_least_100():
+    if use_triton_kernel and torch.cuda.is_available() and not is_sm_at_least_100():
         pytest.skip("CUDA capability >= 10.0 required for nvfp4 triton kernel")
 
     if use_triton_kernel and not is_swizzled_scales:
         pytest.skip("triton kernel requires swizzled scales")
 
     block_size = 16
-    x_hp = torch.randn(*shape, device="cuda")
+    x_hp = torch.randn(*shape, device=_DEVICE)
     x = NVFP4Tensor.to_nvfp4(
         x_hp, is_swizzled_scales=is_swizzled_scales, use_triton_kernel=use_triton_kernel
     )
@@ -599,14 +601,14 @@
     )
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
 @pytest.mark.skipif(
     not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
 )
 @pytest.mark.parametrize("dims", ((1, 2), (2, 1), (-1, -2), (-2, -1)))
 @pytest.mark.parametrize("is_swizzled_scales", [True, False])
 def test_3d_transpose(dims, is_swizzled_scales):
-    x_hp = torch.randn(2, 128, 256, device="cuda")
+    x_hp = torch.randn(2, 128, 256, device=_DEVICE)
     x_nvfp4 = NVFP4Tensor.to_nvfp4(x_hp, is_swizzled_scales=is_swizzled_scales)
     x_hp_t = x_hp.transpose(dims[0], dims[1])
     x_nvfp4_t = x_nvfp4.transpose(dims[0], dims[1])

From a2e50c8573ffd17a573e289e1ff41dc8c1c5a887 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:26:27 +0800
Subject: [PATCH 2/5] add test/prototype/mx_formats/test_nvfp4_tensor.py

---
 test/prototype/mx_formats/test_nvfp4_tensor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
index f0ececd1f0..75bb2a9049 100644
--- a/test/prototype/mx_formats/test_nvfp4_tensor.py
+++ b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -366,7 +366,7 @@ def test_nvfp4_swizzled_scales_get_scales_method():
 )
 @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16], ids=["fp32", "bf16"])
 @pytest.mark.skipif(
-    not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
+    torch.cuda.is_available() and not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
 )
 @torch.no_grad()
 def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
@@ -444,7 +444,7 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
 @torch.no_grad()
 @skip_if_rocm("ROCm float4 gemm require gfx950")
 @pytest.mark.skipif(
-    torch.cuda.is_available() and not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for fp4"
+    not is_sm_at_least_100(), reason="CUDA capability >= 10.0 required for fp4"
 )
 def test_nvfp4_matmul_with_amax(
     use_gelu: bool,

From 82c146ef6bf2c6c5f56c4e834d97ec881af46a66 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:29:33 +0800
Subject: [PATCH 3/5] add test/prototype/mx_formats/test_nvfp4_tensor.py

---
 test/prototype/mx_formats/test_nvfp4_tensor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
index 75bb2a9049..b4b59ff1d9 100644
--- a/test/prototype/mx_formats/test_nvfp4_tensor.py
+++ b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -356,7 +356,7 @@ def test_nvfp4_swizzled_scales_get_scales_method():
     assert swizzled_scales.shape == expected_shape
 
 
-@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize(
     "M", [128, 256, 512, 1024, 100, 200, 384], ids=lambda m: f"M{m}"
 )
@@ -366,7 +366,7 @@ def test_nvfp4_swizzled_scales_get_scales_method():
 )
 @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16], ids=["fp32", "bf16"])
 @pytest.mark.skipif(
-    torch.cuda.is_available() and not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
+    not is_sm_at_least_100(), reason="requires sm100+ for raw intrinsics"
 )
 @torch.no_grad()
 def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):

From 32e451a94dd651efd936762b2d1d5f93598b01c8 Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:39:46 +0800
Subject: [PATCH 4/5] test/prototype/test_spinquant.py

---
 test/prototype/test_spinquant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/prototype/test_spinquant.py b/test/prototype/test_spinquant.py
index 03f0c34e20..fb32c83729 100644
--- a/test/prototype/test_spinquant.py
+++ b/test/prototype/test_spinquant.py
@@ -16,7 +16,7 @@ def _init_model(name="7B", device="cpu", precision=torch.bfloat16):
     return model.eval()
 
 
-_AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])
+_AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) + (["xpu"] if torch.xpu.is_available() else [])
 
 
 @pytest.mark.parametrize("device", _AVAILABLE_DEVICES)

From b3ba0b5b80cbadf52aeb930bb7b59557d9f84dbd Mon Sep 17 00:00:00 2001
From: "Zeng, Xiangdong"
Date: Wed, 3 Dec 2025 14:43:26 +0800
Subject: [PATCH 5/5] fix format issue

---
 test/prototype/mx_formats/test_nvfp4_tensor.py | 8 ++++++--
 test/prototype/test_spinquant.py               | 6 +++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/test/prototype/mx_formats/test_nvfp4_tensor.py b/test/prototype/mx_formats/test_nvfp4_tensor.py
index b4b59ff1d9..2e40d8276f 100644
--- a/test/prototype/mx_formats/test_nvfp4_tensor.py
+++ b/test/prototype/mx_formats/test_nvfp4_tensor.py
@@ -22,9 +22,9 @@
 from torchao.quantization.utils import compute_error
 from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
+    get_current_accelerator_device,
     is_sm_at_least_100,
     torch_version_at_least,
-    get_current_accelerator_device,
 )
 
 torch.manual_seed(2)
@@ -456,7 +456,11 @@ def test_nvfp4_matmul_with_amax(
     shapes: tuple,
 ):
     # DYNAMIC mode requires SM100+, but WEIGHT_ONLY works on older GPUs
-    if quant_type == "dynamic" and torch.cuda.is_available() and not is_sm_at_least_100():
+    if (
+        quant_type == "dynamic"
+        and torch.cuda.is_available()
+        and not is_sm_at_least_100()
+    ):
         pytest.skip("CUDA capability >= 10.0 required for DYNAMIC float4 gemm")
 
     if bias and inpt_dtype == torch.float32:
diff --git a/test/prototype/test_spinquant.py b/test/prototype/test_spinquant.py
index fb32c83729..399bc2fa64 100644
--- a/test/prototype/test_spinquant.py
+++ b/test/prototype/test_spinquant.py
@@ -16,7 +16,11 @@ def _init_model(name="7B", device="cpu", precision=torch.bfloat16):
     return model.eval()
 
 
-_AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) + (["xpu"] if torch.xpu.is_available() else [])
+_AVAILABLE_DEVICES = (
+    ["cpu"]
+    + (["cuda"] if torch.cuda.is_available() else [])
+    + (["xpu"] if torch.xpu.is_available() else [])
+)
 
 
 @pytest.mark.parametrize("device", _AVAILABLE_DEVICES)
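
Note on the helper these patches rely on (a reviewer sketch, not part of the series): the tests replace hardcoded device="cuda" with a module-level _DEVICE taken from torchao.utils.get_current_accelerator_device(), and gate GPU-only tests on torch.accelerator.is_available(). Assuming the PyTorch 2.6+ torch.accelerator API, a minimal stand-in for such a helper could look like the following; the actual torchao implementation may differ in details and fallbacks.

# Hypothetical sketch only, not the torchao implementation.
import torch


def _get_current_accelerator_device() -> torch.device:
    # torch.accelerator (PyTorch 2.6+) abstracts over CUDA, XPU, MPS, etc.
    if hasattr(torch, "accelerator") and torch.accelerator.is_available():
        return torch.accelerator.current_accelerator()
    return torch.device("cpu")


_DEVICE = _get_current_accelerator_device()
x = torch.randn(32, 64, device=_DEVICE)  # lands on cuda, xpu, or cpu as available

With a helper along these lines, the same test body runs unchanged on CUDA and XPU machines, while the remaining torch.cuda.is_available() and is_sm_at_least_100() checks keep the SM100-specific float4 matmul paths CUDA-only.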