This file is automatically generated by assertExpectedJournal calls in test_specialize.py.
Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.

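For example, assuming the suite is driven with pytest, the journal can be regenerated with:

    EXPECTTEST_ACCEPT=1 python -m pytest test_specialize.py
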
+--- assertExpectedJournal(TestMarkStatic.test_mark_static)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_matmul(x, y, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1, y_stride_0, y_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    num_blocks_0 = tl.cdiv(64, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < 64
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < 56
+    # src[test_specialize.py:N]: acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+    acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
+    # src[test_specialize.py:N]: for tile_k in hl.tile(k):
+    # src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+    symnode_0 = 128
+    for offset_2 in tl.range(0, symnode_0, _BLOCK_SIZE_2):
+        indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_2).to(tl.int32)
+        mask_2 = indices_2 < symnode_0
+        acc_copy = acc
+        acc_copy_0 = acc_copy
+        # src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+        load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_2[None, :] * x_stride_1), mask_0[:, None] & mask_2[None, :], other=0)
+        load_1 = tl.load(y + (indices_2[:, None] * y_stride_0 + indices_1[None, :] * y_stride_1), mask_2[:, None] & mask_1[None, :], other=0)
+        acc = tl.dot(tl.cast(load, tl.float16), tl.cast(load_1, tl.float16), acc=acc_copy_0, input_precision='tf32', out_dtype=tl.float32)
+    # src[test_specialize.py:N]: out[tile_m, tile_n] = acc.to(x.dtype)
+    v_0 = tl.cast(acc, tl.float16)
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_0, mask_0[:, None] & mask_1[None, :])
+
+def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_specialize.py:N]: m, k = x.size()
+    m, k = x.size()
+    # src[test_specialize.py:N]: k2, n = y.size()
+    k2, n = y.size()
+    # src[test_specialize.py:N]: out = torch.empty([m, n], device=x.device, dtype=x.dtype)
+    out = torch.empty([m, n], device=x.device, dtype=x.dtype)
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    # src[test_specialize.py:N]: for tile_k in hl.tile(k):
+    # src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+    _BLOCK_SIZE_2 = 32
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    # src[test_specialize.py:N]: acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+    # src[test_specialize.py:N]: for tile_k in hl.tile(k):
+    # src[test_specialize.py:N-N]: ...
+    _launcher(_helion_matmul, (triton.cdiv(64, _BLOCK_SIZE_0) * triton.cdiv(56, _BLOCK_SIZE_1),), x, y, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), y.stride(0), y.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=1)
+    # src[test_specialize.py:N]: return out
+    return out
+
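A minimal usage sketch for the entry above, assuming a CUDA device and float16 inputs of
the shapes this kernel was specialized for (64, 56, and 128 are baked into the generated
grid, masks, and loop bound, so other shapes would require recompilation):

    x = torch.randn([64, 128], device="cuda", dtype=torch.float16)
    y = torch.randn([128, 56], device="cuda", dtype=torch.float16)
    out = matmul(x, y)  # (64, 56) float16 result, accumulated in float32
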
+--- assertExpectedJournal(TestMarkStatic.test_mark_static_and_hl_specialize)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_fn(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    # src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+    num_blocks_0 = tl.cdiv(320, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < 640
+    # src[test_specialize.py:N]: out[tile] = x[tile] * 2
+    load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_1[None, :], other=0)
+    v_0 = 2.0
+    v_1 = load * v_0
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_1, mask_1[None, :])
+
+def fn(x: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_specialize.py:N]: out = torch.empty_like(x)
+    out = torch.empty_like(x)
+    # src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+    _BLOCK_SIZE_0 = 16
+    _BLOCK_SIZE_1 = 16
+    # src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+    # src[test_specialize.py:N]: out[tile] = x[tile] * 2
+    _launcher(_helion_fn, (triton.cdiv(320, _BLOCK_SIZE_0) * triton.cdiv(640, _BLOCK_SIZE_1),), x, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=1)
+    # src[test_specialize.py:N]: return out
+    return out
+
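A minimal usage sketch for the entry above, assuming a CUDA device and the specialized
320x640 input (both extents are hard-coded into the generated grid and mask):

    x = torch.randn([320, 640], device="cuda")
    out = fn(x)  # elementwise x * 2, same shape and dtype as x
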
--- assertExpectedJournal(TestSpecialize.test_dynamic_size_block_non_power_of_two)
from __future__ import annotations
