
Commit 65e1146

Commit message: test
Parent: 7caeaa2

4 files changed: +291 -32 lines changed

test/test_examples.expected

Lines changed: 0 additions & 16 deletions
@@ -460,27 +460,11 @@ def attention(q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor, *, _la
 _RDIM_SIZE_2 = 64
 # src[attention.py:N]: m_i = hl.full([tile_b, tile_m], float("-inf"), dtype=torch.float32)
 _BLOCK_SIZE_0 = 1
-# src[attention.py:N]: q = q_view[tile_b, tile_m, :]
-_SHAPE_DIM = q_in.size(3)
-_SHAPE_DIM_1 = q_in.size(3)
-_SHAPE_DIM_2 = q_in.size(3)
 # src[attention.py:N]: for tile_n in hl.tile(v_view.size(1)):
 # src[attention.py:N]: k = k_view[tile_b, :, tile_n]
 # src[attention.py:N]: qk = torch.bmm(q, k)
 # src[attention.py:N-N]: ...
 _BLOCK_SIZE_3 = 32
-# src[attention.py:N]: k = k_view[tile_b, :, tile_n]
-_SHAPE_DIM_3 = q_in.size(3)
-_SHAPE_DIM_4 = q_in.size(3)
-_SHAPE_DIM_5 = q_in.size(3)
-# src[attention.py:N]: v = v_view[tile_b, tile_n, :]
-_SHAPE_DIM_6 = q_in.size(3)
-_SHAPE_DIM_7 = q_in.size(3)
-_SHAPE_DIM_8 = q_in.size(3)
-# src[attention.py:N]: out[tile_b, tile_m, :] = acc.to(out.dtype)
-_SHAPE_DIM_9 = q_in.size(3)
-_SHAPE_DIM_10 = q_in.size(3)
-_SHAPE_DIM_11 = q_in.size(3)
 # src[attention.py:N]: for tile_b, tile_m in hl.tile([q_view.size(0), m_dim]):
 # src[attention.py:N]: m_i = hl.full([tile_b, tile_m], float("-inf"), dtype=torch.float32)
 # src[attention.py:N]: l_i = torch.full_like(m_i, 1.0)

test/test_specialize.expected

Lines changed: 139 additions & 0 deletions
@@ -1132,3 +1132,142 @@ def reduce_kernel(x: torch.Tensor, tensor_factory_fn, test_host, *, _launcher=_d
 _launcher(_helion_reduce_kernel, (triton.cdiv(128, _BLOCK_SIZE_0),), x, grad_weight, _BLOCK_SIZE_0, _RDIM_SIZE_2, _BLOCK_SIZE_1, num_warps=4, num_stages=1)
 # src[test_specialize.py:N]: return grad_weight.sum(0).to(x.dtype)
 return grad_weight.sum(0).to(x.dtype)
+
+--- assertExpectedJournal(TestMarkStatic.test_mark_static)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_matmul(x, y, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1, y_stride_0, y_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+# src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+num_blocks_0 = tl.cdiv(64, _BLOCK_SIZE_0)
+pid_0 = tl.program_id(0) % num_blocks_0
+pid_1 = tl.program_id(0) // num_blocks_0
+offset_0 = pid_0 * _BLOCK_SIZE_0
+indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+mask_0 = indices_0 < 64
+offset_1 = pid_1 * _BLOCK_SIZE_1
+indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+mask_1 = indices_1 < 56
+# src[test_specialize.py:N]: acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
+# src[test_specialize.py:N]: for tile_k in hl.tile(k):
+# src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+symnode_0 = 128
+for offset_2 in tl.range(0, symnode_0.to(tl.int32), _BLOCK_SIZE_2):
+indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_2).to(tl.int32)
+mask_2 = indices_2 < symnode_0
+acc_copy = acc
+acc_copy_0 = acc_copy
+# src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_2[None, :] * x_stride_1), mask_0[:, None] & mask_2[None, :], other=0)
+load_1 = tl.load(y + (indices_2[:, None] * y_stride_0 + indices_1[None, :] * y_stride_1), mask_2[:, None] & mask_1[None, :], other=0)
+acc = tl.dot(tl.cast(load, tl.float16), tl.cast(load_1, tl.float16), acc=acc_copy_0, input_precision='tf32', out_dtype=tl.float32)
+# src[test_specialize.py:N]: out[tile_m, tile_n] = acc.to(x.dtype)
+v_0 = tl.cast(acc, tl.float16)
+tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_0, mask_0[:, None] & mask_1[None, :])
+
+def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+# src[test_specialize.py:N]: m, k = x.size()
+m, k = x.size()
+# src[test_specialize.py:N]: k2, n = y.size()
+k2, n = y.size()
+# src[test_specialize.py:N]: out = torch.empty([m, n], device=x.device, dtype=x.dtype)
+out = torch.empty([m, n], device=x.device, dtype=x.dtype)
+# src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+_BLOCK_SIZE_0 = 32
+_BLOCK_SIZE_1 = 32
+# src[test_specialize.py:N]: for tile_k in hl.tile(k):
+# src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+_BLOCK_SIZE_2 = 32
+# src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+# src[test_specialize.py:N]: acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+# src[test_specialize.py:N]: for tile_k in hl.tile(k):
+# src[test_specialize.py:N-N]: ...
+_launcher(_helion_matmul, (triton.cdiv(64, _BLOCK_SIZE_0) * triton.cdiv(56, _BLOCK_SIZE_1),), x, y, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), y.stride(0), y.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=1)
+# src[test_specialize.py:N]: return out
+return out
+
+--- assertExpectedJournal(TestMarkStatic.test_mark_static_and_hl_specialize)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_dual_specialize(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+# src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+num_blocks_0 = tl.cdiv(320, _BLOCK_SIZE_0)
+pid_0 = tl.program_id(0) % num_blocks_0
+pid_1 = tl.program_id(0) // num_blocks_0
+offset_0 = pid_0 * _BLOCK_SIZE_0
+indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+offset_1 = pid_1 * _BLOCK_SIZE_1
+indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+mask_1 = indices_1 < 640
+# src[test_specialize.py:N]: out[tile] = x[tile] * 2
+load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_1[None, :], other=0)
+v_0 = 2.0
+v_1 = load * v_0
+tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_1, mask_1[None, :])
+
+def dual_specialize(x: torch.Tensor, *, _launcher=_default_launcher):
+# src[test_specialize.py:N]: out = torch.empty_like(x)
+out = torch.empty_like(x)
+# src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+_BLOCK_SIZE_0 = 16
+_BLOCK_SIZE_1 = 16
+# src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+# src[test_specialize.py:N]: out[tile] = x[tile] * 2
+_launcher(_helion_dual_specialize, (triton.cdiv(320, _BLOCK_SIZE_0) * triton.cdiv(640, _BLOCK_SIZE_1),), x, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=1)
+# src[test_specialize.py:N]: return out
+return out
+
+--- assertExpectedJournal(TestMarkStatic.test_mark_static_multiple_tensors)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_fn(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1, n, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+# src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+num_blocks_0 = tl.cdiv(37, _BLOCK_SIZE_0)
+pid_0 = tl.program_id(0) % num_blocks_0
+pid_1 = tl.program_id(0) // num_blocks_0
+offset_0 = pid_0 * _BLOCK_SIZE_0
+indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+mask_0 = indices_0 < 37
+offset_1 = pid_1 * _BLOCK_SIZE_1
+indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+mask_1 = indices_1 < n
+# src[test_specialize.py:N]: out[tile_m, tile_n] = x[tile_m, tile_n] * p
+load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+symnode_0 = 127
+v_0 = tl.cast(symnode_0, tl.float32)
+v_1 = load * v_0
+tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_1, mask_0[:, None] & mask_1[None, :])
+
+def fn(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+# src[test_specialize.py:N]: m, n = x.size()
+m, n = x.size()
+# src[test_specialize.py:N]: p = y.size(1) # use y's dim 1 as a scalar
+p = y.size(1)
+# src[test_specialize.py:N]: out = x.new_empty([m, n])
+out = x.new_empty([m, n])
+# src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+_BLOCK_SIZE_0 = 16
+_BLOCK_SIZE_1 = 16
+# src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+# src[test_specialize.py:N]: out[tile_m, tile_n] = x[tile_m, tile_n] * p
+_launcher(_helion_fn, (triton.cdiv(37, _BLOCK_SIZE_0) * triton.cdiv(n, _BLOCK_SIZE_1),), x, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), n, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=1)
+# src[test_specialize.py:N]: return out
+return out
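For reference, the three journals above come from inputs whose dimensions were marked static before compilation, which is why 64, 128, 56, 320, 640, 37, and 127 appear as literal constants in the generated kernels and launchers. A minimal sketch of the marking step only (the "cuda" device string is an assumption; the shapes mirror the test inputs below):

import torch

x = torch.randn([64, 128], device="cuda", dtype=torch.float16)
y = torch.randn([128, 56], device="cuda", dtype=torch.float16)
# Specialize m (dim 0) and k (dim -1) of x, plus n (dim 1) of y, so the
# compiled code can treat those sizes as compile-time constants.
torch._dynamo.mark_static(x, [0, -1])
torch._dynamo.mark_static(y, 1)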

test/test_specialize.py

Lines changed: 152 additions & 0 deletions
@@ -327,5 +327,157 @@ def foo(x: torch.Tensor, bitshift: tuple[int, int]) -> torch.Tensor:
 self.assertExpectedJournal(code)


+@skipIfCpu("needs to be debugged")
+class TestMarkStatic(RefEagerTestBase, TestCase):
+"""Tests for torch._dynamo.mark_static() external specialization API."""
+
+maxDiff = 163842
+
+def test_mark_static(self):
+"""Test mark_static: multiple tensors, multiple dims, negative indexing."""
+
+@helion.kernel(autotune_effort="none", static_shapes=False)
+def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+m, k = x.size()
+k2, n = y.size()
+out = torch.empty([m, n], device=x.device, dtype=x.dtype)
+for tile_m, tile_n in hl.tile([m, n]):
+acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+for tile_k in hl.tile(k):
+acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+out[tile_m, tile_n] = acc.to(x.dtype)
+return out
+
+m, k, n = 64, 128, 56
+x = torch.randn([m, k], device=DEVICE, dtype=torch.float16)
+y = torch.randn([k, n], device=DEVICE, dtype=torch.float16)
+
+# First, run WITHOUT mark_static - dimensions should NOT be constants
+code_no_spec, result_no_spec = code_and_output(
+matmul,
+(x, y),
+block_sizes=[32, 32, 32],
+)
+torch.testing.assert_close(result_no_spec, x @ y, rtol=1e-2, atol=1e-2)
+self.assertNotIn("64", code_no_spec) # x dim 0 = m should NOT be specialized
+self.assertNotIn("128", code_no_spec) # x dim -1 = k should NOT be specialized
+self.assertNotIn("56", code_no_spec) # y dim 1 = n should NOT be specialized
+
+# Now, run WITH mark_static - dimensions SHOULD be constants
+# Create fresh tensors and mark them static
+x_static = torch.randn([m, k], device=DEVICE, dtype=torch.float16)
+y_static = torch.randn([k, n], device=DEVICE, dtype=torch.float16)
+torch._dynamo.mark_static(x_static, [0, -1])
+torch._dynamo.mark_static(y_static, 1)
+
+code, result = code_and_output(
+matmul,
+(x_static, y_static),
+block_sizes=[32, 32, 32],
+)
+torch.testing.assert_close(result, x_static @ y_static, rtol=1e-2, atol=1e-2)
+self.assertIn("64", code) # x dim 0 = m
+self.assertIn("128", code) # x dim -1 = k
+self.assertIn("56", code) # y dim 1 = n
+self.assertExpectedJournal(code)
+
+# Verify cache behavior: same specialized values hit cache
+self.assertIs(matmul.bind((x_static, y_static)), matmul.bind((x_static, y_static)))
+# Verify cache behavior: different specialized values produce different bound kernels
+x2 = torch.randn([48, 96], device=DEVICE, dtype=torch.float16)
+y2 = torch.randn([96, 24], device=DEVICE, dtype=torch.float16)
+torch._dynamo.mark_static(x2, [0, -1])
+torch._dynamo.mark_static(y2, 1)
+self.assertIsNot(
+matmul.bind((x_static, y_static)), matmul.bind((x2, y2))
+)
+
+def test_mark_static_and_hl_specialize(self):
+"""Test that external mark_static and internal hl.specialize form a union."""
+
+@helion.kernel(autotune_effort="none", static_shapes=False)
+def dual_specialize(x: torch.Tensor) -> torch.Tensor:
+# Internal specialize on dim 0
+hl.specialize(x.size(0))
+out = torch.empty_like(x)
+for tile in hl.tile(x.size()):
+out[tile] = x[tile] * 2
+return out
+
+x = torch.randn([320, 640], device=DEVICE)
+
+# First, run WITHOUT external mark_static - only dim 0 should be specialized
+code_no_spec, result_no_spec = code_and_output(
+dual_specialize,
+(x,),
+block_sizes=[16, 16],
+)
+torch.testing.assert_close(result_no_spec, x * 2)
+self.assertIn("320", code_no_spec) # dim 0 from internal specialize
+self.assertNotIn("640", code_no_spec) # dim 1 should NOT be specialized
+
+# Now, run WITH external mark_static on dim -1 (dim 1)
+# Result: both dim 0 AND dim 1 are specialized (union)
+x_static = torch.randn([320, 640], device=DEVICE)
+torch._dynamo.mark_static(x_static, -1)
+
+code, result = code_and_output(
+dual_specialize,
+(x_static,),
+block_sizes=[16, 16],
+)
+torch.testing.assert_close(result, x_static * 2)
+# Both dimensions should appear as constants
+self.assertIn("320", code) # dim 0 from internal specialize
+self.assertIn("640", code) # dim 1 from external mark_static
+self.assertExpectedJournal(code)
+
+# Verify cache behavior: changing dim 1 (external) produces different bound kernel
+x2 = torch.randn([320, 128], device=DEVICE) # same dim 0, different dim 1
+torch._dynamo.mark_static(x2, -1)
+self.assertIsNot(dual_specialize.bind((x_static,)), dual_specialize.bind((x2,)))
+
+def test_mark_static_multiple_tensors(self):
+"""Test mark_static on multiple tensors."""
+
+@helion.kernel(autotune_effort="none", static_shapes=False)
+def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+m, n = x.size()
+p = y.size(1) # use y's dim 1 as a scalar
+out = x.new_empty([m, n])
+for tile_m, tile_n in hl.tile([m, n]):
+out[tile_m, tile_n] = x[tile_m, tile_n] * p
+return out
+
+x = torch.randn([37, 64], device=DEVICE)
+y = torch.randn([48, 127], device=DEVICE)
+
+# First, run WITHOUT mark_static - dimensions should NOT be constants
+code_no_spec, result_no_spec = code_and_output(fn, (x, y), block_sizes=[16, 16])
+torch.testing.assert_close(result_no_spec, x * 127)
+self.assertNotIn("37", code_no_spec) # x dim 0 should NOT be specialized
+self.assertNotIn("127", code_no_spec) # y dim 1 should NOT be specialized
+
+# Now, mark both tensors static
+x_static = torch.randn([37, 64], device=DEVICE)
+y_static = torch.randn([48, 127], device=DEVICE)
+torch._dynamo.mark_static(x_static, 0)
+torch._dynamo.mark_static(y_static, 1)
+
+code, result = code_and_output(fn, (x_static, y_static), block_sizes=[16, 16])
+torch.testing.assert_close(result, x_static * 127)
+# Both specializations should be present
+self.assertIn("37", code) # x dim 0
+self.assertIn("127", code) # y dim 1
+self.assertExpectedJournal(code)
+
+# Verify cache behavior: changing specialized values produces different bound kernels
+x2 = torch.randn([48, 64], device=DEVICE) # different dim 0
+y2 = torch.randn([48, 256], device=DEVICE) # different dim 1
+torch._dynamo.mark_static(x2, 0)
+torch._dynamo.mark_static(y2, 1)
+self.assertIsNot(fn.bind((x_static, y_static)), fn.bind((x2, y2)))
+
+
 if __name__ == "__main__":
 unittest.main()
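The new tests above exercise the external specialization flow end to end. A minimal standalone sketch of the union behavior from test_mark_static_and_hl_specialize, assuming a CUDA device and the usual import helion.language as hl convention (the kernel body is copied from the test, not a library helper):

import torch
import helion
import helion.language as hl

@helion.kernel(autotune_effort="none", static_shapes=False)
def dual_specialize(x: torch.Tensor) -> torch.Tensor:
    hl.specialize(x.size(0))  # internal specialization: dim 0 is always a constant
    out = torch.empty_like(x)
    for tile in hl.tile(x.size()):
        out[tile] = x[tile] * 2
    return out

a = torch.randn([320, 640], device="cuda")
torch._dynamo.mark_static(a, -1)  # external specialization: dim 1 becomes a constant too
dual_specialize(a)                # generated code bakes in both 320 and 640

b = torch.randn([320, 128], device="cuda")
torch._dynamo.mark_static(b, -1)
# A different specialized value for dim 1 yields a different bound kernel (cache miss).
assert dual_specialize.bind((a,)) is not dual_specialize.bind((b,))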

test/test_tensor_descriptor.expected

Lines changed: 0 additions & 16 deletions
@@ -123,27 +123,11 @@ def attention(q_in: torch.Tensor, k_in: torch.Tensor, v_in: torch.Tensor, *, _la
 _RDIM_SIZE_2 = 64
 # src[attention.py:N]: m_i = hl.full([tile_b, tile_m], float("-inf"), dtype=torch.float32)
 _BLOCK_SIZE_0 = 1
-# src[attention.py:N]: q = q_view[tile_b, tile_m, :]
-_SHAPE_DIM = q_in.size(3)
-_SHAPE_DIM_1 = q_in.size(3)
-_SHAPE_DIM_2 = q_in.size(3)
 # src[attention.py:N]: for tile_n in hl.tile(v_view.size(1)):
 # src[attention.py:N]: k = k_view[tile_b, :, tile_n]
 # src[attention.py:N]: qk = torch.bmm(q, k)
 # src[attention.py:N-N]: ...
 _BLOCK_SIZE_3 = 16
-# src[attention.py:N]: k = k_view[tile_b, :, tile_n]
-_SHAPE_DIM_3 = q_in.size(3)
-_SHAPE_DIM_4 = q_in.size(3)
-_SHAPE_DIM_5 = q_in.size(3)
-# src[attention.py:N]: v = v_view[tile_b, tile_n, :]
-_SHAPE_DIM_6 = q_in.size(3)
-_SHAPE_DIM_7 = q_in.size(3)
-_SHAPE_DIM_8 = q_in.size(3)
-# src[attention.py:N]: out[tile_b, tile_m, :] = acc.to(out.dtype)
-_SHAPE_DIM_9 = q_in.size(3)
-_SHAPE_DIM_10 = q_in.size(3)
-_SHAPE_DIM_11 = q_in.size(3)
 # src[attention.py:N]: for tile_b, tile_m in hl.tile([q_view.size(0), m_dim]):
 # src[attention.py:N]: m_i = hl.full([tile_b, tile_m], float("-inf"), dtype=torch.float32)
 # src[attention.py:N]: l_i = torch.full_like(m_i, 1.0)
