@@ -327,5 +327,166 @@ def foo(x: torch.Tensor, bitshift: tuple[int, int]) -> torch.Tensor:
         self.assertExpectedJournal(code)
 
 
+@skipIfCpu("needs to be debugged")
+class TestSpecializeArgs(RefEagerTestBase, TestCase):
+    """Tests for kernel.specialize_args() external specialization API."""
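+    # Behavior these tests assume for the external API: kernel.specialize_args(name=[dims])
+    # returns a new kernel whose listed tensor dims are treated as compile-time constants in
+    # the generated code; negative dims index from the end, chained calls and internal
+    # hl.specialize() calls are unioned, and bound-kernel caching keys on the specialized values.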
+
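+    # Raise unittest's diff-truncation limit so failing generated-code comparisons print in full.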
+    maxDiff = 163842
+
+    def test_specialize_args(self):
+        """Test specialize_args: multiple tensors, multiple dims, negative indexing."""
+
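+        # static_shapes=False should keep tensor sizes symbolic here, so shape literals only
+        # appear in the generated code when pulled in via specialize_args (asserted below).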
+        @helion.kernel(autotune_effort="none", static_shapes=False)
+        def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            m, k = x.size()
+            k2, n = y.size()
+            out = torch.empty([m, n], device=x.device, dtype=x.dtype)
+            for tile_m, tile_n in hl.tile([m, n]):
+                acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+                for tile_k in hl.tile(k):
+                    acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+                out[tile_m, tile_n] = acc.to(x.dtype)
+            return out
+
+        m, k, n = 64, 128, 56
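+        # Distinct, easy-to-grep sizes so their presence/absence in the generated code can be asserted.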
+        x = torch.randn([m, k], device=DEVICE, dtype=torch.float16)
+        y = torch.randn([k, n], device=DEVICE, dtype=torch.float16)
+
+        # First, run WITHOUT specialize_args - dimensions should NOT be constants
+        code_no_spec, result_no_spec = code_and_output(
+            matmul,
+            (x, y),
+            block_sizes=[32, 32, 32],
+        )
+        torch.testing.assert_close(result_no_spec, x @ y, rtol=1e-2, atol=1e-2)
+        self.assertNotIn("64", code_no_spec)  # x dim 0 = m should NOT be specialized
+        self.assertNotIn("128", code_no_spec)  # x dim -1 = k should NOT be specialized
+        self.assertNotIn("56", code_no_spec)  # y dim 1 = n should NOT be specialized
+
+        # Now, run WITH specialize_args - dimensions SHOULD be constants
+        code, result = code_and_output(
+            matmul.specialize_args(x=[0, -1], y=[1]),
+            (x, y),
+            block_sizes=[32, 32, 32],
+        )
+        torch.testing.assert_close(result, x @ y, rtol=1e-2, atol=1e-2)
+        self.assertIn("64", code)  # x dim 0 = m
+        self.assertIn("128", code)  # x dim -1 = k
+        self.assertIn("56", code)  # y dim 1 = n
+        self.assertExpectedJournal(code)
+
+        # Verify cache behavior: same specialized values hit cache
+        specialized_kernel = matmul.specialize_args(x=[0, -1], y=[1])
+        self.assertIs(
+            specialized_kernel.bind((x, y)), specialized_kernel.bind((x, y))
+        )
+        # Verify cache behavior: different specialized values produce different bound kernels
+        x2 = torch.randn([48, 96], device=DEVICE, dtype=torch.float16)
+        y2 = torch.randn([96, 24], device=DEVICE, dtype=torch.float16)
+        self.assertIsNot(
+            specialized_kernel.bind((x, y)), specialized_kernel.bind((x2, y2))
+        )
+
+    def test_specialize_args_and_hl_specialize(self):
+        """Test that external specialize_args and internal hl.specialize form a union."""
+
+        @helion.kernel(autotune_effort="none", static_shapes=False)
+        def dual_specialize(x: torch.Tensor) -> torch.Tensor:
+            # Internal specialize on dim 0
+            hl.specialize(x.size(0))
+            out = torch.empty_like(x)
+            for tile in hl.tile(x.size()):
+                out[tile] = x[tile] * 2
+            return out
+
+        x = torch.randn([320, 640], device=DEVICE)
+
+        # First, run WITHOUT external specialize_args - only dim 0 should be specialized
+        code_no_spec, result_no_spec = code_and_output(
+            dual_specialize,
+            (x,),
+            block_sizes=[16, 16],
+        )
+        torch.testing.assert_close(result_no_spec, x * 2)
+        self.assertIn("320", code_no_spec)  # dim 0 from internal specialize
+        self.assertNotIn("640", code_no_spec)  # dim 1 should NOT be specialized
+
+        # Now, run WITH external specialize_args on dim -1 (dim 1)
+        # Result: both dim 0 AND dim 1 are specialized (union)
+        code, result = code_and_output(
+            dual_specialize.specialize_args(x=[-1]),
+            (x,),
+            block_sizes=[16, 16],
+        )
+        torch.testing.assert_close(result, x * 2)
+        # Both dimensions should appear as constants
+        self.assertIn("320", code)  # dim 0 from internal specialize
+        self.assertIn("640", code)  # dim 1 from external specialize
+        self.assertExpectedJournal(code)
+
+        # Verify cache behavior: changing dim 1 (external) produces different bound kernel
+        x2 = torch.randn([320, 128], device=DEVICE)  # same dim 0, different dim 1
+        specialized_kernel = dual_specialize.specialize_args(x=[-1])
+        self.assertIsNot(specialized_kernel.bind((x,)), specialized_kernel.bind((x2,)))
+
+    @skipIfRefEager("Error checking not available in ref eager mode")
+    def test_specialize_args_errors(self):
+        """Test error handling for invalid specialize_args usage."""
+
+        @helion.kernel(autotune_effort="none", static_shapes=False)
+        def fn(x: torch.Tensor) -> torch.Tensor:
+            out = torch.empty_like(x)
+            for tile in hl.tile(x.size()):
+                out[tile] = x[tile]
+            return out
+
+        x = torch.randn([32, 64], device=DEVICE)  # 2D tensor
+
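+        # As written below, an out-of-range dim is only expected to be rejected once the
+        # kernel is invoked with a concrete tensor, whereas an unknown argument name is
+        # expected to fail eagerly at specialize_args() time.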
+        # Error: dim out of range
+        with self.assertRaises((IndexError, ValueError)):
+            fn.specialize_args(x=[5])(x)
+
+        # Error: unknown argument name
+        with self.assertRaises(ValueError) as cm:
+            fn.specialize_args(z=[-1])
+        self.assertIn("Unknown argument", str(cm.exception))
+
+    def test_specialize_args_chaining(self):
+        """Test that chained specialize_args calls merge specializations."""
+
+        @helion.kernel(autotune_effort="none", static_shapes=False)
+        def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+            m, n = x.size()
+            p = y.size(1)  # use y's dim 1 as a scalar
+            out = x.new_empty([m, n])
+            for tile_m, tile_n in hl.tile([m, n]):
+                out[tile_m, tile_n] = x[tile_m, tile_n] * p
+            return out
+
+        x = torch.randn([37, 64], device=DEVICE)
+        y = torch.randn([48, 127], device=DEVICE)
+
+        # First, run WITHOUT specialize_args - dimensions should NOT be constants
+        code_no_spec, result_no_spec = code_and_output(fn, (x, y), block_sizes=[16, 16])
+        torch.testing.assert_close(result_no_spec, x * 127)
+        self.assertNotIn("37", code_no_spec)  # x dim 0 should NOT be specialized
+        self.assertNotIn("127", code_no_spec)  # y dim 1 should NOT be specialized
+
+        # Now, chain two specialize_args calls - both should be preserved
+        chained = fn.specialize_args(x=[0]).specialize_args(y=[1])
+
+        code, result = code_and_output(chained, (x, y), block_sizes=[16, 16])
+        torch.testing.assert_close(result, x * 127)
+        # Both specializations should be present
+        self.assertIn("37", code)  # x dim 0
+        self.assertIn("127", code)  # y dim 1
+        self.assertExpectedJournal(code)
+
+        # Verify cache behavior: changing specialized values produces different bound kernels
+        x2 = torch.randn([48, 64], device=DEVICE)  # different dim 0
+        y2 = torch.randn([48, 256], device=DEVICE)  # different dim 1
+        self.assertIsNot(chained.bind((x, y)), chained.bind((x2, y2)))
+
+
 if __name__ == "__main__":
     unittest.main()