@@ -334,7 +334,7 @@ class TestSpecializeArgs(RefEagerTestBase, TestCase):
     maxDiff = 163842

     def test_specialize_args(self):
-        """Test specialize_args: multiple tensors, multiple dims, negative indexing."""
+        """Test specialize_args(): multiple tensors, multiple dims, negative indexing."""

         @helion.kernel(autotune_effort="none", static_shapes=False)
         def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
@@ -352,7 +352,7 @@ def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         x = torch.randn([m, k], device=DEVICE, dtype=torch.float16)
         y = torch.randn([k, n], device=DEVICE, dtype=torch.float16)

-        # First, run WITHOUT specialize_args - dimensions should NOT be constants
+        # First, run WITHOUT specialize_args() - dimensions should NOT be constants
         code_no_spec, result_no_spec = code_and_output(
             matmul,
             (x, y),
@@ -363,7 +363,7 @@ def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         self.assertNotIn("128", code_no_spec)  # x dim -1 = k should NOT be specialized
         self.assertNotIn("56", code_no_spec)  # y dim 1 = n should NOT be specialized

-        # Now, run WITH specialize_args - dimensions SHOULD be constants
+        # Now, run WITH specialize_args() - dimensions SHOULD be constants
         code, result = code_and_output(
             matmul.specialize_args(x=[0, -1], y=[1]),
             (x, y),
@@ -386,7 +386,7 @@ def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         )

     def test_specialize_args_and_hl_specialize(self):
-        """Test that external specialize_args and internal hl.specialize form a union."""
+        """Test that external specialize_args() and internal hl.specialize() form a union."""

         @helion.kernel(autotune_effort="none", static_shapes=False)
         def dual_specialize(x: torch.Tensor) -> torch.Tensor:
@@ -399,7 +399,7 @@ def dual_specialize(x: torch.Tensor) -> torch.Tensor:

         x = torch.randn([320, 640], device=DEVICE)

-        # First, run WITHOUT external specialize_args - only dim 0 should be specialized
+        # First, run WITHOUT external specialize_args() - only dim 0 should be specialized
         code_no_spec, result_no_spec = code_and_output(
             dual_specialize,
             (x,),
@@ -409,7 +409,7 @@ def dual_specialize(x: torch.Tensor) -> torch.Tensor:
         self.assertIn("320", code_no_spec)  # dim 0 from internal specialize
         self.assertNotIn("640", code_no_spec)  # dim 1 should NOT be specialized

-        # Now, run WITH external specialize_args on dim -1 (dim 1)
+        # Now, run WITH external specialize_args() on dim -1 (dim 1)
         # Result: both dim 0 AND dim 1 are specialized (union)
         code, result = code_and_output(
             dual_specialize.specialize_args(x=[-1]),
@@ -429,7 +429,7 @@ def dual_specialize(x: torch.Tensor) -> torch.Tensor:

     @skipIfRefEager("Error checking not available in ref eager mode")
     def test_specialize_args_errors(self):
-        """Test error handling for invalid specialize_args usage."""
+        """Test error handling for invalid specialize_args() usage."""

         @helion.kernel(autotune_effort="none", static_shapes=False)
         def fn(x: torch.Tensor) -> torch.Tensor:
@@ -450,7 +450,7 @@ def fn(x: torch.Tensor) -> torch.Tensor:
         self.assertIn("Unknown argument", str(cm.exception))

     def test_specialize_args_chaining(self):
-        """Test that chained specialize_args calls merge specializations."""
+        """Test that chained specialize_args() calls merge specializations."""

         @helion.kernel(autotune_effort="none", static_shapes=False)
         def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
@@ -464,13 +464,13 @@ def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         x = torch.randn([37, 64], device=DEVICE)
         y = torch.randn([48, 127], device=DEVICE)

-        # First, run WITHOUT specialize_args - dimensions should NOT be constants
+        # First, run WITHOUT specialize_args() - dimensions should NOT be constants
         code_no_spec, result_no_spec = code_and_output(fn, (x, y), block_sizes=[16, 16])
         torch.testing.assert_close(result_no_spec, x * 127)
         self.assertNotIn("37", code_no_spec)  # x dim 0 should NOT be specialized
         self.assertNotIn("127", code_no_spec)  # y dim 1 should NOT be specialized

-        # Now, chain two specialize_args calls - both should be preserved
+        # Now, chain two specialize_args() calls - both should be preserved
         chained = fn.specialize_args(x=[0]).specialize_args(y=[1])

         code, result = code_and_output(chained, (x, y), block_sizes=[16, 16])
@@ -485,6 +485,92 @@ def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         y2 = torch.randn([48, 256], device=DEVICE)  # different dim 1
         self.assertIsNot(chained.bind((x, y)), chained.bind((x2, y2)))

+    def test_specialize_args_does_not_mutate_original(self):
+        """
+        Test that specialize_args() returns a new kernel and does not mutate the original.
+        This test explicitly verifies:
+        1. Calling original kernel before specialize_args() works normally
+        2. specialize_args() returns a different kernel object
+        3. Original kernel remains unspecialized after specialize_args() is called
+        4. Both kernels produce correct results independently
+        """
+        config = helion.Config(block_sizes=[16, 16])
+
+        @helion.kernel(config=config, static_shapes=False)
+        def kernel_fn(x: torch.Tensor) -> torch.Tensor:
+            m, n = x.size()
+            out = torch.empty_like(x)
+            for tile in hl.tile(x.size()):
+                out[tile] = x[tile] * 2.0
+            return out
+
+        x = torch.randn([64, 128], device=DEVICE)
+
+        # Step 1: Call original kernel BEFORE specialize_args()
+        code_before, result_before = code_and_output(
+            kernel_fn, (x,), block_sizes=[16, 16]
+        )
+        torch.testing.assert_close(result_before, x * 2.0)
+        # Original should NOT have specialized dimensions
+        self.assertNotIn("64", code_before)
+        self.assertNotIn("128", code_before)
+
+        # Step 2: Create specialized version
+        specialized_kernel_fn = kernel_fn.specialize_args(x=[0, 1])
+
+        # Verify it's a different kernel object
+        self.assertIsNot(kernel_fn, specialized_kernel_fn)
+
+        # Step 3: Call specialized kernel
+        code_spec, result_spec = code_and_output(
+            specialized_kernel_fn, (x,), block_sizes=[16, 16]
+        )
+        torch.testing.assert_close(result_spec, x * 2.0)
+        # Specialized should have constant dimensions
+        self.assertIn("64", code_spec)
+        self.assertIn("128", code_spec)
+
+        # Step 4: Call original kernel AFTER specialize_args() - should still be unspecialized
+        kernel_fn.reset()  # Clear cache to force recompilation
+        code_after, result_after = code_and_output(
+            kernel_fn, (x,), block_sizes=[16, 16]
+        )
+        torch.testing.assert_close(result_after, x * 2.0)
+        # Original should STILL NOT have specialized dimensions
+        self.assertNotIn("64", code_after)
+        self.assertNotIn("128", code_after)
+
+        # Verify that specialize_args() creates a true copy without shared mutable state.
+        mutable_attrs = [
+            "_bound_kernels",
+            "_specialize_extra",
+            "_specialized_args",
+            "_arg_name_to_index",
+            "_annotations",
+        ]
+        for attr in mutable_attrs:
+            self.assertIsNot(
+                getattr(kernel_fn, attr),
+                getattr(specialized_kernel_fn, attr),
+                f"Attribute '{attr}' is shared between original and specialized kernel",
+            )
+
+        # These objects are currently shared between original and specialized kernel.
+        self.assertIs(
+            kernel_fn.settings,
+            specialized_kernel_fn.settings,
+        )
+        # Config objects inside the configs list are shared (list itself is copied)
+        self.assertIsNot(
+            kernel_fn.configs,
+            specialized_kernel_fn.configs,
+        )
+        for i, orig_config in enumerate(kernel_fn.configs):
+            self.assertIs(
+                orig_config,
+                specialized_kernel_fn.configs[i],
+            )
+

 if __name__ == "__main__":
     unittest.main()
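A minimal standalone sketch of the specialize_args() usage exercised above, assuming a CUDA device and that the kernel imports match the test file's (helion, helion.language as hl); illustrative only, not part of the diff:

import torch
import helion
import helion.language as hl

# Supplying a Config up front skips autotuning, mirroring the test setup above.
@helion.kernel(config=helion.Config(block_sizes=[16, 16]), static_shapes=False)
def double(x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    for tile in hl.tile(x.size()):
        out[tile] = x[tile] * 2.0
    return out

# specialize_args() returns a NEW kernel; `double` itself stays unspecialized.
specialized = double.specialize_args(x=[0, -1])  # bake dim 0 and the last dim into the generated code

x = torch.randn([64, 128], device="cuda")
torch.testing.assert_close(specialized(x), x * 2.0)  # specialized copy computes the same result
torch.testing.assert_close(double(x), x * 2.0)       # original kernel is unaffected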