@@ -1132,3 +1132,142 @@ def reduce_kernel(x: torch.Tensor, tensor_factory_fn, test_host, *, _launcher=_d
     _launcher(_helion_reduce_kernel, (triton.cdiv(128, _BLOCK_SIZE_0),), x, grad_weight, _BLOCK_SIZE_0, _RDIM_SIZE_2, _BLOCK_SIZE_1, num_warps=4, num_stages=1)
     # src[test_specialize.py:N]: return grad_weight.sum(0).to(x.dtype)
     return grad_weight.sum(0).to(x.dtype)
+
+--- assertExpectedJournal(TestSpecializeArgs.test_specialize_args)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_matmul(x, y, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1, y_stride_0, y_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    num_blocks_0 = tl.cdiv(64, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < 64
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < 56
+    # src[test_specialize.py:N]: acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+    acc = tl.full([_BLOCK_SIZE_0, _BLOCK_SIZE_1], 0.0, tl.float32)
+    # src[test_specialize.py:N]: for tile_k in hl.tile(k):
+    # src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+    symnode_0 = 128
+    for offset_2 in tl.range(0, symnode_0.to(tl.int32), _BLOCK_SIZE_2):
+        indices_2 = offset_2 + tl.arange(0, _BLOCK_SIZE_2).to(tl.int32)
+        mask_2 = indices_2 < symnode_0
+        acc_copy = acc
+        acc_copy_0 = acc_copy
+        # src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+        load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_2[None, :] * x_stride_1), mask_0[:, None] & mask_2[None, :], other=0)
+        load_1 = tl.load(y + (indices_2[:, None] * y_stride_0 + indices_1[None, :] * y_stride_1), mask_2[:, None] & mask_1[None, :], other=0)
+        acc = tl.dot(tl.cast(load, tl.float16), tl.cast(load_1, tl.float16), acc=acc_copy_0, input_precision='tf32', out_dtype=tl.float32)
+    # src[test_specialize.py:N]: out[tile_m, tile_n] = acc.to(x.dtype)
+    v_0 = tl.cast(acc, tl.float16)
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_0, mask_0[:, None] & mask_1[None, :])
+
+def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_specialize.py:N]: m, k = x.size()
+    m, k = x.size()
+    # src[test_specialize.py:N]: k2, n = y.size()
+    k2, n = y.size()
+    # src[test_specialize.py:N]: out = torch.empty([m, n], device=x.device, dtype=x.dtype)
+    out = torch.empty([m, n], device=x.device, dtype=x.dtype)
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    # src[test_specialize.py:N]: for tile_k in hl.tile(k):
+    # src[test_specialize.py:N]: acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
+    _BLOCK_SIZE_2 = 32
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    # src[test_specialize.py:N]: acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
+    # src[test_specialize.py:N]: for tile_k in hl.tile(k):
+    # src[test_specialize.py:N-N]: ...
+    _launcher(_helion_matmul, (triton.cdiv(64, _BLOCK_SIZE_0) * triton.cdiv(56, _BLOCK_SIZE_1),), x, y, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), y.stride(0), y.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=1)
+    # src[test_specialize.py:N]: return out
+    return out
+
+--- assertExpectedJournal(TestSpecializeArgs.test_specialize_args_and_hl_specialize)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_dual_specialize(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    # src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+    num_blocks_0 = tl.cdiv(320, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < 640
+    # src[test_specialize.py:N]: out[tile] = x[tile] * 2
+    load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_1[None, :], other=0)
+    v_0 = 2.0
+    v_1 = load * v_0
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_1, mask_1[None, :])
+
+def dual_specialize(x: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_specialize.py:N]: out = torch.empty_like(x)
+    out = torch.empty_like(x)
+    # src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+    _BLOCK_SIZE_0 = 16
+    _BLOCK_SIZE_1 = 16
+    # src[test_specialize.py:N]: for tile in hl.tile(x.size()):
+    # src[test_specialize.py:N]: out[tile] = x[tile] * 2
+    _launcher(_helion_dual_specialize, (triton.cdiv(320, _BLOCK_SIZE_0) * triton.cdiv(640, _BLOCK_SIZE_1),), x, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=1)
+    # src[test_specialize.py:N]: return out
+    return out
+
+--- assertExpectedJournal(TestSpecializeArgs.test_specialize_args_chaining)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_fn(x, out, out_stride_0, out_stride_1, x_stride_0, x_stride_1, n, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    num_blocks_0 = tl.cdiv(37, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < 37
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < n
+    # src[test_specialize.py:N]: out[tile_m, tile_n] = x[tile_m, tile_n] * p
+    load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    symnode_0 = 127
+    v_0 = tl.cast(symnode_0, tl.float32)
+    v_1 = load * v_0
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_1, mask_0[:, None] & mask_1[None, :])
+
+def fn(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+    # src[test_specialize.py:N]: m, n = x.size()
+    m, n = x.size()
+    # src[test_specialize.py:N]: p = y.size(1) # use y's dim 1 as a scalar
+    p = y.size(1)
+    # src[test_specialize.py:N]: out = x.new_empty([m, n])
+    out = x.new_empty([m, n])
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    _BLOCK_SIZE_0 = 16
+    _BLOCK_SIZE_1 = 16
+    # src[test_specialize.py:N]: for tile_m, tile_n in hl.tile([m, n]):
+    # src[test_specialize.py:N]: out[tile_m, tile_n] = x[tile_m, tile_n] * p
+    _launcher(_helion_fn, (triton.cdiv(37, _BLOCK_SIZE_0) * triton.cdiv(n, _BLOCK_SIZE_1),), x, out, out.stride(0), out.stride(1), x.stride(0), x.stride(1), n, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=1)
+    # src[test_specialize.py:N]: return out
+    return out
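
For reference, a minimal sketch of the Helion source behind the chaining journal above, reconstructed only from its src[test_specialize.py:N] annotations; the decorator, signature annotations, and any specialization arguments are assumptions, since the journal does not show them.

import torch
import helion
import helion.language as hl

@helion.kernel()  # assumed decorator; specialization/config options not visible in the journal
def fn(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    m, n = x.size()
    p = y.size(1)  # use y's dim 1 as a scalar
    out = x.new_empty([m, n])
    # each (tile_m, tile_n) becomes one Triton program in the generated _helion_fn kernel
    for tile_m, tile_n in hl.tile([m, n]):
        out[tile_m, tile_n] = x[tile_m, tile_n] * p
    return out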