wip

yf225 · yf225 · commit bc2a3c667ad8 · 2025-12-08T22:31:25.000-08:00
diff --git a/helion/_compiler/compile_environment.py b/helion/_compiler/compile_environment.py
@@ -127,6 +127,16 @@ def __init__(
             0  # Track number of loads in all device code for eviction policy tuning
         )
 
+    def specialize_expr(self, expr: sympy.Expr) -> sympy.Expr:
+        """Replace each specialized symbol in ``expr`` with its integer size hint."""
+        specialized = expr.free_symbols & self.specialized_vars
+        if not specialized:
+            return expr
+        subs = {v: sympy.Integer(self.shape_env.size_hint(v)) for v in specialized}
+        # pyrefly: ignore [bad-assignment]
+        expr = expr.xreplace(subs)
+        return expr
+
     def add_kernel_tensor_size(self, sizes: Sequence[int | torch.SymInt]) -> None:
         from .device_function import contains_only_block_size_symbols
 
diff --git a/helion/_compiler/device_function.py b/helion/_compiler/device_function.py
@@ -373,7 +373,8 @@ def set_pid(self, pid: ProgramIDs) -> None:
         self.pid = pid
 
     def sympy_expr(self, expr: sympy.Expr) -> str:
-        expr = CompileEnvironment.current().shape_env.simplify(expr)
+        env = CompileEnvironment.current()
+        expr = env.specialize_expr(env.shape_env.simplify(expr))
         if not expr.free_symbols:
             return texpr(expr)
         if expr in self.expr_to_var_info:
diff --git a/helion/_compiler/host_function.py b/helion/_compiler/host_function.py
@@ -191,7 +191,10 @@ def set_local_types(self, local_types: dict[str, TypeInfo]) -> None:
             type_info.populate_symbol_origins(NameOrigin(name, fn))
 
     def sympy_expr(self, expr: sympy.Expr) -> str:
-        expr = CompileEnvironment.current().shape_env.simplify(expr)
+        env = CompileEnvironment.current()
+        expr = env.specialize_expr(env.shape_env.simplify(expr))
+        if not expr.free_symbols:
+            return pexpr(expr)
         if expr in self.expr_to_origin:
             return self.expr_to_origin[expr].origin.host_str()
         replacements = {}
diff --git a/helion/_testing.py b/helion/_testing.py
@@ -499,6 +499,14 @@ def assertNotIn(
         if not self._in_ref_eager_mode:
             super().assertNotIn(member, container, msg)  # type: ignore[misc]
 
+    def assertIs(self, expr1: object, expr2: object, msg: str | None = None) -> None:
+        if not self._in_ref_eager_mode:  # no-op in ref eager mode
+            super().assertIs(expr1, expr2, msg)  # type: ignore[misc]
+
+    def assertIsNot(self, expr1: object, expr2: object, msg: str | None = None) -> None:
+        if not self._in_ref_eager_mode:  # no-op in ref eager mode
+            super().assertIsNot(expr1, expr2, msg)  # type: ignore[misc]
+
     def assertTrueIfInNormalMode(self, condition: bool, msg: str | None = None) -> None:
         if not self._in_ref_eager_mode:
             self.assertTrue(condition, msg)  # type: ignore[attr-defined]
diff --git a/helion/runtime/kernel.py b/helion/runtime/kernel.py
@@ -403,6 +403,9 @@ def __init__(
                     constexpr_args[name] = arg
                 else:
                     self.fake_args.append(self.env.to_fake(arg, ArgumentOrigin(name)))
+
+            self._apply_mark_static(args)
+
             with (
                 _maybe_skip_dtype_check_in_meta_registrations(),
                 patch_inductor_lowerings(),
@@ -420,6 +423,24 @@ def __init__(
                     self.maybe_log_repro(log.warning, args, config=config)
                     raise
 
+    def _apply_mark_static(self, args: tuple[object, ...]) -> None:
+        """
+        Apply torch._dynamo.mark_static() markings from input tensors.
+
+        Reads ``_dynamo_static_indices`` from each tensor in ``args`` and adds the
+        free symbols of the matching fake-tensor sizes to ``env.specialized_vars``.
+        """
+        for arg, fake_arg in zip(args, self.fake_args, strict=True):
+            if not isinstance(arg, torch.Tensor):
+                continue
+            static_indices = getattr(arg, "_dynamo_static_indices", None) or ()
+            for dim in static_indices:
+                assert isinstance(fake_arg, torch.Tensor)
+                size = fake_arg.size(dim)
+                # Non-SymInt sizes carry no symbols, so there is nothing to record.
+                if isinstance(size, torch.SymInt):
+                    self.env.specialized_vars.update(size._sympy_().free_symbols)
+
     @property
     def settings(self) -> Settings:
         """
@@ -889,12 +910,14 @@ def kernel(
 def _tensor_key(fn: Kernel, obj: torch.Tensor) -> Hashable:
     # NOTE: If a machine has two different gpu types on the same machine,
     # obj.device.type will incorrectly hit
+    static_indices = frozenset(getattr(obj, "_dynamo_static_indices", ()))
     if fn.settings.static_shapes:
         return (
             obj.dtype,
             obj.device.type,
             (*obj.size(),),
             (*obj.stride(),),
+            static_indices,
         )
     bucketed = tuple([min(s, 2) for s in obj.size()])
     if fn.settings.index_dtype is None:
@@ -907,11 +930,13 @@ def _tensor_key(fn: Kernel, obj: torch.Tensor) -> Hashable:
             obj.device.type,
             bucketed,
             needs_int64,
+            static_indices,
         )
     return (
         obj.dtype,
         obj.device.type,
         bucketed,
+        static_indices,
     )