[Interpret Mode] Support custom block size (#1194)

yf225 · web-flow · commit a2f5ed119834 · 2025-12-04T19:06:45.000-08:00
diff --git a/helion/language/loops.py b/helion/language/loops.py
@@ -23,7 +23,6 @@
 from .._compiler.ast_extension import LoopType
 from .._compiler.ast_extension import expr_from_string
 from .._compiler.compile_environment import CompileEnvironment
-from .._compiler.compile_environment import warning
 from .._compiler.type_propagation import GridIndexType
 from .._compiler.type_propagation import IterType
 from .._compiler.type_propagation import LiteralType
@@ -519,47 +518,41 @@ def _(
     end_or_none: int | torch.Tensor | list[int | torch.Tensor] | None = None,
     block_size: int | torch.Tensor | list[int | torch.Tensor] | None = None,
 ) -> Iterator[RefTile | tuple[RefTile, ...]]:
-    # Issue warning if block_size is specified in interpret mode
-    if block_size is not None:
-        warning(exc.BlockSizeIgnoredInInterpretMode(block_size))
-
-    # Step 1: Normalize begin and end values
     begin, end = _normalize_begin_end_ref(begin_or_end, end_or_none)
-
-    # Step 2: Convert to lists and then to ints
+    scalar_input = not isinstance(begin, list) and not isinstance(end, list)
     begin_list = _normalize_to_list(begin)
     end_list = _normalize_to_list(end)
-    begin_ints = [_to_int(b) for b in begin_list]
-    end_ints = [_to_int(e) for e in end_list]
-
-    # Step 3: Determine block sizes - always return full dimension size, ignoring block_size parameter
-    block_size_list = []
-    for b, e in zip(begin_ints, end_ints, strict=True):
-        assert b is not None and e is not None
-        block_size_list.append(e - b)
-
-    # Step 4: Determine return type
-    # Return single tiles if input was not a list
-    return_single = not isinstance(begin, list) and not isinstance(end, list)
-
-    # Step 5: Generate tiles
-    # Build tiles for each dimension
-    tiles = []
-    for b, e in zip(begin_ints, end_ints, strict=True):
-        assert b is not None and e is not None
-        if b != e:
-            # Only create tile if range is non-empty
-            tiles.append(RefTile(b, e, e - b))
-
-    # Yield result based on return type
-    if tiles:  # Only yield if we have at least one non-empty dimension
-        if return_single:
-            # Single dimension case - yield the tile directly
-            assert len(tiles) == 1
-            yield tiles[0]
-        else:
-            # Multi-dimensional case - yield as tuple
-            yield tuple(tiles)
+
+    # Normalize block_size to list matching dimensions
+    bs_list: list[int | torch.Tensor | None]
+    if block_size is None:
+        bs_list = [None] * len(begin_list)
+    else:
+        bs_list = cast(
+            "list[int | torch.Tensor | None]", _normalize_to_list(block_size)
+        )
+        if len(bs_list) == 1 and len(begin_list) > 1:
+            bs_list = bs_list * len(begin_list)
+
+    # Build tile ranges for each dimension
+    dim_ranges: list[list[tuple[int, int, int]]] = []
+    for b, e, bs in zip(begin_list, end_list, bs_list, strict=True):
+        b_int, e_int = _to_int(b), _to_int(e)
+        assert b_int is not None and e_int is not None
+        if b_int == e_int:
+            continue
+        bs_int = _to_int(bs) if bs is not None else (e_int - b_int)
+        assert bs_int is not None
+        dim_ranges.append(
+            [(s, min(s + bs_int, e_int), bs_int) for s in range(b_int, e_int, bs_int)]
+        )
+
+    if not dim_ranges:
+        return
+
+    for combo in itertools.product(*dim_ranges):
+        tiles = list(starmap(RefTile, combo))
+        yield tiles[0] if scalar_input else tuple(tiles)
 
 
 def _codegen_loop_helper(
diff --git a/helion/language/tile_ops.py b/helion/language/tile_ops.py
@@ -259,5 +259,6 @@ def _(state: CodegenState) -> ast.AST:
 
 @_decorators.ref(tile_id)
 def _(tile: RefTile) -> int:
-    # ID is always 0 since we always have one tile per dim in ref mode
-    return 0
+    # A dimension can be covered by several tiles in ref mode, so the id is
+    # the tile's start offset in units of its block size.
+    return tile._slice.start // tile._block_size
diff --git a/helion/runtime/ref_mode.py b/helion/runtime/ref_mode.py
@@ -4,11 +4,13 @@
 import threading
 import typing
 from typing import TYPE_CHECKING
+from typing import Any
 from typing import Callable
 from typing import Protocol
 from typing import cast
 
 import torch
+from torch._prims_common import is_integer_dtype
 from torch.overrides import BaseTorchFunctionMode
 
 from .._compiler.compile_environment import CompileEnvironment
@@ -180,6 +182,10 @@ def __torch_function__(
                 return self._method_handlers[func_name](args, kwargs)
             if func_name in self._binary_op_names:
                 return self._handle_binary_op(func, args, kwargs)
+            if func_name == "__getitem__":
+                return self._handle_getitem(args, kwargs)
+            if func_name == "__setitem__":
+                return self._handle_setitem(args, kwargs)
 
         if func in self._binary_ops:
             return self._handle_binary_op(func, args, kwargs)
@@ -334,6 +340,133 @@ def _should_handle_binary_op(self, lhs: object, rhs: object) -> bool:
         # Only handle shape-based masking for non-broadcasting cases
         return True
 
+    @staticmethod
+    def _is_int_tensor(x: object) -> bool:
+        """Return True when ``x`` is a plain tensor of integer positions.
+
+        The exact type check rejects tensor subclasses, and ``torch.uint8`` is
+        rejected because PyTorch indexing treats byte tensors as masks, so
+        clamping them like positions would rewrite the mask.
+        """
+        return (
+            type(x) is torch.Tensor
+            and x.dtype != torch.uint8
+            and is_integer_dtype(x.dtype)
+        )
+
+    @staticmethod
+    def _index_dims(tensor: torch.Tensor, indices_list: list[Any]) -> list[int]:
+        """Return the tensor dimension that each index entry starts at.
+
+        ``None`` consumes no dimension, a mask tensor consumes one dimension per
+        mask dimension, ``...`` absorbs whatever the other entries leave over,
+        and every other entry consumes exactly one dimension.
+        """
+        mask_dtypes = (torch.bool, torch.uint8)
+        widths: list[int] = []
+        for idx in indices_list:
+            if idx is None or idx is Ellipsis:
+                widths.append(0)
+            elif isinstance(idx, torch.Tensor) and idx.dtype in mask_dtypes:
+                widths.append(idx.dim())
+            else:
+                widths.append(1)
+        ellipsis_width = max(tensor.dim() - sum(widths), 0)
+
+        dims: list[int] = []
+        next_dim = 0
+        for idx, width in zip(indices_list, widths, strict=True):
+            dims.append(next_dim)
+            next_dim += ellipsis_width if idx is Ellipsis else width
+        return dims
+
+    def _handle_getitem(
+        self,
+        args: tuple[object, ...],
+        kwargs: dict[str, object],
+    ) -> torch.Tensor:
+        """Handle tensor indexing with out-of-bounds index clamping.
+
+        Args:
+            args: ``(tensor, indices)`` as passed to ``Tensor.__getitem__``.
+            kwargs: Unused.
+
+        Returns:
+            ``tensor[indices]`` with every integer index tensor clamped to
+            ``[0, size - 1]`` of the dimension it addresses.
+        """
+        tensor = cast("torch.Tensor", args[0])
+        indices: Any = args[1]
+        is_tuple = isinstance(indices, tuple)
+        indices_list = list(indices) if is_tuple else [indices]
+
+        # Positions in the index tuple are not dimensions: ``None`` and ``...``
+        # shift later entries, so look the dimension up instead of enumerating.
+        dims = self._index_dims(tensor, indices_list)
+        for pos, dim in enumerate(dims):
+            idx = indices_list[pos]
+            # An over-long index is left alone so the indexing call reports it.
+            if dim < tensor.dim() and self._is_int_tensor(idx):
+                indices_list[pos] = torch.clamp(idx, min=0, max=tensor.size(dim) - 1)
+
+        return tensor[tuple(indices_list) if is_tuple else indices_list[0]]
+
+    def _handle_setitem(
+        self,
+        args: tuple[object, ...],
+        kwargs: dict[str, object],
+    ) -> None:
+        """Handle tensor indexed assignment, discarding out-of-bounds writes.
+
+        Entries of an integer index tensor that are negative or not below the
+        size of their dimension are skipped; every other entry is assigned as
+        usual.
+
+        Args:
+            args: ``(tensor, indices, value)`` as passed to
+                ``Tensor.__setitem__``.
+            kwargs: Unused.
+        """
+        tensor = cast("torch.Tensor", args[0])
+        indices: Any = args[1]
+        value: Any = args[2]
+        is_tuple = isinstance(indices, tuple)
+        indices_list = list(indices) if is_tuple else [indices]
+
+        padded_dims: set[int] = set()
+        dims = self._index_dims(tensor, indices_list)
+        for pos, dim in enumerate(dims):
+            idx = indices_list[pos]
+            if dim >= tensor.dim() or not self._is_int_tensor(idx):
+                continue
+            size = tensor.size(dim)
+            out_of_bounds = (idx < 0) | (idx >= size)
+            if bool(out_of_bounds.any()):
+                # Park invalid entries in a scratch slot one past the end.
+                indices_list[pos] = idx.masked_fill(out_of_bounds, size)
+                padded_dims.add(dim)
+
+        final_indices = tuple(indices_list) if is_tuple else indices_list[0]
+        if not padded_dims:
+            tensor[final_indices] = value
+            return
+
+        # Clamping an invalid entry onto the edge element would make it collide
+        # with a valid write to that element, and indexed assignment with
+        # duplicate indices has no defined winner. Writing through a copy that
+        # has one spare slot per affected dimension keeps the two apart for
+        # tensor and scalar values alike.
+        scratch = tensor.new_empty(
+            [
+                size + 1 if dim in padded_dims else size
+                for dim, size in enumerate(tensor.shape)
+            ]
+        )
+        interior = tuple(slice(0, size) for size in tensor.shape)
+        scratch[interior] = tensor
+        scratch[final_indices] = value
+        tensor.copy_(scratch[interior])
+
     def _setup_binary_ops_handling(self) -> None:
         """Initialize binary operation tracking sets and mappings."""
         # Define binary operations and their variants
diff --git a/test/test_ref_eager.py b/test/test_ref_eager.py
@@ -8,7 +8,6 @@
 import torch
 
 import helion
-from helion import exc
 from helion._testing import DEVICE
 from helion._testing import TestCase
 from helion._testing import assert_ref_eager_mode
@@ -95,7 +94,7 @@ def kernel(x: torch.Tensor) -> torch.Tensor:
             expected = x * 2.0
             torch.testing.assert_close(result, expected)
 
-    def test_block_size_warning(self):
+    def test_block_size_support(self):
         @helion.kernel(ref_mode=helion.RefMode.EAGER)
         def kernel(x: torch.Tensor) -> torch.Tensor:
             m, n = x.shape
@@ -105,20 +104,26 @@ def kernel(x: torch.Tensor) -> torch.Tensor:
             return out
 
         with assert_ref_eager_mode():
-            # Run the kernel to capture the warning message
-            captured_stderr = io.StringIO()
-            with contextlib.redirect_stderr(captured_stderr):
-                x = torch.randn(128, 128, device=DEVICE)
-                kernel(x)
-
-            stderr_output = captured_stderr.getvalue()
+            x = torch.randn(128, 128, device=DEVICE)
+            result = kernel(x)
+            expected = x * 2.0
+            torch.testing.assert_close(result, expected)
 
-            # Create expected warning message using the actual class
-            expected_warning = exc.BlockSizeIgnoredInInterpretMode(2)
-            expected_warning_text = expected_warning.report()
+    def test_tile_begin_with_block_size_1(self):
+        @helion.kernel(ref_mode=helion.RefMode.EAGER)
+        def kernel(x: torch.Tensor) -> torch.Tensor:
+            n = x.size(0)
+            out = torch.empty_like(x)
+            for tile in hl.tile(n, block_size=1):
+                out[tile] = x[tile] + tile.begin
+            return out
 
-            # Check that the expected warning appears in stderr
-            self.assertIn(expected_warning_text, stderr_output)
+        with assert_ref_eager_mode():
+            x = torch.zeros(8, device=DEVICE)
+            result = kernel(x)
+            # x is all zeros, so each one-element tile stores its own begin offset.
+            expected = torch.arange(8, device=DEVICE, dtype=torch.float32)
+            torch.testing.assert_close(result, expected)
 
 
 if __name__ == "__main__":