@@ -2,9 +2,9 @@
 import contextvars
 import os
-from typing import TYPE_CHECKING
 
 import torch
+import triton
 
 from .. import _compat as _compat  # ensure Triton compatibility patches run
 from .config import Config as Config
@@ -14,12 +14,14 @@
 from .triton_helpers import triton_wait_multiple_signal as triton_wait_multiple_signal
 from .triton_helpers import triton_wait_signal as triton_wait_signal
 
-if TYPE_CHECKING:
-    import triton
-
 
 def _alloc_fn(size: int, alignment: int, stream: int | None) -> torch.Tensor:
-    return torch.empty(size, device="cuda", dtype=torch.int8)
+    # Dynamically get device from Triton backend
+    current_target = triton.runtime.driver.active.get_current_target()
+    if current_target is None:
+        raise RuntimeError("No active Triton target available")
+    backend = current_target.backend
+    return torch.empty(size, device=backend, dtype=torch.int8)
 
 
 def set_triton_allocator() -> None:
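
For context, _alloc_fn now derives its device from Triton's active target rather than hardcoding "cuda". A minimal sketch of how such an allocator is typically registered, assuming the elided body of set_triton_allocator wraps triton.set_allocator, which accepts a callable with exactly this (size, alignment, stream) signature:

    import triton

    def set_triton_allocator() -> None:
        # Assumption: the elided body does roughly this. Once registered,
        # Triton calls _alloc_fn whenever a kernel needs scratch device
        # memory, so the allocation must land on the backend the kernel
        # is compiled for.
        triton.set_allocator(_alloc_fn)
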
@@ -51,8 +53,13 @@ def get_num_sm(device: torch.device, *, reserved_sms: int = 0) -> int:
     Grid size to use for a persistent kernel on the device after accounting
     for any reserved SMs. Always at least 1.
     """
-    assert device.type in ["cuda", "xpu", "cpu"], "TODO: implement for other devices"
     available_sms: int
+    assert device.type in [
+        "cuda",
+        "xpu",
+        "cpu",
+        "mtia",
+    ], "TODO: implement for other devices"
     if device.type == "cpu":
         try:
             num_threads = int(torch.get_num_threads())
@@ -66,8 +73,19 @@ def get_num_sm(device: torch.device, *, reserved_sms: int = 0) -> int:
     # TODO(EikanWang): gpu_subslice_count is an out-of-date term; we should update it to the XeCore number.
     elif device.type == "xpu":
         available_sms = torch.xpu.get_device_properties(device.index).gpu_subslice_count
+    elif device.type == "mtia":
+        device_props = torch.mtia.get_device_properties(device.index)
+        if "maxGridHeight" in device_props and "maxGridWidth" in device_props:
+            available_sms = device_props["maxGridHeight"] * device_props["maxGridWidth"]
+        else:
+            raise RuntimeError(
+                f"Unable to determine SM count for MTIA device. "
+                f"Available properties: {list(device_props.keys())}"
+            )
     else:
-        raise AssertionError("TODO: implement for other devices")
+        raise NotImplementedError(
+            f"get_num_sm not implemented for device type: {device.type}"
+        )
 
     if reserved_sms <= 0:
         return available_sms
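
A small worked example of the reserved-SMs accounting. The tail of get_num_sm is elided from this diff, so the clamp below is presumed from the docstring's "Always at least 1"; the counts are illustrative, not real device properties:

    # Illustrative sketch; 132 is a made-up SM count, not a queried value.
    available_sms = 132                          # e.g. from device properties
    reserved_sms = 8
    grid = max(available_sms - reserved_sms, 1)  # presumed clamp, per the docstring
    assert grid == 124
    # On MTIA the analogue added above is the product of the grid dimensions:
    # available_sms = maxGridHeight * maxGridWidth
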
@@ -83,6 +101,7 @@ def default_launcher(
     **kwargs: dict,
 ) -> object:
     """Default launcher function that executes the kernel immediately."""
+    # For both CUDA and MTIA, use the same kernel execution path.
     return triton_kernel.run(
         *args,
         grid=grid,
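
default_launcher forwards to the kernel's run method with an explicit grid; per the new comment, the same path serves both CUDA and MTIA. A hedged sketch of equivalent usage with a toy kernel (the kernel, sizes, and grid below are hypothetical, and the call tail after grid=grid is elided from this diff):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def _add_one(ptr, n, BLOCK: tl.constexpr):
        # Each program instance handles one BLOCK-sized tile.
        offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        tl.store(ptr + offs, tl.load(ptr + offs, mask=mask) + 1, mask=mask)

    x = torch.zeros(1024, device="cuda")
    # default_launcher(_add_one, (8,), x, 1024, BLOCK=128) is expected to
    # behave like the conventional subscript launch, since both go through
    # JITFunction.run:
    _add_one[(8,)](x, 1024, BLOCK=128)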