
Commit 7aada66

[Autotuner] Add autotune_benchmark_fn setting (#1199)
1 parent a2f5ed1 commit 7aada66

4 files changed: 71 additions & 2 deletions


docs/api/settings.md

Lines changed: 7 additions & 0 deletions
@@ -254,6 +254,13 @@ See :class:`helion.autotuner.LocalAutotuneCache` for details on cache keys and b
 
    Override the callable that constructs autotuner instances. Accepts the same signature as :func:`helion.runtime.settings.default_autotuner_fn`.
    Pass a replacement callable via ``@helion.kernel(..., autotuner_fn=...)`` or ``helion.kernel(autotuner_fn=...)`` at definition time.
+
+.. autoattribute:: Settings.autotune_benchmark_fn
+
+   Custom benchmark function for rebenchmarking during autotuning. Should have the signature
+   ``(fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None) -> list[float]``.
+   If ``None`` (default), uses the built-in benchmark function.
+   Pass a replacement callable via ``@helion.kernel(..., autotune_benchmark_fn=...)`` at definition time.
 ```
 
 Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"DifferentialEvolutionSearch"``, ``"FiniteSearch"``, and ``"RandomSearch"``.
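
For reference, a minimal sketch of a callable conforming to this documented signature is shown below. It times each candidate sequentially with plain wall-clock timing; the name ``sequential_bench`` and the CUDA synchronization are illustrative assumptions, not part of this commit, and the built-in interleaved benchmark remains the default.

```python
import time
from typing import Callable

import torch


def sequential_bench(
    fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None
) -> list[float]:
    # Illustrative sketch only: time each candidate back-to-back rather than
    # interleaved. Must return one timing per callable, in the same order.
    # desc is only used by the built-in progress-bar path and is ignored here.
    timings: list[float] = []
    for fn in fns:
        fn()  # warm-up call
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(repeat):
            fn()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        timings.append((time.perf_counter() - start) / repeat)
    return timings
```

As the docs above state, such a callable is attached at definition time, e.g. ``@helion.kernel(autotune_benchmark_fn=sequential_bench)``.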

helion/autotuner/base_search.py

Lines changed: 3 additions & 2 deletions
@@ -922,12 +922,13 @@ def rebenchmark(
         )
         repeat = min(1000, max(3, base_repeat))
         iterator = [functools.partial(m.fn, *self.args) for m in members]
+        bench_fn = self.settings.autotune_benchmark_fn or interleaved_bench
         if self.settings.autotune_progress_bar:
             # pyrefly: ignore [bad-argument-type]
-            new_timings = interleaved_bench(iterator, repeat=repeat, desc=desc)
+            new_timings = bench_fn(iterator, repeat=repeat, desc=desc)
         else:
             # pyrefly: ignore [bad-argument-type]
-            new_timings = interleaved_bench(iterator, repeat=repeat)
+            new_timings = bench_fn(iterator, repeat=repeat)
         for m, t in zip(members, new_timings, strict=True):
             m.perfs.append(t)
             if t < self.best_perf_so_far:
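
One detail worth noting: whatever callable replaces ``interleaved_bench`` must return exactly one timing per candidate, in the same order, because the results are zipped against ``members`` with ``strict=True`` just below. A small sketch of a wrapper enforcing that contract (the name ``with_length_check`` is illustrative, not part of this change):

```python
from typing import Callable


def with_length_check(
    bench_fn: Callable[..., list[float]],
) -> Callable[..., list[float]]:
    # Illustrative wrapper: fail loudly if a custom benchmark function returns
    # the wrong number of timings, rather than raising inside rebenchmark's
    # strict zip.
    def checked(
        fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None
    ) -> list[float]:
        timings = bench_fn(fns, repeat=repeat, desc=desc)
        if len(timings) != len(fns):
            raise ValueError("benchmark fn must return one timing per candidate")
        return timings

    return checked
```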

helion/runtime/settings.py

Lines changed: 7 additions & 0 deletions
@@ -412,6 +412,7 @@ class _Settings:
     autotune_baseline_fn: Callable[..., object] | None = None
     autotune_baseline_atol: float | None = None
     autotune_baseline_rtol: float | None = None
+    autotune_benchmark_fn: Callable[..., list[float]] | None = None
 
 
 class Settings(_Settings):
@@ -502,6 +503,12 @@ class Settings(_Settings):
             "Set HELION_AUTOTUNE_CACHE=StrictLocalAutotuneCache to enable strict caching. "
             "Defaults to 'LocalAutotuneCache'."
         ),
+        "autotune_benchmark_fn": (
+            "Custom benchmark function for rebenchmarking during autotuning. "
+            "Should have the following signature: "
+            "(fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None) -> list[float]. "
+            "If None (default), uses the built-in benchmark function."
+        ),
     }
 
     def __init__(self, **settings: object) -> None:

test/test_autotuner.py

Lines changed: 54 additions & 0 deletions
@@ -1284,6 +1284,60 @@ def test_fragment_encoding(self):
         encoded = fragment.encode(value)
         self.assertEqual(len(encoded), dim)
 
+    @skipIfCpu("fails on Triton CPU backend")
+    def test_autotune_benchmark_fn(self) -> None:
+        """Test that custom benchmark function is used during rebenchmarking."""
+        # Track benchmark function calls
+        benchmark_calls: list[tuple[int, int]] = []  # (num_fns, repeat)
+
+        def custom_benchmark_fn(
+            fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None
+        ) -> list[float]:
+            benchmark_calls.append((len(fns), repeat))
+            # Return fake timings
+            return [1.0] * len(fns)
+
+        @helion.kernel(
+            autotune_benchmark_fn=custom_benchmark_fn,
+            autotune_log_level=0,
+        )
+        def add(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+            out = torch.empty_like(a)
+            for tile in hl.tile(out.size()):
+                out[tile] = a[tile] + b[tile]
+            return out
+
+        args = (
+            torch.randn([128], device=DEVICE),
+            torch.randn([128], device=DEVICE),
+        )
+
+        bound_kernel = add.bind(args)
+        # Use PatternSearch which has rebenchmark method
+        search = PatternSearch(bound_kernel, args)
+
+        # Compile two configs
+        config1 = search.config_gen.random_config()
+        config2 = search.config_gen.random_config()
+        fn1 = bound_kernel.compile_config(config1)
+        fn2 = bound_kernel.compile_config(config2)
+
+        # Create population members (flat_values not used in rebenchmark)
+        member1 = PopulationMember(fn1, [1.0], (), config1)
+        member2 = PopulationMember(fn2, [1.1], (), config2)
+
+        search.best_perf_so_far = 1.0
+
+        # Call rebenchmark directly
+        search.rebenchmark([member1, member2])
+
+        # Verify custom benchmark function was called
+        self.assertGreater(
+            len(benchmark_calls), 0, "Custom benchmark function should be called"
+        )
+        # Should have been called with 2 functions
+        self.assertEqual(benchmark_calls[0][0], 2)
+
 
 class TestAutotuneRandomSeed(RefEagerTestDisabled, TestCase):
     def _autotune_and_record(self, **settings: object) -> float:
