Commit 914dd58

add AutotuneLevel for more detailed autotune (#1031)

1 parent 237ae00 · commit 914dd58
File tree: 6 files changed, +96 -39 lines

lightllm/common/basemodel/basemodel.py
lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py
lightllm/common/fused_moe/grouped_fused_moe_ep.py
lightllm/common/fused_moe/topk_select.py
lightllm/common/triton_utils/autotuner.py
lightllm/utils/envs_utils.py


lightllm/common/basemodel/basemodel.py

Lines changed: 4 additions & 3 deletions
@@ -24,8 +24,9 @@
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.distributed.communication_op import dist_group_manager
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
-from lightllm.utils.envs_utils import set_model_init_status, is_triton_autotune_enabled, disable_triton_autotune
+from lightllm.utils.envs_utils import set_model_init_status, set_triton_autotune_level, get_triton_autotune_level
 from lightllm.utils.infer_utils import post_empty_cache

 logger = init_logger(__name__)
@@ -731,7 +732,7 @@ def autotune_layers(self):
     @torch.no_grad()
     @post_empty_cache
     def _autotune_warmup(self):
-        if not is_triton_autotune_enabled():
+        if get_triton_autotune_level() not in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
             return

         torch.distributed.barrier()
@@ -794,7 +795,7 @@ def _autotune_warmup(self):
         torch.cuda.empty_cache()
         self.layers_num = layer_num_bak
         torch.distributed.barrier()
-        disable_triton_autotune()
+        set_triton_autotune_level(AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG)

     @final
     @torch.no_grad()

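In short, basemodel.py now runs the autotune warmup only when the level asks for tuning, and drops the level back to config replay once warmup finishes. Below is a minimal sketch of that control flow only, assuming a hypothetical run_layers callable in place of the real layer-by-layer warmup loop; it is not the actual method body.

# Sketch of the warmup gating above; run_layers is a hypothetical stand-in
# for the real warmup loop inside _autotune_warmup.
from lightllm.common.triton_utils.autotuner import AutotuneLevel
from lightllm.utils.envs_utils import get_triton_autotune_level, set_triton_autotune_level


def autotune_warmup_sketch(run_layers):
    # Only ADAPTIVE_AUTOTUNE (1) and FORCE_AUTOTUNE (2) trigger warmup tuning.
    if get_triton_autotune_level() not in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
        return
    run_layers()  # drive the kernels once so the Autotuner benchmarks and caches configs
    # Afterwards, fall back to replaying the cached configs for normal serving.
    set_triton_autotune_level(AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG)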
lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py

Lines changed: 3 additions & 2 deletions
@@ -17,7 +17,8 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
-from lightllm.utils.envs_utils import is_triton_autotune_enabled
+from lightllm.utils.envs_utils import get_triton_autotune_level
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
 from lightllm.utils.log_utils import init_logger

 logger = init_logger(__name__)
@@ -358,7 +359,7 @@ def prefilled_group_gemm(
     ######################################## warning ##################################################
     # here is used to match autotune feature, make moe model run same triton kernel in different rank.
     # in some special case, one rank will recv 0 token, so add a token to make it run triton kernel.
-    if is_triton_autotune_enabled():
+    if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
         _gemm_out_a = torch.zeros((1, N), device=device, dtype=hidden_dtype)
         _silu_out = torch.zeros((1, N // 2), device=device, dtype=hidden_dtype)
         silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)

lightllm/common/fused_moe/grouped_fused_moe_ep.py

Lines changed: 3 additions & 2 deletions
@@ -14,7 +14,8 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
-from lightllm.utils.envs_utils import is_triton_autotune_enabled
+from lightllm.utils.envs_utils import get_triton_autotune_level
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
 import numpy as np

 logger = init_logger(__name__)
@@ -191,7 +192,7 @@ def fused_experts_impl(
     ######################################## warning ##################################################
     # here is used to match autotune feature, make moe model run same triton kernel in different rank.
     # in some special case, one rank will recv 0 token, so add a token to make it run triton kernel.
-    if is_triton_autotune_enabled():
+    if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
         _gemm_out_a = torch.zeros((1, N), device=hidden_states.device, dtype=hidden_states.dtype)
         _silu_out = torch.zeros((1, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
         silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)

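The identical guard in the two expert-parallel files above keeps every rank launching the same Triton kernels while tuning is active, since a rank that receives 0 tokens from dispatch would otherwise skip the kernel and, presumably, fall out of step with the Autotuner's cross-rank gathers. A self-contained sketch of the dummy-token trick, with silu_and_mul_fwd passed in as a parameter rather than imported from lightllm:

import torch


def run_dummy_silu_if_autotuning(N, device, dtype, silu_and_mul_fwd):
    # Push a single zero token through the gemm/activation path so this rank
    # still executes the kernel even when it was dispatched no real tokens.
    _gemm_out_a = torch.zeros((1, N), device=device, dtype=dtype)
    _silu_out = torch.zeros((1, N // 2), device=device, dtype=dtype)
    silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)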
lightllm/common/fused_moe/topk_select.py

Lines changed: 10 additions & 0 deletions
@@ -23,6 +23,8 @@
 from lightllm.utils.light_utils import light_ops
 from typing import Callable, List, Optional, Tuple
 from lightllm.common.fused_moe.softmax_topk import softmax_topk
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
+from lightllm.utils.envs_utils import get_triton_autotune_level

 use_cuda_grouped_topk = os.getenv("LIGHTLLM_CUDA_GROUPED_TOPK", "False").upper() in ["ON", "TRUE", "1"]

@@ -221,4 +223,12 @@ def select_experts(
             hidden_states=hidden_states, gating_output=router_logits, topk=top_k, renormalize=renormalize
         )

+    ######################################## warning ##################################################
+    # here is used to match autotune feature, make topk_ids more random
+    if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
+        rand_gen = torch.Generator(device="cuda")
+        rand_gen.manual_seed(router_logits.shape[0])
+        router_logits = torch.randn(size=router_logits.shape, generator=rand_gen, dtype=torch.float32, device="cuda")
+        _, topk_ids = torch.topk(router_logits, k=top_k, dim=1)
+
     return topk_weights, topk_ids

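The block added to select_experts runs only while tuning is active: it replaces the router logits with seeded random values so topk_ids spread tokens across many experts, which should give the tuner a broader mix of expert loads than whatever the warmup inputs happen to route. A standalone sketch of the same randomization on CPU tensors (the real code uses device="cuda" and seeds the generator with router_logits.shape[0]):

import torch

num_tokens, num_experts, top_k = 8, 32, 2
rand_gen = torch.Generator()
rand_gen.manual_seed(num_tokens)  # the commit seeds with the token count, so results are reproducible
fake_logits = torch.randn(size=(num_tokens, num_experts), generator=rand_gen, dtype=torch.float32)
_, topk_ids = torch.topk(fake_logits, k=top_k, dim=1)  # varied expert ids for benchmarking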
lightllm/common/triton_utils/autotuner.py

Lines changed: 71 additions & 25 deletions
@@ -12,14 +12,24 @@
 from lightllm.utils.device_utils import get_current_device_name
 from lightllm.utils.log_utils import init_logger
 from typing import Callable, Optional, Union, List
-from lightllm.utils.envs_utils import is_triton_autotune_enabled
+from lightllm.utils.envs_utils import get_triton_autotune_level
 from lightllm.common.kernel_config import KernelConfigs
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank, get_current_rank_in_node
-from lightllm.distributed.communication_op import dist_group_manager

 logger = init_logger(__name__)


+class AutotuneLevel:
+    # Use the config of cached files in /lightllm/common/triton_utils/autotune_kernel_configs.
+    USE_AUTOTUNE_HIS_CONFIG = 0
+    # Autotune if no config is cached.
+    ADAPTIVE_AUTOTUNE = 1
+    # Autotune anyway to overwrite the config of cached files.
+    FORCE_AUTOTUNE = 2
+    # Close autotune and use the configs of cached files in lightllm/common/all_kernel_configs.
+    CLOSE_AUTOTUNE = 3
+
+
 def autotune(
     kernel_name: str,
     configs_gen_func: Callable[[], List],
@@ -28,6 +38,30 @@ def autotune(
     run_key_distance_func: Callable = lambda run_key, config_key: abs(int(run_key) - int(config_key)),
     mutates_args: List[str] = [],
 ):
+    """Decorator that constructs and returns an Autotuner wrapper for a Triton kernel.
+
+    This decorator configures an Autotuner with the provided configuration
+    generator and key functions, enabling on-demand benchmarking and caching
+    of kernel run configurations across runs and processes.
+
+    Args:
+        kernel_name (str): Human-readable kernel name used for logging and cache paths.
+        configs_gen_func (Callable[[], List]): Function that returns candidate run configurations.
+        static_key_func (Callable): Function that derives a static key (dict-like) from call arguments.
+            This key identifies the cache file that stores tuned configs.
+        run_key_func (Callable): Function that derives a run-time key from call arguments.
+            This key indexes tuned configs within a static key's cache.
+        run_key_distance_func (Callable, optional): Distance metric taking ``(run_key, config_key)`` and
+            returning a comparable value; used to pick the closest config when an exact match is absent.
+            Defaults to ``abs(int(run_key) - int(config_key))``.
+        mutates_args (List[str], optional): Names of arguments that can be mutated by the kernel.
+            During benchmarking, defensive clones are made to avoid side effects. Defaults to ``[]``.
+
+    Returns:
+        Callable: A callable object that wraps the original function and performs autotuning
+            as needed before invocation.
+    """
+
     def decorator(fn):
         return Autotuner(
             fn=fn,
@@ -53,8 +87,6 @@ def __init__(
         run_key_distance_func: Callable = lambda run_key, config_key: abs(int(run_key) - int(config_key)),
         mutates_args: List[str] = [],
     ):
-        # Whether to use this autotune decorator
-        self.disable_autotune = not is_triton_autotune_enabled()

         self.configs_gen_func = configs_gen_func
         self.kernel_name = kernel_name
@@ -81,41 +113,50 @@ def __init__(
         ]
         self._run_key_func_param_names = [name for name, _ in inspect.signature(self.run_key_func).parameters.items()]
         self.mutates_args = mutates_args
+
+        assert get_triton_autotune_level() in [
+            AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG,
+            AutotuneLevel.ADAPTIVE_AUTOTUNE,
+            AutotuneLevel.FORCE_AUTOTUNE,
+            AutotuneLevel.CLOSE_AUTOTUNE,
+        ]
         return

     @torch.no_grad()
     def __call__(self, *args, **kwargs):
         if kwargs.get("run_config", None) is not None:
             return self.fn(*args, **kwargs)

-        if self.disable_autotune:
+        # if the autotune_level is AutotuneLevel.CLOSE_AUTOTUNE, ignore the autotune
+        autotune_level = get_triton_autotune_level()
+        if autotune_level == AutotuneLevel.CLOSE_AUTOTUNE:
             return self.fn(*args, **kwargs)

         rank_id = 0 if not dist.is_initialized() else get_global_rank()
         world_size = 1 if not dist.is_initialized() else get_global_world_size()

-        static_key = self._static_key(*args, **kwargs)
+        static_key = frozendict(self._static_key(*args, **kwargs))
         run_key = str(self._run_key(*args, **kwargs))

-        # Lazy load
+        # Lazy load the cached configs in lightllm/common/triton_utils/autotune_kernel_configs
         self._try_load_cache(static_key)

-        if static_key not in self.cached_configs:
+        if static_key not in self.cached_configs and autotune_level == AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG:
             if (dist.is_initialized() and get_current_rank_in_node() == 0) or not dist.is_initialized():
                 logger.warning(
                     f"No kernel config for {self.kernel_name} in {KernelConfigs.get_config_file_name(static_key)}",
                 )
             self.cached_configs[static_key] = {}

-        if is_triton_autotune_enabled():
-            need_tunning = run_key not in self.cached_configs.get(static_key, {})
+        if autotune_level in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
+            need_tuning = (autotune_level == AutotuneLevel.FORCE_AUTOTUNE) or (
+                run_key not in self.cached_configs.get(static_key, {})
+            )
             if world_size > 1:
-                _need_tunnings = [None for _ in range(world_size)]
-                dist.all_gather_object(
-                    _need_tunnings, obj=need_tunning, group=dist_group_manager.get_default_group().autotune_group
-                )
-                need_tunning = any(_need_tunnings)
-            if need_tunning:
+                _need_tunings = [None for _ in range(world_size)]
+                dist.all_gather_object(_need_tunings, obj=need_tuning, group=self._get_autotune_group())
+                need_tuning = any(_need_tunings)
+            if need_tuning:
                 self._autotune(
                     args=args,
                     kwargs=kwargs,
@@ -125,12 +166,12 @@ def __call__(self, *args, **kwargs):
                     world_size=world_size,
                 )

-        if static_key in self.fast_match_configs and run_key in self.fast_match_configs[static_key]:
-            closest_config = self.fast_match_configs[static_key][run_key]
+        closest_config = self.fast_match_configs.get(static_key, {}).get(run_key, None)
+        if closest_config is not None:
             kwargs["run_config"] = closest_config
             return self.fn(*args, **kwargs)

-        all_configs = self.cached_configs.get(static_key)
+        all_configs = self.cached_configs.get(static_key, {})
         if len(all_configs) != 0:
             closest_config = min(
                 list(all_configs.items()), key=lambda item: self.run_key_distance_func(run_key, item[0])
@@ -146,6 +187,7 @@ def _try_load_cache(self, static_key):

         cache_file = os.path.join(self.cache_dir, KernelConfigs.get_config_file_name(static_key))
         if os.path.exists(cache_file):
+            logger.info(f"Loading cached configs for {self.kernel_name} - {static_key}")
             with open(cache_file, "rb") as f:
                 self.cached_configs[static_key] = orjson.loads(f.read())
             return
@@ -194,9 +236,7 @@ def _autotune(self, args, kwargs, static_key, run_key, rank_id, world_size):
         if world_size > 1:
             all_keys = [None for _ in range(world_size)]
             all_key_str = f"{run_key}_{static_key}"
-            dist.all_gather_object(
-                all_keys, obj=all_key_str, group=dist_group_manager.get_default_group().autotune_group
-            )
+            dist.all_gather_object(all_keys, obj=all_key_str, group=self._get_autotune_group())
             is_key_all_same = all(all_keys[0] == k for k in all_keys)
             if not is_key_all_same:
                 logger.warning(
@@ -237,7 +277,7 @@ def _autotune(self, args, kwargs, static_key, run_key, rank_id, world_size):
             dist.all_gather_object(
                 all_gather_configs,
                 obj=(best_time, run_key, dict(static_key), best_config),
-                group=dist_group_manager.get_default_group().autotune_group,
+                group=self._get_autotune_group(),
             )
             all_gather_configs = sorted(all_gather_configs, key=lambda x: x[0])
             key_set = set()
@@ -318,13 +358,19 @@ def _select_args(self, param_names, args, kwargs):

     def _static_key(self, *args, **kwargs):
         params = self._select_args(self._static_key_func_param_names, args, kwargs)
-        key = self.static_key_func(*params)
-        return frozendict(key)
+        return self.static_key_func(*params)

     def _run_key(self, *args, **kwargs):
         params = self._select_args(self._run_key_func_param_names, args, kwargs)
         return self.run_key_func(*params)

+    def _get_autotune_group(
+        self,
+    ):
+        from lightllm.distributed.communication_op import dist_group_manager
+
+        return dist_group_manager.get_default_group().autotune_group
+

 class _BenchmarkState:
     def __init__(self):

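To make the new docstring concrete, here is a hypothetical application of the decorator. The kernel name, key functions, and config space are illustrative and not taken from the repository; only the decorator parameters and the injected run_config keyword come from the code above.

from lightllm.common.triton_utils.autotuner import autotune


@autotune(
    kernel_name="demo_rmsnorm",  # illustrative name, used for logging and cache file naming
    configs_gen_func=lambda: [{"BLOCK": 64}, {"BLOCK": 128}, {"BLOCK": 256}],
    static_key_func=lambda x: {"dtype": str(x.dtype)},  # selects the cache file
    run_key_func=lambda x: x.shape[0],  # indexes tuned configs within that file
)
def demo_rmsnorm(x, run_config=None):
    # The Autotuner injects run_config with the benchmarked (or closest cached) config;
    # a real kernel would use it to pick launch parameters such as the block size.
    block = (run_config or {"BLOCK": 64})["BLOCK"]
    ...

When run_config is passed explicitly by the caller, the wrapper invokes the function directly and skips tuning, as shown at the top of __call__ above.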
lightllm/utils/envs_utils.py

Lines changed: 5 additions & 7 deletions
@@ -149,15 +149,13 @@ def get_kv_quant_calibration_inference_count():
     return int(os.getenv("LIGHTLLM_KV_QUANT_CALIBRARTION_INFERENCE_COUNT", 4000))


-def is_triton_autotune_enabled():
-    # Whether Triton autotune is enabled (read-only check)
-    mark = os.getenv("LIGHTLLM_TRITON_AUTOTUNE", "False").upper() in ["ON", "TRUE", "1"]
-    return mark
+def get_triton_autotune_level():
+    return int(os.getenv("LIGHTLLM_TRITON_AUTOTUNE_LEVEL", 0))


-def disable_triton_autotune():
-    # Disable Triton autotune (setter)
-    os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "False"
+def set_triton_autotune_level(level: int):
+    os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = str(level)
+    return


 g_model_init_done = False

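Taken together, the level is read from the LIGHTLLM_TRITON_AUTOTUNE_LEVEL environment variable (default 0) and can be changed in-process through the setter. A small round-trip sketch, using the numeric values defined by AutotuneLevel above:

import os

# Assumed to be set before lightllm reads the level; 2 == AutotuneLevel.FORCE_AUTOTUNE.
os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = "2"

from lightllm.utils.envs_utils import get_triton_autotune_level, set_triton_autotune_level

assert get_triton_autotune_level() == 2
# What basemodel.py does after warmup: fall back to replaying cached configs.
set_triton_autotune_level(0)  # 0 == AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG
assert get_triton_autotune_level() == 0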