from lightllm.common.flash_attn import flash_attn_with_kvcache_mtp
from lightllm.utils.bench_utils import do_bench

+
def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False):
    query = query.float()
    key = key.float()
@@ -36,8 +37,7 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False):
    s_q = query.shape[-2]
    s_k = key.shape[-2]
    attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype, device=query.device)
-    temp_mask = torch.ones(
-        s_q, s_k, dtype=torch.bool, device=query.device).tril(diagonal=s_k - s_q)
+    temp_mask = torch.ones(s_q, s_k, dtype=torch.bool, device=query.device).tril(diagonal=s_k - s_q)
    attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
    attn_bias.to(query.dtype)
    attn_weight += attn_bias
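As a quick check of the mask the hunk above reflows: `tril(diagonal=s_k - s_q)` keeps, for each of the trailing s_q query rows, exactly the keys up to that row's absolute position. A minimal sketch, with shapes chosen purely for illustration:

```python
import torch

# s_q trailing queries against s_k cached keys; s_q=2, s_k=4 are
# illustrative values, not the benchmark defaults.
s_q, s_k = 2, 4
temp_mask = torch.ones(s_q, s_k, dtype=torch.bool).tril(diagonal=s_k - s_q)
print(temp_mask)
# tensor([[ True,  True,  True, False],
#         [ True,  True,  True,  True]])
```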
@@ -47,8 +47,9 @@ def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False):


@torch.inference_mode()
-def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q,
-                  h_kv, d, dv, causal, dtype):
+def run_torch_mla(
+    q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype
+):
    # q: [b, s_q, h_q, d]
    # block_table: [b, max_seqlen_pad // block_size]
    # blocked_k: [b * max_seqlen_pad // block_size, block_size, h_kv, d]
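The layout comments above describe a paged KV cache: `block_table` maps each request to its blocks in `blocked_k`. A toy gather, with hypothetical sizes, shows how one request's keys are reassembled contiguously:

```python
import torch

# Toy paged-KV gather matching the layout comments above; the sizes
# (2 requests, block_size=1, h_kv=1, d=4) are hypothetical.
b, max_seqlen_pad, block_size, h_kv, d = 2, 4, 1, 1, 4
block_table = torch.arange(b * max_seqlen_pad, dtype=torch.int32).view(b, max_seqlen_pad)
blocked_k = torch.randn(b * max_seqlen_pad, block_size, h_kv, d)
# Keys of request 0, gathered into one contiguous [seqlen, h_kv, d] tensor:
k0 = blocked_k[block_table[0].long()].view(max_seqlen_pad * block_size, h_kv, d)
print(k0.shape)  # torch.Size([4, 1, 4])
```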
@@ -77,27 +78,35 @@ def ref_mla():
    return out_torch


-def run_fa3_mla_mtp(mtp_size, q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens,
-                    h_q, h_kv, d, dv, causal, dtype):
+def run_fa3_mla_mtp(
+    mtp_size,
+    q,
+    block_table,
+    blocked_k,
+    max_seqlen_pad,
+    block_size,
+    b,
+    s_q,
+    cache_seqlens,
+    h_q,
+    h_kv,
+    d,
+    dv,
+    causal,
+    dtype,
+):

    assert d > dv, "mla with rope dim should be larger than no rope dim"
    q_nope, q_pe = q[..., :dv].contiguous(), q[..., dv:].contiguous()
-    blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[...,
-                                                                               dv:].contiguous()
+    blocked_k_nope, blocked_k_pe = blocked_k[..., :dv].contiguous(), blocked_k[..., dv:].contiguous()

    dpe = d - dv
-    num_kv_splits = 1
-
-    out_partial = torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device)
-    glse = torch.empty(b, h_q, num_kv_splits, dtype=dtype, device=q.device)

    batch_mtp = b // mtp_size
-    cu_seqlens_q = torch.arange(
-        0, batch_mtp + 1, step=s_q, dtype=torch.int32, device=q.device
-    )
+    cu_seqlens_q = torch.arange(0, batch_mtp + 1, step=s_q, dtype=torch.int32, device=q.device)
    cu_seqlens_k = torch.cumsum(cache_seqlens, dim=0)
    cu_seqlens_k = torch.cat([torch.tensor([0]).to(cu_seqlens_k), cu_seqlens_k])
-    scale = (1.0 / (dv + dpe))**0.5  # log2(e)
+    scale = (1.0 / (dv + dpe)) ** 0.5  # softmax scale = 1 / sqrt(d)
    k_descale, v_descale = None, None
    BLOCK_H = h_q * mtp_size

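The varlen bookkeeping built here is easiest to see on a toy case: cu_seqlens_q and cu_seqlens_k are cumulative sequence-length offsets with a leading 0, marking where each request's queries and keys start. A minimal sketch, assuming batch_mtp = 3 decode requests with s_q = 1 and made-up cache lengths [5, 7, 6]:

```python
import torch

# Toy version of the cu_seqlens construction above; the lengths
# [5, 7, 6] are hypothetical.
s_q, batch_mtp = 1, 3
cache_seqlens = torch.tensor([5, 7, 6], dtype=torch.int32)
cu_seqlens_q = torch.arange(0, batch_mtp + 1, step=s_q, dtype=torch.int32)
cu_seqlens_k = torch.cumsum(cache_seqlens, dim=0)
cu_seqlens_k = torch.cat([torch.tensor([0]).to(cu_seqlens_k), cu_seqlens_k])
print(cu_seqlens_q.tolist())  # [0, 1, 2, 3]
print(cu_seqlens_k.tolist())  # [0, 5, 12, 18]
```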
@@ -119,23 +128,24 @@ def flash_mla_fa3():
            k_descale=k_descale,
            v_descale=v_descale,
            return_softmax_lse=False,
-            mtp_step=1
+            mtp_step=1,
        )
        return out.view([b, s_q, h_q, dv])

    out_flash = flash_mla_fa3()
    t = do_bench(flash_mla_fa3)

-    out_ref = run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q,
-                            cache_seqlens, h_q, h_kv, d, dv, causal, dtype)
-
-    # Compute the relative absolute error
+    out_ref = run_torch_mla(
+        q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype
+    )
+
+    # Compute the relative absolute error
    def print_error(a, b, name=""):
        max_absolute_error = torch.abs(a - b).max()
        relative_abs_error = torch.abs(a - b) / (torch.abs(a) + 1e-4)
        max_relative_abs_error = relative_abs_error.max()
        mean_relative_abs_error = relative_abs_error.mean()
-
+
        print(f"{name}: Maximum absolute difference: {max_absolute_error:.6e}")
        print(f"Maximum relative absolute error: {max_relative_abs_error:.6e}")
        print(f"Mean relative absolute error: {mean_relative_abs_error:.6e}")
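For a feel of what these metrics report, here is the same arithmetic on toy tensors (values made up; print_error itself is a local helper, so this sketch recomputes the metrics inline):

```python
import torch

# Toy check of the error metrics above (values are made up).
a = torch.tensor([1.0, 2.0, 4.0])
b = torch.tensor([1.0, 2.5, 4.0])
print(torch.abs(a - b).max())  # tensor(0.5000)
rel = torch.abs(a - b) / (torch.abs(a) + 1e-4)
print(rel.max())   # tensor(0.2500), exactly 0.5 / 2.0001
print(rel.mean())  # tensor(0.0833)
```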
@@ -148,13 +158,13 @@ def print_error(a, b, name=""):

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('--batch', type=int, default=128, help='batch size')
-    parser.add_argument('--h_q', type=int, default=16, help='q heads number')
-    parser.add_argument('--h_kv', type=int, default=1, help='kv heads number')
-    parser.add_argument('--cache_seqlen', type=int, default=8192, help='kv cache context length')
-    parser.add_argument('--d', type=int, default=576, help='query/key head dim, d = dv + dpe')
-    parser.add_argument('--dv', type=int, default=512, help='value head dim')
-    parser.add_argument('--mtp_size', type=int, default=2, help='Specifies the number of tokens per prediction.')
+    parser.add_argument("--batch", type=int, default=128, help="batch size")
+    parser.add_argument("--h_q", type=int, default=16, help="q heads number")
+    parser.add_argument("--h_kv", type=int, default=1, help="kv heads number")
+    parser.add_argument("--cache_seqlen", type=int, default=8192, help="kv cache context length")
+    parser.add_argument("--d", type=int, default=576, help="query/key head dim, d = dv + dpe")
+    parser.add_argument("--dv", type=int, default=512, help="value head dim")
+    parser.add_argument("--mtp_size", type=int, default=2, help="Specifies the number of tokens per prediction.")
    args = parser.parse_args()
    b, h_q, h_kv, cache_seqlen, d, dv = args.batch, args.h_q, args.h_kv, args.cache_seqlen, args.d, args.dv
    mtp_size = args.mtp_size
@@ -165,28 +175,41 @@ def print_error(a, b, name=""):
    s_q = 1  # for decode, s_q = 1
    block_size = 1
    batch_mtp = b // mtp_size
-    cache_seqlens = torch.tensor([cache_seqlen + i for i in range(batch_mtp)],
-                                 dtype=torch.int32,
-                                 device=device)
+    cache_seqlens = torch.tensor([cache_seqlen + i for i in range(batch_mtp)], dtype=torch.int32, device=device)
    # print(cache_seqlens[-1])
    dpe = d - dv
    causal = True

    total_seqlens = cache_seqlens.sum().item()
    mean_seqlens = cache_seqlens.float().mean().int().item()
    max_seqlen = cache_seqlens.max().item()
-    max_seqlen_pad = math.ceil(max_seqlen / 256) * 256  # ? why align to 256
+    max_seqlen_pad = math.ceil(max_seqlen / 256) * 256  # ? why align to 256

    total_flops = s_q * (total_seqlens * 2 - batch_mtp) * h_q * (d + dv) * 2

    q = torch.randn(b, s_q, h_q, d, dtype=dtype, device=device)
-    block_table = torch.arange(
-        batch_mtp * max_seqlen_pad, dtype=torch.int32,
-        device=device).view(batch_mtp, max_seqlen_pad)
+    block_table = torch.arange(batch_mtp * max_seqlen_pad, dtype=torch.int32, device=device).view(
+        batch_mtp, max_seqlen_pad
+    )

    blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d, dtype=dtype, device=device)
-    out_flash, latency = run_fa3_mla_mtp(mtp_size, q, block_table, blocked_k, max_seqlen_pad, block_size, b,
-                                         s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype)
+    out_flash, latency = run_fa3_mla_mtp(
+        mtp_size,
+        q,
+        block_table,
+        blocked_k,
+        max_seqlen_pad,
+        block_size,
+        b,
+        s_q,
+        cache_seqlens,
+        h_q,
+        h_kv,
+        d,
+        dv,
+        causal,
+        dtype,
+    )

    print("Tile-lang: {:.3f} ms".format(latency))
    print("Tile-lang: {:.3f} TFlops".format(total_flops / latency * 1e-9))
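For intuition, the total_flops expression feeding these prints can be checked numerically. A sketch with the benchmark defaults, assuming mtp_size = 2 (the constant 2 in total_seqlens * 2 appears to hardcode it): reading the formula, each request's two MTP query tokens causally attend to L - 1 and L keys, i.e. 2L - 1 query/key pairs per request of length L, which matches the (total_seqlens * 2 - batch_mtp) term; each pair costs h_q * (d + dv) multiply-accumulates across QK^T and PV, i.e. twice that in FLOPs.

```python
# Hedged sanity check of total_flops with the benchmark defaults;
# the 2L - 1 reading of (total_seqlens * 2 - batch_mtp) is an
# interpretation, not taken from the source.
b, mtp_size, h_q, d, dv, cache_seqlen, s_q = 128, 2, 16, 576, 512, 8192, 1
batch_mtp = b // mtp_size
total_seqlens = sum(cache_seqlen + i for i in range(batch_mtp))
total_flops = s_q * (total_seqlens * 2 - batch_mtp) * h_q * (d + dv) * 2
print(f"{total_flops / 1e12:.3f} TFLOPs")  # ~0.037 TFLOPs per call
```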