fix pos_emb: rework qwen2_vl rotary Triton kernel launch grid and indexing (#1126)

SangChengC · web-flow · commit b2649edca5c8 · 2025-11-27T19:19:27.000+08:00
diff --git a/lightllm/models/qwen2_vl/triton_kernel/rotary_pos_emb.py b/lightllm/models/qwen2_vl/triton_kernel/rotary_pos_emb.py
@@ -1,6 +1,7 @@
+import math
+import torch
 import triton
 import triton.language as tl
-import torch
 
 
 @triton.jit
@@ -16,94 +17,68 @@ def rotary_kernel(
     stride_cos_d,
     stride_sin_l,
     stride_sin_d,
-    L,
-    H,
-    D,
-    BLOCK_SEQ: tl.constexpr,
+    total_len,
+    head_num,
+    D: tl.constexpr,
     BLOCK_HEAD: tl.constexpr,
+    HALF_D: tl.constexpr,
     BLOCK_D: tl.constexpr,
 ):
-    pid_head_blk = tl.program_id(0)
-    pid_seq_blk = tl.program_id(1)
+    pid_h_block_index = tl.program_id(0).to(tl.int64)
+    pid_l_start = tl.program_id(1).to(tl.int64)
+    pid_blk = tl.program_id(2).to(tl.int64)
 
-    offs_h = pid_head_blk * BLOCK_HEAD + tl.arange(0, BLOCK_HEAD)
-    offs_l = pid_seq_blk * BLOCK_SEQ + tl.arange(0, BLOCK_SEQ)
     offs_d = tl.arange(0, BLOCK_D)
+    d = pid_blk * BLOCK_D + offs_d
+    mask = d < D
+    partner_d = tl.where(d < HALF_D, d + HALF_D, d - HALF_D)
 
-    offs_h = offs_h.to(tl.int64)
-    offs_l = offs_l.to(tl.int64)
-    offs_d = offs_d.to(tl.int64)
-
-    mask_h = offs_h < H
-    mask_l = offs_l < L
-    mask_d = offs_d < D
-
-    HALF_D = D // 2
-
-    l_b = offs_l[:, None, None]
-    h_b = offs_h[None, :, None]
-    d_b = offs_d[None, None, :]
-
-    mask = mask_l[:, None, None] & mask_h[None, :, None] & mask_d[None, None, :]
-
-    base = l_b * stride_l + h_b * stride_h + d_b * stride_d
-    x = tl.load(inp_ptr + base, mask=mask, other=0.0)
-
-    cos_base_2d = offs_l[:, None] * stride_cos_l + offs_d[None, :] * stride_cos_d
-    sin_base_2d = offs_l[:, None] * stride_sin_l + offs_d[None, :] * stride_sin_d
-    mask_ld = mask_l[:, None] & mask_d[None, :]
-
-    cos_2d = tl.load(cos_ptr + cos_base_2d, mask=mask_ld, other=0.0)
-    sin_2d = tl.load(sin_ptr + sin_base_2d, mask=mask_ld, other=0.0)
+    for pid_l in tl.range(pid_l_start, total_len, step=tl.num_programs(axis=1)):
+        cos_ptr_ = cos_ptr + pid_l * stride_cos_l + d
+        sin_ptr_ = sin_ptr + pid_l * stride_sin_l + d
+        cos = tl.load(cos_ptr_, mask=mask)
+        sin = tl.load(sin_ptr_, mask=mask)
 
-    cos = cos_2d[:, None, :]
-    sin = sin_2d[:, None, :]
+        for iter_index in tl.static_range(0, BLOCK_HEAD):
+            pid_h = pid_h_block_index * BLOCK_HEAD + iter_index
+            pid_h = tl.where(pid_h < head_num, pid_h, pid_h_block_index * BLOCK_HEAD)
+            base = pid_l * stride_l + pid_h * stride_h
+            in_ptr = inp_ptr + base + d * stride_d
+            x = tl.load(in_ptr, mask=mask, other=0.0)
 
-    partner_d = tl.where(offs_d < HALF_D, offs_d + HALF_D, offs_d - HALF_D)
-    partner_d_b = partner_d[None, None, :]
+            partner_ptr = inp_ptr + base + partner_d * stride_d
+            partner_val = tl.load(partner_ptr, mask=mask, other=0.0)
+            rotated = tl.where(d < HALF_D, -partner_val, partner_val)
 
-    partner_base = l_b * stride_l + h_b * stride_h + partner_d_b * stride_d
-    partner_val = tl.load(inp_ptr + partner_base, mask=mask, other=0.0)
+            y = x * cos + rotated * sin
 
-    rotated = tl.where(d_b < HALF_D, -partner_val, partner_val)
-
-    y = x * cos + rotated * sin
-
-    tl.store(out_ptr + base, y, mask=mask)
+            out_ptr_ = out_ptr + base + d
+            tl.store(out_ptr_, y, mask=mask)
 
 
 def apply_rotary_pos_emb_triton(
-    tensor: torch.Tensor,
-    cos: torch.Tensor,
-    sin: torch.Tensor,
+    tensor: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, BLOCK_D: int = 128
 ) -> torch.Tensor:
     assert tensor.is_cuda and cos.is_cuda and sin.is_cuda
     assert cos.is_contiguous() and sin.is_contiguous()
     if tensor.ndim != 3:
         raise RuntimeError("tensor shape should be [L, H, D]")
-
     orig_dtype = tensor.dtype
     x = tensor.float()
 
     cos = cos.repeat(1, 2).view(cos.size(0), -1).contiguous().float()
     sin = sin.repeat(1, 2).view(sin.size(0), -1).contiguous().float()
 
     L, H, D = x.shape
+    HALF_D = D // 2
     y = torch.empty_like(x)
-
-    BLOCK_SEQ = 16
-    BLOCK_HEAD = 4
-    BLOCK_D = triton.next_power_of_2(D)
-
-    if D >= 128:
-        num_warps = 8
+    if L < 1024:
+        grid_L = L
     else:
-        num_warps = 4
+        grid_L = 1024
 
-    grid = (
-        triton.cdiv(H, BLOCK_HEAD),
-        triton.cdiv(L, BLOCK_SEQ),
-    )
+    BLOCK_HEAD = 4
+    grid = (triton.cdiv(H, BLOCK_HEAD), grid_L, triton.cdiv(D, BLOCK_D))
 
     rotary_kernel[grid](
         inp_ptr=x,
@@ -117,13 +92,12 @@ def apply_rotary_pos_emb_triton(
         stride_cos_d=cos.stride(1),
         stride_sin_l=sin.stride(0),
         stride_sin_d=sin.stride(1),
-        L=L,
-        H=H,
+        total_len=L,
+        head_num=H,
         D=D,
-        BLOCK_SEQ=BLOCK_SEQ,
         BLOCK_HEAD=BLOCK_HEAD,
+        HALF_D=HALF_D,
         BLOCK_D=BLOCK_D,
-        num_warps=num_warps,
     )
 
     return y.to(orig_dtype)