Commit a4ff3c6

support-whisper-longaudio (#1128)
1 parent b2649ed commit a4ff3c6

File tree

2 files changed: +76, -18 lines changed


lightllm/models/internvl/model.py

Lines changed: 33 additions & 10 deletions
@@ -19,6 +19,7 @@
 )
 from lightllm.models.internvl.layer_weights.pre_and_post_layer_weight import InternVLInternlm2PreAndPostLayerWeight
 from lightllm.models.vit import get_image_patch_func
+from lightllm.models.whisper.defaults import MIN_AUDIO_LEN

 IMG_START_TOKEN = "<img>"
 IMG_END_TOKEN = "</img>"
@@ -47,6 +48,9 @@ def __init__(self, tokenizer, model_cfg, **kwargs):
         self.audio_end_id = tokenizer.convert_tokens_to_ids(self.audio_end_tag)
         self.get_image_patch_func = get_image_patch_func(kwargs["weight_dir"])

+        self.audio_min_length = MIN_AUDIO_LEN
+        self.audio_max_length = 16000 * 30
+
     def init_imageitem_extral_params(
         self, img: ImageItem, multi_params: MultimodalParams, sampling_params: SamplingParams
     ):
@@ -81,16 +85,35 @@ def get_image_token_length(self, img: ImageItem):

     def get_audio_token_length(self, audio: AudioItem):
         L = audio.audio_length
-        L = L if L <= 480000 else 480000  # max_length < 30s
-        mel_len = L // 160
-        dilation = 1
-        L_in = mel_len
-        for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
-            L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
-            L_out = 1 + L_out // stride
-            L_in = L_out
-        audio_len_after_cnn = L_out
-        audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
+        audio_token_num = 0
+        chunk_lens = []
+        if L <= self.audio_max_length:
+            cur_len = L
+            if cur_len < self.audio_min_length:
+                cur_len = self.audio_min_length
+            chunk_lens.append(cur_len)
+        else:
+            start = 0
+            while start < L:
+                end = min(start + self.audio_max_length, L)
+                cur_len = end - start
+
+                if cur_len < self.audio_min_length:
+                    cur_len = self.audio_min_length
+
+                chunk_lens.append(cur_len)
+                start = end
+        for chunk_len in chunk_lens:
+            mel_len = chunk_len // 160
+            dilation = 1
+            L_in = mel_len
+            for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
+                L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
+                L_out = 1 + L_out // stride
+                L_in = L_out
+            audio_len_after_cnn = L_out
+            chunk_token_num = (audio_len_after_cnn - 2) // 2 + 1
+            audio_token_num += int(chunk_token_num)
         return audio_token_num

     # only change the impl of the encode func:
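For reference, a minimal standalone sketch of the token-count math in the hunk above. The helper name audio_token_length is hypothetical; min_len stands in for MIN_AUDIO_LEN, whose concrete value lives in lightllm.models.whisper.defaults and is not shown in this diff, so the 480 in the example call is illustrative only.

def audio_token_length(num_samples: int, min_len: int, max_len: int = 16000 * 30) -> int:
    # Split the waveform into chunks of at most 30 s (480000 samples at 16 kHz);
    # any chunk shorter than min_len is counted as if padded up to min_len.
    chunk_lens = []
    start = 0
    while start < num_samples or not chunk_lens:
        end = min(start + max_len, num_samples)
        chunk_lens.append(max(end - start, min_len))
        start = end

    total = 0
    for chunk_len in chunk_lens:
        # 160-sample hop -> mel frames, then the two front-end convolutions:
        # (padding=1, kernel=3, stride=1) and (padding=1, kernel=3, stride=2).
        l_in = chunk_len // 160
        for padding, kernel, stride in [(1, 3, 1), (1, 3, 2)]:
            l_in = 1 + (l_in + 2 * padding - (kernel - 1) - 1) // stride
        # Every 2 post-CNN frames map to 1 token.
        total += (l_in - 2) // 2 + 1
    return total

# A 75 s clip at 16 kHz splits into 30 s + 30 s + 15 s chunks:
# 750 + 750 + 375 = 1875 tokens.
print(audio_token_length(75 * 16000, min_len=480))  # min_len value illustrative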

lightllm/models/whisper/whisper_audio.py

Lines changed: 43 additions & 8 deletions
@@ -162,9 +162,12 @@ def forward(self, audio_values, audio_lens_after_cnn):
         return x

     def encode(self, audio_items: List[AudioItem]):
+        # each element is one chunk
         batch_audios = []
-        batch_audio_lens = np.zeros(len(audio_items), dtype=np.int32)
+        batch_audio_lens = []
         uuids = []
+        # record which audio_items index each chunk belongs to
+        chunk_owner_index = []
         for i, item in enumerate(audio_items):
             if isinstance(item, AudioItem):
                 uuids.append(item.uuid)
@@ -180,8 +183,25 @@ def encode(self, audio_items: List[AudioItem]):
             if audio.shape[0] < MIN_AUDIO_LEN:
                 audio = np.pad(audio, (0, MIN_AUDIO_LEN - len(audio)), mode="constant", constant_values=0.0)

-            batch_audio_lens[i] = min(audio.shape[0], self.max_length)
-            batch_audios.append(audio)
+            if audio.shape[0] > self.max_length:
+                start = 0
+                while start < audio.shape[0]:
+                    end = min(start + self.max_length, audio.shape[0])
+                    chunk = audio[start:end]
+
+                    if chunk.shape[0] < MIN_AUDIO_LEN:
+                        chunk = np.pad(chunk, (0, MIN_AUDIO_LEN - chunk.shape[0]), mode="constant", constant_values=0.0)
+                    batch_audios.append(chunk)
+                    batch_audio_lens.append(min(chunk.shape[0], self.max_length))
+                    chunk_owner_index.append(i)
+
+                    start = end
+            else:
+                batch_audio_lens.append(min(audio.shape[0], self.max_length))
+                batch_audios.append(audio)
+                chunk_owner_index.append(i)
+
+        batch_audio_lens = np.array(batch_audio_lens, dtype=np.int32)

         audios, audio_lens_after_cnn = self.audio_processor(
             batch_audios, batch_audio_lens, sampling_rate=16000, return_tensors="pt"
@@ -190,13 +210,28 @@ def encode(self, audio_items: List[AudioItem]):
         audio_lens_after_cnn = np.array(audio_lens_after_cnn, dtype=np.int32)
         audio_token_num = (audio_lens_after_cnn - 2) // 2 + 1

+        num_audios = len(audio_items)
+        per_audio_embeds = [[] for _ in range(num_audios)]
+
+        for chunk_idx, owner in enumerate(chunk_owner_index):
+            token_len = int(audio_token_num[chunk_idx])
+            if token_len <= 0:
+                continue
+            per_audio_embeds[owner].append(audios[chunk_idx][:token_len])
+
         ready_audio = obtain(self.cache_client.root.get_items_embed(uuids))
         ids_to_set = []
         for i, ready in enumerate(ready_audio):
-            if not ready:
-                uid = uuids[i]
-                cur_embed_bytes = tensor2bytes(audios[i][: audio_token_num[i]])
-                create_shm(get_shm_name_embed(uid), cur_embed_bytes)
-                ids_to_set.append(uid)
+            if ready:
+                continue
+
+            uid = uuids[i]
+
+            # concatenate all chunk embeddings for this audio
+            cur_embed = torch.cat(per_audio_embeds[i], dim=0)
+            cur_embed_bytes = tensor2bytes(cur_embed)
+            create_shm(get_shm_name_embed(uid), cur_embed_bytes)
+            ids_to_set.append(uid)
+
         if ids_to_set:
             self.cache_client.root.set_items_embed(ids=ids_to_set)
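A toy illustration, not part of the commit, of how chunk_owner_index regroups per-chunk embeddings back into one embedding per input audio. The shapes are hypothetical stand-ins (tokens x hidden) for a 30 s + 15 s split clip and a single 30 s clip.

import torch

# Chunks 0 and 1 come from audio 0 (a long clip split in two); chunk 2 is audio 1.
chunk_embeds = [torch.randn(750, 1280), torch.randn(375, 1280), torch.randn(750, 1280)]
chunk_owner_index = [0, 0, 1]

per_audio_embeds = [[] for _ in range(2)]
for chunk_idx, owner in enumerate(chunk_owner_index):
    per_audio_embeds[owner].append(chunk_embeds[chunk_idx])

# Concatenating along the token axis yields one embedding per original audio.
merged = [torch.cat(chunks, dim=0) for chunks in per_audio_embeds]
assert merged[0].shape == (750 + 375, 1280)
assert merged[1].shape == (750, 1280)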
