import torch
import numpy as np
from PIL import Image
+ from collections import defaultdict
from typing import List, Optional, Union, Tuple

- from transformers.image_processing_utils import BaseImageProcessor
- from transformers.image_transforms import (
-     convert_to_rgb,
-     resize,
-     to_channel_dimension_format,
+ from transformers.image_processing_utils_fast import (
+     BaseImageProcessorFast,
+     group_images_by_shape,
+     reorder_images,
)
+ from torchvision.transforms import InterpolationMode
+
from transformers.image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    PILImageResampling,
-     get_image_size,
-     infer_channel_dimension_format,
-     to_numpy_array,
+     SizeDict,
)
+ from torchvision.transforms.v2 import functional as F

IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
@@ -35,9 +36,9 @@ def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:

-     if max(height, width) / min(height, width) > 200:
+     if max(height, width) / min(height, width) > MAX_RATIO:
        raise ValueError(
-             f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+             f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
        )
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
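
For reference, a small worked example of the rounding above (illustrative only, and assuming the min/max pixel clamp elided from this hunk does not trigger for these sizes):

    # Each side is rounded to the nearest multiple of `factor` (28 by default).
    smart_resize(1000, 700)   # -> (1008, 700): round(1000 / 28) * 28 == 1008, round(700 / 28) * 28 == 700
    smart_resize(640, 480)    # -> (644, 476)
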
@@ -54,7 +55,7 @@ def smart_resize(

def resize_image(
    image_file: Image.Image, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
- ) -> tuple[Image.Image, int, int]:
+ ) -> Image.Image:

    image = image_file.convert("RGB")
    width, height = image.size
@@ -71,7 +72,7 @@ def resize_image(
    return image


- class Qwen2VLImageProcessor(BaseImageProcessor):
+ class Qwen2VLImageProcessor(BaseImageProcessorFast):
    def __init__(
        self,
        do_resize: bool = True,
@@ -87,6 +88,8 @@ def __init__(
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
+         disable_grouping: Optional[bool] = None,
+         interpolation: Optional["F.InterpolationMode"] = InterpolationMode.BICUBIC,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
@@ -103,63 +106,138 @@ def __init__(
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
+         self.disable_grouping = disable_grouping
+         self.interpolation = interpolation
        self.data_format = ChannelDimension.FIRST
+         self._fused_cache = {}  # key: (do_norm, do_rescale, rescale_factor, device)
+
+     def _get_fused_mean_std(
+         self,
+         do_normalize: bool,
+         image_mean: Union[float, list[float]],
+         image_std: Union[float, list[float]],
+         do_rescale: bool,
+         rescale_factor: float,
+         device: Optional["torch.device"],
+     ) -> tuple[torch.Tensor, torch.Tensor, bool]:
+         key = (bool(do_normalize), bool(do_rescale), float(rescale_factor), str(device))
+         if key not in self._fused_cache:
+             if do_rescale and do_normalize:
+                 mean = torch.tensor(image_mean) * (1.0 / rescale_factor)
+                 std = torch.tensor(image_std) * (1.0 / rescale_factor)
+                 do_rescale = False
+             else:
+                 mean = torch.tensor(image_mean)
+                 std = torch.tensor(image_std)
+             self._fused_cache[key] = (mean.to(device=device), std.to(device=device), do_rescale)
+         return self._fused_cache[key]
+
+     def rescale_and_normalize(
+         self,
+         images: "torch.Tensor",
+         do_rescale: bool,
+         rescale_factor: float,
+         do_normalize: bool,
+         image_mean: Union[float, list[float]],
+         image_std: Union[float, list[float]],
+     ) -> "torch.Tensor":
+         """
+         Rescale and normalize images, fusing both steps into a single normalization when both are enabled.
+         """
+         image_mean, image_std, do_rescale = self._get_fused_mean_std(
+             do_normalize=do_normalize,
+             image_mean=image_mean,
+             image_std=image_std,
+             do_rescale=do_rescale,
+             rescale_factor=rescale_factor,
+             device=images.device,
+         )
+         # if/elif: when both flags are set, the rescale has already been folded into the fused mean/std above
+         if do_normalize:
+             images = self.normalize(images.to(dtype=torch.float32), image_mean, image_std)
+         elif do_rescale:
+             images = self.rescale(images, rescale_factor)
+
+         return images
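
As a quick sanity check of that fusion (a standalone sketch, not part of the processor): rescaling by `rescale_factor` and then normalizing with `(mean, std)` gives the same result as normalizing the raw values with `(mean / rescale_factor, std / rescale_factor)`.

    import torch

    x = torch.randint(0, 256, (3, 4, 4)).float()   # stand-in for uint8 image data
    s, mean, std = 1 / 255.0, 0.481, 0.269          # arbitrary rescale factor and stats
    two_step = (x * s - mean) / std                 # rescale, then normalize
    fused = (x - mean / s) / (std / s)              # single normalize with folded mean/std
    assert torch.allclose(two_step, fused, atol=1e-5)
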

    def preprocess(self, image) -> Tuple[torch.Tensor, torch.Tensor]:
-         if self.do_convert_rgb:
-             image = convert_to_rgb(image)
-         image = to_numpy_array(image)
-         input_data_format = infer_channel_dimension_format(image)
-         height, width = get_image_size(image, channel_dim=input_data_format)
-
-         resized_height, resized_width = height, width
-         if self.do_resize:
-             resized_height, resized_width = smart_resize(
-                 height,
-                 width,
-                 factor=self.patch_size * self.merge_size,
-                 min_pixels=self.min_pixels,
-                 max_pixels=self.max_pixels,
+         image_arr = np.asarray(image, dtype=np.uint8)
+         image_data = torch.from_numpy(image_arr).permute(2, 0, 1).contiguous().to("cuda", non_blocking=True)
+         grouped_images, grouped_images_index = group_images_by_shape(
+             [image_data], disable_grouping=self.disable_grouping
+         )
+         resized_images_grouped = {}
+         for shape, stacked_images in grouped_images.items():
+             height, width = stacked_images.shape[-2:]
+             if self.do_resize:
+                 resized_height, resized_width = smart_resize(
+                     height,
+                     width,
+                     factor=self.patch_size * self.merge_size,
+                     min_pixels=self.min_pixels,
+                     max_pixels=self.max_pixels,
+                 )
+                 stacked_images = self.resize(
+                     image=stacked_images,
+                     size=SizeDict(height=resized_height, width=resized_width),
+                     interpolation=self.interpolation,
+                 )
+             resized_images_grouped[shape] = stacked_images
+         resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+         # Group images by size for further processing
+         # Needed in case do_resize is False, or resize returns images with different sizes
+         grouped_images, grouped_images_index = group_images_by_shape(
+             resized_images, disable_grouping=self.disable_grouping
+         )
+         processed_images_grouped = {}
+         processed_grids = {}
+         for shape, stacked_images in grouped_images.items():
+             resized_height, resized_width = stacked_images.shape[-2:]
+             # Fused rescale and normalize
+             patches = self.rescale_and_normalize(
+                 stacked_images, self.do_rescale, self.rescale_factor, self.do_normalize, self.image_mean, self.image_std
            )
-             image = resize(
-                 image, size=(resized_height, resized_width), resample=self.resample, input_data_format=input_data_format
+             if patches.ndim == 4:
+                 # add a temporal dimension if we have images
+                 patches = patches.unsqueeze(1)
+             if patches.shape[1] % self.temporal_patch_size != 0:
+                 repeats = patches[:, -1:].repeat(1, self.temporal_patch_size - 1, 1, 1, 1)
+                 patches = torch.cat([patches, repeats], dim=1)
+             batch_size, grid_t, channel = patches.shape[:3]
+             grid_t = grid_t // self.temporal_patch_size
+             grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
+
+             patches = (
+                 patches.view(
+                     batch_size,
+                     grid_t,
+                     self.temporal_patch_size,
+                     channel,
+                     grid_h // self.merge_size,
+                     self.merge_size,
+                     self.patch_size,
+                     grid_w // self.merge_size,
+                     self.merge_size,
+                     self.patch_size,
+                 )
+                 .permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
+                 .contiguous()
            )
-
-         if self.do_rescale:
-             image = self.rescale(image, scale=self.rescale_factor, input_data_format=input_data_format)
-
-         if self.do_normalize:
-             image = self.normalize(
-                 image=image, mean=self.image_mean, std=self.image_std, input_data_format=input_data_format
+             # The permute above groups grid and patch dimensions so they can be flattened together below:
+             # (batch, grid_t, grid_h, grid_w, merge_h, merge_w, channel, temporal_patch_size, patch_h, patch_w)
+             flatten_patches = patches.view(
+                 batch_size,
+                 grid_t * grid_h * grid_w,
+                 channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            )

-         image = to_channel_dimension_format(image, self.data_format, input_channel_dim=input_data_format)
-
-         patches = np.array([image])
-
-         if patches.shape[0] == 1:
-             # why to copy image 2 times. use self.temporal_patch_size = 2.
-             patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
-         channel = patches.shape[1]
-         grid_t = patches.shape[0] // self.temporal_patch_size
-         grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
-         patches = patches.reshape(
-             grid_t,
-             self.temporal_patch_size,
-             channel,
-             grid_h // self.merge_size,
-             self.merge_size,
-             self.patch_size,
-             grid_w // self.merge_size,
-             self.merge_size,
-             self.patch_size,
-         )
-         patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
-         flatten_patches = patches.reshape(
-             grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
-         )
-         image_grid_thw = (grid_t, grid_h, grid_w)
-         pixel_values = torch.as_tensor(flatten_patches)
-         grid_thw = torch.as_tensor([image_grid_thw])
+             processed_images_grouped[shape] = flatten_patches
+             processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
+
+         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+         processed_grids = reorder_images(processed_grids, grouped_images_index)
+         pixel_values = torch.cat(processed_images, dim=0)  # (num_patches_total, C * T * ps * ps)
+         image_grid_thw = torch.as_tensor(processed_grids)

-         return pixel_values, grid_thw
+         return pixel_values, image_grid_thw
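
A minimal usage sketch of the new fast path (assumptions: a CUDA device is available, since `preprocess` moves pixel data to `"cuda"`; the constructor defaults elided from this diff populate `do_resize`, `do_rescale`, `do_normalize`, `image_mean`, `image_std`, `min_pixels`, and `max_pixels`; and `"example.jpg"` is a placeholder path):

    from PIL import Image

    processor = Qwen2VLImageProcessor()
    img = Image.open("example.jpg").convert("RGB")   # any RGB image; preprocess resizes it internally
    pixel_values, image_grid_thw = processor.preprocess(img)
    # For an image that smart_resize maps to 1008 x 700 (patch_size=14, merge_size=2, temporal_patch_size=2):
    #   grid_t, grid_h, grid_w = 1, 72, 50               -> 3600 patches
    #   pixel_values.shape     == (3600, 3 * 2 * 14 * 14) == (3600, 1176)
    #   image_grid_thw         == tensor([[1, 72, 50]])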