@@ -46,6 +46,7 @@ def exposed_init_model(self, kvargs):
         quant_type = self.args.vit_quant_type
         quant_cfg = self.args.vit_quant_cfg
         max_batch_size = min(self.args.visual_infer_batch_size // self.args.visual_dp, 1)
+        remote_vit = True if self.args.run_mode == "visual" else False

         self.dp_rank_id = kvargs["dp_rank_id"]
         self.tp_rank_id = kvargs["tp_rank_id"]
@@ -62,10 +63,11 @@ def exposed_init_model(self, kvargs):
6263 "quant_type" : quant_type ,
6364 "quant_cfg" : quant_cfg ,
6465 "max_batch_size" : max_batch_size ,
66+ "remote_vit" : remote_vit ,
6567 }
6668 self .model_type = model_cfg ["model_type" ]
6769 if self .model_type == "qwen" :
68- self .model = QWenVisionTransformer (** model_cfg ["visual" ]).eval ().bfloat16 ()
70+ self .model = QWenVisionTransformer (kvargs , ** model_cfg ["visual" ]).eval ().bfloat16 ()
6971 elif self .model_type == "qwen2_vl" :
7072 self .model = (
7173 Qwen2VisionTransformerPretrainedModel (kvargs , ** model_cfg ["vision_config" ]).eval ().bfloat16 ()
@@ -75,14 +77,14 @@ def exposed_init_model(self, kvargs):
                 Qwen2_5_VisionTransformerPretrainedModel(kvargs, **model_cfg["vision_config"]).eval().bfloat16()
             )
         elif model_cfg["architectures"][0] == "TarsierForConditionalGeneration":
-            self.model = TarsierVisionTransformerPretrainedModel(**model_cfg).eval().bfloat16()
+            self.model = TarsierVisionTransformerPretrainedModel(kvargs, **model_cfg).eval().bfloat16()
         elif self.model_type == "llava":
-            self.model = LlavaVisionModel()
+            self.model = LlavaVisionModel(kvargs)
         elif self.model_type == "internvl_chat":
             self.model = VisionTransformer(kvargs)
             # self.model = InternVLVisionModel()
         elif self.model_type == "gemma3":
-            self.model = Gemma3VisionModel()
+            self.model = Gemma3VisionModel(kvargs)
         else:
             raise Exception(f"can not support {self.model_type} now")
         self.model.load_model(weight_dir)