From 98d9804f585e5d07b18d96915c569f0220918e1f Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Fri, 5 Dec 2025 10:01:59 +0000 Subject: [PATCH 1/3] fix health req id gen when httpserver worker num > 1 --- lightllm/utils/health_check.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lightllm/utils/health_check.py b/lightllm/utils/health_check.py index ee0778b65..f6c52bdb3 100644 --- a/lightllm/utils/health_check.py +++ b/lightllm/utils/health_check.py @@ -8,16 +8,12 @@ from lightllm.server.httpserver.manager import HttpServerManager from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt from fastapi import Request -from lightllm.server.req_id_generator import ReqIDGenerator from lightllm.utils.log_utils import init_logger from lightllm.utils.envs_utils import get_unique_server_name, get_env_start_args logger = init_logger(__name__) -_g_health_req_id_gen = ReqIDGenerator() - - @dataclass class HealthObj: _is_health: bool = False @@ -81,9 +77,9 @@ async def health_check(args, httpserver_manager: HttpServerManager, request: Req if get_env_start_args().run_mode == "pd_master": # Since the id assigned by pd master needs to be passed to prefill and decode nodes for inference, # a normal request id is required instead of a negative id. - sampling_params.group_request_id = _g_health_req_id_gen.generate_id() + sampling_params.group_request_id = httpserver_manager.id_gen.generate_id() else: - sampling_params.group_request_id = -_g_health_req_id_gen.generate_id() # health monitor 的 id 是负的 + sampling_params.group_request_id = -httpserver_manager.id_gen.generate_id() # health monitor 的 id 是负的 multimodal_params_dict = request_dict.get("multimodal_params", {}) multimodal_params = MultimodalParams(**multimodal_params_dict) results_generator = httpserver_manager.generate( From bd9a2a71f829af6ab8a9e5bf2d1f0891ad9deb97 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Fri, 5 Dec 2025 10:13:12 +0000 Subject: [PATCH 2/3] fix --- lightllm/server/req_id_generator.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lightllm/server/req_id_generator.py b/lightllm/server/req_id_generator.py index 445b6213e..ac76ef424 100644 --- a/lightllm/server/req_id_generator.py +++ b/lightllm/server/req_id_generator.py @@ -27,6 +27,7 @@ def __init__(self): self.current_id.arr[0] = 0 self.current_id.arr[1] = 0 self.lock = AtomicShmLock(f"{get_unique_server_name()}_req_id_gen_lock") + self.alloced_max_req_id: int = 0 def _check_and_set_new_id_range(self): need_update_range = self.current_id.arr[0] + MAX_BEST_OF >= self.current_id.arr[1] @@ -61,6 +62,13 @@ def generate_id(self): self._check_and_set_new_id_range() id = self.current_id.arr[0] self.current_id.arr[0] += MAX_BEST_OF + + # 请求 id 在 多 httpserver worker情况的极端启动情况下,可能存在请求id对应的共享内存被重新修改 + # 导致请求 id 错乱的问题。 + self.alloced_max_req_id = max(self.alloced_max_req_id, id) + if id < self.alloced_max_req_id: + logger.error(f"alloc req_id error, current alloc id {id} < max alloced id {self.alloced_max_req_id}") + return id From 02fc87652d8e5e5dfbc2a0a777fb6717f6e34016 Mon Sep 17 00:00:00 2001 From: wangzaijun Date: Fri, 5 Dec 2025 11:22:57 +0000 Subject: [PATCH 3/3] fix --- lightllm/server/req_id_generator.py | 76 ++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/lightllm/server/req_id_generator.py b/lightllm/server/req_id_generator.py index ac76ef424..9bf9040c3 100644 --- a/lightllm/server/req_id_generator.py +++ b/lightllm/server/req_id_generator.py @@ -1,3 +1,6 @@ +import os +import psutil +import sys import time import requests import numpy as np @@ -27,7 +30,44 @@ def __init__(self): self.current_id.arr[0] = 0 self.current_id.arr[1] = 0 self.lock = AtomicShmLock(f"{get_unique_server_name()}_req_id_gen_lock") - self.alloced_max_req_id: int = 0 + self._wait_all_workers_ready() + logger.info("ReqIDGenerator init finished") + + def _wait_all_workers_ready(self): + from lightllm.utils.envs_utils import get_unique_server_name + from lightllm.server.core.objs.shm_array import ShmArray + + _sync_shm = ShmArray( + f"{get_unique_server_name()}_httpworker_start_sync", (self.args.httpserver_workers,), dtype=np.int64 + ) + _sync_shm.create_shm() + # 等待所有 httpserver 的 worker 启动完成,防止重新初始化对应的请求id 对应的shm + try_count = 0 + while len(_find_sibling_processes()) + 1 != self.args.httpserver_workers: + time.sleep(0.1) + try_count += 1 + if try_count > 120: + logger.error("wait all httpserver workers start failed") + sys.exit(-1) + else: + continue + + cur_p_id = os.getpid() + pids = _find_sibling_processes() + pids.append(cur_p_id) + assert len(pids) == self.args.httpserver_workers + pids = sorted(pids) + index = pids.index(cur_p_id) + _sync_shm.arr[index] = cur_p_id + try_count = 0 + while not all(a == b for a, b in zip(pids, _sync_shm.arr)): + time.sleep(0.1) + try_count += 1 + if try_count > 120: + logger.error("wait all httpserver workers start failed 1") + sys.exit(-1) + else: + continue def _check_and_set_new_id_range(self): need_update_range = self.current_id.arr[0] + MAX_BEST_OF >= self.current_id.arr[1] @@ -62,15 +102,35 @@ def generate_id(self): self._check_and_set_new_id_range() id = self.current_id.arr[0] self.current_id.arr[0] += MAX_BEST_OF - - # 请求 id 在 多 httpserver worker情况的极端启动情况下,可能存在请求id对应的共享内存被重新修改 - # 导致请求 id 错乱的问题。 - self.alloced_max_req_id = max(self.alloced_max_req_id, id) - if id < self.alloced_max_req_id: - logger.error(f"alloc req_id error, current alloc id {id} < max alloced id {self.alloced_max_req_id}") - return id def convert_sub_id_to_group_id(sub_req_id): return (sub_req_id // MAX_BEST_OF) * MAX_BEST_OF + + +def _find_sibling_processes(): + # 获取当前进程的 PID + current_pid = os.getpid() + + # 获取当前进程的信息 + current_process = psutil.Process(current_pid) + + # 获取当前进程的父进程 + parent_process = current_process.parent() + + if parent_process is None: + logger.error("Current process has no parent.") + return [] + + # 查找兄弟进程 + sibling_processes = [] + for proc in psutil.process_iter(["pid", "name"]): + try: + # 检查是否是兄弟进程(同一父进程且不是当前进程) + if proc.pid != current_pid and proc.ppid() == parent_process.pid: + sibling_processes.append(proc) + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + + return [proc.pid for proc in sibling_processes]