@@ -13,7 +13,7 @@
 from frozendict import frozendict
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-from typing import Union, List, Tuple, Dict, Optional
+from typing import Union, List, Tuple, Dict, Optional, AsyncGenerator
 from websockets import ClientConnection
 from fastapi import Request
 from ..tokenizer import get_tokenizer
@@ -266,7 +266,7 @@ async def generate(
         nixl_pd_upload_websocket: ClientConnection = None,
         # Used to wait for the exchange info sent down by pd_master
         nixl_pd_event: asyncio.Event = None,
-    ) -> Tuple[int, str, dict, FinishStatus]:
+    ) -> AsyncGenerator[Tuple[int, str, dict, FinishStatus], None]:
         start_time = time.time()
         request_headers = request.headers if request is not None else {}
         group_request_id = self.alloc_req_id(sampling_params, is_health_req)
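
The annotation fix matters for callers: a body containing `yield` makes `generate` an async generator, so calling it returns something to iterate with `async for`, not a coroutine resolving to a single `Tuple[int, str, dict, FinishStatus]`. A minimal self-contained sketch of the consuming pattern; the `FinishStatus` stub and `fake_generate` below are invented for illustration:

    import asyncio
    from typing import AsyncGenerator, Dict, Tuple

    class FinishStatus:
        # Stand-in stub for the server's FinishStatus (an assumption of this sketch).
        def __init__(self, finished: bool) -> None:
            self._finished = finished

        def is_finished(self) -> bool:
            return self._finished

    async def fake_generate() -> AsyncGenerator[Tuple[int, str, Dict, FinishStatus], None]:
        # `yield` makes this an async generator, matching the corrected annotation.
        for i, token in enumerate(["Hel", "lo", "!"]):
            yield i, token, {}, FinishStatus(finished=(token == "!"))

    async def main() -> None:
        # Iterate the generator; do not `await fake_generate()`.
        async for sub_req_id, out_str, metadata, finish_status in fake_generate():
            print(sub_req_id, out_str, finish_status.is_finished())

    asyncio.run(main())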
@@ -569,6 +569,7 @@ async def _wait_to_token_package(
 
             prompt_cache_len = metadata.pop("prompt_cache_len", 0)
             cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0)
+            disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
             if is_first_token:
                 first_token_cost_ms = (time.time() - start_time) * 1000
                 is_first_token = False
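
Each statistic is read with `dict.pop(key, default)`, so metadata from a producer that does not yet report the disk tier degrades to 0 instead of raising `KeyError`. A tiny illustration with made-up values:

    # Metadata from a producer that predates the disk cache statistic.
    metadata = {"prompt_cache_len": 128, "cpu_prompt_cache_len": 256}

    # pop() removes the key if present and otherwise falls back to the default.
    disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
    assert disk_prompt_cache_len == 0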
@@ -591,6 +592,8 @@ async def _wait_to_token_package(
                 x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
                 x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
                 prompt_cache_ratio = prompt_cache_len / prompt_tokens
+                cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens
+                disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens
 
                 mtp_avg_token_per_step = out_token_counter / max(
                     (out_token_counter - metadata["mtp_accepted_token_num"]), 1
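
Both new ratios mirror the existing `prompt_cache_ratio`: each tier's cached-prefix length divided by the total prompt length, so 768 of 1024 prompt tokens found in the CPU cache gives 0.75. A standalone sketch of the arithmetic; the `max(..., 1)` guard against an empty prompt is an addition of this sketch, not something the patch does:

    from typing import Tuple

    def cache_hit_ratios(
        prompt_cache_len: int,
        cpu_prompt_cache_len: int,
        disk_prompt_cache_len: int,
        prompt_tokens: int,
    ) -> Tuple[float, float, float]:
        # The server divides by prompt_tokens directly; guard here for illustration.
        denom = max(prompt_tokens, 1)
        return (
            prompt_cache_len / denom,
            cpu_prompt_cache_len / denom,
            disk_prompt_cache_len / denom,
        )

    print(cache_hit_ratios(512, 768, 0, 1024))  # (0.5, 0.75, 0.0)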
@@ -603,10 +606,15 @@ async def _wait_to_token_package(
                     f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} "
                     f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms "
                     f"prompt_token_num:{prompt_tokens} "
+                    f"gpu_cache_hit:{prompt_cache_len > 0} "
                     f"prompt_cache_len:{prompt_cache_len} "
                     f"prompt_cache_ratio:{prompt_cache_ratio} "
+                    f"cpu_cache_hit:{cpu_prompt_cache_len > 0} "
                     f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
-                    f"used_cpu_prompt_cache_len:{max(0, cpu_prompt_cache_len - prompt_cache_len)} "
+                    f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
+                    f"disk_cache_hit:{disk_prompt_cache_len > 0} "
+                    f"disk_prompt_cache_len:{disk_prompt_cache_len} "
+                    f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
                     f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
                 )
                 if group_request_id < 0:
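
With these fields in place, a finished request logs hit flag, cached-prefix length, and hit ratio for each of the three cache tiers in one space-delimited record. A sketch of how just the new fields render, with invented values:

    prompt_tokens = 1024
    prompt_cache_len, cpu_prompt_cache_len, disk_prompt_cache_len = 512, 768, 0

    record = (
        f"gpu_cache_hit:{prompt_cache_len > 0} "
        f"prompt_cache_len:{prompt_cache_len} "
        f"prompt_cache_ratio:{prompt_cache_len / prompt_tokens} "
        f"cpu_cache_hit:{cpu_prompt_cache_len > 0} "
        f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
        f"cpu_prompt_cache_ratio:{cpu_prompt_cache_len / prompt_tokens} "
        f"disk_cache_hit:{disk_prompt_cache_len > 0} "
        f"disk_prompt_cache_len:{disk_prompt_cache_len} "
        f"disk_prompt_cache_ratio:{disk_prompt_cache_len / prompt_tokens}"
    )
    print(record)
    # gpu_cache_hit:True prompt_cache_len:512 prompt_cache_ratio:0.5 cpu_cache_hit:True ...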
@@ -728,6 +736,7 @@ async def handle_loop(self):
                     "count_output_tokens": count_output_tokens,
                     "prompt_cache_len": req.prompt_cache_len,
                     "cpu_prompt_cache_len": req.cpu_prompt_cache_len,
+                    "disk_prompt_cache_len": req.disk_prompt_cache_len,
                     "mtp_accepted_token_num": req.mtp_accepted_token_num,
                 }
                 if self.args.return_all_prompt_logprobs:
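
This is the producer side of the statistic: handle_loop copies the counter off the request object into the metadata dict under exactly the key that _wait_to_token_package later pops. A round-trip sketch using a hypothetical `Req` stub:

    from dataclasses import dataclass

    @dataclass
    class Req:
        # Hypothetical stub of the router request object; field names follow the diff.
        prompt_cache_len: int = 512
        cpu_prompt_cache_len: int = 768
        disk_prompt_cache_len: int = 64

    req = Req()
    metadata = {
        "prompt_cache_len": req.prompt_cache_len,
        "cpu_prompt_cache_len": req.cpu_prompt_cache_len,
        "disk_prompt_cache_len": req.disk_prompt_cache_len,  # new field
    }
    # The consumer pops the same key, defaulting to 0 if it is absent.
    assert metadata.pop("disk_prompt_cache_len", 0) == 64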