Skip to content

Commit 7708b30

Browse files
blueswhen, niushengxiao, wangzaijun, and hiworldwzj
authored
feat: disk cache v1.0 (#1098)
Co-authored-by: niushengxiao <niushengxiao@sensetime.com> Co-authored-by: wangzaijun <wangzaijun@sensetime.com> Co-authored-by: hiworldwzj <30762946+hiworldwzj@users.noreply.github.com>
1 parent 5f2ce96 commit 7708b30

File tree

14 files changed

+536
-112
lines changed

14 files changed

+536
-112
lines changed

lightllm/server/api_cli.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,4 +560,10 @@ def make_argument_parser() -> argparse.ArgumentParser:
560560
parser.add_argument(
561561
"--disk_cache_storage_size", type=float, default=10, help="""The capacity of disk cache. GB used."""
562562
)
563+
parser.add_argument(
564+
"--disk_cache_dir",
565+
type=str,
566+
default=None,
567+
help="""Directory used to persist disk cache data. Defaults to a temp directory when not set.""",
568+
)
563569
return parser

lightllm/server/core/objs/atomic_lock.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,16 @@ def __exit__(self, exc_type, exc_val, exc_tb):
2929

3030
# acquire_sleep1ms 和 release 是某些特定场景下主动使用进行锁获取的操作函数
3131
def acquire_sleep1ms(self):
32+
start_time = time.monotonic()
33+
is_first = True
3234
with atomics.atomicview(buffer=self.shm.buf, atype=atomics.INT) as a:
3335
while not a.cmpxchg_weak(0, 1):
34-
logger.warning("acquire_sleep1ms wait for 1ms")
36+
# 减少日志数量
37+
if is_first:
38+
is_first = False
39+
logger.warning("acquire_sleep1ms wait for 1ms")
40+
if time.monotonic() - start_time > 0.010:
41+
logger.warning("acquire_sleep1ms wait more than 10ms")
3542
time.sleep(0.001)
3643
pass
3744

lightllm/server/core/objs/req.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ class Req(ctypes.Structure):
8686
("candetoken_out_len", ctypes.c_int),
8787
("prompt_cache_len", ctypes.c_int), # 用于记录prompt cache 的命中长度,用于统计,这里指gpu kv cache命中长度
8888
("cpu_prompt_cache_len", ctypes.c_int), # 用于记录在 enable_cpu_cache 的场景下,命中的 cpu kv cache 的长度
89+
("disk_prompt_cache_len", ctypes.c_int), # 用于记录从磁盘命中的长度
8990
("is_paused", ctypes.c_bool), # 标记一个Req因为显存资源管理的原因被临时暂停了。
9091
("finish_status", FinishStatus),
9192
# 这个标记变量是http_server 写入,其他进程读取,用于标记该请求是否因为断网被aborted。
@@ -155,6 +156,7 @@ def init(
155156
self.candetoken_out_len = 0
156157
self.prompt_cache_len = 0
157158
self.cpu_prompt_cache_len = 0
159+
self.disk_prompt_cache_len = 0
158160
self.finish_token_index = -1
159161
self.can_released_mark = False
160162
self.reward_score = math.nan

lightllm/server/core/objs/start_args_type.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ class StartArgs:
113113
cpu_cache_token_page_size: int = field(default=64)
114114
enable_disk_cache: bool = field(default=False)
115115
disk_cache_storage_size: float = field(default=10)
116+
disk_cache_dir: Optional[str] = field(default=None)
116117
# zmp ports
117118
router_port: int = field(default=None)
118119
detokenization_port: int = field(default=None)

lightllm/server/httpserver/manager.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from frozendict import frozendict
1414

1515
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
16-
from typing import Union, List, Tuple, Dict, Optional
16+
from typing import Union, List, Tuple, Dict, Optional, AsyncGenerator
1717
from websockets import ClientConnection
1818
from fastapi import Request
1919
from ..tokenizer import get_tokenizer
@@ -266,7 +266,7 @@ async def generate(
266266
nixl_pd_upload_websocket: ClientConnection = None,
267267
# 用于等待 pd_master 下发的交换信息
268268
nixl_pd_event: asyncio.Event = None,
269-
) -> Tuple[int, str, dict, FinishStatus]:
269+
) -> AsyncGenerator[Tuple[int, str, dict, FinishStatus], None]:
270270
start_time = time.time()
271271
request_headers = request.headers if request is not None else {}
272272
group_request_id = self.alloc_req_id(sampling_params, is_health_req)
@@ -569,6 +569,7 @@ async def _wait_to_token_package(
569569

570570
prompt_cache_len = metadata.pop("prompt_cache_len", 0)
571571
cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0)
572+
disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
572573
if is_first_token:
573574
first_token_cost_ms = (time.time() - start_time) * 1000
574575
is_first_token = False
@@ -591,6 +592,8 @@ async def _wait_to_token_package(
591592
x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
592593
x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
593594
prompt_cache_ratio = prompt_cache_len / prompt_tokens
595+
cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens
596+
disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens
594597

595598
mtp_avg_token_per_step = out_token_counter / max(
596599
(out_token_counter - metadata["mtp_accepted_token_num"]), 1
@@ -603,10 +606,15 @@ async def _wait_to_token_package(
603606
f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} "
604607
f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms "
605608
f"prompt_token_num:{prompt_tokens} "
609+
f"gpu cache hit: {prompt_cache_len > 0} "
606610
f"prompt_cache_len:{prompt_cache_len} "
607611
f"prompt_cache_ratio:{prompt_cache_ratio} "
612+
f"cpu cache hit: {cpu_prompt_cache_len > 0} "
608613
f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
609-
f"used_cpu_prompt_cache_len:{max(0, cpu_prompt_cache_len - prompt_cache_len)} "
614+
f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
615+
f"disk cache hit: {disk_prompt_cache_len > 0} "
616+
f"disk_prompt_cache_len:{disk_prompt_cache_len} "
617+
f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
610618
f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
611619
)
612620
if group_request_id < 0:
@@ -728,6 +736,7 @@ async def handle_loop(self):
728736
"count_output_tokens": count_output_tokens,
729737
"prompt_cache_len": req.prompt_cache_len,
730738
"cpu_prompt_cache_len": req.cpu_prompt_cache_len,
739+
"disk_prompt_cache_len": req.disk_prompt_cache_len,
731740
"mtp_accepted_token_num": req.mtp_accepted_token_num,
732741
}
733742
if self.args.return_all_prompt_logprobs:

0 commit comments

Comments (0)