@@ -13,7 +13,7 @@
 from frozendict import frozendict
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
-from typing import Union, List, Tuple, Dict, Optional
+from typing import Union, List, Tuple, Dict, Optional, AsyncGenerator
 from websockets import ClientConnection
 from fastapi import Request
 from ..tokenizer import get_tokenizer
@@ -266,7 +266,7 @@ async def generate(
         nixl_pd_upload_websocket: ClientConnection = None,
         # Used to wait for the exchange info sent down by pd_master
         nixl_pd_event: asyncio.Event = None,
-    ) -> Tuple[int, str, dict, FinishStatus]:
+    ) -> AsyncGenerator[Tuple[int, str, dict, FinishStatus], None]:
         start_time = time.time()
         request_headers = request.headers if request is not None else {}
         group_request_id = self.alloc_req_id(sampling_params, is_health_req)
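
The annotation fix matters for callers: a body containing `yield` makes `generate` an async generator, so calling it returns something to iterate with `async for`, not a coroutine resolving to a single `Tuple[int, str, dict, FinishStatus]`. A minimal self-contained sketch of the consuming pattern; the `FinishStatus` stub and `fake_generate` below are invented for illustration:

    import asyncio
    from typing import AsyncGenerator, Dict, Tuple

    class FinishStatus:
        # Stand-in stub for the server's FinishStatus (an assumption of this sketch).
        def __init__(self, finished: bool) -> None:
            self._finished = finished

        def is_finished(self) -> bool:
            return self._finished

    async def fake_generate() -> AsyncGenerator[Tuple[int, str, Dict, FinishStatus], None]:
        # `yield` makes this an async generator, matching the corrected annotation.
        for i, token in enumerate(["Hel", "lo", "!"]):
            yield i, token, {}, FinishStatus(finished=(token == "!"))

    async def main() -> None:
        # Iterate the generator; do not `await fake_generate()`.
        async for sub_req_id, out_str, metadata, finish_status in fake_generate():
            print(sub_req_id, out_str, finish_status.is_finished())

    asyncio.run(main())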
@@ -569,6 +569,7 @@ async def _wait_to_token_package(
 
             prompt_cache_len = metadata.pop("prompt_cache_len", 0)
             cpu_prompt_cache_len = metadata.pop("cpu_prompt_cache_len", 0)
+            disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
             if is_first_token:
                 first_token_cost_ms = (time.time() - start_time) * 1000
                 is_first_token = False
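
Each statistic is read with `dict.pop(key, default)`, so metadata from a producer that does not yet report the disk tier degrades to 0 instead of raising `KeyError`. A tiny illustration with made-up values:

    # Metadata from a producer that predates the disk cache statistic.
    metadata = {"prompt_cache_len": 128, "cpu_prompt_cache_len": 256}

    # pop() removes the key if present and otherwise falls back to the default.
    disk_prompt_cache_len = metadata.pop("disk_prompt_cache_len", 0)
    assert disk_prompt_cache_len == 0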
@@ -591,6 +592,8 @@ async def _wait_to_token_package(
                 x_request_id = request.headers.get("X-Request-Id", "") if request is not None else ""
                 x_session_id = request.headers.get("X-Session-Id", "") if request is not None else ""
                 prompt_cache_ratio = prompt_cache_len / prompt_tokens
+                cpu_prompt_cache_ratio = cpu_prompt_cache_len / prompt_tokens
+                disk_prompt_cache_ratio = disk_prompt_cache_len / prompt_tokens
 
                 mtp_avg_token_per_step = out_token_counter / max(
                     (out_token_counter - metadata["mtp_accepted_token_num"]), 1
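
Both new ratios mirror the existing `prompt_cache_ratio`: each tier's cached-prefix length divided by the total prompt length, so 768 of 1024 prompt tokens found in the CPU cache gives 0.75. A standalone sketch of the arithmetic; the `max(..., 1)` guard against an empty prompt is an addition of this sketch, not something the patch does:

    from typing import Tuple

    def cache_hit_ratios(
        prompt_cache_len: int,
        cpu_prompt_cache_len: int,
        disk_prompt_cache_len: int,
        prompt_tokens: int,
    ) -> Tuple[float, float, float]:
        # The server divides by prompt_tokens directly; guard here for illustration.
        denom = max(prompt_tokens, 1)
        return (
            prompt_cache_len / denom,
            cpu_prompt_cache_len / denom,
            disk_prompt_cache_len / denom,
        )

    print(cache_hit_ratios(512, 768, 0, 1024))  # (0.5, 0.75, 0.0)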
@@ -603,10 +606,15 @@ async def _wait_to_token_package(
                     f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} "
                     f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms "
                     f"prompt_token_num:{prompt_tokens} "
+                    f"gpu_cache_hit:{prompt_cache_len > 0} "
                     f"prompt_cache_len:{prompt_cache_len} "
                     f"prompt_cache_ratio:{prompt_cache_ratio} "
+                    f"cpu_cache_hit:{cpu_prompt_cache_len > 0} "
                     f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
-                    f"used_cpu_prompt_cache_len:{max(0, cpu_prompt_cache_len - prompt_cache_len)} "
+                    f"cpu_prompt_cache_ratio:{cpu_prompt_cache_ratio} "
+                    f"disk_cache_hit:{disk_prompt_cache_len > 0} "
+                    f"disk_prompt_cache_len:{disk_prompt_cache_len} "
+                    f"disk_prompt_cache_ratio:{disk_prompt_cache_ratio} "
                     f"mtp_avg_token_per_step:{mtp_avg_token_per_step} "
                 )
                 if group_request_id < 0:
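
With these fields in place, a finished request logs hit flag, cached-prefix length, and hit ratio for each of the three cache tiers in one space-delimited record. A sketch of how just the new fields render, with invented values:

    prompt_tokens = 1024
    prompt_cache_len, cpu_prompt_cache_len, disk_prompt_cache_len = 512, 768, 0

    record = (
        f"gpu_cache_hit:{prompt_cache_len > 0} "
        f"prompt_cache_len:{prompt_cache_len} "
        f"prompt_cache_ratio:{prompt_cache_len / prompt_tokens} "
        f"cpu_cache_hit:{cpu_prompt_cache_len > 0} "
        f"cpu_prompt_cache_len:{cpu_prompt_cache_len} "
        f"cpu_prompt_cache_ratio:{cpu_prompt_cache_len / prompt_tokens} "
        f"disk_cache_hit:{disk_prompt_cache_len > 0} "
        f"disk_prompt_cache_len:{disk_prompt_cache_len} "
        f"disk_prompt_cache_ratio:{disk_prompt_cache_len / prompt_tokens}"
    )
    print(record)
    # gpu_cache_hit:True prompt_cache_len:512 prompt_cache_ratio:0.5 cpu_cache_hit:True ...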
@@ -728,6 +736,7 @@ async def handle_loop(self):
                     "count_output_tokens": count_output_tokens,
                     "prompt_cache_len": req.prompt_cache_len,
                     "cpu_prompt_cache_len": req.cpu_prompt_cache_len,
+                    "disk_prompt_cache_len": req.disk_prompt_cache_len,
                     "mtp_accepted_token_num": req.mtp_accepted_token_num,
                 }
                 if self.args.return_all_prompt_logprobs:
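
This is the producer side of the statistic: handle_loop copies the counter off the request object into the metadata dict under exactly the key that _wait_to_token_package later pops. A round-trip sketch using a hypothetical `Req` stub:

    from dataclasses import dataclass

    @dataclass
    class Req:
        # Hypothetical stub of the router request object; field names follow the diff.
        prompt_cache_len: int = 512
        cpu_prompt_cache_len: int = 768
        disk_prompt_cache_len: int = 64

    req = Req()
    metadata = {
        "prompt_cache_len": req.prompt_cache_len,
        "cpu_prompt_cache_len": req.cpu_prompt_cache_len,
        "disk_prompt_cache_len": req.disk_prompt_cache_len,  # new field
    }
    # The consumer pops the same key, defaulting to 0 if it is absent.
    assert metadata.pop("disk_prompt_cache_len", 0) == 64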