@@ -356,6 +356,7 @@ def _convert_text_completion_chunks_to_chat(
                     "finish_reason": chunk["choices"][0]["finish_reason"],
                 }
             ],
+            "usage": chunk.get("usage") if "usage" in chunk else None,
         }
 
 
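One note on the expression this first hunk adds: `dict.get` already returns `None` for a missing key, so the guard is redundant and the value is always equivalent to a bare `chunk.get("usage")`. A quick illustration (standalone snippet, not part of the diff):

    chunk = {"id": "abc"}  # no "usage" key present
    guarded = chunk.get("usage") if "usage" in chunk else None
    # Both forms evaluate to None when the key is absent,
    # and to chunk["usage"] when it is present.
    assert guarded is None and chunk.get("usage") is None
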
@@ -440,7 +441,7 @@ def _stream_response_to_function_stream(
                 created = chunk["created"]
                 model = chunk["model"]
                 tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
-                yield {
+                response = {
                     "id": id_,
                     "object": "chat.completion.chunk",
                     "created": created,
@@ -459,7 +460,11 @@ def _stream_response_to_function_stream(
                         }
                     ],
                 }
-                yield {
+                if "usage" in chunk:
+                    response["usage"] = chunk["usage"]
+                yield response
+
+                response = {
                     "id": "chat" + chunk["id"],
                     "object": "chat.completion.chunk",
                     "created": chunk["created"],
@@ -493,10 +498,14 @@ def _stream_response_to_function_stream(
                         }
                     ],
                 }
+                if "usage" in chunk:
+                    response["usage"] = chunk["usage"]
+                yield response
                 first = False
                 continue
+
             assert tool_id is not None
-            yield {
+            response = {
                 "id": "chat" + chunk["id"],
                 "object": "chat.completion.chunk",
                 "created": chunk["created"],
@@ -528,9 +537,12 @@ def _stream_response_to_function_stream(
                     }
                 ],
             }
+            if "usage" in chunk:
+                response["usage"] = chunk["usage"]
+            yield response
 
         if id_ is not None and created is not None and model is not None:
-            yield {
+            response = {
                 "id": id_,
                 "object": "chat.completion.chunk",
                 "created": created,
@@ -549,6 +561,9 @@ def _stream_response_to_function_stream(
                     }
                 ],
             }
+            if "usage" in chunk:
+                response["usage"] = chunk["usage"]
+            yield response
 
     return _stream_response_to_function_stream(chunks)
 
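Every hunk above makes the same change to `_stream_response_to_function_stream`: the literal `yield {...}` becomes `response = {...}`, the source chunk's `usage` field is copied across when present, and only then is the dict yielded. A minimal sketch of that pattern (the helper name `_with_usage` is illustrative only; the diff inlines these two lines at each yield site):

    from typing import Any, Dict

    def _with_usage(response: Dict[str, Any], chunk: Dict[str, Any]) -> Dict[str, Any]:
        # Copy token accounting over only when the underlying completion
        # chunk actually carried a "usage" field (typically the final chunk).
        if "usage" in chunk:
            response["usage"] = chunk["usage"]
        return response

Building the dict before yielding lets `usage` stay absent from intermediate chunks, rather than forcing a `"usage": None` key into every streamed message.
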
@@ -2129,6 +2144,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                                 },
                             }
                         ],
+                        usage=chunk["usage"] if "usage" in chunk else None,
                     )
                     first = False
                 if tools is not None:
@@ -2169,6 +2185,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                                 },
                             }
                         ],
+                        usage=chunk["usage"] if "usage" in chunk else None,
                     )
             # Yield tool_call/function_call stop message
             yield llama_types.CreateChatCompletionStreamResponse(
@@ -2191,6 +2208,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                         },
                     }
                 ],
+                usage=chunk["usage"] if "usage" in chunk else None,
             )
         # If "auto" or no tool_choice/function_call
         elif isinstance(function_call, str) and function_call == "auto":
@@ -2226,6 +2244,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                                 "finish_reason": None,
                             }
                         ],
+                        usage=chunk["usage"] if "usage" in chunk else None,
                     )
             else:
                 prompt += f"{function_name}\n<|content|>"
@@ -2271,6 +2290,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                             },
                         }
                     ],
+                    usage=chunk["usage"] if "usage" in chunk else None,
                )
            # Generate content
            stops = [RECIPIENT_TOKEN, STOP_TOKEN]
@@ -2308,6 +2328,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                                 },
                             }
                         ],
+                        usage=chunk["usage"] if "usage" in chunk else None,
                     )
                     is_end = False
                 elif chunk["choices"][0]["text"] == "\n":
@@ -2337,6 +2358,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                             },
                         }
                     ],
+                    usage=chunk["usage"] if "usage" in chunk else None,
                 )
             # Check whether the model wants to generate another turn
             if (
@@ -2369,6 +2391,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                             "finish_reason": "stop",
                         }
                     ],
+                    usage=chunk["usage"] if "usage" in chunk else None,
                 )
                 break
             else:
@@ -2418,6 +2441,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                             },
                         }
                     ],
+                    usage=chunk["usage"] if "usage" in chunk else None,
                 )
                 prompt += completion_text.strip()
                 grammar = None
@@ -2457,6 +2481,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                             },
                         }
                     ],
+                    usage=chunk["usage"] if "usage" in chunk else None,
                 )
                 break
 
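Taken together, the streaming chat paths now forward whatever `usage` the underlying text completion chunks report instead of silently dropping it. A sketch of how a consumer might read it (the model path and messages are placeholders, and whether `usage` is actually populated still depends on the underlying completion stream):

    from llama_cpp import Llama

    llm = Llama(model_path="model.gguf")  # placeholder path
    messages = [{"role": "user", "content": "Hello"}]
    for chunk in llm.create_chat_completion(messages=messages, stream=True):
        usage = chunk.get("usage")
        if usage:  # in practice only some chunks carry token counts
            print(usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])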