Commit 5fcd220

feat: add usage to streaming response
1 parent 63fc309 commit 5fcd220

3 files changed: +140 −82 lines changed

llama_cpp/llama.py

Lines changed: 110 additions & 77 deletions
@@ -1057,6 +1057,50 @@ def decode_batch(seq_sizes: List[int]):
         else:
             return output
 
+    def _create_chunk(
+        self,
+        completion_id: str,
+        created: int,
+        model_name: str,
+        text: str,
+        logprobs_or_none: Union[Optional[CompletionLogprobs], None],
+        index: int,
+        finish_reason: Union[str, None],
+        usage: Optional[Dict[str, Any]] = None,
+    ) -> CreateCompletionStreamResponse:
+        """Create chunks for streaming API, depending on whether usage is requested or not."""
+        if usage is not None:
+            return {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": text,
+                        "index": index,
+                        "logprobs": logprobs_or_none,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+                "usage": usage,
+            }
+        else:
+            return {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": model_name,
+                "choices": [
+                    {
+                        "text": text,
+                        "index": index,
+                        "logprobs": logprobs_or_none,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+            }
+
     def _create_completion(
         self,
         prompt: Union[str, List[int]],
@@ -1383,24 +1427,20 @@ def logit_bias_processor(
                         "top_logprobs": [top_logprob],
                     }
                 returned_tokens += 1
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": self.detokenize(
-                                [token],
-                                prev_tokens=prompt_tokens
-                                + completion_tokens[:returned_tokens],
-                            ).decode("utf-8", errors="ignore"),
-                            "index": 0,
-                            "logprobs": logprobs_or_none,
-                            "finish_reason": None,
-                        }
-                    ],
-                }
+                yield self._create_chunk(
+                    completion_id=completion_id,
+                    created=created,
+                    model_name=model_name,
+                    text=self.detokenize(
+                        [token],
+                        prev_tokens=prompt_tokens
+                        + completion_tokens[:returned_tokens],
+                    ).decode("utf-8", errors="ignore"),
+                    logprobs_or_none=logprobs_or_none,
+                    index=0,
+                    finish_reason=None,
+                    usage=None,
+                )
         else:
             while len(remaining_tokens) > 0:
                 decode_success = False
@@ -1429,20 +1469,16 @@ def logit_bias_processor(
                 remaining_tokens = remaining_tokens[i:]
                 returned_tokens += i
 
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": ts,
-                            "index": 0,
-                            "logprobs": None,
-                            "finish_reason": None,
-                        }
-                    ],
-                }
+                yield self._create_chunk(
+                    completion_id=completion_id,
+                    created=created,
+                    model_name=model_name,
+                    text=ts,
+                    logprobs_or_none=None,
+                    index=0,
+                    finish_reason=None,
+                    usage=None,
+                )
 
             if len(completion_tokens) >= max_tokens:
                 text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
@@ -1521,54 +1557,51 @@ def logit_bias_processor(
                     if token_end_position == end - 1:
                         break
                     returned_tokens += 1
-                    yield {
-                        "id": completion_id,
-                        "object": "text_completion",
-                        "created": created,
-                        "model": model_name,
-                        "choices": [
-                            {
-                                "text": last_text[
-                                    : len(last_text) - (token_end_position - end)
-                                ].decode("utf-8", errors="ignore"),
-                                "index": 0,
-                                "logprobs": logprobs_or_none,
-                                "finish_reason": None,
-                            }
-                        ],
-                    }
+                    yield self._create_chunk(
+                        completion_id=completion_id,
+                        created=created,
+                        model_name=model_name,
+                        text=last_text[
+                            : len(last_text) - (token_end_position - end)
+                        ].decode("utf-8", errors="ignore"),
+                        logprobs_or_none=logprobs_or_none,
+                        index=0,
+                        finish_reason=None,
+                        usage=None,
+                    )
                     break
                 returned_tokens += 1
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": model_name,
-                    "choices": [
-                        {
-                            "text": self.detokenize([token]).decode(
-                                "utf-8", errors="ignore"
-                            ),
-                            "index": 0,
-                            "logprobs": logprobs_or_none,
-                            "finish_reason": None,
-                        }
-                    ],
-                }
-            yield {
-                "id": completion_id,
-                "object": "text_completion",
-                "created": created,
-                "model": model_name,
-                "choices": [
-                    {
-                        "text": "",
-                        "index": 0,
-                        "logprobs": None,
-                        "finish_reason": finish_reason,
-                    }
-                ],
+                yield self._create_chunk(
+                    completion_id=completion_id,
+                    created=created,
+                    model_name=model_name,
+                    text=self.detokenize([token]).decode(
+                        "utf-8", errors="ignore"
+                    ),
+                    logprobs_or_none=logprobs_or_none,
+                    index=0,
+                    finish_reason=None,
+                    usage=None,
+                )
+
+            # Final streaming chunk with both finish_reason and usage
+            usage = {
+                "prompt_tokens": len(prompt_tokens),
+                "completion_tokens": returned_tokens,
+                "total_tokens": len(prompt_tokens) + returned_tokens,
             }
+
+            yield self._create_chunk(
+                completion_id=completion_id,
+                created=created,
+                model_name=model_name,
+                text="",
+                logprobs_or_none=None,
+                index=0,
+                finish_reason=finish_reason,
+                usage=usage,
+            )
+
         if self.cache:
             if self.verbose:
                 print("Llama._create_completion: cache save", file=sys.stderr)
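With this change, the streaming path of _create_completion ends with one chunk that carries both finish_reason and a usage dict (prompt, completion, and total token counts). A minimal consumer sketch, assuming a local GGUF model at a placeholder path and the existing create_completion streaming API:

from llama_cpp import Llama

llm = Llama(model_path="./model.gguf")  # placeholder model path, adjust for your setup

last_chunk = None
for chunk in llm.create_completion("Q: What is 2+2? A:", max_tokens=16, stream=True):
    # Each streamed chunk carries the next piece of generated text.
    print(chunk["choices"][0]["text"], end="", flush=True)
    last_chunk = chunk

# After this commit, the final chunk also reports token counts.
if last_chunk is not None and last_chunk.get("usage") is not None:
    print("\nusage:", last_chunk["usage"])  # prompt_tokens, completion_tokens, total_tokens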

llama_cpp/llama_chat_format.py

Lines changed: 29 additions & 4 deletions
@@ -347,6 +347,7 @@ def _convert_text_completion_chunks_to_chat(
                     "finish_reason": chunk["choices"][0]["finish_reason"],
                 }
             ],
+            "usage": chunk.get("usage") if "usage" in chunk else None,
         }
 
 
@@ -431,7 +432,7 @@ def _stream_response_to_function_stream(
                        created = chunk["created"]
                        model = chunk["model"]
                        tool_id = "call_" + "_0_" + tool_name + "_" + chunk["id"]
-                        yield {
+                        response = {
                            "id": id_,
                            "object": "chat.completion.chunk",
                            "created": created,
@@ -450,7 +451,11 @@ def _stream_response_to_function_stream(
                                }
                            ],
                        }
-                        yield {
+                        if "usage" in chunk:
+                            response["usage"] = chunk["usage"]
+                        yield response
+
+                        response = {
                            "id": "chat" + chunk["id"],
                            "object": "chat.completion.chunk",
                            "created": chunk["created"],
@@ -484,10 +489,14 @@ def _stream_response_to_function_stream(
                                }
                            ],
                        }
+                        if "usage" in chunk:
+                            response["usage"] = chunk["usage"]
+                        yield response
                        first = False
                        continue
+
                    assert tool_id is not None
-                    yield {
+                    response = {
                        "id": "chat" + chunk["id"],
                        "object": "chat.completion.chunk",
                        "created": chunk["created"],
@@ -519,9 +528,12 @@ def _stream_response_to_function_stream(
                            }
                        ],
                    }
+                    if "usage" in chunk:
+                        response["usage"] = chunk["usage"]
+                    yield response
 
                if id_ is not None and created is not None and model is not None:
-                    yield {
+                    response = {
                        "id": id_,
                        "object": "chat.completion.chunk",
                        "created": created,
@@ -540,6 +552,9 @@ def _stream_response_to_function_stream(
                            }
                        ],
                    }
+                    if "usage" in chunk:
+                        response["usage"] = chunk["usage"]
+                    yield response
 
            return _stream_response_to_function_stream(chunks)
 
@@ -2120,6 +2135,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            first = False
        if tools is not None:
@@ -2160,6 +2176,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            # Yield tool_call/function_call stop message
            yield llama_types.CreateChatCompletionStreamResponse(
@@ -2182,6 +2199,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
        # If "auto" or no tool_choice/function_call
        elif isinstance(function_call, str) and function_call == "auto":
@@ -2217,6 +2235,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        "finish_reason": None,
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
        else:
            prompt += f"{function_name}\n<|content|>"
@@ -2262,6 +2281,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            # Generate content
            stops = [RECIPIENT_TOKEN, STOP_TOKEN]
@@ -2299,6 +2319,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                            },
                        }
                    ],
+                    usage=chunk["usage"] if "usage" in chunk else None,
                )
                is_end = False
            elif chunk["choices"][0]["text"] == "\n":
@@ -2328,6 +2349,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                            },
                        }
                    ],
+                    usage=chunk["usage"] if "usage" in chunk else None,
                )
            # Check whether the model wants to generate another turn
            if (
@@ -2360,6 +2382,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        "finish_reason": "stop",
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            break
        else:
@@ -2409,6 +2432,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            prompt += completion_text.strip()
            grammar = None
@@ -2448,6 +2472,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                        },
                    }
                ],
+                usage=chunk["usage"] if "usage" in chunk else None,
            )
            break
 
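The chat-format converters now forward usage from the underlying completion chunks into the chat chunks they yield. A hedged sketch of reading it from create_chat_completion(..., stream=True); the model path is a placeholder, and note that intermediate chunks may carry usage as None, so only a non-None value is kept:

from llama_cpp import Llama

llm = Llama(model_path="./model.gguf")  # placeholder model path

usage = None
for chunk in llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hi"}],
    stream=True,
):
    delta = chunk["choices"][0]["delta"]
    if delta.get("content"):
        print(delta["content"], end="", flush=True)
    if chunk.get("usage") is not None:  # typically only the final chunk has counts
        usage = chunk["usage"]

print("\nusage:", usage)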

llama_cpp/llama_types.py

Lines changed: 1 addition & 1 deletion
@@ -154,13 +154,13 @@ class ChatCompletionStreamResponseChoice(TypedDict):
     finish_reason: Optional[Literal["stop", "length", "tool_calls", "function_call"]]
     logprobs: NotRequired[Optional[ChatCompletionLogprobs]]
 
-
 class CreateChatCompletionStreamResponse(TypedDict):
     id: str
     model: str
     object: Literal["chat.completion.chunk"]
     created: int
     choices: List[ChatCompletionStreamResponseChoice]
+    usage: NotRequired[CompletionUsage]
 
 
 class ChatCompletionFunctions(TypedDict):
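Because usage is declared NotRequired on CreateChatCompletionStreamResponse, typed consumers should treat it as optional. A small illustrative helper; the function name is ours, not part of the library:

from typing import Optional

from llama_cpp.llama_types import (
    CompletionUsage,
    CreateChatCompletionStreamResponse,
)


def chunk_usage(chunk: CreateChatCompletionStreamResponse) -> Optional[CompletionUsage]:
    # NotRequired key: present only on chunks that carry token counts,
    # which after this commit is typically the final streamed chunk.
    return chunk.get("usage")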
