import httpx
import orjson
from typing import AsyncGenerator, Optional


class StreamLLMService:
    def __init__(self, base_url: str = "http://localhost:11434/v1"):
        self.base_url = base_url
        self.model = "llama3.2"
    async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes, None]:
        """Stream chat completion responses from the LLM."""
        # Send user message first
        user_msg = {
            'role': 'user',
            'content': prompt,
        }
        yield orjson.dumps(user_msg) + b'\n'

        # Open client as context manager and stream responses
        async with httpx.AsyncClient(base_url=self.base_url) as client:
            async with client.stream(
                "POST",
                "/chat/completions",
                json={
                    "model": self.model,
                    "messages": [{"role": "user", "content": prompt}],
                    "stream": True,
                },
                timeout=60.0,
            ) as response:
                async for line in response.aiter_lines():
                    print(line)  # Debug: log each raw SSE line
                    if line.startswith("data: ") and line != "data: [DONE]":
                        try:
                            json_line = line[6:]  # Remove "data: " prefix
                            data = orjson.loads(json_line)
                            content = data.get("choices", [{}])[0].get("delta", {}).get("content", "")
                            if content:
                                model_msg = {
                                    'role': 'model',
                                    'content': content,
                                }
                                yield orjson.dumps(model_msg) + b'\n'
                        except Exception:
                            # Skip malformed or non-JSON SSE lines instead of aborting the stream
                            pass


# FastAPI dependency
def get_llm_service(base_url: Optional[str] = None) -> StreamLLMService:
    # Fall back to the constructor's default URL when no base_url is supplied,
    # so Depends() can call this with no arguments
    if base_url is None:
        return StreamLLMService()
    return StreamLLMService(base_url=base_url)
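
For context, a minimal usage sketch of how this dependency might be wired into a FastAPI route follows; the app instance, the /chat path, and the ChatRequest model are illustrative assumptions and are not part of the change above.

# Usage sketch; the names app, /chat, and ChatRequest are assumptions for illustration.
from fastapi import Depends, FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

app = FastAPI()


class ChatRequest(BaseModel):
    prompt: str


@app.post("/chat")
async def chat(
    req: ChatRequest,
    llm: StreamLLMService = Depends(get_llm_service),
) -> StreamingResponse:
    # Each yielded chunk is one newline-delimited JSON message: the echoed
    # user message first, then incremental model content.
    return StreamingResponse(
        llm.stream_chat(req.prompt),
        media_type="application/x-ndjson",
    )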