Commit fbd13bb

[Feat] Add structured generation OpenAI API (#1114)
1 parent ad8b4ae commit fbd13bb

5 files changed: +125 −9 lines changed

5 files changed

+125
-9
lines changed

lightllm/server/api_models.py

Lines changed: 33 additions & 3 deletions
@@ -1,8 +1,8 @@
 import time
+import uuid
 
 from pydantic import BaseModel, Field, field_validator
-from typing import Dict, List, Optional, Union, Literal
-import uuid
+from typing import Any, Dict, List, Optional, Union, Literal
 
 
 class ImageURL(BaseModel):
@@ -52,6 +52,21 @@ class StreamOptions(BaseModel):
     include_usage: Optional[bool] = False
 
 
+class JsonSchemaResponseFormat(BaseModel):
+    name: str
+    description: Optional[str] = None
+    # schema is the field in openai but that causes conflicts with pydantic so
+    # instead use json_schema with an alias
+    json_schema: Optional[dict[str, Any]] = Field(default=None, alias="schema")
+    strict: Optional[bool] = None
+
+
+class ResponseFormat(BaseModel):
+    # type must be "json_schema", "json_object", or "text"
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
 class CompletionRequest(BaseModel):
     model: str
     # prompt: string or tokens
@@ -71,6 +86,14 @@ class CompletionRequest(BaseModel):
     best_of: Optional[int] = 1
     logit_bias: Optional[Dict[str, float]] = None
     user: Optional[str] = None
+    response_format: Optional[ResponseFormat] = Field(
+        default=None,
+        description=(
+            "Similar to chat completion, this parameter specifies the format "
+            "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+            ", or {'type': 'text' } is supported."
+        ),
+    )
 
     # Additional parameters supported by LightLLM
     do_sample: Optional[bool] = False
@@ -94,7 +117,14 @@ class ChatCompletionRequest(BaseModel):
     frequency_penalty: Optional[float] = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     user: Optional[str] = None
-    response_format: Optional[Dict] = None
+    response_format: Optional[ResponseFormat] = Field(
+        default=None,
+        description=(
+            "Similar to chat completion, this parameter specifies the format "
+            "of output. Only {'type': 'json_object'}, {'type': 'json_schema'}"
+            ", or {'type': 'text' } is supported."
+        ),
+    )
 
     # OpenAI Adaptive parameters for tool call
     tools: Optional[List[Tool]] = Field(default=None, examples=[None])
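
Taken together, these models replace the old untyped response_format dict with a validated structure. A minimal sketch of how a wire payload would parse against them (assuming pydantic v2, which the field_validator import implies; model_validate is its parsing entry point):

payload = {
    "type": "json_schema",
    "json_schema": {
        "name": "PersonInfo",
        # The wire field is "schema"; the alias maps it onto json_schema
        # to sidestep the conflict with pydantic's own schema attribute.
        "schema": {"type": "object", "properties": {"name": {"type": "string"}}},
    },
}
rf = ResponseFormat.model_validate(payload)
assert rf.type == "json_schema"
assert rf.json_schema.json_schema["type"] == "object"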

lightllm/server/api_openai.py

Lines changed: 19 additions & 4 deletions
@@ -175,11 +175,17 @@ async def chat_completions_impl(request: ChatCompletionRequest, raw_request: Req
         "best_of": request.n,
         "add_special_tokens": False,
     }
+
+    # Structured output handling
     if request.response_format:
-        obj = request.response_format.get("schema")
-        if obj:
-            # guided_json takes str instead of dict obj
-            sampling_params_dict["guided_json"] = json.dumps(obj)
+        if request.response_format.type == "json_schema":
+            obj = request.response_format.json_schema
+            if obj:
+                # guided_json takes str instead of dict obj
+                sampling_params_dict["guided_json"] = json.dumps(obj.json_schema)
+        elif request.response_format.type == "json_object":
+            sampling_params_dict["guided_grammar"] = "json"
+
     sampling_params = SamplingParams()
     sampling_params.init(tokenizer=g_objs.httpserver_manager.tokenizer, **sampling_params_dict)

@@ -453,6 +459,15 @@ async def completions_impl(request: CompletionRequest, raw_request: Request) ->
         "add_special_tokens": False,
     }
 
+    if request.response_format:
+        if request.response_format.type == "json_schema":
+            obj = request.response_format.json_schema
+            if obj:
+                # guided_json takes str instead of dict obj
+                sampling_params_dict["guided_json"] = json.dumps(obj.json_schema)
+        elif request.response_format.type == "json_object":
+            sampling_params_dict["guided_grammar"] = "json"
+
     sampling_params = SamplingParams()
     sampling_params.init(tokenizer=g_objs.httpserver_manager.tokenizer, **sampling_params_dict)
     sampling_params.verify()
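
Both endpoints now route a json_schema through guided_json and a bare json_object through the builtin guided_grammar. A hedged end-to-end sketch of calling the new API (the host, port, and model name are placeholders for whatever the server was launched with):

import requests

url = "http://localhost:8000/v1/chat/completions"
payload = {
    "model": "placeholder-model",
    "messages": [{"role": "user", "content": "Describe a person as JSON."}],
    "max_tokens": 150,
    # Using {"type": "json_object"} here instead would select the
    # builtin JSON grammar path rather than schema-guided decoding.
    "response_format": {
        "type": "json_schema",
        "json_schema": {
            "name": "PersonInfo",
            "schema": {
                "type": "object",
                "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
                "required": ["name", "age"],
            },
        },
    },
}
resp = requests.post(url, json=payload)
print(resp.json()["choices"][0]["message"]["content"])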

lightllm/server/core/objs/sampling_params.py

Lines changed: 1 addition & 1 deletion
@@ -142,7 +142,7 @@ def initialize(self, constraint: str, tokenizer):
         ctypes.memmove(self.constraint, constraint_bytes, len(constraint_bytes))
         self.length = len(constraint_bytes)
         try:
-            if self.length > 0 and tokenizer is not None:
+            if self.length > 0 and tokenizer is not None and constraint != "json":
                 import xgrammar as xgr
 
                 tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
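
The added constraint != "json" clause keeps this eager validation path from handing the "json" sentinel to xgrammar as if it were EBNF text; the sentinel is resolved to the builtin JSON grammar later, in the xgrammar backend below. A minimal sketch of the guard in isolation (should_precompile is a hypothetical name, not part of the commit):

def should_precompile(constraint: str, tokenizer) -> bool:
    # "json" is a sentinel for xgrammar's builtin JSON grammar, not an
    # EBNF grammar, so it is skipped here and compiled by the backend.
    return len(constraint) > 0 and tokenizer is not None and constraint != "json"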

lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl_for_xgrammar_mode.py

Lines changed: 5 additions & 1 deletion
@@ -35,7 +35,11 @@ def get_cached_grammar(type: str, grammar: str):
        logger.info(f"grammar cache miss for {type}: '{grammar}'")
        try:
            if type == "grammar":
-                return self.xgrammar_compiler.compile_grammar(grammar)
+                if grammar == "json":
+                    return self.xgrammar_compiler.compile_builtin_json_grammar()
+
+                else:
+                    return self.xgrammar_compiler.compile_grammar(grammar)
            elif type == "schema":
                return self.xgrammar_compiler.compile_json_schema(grammar)
            else:
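
compile_builtin_json_grammar() is xgrammar's pre-baked grammar for any syntactically valid JSON document, which is why the "json" sentinel needs no schema text. A standalone sketch of the two compile paths (assuming xgrammar and transformers are installed; gpt2 is only a placeholder tokenizer):

import xgrammar as xgr
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
compiler = xgr.GrammarCompiler(tokenizer_info)

# The "json" sentinel maps to the builtin grammar: unconstrained but valid JSON.
compiled_json = compiler.compile_builtin_json_grammar()

# Any other grammar string is treated as EBNF and compiled normally.
compiled_ebnf = compiler.compile_grammar('root ::= "yes" | "no"')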

test/test_api/test_openai_api.py

Lines changed: 67 additions & 0 deletions
@@ -635,6 +635,72 @@ def test_multiple_token_arrays():
         print(f"Error: {e}")
 
 
+def test_structured_generation():
+    """Test structured generation."""
+    client = LightLLMClient()
+
+    try:
+        print("=== Testing structured generation ===")
+        prompt = "Please provide, in JSON format, information about a person including name, age, and occupation."
+
+        # Test JSON generation
+        result = client.completions(prompt, max_tokens=150, response_format={"type": "json_object"})
+        print("Prompt:", prompt)
+        print("Assistant:", result["choices"][0]["text"])
+
+        # Test JSON Schema generation
+        schema = {
+            "type": "object",
+            "properties": {
+                "name": {"type": "string"},
+                "age": {"type": "integer"},
+                "occupation": {"type": "string"},
+            },
+            "required": ["name", "age", "occupation"],
+        }
+        result = client.completions(
+            prompt,
+            max_tokens=150,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "PersonInfo",
+                    "description": "Information about a person including name, age, and occupation",
+                    "schema": schema,
+                },
+            },
+        )
+        print("Prompt:", prompt)
+        print("Assistant:", result["choices"][0]["text"])
+
+        # Test JSON generation via the /v1/chat/completions endpoint
+        result = client.simple_chat(
+            prompt,
+            max_tokens=150,
+            response_format={"type": "json_object"},
+        )
+        print("Prompt:", prompt)
+        print("Assistant:", result["choices"][0]["message"]["content"])
+
+        # Test JSON Schema generation via the /v1/chat/completions endpoint
+        result = client.simple_chat(
+            prompt,
+            max_tokens=150,
+            response_format={
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "PersonInfo",
+                    "description": "Information about a person including name, age, and occupation",
+                    "schema": schema,
+                },
+            },
+        )
+        print("Prompt:", prompt)
+        print("Assistant:", result["choices"][0]["message"]["content"])
+    except Exception as e:
+        print(f"Error: {e}")
+
+
 def main():
     # Basic functionality tests
     test_completions()
@@ -651,6 +717,7 @@ def main():
     test_logprobs()
     test_echo()
     test_stop_parameter()
+    test_structured_generation()
 
 
 if __name__ == "__main__":
