2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
@@ -28,7 +28,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
uv sync --extra dev --extra docs --extra vllm
uv sync --extra dev --extra docs --extra llm
uv run python -m ensurepip
- name: Check types
run: |
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
@@ -41,7 +41,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
uv sync --extra dev --extra docs --extra vllm
uv sync --extra dev --extra docs --extra llm
- name: Run unit tests
run: |
uv run pytest -v tests/app --cov --cov-report=html:coverage_reports #--random-order
24 changes: 24 additions & 0 deletions README.md
@@ -230,6 +230,30 @@ curl -N -X 'POST' 'http://127.0.0.1:8000/stream/generate?max_tokens=512' \
-d 'What is hypertension?'
```

CMS also exposes OpenAI-compatible APIs. For example, using the OpenAI Python SDK, you can generate text and retrieve embeddings with the following snippet:
```python
from openai import OpenAI

client = OpenAI(api_key="dummy", base_url="http://localhost:8000/v1")
completion = client.chat.completions.create(
model="Huggingface LLM Model",
messages=[
{"role": "system", "content": "You are bot answering questions from the user"},
{"role": "user", "content": "What is hypertension?"},
],
max_tokens=256,
temperature=0.7,
stream=False
)
print(f"CMS => {completion.choices[0].message.content}")
embeddings = client.embeddings.create(
    model="Huggingface LLM Model", input="What is hypertension?"
)
print(f"Embedding dimension: {len(embeddings.data[0].embedding)}")
```
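The same client can also stream responses and forward the new sampling controls added in this PR. The snippet below is a minimal sketch: `top_p` maps onto the standard OpenAI parameter, while `stop_sequences` is a CMS-specific field on `OpenAIChatRequest`, so it is passed via `extra_body`; the delta-style chunk shape is assumed to follow the usual OpenAI streaming format.
```python
# Hedged sketch: stream a chat completion with the new sampling controls.
stream = client.chat.completions.create(
    model="Huggingface LLM Model",
    messages=[{"role": "user", "content": "What is hypertension?"}],
    max_tokens=256,
    temperature=0.7,
    top_p=0.9,
    extra_body={"stop_sequences": ["###"]},  # assumption: matches OpenAIChatRequest.stop_sequences
    stream=True,
)
for chunk in stream:
    # Assumes standard OpenAI-style delta chunks; skip empty deltas.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```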
Note that to enable quantization and training features, you need to install the `llm` extra dependencies:
```commandline
pip install '.[llm]'
```
#### Chat with served models
You can also "chat" with the running model using the `/stream/ws` endpoint. For example:
```html
<!-- HTML example truncated in this diff view -->
```
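For a scripted alternative to the HTML page, a rough Python client might look like the sketch below. It assumes the `/stream/ws` socket accepts a plain-text prompt and streams plain-text chunks back (not confirmed by this diff) and uses the third-party `websockets` package.
```python
import asyncio

import websockets  # third-party dependency, assumed installed


async def chat() -> None:
    # Assumption: the socket takes a plain-text prompt and streams text chunks back.
    async with websockets.connect("ws://127.0.0.1:8000/stream/ws") as ws:
        await ws.send("What is hypertension?")
        try:
            async for chunk in ws:
                print(chunk, end="", flush=True)
        except websockets.ConnectionClosed:
            pass  # server closed the stream


asyncio.run(chat())
```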
30 changes: 24 additions & 6 deletions app/api/routers/generative.py
@@ -4,7 +4,7 @@
import uuid
import app.api.globals as cms_globals

from typing import Union, Iterable, AsyncGenerator
from typing import Union, Iterable, AsyncGenerator, List
from typing_extensions import Annotated
from functools import partial
from fastapi import APIRouter, Depends, Request, Body, Query
@@ -51,7 +51,9 @@ def generate_text(
request: Request,
prompt: Annotated[str, Body(description="The prompt to be sent to the model", media_type="text/plain")],
max_tokens: Annotated[int, Query(description="The maximum number of tokens to generate", gt=0)] = 512,
temperature: Annotated[float, Query(description="The temperature of the generated text", gt=0.0, lt=1.0)] = 0.7,
temperature: Annotated[float, Query(description="The temperature of the generated text", ge=0.0)] = 0.7,
top_p: Annotated[float, Query(description="The Top-P value for nucleus sampling", ge=0.0, le=1.0)] = 0.9,
stop_sequences: Annotated[List[str], Query(description="The list of sequences used to stop the generation")] = [],
tracking_id: Union[str, None] = Depends(validate_tracking_id),
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
) -> PlainTextResponse:
@@ -63,6 +65,8 @@ def generate_text(
prompt (str): The prompt to be sent to the model.
max_tokens (int): The maximum number of tokens to generate.
temperature (float): The temperature of the generated text.
top_p (float): The Top-P value for nucleus sampling.
stop_sequences (List[str]): The list of sequences used to stop the generation.
tracking_id (Union[str, None]): An optional tracking ID of the requested task.
model_service (AbstractModelService): The model service dependency.

@@ -77,6 +81,8 @@ def generate_text(
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stop_sequences=stop_sequences,
report_tokens=partial(_send_usage_metrics, handler=PATH_GENERATE),
),
headers={"x-cms-tracking-id": tracking_id},
@@ -101,7 +107,9 @@ async def generate_text_stream(
request: Request,
prompt: Annotated[str, Body(description="The prompt to be sent to the model", media_type="text/plain")],
max_tokens: Annotated[int, Query(description="The maximum number of tokens to generate", gt=0)] = 512,
temperature: Annotated[float, Query(description="The temperature of the generated text", gt=0.0, lt=1.0)] = 0.7,
temperature: Annotated[float, Query(description="The temperature of the generated text", ge=0.0)] = 0.7,
top_p: Annotated[float, Query(description="The Top-P value for nucleus sampling", ge=0.0, le=1.0)] = 0.9,
stop_sequences: Annotated[List[str], Query(description="The list of sequences used to stop the generation")] = [],
tracking_id: Union[str, None] = Depends(validate_tracking_id),
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
) -> StreamingResponse:
@@ -113,6 +121,8 @@ async def generate_text_stream(
prompt (str): The prompt to be sent to the model.
max_tokens (int): The maximum number of tokens to generate.
temperature (float): The temperature of the generated text.
top_p (float): The Top-P value for nucleus sampling.
stop_sequences (List[str]): The list of sequences used to stop the generation.
tracking_id (Union[str, None]): An optional tracking ID of the requested task.
model_service (AbstractModelService): The model service dependency.

@@ -127,6 +137,8 @@ async def generate_text_stream(
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stop_sequences=stop_sequences,
report_tokens=partial(_send_usage_metrics, handler=PATH_GENERATE_ASYNC),
),
media_type="text/event-stream",
@@ -162,7 +174,7 @@ def generate_chat_completions(

Args:
request (Request): The request object.
request_data (OpenAIChatRequest): The request data containing model, messages, and stream.
request_data (OpenAIChatRequest): The request data containing model, messages, stream, temperature, top_p, and stop_sequences.
tracking_id (Union[str, None]): An optional tracking ID of the requested task.
model_service (AbstractModelService): The model service dependency.

@@ -176,6 +188,8 @@ def generate_chat_completions(
stream = request_data.stream
max_tokens = request_data.max_tokens
temperature = request_data.temperature
top_p = request_data.top_p
stop_sequences = request_data.stop_sequences
tracking_id = tracking_id or str(uuid.uuid4())

if not messages:
@@ -193,7 +207,7 @@ def generate_chat_completions(
headers={"x-cms-tracking-id": tracking_id},
)

async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGenerator:
async def _stream(prompt: str, max_tokens: int, temperature: float, top_p: float, stop_sequences: List[str]) -> AsyncGenerator:
data = {
"id": tracking_id,
"object": "chat.completion.chunk",
@@ -204,6 +218,8 @@ async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGene
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stop_sequences=stop_sequences,
report_tokens=partial(_send_usage_metrics, handler=PATH_OPENAI_COMPLETIONS)
):
data = {
@@ -221,7 +237,7 @@ async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGene
prompt = get_prompt_from_messages(model_service.tokenizer, messages)
if stream:
return StreamingResponse(
_stream(prompt, max_tokens, temperature),
_stream(prompt, max_tokens, temperature, top_p, stop_sequences or []),
media_type="text/event-stream",
headers={"x-cms-tracking-id": tracking_id},
)
@@ -230,6 +246,8 @@ async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGene
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stop_sequences=stop_sequences or [],
send_metrics=partial(_send_usage_metrics, handler=PATH_OPENAI_COMPLETIONS),
)
completion = OpenAIChatResponse(
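For reference, the new `top_p` and `stop_sequences` query parameters can be exercised with any HTTP client. A minimal sketch using `requests` follows; it assumes the non-streaming route is mounted at `/generate` (the streaming counterpart is `/stream/generate` per the README) and relies on FastAPI reading a `List[str]` query parameter from repeated `stop_sequences` keys.
```python
import requests  # any HTTP client works; requests is used here for brevity

# Assumed route: the non-streaming text-generation endpoint at /generate.
response = requests.post(
    "http://127.0.0.1:8000/generate",
    params={
        "max_tokens": 256,
        "temperature": 0.7,
        "top_p": 0.9,
        # requests encodes a list as repeated stop_sequences=... query keys,
        # which FastAPI parses into the List[str] parameter above.
        "stop_sequences": ["###", "Question:"],
    },
    data="What is hypertension?".encode("utf-8"),
    headers={"Content-Type": "text/plain"},
)
print(response.text)
```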
4 changes: 2 additions & 2 deletions app/api/utils.py
@@ -323,8 +323,8 @@ async def init_vllm_engine(app: FastAPI,
)
from vllm import SamplingParams, TokensPrompt
except ImportError:
logger.error("Cannot import the vLLM engine. Please install it with `pip install cms[vllm]`.")
raise ExtraDependencyRequiredException("Cannot import the vLLM engine. Please install it with `pip install cms[vllm]`.")
logger.error("Cannot import the vLLM engine. Please install it with `pip install cms[llm]`.")
raise ExtraDependencyRequiredException("Cannot import the vLLM engine. Please install it with `pip install cms[llm]`.")

parser = FlexibleArgumentParser()
parser = make_arg_parser(parser)
2 changes: 1 addition & 1 deletion app/cli/cli.py
@@ -490,7 +490,7 @@ def package_model(
raise typer.Exit(code=1)

if output_model_package == "":
typer.echo("ERROR: The model package path is not passed in.")
typer.echo("ERROR: The output model package path is not passed in.")
raise typer.Exit(code=1)

model_package_archive = os.path.abspath(os.path.expanduser(output_model_package))
2 changes: 2 additions & 0 deletions app/domain.py
@@ -217,6 +217,8 @@ class OpenAIChatRequest(BaseModel):
max_tokens: int = Field(512, description="The maximum number of tokens to generate", gt=0)
model: str = Field(..., description="The name of the model used for generating the completion")
temperature: float = Field(0.7, description="The temperature of the generated text", ge=0.0, le=1.0)
top_p: float = Field(0.9, description="The top-p value for nucleus sampling", ge=0.0, le=1.0)
stop_sequences: Optional[List[str]] = Field(default=None, description="The list of sequences used to stop the generation")


class OpenAIChatResponse(BaseModel):
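The two new `OpenAIChatRequest` fields can also be set directly in the JSON body of a chat-completions call. A small sketch, assuming the route is served at `/v1/chat/completions` as in the README example and that the response follows the standard OpenAI choices/message shape:
```python
import requests

# Plain JSON request exercising the new top_p and stop_sequences fields.
payload = {
    "model": "Huggingface LLM Model",
    "messages": [{"role": "user", "content": "What is hypertension?"}],
    "max_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.9,
    "stop_sequences": ["###"],
    "stream": False,
}
response = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload)
# Assumes the standard OpenAI response shape for the completion.
print(response.json()["choices"][0]["message"]["content"])
```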
4 changes: 2 additions & 2 deletions app/management/model_manager.py
@@ -293,7 +293,7 @@ def predict_stream(
for _, item in df.iterrows():
yield item.to_dict()

@staticmethod # type: ignore
@func_deprecated
@staticmethod
@func_deprecated()
def _get_pip_requirements() -> str:
return os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "requirements.txt"))
41 changes: 37 additions & 4 deletions app/model_services/huggingface_llm_model.py
@@ -230,8 +230,12 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
def generate(
self,
prompt: str,
min_tokens: int = 100,
max_tokens: int = 512,
num_beams: int = 5,
temperature: float = 0.7,
top_p: float = 0.9,
stop_sequences: Optional[List[str]] = None,
report_tokens: Optional[Callable[[str], None]] = None,
**kwargs: Any
) -> str:
@@ -240,8 +244,12 @@ def generate(

Args:
prompt (str): The prompt for the text generation
min_tokens (int): The minimum number of tokens to generate. Defaults to 100.
max_tokens (int): The maximum number of tokens to generate. Defaults to 512.
num_beams (int): The number of beams for beam search. Defaults to 5.
temperature (float): The temperature for the text generation. Defaults to 0.7.
top_p (float): The Top-P value for nucleus sampling. Defaults to 0.9.
stop_sequences (Optional[List[str]]): List of strings that will stop generation when encountered. Defaults to None.
report_tokens (Optional[Callable[[str], None]]): The callback function to send metrics. Defaults to None.
**kwargs (Any): Additional keyword arguments to be passed to this method.

@@ -257,14 +265,25 @@ def generate(
generation_kwargs = dict(
inputs=inputs.input_ids,
attention_mask=inputs.attention_mask,
min_new_tokens=min_tokens,
max_new_tokens=max_tokens,
do_sample=False,
num_beams=num_beams,
do_sample=True,
temperature=temperature,
top_p=0.9,
top_p=top_p,
repetition_penalty=1.2,
no_repeat_ngram_size=3,
)

outputs = self.model.generate(**generation_kwargs)
generated_text = self.tokenizer.decode(outputs[0], skip_prompt=True, skip_special_tokens=True)

if stop_sequences:
for stop_seq in stop_sequences:
if stop_seq in generated_text:
generated_text = generated_text.split(stop_seq)[0]
break

logger.debug("Response generation completed")

if report_tokens:
@@ -280,6 +299,8 @@ async def generate_async(
prompt: str,
max_tokens: int = 512,
temperature: float = 0.7,
top_p: float = 0.9,
stop_sequences: Optional[List[str]] = None,
report_tokens: Optional[Callable[[str], None]] = None,
**kwargs: Any
) -> AsyncIterable:
@@ -290,6 +311,8 @@ async def generate_async(
prompt (str): The prompt for the text generation.
max_tokens (int): The maximum number of tokens to generate. Defaults to 512.
temperature (float): The temperature for the text generation. Defaults to 0.7.
top_p (float): The Top-P value for nucleus sampling. Defaults to 0.9.
stop_sequences (Optional[List[str]]): List of strings that will stop generation when encountered. Defaults to None.
report_tokens (Optional[Callable[[str], None]]): The callback function to send metrics. Defaults to None.
**kwargs (Any): Additional keyword arguments to be passed to the model loader.

@@ -314,15 +337,25 @@ async def generate_async(
max_new_tokens=max_tokens,
do_sample=True,
temperature=temperature,
top_p=0.9,
top_p=top_p,
repetition_penalty=1.2,
no_repeat_ngram_size=3,
)

try:
_ = self._text_generator.submit(self.model.generate, **generation_kwargs)
output = ""
for content in streamer:
yield content
prev_output = output
output += content
if stop_sequences:
for stop_seq in stop_sequences:
if stop_seq in output:
remaining = output[len(prev_output):output.find(stop_seq)]
if remaining:
yield remaining
return
yield content
await asyncio.sleep(0.01)
if report_tokens:
report_tokens(
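The streaming path above trims the final chunk so nothing past the first stop sequence is emitted. A standalone sketch of the same accumulate-and-trim idea (the helper name and example values are illustrative, not part of this PR):
```python
from typing import Iterable, Iterator, List


def truncate_at_stop(chunks: Iterable[str], stop_sequences: List[str]) -> Iterator[str]:
    """Yield chunks until a stop sequence appears in the accumulated text,
    trimming the final chunk so nothing past the stop sequence is emitted."""
    output = ""
    for chunk in chunks:
        prev_len = len(output)
        output += chunk
        for stop_seq in stop_sequences:
            idx = output.find(stop_seq)
            if idx != -1:
                remaining = output[prev_len:idx]  # part of this chunk before the stop
                if remaining:
                    yield remaining
                return
        yield chunk


print(list(truncate_at_stop(["Hyper", "tension is high", " blood pressure.### ignored"], ["###"])))
# -> ['Hyper', 'tension is high', ' blood pressure.']
```
As in the diff, a stop sequence split across two chunks is only detected once it is complete, so its leading characters may already have been streamed; the sketch mirrors that behaviour.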
19 changes: 5 additions & 14 deletions app/trainers/huggingface_llm_trainer.py
@@ -15,7 +15,6 @@
TrainingArguments,
PreTrainedModel,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
TrainerCallback,
TrainerState,
TrainerControl,
@@ -34,13 +33,11 @@
get_model_data_package_base_name,
)
from app.trainers.base import SupervisedTrainer
from app.domain import ModelType, TrainerBackend, LlmRole, LlmTrainerType, LlmDatasetType, PromptMessage, Device
from app.domain import ModelType, TrainerBackend, LlmRole, LlmTrainerType, LlmDatasetType, PromptMessage
from app.exception import (
TrainingCancelledException,
ManagedModelException,
DatasetException,
ConfigurationException,
DeviceNotAvailableError,
ExtraDependencyRequiredException,
)
if TYPE_CHECKING:
@@ -81,9 +78,6 @@ class HuggingFaceLlmSupervisedTrainer(SupervisedTrainer, _HuggingFaceLlmTrainerC
CONTINUING_TOKEN_LABEL_ID = 1

def __init__(self, model_service: "HuggingFaceLlmModel") -> None:
if not isinstance(model_service.tokenizer, PreTrainedTokenizerFast):
logger.error("The supervised trainer requires a fast tokenizer to function correctly")
raise ManagedModelException("The supervised trainer requires a fast tokenizer to function correctly")
SupervisedTrainer.__init__(self, model_service._config, model_service.model_name)
self._model_service = model_service
self._model_name = model_service.model_name
@@ -299,15 +293,12 @@ def run(
run_id (str): The run ID of the training job.
description (Optional[str]): The optional description of the training or change logs.
"""

if self._config.DEVICE is not Device.GPU.value:
raise DeviceNotAvailableError("This trainer currently requires a CUDA device")

try:
from trl import GRPOConfig, GRPOTrainer # , PPOConfig, PPOTrainer
except ImportError:
logger.error("Cannot import the GRPO Trainer. Please install it with `pip install cms[vllm]`.")
raise ExtraDependencyRequiredException("Cannot import the GRPO Trainer. Please install it with `pip install cms[vllm]`.")
except ImportError as e:
logger.exception(e)
logger.error("Cannot import the GRPO Trainer. Please install it with `pip install cms[llm]`.")
raise ExtraDependencyRequiredException("Cannot import the GRPO Trainer. Please install it with `pip install cms[llm]`.")

trained_model_pack_path = None
redeploy = self._config.REDEPLOY_TRAINED_MODEL == "true"