2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
@@ -28,7 +28,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
uv sync --extra dev --extra docs --extra vllm
uv sync --extra dev --extra docs --extra llm
uv run python -m ensurepip
- name: Check types
run: |
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
@@ -41,7 +41,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
uv sync --extra dev --extra docs --extra vllm
uv sync --extra dev --extra docs --extra llm
- name: Run unit tests
run: |
uv run pytest -v tests/app --cov --cov-report=html:coverage_reports #--random-order
24 changes: 24 additions & 0 deletions README.md
@@ -230,6 +230,30 @@ curl -N -X 'POST' 'http://127.0.0.1:8000/stream/generate?max_tokens=512' \
-d 'What is hypertension?'
```

CMS also exposes OpenAI-compatible APIs. For example, using the OpenAI Python SDK, you can generate text and retrieve embeddings with the following snippet:
```python
from openai import OpenAI

client = OpenAI(api_key="dummy", base_url="http://localhost:8000/v1")
completion = client.chat.completions.create(
model="Huggingface LLM Model",
messages=[
{"role": "system", "content": "You are bot answering questions from the user"},
{"role": "user", "content": "What is hypertension?"},
],
max_tokens=256,
temperature=0.7,
stream=False
)
print(f"CMS => {completion.choices[0].message.content}")
embeddings = client.embeddings.create(
    model="Huggingface LLM Model", input="What is hypertension?"
)
print(f"Embedding dimension: {len(embeddings.data[0].embedding)}")
```
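The same client can also stream responses and forward the new sampling controls added in this PR. The snippet below is a minimal sketch: `top_p` maps onto the standard OpenAI parameter, while `stop_sequences` is a CMS-specific field on `OpenAIChatRequest`, so it is passed via `extra_body`; the delta-style chunk shape is assumed to follow the usual OpenAI streaming format.
```python
# Hedged sketch: stream a chat completion with the new sampling controls.
stream = client.chat.completions.create(
    model="Huggingface LLM Model",
    messages=[{"role": "user", "content": "What is hypertension?"}],
    max_tokens=256,
    temperature=0.7,
    top_p=0.9,
    extra_body={"stop_sequences": ["###"]},  # assumption: matches OpenAIChatRequest.stop_sequences
    stream=True,
)
for chunk in stream:
    # Assumes standard OpenAI-style delta chunks; skip empty deltas.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```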
Note that to enable quantization and training features, you need to install the `llm` extra dependencies:
```commandline
pip install '.[llm]'
```
#### Chat with served models
You can also "chat" with the running model using the `/stream/ws` endpoint. For example:
```html
<!-- HTML example truncated in this diff view -->
```
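For a scripted alternative to the HTML page, a rough Python client might look like the sketch below. It assumes the `/stream/ws` socket accepts a plain-text prompt and streams plain-text chunks back (not confirmed by this diff) and uses the third-party `websockets` package.
```python
import asyncio

import websockets  # third-party dependency, assumed installed


async def chat() -> None:
    # Assumption: the socket takes a plain-text prompt and streams text chunks back.
    async with websockets.connect("ws://127.0.0.1:8000/stream/ws") as ws:
        await ws.send("What is hypertension?")
        try:
            async for chunk in ws:
                print(chunk, end="", flush=True)
        except websockets.ConnectionClosed:
            pass  # server closed the stream


asyncio.run(chat())
```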
30 changes: 24 additions & 6 deletions app/api/routers/generative.py
@@ -4,7 +4,7 @@
import uuid
import app.api.globals as cms_globals

from typing import Union, Iterable, AsyncGenerator
from typing import Union, Iterable, AsyncGenerator, List
from typing_extensions import Annotated
from functools import partial
from fastapi import APIRouter, Depends, Request, Body, Query
@@ -51,7 +51,9 @@ def generate_text(
request: Request,
prompt: Annotated[str, Body(description="The prompt to be sent to the model", media_type="text/plain")],
max_tokens: Annotated[int, Query(description="The maximum number of tokens to generate", gt=0)] = 512,
temperature: Annotated[float, Query(description="The temperature of the generated text", gt=0.0, lt=1.0)] = 0.7,
temperature: Annotated[float, Query(description="The temperature of the generated text", ge=0.0)] = 0.7,
top_p: Annotated[float, Query(description="The Top-P value for nucleus sampling", ge=0.0, le=1.0)] = 0.9,
stop_sequences: Annotated[List[str], Query(description="The list of sequences used to stop the generation")] = [],
tracking_id: Union[str, None] = Depends(validate_tracking_id),
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
) -> PlainTextResponse:
@@ -63,6 +65,8 @@ def generate_text(
prompt (str): The prompt to be sent to the model.
max_tokens (int): The maximum number of tokens to generate.
temperature (float): The temperature of the generated text.
top_p (float): The Top-P value for nucleus sampling.
stop_sequences (List[str]): The list of sequences used to stop the generation.
tracking_id (Union[str, None]): An optional tracking ID of the requested task.
model_service (AbstractModelService): The model service dependency.

@@ -77,6 +81,8 @@ def generate_text(
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stop_sequences=stop_sequences,
report_tokens=partial(_send_usage_metrics, handler=PATH_GENERATE),
),
headers={"x-cms-tracking-id": tracking_id},
@@ -101,7 +107,9 @@ async def generate_text_stream(
request: Request,
prompt: Annotated[str, Body(description="The prompt to be sent to the model", media_type="text/plain")],
max_tokens: Annotated[int, Query(description="The maximum number of tokens to generate", gt=0)] = 512,
temperature: Annotated[float, Query(description="The temperature of the generated text", gt=0.0, lt=1.0)] = 0.7,
temperature: Annotated[float, Query(description="The temperature of the generated text", ge=0.0)] = 0.7,
top_p: Annotated[float, Query(description="The Top-P value for nucleus sampling", ge=0.0, le=1.0)] = 0.9,
stop_sequences: Annotated[List[str], Query(description="The list of sequences used to stop the generation")] = [],
tracking_id: Union[str, None] = Depends(validate_tracking_id),
model_service: AbstractModelService = Depends(cms_globals.model_service_dep)
) -> StreamingResponse:
@@ -113,6 +121,8 @@ async def generate_text_stream(
prompt (str): The prompt to be sent to the model.
max_tokens (int): The maximum number of tokens to generate.
temperature (float): The temperature of the generated text.
top_p (float): The Top-P value for nucleus sampling.
stop_sequences (List[str]): The list of sequences used to stop the generation.
tracking_id (Union[str, None]): An optional tracking ID of the requested task.
model_service (AbstractModelService): The model service dependency.

@@ -127,6 +137,8 @@ async def generate_text_stream(
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stop_sequences=stop_sequences,
report_tokens=partial(_send_usage_metrics, handler=PATH_GENERATE_ASYNC),
),
media_type="text/event-stream",
@@ -162,7 +174,7 @@ def generate_chat_completions(

Args:
request (Request): The request object.
request_data (OpenAIChatRequest): The request data containing model, messages, and stream.
request_data (OpenAIChatRequest): The request data containing model, messages, stream, temperature, top_p, and stop_sequences.
tracking_id (Union[str, None]): An optional tracking ID of the requested task.
model_service (AbstractModelService): The model service dependency.

@@ -176,6 +188,8 @@ def generate_chat_completions(
stream = request_data.stream
max_tokens = request_data.max_tokens
temperature = request_data.temperature
top_p = request_data.top_p
stop_sequences = request_data.stop_sequences
tracking_id = tracking_id or str(uuid.uuid4())

if not messages:
@@ -193,7 +207,7 @@ def generate_chat_completions(
headers={"x-cms-tracking-id": tracking_id},
)

async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGenerator:
async def _stream(prompt: str, max_tokens: int, temperature: float, top_p: float, stop_sequences: List[str]) -> AsyncGenerator:
data = {
"id": tracking_id,
"object": "chat.completion.chunk",
@@ -204,6 +218,8 @@ async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGene
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stop_sequences=stop_sequences,
report_tokens=partial(_send_usage_metrics, handler=PATH_OPENAI_COMPLETIONS)
):
data = {
@@ -221,7 +237,7 @@ async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGene
prompt = get_prompt_from_messages(model_service.tokenizer, messages)
if stream:
return StreamingResponse(
_stream(prompt, max_tokens, temperature),
_stream(prompt, max_tokens, temperature, top_p, stop_sequences or []),
media_type="text/event-stream",
headers={"x-cms-tracking-id": tracking_id},
)
@@ -230,6 +246,8 @@ async def _stream(prompt: str, max_tokens: int, temperature: float) -> AsyncGene
prompt,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stop_sequences=stop_sequences or [],
send_metrics=partial(_send_usage_metrics, handler=PATH_OPENAI_COMPLETIONS),
)
completion = OpenAIChatResponse(
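For reference, the new `top_p` and `stop_sequences` query parameters can be exercised with any HTTP client. A minimal sketch using `requests` follows; it assumes the non-streaming route is mounted at `/generate` (the streaming counterpart is `/stream/generate` per the README) and relies on FastAPI reading a `List[str]` query parameter from repeated `stop_sequences` keys.
```python
import requests  # any HTTP client works; requests is used here for brevity

# Assumed route: the non-streaming text-generation endpoint at /generate.
response = requests.post(
    "http://127.0.0.1:8000/generate",
    params={
        "max_tokens": 256,
        "temperature": 0.7,
        "top_p": 0.9,
        # requests encodes a list as repeated stop_sequences=... query keys,
        # which FastAPI parses into the List[str] parameter above.
        "stop_sequences": ["###", "Question:"],
    },
    data="What is hypertension?".encode("utf-8"),
    headers={"Content-Type": "text/plain"},
)
print(response.text)
```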
4 changes: 2 additions & 2 deletions app/api/utils.py
@@ -323,8 +323,8 @@ async def init_vllm_engine(app: FastAPI,
)
from vllm import SamplingParams, TokensPrompt
except ImportError:
logger.error("Cannot import the vLLM engine. Please install it with `pip install cms[vllm]`.")
raise ExtraDependencyRequiredException("Cannot import the vLLM engine. Please install it with `pip install cms[vllm]`.")
logger.error("Cannot import the vLLM engine. Please install it with `pip install cms[llm]`.")
raise ExtraDependencyRequiredException("Cannot import the vLLM engine. Please install it with `pip install cms[llm]`.")

parser = FlexibleArgumentParser()
parser = make_arg_parser(parser)
2 changes: 1 addition & 1 deletion app/cli/cli.py
@@ -490,7 +490,7 @@ def package_model(
raise typer.Exit(code=1)

if output_model_package == "":
typer.echo("ERROR: The model package path is not passed in.")
typer.echo("ERROR: The output model package path is not passed in.")
raise typer.Exit(code=1)

model_package_archive = os.path.abspath(os.path.expanduser(output_model_package))
2 changes: 2 additions & 0 deletions app/domain.py
@@ -217,6 +217,8 @@ class OpenAIChatRequest(BaseModel):
max_tokens: int = Field(512, description="The maximum number of tokens to generate", gt=0)
model: str = Field(..., description="The name of the model used for generating the completion")
temperature: float = Field(0.7, description="The temperature of the generated text", ge=0.0, le=1.0)
top_p: float = Field(0.9, description="The top-p value for nucleus sampling", ge=0.0, le=1.0)
stop_sequences: Optional[List[str]] = Field(default=None, description="The list of sequences used to stop the generation")


class OpenAIChatResponse(BaseModel):
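The two new `OpenAIChatRequest` fields can also be set directly in the JSON body of a chat-completions call. A small sketch, assuming the route is served at `/v1/chat/completions` as in the README example and that the response follows the standard OpenAI choices/message shape:
```python
import requests

# Plain JSON request exercising the new top_p and stop_sequences fields.
payload = {
    "model": "Huggingface LLM Model",
    "messages": [{"role": "user", "content": "What is hypertension?"}],
    "max_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.9,
    "stop_sequences": ["###"],
    "stream": False,
}
response = requests.post("http://127.0.0.1:8000/v1/chat/completions", json=payload)
# Assumes the standard OpenAI response shape for the completion.
print(response.json()["choices"][0]["message"]["content"])
```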
4 changes: 2 additions & 2 deletions app/management/model_manager.py
@@ -293,7 +293,7 @@ def predict_stream(
for _, item in df.iterrows():
yield item.to_dict()

@staticmethod # type: ignore
@func_deprecated
@staticmethod
@func_deprecated()
def _get_pip_requirements() -> str:
return os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "requirements.txt"))
41 changes: 37 additions & 4 deletions app/model_services/huggingface_llm_model.py
@@ -230,8 +230,12 @@ def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
def generate(
self,
prompt: str,
min_tokens: int = 100,
max_tokens: int = 512,
num_beams: int = 5,
temperature: float = 0.7,
top_p: float = 0.9,
stop_sequences: Optional[List[str]] = None,
report_tokens: Optional[Callable[[str], None]] = None,
**kwargs: Any
) -> str:
@@ -240,8 +244,12 @@ def generate(

Args:
prompt (str): The prompt for the text generation
min_tokens (int): The minimum number of tokens to generate. Defaults to 100.
max_tokens (int): The maximum number of tokens to generate. Defaults to 512.
num_beams (int): The number of beams for beam search. Defaults to 5.
temperature (float): The temperature for the text generation. Defaults to 0.7.
top_p (float): The Top-P value for nucleus sampling. Defaults to 0.9.
stop_sequences (Optional[List[str]]): List of strings that will stop generation when encountered. Defaults to None.
report_tokens (Optional[Callable[[str], None]]): The callback function to send metrics. Defaults to None.
**kwargs (Any): Additional keyword arguments to be passed to this method.

@@ -257,14 +265,25 @@ def generate(
generation_kwargs = dict(
inputs=inputs.input_ids,
attention_mask=inputs.attention_mask,
min_new_tokens=min_tokens,
max_new_tokens=max_tokens,
do_sample=False,
num_beams=num_beams,
do_sample=True,
temperature=temperature,
top_p=0.9,
top_p=top_p,
repetition_penalty=1.2,
no_repeat_ngram_size=3,
)

outputs = self.model.generate(**generation_kwargs)
generated_text = self.tokenizer.decode(outputs[0], skip_prompt=True, skip_special_tokens=True)

if stop_sequences:
for stop_seq in stop_sequences:
if stop_seq in generated_text:
generated_text = generated_text.split(stop_seq)[0]
break

logger.debug("Response generation completed")

if report_tokens:
@@ -280,6 +299,8 @@ async def generate_async(
prompt: str,
max_tokens: int = 512,
temperature: float = 0.7,
top_p: float = 0.9,
stop_sequences: Optional[List[str]] = None,
report_tokens: Optional[Callable[[str], None]] = None,
**kwargs: Any
) -> AsyncIterable:
@@ -290,6 +311,8 @@ async def generate_async(
prompt (str): The prompt for the text generation.
max_tokens (int): The maximum number of tokens to generate. Defaults to 512.
temperature (float): The temperature for the text generation. Defaults to 0.7.
top_p (float): The Top-P value for nucleus sampling. Defaults to 0.9.
stop_sequences (Optional[List[str]]): List of strings that will stop generation when encountered. Defaults to None.
report_tokens (Optional[Callable[[str], None]]): The callback function to send metrics. Defaults to None.
**kwargs (Any): Additional keyword arguments to be passed to the model loader.

@@ -314,15 +337,25 @@ async def generate_async(
max_new_tokens=max_tokens,
do_sample=True,
temperature=temperature,
top_p=0.9,
top_p=top_p,
repetition_penalty=1.2,
no_repeat_ngram_size=3,
)

try:
_ = self._text_generator.submit(self.model.generate, **generation_kwargs)
output = ""
for content in streamer:
yield content
prev_output = output
output += content
if stop_sequences:
for stop_seq in stop_sequences:
if stop_seq in output:
remaining = output[len(prev_output):output.find(stop_seq)]
if remaining:
yield remaining
return
yield content
await asyncio.sleep(0.01)
if report_tokens:
report_tokens(
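The streaming path above trims the final chunk so nothing past the first stop sequence is emitted. A standalone sketch of the same accumulate-and-trim idea (the helper name and example values are illustrative, not part of this PR):
```python
from typing import Iterable, Iterator, List


def truncate_at_stop(chunks: Iterable[str], stop_sequences: List[str]) -> Iterator[str]:
    """Yield chunks until a stop sequence appears in the accumulated text,
    trimming the final chunk so nothing past the stop sequence is emitted."""
    output = ""
    for chunk in chunks:
        prev_len = len(output)
        output += chunk
        for stop_seq in stop_sequences:
            idx = output.find(stop_seq)
            if idx != -1:
                remaining = output[prev_len:idx]  # part of this chunk before the stop
                if remaining:
                    yield remaining
                return
        yield chunk


print(list(truncate_at_stop(["Hyper", "tension is high", " blood pressure.### ignored"], ["###"])))
# -> ['Hyper', 'tension is high', ' blood pressure.']
```
As in the diff, a stop sequence split across two chunks is only detected once it is complete, so its leading characters may already have been streamed; the sketch mirrors that behaviour.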
19 changes: 5 additions & 14 deletions app/trainers/huggingface_llm_trainer.py
@@ -15,7 +15,6 @@
TrainingArguments,
PreTrainedModel,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
TrainerCallback,
TrainerState,
TrainerControl,
@@ -34,13 +33,11 @@
get_model_data_package_base_name,
)
from app.trainers.base import SupervisedTrainer
from app.domain import ModelType, TrainerBackend, LlmRole, LlmTrainerType, LlmDatasetType, PromptMessage, Device
from app.domain import ModelType, TrainerBackend, LlmRole, LlmTrainerType, LlmDatasetType, PromptMessage
from app.exception import (
TrainingCancelledException,
ManagedModelException,
DatasetException,
ConfigurationException,
DeviceNotAvailableError,
ExtraDependencyRequiredException,
)
if TYPE_CHECKING:
@@ -81,9 +78,6 @@ class HuggingFaceLlmSupervisedTrainer(SupervisedTrainer, _HuggingFaceLlmTrainerC
CONTINUING_TOKEN_LABEL_ID = 1

def __init__(self, model_service: "HuggingFaceLlmModel") -> None:
if not isinstance(model_service.tokenizer, PreTrainedTokenizerFast):
logger.error("The supervised trainer requires a fast tokenizer to function correctly")
raise ManagedModelException("The supervised trainer requires a fast tokenizer to function correctly")
SupervisedTrainer.__init__(self, model_service._config, model_service.model_name)
self._model_service = model_service
self._model_name = model_service.model_name
@@ -299,15 +293,12 @@ def run(
run_id (str): The run ID of the training job.
description (Optional[str]): The optional description of the training or change logs.
"""

if self._config.DEVICE is not Device.GPU.value:
raise DeviceNotAvailableError("This trainer currently requires a CUDA device")

try:
from trl import GRPOConfig, GRPOTrainer # , PPOConfig, PPOTrainer
except ImportError:
logger.error("Cannot import the GRPO Trainer. Please install it with `pip install cms[vllm]`.")
raise ExtraDependencyRequiredException("Cannot import the GRPO Trainer. Please install it with `pip install cms[vllm]`.")
except ImportError as e:
logger.exception(e)
logger.error("Cannot import the GRPO Trainer. Please install it with `pip install cms[llm]`.")
raise ExtraDependencyRequiredException("Cannot import the GRPO Trainer. Please install it with `pip install cms[llm]`.")

trained_model_pack_path = None
redeploy = self._config.REDEPLOY_TRAINED_MODEL == "true"