diff --git a/dreadnode/scorers/judge.py b/dreadnode/scorers/judge.py
index df5bab28..a323752f 100644
--- a/dreadnode/scorers/judge.py
+++ b/dreadnode/scorers/judge.py
@@ -46,6 +46,7 @@ def llm_judge(
     min_score: float | None = None,
     max_score: float | None = None,
     name: str = "llm_judge",
+    system_prompt: str | None = None,
 ) -> "Scorer[t.Any]":
     """
     Score the output of a task using an LLM to judge it against a rubric.
@@ -60,6 +61,7 @@ def llm_judge(
         min_score: Optional minimum score for the judgement - if provided, the score will be clamped to this value.
         max_score: Optional maximum score for the judgement - if provided, the score will be clamped to this value.
         name: The name of the scorer.
+        system_prompt: Optional custom system prompt for the judge. If None, uses the default.
     """
 
     async def evaluate(
@@ -74,6 +76,7 @@ async def evaluate(
         model_params: rg.GenerateParams | AnyDict | None = model_params,
         min_score: float | None = min_score,
         max_score: float | None = max_score,
+        system_prompt: str | None = system_prompt,
     ) -> list[Metric]:
         generator: rg.Generator
         if isinstance(model, str):
@@ -97,7 +100,10 @@ async def evaluate(
             rubric=rubric,
         )
 
-        judgement = await judge.bind(generator)(input_data)
+        pipeline = generator.chat([])
+        if system_prompt:
+            pipeline.chat.inject_system_content(system_prompt)
+        judgement = await judge.bind(pipeline)(input_data)
 
         if min_score is not None:
             judgement.score = max(min_score, judgement.score)
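
Usage sketch (not part of the patch): one way a caller might exercise the new system_prompt argument. The import path and the rubric/model keyword names are assumptions inferred from the signature and closure shown above, not confirmed against the library.

# Hypothetical usage; import path and keyword names are assumptions based on the diff.
from dreadnode.scorers.judge import llm_judge

scorer = llm_judge(
    rubric="Rate the answer for factual accuracy on a 0-1 scale.",  # rubric name assumed from `rubric=rubric` above
    model="gpt-4o-mini",  # a string model id, per the `isinstance(model, str)` branch
    system_prompt="You are a strict grader. Penalize unsupported claims.",
    min_score=0.0,
    max_score=1.0,
)

When system_prompt is left as None, the injection branch is skipped and the judge runs with whatever default prompt the pipeline already carries.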