From fae847a43745e49b9f926373aa6120f5a5ff7803 Mon Sep 17 00:00:00 2001
From: GangGreenTemperTatum <104169244+GangGreenTemperTatum@users.noreply.github.com>
Date: Wed, 7 Jan 2026 12:46:24 -0500
Subject: [PATCH 1/3] chore: add custom system prompt capability to judge

---
 dreadnode/scorers/judge.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/dreadnode/scorers/judge.py b/dreadnode/scorers/judge.py
index df5bab28..46e0ac06 100644
--- a/dreadnode/scorers/judge.py
+++ b/dreadnode/scorers/judge.py
@@ -46,6 +46,7 @@ def llm_judge(
     min_score: float | None = None,
     max_score: float | None = None,
     name: str = "llm_judge",
+    system_prompt: str | None = None,
 ) -> "Scorer[t.Any]":
     """
     Score the output of a task using an LLM to judge it against a rubric.
@@ -60,6 +61,7 @@
         min_score: Optional minimum score for the judgement - if provided, the score will be clamped to this value.
         max_score: Optional maximum score for the judgement - if provided, the score will be clamped to this value.
         name: The name of the scorer.
+        system_prompt: Optional custom system prompt for the judge. If None, uses default.
     """
 
     async def evaluate(
@@ -97,7 +99,16 @@ async def evaluate(
             rubric=rubric,
         )
 
-        judgement = await judge.bind(generator)(input_data)
+        if system_prompt:
+            completion = (
+                await generator.chat(system_prompt)
+                .add(input_data.model_dump_json())
+                .until_parsed_as(Judgement)
+                .run()
+            )
+            judgement = completion.last.parse(Judgement)
+        else:
+            judgement = await judge.bind(generator)(input_data)
 
         if min_score is not None:
             judgement.score = max(min_score, judgement.score)

From 066ad786f3c644124393501baea81d0a878844bf Mon Sep 17 00:00:00 2001
From: monoxgas
Date: Mon, 12 Jan 2026 12:57:30 -0700
Subject: [PATCH 2/3] Fixing system prompt for judge

---
 dreadnode/scorers/judge.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/dreadnode/scorers/judge.py b/dreadnode/scorers/judge.py
index 46e0ac06..2815091a 100644
--- a/dreadnode/scorers/judge.py
+++ b/dreadnode/scorers/judge.py
@@ -76,6 +76,7 @@ async def evaluate(
         model_params: rg.GenerateParams | AnyDict | None = model_params,
         min_score: float | None = min_score,
         max_score: float | None = max_score,
+        system_prompt: str | None = system_prompt,
     ) -> list[Metric]:
         generator: rg.Generator
         if isinstance(model, str):
@@ -99,16 +100,11 @@ async def evaluate(
             rubric=rubric,
         )
 
-        if system_prompt:
-            completion = (
-                await generator.chat(system_prompt)
-                .add(input_data.model_dump_json())
-                .until_parsed_as(Judgement)
-                .run()
-            )
-            judgement = completion.last.parse(Judgement)
-        else:
-            judgement = await judge.bind(generator)(input_data)
+        judgement = await judge.bind(
+            generator.chat({"role": "system", "content": system_prompt})
+            if system_prompt
+            else generator
+        )(input_data)
 
         if min_score is not None:
             judgement.score = max(min_score, judgement.score)
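For context on the 2/3 fix before the final patch lands: patch 1/3 passed the
prompt to `generator.chat()` as a bare string, while patch 2/3 pins it to the
system role explicitly. A minimal sketch of the distinction, assuming
rigging's `chat()` treats a bare string as user content (an assumption about
the rigging API, not verified here; the model id is an illustrative
placeholder):

    import rigging as rg

    generator = rg.get_generator("gpt-4o-mini")  # illustrative model id
    system_prompt = "You are a strict judge."

    # As in PATCH 1/3: under the assumed API, a bare string lands in the
    # conversation as a *user* message, so the model never sees a system prompt.
    pipeline_v1 = generator.chat(system_prompt)

    # As in PATCH 2/3: an explicit role/content dict pins the message to the
    # system role, which is what the judge model should receive.
    pipeline_v2 = generator.chat({"role": "system", "content": system_prompt})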
From c76f63d17eec5ec036158534d808e6695a822f3e Mon Sep 17 00:00:00 2001
From: Raja Sekhar Rao Dheekonda
Date: Mon, 12 Jan 2026 12:38:38 -0800
Subject: [PATCH 3/3] leverage inject system content util function in judge module

---
 dreadnode/scorers/judge.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/dreadnode/scorers/judge.py b/dreadnode/scorers/judge.py
index 2815091a..a323752f 100644
--- a/dreadnode/scorers/judge.py
+++ b/dreadnode/scorers/judge.py
@@ -100,11 +100,10 @@ async def evaluate(
             rubric=rubric,
         )
 
-        judgement = await judge.bind(
-            generator.chat({"role": "system", "content": system_prompt})
-            if system_prompt
-            else generator
-        )(input_data)
+        pipeline = generator.chat([])
+        if system_prompt:
+            pipeline.chat.inject_system_content(system_prompt)
+        judgement = await judge.bind(pipeline)(input_data)
 
         if min_score is not None:
             judgement.score = max(min_score, judgement.score)
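Taken together, the series leaves `llm_judge` with an optional `system_prompt`
that is injected through rigging's `inject_system_content` utility. A minimal
usage sketch, assuming the full `llm_judge` signature matches the fragments
visible in these diffs; the model id and rubric text are illustrative
placeholders, not from the patches:

    from dreadnode.scorers.judge import llm_judge

    scorer = llm_judge(
        model="gpt-4o-mini",  # assumed: any rigging-resolvable model id
        rubric="Award 1.0 if the answer cites a primary source, else 0.0.",
        min_score=0.0,
        max_score=1.0,
        system_prompt="You are a strict fact-checking judge.",
    )
    # With system_prompt=None, the pipeline stays generator.chat([]) and the
    # judge's default prompting applies (per PATCH 3/3).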