diff --git a/dreadnode/scorers/judge.py b/dreadnode/scorers/judge.py
index df5bab28..a323752f 100644
--- a/dreadnode/scorers/judge.py
+++ b/dreadnode/scorers/judge.py
@@ -46,6 +46,7 @@ def llm_judge(
     min_score: float | None = None,
     max_score: float | None = None,
     name: str = "llm_judge",
+    system_prompt: str | None = None,
 ) -> "Scorer[t.Any]":
     """
     Score the output of a task using an LLM to judge it against a rubric.
@@ -60,6 +61,7 @@ def llm_judge(
         min_score: Optional minimum score for the judgement - if provided, the score will be clamped to this value.
         max_score: Optional maximum score for the judgement - if provided, the score will be clamped to this value.
         name: The name of the scorer.
+        system_prompt: Optional custom system prompt for the judge. If None, uses the default.
     """
 
     async def evaluate(
@@ -74,6 +76,7 @@ async def evaluate(
         model_params: rg.GenerateParams | AnyDict | None = model_params,
         min_score: float | None = min_score,
         max_score: float | None = max_score,
+        system_prompt: str | None = system_prompt,
     ) -> list[Metric]:
         generator: rg.Generator
         if isinstance(model, str):
@@ -97,7 +100,10 @@ async def evaluate(
             rubric=rubric,
         )
 
-        judgement = await judge.bind(generator)(input_data)
+        pipeline = generator.chat([])
+        if system_prompt:
+            pipeline.chat.inject_system_content(system_prompt)
+        judgement = await judge.bind(pipeline)(input_data)
 
         if min_score is not None:
             judgement.score = max(min_score, judgement.score)
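
Usage sketch (not part of the patch): one way a caller might exercise the new system_prompt argument. The import path and the rubric/model keyword names are assumptions inferred from the signature and closure shown above, not confirmed against the library.

# Hypothetical usage; import path and keyword names are assumptions based on the diff.
from dreadnode.scorers.judge import llm_judge

scorer = llm_judge(
    rubric="Rate the answer for factual accuracy on a 0-1 scale.",  # rubric name assumed from `rubric=rubric` above
    model="gpt-4o-mini",  # a string model id, per the `isinstance(model, str)` branch
    system_prompt="You are a strict grader. Penalize unsupported claims.",
    min_score=0.0,
    max_score=1.0,
)

When system_prompt is left as None, the injection branch is skipped and the judge runs with whatever default prompt the pipeline already carries.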