
Commit e515e0f

google-genai-bot authored and copybara-github committed
feat: Introduce a post-hoc, per-turn evaluator for user simulations
PiperOrigin-RevId: 844818512
1 parent 69997cd commit e515e0f

11 files changed, +1157 −0 lines changed

src/google/adk/evaluation/eval_metrics.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -57,6 +57,8 @@ class PrebuiltMetrics(Enum):
 
   RUBRIC_BASED_TOOL_USE_QUALITY_V1 = "rubric_based_tool_use_quality_v1"
 
+  PER_TURN_USER_SIMULATOR_QUALITY_V1 = "per_turn_user_simulator_quality_v1"
+
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
 Threshold: TypeAlias = float
@@ -223,6 +225,19 @@ class MatchType(Enum):
   )
 
 
+class LlmBackedUserSimulatorCriterion(LlmAsAJudgeCriterion):
+  """Criterion for LLM-backed User Simulator Evaluators."""
+
+  stop_signal: str = Field(
+      default="</finished>",
+      description=(
+          "Stop signal to validate the successful completion of a conversation."
+          " For optimal performance, this should match the one in the User"
+          " Simulator."
+      ),
+  )
+
+
 class EvalMetric(EvalBaseModel):
   """A metric used to evaluate a particular aspect of an eval case."""
 
```

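The new criterion only layers a `stop_signal` field on top of `LlmAsAJudgeCriterion`. As a rough sketch (not taken from this commit), a metric configuration using it might look like the following; the `EvalMetric` fields other than `metric_name` (`threshold`, `criterion`) are assumed to follow the pattern of the other criterion-backed metrics in this module.

```python
# Illustrative sketch only: assumes EvalMetric accepts `threshold` and
# `criterion` the way other criterion-backed metrics in eval_metrics.py do.
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import LlmBackedUserSimulatorCriterion
from google.adk.evaluation.eval_metrics import PrebuiltMetrics

# Override the stop signal so it matches the one configured on the user
# simulator; "</finished>" is also the default.
criterion = LlmBackedUserSimulatorCriterion(stop_signal="</finished>")

per_turn_user_sim_metric = EvalMetric(
    metric_name=PrebuiltMetrics.PER_TURN_USER_SIMULATOR_QUALITY_V1.value,
    threshold=0.8,  # Example value, not prescribed by the commit.
    criterion=criterion,
)
```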
src/google/adk/evaluation/evaluator.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -20,6 +20,7 @@
 from pydantic import BaseModel
 from typing_extensions import TypeAlias
 
+from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalStatus
@@ -62,6 +63,7 @@ def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
       expected_invocations: Optional[list[Invocation]],
+      conversation_scenario: Optional[ConversationScenario],
   ) -> EvaluationResult:
     """Returns EvaluationResult after performing evaluations using actual and expected invocations.
 
@@ -72,5 +74,7 @@ def evaluate_invocations(
         usually act as a benchmark/golden response. If these are specified
         usually the expectation is that the length of this list and actual
         invocation is the same.
+      conversation_scenario: An optional conversation scenario for multi-turn
+        conversations.
     """
     raise NotImplementedError()
```

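Every `Evaluator` now receives an optional `ConversationScenario` alongside the actual and expected invocations. A hypothetical third-party evaluator adopting the new contract might look like the sketch below; `TurnCountEvaluator` is not a real ADK class, and constructing `EvaluationResult(overall_score=...)` assumes the field used by the built-in evaluators.

```python
# Hypothetical example of the updated Evaluator contract (not part of this
# commit). EvaluationResult(overall_score=...) assumes the field used by the
# existing built-in evaluators.
from typing import Optional

from google.adk.evaluation.eval_case import ConversationScenario
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import Evaluator


class TurnCountEvaluator(Evaluator):
  """Toy metric: passes when the simulated conversation stays short."""

  def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: Optional[list[Invocation]],
      conversation_scenario: Optional[ConversationScenario] = None,
  ) -> EvaluationResult:
    # The scenario is optional; evaluators that do not need it can simply
    # ignore it, as the built-in metrics patched below do with a `_` parameter.
    del expected_invocations, conversation_scenario
    score = 1.0 if len(actual_invocations) <= 10 else 0.0
    return EvaluationResult(overall_score=score)
```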
src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -20,6 +20,7 @@
 from typing_extensions import override
 
 from ..dependencies.rouge_scorer import rouge_scorer
+from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .eval_metrics import Interval
@@ -60,6 +61,7 @@ def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
       expected_invocations: Optional[list[Invocation]],
+      _: Optional[ConversationScenario] = None,
   ) -> EvaluationResult:
     if expected_invocations is None:
       raise ValueError("expected_invocations is required for this metric.")
```

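The same two-line change, an import plus an ignored `_` parameter with a `None` default, is applied to `hallucinations_v1.py`, `llm_as_judge.py`, and `response_evaluator.py` below. A toy, self-contained illustration of why that form keeps existing two-argument callers working while allowing the scenario to be passed positionally:

```python
from typing import Optional


# Stand-in for an evaluator method that does not use the scenario: the third
# parameter is accepted but ignored, and its default keeps older two-argument
# call sites valid.
def evaluate_invocations(actual, expected, _: Optional[str] = None):
  return f"{len(actual)} actual invocation(s), expected={expected}"


print(evaluate_invocations(["turn 1"], None))              # pre-existing call style
print(evaluate_invocations(["turn 1"], None, "scenario"))  # scenario passed positionally
```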
src/google/adk/evaluation/hallucinations_v1.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -34,6 +34,7 @@
 from ..utils.feature_decorator import experimental
 from ._retry_options_utils import add_default_retry_options_if_not_present
 from .app_details import AppDetails
+from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_case import InvocationEvent
 from .eval_case import InvocationEvents
@@ -720,6 +721,7 @@ async def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
       expected_invocations: Optional[list[Invocation]],
+      _: Optional[ConversationScenario] = None,
   ) -> EvaluationResult:
     # expected_invocations are not required by the metric and if they are not
     # supplied, we provide a list of None to rest of the code.
```

src/google/adk/evaluation/llm_as_judge.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -29,6 +29,7 @@
 from ..utils.feature_decorator import experimental
 from ._retry_options_utils import add_default_retry_options_if_not_present
 from .common import EvalBaseModel
+from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
@@ -118,6 +119,7 @@ async def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
       expected_invocations: Optional[list[Invocation]],
+      _: Optional[ConversationScenario] = None,
   ) -> EvaluationResult:
     if self._expected_invocations_required and expected_invocations is None:
       raise ValueError("expected_invocations is needed by this metric.")
```

src/google/adk/evaluation/local_eval_service.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -40,6 +40,7 @@
 from .base_eval_service import InferenceRequest
 from .base_eval_service import InferenceResult
 from .base_eval_service import InferenceStatus
+from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
@@ -256,6 +257,7 @@ async def _evaluate_single_inference_result(
             eval_metric=eval_metric,
             actual_invocations=inference_result.inferences,
             expected_invocations=eval_case.conversation,
+            conversation_scenario=eval_case.conversation_scenario,
         )
       except Exception as e:
         # We intentionally catch the Exception as we don't want failures to
@@ -345,6 +347,7 @@ async def _evaluate_metric(
       eval_metric: EvalMetric,
       actual_invocations: list[Invocation],
       expected_invocations: Optional[list[Invocation]],
+      conversation_scenario: Optional[ConversationScenario],
   ) -> EvaluationResult:
     """Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""
 
@@ -359,6 +362,7 @@ async def _evaluate_metric(
       return await metric_evaluator.evaluate_invocations(
           actual_invocations=actual_invocations,
           expected_invocations=expected_invocations,
+          conversation_scenario=conversation_scenario,
       )
     else:
       # Metrics that perform computation synchronously, mostly these don't
```

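The service now forwards `eval_case.conversation_scenario` to the evaluator together with the inferred and expected invocations. A rough sketch of an eval case that carries such a scenario is below; the `ConversationScenario` field names (`starting_prompt`, `conversation_plan`) are assumptions about `eval_case.py` and are not verified against this commit.

```python
# Sketch only: the ConversationScenario fields used here (starting_prompt,
# conversation_plan) are assumed, not taken from this diff.
from google.adk.evaluation.eval_case import ConversationScenario
from google.adk.evaluation.eval_case import EvalCase

eval_case = EvalCase(
    eval_id="booking_flow_happy_path",
    conversation=[],  # No golden invocations; the user simulator drives the turns.
    conversation_scenario=ConversationScenario(
        starting_prompt="I want to book a flight to Tokyo next week.",
        conversation_plan=(
            "Provide travel dates when asked, pick the cheapest option, and"
            " end the conversation once a booking confirmation is given."
        ),
    ),
)
```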
src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -28,6 +28,7 @@
 from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
 from .rubric_based_tool_use_quality_v1 import RubricBasedToolUseV1Evaluator
 from .safety_evaluator import SafetyEvaluatorV1
+from .simulation.per_turn_user_simulator_quality_v1 import PerTurnUserSimulatorQualityV1
 from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -126,6 +127,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
       metric_info=RubricBasedToolUseV1Evaluator.get_metric_info(),
       evaluator=RubricBasedToolUseV1Evaluator,
   )
+  metric_evaluator_registry.register_evaluator(
+      metric_info=PerTurnUserSimulatorQualityV1.get_metric_info(),
+      evaluator=PerTurnUserSimulatorQualityV1,
+  )
 
   return metric_evaluator_registry
 
```

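With the registration in place, the new evaluator resolves by metric name like any other prebuilt metric. The sketch below assumes a module-level default registry named `DEFAULT_METRIC_EVALUATOR_REGISTRY` and a `get_evaluator` accessor, neither of which is shown in this diff.

```python
# Sketch: DEFAULT_METRIC_EVALUATOR_REGISTRY and get_evaluator() are assumed
# from the existing registry module; they are not part of this diff.
from google.adk.evaluation.eval_metrics import EvalMetric
from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY

metric = EvalMetric(
    metric_name=PrebuiltMetrics.PER_TURN_USER_SIMULATOR_QUALITY_V1.value,
    threshold=0.8,  # Example value.
)

# Resolves to an instance of PerTurnUserSimulatorQualityV1 thanks to the
# register_evaluator() call added above.
evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_evaluator(metric)
```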
src/google/adk/evaluation/response_evaluator.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -18,6 +18,7 @@
 
 from typing_extensions import override
 
+from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .eval_metrics import Interval
@@ -100,6 +101,7 @@ def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
       expected_invocations: Optional[list[Invocation]],
+      _: Optional[ConversationScenario] = None,
   ) -> EvaluationResult:
     # If the metric is response_match_score, just use the RougeEvaluator.
     if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value:
```
