 from .base_eval_service import InferenceRequest
 from .base_eval_service import InferenceResult
 from .base_eval_service import InferenceStatus
+from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
@@ -256,6 +257,7 @@ async def _evaluate_single_inference_result(
           eval_metric=eval_metric,
           actual_invocations=inference_result.inferences,
           expected_invocations=eval_case.conversation,
+          conversation_scenario=eval_case.conversation_scenario,
       )
     except Exception as e:
       # We intentionally catch the Exception as we don't want failures to
@@ -345,6 +347,7 @@ async def _evaluate_metric(
       eval_metric: EvalMetric,
       actual_invocations: list[Invocation],
       expected_invocations: Optional[list[Invocation]],
+      conversation_scenario: Optional[ConversationScenario],
   ) -> EvaluationResult:
     """Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""

@@ -359,6 +362,7 @@ async def _evaluate_metric(
       return await metric_evaluator.evaluate_invocations(
           actual_invocations=actual_invocations,
           expected_invocations=expected_invocations,
+          conversation_scenario=conversation_scenario,
       )
     else:
       # Metrics that perform computation synchronously, mostly these don't
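
The net effect of this change is that the eval case's `conversation_scenario` is threaded from `_evaluate_single_inference_result` through `_evaluate_metric` into the evaluator's `evaluate_invocations` call. As a rough illustration of the receiving side, below is a minimal, hypothetical sketch of an evaluator that accepts the new keyword; the `ScenarioAwareEvaluator` class, the simplified dataclasses, and the scoring logic are assumptions for demonstration only and are not part of this diff.

```python
# Hypothetical sketch only: class names, dataclass shapes, and scoring logic
# are illustrative assumptions, not part of this change.
import asyncio
from dataclasses import dataclass
from typing import Optional


@dataclass
class ConversationScenario:
  """Assumed shape: a description of the multi-turn scenario under test."""
  starting_prompt: str = ""


@dataclass
class Invocation:
  user_content: str = ""
  final_response: str = ""


@dataclass
class EvaluationResult:
  overall_score: Optional[float] = None


class ScenarioAwareEvaluator:
  """Accepts the conversation_scenario keyword threaded through by this change."""

  async def evaluate_invocations(
      self,
      actual_invocations: list[Invocation],
      expected_invocations: Optional[list[Invocation]],
      conversation_scenario: Optional[ConversationScenario] = None,
  ) -> EvaluationResult:
    # A real metric would fold the scenario into its rubric or judge prompt;
    # here we only show that the keyword arrives alongside the invocations.
    if conversation_scenario is not None:
      print(f"Scoring against scenario: {conversation_scenario.starting_prompt!r}")
    score = 1.0 if actual_invocations else 0.0
    return EvaluationResult(overall_score=score)


if __name__ == "__main__":
  evaluator = ScenarioAwareEvaluator()
  result = asyncio.run(
      evaluator.evaluate_invocations(
          actual_invocations=[Invocation("hi", "hello")],
          expected_invocations=None,
          conversation_scenario=ConversationScenario("Book a flight to Paris"),
      )
  )
  print(result.overall_score)
```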