diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index f9ad117b4c40..09e3448962e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -162,6 +162,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t if isinstance(llm_output, dict): success = llm_output.get("success", False) + details = llm_output.get("details", {}) + + if "success" not in llm_output and "success" in details: + success = details.get("success", False) + if isinstance(success, str): success = success.upper() == "TRUE" @@ -171,7 +176,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t f"{self._result_key}": success * 1.0, f"{self._result_key}_result": success_result, f"{self._result_key}_threshold": self._threshold, - f"{self._result_key}_reason": f"{reason} {llm_output.get('details', '')}", + f"{self._result_key}_reason": f"{reason} {details or ''}", f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0), f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0), f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty index 8370e58ce1d9..6fb2d4052a29 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +++ 
b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty @@ -63,11 +63,11 @@ B. Examine tool result and definition for the tool being called to check whether C. If one or more tool result are **failed** , then you the **evaluation process** has **failed**, otherwise , the **evaluation process** has **succeeded** D. You are required to return your **output** in the following format: { - "explanation": "<15-60 words explaining the logic flow of your decision>", - "details": { - "failed_tools": "", - }, - "success": + "explanation": "<15-60 words explaining the logic flow of your decision>", + "details": { + "failed_tools": "", + }, + "success": } E. If no tool calls found at all , the TOOL_CALLS input is empty or the TOOL_CALLS input is not passed , the **evaluation process** has **succeeded** @@ -85,12 +85,13 @@ E. If no tool calls found at all , the TOOL_CALLS input is empty or the TOOL_CAL EXPECTED OUTPUT { - "explanation": "None of the results indicate an error", - "details": { - "failed_tools": "", - }, - "success": True -} + "explanation": "None of the results indicate an error", + "details": { + "failed_tools": "", + }, + "success": True +} + ### Example - Succeeded @@ -100,12 +101,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "None of the results indicate an error", - "details": { - "failed_tools": "", - }, - "success": True -} + "explanation": "None of the results indicate an error", + "details": { + "failed_tools": "", + }, + "success": True +} ### Example - Succeeded @@ -116,12 +117,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "Although the returned value 7 is not the square root of 4, this is a business mistake in the tool. The tool did not return a result indicating a technical error", - "details": { - "failed_tools": "", - }, - "success": True -} + "explanation": "Although the returned value 7 is not the square root of 4, this is a business mistake in the tool. 
The tool did not return a result indicating a technical error", + "details": { + "failed_tools": "", + }, + "success": True +} ### Example - Succeeded @@ -133,12 +134,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "The tool returned a semicolon separated list of names. Although the description in the definition says it should return comma-separated list , this formatting mistake is a business mistake of the tool, not a technical failure. The tool did not return an error", - "details": { - "failed_tools": "", - }, - "success": True -} + "explanation": "The tool returned a semicolon separated list of names. Although the description in the definition says it should return comma-separated list , this formatting mistake is a business mistake of the tool, not a technical failure. The tool did not return an error", + "details": { + "failed_tools": "", + }, + "success": True +} @@ -151,12 +152,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "The tool returned empty response which is accepted given that this tool functionality does not include returning data to the caller", - "details": { - "failed_tools": "", - }, - "success": True -} + "explanation": "The tool returned empty response which is accepted given that this tool functionality does not include returning data to the caller", + "details": { + "failed_tools": "", + }, + "success": True +} ## Failed Evaluation Process Examples @@ -171,12 +172,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "The tool returned empty response , however , given the tool definition , it should never return empty response because there should be weather info at any given point in time. An empty response here is considered a technical failure. 
The conclusion is the get_weather_info failed", - "details": { - "failed_tools": "get_weather_info", - }, - "success": False -} + "explanation": "The tool returned empty response , however , given the tool definition , it should never return empty response because there should be weather info at any given point in time. An empty response here is considered a technical failure. The conclusion is the get_weather_info failed", + "details": { + "failed_tools": "get_weather_info", + }, + "success": False +} ### Example - Failed @@ -187,12 +188,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "The tool returned a string indicating that it failed", - "details": { - "failed_tools": "get_current_user_Info", - }, - "success": False -} + "explanation": "The tool returned a string indicating that it failed", + "details": { + "failed_tools": "get_current_user_Info", + }, + "success": False +} ### Example - Failed @@ -203,12 +204,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "The tool returned an object with empty fields and a string indicating that it failed", - "details": { - "failed_tools": "get_current_user_Info", - }, - "success": False -} + "explanation": "The tool returned an object with empty fields and a string indicating that it failed", + "details": { + "failed_tools": "get_current_user_Info", + }, + "success": False +} ### Example - Failed @@ -218,13 +219,13 @@ EXPECTED OUTPUT [TOOL_RESULT] {temp:""} EXPECTED OUTPUT - { - "explanation": "The call for GetWeatherInfo returned an object containing single property 'temp' that is an empty string. This means the call to GetWeatherInfo returned empty result while weather info should be available at any time", - "details": { - "failed_tools": "GetWeatherInfo", - }, - "success": False - } +{ + "explanation": "The call for GetWeatherInfo returned an object containing single property 'temp' that is an empty string. 
This means the call to GetWeatherInfo returned empty result while weather info should be available at any time", + "details": { + "failed_tools": "GetWeatherInfo", + }, + "success": False +} ### Example - Failed @@ -235,13 +236,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "the returned result indicates that the call to get_day_of_week timed out", - "details": { - "failed_tools": "get_day_of_week", - }, - "success": False -} - + "explanation": "the returned result indicates that the call to get_day_of_week timed out", + "details": { + "failed_tools": "get_day_of_week", + }, + "success": False +} ### Example - Failed @@ -253,15 +253,14 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "null indicates an empty result which cannot be an accepted output of the tool given the tool definition since any given date represents a day of week", - "details": { - "failed_tools": "get_day_of_week", - }, - "success": False + "explanation": "null indicates an empty result which cannot be an accepted output of the tool given the tool definition since any given date represents a day of week", + "details": { + "failed_tools": "get_day_of_week", + }, + "success": False } - ### Example - Failed [TOOL_DEFINITIONS] [get_day_of_week] Takes date as an input and returns the day of week that this day represents @@ -271,13 +270,12 @@ EXPECTED OUTPUT EXPECTED OUTPUT { - "explanation": "Empty object cannot be an accepted output of the tool given the tool definition since any given date should represent a day of week", - "details": { - "failed_tools": "get_day_of_week", - }, - "success": False -} - + "explanation": "Empty object cannot be an accepted output of the tool given the tool definition since any given date should represent a day of week", + "details": { + "failed_tools": "get_day_of_week", + }, + "success": False +} ### Example - Failed @@ -289,13 +287,13 @@ EXPECTED OUTPUT [TOOL_RESULT] "Failed to book the ticket" EXPECTED OUTPUT - { - "explanation": "GetWeatherInfo 
returned an empty response while it should return the weather info and BookTicket returned an error.Both tools failed.", - "details": { - "failed_tools": "GetWeatherInfo,BookTicket", - }, - "success": False - } +{ + "explanation": "GetWeatherInfo returned an empty response while it should return the weather info and BookTicket returned an error. Both tools failed.", + "details": { + "failed_tools": "GetWeatherInfo,BookTicket", + }, + "success": False +} ### Example - Failed @@ -307,13 +305,13 @@ EXPECTED OUTPUT [TOOL_RESULT] "Failed to book the ticket" EXPECTED OUTPUT - { - "explanation": "Although GetWeatherInfo succeeded, BookTicket returned an error. The final result is failure because one of the tool calls has failed", - "details": { - "failed_tools": "BookTicket", - }, - "success": False - } +{ + "explanation": "Although GetWeatherInfo succeeded, BookTicket returned an error. The final result is failure because one of the tool calls has failed", + "details": { + "failed_tools": "BookTicket", + }, + "success": False +}