From 07d9678f99f101e810bd79a8441c04428d67a6b8 Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 14 Jan 2026 15:13:24 +0300 Subject: [PATCH 1/3] Enhance error handling for MCP server connections and tool invocations --- src/agents/mcp/server.py | 233 +++++++++++++++++++++++++++++++++++---- src/agents/mcp/util.py | 9 +- 2 files changed, 221 insertions(+), 21 deletions(-) diff --git a/src/agents/mcp/server.py b/src/agents/mcp/server.py index 4fff94d0b6..181d660e56 100644 --- a/src/agents/mcp/server.py +++ b/src/agents/mcp/server.py @@ -3,12 +3,17 @@ import abc import asyncio import inspect +import sys from collections.abc import Awaitable from contextlib import AbstractAsyncContextManager, AsyncExitStack from datetime import timedelta from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Literal, TypeVar +import httpx + +if sys.version_info < (3, 11): + from exceptiongroup import BaseExceptionGroup # pyright: ignore[reportMissingImports] from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream from mcp import ClientSession, StdioServerParameters, Tool as MCPTool, stdio_client from mcp.client.session import MessageHandlerFnT @@ -251,6 +256,64 @@ def invalidate_tools_cache(self): """Invalidate the tools cache.""" self._cache_dirty = True + def _extract_http_error_from_exception(self, e: Exception) -> Exception | None: + """Extract HTTP error from exception or ExceptionGroup.""" + if isinstance(e, (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)): + return e + + # Check if it's an ExceptionGroup containing HTTP errors + if isinstance(e, BaseExceptionGroup): + for exc in e.exceptions: + if isinstance( + exc, (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException) + ): + return exc + + return None + + def _raise_user_error_for_http_error(self, http_error: Exception) -> None: + """Raise appropriate UserError for HTTP error.""" + if isinstance(http_error, httpx.HTTPStatusError): + status_code = http_error.response.status_code + if status_code == 401: + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Authentication failed (401 Unauthorized). " + f"Please check your credentials." + ) from http_error + elif status_code == 403: + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Access forbidden (403 Forbidden). " + f"Please check your permissions." + ) from http_error + + elif status_code >= 500: + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Server error ({status_code}). " + f"The MCP server may be experiencing issues." + ) from http_error + + else: + raise UserError( + f"Failed to connect to MCP server '{self.name}': HTTP error {status_code}" + ) from http_error + + elif isinstance(http_error, httpx.ConnectError): + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Could not reach the server. " + f"Please check that the server is running and the URL is correct." + ) from http_error + + elif isinstance(http_error, httpx.TimeoutException): + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Connection timeout. " + f"The server did not respond in time." + ) from http_error + async def _run_with_retries(self, func: Callable[[], Awaitable[T]]) -> T: attempts = 0 while True: @@ -265,6 +328,7 @@ async def _run_with_retries(self, func: Callable[[], Awaitable[T]]) -> T: async def connect(self): """Connect to the server.""" + connection_succeeded = False try: transport = await self.exit_stack.enter_async_context(self.create_streams()) # streamablehttp_client returns (read, write, get_session_id) @@ -285,10 +349,51 @@ async def connect(self): server_result = await session.initialize() self.server_initialize_result = server_result self.session = session + connection_succeeded = True except Exception as e: - logger.error(f"Error initializing MCP server: {e}") - await self.cleanup() + # Try to extract HTTP error from exception or ExceptionGroup + http_error = self._extract_http_error_from_exception(e) + if http_error: + self._raise_user_error_for_http_error(http_error) + + # For CancelledError, it might mask an HTTP error that will be raised during cleanup + if isinstance(e, asyncio.CancelledError): + raise UserError( + f"Failed to connect to MCP server '{self.name}': Connection was cancelled. " + f"This may indicate the server is unreachable or returned an error." + ) from e + + # For HTTP-related errors, wrap them + if isinstance(e, (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)): + self._raise_user_error_for_http_error(e) + + # For other errors, re-raise as-is (don't wrap non-HTTP errors) raise + finally: + # Always attempt cleanup on error, but suppress cleanup errors that mask the original + if not connection_succeeded: + try: + await self.cleanup() + except UserError: + # Re-raise UserError from cleanup (contains the real HTTP error) + raise + except Exception as cleanup_error: + # Suppress RuntimeError about cancel scopes during cleanup - this is a known + # issue with the MCP library's async generator cleanup and shouldn't mask the + # original error + if isinstance(cleanup_error, RuntimeError) and "cancel scope" in str( + cleanup_error + ): + logger.debug( + f"Ignoring cancel scope error during cleanup of MCP server " + f"'{self.name}': {cleanup_error}" + ) + else: + # Log other cleanup errors but don't raise - original error is more + # important + logger.warning( + f"Error during cleanup of MCP server '{self.name}': {cleanup_error}" + ) async def list_tools( self, @@ -301,21 +406,32 @@ async def list_tools( session = self.session assert session is not None - # Return from cache if caching is enabled, we have tools, and the cache is not dirty - if self.cache_tools_list and not self._cache_dirty and self._tools_list: - tools = self._tools_list - else: - # Fetch the tools from the server - result = await self._run_with_retries(lambda: session.list_tools()) - self._tools_list = result.tools - self._cache_dirty = False - tools = self._tools_list - - # Filter tools based on tool_filter - filtered_tools = tools - if self.tool_filter is not None: - filtered_tools = await self._apply_tool_filter(filtered_tools, run_context, agent) - return filtered_tools + try: + # Return from cache if caching is enabled, we have tools, and the cache is not dirty + if self.cache_tools_list and not self._cache_dirty and self._tools_list: + tools = self._tools_list + else: + # Fetch the tools from the server + result = await self._run_with_retries(lambda: session.list_tools()) + self._tools_list = result.tools + self._cache_dirty = False + tools = self._tools_list + + # Filter tools based on tool_filter + filtered_tools = tools + if self.tool_filter is not None: + filtered_tools = await self._apply_tool_filter(filtered_tools, run_context, agent) + return filtered_tools + except httpx.HTTPStatusError as e: + status_code = e.response.status_code + raise UserError( + f"Failed to list tools from MCP server '{self.name}': HTTP error {status_code}" + ) from e + except httpx.ConnectError as e: + raise UserError( + f"Failed to list tools from MCP server '{self.name}': Connection lost. " + f"The server may have disconnected." + ) from e async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> CallToolResult: """Invoke a tool on the server.""" @@ -324,7 +440,19 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C session = self.session assert session is not None - return await self._run_with_retries(lambda: session.call_tool(tool_name, arguments)) + try: + return await self._run_with_retries(lambda: session.call_tool(tool_name, arguments)) + except httpx.HTTPStatusError as e: + status_code = e.response.status_code + raise UserError( + f"Failed to call tool '{tool_name}' on MCP server '{self.name}': " + f"HTTP error {status_code}" + ) from e + except httpx.ConnectError as e: + raise UserError( + f"Failed to call tool '{tool_name}' on MCP server '{self.name}': Connection lost. " + f"The server may have disconnected." + ) from e async def list_prompts( self, @@ -349,8 +477,75 @@ async def cleanup(self): async with self._cleanup_lock: try: await self.exit_stack.aclose() + except BaseExceptionGroup as eg: + # Extract HTTP errors from ExceptionGroup raised during cleanup + # This happens when background tasks fail (e.g., HTTP errors) + http_error = None + connect_error = None + timeout_error = None + + for exc in eg.exceptions: + if isinstance(exc, httpx.HTTPStatusError): + http_error = exc + elif isinstance(exc, httpx.ConnectError): + connect_error = exc + elif isinstance(exc, httpx.TimeoutException): + timeout_error = exc + + # If we found an HTTP error, raise it as UserError + if http_error: + status_code = http_error.response.status_code + if status_code == 401: + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Authentication failed (401 Unauthorized). " + f"Please check your credentials." + ) from http_error + elif status_code == 403: + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Access forbidden (403 Forbidden). " + f"Please check your permissions." + ) from http_error + elif status_code >= 500: + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Server error ({status_code}). " + f"The MCP server may be experiencing issues." + ) from http_error + else: + raise UserError( + f"Failed to connect to MCP server '{self.name}': HTTP error {status_code}" # noqa: E501 + ) from http_error + elif connect_error: + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Could not reach the server. " + f"Please check that the server is running and the URL is correct." + ) from connect_error + elif timeout_error: + raise UserError( + f"Failed to connect to MCP server '{self.name}': " + f"Connection timeout. " + f"The server did not respond in time." + ) from timeout_error + else: + # No HTTP error found, suppress RuntimeError about cancel scopes + has_cancel_scope_error = any( + isinstance(exc, RuntimeError) and "cancel scope" in str(exc) + for exc in eg.exceptions + ) + if has_cancel_scope_error: + logger.debug(f"Ignoring cancel scope error during cleanup: {eg}") + else: + logger.error(f"Error cleaning up server: {eg}") except Exception as e: - logger.error(f"Error cleaning up server: {e}") + # Suppress RuntimeError about cancel scopes - this is a known issue with the MCP + # library when background tasks fail during async generator cleanup + if isinstance(e, RuntimeError) and "cancel scope" in str(e): + logger.debug(f"Ignoring cancel scope error during cleanup: {e}") + else: + logger.error(f"Error cleaning up server: {e}") finally: self.session = None diff --git a/src/agents/mcp/util.py b/src/agents/mcp/util.py index 6cfe5c96d5..832728986f 100644 --- a/src/agents/mcp/util.py +++ b/src/agents/mcp/util.py @@ -201,9 +201,14 @@ async def invoke_mcp_tool( try: result = await server.call_tool(tool.name, json_data) + except UserError: + # Re-raise UserError as-is (it already has a good message) + raise except Exception as e: - logger.error(f"Error invoking MCP tool {tool.name}: {e}") - raise AgentsException(f"Error invoking MCP tool {tool.name}: {e}") from e + logger.error(f"Error invoking MCP tool {tool.name} on server '{server.name}': {e}") + raise AgentsException( + f"Error invoking MCP tool {tool.name} on server '{server.name}': {e}" + ) from e if _debug.DONT_LOG_TOOL_DATA: logger.debug(f"MCP tool {tool.name} completed.") From 87c04c047659a38fcfc8e956ebeb49c24962e8ff Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 14 Jan 2026 15:27:10 +0300 Subject: [PATCH 2/3] Address codex review comment --- src/agents/mcp/server.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/agents/mcp/server.py b/src/agents/mcp/server.py index 181d660e56..b3f83ba8ef 100644 --- a/src/agents/mcp/server.py +++ b/src/agents/mcp/server.py @@ -356,12 +356,10 @@ async def connect(self): if http_error: self._raise_user_error_for_http_error(http_error) - # For CancelledError, it might mask an HTTP error that will be raised during cleanup + # For CancelledError, preserve cancellation semantics - don't wrap it. + # If it's masking an HTTP error, cleanup() will extract and raise UserError. if isinstance(e, asyncio.CancelledError): - raise UserError( - f"Failed to connect to MCP server '{self.name}': Connection was cancelled. " - f"This may indicate the server is unreachable or returned an error." - ) from e + raise # For HTTP-related errors, wrap them if isinstance(e, (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)): From aa2ed8452b26fe42c34d36546345d6e1ded981f4 Mon Sep 17 00:00:00 2001 From: habema Date: Wed, 14 Jan 2026 16:00:13 +0300 Subject: [PATCH 3/3] Address codex review comment and cleanup error messages to a simpler version --- src/agents/mcp/server.py | 103 +++++++++++++-------------------------- 1 file changed, 35 insertions(+), 68 deletions(-) diff --git a/src/agents/mcp/server.py b/src/agents/mcp/server.py index b3f83ba8ef..015b5b6f76 100644 --- a/src/agents/mcp/server.py +++ b/src/agents/mcp/server.py @@ -273,46 +273,17 @@ def _extract_http_error_from_exception(self, e: Exception) -> Exception | None: def _raise_user_error_for_http_error(self, http_error: Exception) -> None: """Raise appropriate UserError for HTTP error.""" + error_message = f"Failed to connect to MCP server '{self.name}': " if isinstance(http_error, httpx.HTTPStatusError): - status_code = http_error.response.status_code - if status_code == 401: - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Authentication failed (401 Unauthorized). " - f"Please check your credentials." - ) from http_error - elif status_code == 403: - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Access forbidden (403 Forbidden). " - f"Please check your permissions." - ) from http_error - - elif status_code >= 500: - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Server error ({status_code}). " - f"The MCP server may be experiencing issues." - ) from http_error - - else: - raise UserError( - f"Failed to connect to MCP server '{self.name}': HTTP error {status_code}" - ) from http_error + error_message += f"HTTP error {http_error.response.status_code} ({http_error.response.reason_phrase})" # noqa: E501 elif isinstance(http_error, httpx.ConnectError): - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Could not reach the server. " - f"Please check that the server is running and the URL is correct." - ) from http_error + error_message += "Could not reach the server." elif isinstance(http_error, httpx.TimeoutException): - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Connection timeout. " - f"The server did not respond in time." - ) from http_error + error_message += "Connection timeout." + + raise UserError(error_message) from http_error async def _run_with_retries(self, func: Callable[[], Awaitable[T]]) -> T: attempts = 0 @@ -473,6 +444,11 @@ async def get_prompt( async def cleanup(self): """Cleanup the server.""" async with self._cleanup_lock: + # Only raise HTTP errors if we're cleaning up after a failed connection. + # During normal teardown (via __aexit__), log but don't raise to avoid + # masking the original exception. + is_failed_connection_cleanup = self.session is None + try: await self.exit_stack.aclose() except BaseExceptionGroup as eg: @@ -481,6 +457,7 @@ async def cleanup(self): http_error = None connect_error = None timeout_error = None + error_message = f"Failed to connect to MCP server '{self.name}': " for exc in eg.exceptions: if isinstance(exc, httpx.HTTPStatusError): @@ -490,43 +467,33 @@ async def cleanup(self): elif isinstance(exc, httpx.TimeoutException): timeout_error = exc - # If we found an HTTP error, raise it as UserError + # Only raise HTTP errors if we're cleaning up after a failed connection. + # During normal teardown, log them instead. if http_error: - status_code = http_error.response.status_code - if status_code == 401: - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Authentication failed (401 Unauthorized). " - f"Please check your credentials." - ) from http_error - elif status_code == 403: - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Access forbidden (403 Forbidden). " - f"Please check your permissions." - ) from http_error - elif status_code >= 500: - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Server error ({status_code}). " - f"The MCP server may be experiencing issues." - ) from http_error + if is_failed_connection_cleanup: + error_message += f"HTTP error {http_error.response.status_code} ({http_error.response.reason_phrase})" # noqa: E501 + raise UserError(error_message) from http_error else: - raise UserError( - f"Failed to connect to MCP server '{self.name}': HTTP error {status_code}" # noqa: E501 - ) from http_error + # Normal teardown - log but don't raise + logger.warning( + f"HTTP error during cleanup of MCP server '{self.name}': {http_error}" + ) elif connect_error: - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Could not reach the server. " - f"Please check that the server is running and the URL is correct." - ) from connect_error + if is_failed_connection_cleanup: + error_message += "Could not reach the server." + raise UserError(error_message) from connect_error + else: + logger.warning( + f"Connection error during cleanup of MCP server '{self.name}': {connect_error}" # noqa: E501 + ) elif timeout_error: - raise UserError( - f"Failed to connect to MCP server '{self.name}': " - f"Connection timeout. " - f"The server did not respond in time." - ) from timeout_error + if is_failed_connection_cleanup: + error_message += "Connection timeout." + raise UserError(error_message) from timeout_error + else: + logger.warning( + f"Timeout error during cleanup of MCP server '{self.name}': {timeout_error}" # noqa: E501 + ) else: # No HTTP error found, suppress RuntimeError about cancel scopes has_cancel_scope_error = any(