Skip to content

Commit 8ed4f52

Browse files
committed
fix(oauth): 🐛 prevent deadlock and token desync for rotating refresh tokens
This commit fixes critical issues in OAuth providers (Google, iFlow, Qwen) that use rotating refresh tokens, where each token refresh invalidates the previous token.

**Problems Fixed:**

1. **Deadlock Prevention**: Removed the inline re-authentication in the `refresh_token()` method, which ran while a lock was held. When a refresh failed with HTTP 400/401/403, the method would call `initialize_token()` directly, which would try to acquire the same lock, causing a deadlock. Now, invalid-token errors are caught and the credential is queued for background re-authentication via `asyncio.create_task()`.
2. **Token Desync**: Changed `_save_credentials()` to write to disk FIRST, then update the cache. Previously, the cache was updated first with `buffer_on_failure=True`, which could leave stale tokens on disk if the write failed. For rotating tokens, this meant the old refresh_token left on disk had already been invalidated by the successful refresh call, requiring re-auth on restart.
3. **Stale Cache Usage**: Modified `refresh_token()` to always read fresh credentials from disk before refreshing, preventing use of stale cached tokens that may have been invalidated by another process.
4. **New Error Type**: Introduced the `CredentialNeedsReauthError` exception to signal rotatable authentication failures. This allows the client to rotate to the next credential without logging scary tracebacks, while background re-auth fixes the broken credential.
**Changes:**

- Add the `CredentialNeedsReauthError` exception class and its classification in error_handler.py
- Catch and wrap `CredentialNeedsReauthError` in the client.py retry loop
- Replace inline re-auth with background task queuing in all OAuth providers
- Change `_save_credentials()` to disk-first writes with no buffering for rotating tokens
- Add a `force_interactive` parameter to `initialize_token()` for explicit re-auth requests
- Always reload credentials from disk before refresh to prevent stale token usage
- Return a boolean from `_save_credentials()` and raise IOError on critical failures
- Update re-auth queue processing to call `initialize_token(force_interactive=True)`
1 parent a68d8d0 commit 8ed4f52

File tree

5 files changed

+408
-167
lines changed

5 files changed

+408
-167
lines changed

src/rotator_library/client.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from .failure_logger import log_failure, configure_failure_logger
2424
from .error_handler import (
2525
PreRequestCallbackError,
26+
CredentialNeedsReauthError,
2627
classify_error,
2728
AllProviders,
2829
NoAvailableKeysError,
@@ -755,6 +756,12 @@ async def _safe_streaming_wrapper(
755756
await self.usage_manager.record_success(key, model)
756757
break
757758

759+
except CredentialNeedsReauthError as e:
760+
# This credential needs re-authentication but re-auth is already queued.
761+
# Wrap it so the outer retry loop can rotate to the next credential.
762+
# No scary traceback needed - this is an expected recovery scenario.
763+
raise StreamedAPIError("Credential needs re-authentication", data=e)
764+
758765
except (
759766
litellm.RateLimitError,
760767
litellm.ServiceUnavailableError,

src/rotator_library/error_handler.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,31 @@ class PreRequestCallbackError(Exception):
117117
pass
118118

119119

120+
class CredentialNeedsReauthError(Exception):
121+
"""
122+
Raised when a credential's refresh token is invalid and re-authentication is required.
123+
124+
This is a rotatable error - the request should try the next credential while
125+
the broken credential is queued for re-authentication in the background.
126+
127+
Unlike generic HTTPStatusError, this exception signals:
128+
- The credential is temporarily unavailable (needs user action)
129+
- Re-auth has already been queued
130+
- The request should rotate to the next credential without logging scary tracebacks
131+
132+
Attributes:
133+
credential_path: Path to the credential file that needs re-auth
134+
message: Human-readable message about the error
135+
"""
136+
137+
def __init__(self, credential_path: str, message: str = ""):
138+
self.credential_path = credential_path
139+
self.message = (
140+
message or f"Credential '{credential_path}' requires re-authentication"
141+
)
142+
super().__init__(self.message)
143+
144+
120145
# =============================================================================
121146
# ERROR TRACKING FOR CLIENT REPORTING
122147
# =============================================================================
@@ -698,6 +723,14 @@ def classify_error(e: Exception, provider: Optional[str] = None) -> ClassifiedEr
698723
status_code=400, # Treat as a bad request
699724
)
700725

726+
if isinstance(e, CredentialNeedsReauthError):
727+
# This is a rotatable error - credential is broken but re-auth is queued
728+
return ClassifiedError(
729+
error_type="credential_reauth_needed",
730+
original_exception=e,
731+
status_code=401, # Treat as auth error for reporting purposes
732+
)
733+
701734
if isinstance(e, RateLimitError):
702735
retry_after = get_retry_after(e)
703736
# Check if this is a quota error vs rate limit
@@ -789,6 +822,7 @@ def should_rotate_on_error(classified_error: ClassifiedError) -> bool:
789822
- quota_exceeded: Current key/account exhausted
790823
- forbidden: Current credential denied access
791824
- authentication: Current credential invalid
825+
- credential_reauth_needed: Credential needs interactive re-auth (queued)
792826
- server_error: Provider having issues (might work with different endpoint/key)
793827
- api_connection: Network issues (might be transient)
794828
- unknown: Safer to try another key

src/rotator_library/providers/google_oauth_base.py

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from ..utils.headless_detection import is_headless_environment
2323
from ..utils.reauth_coordinator import get_reauth_coordinator
2424
from ..utils.resilient_io import safe_write_json
25+
from ..error_handler import CredentialNeedsReauthError
2526

2627
lib_logger = logging.getLogger("rotator_library")
2728

@@ -366,7 +367,6 @@ async def _refresh_token(
366367
max_retries = 3
367368
new_token_data = None
368369
last_error = None
369-
needs_reauth = False
370370

371371
async with httpx.AsyncClient() as client:
372372
for attempt in range(max_retries):
@@ -390,15 +390,42 @@ async def _refresh_token(
390390
except httpx.HTTPStatusError as e:
391391
last_error = e
392392
status_code = e.response.status_code
393-
394-
# [INVALID GRANT HANDLING] Handle 401/403 by triggering re-authentication
395-
if status_code == 401 or status_code == 403:
396-
lib_logger.warning(
397-
f"Refresh token invalid for '{Path(path).name}' (HTTP {status_code}). "
398-
f"Token may have been revoked or expired. Starting re-authentication..."
393+
error_body = e.response.text
394+
395+
# [INVALID GRANT HANDLING] Handle 400/401/403 by queuing for re-auth
396+
# We must NOT call initialize_token from here as we hold a lock (would deadlock)
397+
if status_code == 400:
398+
# Check if this is an invalid_grant error
399+
if "invalid_grant" in error_body.lower():
400+
lib_logger.info(
401+
f"Credential '{Path(path).name}' needs re-auth (HTTP 400: invalid_grant). "
402+
f"Queued for re-authentication, rotating to next credential."
403+
)
404+
asyncio.create_task(
405+
self._queue_refresh(
406+
path, force=True, needs_reauth=True
407+
)
408+
)
409+
raise CredentialNeedsReauthError(
410+
credential_path=path,
411+
message=f"Refresh token invalid for '{Path(path).name}'. Re-auth queued.",
412+
)
413+
else:
414+
# Other 400 error - raise it
415+
raise
416+
417+
elif status_code in (401, 403):
418+
lib_logger.info(
419+
f"Credential '{Path(path).name}' needs re-auth (HTTP {status_code}). "
420+
f"Queued for re-authentication, rotating to next credential."
421+
)
422+
asyncio.create_task(
423+
self._queue_refresh(path, force=True, needs_reauth=True)
424+
)
425+
raise CredentialNeedsReauthError(
426+
credential_path=path,
427+
message=f"Token invalid for '{Path(path).name}' (HTTP {status_code}). Re-auth queued.",
399428
)
400-
needs_reauth = True
401-
break # Exit retry loop to trigger re-auth
402429

403430
elif status_code == 429:
404431
# Rate limit - honor Retry-After header if present
@@ -438,23 +465,6 @@ async def _refresh_token(
438465
continue
439466
raise
440467

441-
# [INVALID GRANT RE-AUTH] Trigger OAuth flow if refresh token is invalid
442-
if needs_reauth:
443-
lib_logger.info(
444-
f"Starting re-authentication for '{Path(path).name}'..."
445-
)
446-
try:
447-
# Call initialize_token to trigger OAuth flow
448-
new_creds = await self.initialize_token(path)
449-
return new_creds
450-
except Exception as reauth_error:
451-
lib_logger.error(
452-
f"Re-authentication failed for '{Path(path).name}': {reauth_error}"
453-
)
454-
raise ValueError(
455-
f"Refresh token invalid and re-authentication failed: {reauth_error}"
456-
)
457-
458468
# If we exhausted retries without success
459469
if new_token_data is None:
460470
raise last_error or Exception("Token refresh failed after all retries")
@@ -832,7 +842,7 @@ async def _process_reauth_queue(self):
832842

833843
try:
834844
lib_logger.info(f"Starting re-auth for '{Path(path).name}'...")
835-
await self.initialize_token(path)
845+
await self.initialize_token(path, force_interactive=True)
836846
lib_logger.info(f"Re-auth SUCCESS for '{Path(path).name}'")
837847

838848
except Exception as e:
@@ -1058,14 +1068,22 @@ async def handle_callback(reader, writer):
10581068
return new_creds
10591069

10601070
async def initialize_token(
1061-
self, creds_or_path: Union[Dict[str, Any], str]
1071+
self,
1072+
creds_or_path: Union[Dict[str, Any], str],
1073+
force_interactive: bool = False,
10621074
) -> Dict[str, Any]:
10631075
"""
10641076
Initialize OAuth token, triggering interactive OAuth flow if needed.
10651077
10661078
If interactive OAuth is required (expired refresh token, missing credentials, etc.),
10671079
the flow is coordinated globally via ReauthCoordinator to ensure only one
10681080
interactive OAuth flow runs at a time across all providers.
1081+
1082+
Args:
1083+
creds_or_path: Either a credentials dict or path to credentials file.
1084+
force_interactive: If True, skip expiry checks and force interactive OAuth.
1085+
Use this when the refresh token is known to be invalid
1086+
(e.g., after HTTP 400 from token endpoint).
10691087
"""
10701088
path = creds_or_path if isinstance(creds_or_path, str) else None
10711089

@@ -1085,7 +1103,11 @@ async def initialize_token(
10851103
await self._load_credentials(creds_or_path) if path else creds_or_path
10861104
)
10871105
reason = ""
1088-
if not creds.get("refresh_token"):
1106+
if force_interactive:
1107+
reason = (
1108+
"re-authentication was explicitly requested (refresh token invalid)"
1109+
)
1110+
elif not creds.get("refresh_token"):
10891111
reason = "refresh token is missing"
10901112
elif self._is_token_expired(creds):
10911113
reason = "token is expired"

0 commit comments

Comments
 (0)