Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 52 additions & 8 deletions crawl4ai/async_url_seeder.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,22 @@ class AsyncUrlSeeder:
Async version of UrlSeeder.
Call pattern is await/async for / async with.

Parameters
----------
ttl : timedelta, default TTL
Time-to-live for cached results.
client : httpx.AsyncClient, optional
HTTP client to use. If None, creates a new one.
logger : AsyncLoggerBase, optional
Logger instance for logging messages.
base_directory : str or pathlib.Path, optional
Base directory for cache storage. Defaults to home directory.
cache_root : str or pathlib.Path, optional
Root directory for URL seeder cache. Defaults to ~/.cache/url_seeder.
verify_redirect_targets : bool, default True
Whether to verify that redirect targets are alive (2xx status) before returning them.
When False, returns redirect targets without verification (legacy behavior).

Public coroutines
-----------------
await seed.urls(...)
Expand Down Expand Up @@ -203,6 +219,8 @@ def __init__(
# NEW: Add base_directory
base_directory: Optional[Union[str, pathlib.Path]] = None,
cache_root: Optional[Union[str, Path]] = None,
# NEW: Control redirect target verification
verify_redirect_targets: bool = True,
):
self.ttl = ttl
self._owns_client = client is None # Track if we created the client
Expand All @@ -227,6 +245,9 @@ def __init__(
cache_root or "~/.cache/url_seeder"))
(self.cache_root / "live").mkdir(parents=True, exist_ok=True)
(self.cache_root / "head").mkdir(exist_ok=True)

# Store redirect verification setting
self.verify_redirect_targets = verify_redirect_targets

def _log(self, level: str, message: str, tag: str = "URL_SEED", **kwargs: Any):
"""Helper to log messages using the provided logger, if available."""
Expand Down Expand Up @@ -682,24 +703,47 @@ async def _resolve_head(self, url: str) -> Optional[str]:

Returns:
* the same URL if it answers 2xx,
* the absolute redirect target if it answers 3xx,
* the absolute redirect target if it answers 301/302/303/307/308 (when verify_redirect_targets=True, only if that target itself answers 2xx),
* None on any other status or network error.
"""
try:
r = await self.client.head(url, timeout=10, follow_redirects=False)

# direct hit
# direct 2xx hit
if 200 <= r.status_code < 300:
return str(r.url)

# single level redirect
# single-level redirect (3xx)
if r.status_code in (301, 302, 303, 307, 308):
loc = r.headers.get("location")
if loc:
return urljoin(url, loc)

target = urljoin(url, loc)
# Avoid infinite loop on self-redirect
if target == url:
return None

# If not verifying redirect targets, return immediately (old behavior)
if not self.verify_redirect_targets:
return target

# Verify redirect target is alive (new behavior)
try:
r2 = await self.client.head(target, timeout=10, follow_redirects=False)
if 200 <= r2.status_code < 300:
return str(r2.url)
# Optionally, could handle another 3xx here for 2-step chains, but spec only says 1
else:
self._log(
"debug",
"HEAD redirect target {target} did not resolve: status {status}",
params={"target": target, "status": r2.status_code},
tag="URL_SEED",
)
return None
except Exception as e2:
self._log("debug", "HEAD {target} failed: {err}",
params={"target": target, "err": str(e2)}, tag="URL_SEED")
return None
# all other cases
return None

except Exception as e:
self._log("debug", "HEAD {url} failed: {err}",
params={"url": url, "err": str(e)}, tag="URL_SEED")
Expand Down
29 changes: 29 additions & 0 deletions tests/test_async_url_seeder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pytest
import asyncio
from crawl4ai.async_url_seeder import AsyncUrlSeeder

@pytest.mark.asyncio
async def test_resolve_head_handles_dead_redirects():
    """A redirect whose target is not alive must resolve to ``None``.

    Redirect-target verification is on by default, so no constructor
    arguments are needed.

    NOTE(review): this test talks to live third-party hosts; it needs
    network access and relies on those sites still redirecting their
    sitemap URL to a target that does not answer 2xx.
    """
    # The seeder builds its own HTTP client when none is passed in. Use the
    # ``async with`` call pattern named in the class docstring so the seeder
    # gets a chance to clean up on exit (presumably closing that client --
    # confirm against ``__aexit__``).
    async with AsyncUrlSeeder() as seeder:
        # Should return None – redirects to a dead URL
        assert await seeder._resolve_head("http://youtube.com/sitemap.xml") is None
        assert await seeder._resolve_head("https://stripe.com/sitemap.xml") is None

@pytest.mark.asyncio
async def test_resolve_head_direct_hit():
    """A URL that answers 2xx directly must resolve to itself.

    NOTE(review): depends on the public httpbin.org service being reachable.
    """
    # ``async with`` (a call pattern named in the class docstring) lets the
    # seeder clean up on exit instead of leaving the HTTP client it created
    # open -- presumably closed in ``__aexit__``; confirm.
    async with AsyncUrlSeeder() as seeder:
        # Test with a known live URL, e.g., httpbin
        result = await seeder._resolve_head("https://httpbin.org/status/200")
        assert result == "https://httpbin.org/status/200"

@pytest.mark.asyncio
async def test_resolve_head_verify_redirect_targets_false():
    """With verification disabled, the redirect target is returned unchecked.

    This pins the legacy behaviour: the ``Location`` target is handed back
    without a second HEAD request, even if that target is dead.

    NOTE(review): this test talks to a live third-party host; it needs
    network access and relies on that host still answering with a redirect.
    """
    # ``async with`` (a call pattern named in the class docstring) lets the
    # seeder clean up on exit instead of leaving the HTTP client it created
    # open -- presumably closed in ``__aexit__``; confirm.
    async with AsyncUrlSeeder(verify_redirect_targets=False) as seeder:
        # This should return the redirect target even if it's dead (old behavior)
        result = await seeder._resolve_head("http://youtube.com/sitemap.xml")
        # The exact redirect target might vary, but it should not be None
        assert result is not None
        assert isinstance(result, str)
        # Should be different from the input URL (indicating redirect was followed)
        assert result != "http://youtube.com/sitemap.xml"