From c003cb6e4fcd1a2e49d6d805adec07f4406e112b Mon Sep 17 00:00:00 2001 From: AHMET YILMAZ Date: Fri, 7 Nov 2025 15:42:37 +0800 Subject: [PATCH] fix #1563 (cdp): resolve page leaks and race conditions in concurrent crawling Fix memory leaks and race conditions when using arun_many() with managed CDP browsers. Each crawl now gets proper page isolation with automatic cleanup while maintaining shared browser context. Key fixes: - Close non-session pages after crawling to prevent tab accumulation - Add thread-safe page creation with locks to avoid concurrent access - Improve page lifecycle management for managed vs non-managed browsers - Keep session pages alive for authentication persistence - Prevent TOCTOU (time-of-check-time-of-use) race conditions This ensures stable parallel crawling without memory growth or browser instability. --- crawl4ai/async_crawler_strategy.py | 20 +- crawl4ai/browser_manager.py | 36 +- docs/md_v2/advanced/cdp-browser-crawling.md | 594 ++++++++++++++++++++ tests/test_arun_many_cdp.py | 63 +++ 4 files changed, 685 insertions(+), 28 deletions(-) create mode 100644 docs/md_v2/advanced/cdp-browser-crawling.md create mode 100644 tests/test_arun_many_cdp.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 76977bb95..f72ea54fd 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1047,14 +1047,28 @@ async def get_delayed_content(delay: float = 5.0) -> str: raise e finally: - # If no session_id is given we should close the page + # Clean up page after crawl completes + # For managed CDP browsers, close pages that are not part of a session to prevent memory leaks all_contexts = page.context.browser.contexts - total_pages = sum(len(context.pages) for context in all_contexts) + total_pages = sum(len(context.pages) for context in all_contexts) + + should_close_page = False + if config.session_id: + # Session pages are kept alive for reuse pass - elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless): + elif self.browser_config.use_managed_browser: + # For managed browsers (CDP), close non-session pages to prevent tab accumulation + # This is especially important for arun_many() with multiple concurrent crawls + should_close_page = True + elif total_pages <= 1 and self.browser_config.headless: + # Keep the last page in headless mode to avoid closing the browser pass else: + # For non-managed browsers, close the page + should_close_page = True + + if should_close_page: # Detach listeners before closing to prevent potential errors during close if config.capture_network_requests: page.remove_listener("request", handle_request_capture) diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index a0fb26734..b06530149 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -1035,34 +1035,20 @@ async def get_page(self, crawlerRunConfig: CrawlerRunConfig): self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) return page, context - # If using a managed browser, just grab the shared default_context + # If using a managed browser, reuse the default context and create new pages if self.config.use_managed_browser: + context = self.default_context if self.config.storage_state: - context = await self.create_browser_context(crawlerRunConfig) - ctx = self.default_context # default context, one window only + # Clone runtime state from storage to the shared context + ctx = self.default_context ctx = await 
clone_runtime_state(context, ctx, crawlerRunConfig, self.config) - # Avoid concurrent new_page on shared persistent context - # See GH-1198: context.pages can be empty under races - async with self._page_lock: - page = await ctx.new_page() - await self._apply_stealth_to_page(page) - else: - context = self.default_context - pages = context.pages - page = next((p for p in pages if p.url == crawlerRunConfig.url), None) - if not page: - if pages: - page = pages[0] - else: - # Double-check under lock to avoid TOCTOU and ensure only - # one task calls new_page when pages=[] concurrently - async with self._page_lock: - pages = context.pages - if pages: - page = pages[0] - else: - page = await context.new_page() - await self._apply_stealth_to_page(page) + + # Always create a new page for concurrent safety + # The page-level isolation prevents race conditions while sharing the same context + async with self._page_lock: + page = await context.new_page() + + await self._apply_stealth_to_page(page) else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) diff --git a/docs/md_v2/advanced/cdp-browser-crawling.md b/docs/md_v2/advanced/cdp-browser-crawling.md new file mode 100644 index 000000000..c48226b46 --- /dev/null +++ b/docs/md_v2/advanced/cdp-browser-crawling.md @@ -0,0 +1,594 @@ +# CDP Browser Crawling + +> **New in v0.7.6**: Efficient concurrent crawling with managed CDP (Chrome DevTools Protocol) browsers. Connect to a running browser instance and perform multiple crawls without spawning new windows. + +## 1. Overview + +When working with CDP browsers, you can connect to an existing browser instance instead of launching a new one for each crawl. This is particularly useful for: + +- **Development**: Keep your browser open with DevTools for debugging +- **Persistent Sessions**: Maintain authentication across multiple crawls +- **Resource Efficiency**: Reuse a single browser instance for multiple operations +- **Concurrent Crawling**: Run multiple crawls simultaneously with proper isolation + +**Key Benefits:** + +- ✅ Single browser window with multiple tabs (no window clutter) +- ✅ Shared state (cookies, localStorage) across crawls +- ✅ Concurrent safety with automatic page isolation +- ✅ Automatic cleanup to prevent memory leaks +- ✅ Works seamlessly with `arun_many()` for parallel crawling + +--- + +## 2. Quick Start + +### 2.1 Starting a CDP Browser + +Use the Crawl4AI CLI to start a managed CDP browser: + +```bash +# Start CDP browser on default port (9222) +crwl cdp + +# Start on custom port +crwl cdp -d 9223 + +# Start in headless mode +crwl cdp --headless +``` + +The browser will stay running until you press 'q' or close the terminal. + +### 2.2 Basic CDP Connection + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + # Configure CDP connection + browser_cfg = BrowserConfig( + browser_type="chromium", + cdp_url="http://localhost:9222", + verbose=True + ) + + # Crawl a single URL + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig() + ) + print(f"Success: {result.success}") + print(f"Content length: {len(result.markdown)}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 3. Concurrent Crawling with arun_many() + +The real power of CDP crawling shines with `arun_many()`. 
The browser manager automatically handles: + +- **Page Isolation**: Each crawl gets its own tab +- **Context Sharing**: All tabs share cookies and localStorage +- **Concurrent Safety**: Proper locking prevents race conditions +- **Auto Cleanup**: Tabs are closed after crawling (except sessions) + +### 3.1 Basic Concurrent Crawling + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + +async def crawl_multiple_urls(): + # URLs to crawl + urls = [ + "https://example.com", + "https://httpbin.org/html", + "https://www.python.org", + ] + + # Configure CDP browser + browser_cfg = BrowserConfig( + browser_type="chromium", + cdp_url="http://localhost:9222", + verbose=False + ) + + # Configure crawler (bypass cache for fresh data) + crawler_cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS + ) + + # Crawl all URLs concurrently + async with AsyncWebCrawler(config=browser_cfg) as crawler: + results = await crawler.arun_many( + urls=urls, + config=crawler_cfg + ) + + # Process results + for result in results: + print(f"\nURL: {result.url}") + if result.success: + print(f"✓ Success | Content length: {len(result.markdown)}") + else: + print(f"✗ Failed: {result.error_message}") + +if __name__ == "__main__": + asyncio.run(crawl_multiple_urls()) +``` + +### 3.2 With Session Management + +Use sessions to maintain authentication and state across individual crawls: + +```python +async def crawl_with_sessions(): + browser_cfg = BrowserConfig( + browser_type="chromium", + cdp_url="http://localhost:9222" + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + # First crawl: Login page + login_result = await crawler.arun( + url="https://example.com/login", + config=CrawlerRunConfig( + session_id="my-session", # Session persists + js_code="document.querySelector('#login').click();" + ) + ) + + # Second crawl: Reuse authenticated session + dashboard_result = await crawler.arun( + url="https://example.com/dashboard", + config=CrawlerRunConfig( + session_id="my-session" # Same session, cookies preserved + ) + ) +``` + +--- + +## 4. How It Works + +### 4.1 Browser Context Reuse + +When using CDP browsers, Crawl4AI: + +1. **Connects** to the existing browser via CDP URL +2. **Reuses** the default browser context (single window) +3. **Creates** new pages (tabs) for each crawl +4. **Locks** page creation to prevent concurrent races +5. **Cleans up** pages after crawling (unless it's a session) + +```python +# Internal behavior (simplified) +if self.config.use_managed_browser: + context = self.default_context # Shared context + + # Thread-safe page creation + async with self._page_lock: + page = await context.new_page() # New tab per crawl + + # After crawl completes + if not config.session_id: + await page.close() # Auto cleanup +``` + +### 4.2 Page Lifecycle + +```mermaid +graph TD + A[Start Crawl] --> B{Has session_id?} + B -->|Yes| C[Reuse existing page] + B -->|No| D[Create new page/tab] + D --> E[Navigate & Extract] + C --> E + E --> F{Is session?} + F -->|Yes| G[Keep page open] + F -->|No| H[Close page] + H --> I[End] + G --> I +``` + +### 4.3 State Sharing + +All pages in the same context share: + +- 🍪 **Cookies**: Authentication tokens, preferences +- 💾 **localStorage**: Client-side data storage +- 🔐 **sessionStorage**: Per-tab session data +- 🌐 **Network cache**: Shared HTTP cache + +This makes it perfect for crawling authenticated sites or maintaining state across multiple pages. + +--- + +## 5. 
Configuration Options + +### 5.1 BrowserConfig for CDP + +```python +browser_cfg = BrowserConfig( + browser_type="chromium", # Must be "chromium" for CDP + cdp_url="http://localhost:9222", # CDP endpoint URL + verbose=True, # Log browser operations + + # Optional: Override headers for all requests + headers={ + "Accept-Language": "en-US,en;q=0.9", + }, + + # Optional: Set user agent + user_agent="Mozilla/5.0 ...", + + # Optional: Enable stealth mode (requires dedicated browser) + # enable_stealth=False, # Not compatible with CDP +) +``` + +### 5.2 CrawlerRunConfig Options + +```python +crawler_cfg = CrawlerRunConfig( + # Session management + session_id="my-session", # Persist page across calls + + # Caching + cache_mode=CacheMode.BYPASS, # Fresh data every time + + # Browser location (affects timezone, locale) + locale="en-US", + timezone_id="America/New_York", + geolocation={ + "latitude": 40.7128, + "longitude": -74.0060 + }, + + # Proxy (per-crawl override) + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "user", + "password": "pass" + } +) +``` + +--- + +## 6. Advanced Patterns + +### 6.1 Streaming Results + +Process URLs as they complete instead of waiting for all: + +```python +async def stream_crawl_results(): + browser_cfg = BrowserConfig( + browser_type="chromium", + cdp_url="http://localhost:9222" + ) + + urls = ["https://example.com" for _ in range(100)] + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + # Stream results as they complete + async for result in crawler.arun_many( + urls=urls, + config=CrawlerRunConfig(stream=True) + ): + if result.success: + print(f"✓ {result.url}: {len(result.markdown)} chars") + # Process immediately instead of waiting for all + await save_to_database(result) +``` + +### 6.2 Custom Concurrency Control + +```python +from crawl4ai import CrawlerRunConfig + +# Limit concurrent crawls to 3 +crawler_cfg = CrawlerRunConfig( + semaphore_count=3, # Max 3 concurrent requests + mean_delay=0.5, # Average 0.5s delay between requests + max_range=1.0, # +/- 1s random delay +) + +async with AsyncWebCrawler(config=browser_cfg) as crawler: + results = await crawler.arun_many(urls, config=crawler_cfg) +``` + +### 6.3 Multi-Config Crawling + +Different configurations for different URL groups: + +```python +from crawl4ai import CrawlerRunConfig + +# Fast crawl for static pages +fast_config = CrawlerRunConfig( + wait_until="domcontentloaded", + page_timeout=30000 +) + +# Slow crawl for dynamic pages +slow_config = CrawlerRunConfig( + wait_until="networkidle", + page_timeout=60000, + js_code="window.scrollTo(0, document.body.scrollHeight);" +) + +configs = [fast_config, slow_config, fast_config] +urls = ["https://static.com", "https://dynamic.com", "https://static2.com"] + +async with AsyncWebCrawler(config=browser_cfg) as crawler: + results = await crawler.arun_many(urls, configs=configs) +``` + +--- + +## 7. Best Practices + +### 7.1 Resource Management + +✅ **DO:** +```python +# Use context manager for automatic cleanup +async with AsyncWebCrawler(config=browser_cfg) as crawler: + results = await crawler.arun_many(urls) +# Browser connection closed automatically +``` + +❌ **DON'T:** +```python +# Manual management risks resource leaks +crawler = AsyncWebCrawler(config=browser_cfg) +await crawler.start() +results = await crawler.arun_many(urls) +# Forgot to call crawler.close()! 
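
# A safer manual pattern (sketch), if you cannot use the context manager:
#     try:
#         await crawler.start()
#         results = await crawler.arun_many(urls)
#     finally:
#         await crawler.close()  # always release the CDP pages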
+``` + +### 7.2 Session Management + +✅ **DO:** +```python +# Use sessions for related crawls +config = CrawlerRunConfig(session_id="user-flow") +await crawler.arun(login_url, config=config) +await crawler.arun(dashboard_url, config=config) +await crawler.kill_session("user-flow") # Clean up when done +``` + +❌ **DON'T:** +```python +# Creating new session IDs unnecessarily +for i in range(100): + config = CrawlerRunConfig(session_id=f"session-{i}") + await crawler.arun(url, config=config) +# 100 unclosed sessions accumulate! +``` + +### 7.3 Error Handling + +```python +async def robust_crawl(urls): + browser_cfg = BrowserConfig( + browser_type="chromium", + cdp_url="http://localhost:9222" + ) + + try: + async with AsyncWebCrawler(config=browser_cfg) as crawler: + results = await crawler.arun_many(urls) + + # Separate successes and failures + successes = [r for r in results if r.success] + failures = [r for r in results if not r.success] + + print(f"✓ {len(successes)} succeeded") + print(f"✗ {len(failures)} failed") + + # Retry failures with different config + if failures: + retry_urls = [r.url for r in failures] + retry_config = CrawlerRunConfig( + page_timeout=120000, # Longer timeout + wait_until="networkidle" + ) + retry_results = await crawler.arun_many( + retry_urls, + config=retry_config + ) + + return successes + retry_results + + except Exception as e: + print(f"Fatal error: {e}") + return [] +``` + +--- + +## 8. Troubleshooting + +### 8.1 Connection Issues + +**Problem**: `Cannot connect to CDP browser` + +```python +# Check CDP browser is running +$ lsof -i :9222 +# Should show: Chromium PID USER FD TYPE ... + +# Or start it if not running +$ crwl cdp +``` + +**Problem**: `ERR_ABORTED` errors in concurrent crawls + +✅ **Fixed in v0.7.6**: This issue has been resolved. Pages are now properly isolated with locking. + +### 8.2 Performance Issues + +**Problem**: Too many open tabs + +```python +# Ensure you're not using session_id for everything +config = CrawlerRunConfig() # No session_id +await crawler.arun_many(urls, config=config) +# Pages auto-close after crawling +``` + +**Problem**: Memory leaks + +```python +# Always use context manager +async with AsyncWebCrawler(config=browser_cfg) as crawler: + # Crawling code here + pass +# Automatic cleanup on exit +``` + +### 8.3 State Issues + +**Problem**: Cookies not persisting + +```python +# Use the same context (automatic with CDP) +browser_cfg = BrowserConfig(cdp_url="http://localhost:9222") +# All crawls share cookies automatically +``` + +**Problem**: Need isolated state + +```python +# Use different CDP endpoints or non-CDP browsers +browser_cfg_1 = BrowserConfig(cdp_url="http://localhost:9222") +browser_cfg_2 = BrowserConfig(cdp_url="http://localhost:9223") +# Completely isolated browsers +``` + +--- + +## 9. 
Comparison: CDP vs Regular Browsers + +| Feature | CDP Browser | Regular Browser | +|---------|-------------|-----------------| +| **Window Management** | ✅ Single window, multiple tabs | ❌ New window per context | +| **Startup Time** | ✅ Instant (already running) | ⏱️ ~2-3s per launch | +| **State Sharing** | ✅ Shared cookies/localStorage | ⚠️ Isolated by default | +| **Concurrent Safety** | ✅ Automatic locking | ✅ Separate processes | +| **Memory Usage** | ✅ Lower (shared browser) | ⚠️ Higher (multiple processes) | +| **Session Persistence** | ✅ Native support | ✅ Via session_id | +| **Stealth Mode** | ❌ Not compatible | ✅ Full support | +| **Best For** | Development, authenticated crawls | Production, isolated crawls | + +--- + +## 10. Real-World Examples + +### 10.1 E-commerce Product Scraping + +```python +async def scrape_products(): + browser_cfg = BrowserConfig( + browser_type="chromium", + cdp_url="http://localhost:9222" + ) + + # Get product URLs from category page + async with AsyncWebCrawler(config=browser_cfg) as crawler: + category_result = await crawler.arun( + url="https://shop.example.com/category", + config=CrawlerRunConfig( + css_selector=".product-link" + ) + ) + + # Extract product URLs + product_urls = extract_urls(category_result.links) + + # Crawl all products concurrently + product_results = await crawler.arun_many( + urls=product_urls, + config=CrawlerRunConfig( + css_selector=".product-details", + semaphore_count=5 # Polite crawling + ) + ) + + return [extract_product_data(r) for r in product_results] +``` + +### 10.2 News Article Monitoring + +```python +import asyncio +from datetime import datetime + +async def monitor_news_sites(): + browser_cfg = BrowserConfig( + browser_type="chromium", + cdp_url="http://localhost:9222" + ) + + news_sites = [ + "https://news.site1.com", + "https://news.site2.com", + "https://news.site3.com" + ] + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + while True: + print(f"\n[{datetime.now()}] Checking for updates...") + + results = await crawler.arun_many( + urls=news_sites, + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, # Always fresh + css_selector=".article-headline" + ) + ) + + for result in results: + if result.success: + headlines = extract_headlines(result) + for headline in headlines: + if is_new(headline): + notify_user(headline) + + # Check every 5 minutes + await asyncio.sleep(300) +``` + +--- + +## 11. Summary + +CDP browser crawling offers: + +- 🚀 **Performance**: Faster startup, lower resource usage +- 🔄 **State Management**: Shared cookies and authentication +- 🎯 **Concurrent Safety**: Automatic page isolation and cleanup +- 💻 **Developer Friendly**: Visual debugging with DevTools + +**When to use CDP:** +- Development and debugging +- Authenticated crawling (login required) +- Sequential crawls needing state +- Resource-constrained environments + +**When to use regular browsers:** +- Production deployments +- Maximum isolation required +- Stealth mode needed +- Distributed/cloud crawling + +For most use cases, **CDP browsers provide the best balance** of performance, convenience, and safety. diff --git a/tests/test_arun_many_cdp.py b/tests/test_arun_many_cdp.py new file mode 100644 index 000000000..9d82126c1 --- /dev/null +++ b/tests/test_arun_many_cdp.py @@ -0,0 +1,63 @@ +""" +Test for arun_many with managed CDP browser to ensure each crawl gets its own tab. 
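Requires a CDP browser already running at http://localhost:9222 (start one with `crwl cdp`);
these tests will fail if nothing is listening on that port.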
+""" +import pytest +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + + +@pytest.mark.asyncio +async def test_arun_many_with_cdp(): + """Test arun_many opens a new tab for each url with managed CDP browser.""" + # NOTE: Requires a running CDP browser at localhost:9222 + # Can be started with: crwl cdp -d 9222 + browser_cfg = BrowserConfig( + browser_type="cdp", + cdp_url="http://localhost:9222", + verbose=False, + ) + urls = [ + "https://example.com", + "https://httpbin.org/html", + "https://www.python.org", + ] + crawler_cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_cfg) as crawler: + results = await crawler.arun_many(urls=urls, config=crawler_cfg) + # All results should be successful and distinct + assert len(results) == 3 + for result in results: + assert result.success, f"Crawl failed: {result.url} - {result.error_message}" + assert result.markdown is not None + + +@pytest.mark.asyncio +async def test_arun_many_with_cdp_sequential(): + """Test arun_many sequentially to isolate issues.""" + browser_cfg = BrowserConfig( + browser_type="cdp", + cdp_url="http://localhost:9222", + verbose=True, + ) + urls = [ + "https://example.com", + "https://httpbin.org/html", + "https://www.python.org", + ] + crawler_cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + ) + async with AsyncWebCrawler(config=browser_cfg) as crawler: + results = [] + for url in urls: + result = await crawler.arun(url=url, config=crawler_cfg) + results.append(result) + assert result.success, f"Crawl failed: {result.url} - {result.error_message}" + assert result.markdown is not None + assert len(results) == 3 + + +if __name__ == "__main__": + asyncio.run(test_arun_many_with_cdp()) \ No newline at end of file