diff --git a/Dockerfile b/Dockerfile
index f73cb4ee5..61661ccf4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,8 +41,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     pkg-config \
     python3-dev \
     libjpeg-dev \
+    lsof \
     redis-server \
     supervisor \
+    tini \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
@@ -199,7 +201,12 @@ EXPOSE 6379
 USER appuser
 
 # Set environment variables to ptoduction
-ENV PYTHON_ENV=production
+ENV PYTHON_ENV=production
+
+# Use tini as init system to properly reap zombie processes
+# This is required for Playwright/Chromium which spawns many child processes
+# See: https://github.com/unclecode/crawl4ai/issues/1666
+ENTRYPOINT ["/usr/bin/tini", "--"]
 
 # Start the application using supervisord
 CMD ["supervisord", "-c", "supervisord.conf"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 771798c0b..7535e6bd5 100644
--- a/README.md
+++ b/README.md
@@ -1093,6 +1093,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
 | Company | About | Sponsorship Tier |
 |------|------|----------------------------|
+| Thor Data | Thordata integrates seamlessly with any AI/ML workflow and data infrastructure, delivering web data access at scale with 99.9% uptime and one-on-one customer support. | 🥈 Silver |
 | nstproxy | NstProxy is a trusted proxy provider with over 110M+ real residential IPs, city-level targeting, 99.99% uptime, and low pricing at $0.1/GB, it delivers unmatched stability, scale, and cost-efficiency. | 🥈 Silver |
 | Scrapeless | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
 | Capsolver | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 2850b36a6..229b1506e 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1053,9 +1053,17 @@ async def get_delayed_content(delay: float = 5.0) -> str:
             raise e
 
         finally:
+            # Release the context reference so cleanup can work
+            if not self.browser_config.use_managed_browser:
+                try:
+                    config_signature = self.browser_manager._make_config_signature(config)
+                    await self.browser_manager.release_context(config_signature)
+                except Exception:
+                    pass  # Don't fail on cleanup
+
             # If no session_id is given we should close the page
             all_contexts = page.context.browser.contexts
-            total_pages = sum(len(context.pages) for context in all_contexts)
+            total_pages = sum(len(context.pages) for context in all_contexts)
             if config.session_id:
                 pass
             elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py
index 3ca96aed4..8f3c6de3f 100644
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -611,12 +611,16 @@ def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: b
         # Keep track of contexts by a "config signature," so each unique config reuses a single context
         self.contexts_by_config = {}
         self._contexts_lock = asyncio.Lock()
-
+
+        # Reference counting for contexts - tracks how many requests are using each context
+        # Key: config_signature, Value: count of active requests using this context
+        self._context_refcounts = {}
+
         # Serialize context.new_page() across concurrent tasks to avoid races
         # when using a shared persistent context (context.pages may be empty
         # for all racers). Prevents 'Target page/context closed' errors.
         self._page_lock = asyncio.Lock()
-
+
         # Stealth adapter for stealth mode
         self._stealth_adapter = None
         if self.config.enable_stealth and not self.use_undetected:
@@ -1102,6 +1106,9 @@ async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
                     await self.setup_context(context, crawlerRunConfig)
                     self.contexts_by_config[config_signature] = context
 
+            # Increment reference count - this context is now in use
+            self._context_refcounts[config_signature] = self._context_refcounts.get(config_signature, 0) + 1
+
             # Create a new page from the chosen context
             page = await context.new_page()
             await self._apply_stealth_to_page(page)
@@ -1137,11 +1144,127 @@ def _cleanup_expired_sessions(self):
         for sid in expired_sessions:
             asyncio.create_task(self.kill_session(sid))
 
+    async def cleanup_contexts(self, max_contexts: int = 5, force: bool = False):
+        """
+        Clean up contexts to prevent memory growth.
+        Only closes contexts that have no active references AND no open pages (safe cleanup).
+
+        Args:
+            max_contexts: Maximum number of contexts to keep. Excess idle contexts
+                will be closed, starting with the oldest ones.
+            force: If True, close contexts even if they have pages (but never if refcount > 0).
+                Use with caution.
+ """ + async with self._contexts_lock: + # First, identify contexts that are safe to close: + # - No active references (refcount == 0) + # - No open pages (or force=True) + idle_contexts = [] + active_contexts = [] + + for sig, ctx in list(self.contexts_by_config.items()): + try: + refcount = self._context_refcounts.get(sig, 0) + has_pages = hasattr(ctx, 'pages') and len(ctx.pages) > 0 + + # Context is safe to close only if refcount is 0 + if refcount > 0: + # Context is actively being used by a request - never close + active_contexts.append((sig, ctx)) + elif has_pages and not force: + # Has pages but no refs - might be finishing up, skip unless forced + active_contexts.append((sig, ctx)) + else: + # refcount == 0 and (no pages or force=True) - safe to close + idle_contexts.append((sig, ctx)) + except Exception: + # Context may be in bad state, only cleanup if no refs + if self._context_refcounts.get(sig, 0) == 0: + idle_contexts.append((sig, ctx)) + else: + active_contexts.append((sig, ctx)) + + # Log context status for debugging + self.logger.debug( + message="Context cleanup check: {total} total, {idle} idle (refcount=0), {active} active", + tag="CLEANUP", + params={ + "total": len(self.contexts_by_config), + "idle": len(idle_contexts), + "active": len(active_contexts) + } + ) + + # Close idle contexts if we exceed max_contexts total + contexts_to_close = [] + if len(self.contexts_by_config) > max_contexts: + # Calculate how many we need to close + excess = len(self.contexts_by_config) - max_contexts + # Only close from idle contexts (safe) + contexts_to_close = idle_contexts[:excess] + + # If force=True and we still have too many, close active ones too + if force and len(self.contexts_by_config) - len(contexts_to_close) > max_contexts: + remaining_excess = len(self.contexts_by_config) - len(contexts_to_close) - max_contexts + contexts_to_close.extend(active_contexts[:remaining_excess]) + + # Perform cleanup + for sig, ctx in contexts_to_close: + try: + # If forcing and context has pages, close them first + if force and hasattr(ctx, 'pages'): + for page in list(ctx.pages): + try: + await page.close() + except Exception: + pass + + # Remove from our tracking dicts + self.contexts_by_config.pop(sig, None) + self._context_refcounts.pop(sig, None) + + # Close the context + await ctx.close() + + self.logger.info( + message="Cleaned up context: {sig}", + tag="CLEANUP", + params={"sig": sig[:8]} + ) + except Exception as e: + # Still remove from tracking even if close fails + self.contexts_by_config.pop(sig, None) + self._context_refcounts.pop(sig, None) + self.logger.warning( + message="Error closing context during cleanup: {error}", + tag="WARNING", + params={"error": str(e)} + ) + + return len(contexts_to_close) # Return count of cleaned contexts + + async def release_context(self, config_signature: str): + """ + Decrement the reference count for a context after a crawl completes. + Call this when a crawl operation finishes (success or failure). 
+
+        Args:
+            config_signature: The config signature of the context to release
+        """
+        async with self._contexts_lock:
+            if config_signature in self._context_refcounts:
+                self._context_refcounts[config_signature] = max(0, self._context_refcounts[config_signature] - 1)
+                self.logger.debug(
+                    message="Released context ref: {sig}, remaining refs: {refs}",
+                    tag="CLEANUP",
+                    params={"sig": config_signature[:8], "refs": self._context_refcounts[config_signature]}
+                )
+
     async def close(self):
         """Close all browser resources and clean up."""
         if self.config.cdp_url:
             return
-
+
         if self.config.sleep_on_close:
             await asyncio.sleep(0.5)
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 81cd312ab..eaab388be 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -579,21 +579,33 @@ async def handle_crawl_request(
         results = []
         func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
-        partial_func = partial(func,
-                               urls[0] if len(urls) == 1 else urls,
-                               config=crawler_config,
+        partial_func = partial(func,
+                               urls[0] if len(urls) == 1 else urls,
+                               config=crawler_config,
                                dispatcher=dispatcher)
         results = await partial_func()
-
+
         # Ensure results is always a list
         if not isinstance(results, list):
             results = [results]
 
+        # Clean up idle browser contexts to prevent memory leaks
+        # Only closes contexts with no active references and no open pages (safe cleanup)
+        try:
+            if hasattr(crawler, 'crawler_strategy') and hasattr(crawler.crawler_strategy, 'browser_manager'):
+                bm = crawler.crawler_strategy.browser_manager
+                # Clean up idle contexts (keep at most 3 to allow some reuse)
+                cleaned_count = await bm.cleanup_contexts(max_contexts=3)
+                if cleaned_count > 0:
+                    logger.info(f"Browser cleanup: closed {cleaned_count} idle context(s)")
+        except Exception as e:
+            logger.warning(f"Browser context cleanup warning: {e}")
+
         # await crawler.close()
-
+
         end_mem_mb = _get_memory_mb()  # <--- Get memory after
         end_time = time.time()
-
+
         if start_mem_mb is not None and end_mem_mb is not None:
             mem_delta_mb = end_mem_mb - start_mem_mb  # <--- Calculate delta
             peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb)  # <--- Get peak memory
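
Usage sketch (not part of the patch): the browser_manager.py and async_crawler_strategy.py hunks above define an acquire/release protocol around context reuse. get_page() increments the refcount for a config signature, release_context() decrements it, and cleanup_contexts() closes only contexts whose refcount is zero and which have no open pages. The Python below is a minimal illustration of how a caller might pair these calls; it assumes BrowserManager.start() exists, that get_page() returns a (page, context) tuple, and that AsyncLogger from crawl4ai.async_logger is a suitable logger, none of which are touched by this diff.

import asyncio

from crawl4ai import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.browser_manager import BrowserManager


async def crawl_once(manager: BrowserManager, run_config: CrawlerRunConfig) -> str:
    # get_page() increments the context refcount for this config signature
    # (assumes it returns a (page, context) tuple, as in the existing BrowserManager).
    page, _context = await manager.get_page(run_config)
    sig = manager._make_config_signature(run_config)
    try:
        await page.goto("https://example.com")
        return await page.content()
    finally:
        await page.close()
        # Decrement the refcount so the context becomes eligible for cleanup.
        await manager.release_context(sig)


async def main():
    manager = BrowserManager(browser_config=BrowserConfig(headless=True), logger=AsyncLogger())
    await manager.start()  # assumed to launch Playwright and the browser, as in the existing API
    try:
        html = await crawl_once(manager, CrawlerRunConfig())
        print(f"fetched {len(html)} characters of HTML")
        # Idle contexts (refcount == 0, no open pages) beyond the cap are closed here;
        # contexts still referenced by in-flight crawls are never touched.
        closed = await manager.cleanup_contexts(max_contexts=3)
        print(f"closed {closed} idle context(s)")
    finally:
        await manager.close()


if __name__ == "__main__":
    asyncio.run(main())

This mirrors what deploy/docker/api.py now does after each request: release in the crawl strategy's finally block, then trim idle contexts once results are in.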