From da82f0ada57296f57ea85333ce5cc5af1ac65294 Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Tue, 23 Dec 2025 16:28:26 +0530
Subject: [PATCH 1/5] sponsors: Add thor data as sponsor

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 771798c0b..dc52a9d8f 100644
--- a/README.md
+++ b/README.md
@@ -1093,6 +1093,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
 
 | Company | About | Sponsorship Tier |
 |------|------|----------------------------|
+| Thor Data | NstProxy is a trusted proxy provider with over 110M+ real residential IPs, city-level targeting, 99.99% uptime, and low pricing at $0.1/GB, it delivers unmatched stability, scale, and cost-efficiency. | 🥈 Silver |
 | nstproxy | NstProxy is a trusted proxy provider with over 110M+ real residential IPs, city-level targeting, 99.99% uptime, and low pricing at $0.1/GB, it delivers unmatched stability, scale, and cost-efficiency. | 🥈 Silver |
 | Scrapeless | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
 | Capsolver | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |

From a234959b12afd24262ba4e28dea61c9a150fc77e Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Tue, 23 Dec 2025 20:45:00 +0530
Subject: [PATCH 2/5] sponsors: Add thor data as sponsor

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dc52a9d8f..7535e6bd5 100644
--- a/README.md
+++ b/README.md
@@ -1093,7 +1093,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
 
 | Company | About | Sponsorship Tier |
 |------|------|----------------------------|
-| Thor Data | NstProxy is a trusted proxy provider with over 110M+ real residential IPs, city-level targeting, 99.99% uptime, and low pricing at $0.1/GB, it delivers unmatched stability, scale, and cost-efficiency. | 🥈 Silver |
+| Thor Data | Leveraging Thordata ensures seamless compatibility with any AI/ML workflows and data infrastructure, massively accessing web data with 99.9% uptime, backed by one-on-one customer support. | 🥈 Silver |
 | nstproxy | NstProxy is a trusted proxy provider with over 110M+ real residential IPs, city-level targeting, 99.99% uptime, and low pricing at $0.1/GB, it delivers unmatched stability, scale, and cost-efficiency. | 🥈 Silver |
 | Scrapeless | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
 | Capsolver | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |

From a046203e58da40fd0259125f1594e8bb22807285 Mon Sep 17 00:00:00 2001
From: Martichou
Date: Tue, 25 Nov 2025 21:24:06 +0100
Subject: [PATCH 3/5] fix: prevent memory leak by closing unused context

When scraping many URLs continuously, browser contexts accumulated in
memory and were never cleaned up. The existing cleanup only ran when
browsers went idle, which never happened under continuous load.
See: #943.

Key changes:
- browser_manager.py: Add _context_refcounts tracking, cleanup_contexts(), and release_context() methods
- async_crawler_strategy.py: Release context ref in finally block after crawl
- deploy/docker/api.py: Trigger context cleanup after each request

This fixes, or at least drastically improves, the memory leaks in my testing.
---
 crawl4ai/async_crawler_strategy.py |  10 ++-
 crawl4ai/browser_manager.py        | 129 ++++++++++++++++++++++++++++-
 deploy/docker/api.py               |  24 ++++--
 3 files changed, 153 insertions(+), 10 deletions(-)

diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 2850b36a6..229b1506e 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1053,9 +1053,17 @@ async def get_delayed_content(delay: float = 5.0) -> str:
             raise e
 
         finally:
+            # Release the context reference so cleanup can work
+            if not self.browser_config.use_managed_browser:
+                try:
+                    config_signature = self.browser_manager._make_config_signature(config)
+                    await self.browser_manager.release_context(config_signature)
+                except Exception:
+                    pass  # Don't fail on cleanup
+
             # If no session_id is given we should close the page
             all_contexts = page.context.browser.contexts
-            total_pages = sum(len(context.pages) for context in all_contexts)
+            total_pages = sum(len(context.pages) for context in all_contexts)
             if config.session_id:
                 pass
             elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py
index 3ca96aed4..8f3c6de3f 100644
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -611,12 +611,16 @@ def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: b
         # Keep track of contexts by a "config signature," so each unique config reuses a single context
         self.contexts_by_config = {}
         self._contexts_lock = asyncio.Lock()
-
+
+        # Reference counting for contexts - tracks how many requests are using each context
+        # Key: config_signature, Value: count of active requests using this context
+        self._context_refcounts = {}
+
         # Serialize context.new_page() across concurrent tasks to avoid races
         # when using a shared persistent context (context.pages may be empty
         # for all racers). Prevents 'Target page/context closed' errors.
         self._page_lock = asyncio.Lock()
-
+
         # Stealth adapter for stealth mode
         self._stealth_adapter = None
         if self.config.enable_stealth and not self.use_undetected:
@@ -1102,6 +1106,9 @@ async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
                     await self.setup_context(context, crawlerRunConfig)
                     self.contexts_by_config[config_signature] = context
 
+            # Increment reference count - this context is now in use
+            self._context_refcounts[config_signature] = self._context_refcounts.get(config_signature, 0) + 1
+
             # Create a new page from the chosen context
             page = await context.new_page()
             await self._apply_stealth_to_page(page)
@@ -1137,11 +1144,127 @@ def _cleanup_expired_sessions(self):
         for sid in expired_sessions:
             asyncio.create_task(self.kill_session(sid))
 
+    async def cleanup_contexts(self, max_contexts: int = 5, force: bool = False):
+        """
+        Clean up contexts to prevent memory growth.
+        Only closes contexts that have no active references AND no open pages (safe cleanup).
+
+        Args:
+            max_contexts: Maximum number of contexts to keep. Excess idle contexts
+                          will be closed, starting with the oldest ones.
+            force: If True, close contexts even if they have pages (but never if refcount > 0).
+                   Use with caution.
+        """
+        async with self._contexts_lock:
+            # First, identify contexts that are safe to close:
+            # - No active references (refcount == 0)
+            # - No open pages (or force=True)
+            idle_contexts = []
+            active_contexts = []
+
+            for sig, ctx in list(self.contexts_by_config.items()):
+                try:
+                    refcount = self._context_refcounts.get(sig, 0)
+                    has_pages = hasattr(ctx, 'pages') and len(ctx.pages) > 0
+
+                    # Context is safe to close only if refcount is 0
+                    if refcount > 0:
+                        # Context is actively being used by a request - never close
+                        active_contexts.append((sig, ctx))
+                    elif has_pages and not force:
+                        # Has pages but no refs - might be finishing up, skip unless forced
+                        active_contexts.append((sig, ctx))
+                    else:
+                        # refcount == 0 and (no pages or force=True) - safe to close
+                        idle_contexts.append((sig, ctx))
+                except Exception:
+                    # Context may be in bad state, only cleanup if no refs
+                    if self._context_refcounts.get(sig, 0) == 0:
+                        idle_contexts.append((sig, ctx))
+                    else:
+                        active_contexts.append((sig, ctx))
+
+            # Log context status for debugging
+            self.logger.debug(
+                message="Context cleanup check: {total} total, {idle} idle (refcount=0), {active} active",
+                tag="CLEANUP",
+                params={
+                    "total": len(self.contexts_by_config),
+                    "idle": len(idle_contexts),
+                    "active": len(active_contexts)
+                }
+            )
+
+            # Close idle contexts if we exceed max_contexts total
+            contexts_to_close = []
+            if len(self.contexts_by_config) > max_contexts:
+                # Calculate how many we need to close
+                excess = len(self.contexts_by_config) - max_contexts
+                # Only close from idle contexts (safe)
+                contexts_to_close = idle_contexts[:excess]
+
+                # If force=True and we still have too many, close active ones too
+                if force and len(self.contexts_by_config) - len(contexts_to_close) > max_contexts:
+                    remaining_excess = len(self.contexts_by_config) - len(contexts_to_close) - max_contexts
+                    contexts_to_close.extend(active_contexts[:remaining_excess])
+
+            # Perform cleanup
+            for sig, ctx in contexts_to_close:
+                try:
+                    # If forcing and context has pages, close them first
+                    if force and hasattr(ctx, 'pages'):
+                        for page in list(ctx.pages):
+                            try:
+                                await page.close()
+                            except Exception:
+                                pass
+
+                    # Remove from our tracking dicts
+                    self.contexts_by_config.pop(sig, None)
+                    self._context_refcounts.pop(sig, None)
+
+                    # Close the context
+                    await ctx.close()
+
+                    self.logger.info(
+                        message="Cleaned up context: {sig}",
+                        tag="CLEANUP",
+                        params={"sig": sig[:8]}
+                    )
+                except Exception as e:
+                    # Still remove from tracking even if close fails
+                    self.contexts_by_config.pop(sig, None)
+                    self._context_refcounts.pop(sig, None)
+                    self.logger.warning(
+                        message="Error closing context during cleanup: {error}",
+                        tag="WARNING",
+                        params={"error": str(e)}
+                    )
+
+        return len(contexts_to_close)  # Return count of cleaned contexts
+
+    async def release_context(self, config_signature: str):
+        """
+        Decrement the reference count for a context after a crawl completes.
+        Call this when a crawl operation finishes (success or failure).
+
+        Args:
+            config_signature: The config signature of the context to release
+        """
+        async with self._contexts_lock:
+            if config_signature in self._context_refcounts:
+                self._context_refcounts[config_signature] = max(0, self._context_refcounts[config_signature] - 1)
+                self.logger.debug(
+                    message="Released context ref: {sig}, remaining refs: {refs}",
+                    tag="CLEANUP",
+                    params={"sig": config_signature[:8], "refs": self._context_refcounts[config_signature]}
+                )
+
     async def close(self):
         """Close all browser resources and clean up."""
         if self.config.cdp_url:
             return
-
+
         if self.config.sleep_on_close:
             await asyncio.sleep(0.5)
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 81cd312ab..eaab388be 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -579,21 +579,33 @@ async def handle_crawl_request(
         results = []
         func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
-        partial_func = partial(func,
-            urls[0] if len(urls) == 1 else urls,
-            config=crawler_config,
+        partial_func = partial(func,
+            urls[0] if len(urls) == 1 else urls,
+            config=crawler_config,
             dispatcher=dispatcher)
         results = await partial_func()
-
+
         # Ensure results is always a list
         if not isinstance(results, list):
            results = [results]
 
+        # Clean up idle browser contexts to prevent memory leaks
+        # Only closes contexts with no open pages (safe cleanup)
+        try:
+            if hasattr(crawler, 'crawler_strategy') and hasattr(crawler.crawler_strategy, 'browser_manager'):
+                bm = crawler.crawler_strategy.browser_manager
+                # Clean up idle contexts (keep at most 3 to allow some reuse)
+                cleaned_count = await bm.cleanup_contexts(max_contexts=3)
+                if cleaned_count > 0:
+                    logger.info(f"Browser cleanup: closed {cleaned_count} idle context(s)")
+        except Exception as e:
+            logger.warning(f"Browser context cleanup warning: {e}")
+
         # await crawler.close()
-
+
         end_mem_mb = _get_memory_mb()  # <--- Get memory after
         end_time = time.time()
-
+
         if start_mem_mb is not None and end_mem_mb is not None:
             mem_delta_mb = end_mem_mb - start_mem_mb  # <--- Calculate delta
             peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb)  # <--- Get peak memory

From 5196b955a573ecfb46b66f94e9792261fa994f27 Mon Sep 17 00:00:00 2001
From: Martichou
Date: Thu, 1 Jan 2026 19:52:38 +0100
Subject: [PATCH 4/5] chore: add lsof inside dockerfile

Signed-off-by: Martichou
---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index f73cb4ee5..9fd01e549 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,6 +41,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     pkg-config \
     python3-dev \
     libjpeg-dev \
+    lsof \
     redis-server \
     supervisor \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

From 43b197ad513b58e665ff05861be10375acce0707 Mon Sep 17 00:00:00 2001
From: Martichou
Date: Thu, 1 Jan 2026 21:05:05 +0100
Subject: [PATCH 5/5] chore: add tini inside the dockerfile

Signed-off-by: Martichou
---
 Dockerfile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 9fd01e549..61661ccf4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,6 +44,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     lsof \
     redis-server \
     supervisor \
+    tini \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
@@ -200,7 +201,12 @@ EXPOSE 6379
 USER appuser
 
 # Set environment variables to ptoduction
-ENV PYTHON_ENV=production
+ENV PYTHON_ENV=production
+
+# Use tini as init system to properly reap zombie processes
+# This is required for Playwright/Chromium which spawns many child processes
+# See: https://github.com/unclecode/crawl4ai/issues/1666
+ENTRYPOINT ["/usr/bin/tini", "--"]
 
 # Start the application using supervisord
 CMD ["supervisord", "-c", "supervisord.conf"]
\ No newline at end of file
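
Usage note (illustrative, not part of the patch series): the three hunks in PATCH 3/5 form an acquire/release/sweep cycle. get_page() bumps _context_refcounts for the context's config signature, the crawl strategy's finally block calls release_context() to drop it again, and the Docker API calls cleanup_contexts(max_contexts=3) after each request to close contexts whose refcount is back to zero, has no open pages, and only while the total exceeds the cap. The standalone sketch below mirrors that lifecycle; ToyPool and its fake contexts are hypothetical stand-ins, not crawl4ai APIs, and the real methods additionally hold _contexts_lock, check ctx.pages, and honor the force flag.

    import asyncio

    class ToyPool:
        """Hypothetical stand-in for BrowserManager's context bookkeeping."""

        def __init__(self):
            self.contexts = {}    # config signature -> fake context object
            self.refcounts = {}   # config signature -> number of in-flight crawls

        async def acquire(self, sig):
            # Mirrors get_page(): reuse or create the context, then bump its refcount.
            ctx = self.contexts.setdefault(sig, object())
            self.refcounts[sig] = self.refcounts.get(sig, 0) + 1
            return ctx

        async def release(self, sig):
            # Mirrors release_context(): decrement, never below zero.
            self.refcounts[sig] = max(0, self.refcounts.get(sig, 0) - 1)

        async def cleanup(self, max_contexts=3):
            # Mirrors cleanup_contexts(): close idle contexts only while over the cap.
            idle = [s for s, n in self.refcounts.items() if n == 0 and s in self.contexts]
            closed = 0
            while len(self.contexts) > max_contexts and idle:
                self.contexts.pop(idle.pop(), None)
                closed += 1
            return closed

    async def crawl(pool, sig):
        await pool.acquire(sig)
        try:
            await asyncio.sleep(0)   # stand-in for the actual crawl
        finally:
            # Same shape as the finally block added to async_crawler_strategy.py.
            await pool.release(sig)

    async def main():
        pool = ToyPool()
        await asyncio.gather(*(crawl(pool, f"config-{i}") for i in range(10)))
        closed = await pool.cleanup(max_contexts=3)
        print(f"closed {closed} idle context(s), {len(pool.contexts)} kept")

    asyncio.run(main())

Running the sketch prints "closed 7 idle context(s), 3 kept": ten distinct config signatures each get a context, all refcounts return to zero in the finally blocks, and the sweep keeps only max_contexts of them, which is the same bounded-growth behavior the patch aims for under continuous load.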