diff --git a/Dockerfile b/Dockerfile
index f73cb4ee5..61661ccf4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,8 +41,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
pkg-config \
python3-dev \
libjpeg-dev \
+ lsof \
redis-server \
supervisor \
+ tini \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
@@ -199,7 +201,12 @@ EXPOSE 6379
USER appuser
# Set environment variables to production
-ENV PYTHON_ENV=production
+ENV PYTHON_ENV=production
+
+# Use tini as init system to properly reap zombie processes
+# This is required for Playwright/Chromium which spawns many child processes
+# See: https://github.com/unclecode/crawl4ai/issues/1666
+ENTRYPOINT ["/usr/bin/tini", "--"]
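+# The CMD below is appended to the ENTRYPOINT, so supervisord runs as tini's
+# child and receives forwarded signals (e.g. SIGTERM on container stop)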
# Start the application using supervisord
CMD ["supervisord", "-c", "supervisord.conf"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 771798c0b..7535e6bd5 100644
--- a/README.md
+++ b/README.md
@@ -1093,6 +1093,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
| Company | About | Sponsorship Tier |
|------|------|----------------------------|
+| | Thordata integrates seamlessly with AI/ML workflows and data infrastructure, delivering large-scale web data access with 99.9% uptime and one-on-one customer support. | 🥈 Silver |
| | NstProxy is a trusted proxy provider with over 110M+ real residential IPs, city-level targeting, 99.99% uptime, and low pricing at $0.1/GB, it delivers unmatched stability, scale, and cost-efficiency. | 🥈 Silver |
| | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
| | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 2850b36a6..229b1506e 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1053,9 +1053,17 @@ async def get_delayed_content(delay: float = 5.0) -> str:
raise e
finally:
+            # Release this crawl's reference on the pooled context so that
+            # cleanup_contexts() can treat it as idle
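+            # (the per-config context pool and its refcounts are only used for
+            #  non-managed browsers, so there is nothing to release otherwise)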
+ if not self.browser_config.use_managed_browser:
+ try:
+ config_signature = self.browser_manager._make_config_signature(config)
+ await self.browser_manager.release_context(config_signature)
+ except Exception:
+ pass # Don't fail on cleanup
+
# If no session_id is given we should close the page
all_contexts = page.context.browser.contexts
- total_pages = sum(len(context.pages) for context in all_contexts)
+ total_pages = sum(len(context.pages) for context in all_contexts)
if config.session_id:
pass
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py
index 3ca96aed4..8f3c6de3f 100644
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -611,12 +611,16 @@ def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: b
# Keep track of contexts by a "config signature," so each unique config reuses a single context
self.contexts_by_config = {}
self._contexts_lock = asyncio.Lock()
-
+
+ # Reference counting for contexts - tracks how many requests are using each context
+ # Key: config_signature, Value: count of active requests using this context
+ self._context_refcounts = {}
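+        # (incremented in get_page(), decremented in release_context(),
+        #  consulted by cleanup_contexts())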
+
# Serialize context.new_page() across concurrent tasks to avoid races
# when using a shared persistent context (context.pages may be empty
# for all racers). Prevents 'Target page/context closed' errors.
self._page_lock = asyncio.Lock()
-
+
# Stealth adapter for stealth mode
self._stealth_adapter = None
if self.config.enable_stealth and not self.use_undetected:
@@ -1102,6 +1106,9 @@ async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
await self.setup_context(context, crawlerRunConfig)
self.contexts_by_config[config_signature] = context
+ # Increment reference count - this context is now in use
+ self._context_refcounts[config_signature] = self._context_refcounts.get(config_signature, 0) + 1
+
# Create a new page from the chosen context
page = await context.new_page()
await self._apply_stealth_to_page(page)
@@ -1137,11 +1144,127 @@ def _cleanup_expired_sessions(self):
for sid in expired_sessions:
asyncio.create_task(self.kill_session(sid))
+ async def cleanup_contexts(self, max_contexts: int = 5, force: bool = False):
+ """
+ Clean up contexts to prevent memory growth.
+ Only closes contexts that have no active references AND no open pages (safe cleanup).
+
+ Args:
+ max_contexts: Maximum number of contexts to keep. Excess idle contexts
+ will be closed, starting with the oldest ones.
+ force: If True, close contexts even if they have pages (but never if refcount > 0).
+ Use with caution.
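+
+        Example (illustrative; `bm` is an existing BrowserManager instance):
+
+            closed = await bm.cleanup_contexts(max_contexts=3)
+            # `closed` is the number of idle contexts that were shut down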
+ """
+ async with self._contexts_lock:
+ # First, identify contexts that are safe to close:
+ # - No active references (refcount == 0)
+ # - No open pages (or force=True)
+ idle_contexts = []
+ active_contexts = []
+
+ for sig, ctx in list(self.contexts_by_config.items()):
+ try:
+ refcount = self._context_refcounts.get(sig, 0)
+ has_pages = hasattr(ctx, 'pages') and len(ctx.pages) > 0
+
+ # Context is safe to close only if refcount is 0
+ if refcount > 0:
+ # Context is actively being used by a request - never close
+ active_contexts.append((sig, ctx))
+ elif has_pages and not force:
+ # Has pages but no refs - might be finishing up, skip unless forced
+ active_contexts.append((sig, ctx))
+ else:
+ # refcount == 0 and (no pages or force=True) - safe to close
+ idle_contexts.append((sig, ctx))
+ except Exception:
+ # Context may be in bad state, only cleanup if no refs
+ if self._context_refcounts.get(sig, 0) == 0:
+ idle_contexts.append((sig, ctx))
+ else:
+ active_contexts.append((sig, ctx))
+
+ # Log context status for debugging
+ self.logger.debug(
+ message="Context cleanup check: {total} total, {idle} idle (refcount=0), {active} active",
+ tag="CLEANUP",
+ params={
+ "total": len(self.contexts_by_config),
+ "idle": len(idle_contexts),
+ "active": len(active_contexts)
+ }
+ )
+
+ # Close idle contexts if we exceed max_contexts total
+ contexts_to_close = []
+ if len(self.contexts_by_config) > max_contexts:
+ # Calculate how many we need to close
+ excess = len(self.contexts_by_config) - max_contexts
+ # Only close from idle contexts (safe)
+ contexts_to_close = idle_contexts[:excess]
+
+                # Note: even when force=True, contexts with active references
+                # (refcount > 0) are left untouched, so in-flight crawls are never broken
+
+ # Perform cleanup
+ for sig, ctx in contexts_to_close:
+ try:
+ # If forcing and context has pages, close them first
+ if force and hasattr(ctx, 'pages'):
+ for page in list(ctx.pages):
+ try:
+ await page.close()
+ except Exception:
+ pass
+
+ # Remove from our tracking dicts
+ self.contexts_by_config.pop(sig, None)
+ self._context_refcounts.pop(sig, None)
+
+ # Close the context
+ await ctx.close()
+
+ self.logger.info(
+ message="Cleaned up context: {sig}",
+ tag="CLEANUP",
+ params={"sig": sig[:8]}
+ )
+ except Exception as e:
+ # Still remove from tracking even if close fails
+ self.contexts_by_config.pop(sig, None)
+ self._context_refcounts.pop(sig, None)
+ self.logger.warning(
+ message="Error closing context during cleanup: {error}",
+ tag="WARNING",
+ params={"error": str(e)}
+ )
+
+ return len(contexts_to_close) # Return count of cleaned contexts
+
+ async def release_context(self, config_signature: str):
+ """
+ Decrement the reference count for a context after a crawl completes.
+ Call this when a crawl operation finishes (success or failure).
+
+ Args:
+ config_signature: The config signature of the context to release
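+
+        Example (illustrative; `bm` is a BrowserManager, `run_config` a CrawlerRunConfig):
+
+            page, context = await bm.get_page(run_config)   # takes a reference
+            try:
+                ...  # crawl using the page
+            finally:
+                await bm.release_context(bm._make_config_signature(run_config))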
+ """
+ async with self._contexts_lock:
+ if config_signature in self._context_refcounts:
+ self._context_refcounts[config_signature] = max(0, self._context_refcounts[config_signature] - 1)
+ self.logger.debug(
+ message="Released context ref: {sig}, remaining refs: {refs}",
+ tag="CLEANUP",
+ params={"sig": config_signature[:8], "refs": self._context_refcounts[config_signature]}
+ )
+
async def close(self):
"""Close all browser resources and clean up."""
if self.config.cdp_url:
return
-
+
if self.config.sleep_on_close:
await asyncio.sleep(0.5)
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 81cd312ab..eaab388be 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -579,21 +579,33 @@ async def handle_crawl_request(
results = []
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
- partial_func = partial(func,
- urls[0] if len(urls) == 1 else urls,
- config=crawler_config,
+ partial_func = partial(func,
+ urls[0] if len(urls) == 1 else urls,
+ config=crawler_config,
dispatcher=dispatcher)
results = await partial_func()
-
+
# Ensure results is always a list
if not isinstance(results, list):
results = [results]
+ # Clean up idle browser contexts to prevent memory leaks
+ # Only closes contexts with no open pages (safe cleanup)
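+    # Contexts still referenced by an in-flight crawl (refcount > 0) are never
+    # closed, so this is safe to run after every request even under concurrency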
+ try:
+ if hasattr(crawler, 'crawler_strategy') and hasattr(crawler.crawler_strategy, 'browser_manager'):
+ bm = crawler.crawler_strategy.browser_manager
+ # Clean up idle contexts (keep at most 3 to allow some reuse)
+ cleaned_count = await bm.cleanup_contexts(max_contexts=3)
+ if cleaned_count > 0:
+ logger.info(f"Browser cleanup: closed {cleaned_count} idle context(s)")
+ except Exception as e:
+ logger.warning(f"Browser context cleanup warning: {e}")
+
# await crawler.close()
-
+
end_mem_mb = _get_memory_mb() # <--- Get memory after
end_time = time.time()
-
+
if start_mem_mb is not None and end_mem_mb is not None:
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory