9 changes: 8 additions & 1 deletion Dockerfile
@@ -41,8 +41,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
pkg-config \
python3-dev \
libjpeg-dev \
lsof \
redis-server \
supervisor \
tini \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

@@ -199,7 +201,12 @@ EXPOSE 6379
USER appuser

# Set environment variables to ptoduction
ENV PYTHON_ENV=production
ENV PYTHON_ENV=production

# Use tini as init system to properly reap zombie processes
# This is required for Playwright/Chromium which spawns many child processes
# See: https://github.com/unclecode/crawl4ai/issues/1666
ENTRYPOINT ["/usr/bin/tini", "--"]

# Start the application using supervisord
CMD ["supervisord", "-c", "supervisord.conf"]
1 change: 1 addition & 0 deletions README.md
@@ -1093,6 +1093,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro

| Company | About | Sponsorship Tier |
|------|------|----------------------------|
| <a href="https://www.thordata.com/?ls=github&lk=crawl4ai" target="_blank"><img src="https://gist.github.com/aravindkarnam/dfc598a67be5036494475acece7e54cf/raw/thor_data.svg" alt="Thor Data" width="120"/></a> | Leveraging Thordata ensures seamless compatibility with any AI/ML workflows and data infrastructure, massively accessing web data with 99.9% uptime, backed by one-on-one customer support. | 🥈 Silver |
| <a href="https://app.nstproxy.com/register?i=ecOqW9" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.github.com/aravindkarnam/62f82bd4818d3079d9dd3c31df432cf8/raw/nst-light.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://www.nstproxy.com/logo.svg"><img alt="nstproxy" src="ttps://www.nstproxy.com/logo.svg"></picture></a> | NstProxy is a trusted proxy provider with over 110M+ real residential IPs, city-level targeting, 99.99% uptime, and low pricing at $0.1/GB, it delivers unmatched stability, scale, and cost-efficiency. | 🥈 Silver |
| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a> | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
| <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
10 changes: 9 additions & 1 deletion crawl4ai/async_crawler_strategy.py
@@ -1053,9 +1053,17 @@ async def get_delayed_content(delay: float = 5.0) -> str:
raise e

finally:
# Release the context reference so cleanup can work
if not self.browser_config.use_managed_browser:
try:
config_signature = self.browser_manager._make_config_signature(config)
await self.browser_manager.release_context(config_signature)
except Exception:
pass # Don't fail on cleanup
Comment on lines +1056 to +1062

Copilot AI Jan 1, 2026

The release_context call here creates a reference counting imbalance when using session_id. Looking at browser_manager.py get_page(), when a session_id is provided and already exists, the function returns early (line 1063-1066) without incrementing the refcount. However, this release_context will still be called, decrementing a counter that was never incremented. This will cause the refcount to go negative (though clamped to 0 by the max() call in release_context), potentially allowing contexts to be cleaned up while still in use by sessions. The condition should also check that no session_id is being used, similar to: if not self.browser_config.use_managed_browser and not config.session_id:

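A hedged sketch of the guard this review comment proposes (not what the diff currently does): release the reference only when the request actually took one in get_page, i.e. when neither a managed browser nor a session_id is involved. The helper name `_maybe_release_context` is illustrative; `strategy` stands for the crawler strategy instance (`self` in the diff above) and `config` for the current CrawlerRunConfig.

```python
# Sketch of the reviewer's proposed condition, extracted into a helper.
async def _maybe_release_context(strategy, config) -> None:
    # Session-bound and managed-browser contexts are not ref-counted per
    # request, so releasing them here would drive the counter negative.
    if strategy.browser_config.use_managed_browser or config.session_id:
        return
    try:
        signature = strategy.browser_manager._make_config_signature(config)
        await strategy.browser_manager.release_context(signature)
    except Exception:
        pass  # cleanup failures must never mask the crawl result
```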

# If no session_id is given we should close the page
all_contexts = page.context.browser.contexts
total_pages = sum(len(context.pages) for context in all_contexts)
total_pages = sum(len(context.pages) for context in all_contexts)
if config.session_id:
pass
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
129 changes: 126 additions & 3 deletions crawl4ai/browser_manager.py
@@ -611,12 +611,16 @@ def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: b
# Keep track of contexts by a "config signature," so each unique config reuses a single context
self.contexts_by_config = {}
self._contexts_lock = asyncio.Lock()


# Reference counting for contexts - tracks how many requests are using each context
# Key: config_signature, Value: count of active requests using this context
self._context_refcounts = {}

# Serialize context.new_page() across concurrent tasks to avoid races
# when using a shared persistent context (context.pages may be empty
# for all racers). Prevents 'Target page/context closed' errors.
self._page_lock = asyncio.Lock()

# Stealth adapter for stealth mode
self._stealth_adapter = None
if self.config.enable_stealth and not self.use_undetected:
@@ -1102,6 +1106,9 @@ async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
await self.setup_context(context, crawlerRunConfig)
self.contexts_by_config[config_signature] = context

# Increment reference count - this context is now in use
self._context_refcounts[config_signature] = self._context_refcounts.get(config_signature, 0) + 1

# Create a new page from the chosen context
page = await context.new_page()
await self._apply_stealth_to_page(page)
@@ -1137,11 +1144,127 @@ def _cleanup_expired_sessions(self):
for sid in expired_sessions:
asyncio.create_task(self.kill_session(sid))

async def cleanup_contexts(self, max_contexts: int = 5, force: bool = False):
"""
Clean up contexts to prevent memory growth.
Only closes contexts that have no active references AND no open pages (safe cleanup).

Args:
max_contexts: Maximum number of contexts to keep. Excess idle contexts
will be closed, starting with the oldest ones.
force: If True, close contexts even if they have pages (but never if refcount > 0).
Use with caution.
"""
async with self._contexts_lock:
# First, identify contexts that are safe to close:
# - No active references (refcount == 0)
# - No open pages (or force=True)
idle_contexts = []
active_contexts = []

for sig, ctx in list(self.contexts_by_config.items()):
try:
refcount = self._context_refcounts.get(sig, 0)
has_pages = hasattr(ctx, 'pages') and len(ctx.pages) > 0

# Context is safe to close only if refcount is 0
if refcount > 0:
# Context is actively being used by a request - never close
active_contexts.append((sig, ctx))
elif has_pages and not force:
# Has pages but no refs - might be finishing up, skip unless forced
active_contexts.append((sig, ctx))
else:
# refcount == 0 and (no pages or force=True) - safe to close
idle_contexts.append((sig, ctx))
except Exception:
# Context may be in bad state, only cleanup if no refs
if self._context_refcounts.get(sig, 0) == 0:
idle_contexts.append((sig, ctx))
else:
active_contexts.append((sig, ctx))

# Log context status for debugging
self.logger.debug(
message="Context cleanup check: {total} total, {idle} idle (refcount=0), {active} active",
tag="CLEANUP",
params={
"total": len(self.contexts_by_config),
"idle": len(idle_contexts),
"active": len(active_contexts)
}
)

# Close idle contexts if we exceed max_contexts total
contexts_to_close = []
if len(self.contexts_by_config) > max_contexts:
# Calculate how many we need to close
excess = len(self.contexts_by_config) - max_contexts
# Only close from idle contexts (safe)
contexts_to_close = idle_contexts[:excess]

# If force=True and we still have too many, close active ones too
if force and len(self.contexts_by_config) - len(contexts_to_close) > max_contexts:
remaining_excess = len(self.contexts_by_config) - len(contexts_to_close) - max_contexts
contexts_to_close.extend(active_contexts[:remaining_excess])
Comment on lines +1206 to +1209

Copilot AI Jan 1, 2026

The documentation states that force will "close contexts even if they have pages (but never if refcount > 0)", but the implementation at lines 1207-1209 will actually close active contexts when force=True, and active_contexts includes contexts with refcount > 0 (added at line 1171-1173). This means force=True can close contexts that are actively being used by requests, contradicting the docstring and potentially causing "Target closed" errors during active crawls. The condition should filter out contexts with refcount > 0 from active_contexts before extending contexts_to_close.

Suggested change
# If force=True and we still have too many, close active ones too
if force and len(self.contexts_by_config) - len(contexts_to_close) > max_contexts:
remaining_excess = len(self.contexts_by_config) - len(contexts_to_close) - max_contexts
contexts_to_close.extend(active_contexts[:remaining_excess])
# If force=True and we still have too many, close additional contexts
# but never close contexts with refcount > 0 (they may be in active use).
if force and len(self.contexts_by_config) - len(contexts_to_close) > max_contexts:
remaining_excess = len(self.contexts_by_config) - len(contexts_to_close) - max_contexts
# From active_contexts, only consider those whose refcount is 0 for forced closure
force_closable_active = [
(sig, ctx)
for sig, ctx in active_contexts
if self._context_refcounts.get(sig, 0) == 0
]
contexts_to_close.extend(force_closable_active[:remaining_excess])


# Perform cleanup
for sig, ctx in contexts_to_close:
try:
# If forcing and context has pages, close them first
if force and hasattr(ctx, 'pages'):
for page in list(ctx.pages):
try:
await page.close()
except Exception:
pass
Comment on lines +1219 to +1220

Copilot AI Jan 1, 2026

'except' clause does nothing but pass and there is no explanatory comment.

Suggested change
except Exception:
pass
except Exception as e:
# Ignore individual page close failures but record them for diagnostics
self.logger.warning(
message="Error closing page during context cleanup: {error}",
tag="WARNING",
params={"error": str(e)}
)


# Remove from our tracking dicts
self.contexts_by_config.pop(sig, None)
self._context_refcounts.pop(sig, None)

# Close the context
await ctx.close()

self.logger.info(
message="Cleaned up context: {sig}",
tag="CLEANUP",
params={"sig": sig[:8]}
)
except Exception as e:
# Still remove from tracking even if close fails
self.contexts_by_config.pop(sig, None)
self._context_refcounts.pop(sig, None)
self.logger.warning(
message="Error closing context during cleanup: {error}",
tag="WARNING",
params={"error": str(e)}
)

return len(contexts_to_close) # Return count of cleaned contexts
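A hedged sketch of the contract this method documents: once the total exceeds max_contexts, idle contexts (refcount 0, no pages) are eligible for closure while referenced ones are kept. It assumes `bm` is an already-started BrowserManager with no other tracked contexts and that `bm.browser` is the underlying Playwright browser, as used elsewhere in this class; the signatures are arbitrary strings.

```python
# Sketch: exercise cleanup_contexts() on a started BrowserManager `bm`.
async def check_cleanup_respects_refcounts(bm) -> int:
    bm.contexts_by_config["busy-sig"] = await bm.browser.new_context()
    bm.contexts_by_config["idle-sig"] = await bm.browser.new_context()
    bm._context_refcounts["busy-sig"] = 1   # simulate an in-flight crawl
    bm._context_refcounts["idle-sig"] = 0   # nothing is using this one

    closed = await bm.cleanup_contexts(max_contexts=1)

    assert "busy-sig" in bm.contexts_by_config       # referenced -> kept
    assert "idle-sig" not in bm.contexts_by_config   # idle -> closed
    return closed  # expected: 1
```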

async def release_context(self, config_signature: str):
"""
Decrement the reference count for a context after a crawl completes.
Call this when a crawl operation finishes (success or failure).

Args:
config_signature: The config signature of the context to release
"""
async with self._contexts_lock:
if config_signature in self._context_refcounts:
self._context_refcounts[config_signature] = max(0, self._context_refcounts[config_signature] - 1)
self.logger.debug(
message="Released context ref: {sig}, remaining refs: {refs}",
tag="CLEANUP",
params={"sig": config_signature[:8], "refs": self._context_refcounts[config_signature]}
)

async def close(self):
"""Close all browser resources and clean up."""
if self.config.cdp_url:
return

if self.config.sleep_on_close:
await asyncio.sleep(0.5)

24 changes: 18 additions & 6 deletions deploy/docker/api.py
@@ -579,21 +579,33 @@ async def handle_crawl_request(

results = []
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
partial_func = partial(func,
urls[0] if len(urls) == 1 else urls,
config=crawler_config,
partial_func = partial(func,
urls[0] if len(urls) == 1 else urls,
config=crawler_config,
dispatcher=dispatcher)
results = await partial_func()

# Ensure results is always a list
if not isinstance(results, list):
results = [results]

# Clean up idle browser contexts to prevent memory leaks
# Only closes contexts with no open pages (safe cleanup)
try:
if hasattr(crawler, 'crawler_strategy') and hasattr(crawler.crawler_strategy, 'browser_manager'):
bm = crawler.crawler_strategy.browser_manager
# Clean up idle contexts (keep at most 3 to allow some reuse)
cleaned_count = await bm.cleanup_contexts(max_contexts=3)
if cleaned_count > 0:
logger.info(f"Browser cleanup: closed {cleaned_count} idle context(s)")
except Exception as e:
logger.warning(f"Browser context cleanup warning: {e}")

# await crawler.close()

end_mem_mb = _get_memory_mb() # <--- Get memory after
end_time = time.time()

if start_mem_mb is not None and end_mem_mb is not None:
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
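The same cleanup could also run on a timer rather than per request. A minimal sketch (not part of this PR), assuming the crawler exposes crawler_strategy.browser_manager exactly as the block above does and reusing the module-level `logger`; all other names are illustrative:

```python
import asyncio

# Sketch: periodically close idle browser contexts instead of (or in addition
# to) the per-request cleanup above, as a background task alongside the app.
async def periodic_context_cleanup(crawler, interval_s: float = 60.0, max_contexts: int = 3):
    while True:
        await asyncio.sleep(interval_s)
        try:
            bm = getattr(getattr(crawler, "crawler_strategy", None), "browser_manager", None)
            if bm is None:
                continue
            cleaned = await bm.cleanup_contexts(max_contexts=max_contexts)
            if cleaned:
                logger.info(f"Periodic browser cleanup: closed {cleaned} idle context(s)")
        except Exception as e:
            logger.warning(f"Periodic browser cleanup skipped: {e}")

# e.g. at startup: asyncio.create_task(periodic_context_cleanup(crawler))
```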