Pull request status: Closed
Changes from all commits (22 commits)
16f72de  feat(mcp): migrate from SSE to FastMCP HTTP transport (leoric-crown, Sep 26, 2025)
05dc97c  chore(docker): remove deprecated sse-starlette from requirements (leoric-crown, Sep 26, 2025)
20ce5a6  fix(mcp): resolve critical issues from code review (leoric-crown, Sep 26, 2025)
b0fc577  fix(mcp): enhance Pydantic model handling in MCP tool wrappers (leoric-crown, Sep 26, 2025)
14ea300  feat(docker): enhance MCP server with improved parameter handling and… (leoric-crown, Sep 26, 2025)
14f30b0  fix(mcp): harden http bridge and refresh smoke tests (leoric-crown, Sep 26, 2025)
4b669c1  chore(docker): update server version in /health to 0.7.4 (leoric-crown, Sep 26, 2025)
b18fe57  feat(docker): add configurable host port mapping (leoric-crown, Sep 26, 2025)
d9fd23c  fix(docker): update Docker configuration for environment variables an… (leoric-crown, Sep 26, 2025)
26cbc45  fix(mcp): address code review feedback (leoric-crown, Sep 26, 2025)
cb8e095  fix(mcp): address additional code review feedback (leoric-crown, Sep 26, 2025)
1085c89  fix: prevent file collisions in screenshot/PDF exports (leoric-crown, Sep 26, 2025)
ca84054  feat(docker): enhance API reliability and backward compatibility (leoric-crown, Sep 27, 2025)
c3f1c25  fix(mcp): improve schema endpoint and remove fallback inspection (leoric-crown, Sep 27, 2025)
2688a16  fix: address critical feedback issues (leoric-crown, Sep 27, 2025)
7142db8  fix: enhance security and documentation (leoric-crown, Sep 27, 2025)
7eb7d9e  fix(api): improve result normalization for single CrawlResult objects (leoric-crown, Sep 27, 2025)
6c7e833  fix(api): improve JSON serialization to preserve datetime and Path ob… (leoric-crown, Sep 27, 2025)
487ccf0  fix(api): implement recursive normalization for nested Path/datetime … (leoric-crown, Sep 27, 2025)
3e82fad  fix(api): properly extract results from CrawlResultContainer objects (leoric-crown, Sep 27, 2025)
f60e6ae  fix(api): properly unwrap container results in fallback retry path (leoric-crown, Sep 27, 2025)
ae348c4  refactor: implement protocol-based architecture for result normalizat… (leoric-crown, Sep 27, 2025)
45 changes: 45 additions & 0 deletions .env.example
@@ -0,0 +1,45 @@
# Docker Compose Configuration
# This file is used by docker-compose for variable substitution in docker-compose.yml
# Copy this file to .env and customize as needed

# ──────────────────────────────────────────────────────────────────
# Port Configuration
# ──────────────────────────────────────────────────────────────────
# Host port mapping (container always runs on 11235 internally)
HOST_PORT=11235

# ──────────────────────────────────────────────────────────────────
# Image Selection
# ──────────────────────────────────────────────────────────────────
# Use pre-built image from Docker Hub (recommended)
# IMAGE=unclecode/crawl4ai:latest
# TAG=latest

# ──────────────────────────────────────────────────────────────────
# Build Configuration (only applies when building locally)
# ──────────────────────────────────────────────────────────────────

# INSTALL_TYPE: Feature set for the installation
# - default: Basic installation (~2-3GB image)
# Includes: JsonCssExtractionStrategy, JsonXPathExtractionStrategy,
# LLMExtractionStrategy (API-based, no local ML)
# Best for: Standard web crawling, structured extraction, LLM-based extraction
#
# - all: Full installation with ML dependencies (~6-8GB image)
# Adds: PyTorch, transformers, sentence-transformers, scikit-learn, NLTK
# Enables: CosineStrategy (semantic clustering), local transformer models
# Best for: Advanced ML-based extraction, semantic content analysis
#
# - torch: PyTorch + scikit-learn + NLTK (no transformers)
# - transformer: Transformers + sentence-transformers (no PyTorch)
#
INSTALL_TYPE=default

# ENABLE_GPU: Enable NVIDIA CUDA support for GPU acceleration
# - false: CPU-only (works on all platforms)
# - true: Adds CUDA toolkit (AMD64/x86_64 only, requires NVIDIA GPU)
#
# Note: GPU support only available on AMD64 architecture
# ARM64 (Apple Silicon) will skip GPU installation
#
ENABLE_GPU=false
10 changes: 6 additions & 4 deletions crawl4ai/async_configs.py
@@ -614,11 +614,12 @@ def dump(self) -> dict:
 
     @staticmethod
     def load(data: dict) -> "BrowserConfig":
-        # Deserialize the object from a dictionary
+        if data is None:
+            return BrowserConfig()
         config = from_serializable_dict(data)
         if isinstance(config, BrowserConfig):
             return config
-        return BrowserConfig.from_kwargs(config)
+        return BrowserConfig.from_kwargs(config if config is not None else {})
 
 class VirtualScrollConfig:
     """Configuration for virtual scroll handling.
@@ -1549,11 +1550,12 @@ def dump(self) -> dict:
 
     @staticmethod
     def load(data: dict) -> "CrawlerRunConfig":
-        # Deserialize the object from a dictionary
+        if data is None:
+            return CrawlerRunConfig()
         config = from_serializable_dict(data)
         if isinstance(config, CrawlerRunConfig):
             return config
-        return CrawlerRunConfig.from_kwargs(config)
+        return CrawlerRunConfig.from_kwargs(config if config is not None else {})
 
     def to_dict(self):
         return {
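For context, a minimal sketch of the round-trip these guards protect. It assumes the public `BrowserConfig` constructor and its `headless` keyword, which are not shown in this diff:

```python
from crawl4ai import BrowserConfig

# dump() serializes the config to a dict; load() restores it.
cfg = BrowserConfig(headless=True)  # `headless` is an assumed keyword
restored = BrowserConfig.load(cfg.dump())
assert isinstance(restored, BrowserConfig)

# With the new guards, load(None) returns a default config instead of
# failing inside from_serializable_dict / from_kwargs.
assert isinstance(BrowserConfig.load(None), BrowserConfig)
```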
66 changes: 50 additions & 16 deletions deploy/docker/README.md
@@ -115,6 +115,17 @@ EOL
     unclecode/crawl4ai:0.7.0-r1
   ```
 
+* **With custom host port:**
+  ```bash
+  docker run -d \
+    -p 8080:11235 \
+    --name crawl4ai \
+    --env-file .llm.env \
+    --shm-size=1g \
+    unclecode/crawl4ai:0.7.0-r1
+  ```
+  > Access at `http://localhost:8080` (mapped to container's internal port 11235)
+
 > The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
 
 #### 4. Stopping the Container
Expand Down Expand Up @@ -143,15 +154,24 @@ git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
```

#### 2. Environment Setup (API Keys)
#### 2. Environment Setup

If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
Crawl4AI uses two environment files:

- **`.env`** - Docker Compose variables (port mapping, image tags)
- **`.llm.env`** - Container runtime variables (API keys, runtime config)

```bash
# Make sure you are in the 'crawl4ai' root directory
cp deploy/docker/.llm.env.example .llm.env

# Now edit .llm.env and add your API keys
# 1. (Optional) Copy Docker Compose config to customize host port
cp .env.example .env
# Edit .env to set HOST_PORT (default: 11235)
# The container always runs on port 11235 internally

# 2. Copy API keys config (if using LLMs)
cp deploy/docker/.llm.env.example .llm.env
# Edit .llm.env and add your API keys
```

**Flexible LLM Provider Configuration:**
@@ -199,12 +219,15 @@ The `docker-compose.yml` file in the project root provides a simplified approach
 ```bash
 # Build with all features (includes torch and transformers)
 INSTALL_TYPE=all docker compose up --build -d
 
 # Build with GPU support (for AMD64 platforms)
 ENABLE_GPU=true docker compose up --build -d
+
+# Run on custom host port
+HOST_PORT=8080 docker compose up -d
 ```
 
-> The server will be available at `http://localhost:11235`.
+> The server will be available at `http://localhost:11235` (or your custom `HOST_PORT`).
 
 #### 4. Stopping the Service
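A quick way to verify a remapped port is to hit the server's `/health` route (the commit history above bumps the version it reports). A minimal sketch, assuming `HOST_PORT=8080` and a JSON response body:

```python
import requests

# Probe the health endpoint through the remapped host port (8080 -> 11235).
resp = requests.get("http://localhost:8080/health", timeout=5)
resp.raise_for_status()
print(resp.json())  # exact response shape is an assumption
```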

@@ -282,18 +305,23 @@ MCP is an open protocol that standardizes how applications provide context to LL
 
 ### Connecting via MCP
 
-The Crawl4AI server exposes two MCP endpoints:
+The Crawl4AI server exposes an MCP HTTP endpoint:
+
+- **FastMCP HTTP**: `http://localhost:11235/mcp`
 
-- **Server-Sent Events (SSE)**: `http://localhost:11235/mcp/sse`
-- **WebSocket**: `ws://localhost:11235/mcp/ws`
+> ⚠️ **Known limitation:** The FastMCP HTTP proxy does not yet forward JWT `Authorization`
+> headers. If `security.jwt_enabled=true`, MCP tool calls will fail authentication.
+> Until the auth-forwarding work lands, either
+> disable JWT for MCP usage or introduce an internal-only token/header that the
+> proxy can inject.
 
 ### Using with Claude Code
 
 You can add Crawl4AI as an MCP tool provider in Claude Code with a simple command:
 
 ```bash
-# Add the Crawl4AI server as an MCP provider
-claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
+# Add the Crawl4AI server as an MCP provider (HTTP transport)
+claude mcp add --transport http c4ai-http http://localhost:11235/mcp
 
 # List all MCP providers to verify it was added
 claude mcp list
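For a transport-level smoke test without Claude Code, one option is a raw JSON-RPC `initialize` POST. This is a sketch that assumes the endpoint follows MCP's streamable-HTTP conventions (JSON-RPC over POST, with both JSON and SSE accepted); the protocol version and client info are illustrative:

```python
import requests

# Hand-rolled MCP "initialize" handshake against the FastMCP HTTP endpoint.
resp = requests.post(
    "http://localhost:11235/mcp",
    headers={"Accept": "application/json, text/event-stream"},
    json={
        "jsonrpc": "2.0",
        "id": 1,
        "method": "initialize",
        "params": {
            "protocolVersion": "2025-03-26",  # assumed spec revision
            "capabilities": {},
            "clientInfo": {"name": "mcp-probe", "version": "0.1.0"},
        },
    },
    timeout=10,
)
print(resp.status_code)
print(resp.text[:500])  # body may be plain JSON or an SSE-framed message
```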
@@ -388,19 +416,25 @@ Generates a PDF document of the specified URL.
 POST /execute_js
 ```
 
-Executes JavaScript snippets on the specified URL and returns the full crawl result.
+Executes JavaScript snippets against a fresh instance of the target page and
+returns the resulting crawl data.
 
 ```json
 {
   "url": "https://example.com",
   "scripts": [
-    "return document.title",
-    "return Array.from(document.querySelectorAll('a')).map(a => a.href)"
+    "(() => { document.body.dataset.demo = 'set'; return true; })()",
+    "(async () => { await new Promise(r => setTimeout(r, 500)); window.snapshot = document.body.dataset.demo; })()"
   ]
 }
 ```
 
-- `scripts`: List of JavaScript snippets to execute sequentially
+- `scripts`: List of JavaScript expressions (typically self-invoking
+  functions) that run sequentially in the page context. There is no `page`
+  handle; use DOM APIs such as `document` or `window`.
+- Results only report success or errors—returned values are not surfaced. Run
+  all related snippets in a single call; each request creates and tears down a
+  fresh page.
 
 ---
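Putting the notes above together, a minimal client sketch; the port and URL are the defaults documented earlier, and the response shape beyond "crawl result" is not guaranteed here:

```python
import requests

# Each /execute_js request gets a fresh page, so related snippets must be
# batched into a single call, as below. Script return values are not
# surfaced; inspect the returned crawl result instead.
payload = {
    "url": "https://example.com",
    "scripts": [
        "(() => { document.body.dataset.demo = 'set'; return true; })()",
        "(() => { window.snapshot = document.body.dataset.demo; })()",
    ],
}
resp = requests.post("http://localhost:11235/execute_js", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())
```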

@@ -685,7 +719,7 @@ app:
   title: "Crawl4AI API"
   version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
   host: "0.0.0.0"
-  port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
+  port: 11235 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
   reload: False # Default set to False - suitable for production
   timeout_keep_alive: 300