33 changes: 23 additions & 10 deletions crawl4ai/cli.py
@@ -521,11 +521,15 @@ async def crawl_with_profile_cli(profile_path, url):
# Run the crawler
result = await run_crawler(url, browser_cfg, crawler_cfg, True)

# Get JSON output config
config = get_global_config()
ensure_ascii = config.get("JSON_ENSURE_ASCII", USER_SETTINGS["JSON_ENSURE_ASCII"]["default"])

# Handle output
if output_format == "all":
console.print(json.dumps(result.model_dump(), indent=2))
console.print(json.dumps(result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
elif output_format == "json":
console.print(json.dumps(json.loads(result.extracted_content), indent=2))
console.print(json.dumps(json.loads(result.extracted_content), indent=2, ensure_ascii=ensure_ascii))
elif output_format in ["markdown", "md"]:
console.print(result.markdown.raw_markdown)
elif output_format == "title":
@@ -1019,9 +1023,10 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
@click.option("--json-ensure-ascii/--no-json-ensure-ascii", default=None, help="Escape non-ASCII characters in JSON output (default: from global config)")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int, json_ensure_ascii: Optional[bool]):
"""Crawl a website and extract content

Simple Usage:
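
The new `--json-ensure-ascii/--no-json-ensure-ascii` option uses Click's paired boolean flag form with `default=None`, giving three states: `True`, `False`, or `None` when the flag is omitted, so the command can distinguish "not specified" from an explicit choice. A standalone sketch of that pattern (the command name and messages are illustrative only):

```python
import click

@click.command()
@click.option("--json-ensure-ascii/--no-json-ensure-ascii", default=None,
              help="Escape non-ASCII characters in JSON output")
def demo(json_ensure_ascii):
    # None means neither flag was passed, so a config-level default can apply later
    if json_ensure_ascii is None:
        click.echo("flag omitted -> defer to global config")
    else:
        click.echo(f"flag given -> ensure_ascii={json_ensure_ascii}")

if __name__ == "__main__":
    demo()
```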
@@ -1187,6 +1192,12 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
browser_cfg.verbose = config.get("VERBOSE", False)
crawler_cfg.verbose = config.get("VERBOSE", False)

# Get JSON output config (priority: CLI flag > global config)
if json_ensure_ascii is not None:
ensure_ascii = json_ensure_ascii
else:
ensure_ascii = config.get("JSON_ENSURE_ASCII", USER_SETTINGS["JSON_ENSURE_ASCII"]["default"])

# Run crawler
result : CrawlResult = anyio.run(
run_crawler,
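
The block added above resolves the effective setting with a simple precedence: explicit CLI flag first, then the global config, then the built-in default. A self-contained sketch of that resolution order (the function and variable names here are illustrative, not from the PR):

```python
def resolve_ensure_ascii(cli_flag, global_config, built_in_default=True):
    """Precedence: explicit CLI flag > global config entry > built-in default."""
    if cli_flag is not None:  # user passed --json-ensure-ascii or --no-json-ensure-ascii
        return cli_flag
    return global_config.get("JSON_ENSURE_ASCII", built_in_default)

assert resolve_ensure_ascii(False, {"JSON_ENSURE_ASCII": True}) is False   # CLI flag wins
assert resolve_ensure_ascii(None,  {"JSON_ENSURE_ASCII": False}) is False  # config wins when flag omitted
assert resolve_ensure_ascii(None,  {}) is True                             # fall back to default
```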
@@ -1221,13 +1232,13 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
if output == "all":
if isinstance(result, list):
output_data = [r.model_dump() for r in all_results]
click.echo(json.dumps(output_data, indent=2))
click.echo(json.dumps(output_data, indent=2, ensure_ascii=ensure_ascii))
else:
click.echo(json.dumps(main_result.model_dump(), indent=2))
click.echo(json.dumps(main_result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
elif output == "json":
print(main_result.extracted_content)
extracted_items = json.loads(main_result.extracted_content)
click.echo(json.dumps(extracted_items, indent=2))
click.echo(json.dumps(extracted_items, indent=2, ensure_ascii=ensure_ascii))

elif output in ["markdown", "md"]:
click.echo(main_result.markdown.raw_markdown)
@@ -1238,9 +1249,9 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
with open(output_file, "w") as f:
if isinstance(result, list):
output_data = [r.model_dump() for r in all_results]
f.write(json.dumps(output_data, indent=2))
f.write(json.dumps(output_data, indent=2, ensure_ascii=ensure_ascii))
else:
f.write(json.dumps(main_result.model_dump(), indent=2))
f.write(json.dumps(main_result.model_dump(), indent=2, ensure_ascii=ensure_ascii))
elif output == "json":
with open(output_file, "w") as f:
f.write(main_result.extracted_content)
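
A side note on the file-output path above: with `ensure_ascii=False` the serialized JSON contains raw non-ASCII characters, so the encoding of the file handle matters; `open(path, "w")` uses the platform default, which is not always UTF-8. A small sketch of the safer pattern (the explicit `encoding="utf-8"` is a suggestion, not something this change adds):

```python
import json

data = {"title": "日本語のページ"}  # hypothetical payload outside the Latin-1 range

# ensure_ascii=False writes the characters verbatim, so pin the file encoding explicitly
with open("result.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data, indent=2, ensure_ascii=False))

# ensure_ascii=True produces pure-ASCII output, so any text encoding can represent it
with open("result_escaped.json", "w") as f:
    f.write(json.dumps(data, indent=2, ensure_ascii=True))
```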
@@ -1403,9 +1414,10 @@ def profiles_cmd():
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
@click.option("--json-ensure-ascii/--no-json-ensure-ascii", default=None, help="Escape non-ASCII characters in JSON output (default: from global config)")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int, json_ensure_ascii: Optional[bool]):
"""Crawl4AI CLI - Web content extraction tool

Simple Usage:
@@ -1457,7 +1469,8 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
verbose=verbose,
profile=profile,
deep_crawl=deep_crawl,
max_pages=max_pages
max_pages=max_pages,
json_ensure_ascii=json_ensure_ascii
)

def main():
5 changes: 5 additions & 0 deletions crawl4ai/config.py
@@ -142,5 +142,10 @@
"description": "Default user agent mode (default, random, or mobile)",
"type": "string",
"options": ["default", "random", "mobile"]
},
"JSON_ENSURE_ASCII": {
"default": True,
"description": "Whether to escape non-ASCII characters in JSON output (False preserves Unicode like 'š', True escapes as '\\u0161')",
"type": "boolean"
}
}
2 changes: 1 addition & 1 deletion docs/codebase/cli.md
@@ -10,7 +10,7 @@
| **config get** | `key` | Prints the value of a single setting, falls back to default if unset. |
| **config set** | `key value` | Persists a new value in the global config (stored under `~/.crawl4ai/config.yml`). |
| **examples** | – | Just spits out real-world CLI usage samples. |
| **crawl** | `url` *(positional)*<br>`--browser-config,-B` path<br>`--crawler-config,-C` path<br>`--filter-config,-f` path<br>`--extraction-config,-e` path<br>`--json-extract,-j` [desc]\*<br>`--schema,-s` path<br>`--browser,-b` k=v list<br>`--crawler,-c` k=v list<br>`--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)*<br>`--output-file,-O` path<br>`--bypass-cache,-b` *(flag, default true — note flag reuse)*<br>`--question,-q` str<br>`--verbose,-v` *(flag)*<br>`--profile,-p` profile-name | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. |
| **crawl** | `url` *(positional)*<br>`--browser-config,-B` path<br>`--crawler-config,-C` path<br>`--filter-config,-f` path<br>`--extraction-config,-e` path<br>`--json-extract,-j` [desc]\*<br>`--schema,-s` path<br>`--browser,-b` k=v list<br>`--crawler,-c` k=v list<br>`--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)*<br>`--output-file,-O` path<br>`--bypass-cache,-b` *(flag, default true — note flag reuse)*<br>`--question,-q` str<br>`--verbose,-v` *(flag)*<br>`--profile,-p` profile-name<br>`--json-ensure-ascii/--no-json-ensure-ascii` *(flag, default from global config)* | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. Unicode escaping in JSON output is controlled by the `--json-ensure-ascii/--no-json-ensure-ascii` flag pair. |
| **(default)** | Same flags as **crawl**, plus `--example` | Shortcut so you can type just `crwl https://site.com`. When first arg is not a known sub-command, it falls through to *crawl*. |

\* `--json-extract/-j` with no value turns on LLM-based JSON extraction using an auto schema, supplying a string lets you prompt-engineer the field descriptions.
15 changes: 15 additions & 0 deletions docs/md_v2/core/cli.md
@@ -235,6 +235,21 @@ crwl https://example.com -f filter_bm25.yml -o markdown-fit
- `markdown` / `md` - Raw markdown output
- `markdown-fit` / `md-fit` - Filtered markdown for better readability

### Unicode Handling in JSON Output

By default, non-ASCII characters in JSON output are escaped (e.g., 'š' becomes `\u0161`). You can control this behavior:

```bash
# Preserve Unicode characters in JSON output
crwl https://example.com -o all --no-json-ensure-ascii

# Escape non-ASCII characters (default behavior)
crwl https://example.com -o all --json-ensure-ascii

# Set global default
crwl config set JSON_ENSURE_ASCII false # Preserve Unicode by default
```
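
Whichever setting you pick, the escaping is lossless: escaped and verbatim JSON decode back to the same text, so the flag only affects how readable the raw output is. A quick standard-library check (illustrative, not part of the CLI):

```python
import json

escaped  = json.dumps("š", ensure_ascii=True)   # -> "\u0161"
verbatim = json.dumps("š", ensure_ascii=False)  # -> "š"

# Both forms parse back to the identical string; only the on-disk representation differs
assert json.loads(escaped) == json.loads(verbatim) == "š"
```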

## Complete Examples

1. Basic Extraction: