Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions crawl4ai/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1230,9 +1230,21 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
click.echo(json.dumps(extracted_items, indent=2))

elif output in ["markdown", "md"]:
click.echo(main_result.markdown.raw_markdown)
if isinstance(result, list):
# Combine markdown from all crawled pages for deep crawl
for r in all_results:
click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
click.echo(r.markdown.raw_markdown)
else:
click.echo(main_result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
click.echo(main_result.markdown.fit_markdown)
if isinstance(result, list):
# Combine fit markdown from all crawled pages for deep crawl
for r in all_results:
click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
click.echo(r.markdown.fit_markdown)
else:
click.echo(main_result.markdown.fit_markdown)
else:
if output == "all":
with open(output_file, "w") as f:
Expand All @@ -1246,10 +1258,22 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
f.write(main_result.extracted_content)
elif output in ["markdown", "md"]:
with open(output_file, "w") as f:
f.write(main_result.markdown.raw_markdown)
if isinstance(result, list):
# Combine markdown from all crawled pages for deep crawl
for r in all_results:
f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
f.write(r.markdown.raw_markdown)
else:
f.write(main_result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
with open(output_file, "w") as f:
f.write(main_result.markdown.fit_markdown)
if isinstance(result, list):
# Combine fit markdown from all crawled pages for deep crawl
for r in all_results:
f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
f.write(r.markdown.fit_markdown)
else:
f.write(main_result.markdown.fit_markdown)

except Exception as e:
raise click.ClickException(str(e))
Expand Down
126 changes: 125 additions & 1 deletion tests/cli/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import pytest
from click.testing import CliRunner
from pathlib import Path
from unittest.mock import patch
import json
import yaml
from crawl4ai.cli import cli, load_config_file, parse_key_values
from crawl4ai.models import CrawlResult, MarkdownGenerationResult
import tempfile
import os
import click
Expand Down Expand Up @@ -129,5 +131,127 @@ def test_invalid_schema(self, runner, temp_config_dir):
])
assert result.exit_code != 0

class TestDeepCrawlOutput:
    """Tests for deep crawl output formatting"""

    @pytest.fixture
    def mock_crawl_results(self):
        """Create mock CrawlResult objects simulating deep crawl results"""
        # (url, markdown body) pairs for the three fake pages of the crawl.
        pages = [
            ("https://example.com/", "# Homepage\n\nWelcome to the homepage."),
            ("https://example.com/about", "# About\n\nAbout us page content."),
            ("https://example.com/contact", "# Contact\n\nContact information."),
        ]

        def build(url, body):
            # Populate both raw and fit markdown so either output mode
            # can be exercised against the same fixture.
            md = MarkdownGenerationResult(
                raw_markdown=body,
                markdown_with_citations=body,
                references_markdown="",
                fit_markdown=body,
            )
            crawled = CrawlResult(
                url=url,
                html=f"<html>{body}</html>",
                success=True,
                metadata={"depth": 0},
            )
            crawled._markdown = md
            return crawled

        return [build(u, b) for u, b in pages]

    def test_deep_crawl_markdown_output_includes_all_pages(self, runner, mock_crawl_results):
        """Test that deep crawl with markdown output includes all pages, not just the first"""
        with patch('crawl4ai.cli.anyio.run') as fake_run:
            # A deep crawl hands back a list of results, not a single one.
            fake_run.return_value = mock_crawl_results

            res = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown'
            ])

        assert res.exit_code == 0, f"CLI failed with: {res.output}"
        # Every crawled page's URL and body must appear in the output.
        expected = (
            'https://example.com/',
            'https://example.com/about',
            'https://example.com/contact',
            'Homepage',
            'About us page content',
            'Contact information',
        )
        for needle in expected:
            assert needle in res.output

    def test_deep_crawl_markdown_fit_output_includes_all_pages(self, runner, mock_crawl_results):
        """Test that deep crawl with markdown-fit output includes all pages"""
        with patch('crawl4ai.cli.anyio.run') as fake_run:
            fake_run.return_value = mock_crawl_results

            res = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown-fit'
            ])

        assert res.exit_code == 0, f"CLI failed with: {res.output}"
        # All three page URLs must be present, not just the first.
        for url in (
            'https://example.com/',
            'https://example.com/about',
            'https://example.com/contact',
        ):
            assert url in res.output

    def test_deep_crawl_file_output_includes_all_pages(self, runner, mock_crawl_results, tmp_path):
        """Test that deep crawl with file output includes all pages"""
        output_file = tmp_path / "output.md"

        with patch('crawl4ai.cli.anyio.run') as fake_run:
            fake_run.return_value = mock_crawl_results

            res = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '--deep-crawl', 'bfs',
                '--max-pages', '3',
                '-o', 'markdown',
                '-O', str(output_file)
            ])

        assert res.exit_code == 0, f"CLI failed with: {res.output}"
        written = output_file.read_text()
        # The written file must carry content from every crawled page.
        for url in (
            'https://example.com/',
            'https://example.com/about',
            'https://example.com/contact',
        ):
            assert url in written

    def test_single_crawl_markdown_output_unchanged(self, runner):
        """Test that single (non-deep) crawl still works correctly"""
        body = "# Single Page\n\nContent here."
        md = MarkdownGenerationResult(
            raw_markdown=body,
            markdown_with_citations=body,
            references_markdown="",
        )
        lone_result = CrawlResult(
            url="https://example.com/",
            html="<html>test</html>",
            success=True,
        )
        lone_result._markdown = md

        with patch('crawl4ai.cli.anyio.run') as fake_run:
            # A plain crawl returns a single result object (not a list).
            fake_run.return_value = lone_result

            res = runner.invoke(cli, [
                'crawl',
                'https://example.com',
                '-o', 'markdown'
            ])

        assert res.exit_code == 0, f"CLI failed with: {res.output}"
        assert '# Single Page' in res.output
        assert 'Content here' in res.output


if __name__ == '__main__':
    # Run this module's tests directly. The original file invoked
    # pytest.main twice (a duplicated line), executing the whole
    # suite a second time; run it exactly once.
    pytest.main(['-v', '-s', '--tb=native', __file__])