From 220a2246d30d3fc361448160ea976decf6c4ad1b Mon Sep 17 00:00:00 2001
From: Christian Oudard
Date: Wed, 10 Dec 2025 10:12:01 -0700
Subject: [PATCH] When using --deep-crawl, output all pages, not just the first one.

---
Review notes (commentary only; not part of the commit message):
- The deep-crawl tests match whole "# <url>" header lines. A bare
  'https://example.com/' substring is also satisfied by the /about and
  /contact URLs, so it could not detect a missing homepage section.
- The markdown-fit and file-output tests also assert each page's body text,
  so dropping the per-page body write no longer passes unnoticed.
- Line breaks and indentation were restored from the Python block structure,
  and the tests/cli/test_cli.py post-image blob id was not regenerated; run
  `git apply --check` before applying.

 crawl4ai/cli.py       |  32 ++++++++--
 tests/cli/test_cli.py | 134 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 161 insertions(+), 5 deletions(-)

diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 51b535002..a285afef0 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -1230,9 +1230,21 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
                 click.echo(json.dumps(extracted_items, indent=2))
 
             elif output in ["markdown", "md"]:
-                click.echo(main_result.markdown.raw_markdown)
+                if isinstance(result, list):
+                    # Combine markdown from all crawled pages for deep crawl
+                    for r in all_results:
+                        click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
+                        click.echo(r.markdown.raw_markdown)
+                else:
+                    click.echo(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
-                click.echo(main_result.markdown.fit_markdown)
+                if isinstance(result, list):
+                    # Combine fit markdown from all crawled pages for deep crawl
+                    for r in all_results:
+                        click.echo(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
+                        click.echo(r.markdown.fit_markdown)
+                else:
+                    click.echo(main_result.markdown.fit_markdown)
         else:
             if output == "all":
                 with open(output_file, "w") as f:
@@ -1246,10 +1258,22 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
                     f.write(main_result.extracted_content)
             elif output in ["markdown", "md"]:
                 with open(output_file, "w") as f:
-                    f.write(main_result.markdown.raw_markdown)
+                    if isinstance(result, list):
+                        # Combine markdown from all crawled pages for deep crawl
+                        for r in all_results:
+                            f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
+                            f.write(r.markdown.raw_markdown)
+                    else:
+                        f.write(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
                 with open(output_file, "w") as f:
-                    f.write(main_result.markdown.fit_markdown)
+                    if isinstance(result, list):
+                        # Combine fit markdown from all crawled pages for deep crawl
+                        for r in all_results:
+                            f.write(f"\n\n{'='*60}\n# {r.url}\n{'='*60}\n\n")
+                            f.write(r.markdown.fit_markdown)
+                    else:
+                        f.write(main_result.markdown.fit_markdown)
 
     except Exception as e:
         raise click.ClickException(str(e))
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
index b7416dc29..ed8f6d71a 100644
--- a/tests/cli/test_cli.py
+++ b/tests/cli/test_cli.py
@@ -1,9 +1,11 @@
 import pytest
 from click.testing import CliRunner
 from pathlib import Path
+from unittest.mock import patch
 import json
 import yaml
 from crawl4ai.cli import cli, load_config_file, parse_key_values
+from crawl4ai.models import CrawlResult, MarkdownGenerationResult
 import tempfile
 import os
 import click
@@ -129,5 +131,135 @@ def test_invalid_schema(self, runner, temp_config_dir):
         ])
         assert result.exit_code != 0
 
+class TestDeepCrawlOutput:
+    """Tests for deep crawl output formatting"""
+
+    @pytest.fixture
+    def mock_crawl_results(self):
+        """Create mock CrawlResult objects simulating deep crawl results"""
+        def make_result(url, content):
+            markdown = MarkdownGenerationResult(
+                raw_markdown=content,
+                markdown_with_citations=content,
+                references_markdown="",
+                fit_markdown=content,
+            )
+            result = CrawlResult(
+                url=url,
+                html=f"{content}",
+                success=True,
+                metadata={"depth": 0},
+            )
+            result._markdown = markdown
+            return result
+
+        return [
+            make_result("https://example.com/", "# Homepage\n\nWelcome to the homepage."),
+            make_result("https://example.com/about", "# About\n\nAbout us page content."),
+            make_result("https://example.com/contact", "# Contact\n\nContact information."),
+        ]
+
+    def test_deep_crawl_markdown_output_includes_all_pages(self, runner, mock_crawl_results):
+        """Test that deep crawl with markdown output includes all pages, not just the first"""
+        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
+            # Return list of results (simulating deep crawl)
+            mock_anyio_run.return_value = mock_crawl_results
+
+            result = runner.invoke(cli, [
+                'crawl',
+                'https://example.com',
+                '--deep-crawl', 'bfs',
+                '--max-pages', '3',
+                '-o', 'markdown'
+            ])
+
+        assert result.exit_code == 0, f"CLI failed with: {result.output}"
+        # Match whole "# <url>" header lines: the root URL is a prefix of the others
+        assert '# https://example.com/\n' in result.output
+        assert '# https://example.com/about\n' in result.output
+        assert '# https://example.com/contact\n' in result.output
+        assert 'Homepage' in result.output
+        assert 'About us page content' in result.output
+        assert 'Contact information' in result.output
+
+    def test_deep_crawl_markdown_fit_output_includes_all_pages(self, runner, mock_crawl_results):
+        """Test that deep crawl with markdown-fit output includes all pages"""
+        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
+            mock_anyio_run.return_value = mock_crawl_results
+
+            result = runner.invoke(cli, [
+                'crawl',
+                'https://example.com',
+                '--deep-crawl', 'bfs',
+                '--max-pages', '3',
+                '-o', 'markdown-fit'
+            ])
+
+        assert result.exit_code == 0, f"CLI failed with: {result.output}"
+        # Match whole "# <url>" header lines: the root URL is a prefix of the others
+        assert '# https://example.com/\n' in result.output
+        assert '# https://example.com/about\n' in result.output
+        assert '# https://example.com/contact\n' in result.output
+        # Every page's fit markdown body must be present, not only its header
+        assert 'Welcome to the homepage' in result.output
+        assert 'About us page content' in result.output
+        assert 'Contact information' in result.output
+
+    def test_deep_crawl_file_output_includes_all_pages(self, runner, mock_crawl_results, tmp_path):
+        """Test that deep crawl with file output includes all pages"""
+        output_file = tmp_path / "output.md"
+
+        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
+            mock_anyio_run.return_value = mock_crawl_results
+
+            result = runner.invoke(cli, [
+                'crawl',
+                'https://example.com',
+                '--deep-crawl', 'bfs',
+                '--max-pages', '3',
+                '-o', 'markdown',
+                '-O', str(output_file)
+            ])
+
+        assert result.exit_code == 0, f"CLI failed with: {result.output}"
+        content = output_file.read_text()
+        # Match whole "# <url>" header lines: the root URL is a prefix of the others
+        assert '# https://example.com/\n' in content
+        assert '# https://example.com/about\n' in content
+        assert '# https://example.com/contact\n' in content
+        # Every page's markdown body must be written to the file, not only its header
+        assert 'Welcome to the homepage' in content
+        assert 'About us page content' in content
+        assert 'Contact information' in content
+
+    def test_single_crawl_markdown_output_unchanged(self, runner):
+        """Test that single (non-deep) crawl still works correctly"""
+        markdown = MarkdownGenerationResult(
+            raw_markdown="# Single Page\n\nContent here.",
+            markdown_with_citations="# Single Page\n\nContent here.",
+            references_markdown="",
+        )
+        single_result = CrawlResult(
+            url="https://example.com/",
+            html="test",
+            success=True,
+        )
+        single_result._markdown = markdown
+
+        with patch('crawl4ai.cli.anyio.run') as mock_anyio_run:
+            # Return single result (not a list)
+            mock_anyio_run.return_value = single_result
+
+            result = runner.invoke(cli, [
+                'crawl',
+                'https://example.com',
+                '-o', 'markdown'
+            ])
+
+        assert result.exit_code == 0, f"CLI failed with: {result.output}"
+        assert '# Single Page' in result.output
+        assert 'Content here' in result.output
+
+
 if __name__ == '__main__':
-    pytest.main(['-v', '-s', '--tb=native', __file__])
\ No newline at end of file
+    pytest.main(['-v', '-s', '--tb=native', __file__])