diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
deleted file mode 100644
index 072659497..000000000
--- a/.github/FUNDING.yml
+++ /dev/null
@@ -1,7 +0,0 @@
-# These are supported funding model platforms
-
-# GitHub Sponsors
-github: unclecode
-
-# Custom links for enterprise inquiries (uncomment when ready)
-# custom: ["https://crawl4ai.com/enterprise"]
\ No newline at end of file
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 7366dad4d..1b4b84e13 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,19 +1,9 @@
-## Summary
-Please include a summary of the change and/or which issues are fixed.
+[sc-XStoryNumberXX](https://app.shortcut.com/coprocure/story/XStoryNumberXX/helpful-shortcut-but-still-need-tag-above-for-integration)
 
-eg: `Fixes #123` (Tag GitHub issue numbers in this format, so it automatically links the issues with your PR)
+### Summary
 
-## List of files changed and why
-eg: quickstart.py - To update the example as per new changes
+XX Replace me - What is the goal of this PR? What changes were made? XX
 
-## How Has This Been Tested?
-Please describe the tests that you ran to verify your changes.
+### Testing
 
-## Checklist:
-
-- [ ] My code follows the style guidelines of this project
-- [ ] I have performed a self-review of my own code
-- [ ] I have commented my code, particularly in hard-to-understand areas
-- [ ] I have made corresponding changes to the documentation
-- [ ] I have added/updated unit tests that prove my fix is effective or that my feature works
-- [ ] New and existing unit tests pass locally with my changes
+XX Replace me - Automated tests? Manual testing? XX
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
deleted file mode 100644
index b53eb7be4..000000000
--- a/.github/workflows/main.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-name: Discord GitHub Notifications
-
-on:
-  issues:
-    types: [opened]
-  issue_comment:
-    types: [created]
-  pull_request:
-    types: [opened]
-  discussion:
-    types: [created]
-  watch:
-    types: [started]
-
-jobs:
-  notify-discord:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Send to Google Apps Script (Stars only)
-        if: github.event_name == 'watch'
-        run: |
-          curl -fSs -X POST "${{ secrets.GOOGLE_SCRIPT_ENDPOINT }}" \
-            -H 'Content-Type: application/json' \
-            -d '{"url":"${{ github.event.sender.html_url }}"}'
-      - name: Set webhook based on event type
-        id: set-webhook
-        run: |
-          if [ "${{ github.event_name }}" == "discussion" ]; then
-            echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT
-          elif [ "${{ github.event_name }}" == "watch" ]; then
-            echo "webhook=${{ secrets.DISCORD_STAR_GAZERS }}" >> $GITHUB_OUTPUT
-          else
-            echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Discord Notification
-        uses: Ilshidur/action-discord@master
-        env:
-          DISCORD_WEBHOOK: ${{ steps.set-webhook.outputs.webhook }}
-        with:
-          args: |
-            ${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) ||
-            github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) ||
-            github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) ||
-            github.event_name == 'watch' && format('⭐ {0} starred Crawl4AI 🥳! Check out their profile: {1}', github.event.sender.login, github.event.sender.html_url) ||
-            format('💬 New discussion started: **{0}** by {1} - {2}', github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }}
diff --git a/.github/workflows/pr-ci.yaml b/.github/workflows/pr-ci.yaml
new file mode 100644
index 000000000..79201609e
--- /dev/null
+++ b/.github/workflows/pr-ci.yaml
@@ -0,0 +1,63 @@
+# Run this workflow when a pull request is created or activity
+# happens on the PR target branch.
+name: PR CI
+
+on: [pull_request]
+
+env:
+  PYTHON_VERSION: 3.13.10
+
+# Cancel jobs in progress when a new reference is pushed.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  # Cancel in progress CI for non-renovate PRs.
+  cancel-in-progress: ${{ github.actor != 'renovate[bot]' }}
+
+jobs:
+  # Lint step disabled until we format all files.
+  # lint:
+  #   runs-on: blacksmith-2vcpu-ubuntu-2404
+
+  #   steps:
+  #     - name: PR Checkout
+  #       uses: actions/checkout@v6
+  #       with:
+  #         fetch-depth: 0
+  #     - name: Lint Python Files
+  #       uses: astral-sh/ruff-action@v3
+  #       with:
+  #         version: "0.14.10" # keep in sync with pyproject.toml
+  #     - name: Format Python Files
+  #       run: ruff format --check
+
+  python_test:
+    runs-on: blacksmith-2vcpu-ubuntu-2404
+    env:
+      # Allow the Python interpreter version to be newer than PyO3's maximum supported version.
+      PYO3_USE_ABI3_FORWARD_COMPATIBILITY: 1
+
+    steps:
+      - name: PR Checkout
+        uses: actions/checkout@v6
+      - name: Install Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install \
+            libxml2-dev \
+            libxslt1-dev \
+            libjpeg-dev \
+            libgeos-dev
+      - name: Setup uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          activate-environment: true
+          enable-cache: true
+          cache-local-path: ".cache/.uv-cache"
+      - name: Install Playwright
+        run: uv run playwright install --with-deps chromium
+      - name: Python Tests
+        run: uv run pytest tests/
+        timeout-minutes: 10
+      - name: Minimize uv cache
+        run: uv cache prune --ci
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
deleted file mode 100644
index 3ee9042c2..000000000
--- a/.github/workflows/release.yml
+++ /dev/null
@@ -1,142 +0,0 @@
-name: Release Pipeline
-on:
-  push:
-    tags:
-      - 'v*'
-      - '!test-v*' # Exclude test tags
-
-jobs:
-  release:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write # Required for creating releases
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-
-      - name: Extract version from tag
-        id: get_version
-        run: |
-          TAG_VERSION=${GITHUB_REF#refs/tags/v}
-          echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
-          echo "Releasing version: $TAG_VERSION"
-
-      - name: Install package dependencies
-        run: |
-          pip install -e .
-
-      - name: Check version consistency
-        run: |
-          TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
-          PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
-
-          echo "Tag version: $TAG_VERSION"
-          echo "Package version: $PACKAGE_VERSION"
-
-          if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
-            echo "❌ Version mismatch!
Tag: $TAG_VERSION, Package: $PACKAGE_VERSION" - echo "Please update crawl4ai/__version__.py to match the tag version" - exit 1 - fi - echo "โœ… Version check passed: $TAG_VERSION" - - - name: Install build dependencies - run: | - python -m pip install --upgrade pip - pip install build twine - - - name: Build package - run: python -m build - - - name: Check package - run: twine check dist/* - - - name: Upload to PyPI - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - run: | - echo "๐Ÿ“ฆ Uploading to PyPI..." - twine upload dist/* - echo "โœ… Package uploaded to https://pypi.org/project/crawl4ai/" - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - - name: Extract major and minor versions - id: versions - run: | - VERSION=${{ steps.get_version.outputs.VERSION }} - MAJOR=$(echo $VERSION | cut -d. -f1) - MINOR=$(echo $VERSION | cut -d. -f1-2) - echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT - echo "MINOR=$MINOR" >> $GITHUB_OUTPUT - - - name: Build and push Docker images - uses: docker/build-push-action@v5 - with: - context: . - push: true - tags: | - unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }} - unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }} - unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }} - unclecode/crawl4ai:latest - platforms: linux/amd64,linux/arm64 - - - name: Create GitHub Release - uses: softprops/action-gh-release@v2 - with: - tag_name: v${{ steps.get_version.outputs.VERSION }} - name: Release v${{ steps.get_version.outputs.VERSION }} - body: | - ## ๐ŸŽ‰ Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released! - - ### ๐Ÿ“ฆ Installation - - **PyPI:** - ```bash - pip install crawl4ai==${{ steps.get_version.outputs.VERSION }} - ``` - - **Docker:** - ```bash - docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }} - docker pull unclecode/crawl4ai:latest - ``` - - ### ๐Ÿ“ What's Changed - See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details. - draft: false - prerelease: false - token: ${{ secrets.GITHUB_TOKEN }} - - - name: Summary - run: | - echo "## ๐Ÿš€ Release Complete!" 
>> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### ๐Ÿ“ฆ PyPI Package" >> $GITHUB_STEP_SUMMARY - echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY - echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY - echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### ๐Ÿณ Docker Images" >> $GITHUB_STEP_SUMMARY - echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY - echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY - echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY - echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### ๐Ÿ“‹ GitHub Release" >> $GITHUB_STEP_SUMMARY - echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/.github/workflows/test-release.yml.disabled b/.github/workflows/test-release.yml.disabled deleted file mode 100644 index ce4fb67f6..000000000 --- a/.github/workflows/test-release.yml.disabled +++ /dev/null @@ -1,116 +0,0 @@ -name: Test Release Pipeline -on: - push: - tags: - - 'test-v*' - -jobs: - test-release: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Extract version from tag - id: get_version - run: | - TAG_VERSION=${GITHUB_REF#refs/tags/test-v} - echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT - echo "Testing with version: $TAG_VERSION" - - - name: Install package dependencies - run: | - pip install -e . - - - name: Check version consistency - run: | - TAG_VERSION=${{ steps.get_version.outputs.VERSION }} - PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)") - - echo "Tag version: $TAG_VERSION" - echo "Package version: $PACKAGE_VERSION" - - if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then - echo "โŒ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION" - echo "Please update crawl4ai/__version__.py to match the tag version" - exit 1 - fi - echo "โœ… Version check passed: $TAG_VERSION" - - - name: Install build dependencies - run: | - python -m pip install --upgrade pip - pip install build twine - - - name: Build package - run: python -m build - - - name: Check package - run: twine check dist/* - - - name: Upload to Test PyPI - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }} - run: | - echo "๐Ÿ“ฆ Uploading to Test PyPI..." - twine upload --repository testpypi dist/* || { - if [ $? -eq 1 ]; then - echo "โš ๏ธ Upload failed - likely version already exists on Test PyPI" - echo "Continuing anyway for test purposes..." - else - exit 1 - fi - } - echo "โœ… Test PyPI step complete" - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - - name: Build and push Docker test images - uses: docker/build-push-action@v5 - with: - context: . 
-          push: true
-          tags: |
-            unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}
-            unclecode/crawl4ai:test-latest
-          platforms: linux/amd64,linux/arm64
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
-
-      - name: Summary
-        run: |
-          echo "## 🎉 Test Release Complete!" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "### 📦 Test PyPI Package" >> $GITHUB_STEP_SUMMARY
-          echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
-          echo "- URL: https://test.pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
-          echo "- Install: \`pip install -i https://test.pypi.org/simple/ crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "### 🐳 Docker Test Images" >> $GITHUB_STEP_SUMMARY
-          echo "- \`unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
-          echo "- \`unclecode/crawl4ai:test-latest\`" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "### 🧹 Cleanup Commands" >> $GITHUB_STEP_SUMMARY
-          echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
-          echo "# Remove test tag" >> $GITHUB_STEP_SUMMARY
-          echo "git tag -d test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
-          echo "git push origin :test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "# Remove Docker test images" >> $GITHUB_STEP_SUMMARY
-          echo "docker rmi unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
-          echo "docker rmi unclecode/crawl4ai:test-latest" >> $GITHUB_STEP_SUMMARY
-          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
diff --git a/.ruff.toml b/.ruff.toml
new file mode 100644
index 000000000..6c642cbb8
--- /dev/null
+++ b/.ruff.toml
@@ -0,0 +1,55 @@
+target-version = "py313"
+
+[lint]
+# Specific lint rules can be found at https://docs.astral.sh/ruff/rules
+select = [
+    # Pyflakes
+    "F",
+    # isort
+    "I",
+    "T20",
+    # ruff's default subset of codestyle rules that don't overlap with formatting.
+    "E4",
+    "E7",
+    "E9",
+    "PERF",
+    "SLF",
+    "NPY201",
+    # Enforce consistent rules when using "from __future__ import annotations"
+    "FA",
+    "UP",
+    # Enforce consistent, common imports, like `import pandas as pd` along with custom
+    # imports like `import brochure.models as app`.
+    "ICN",
+    # Enforce consistent return statements.
+    "RET"
+]
+
+[lint.per-file-ignores]
+# Ignore unused import and import * for init files
+"__init__.py" = ["F401", "F403"]
+# Ignore print statements for commands and private access
+"brochure/management/commands/*" = ["T20", "SLF"]
+"settings.py" = ["T20"]
+# Ignore perf linting in tests.
+"tests/*" = ["PERF", "SLF"]
+
+[lint.extend-per-file-ignores]
+"brochure/models/__init__.py" = ["I"]
+"brochure/migrations/*" = ["I"]
+
+[lint.isort]
+known-first-party = ["brochure"]
+known-third-party = ["newrelic"]
+section-order = ["future","pytest","standard-library","django","third-party","first-party","local-folder"]
+
+[lint.isort.sections]
+"django" = ["django"]
+"pytest" = ["pytest"]
+
+[lint.flake8-self]
+# Ignore a few accesses of private django internals.
+ignore-names = ["_prefetched_objects_cache", "_meta"] + +[lint.flake8-import-conventions.extend-aliases] +"brochure.models" = "app" \ No newline at end of file diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 6917f27e9..59b9451d1 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,110 +1,109 @@ # __init__.py import warnings -from .async_webcrawler import AsyncWebCrawler, CacheMode -# MODIFIED: Add SeedingConfig and VirtualScrollConfig here -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode +# Adaptive Crawler +from .adaptive_crawler import ( + AdaptiveConfig, + AdaptiveCrawler, + CrawlState, + CrawlStrategy, + StatisticalStrategy, +) -from .content_scraping_strategy import ( - ContentScrapingStrategy, - LXMLWebScrapingStrategy, - WebScrapingStrategy, # Backward compatibility alias +# MODIFIED: Add SeedingConfig and VirtualScrollConfig here +from .async_configs import ( + BrowserConfig, + CrawlerRunConfig, + GeolocationConfig, + HTTPCrawlerConfig, + LinkPreviewConfig, + LLMConfig, + MatchMode, + ProxyConfig, + SeedingConfig, + VirtualScrollConfig, +) +from .async_dispatcher import ( + BaseDispatcher, + MemoryAdaptiveDispatcher, + RateLimiter, + SemaphoreDispatcher, ) from .async_logger import ( - AsyncLoggerBase, AsyncLogger, + AsyncLoggerBase, ) -from .proxy_strategy import ( - ProxyRotationStrategy, - RoundRobinProxyStrategy, -) -from .extraction_strategy import ( - ExtractionStrategy, - LLMExtractionStrategy, - CosineStrategy, - JsonCssExtractionStrategy, - JsonXPathExtractionStrategy, - JsonLxmlExtractionStrategy, - RegexExtractionStrategy -) + +# NEW: Import AsyncUrlSeeder +from .async_url_seeder import AsyncUrlSeeder +from .async_webcrawler import AsyncWebCrawler, CacheMode + +# Browser Adapters +from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter +from .browser_profiler import BrowserProfiler from .chunking_strategy import ChunkingStrategy, RegexChunking -from .markdown_generation_strategy import DefaultMarkdownGenerator -from .table_extraction import ( - TableExtractionStrategy, - DefaultTableExtraction, - NoTableExtraction, - LLMTableExtraction, -) +from .components.crawler_monitor import CrawlerMonitor from .content_filter_strategy import ( - PruningContentFilter, BM25ContentFilter, LLMContentFilter, + PruningContentFilter, RelevantContentFilter, ) -from .models import CrawlResult, MarkdownGenerationResult, DisplayMode -from .components.crawler_monitor import CrawlerMonitor -from .link_preview import LinkPreview -from .async_dispatcher import ( - MemoryAdaptiveDispatcher, - SemaphoreDispatcher, - RateLimiter, - BaseDispatcher, +from .content_scraping_strategy import ( + ContentScrapingStrategy, + LXMLWebScrapingStrategy, + WebScrapingStrategy, # Backward compatibility alias ) -from .docker_client import Crawl4aiDockerClient -from .hub import CrawlerHub -from .browser_profiler import BrowserProfiler from .deep_crawling import ( - DeepCrawlStrategy, + BestFirstCrawlingStrategy, BFSDeepCrawlStrategy, - FilterChain, - URLPatternFilter, - DomainFilter, - ContentTypeFilter, - URLFilter, - FilterStats, - SEOFilter, - KeywordRelevanceScorer, - URLScorer, CompositeScorer, + ContentTypeFilter, + DeepCrawlDecorator, + DeepCrawlStrategy, + DFSDeepCrawlStrategy, DomainAuthorityScorer, + DomainFilter, + FilterChain, + FilterStats, FreshnessScorer, + KeywordRelevanceScorer, PathDepthScorer, - 
BestFirstCrawlingStrategy, - DFSDeepCrawlStrategy, - DeepCrawlDecorator, -) -# NEW: Import AsyncUrlSeeder -from .async_url_seeder import AsyncUrlSeeder -# Adaptive Crawler -from .adaptive_crawler import ( - AdaptiveCrawler, - AdaptiveConfig, - CrawlState, - CrawlStrategy, - StatisticalStrategy + SEOFilter, + URLFilter, + URLPatternFilter, + URLScorer, ) - -# C4A Script Language Support -from .script import ( - compile as c4a_compile, - validate as c4a_validate, - compile_file as c4a_compile_file, - CompilationResult, - ValidationResult, - ErrorDetail +from .docker_client import Crawl4aiDockerClient +from .extraction_strategy import ( + CosineStrategy, + ExtractionStrategy, + JsonCssExtractionStrategy, + JsonLxmlExtractionStrategy, + JsonXPathExtractionStrategy, + LLMExtractionStrategy, + RegexExtractionStrategy, ) - -# Browser Adapters -from .browser_adapter import ( - BrowserAdapter, - PlaywrightAdapter, - UndetectedAdapter +from .link_preview import LinkPreview +from .markdown_generation_strategy import DefaultMarkdownGenerator +from .models import CrawlResult, DisplayMode, MarkdownGenerationResult +from .proxy_strategy import ( + ProxyRotationStrategy, + RoundRobinProxyStrategy, ) +from .script import CompilationResult, ErrorDetail, ValidationResult -from .utils import ( - start_colab_display_server, - setup_colab_environment +# C4A Script Language Support +from .script import compile as c4a_compile +from .script import compile_file as c4a_compile_file +from .script import validate as c4a_validate +from .table_extraction import ( + DefaultTableExtraction, + LLMTableExtraction, + NoTableExtraction, + TableExtractionStrategy, ) +from .utils import setup_colab_environment, start_colab_display_server __all__ = [ "AsyncLoggerBase", @@ -143,7 +142,6 @@ "PathDepthScorer", "DeepCrawlDecorator", "CrawlResult", - "CrawlerHub", "CacheMode", "MatchMode", "ContentScrapingStrategy", diff --git a/crawl4ai/adaptive_crawler copy.py b/crawl4ai/adaptive_crawler copy.py deleted file mode 100644 index 294a292d4..000000000 --- a/crawl4ai/adaptive_crawler copy.py +++ /dev/null @@ -1,1847 +0,0 @@ -""" -Adaptive Web Crawler for Crawl4AI - -This module implements adaptive information foraging for efficient web crawling. -It determines when sufficient information has been gathered to answer a query, -avoiding unnecessary crawls while ensuring comprehensive coverage. 
-""" - -from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Set, Tuple, Any, Union -from dataclasses import dataclass, field -import asyncio -import pickle -import os -import json -import math -from collections import defaultdict, Counter -import re -from pathlib import Path - -from crawl4ai.async_webcrawler import AsyncWebCrawler -from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig -from crawl4ai.models import Link, CrawlResult - - -@dataclass -class CrawlState: - """Tracks the current state of adaptive crawling""" - crawled_urls: Set[str] = field(default_factory=set) - knowledge_base: List[CrawlResult] = field(default_factory=list) - pending_links: List[Link] = field(default_factory=list) - query: str = "" - metrics: Dict[str, float] = field(default_factory=dict) - - # Statistical tracking - term_frequencies: Dict[str, int] = field(default_factory=lambda: defaultdict(int)) - document_frequencies: Dict[str, int] = field(default_factory=lambda: defaultdict(int)) - documents_with_terms: Dict[str, Set[int]] = field(default_factory=lambda: defaultdict(set)) - total_documents: int = 0 - - # History tracking for saturation - new_terms_history: List[int] = field(default_factory=list) - crawl_order: List[str] = field(default_factory=list) - - # Embedding-specific tracking (only if strategy is embedding) - kb_embeddings: Optional[Any] = None # Will be numpy array - query_embeddings: Optional[Any] = None # Will be numpy array - expanded_queries: List[str] = field(default_factory=list) - coverage_shape: Optional[Any] = None # Alpha shape - semantic_gaps: List[Tuple[List[float], float]] = field(default_factory=list) # Serializable - embedding_model: str = "" - - def save(self, path: Union[str, Path]): - """Save state to disk for persistence""" - path = Path(path) - path.parent.mkdir(parents=True, exist_ok=True) - - # Convert CrawlResult objects to dicts for serialization - state_dict = { - 'crawled_urls': list(self.crawled_urls), - 'knowledge_base': [self._crawl_result_to_dict(cr) for cr in self.knowledge_base], - 'pending_links': [link.model_dump() for link in self.pending_links], - 'query': self.query, - 'metrics': self.metrics, - 'term_frequencies': dict(self.term_frequencies), - 'document_frequencies': dict(self.document_frequencies), - 'documents_with_terms': {k: list(v) for k, v in self.documents_with_terms.items()}, - 'total_documents': self.total_documents, - 'new_terms_history': self.new_terms_history, - 'crawl_order': self.crawl_order, - # Embedding-specific fields (convert numpy arrays to lists for JSON) - 'kb_embeddings': self.kb_embeddings.tolist() if self.kb_embeddings is not None else None, - 'query_embeddings': self.query_embeddings.tolist() if self.query_embeddings is not None else None, - 'expanded_queries': self.expanded_queries, - 'semantic_gaps': self.semantic_gaps, - 'embedding_model': self.embedding_model - } - - with open(path, 'w') as f: - json.dump(state_dict, f, indent=2) - - @classmethod - def load(cls, path: Union[str, Path]) -> 'CrawlState': - """Load state from disk""" - path = Path(path) - with open(path, 'r') as f: - state_dict = json.load(f) - - state = cls() - state.crawled_urls = set(state_dict['crawled_urls']) - state.knowledge_base = [cls._dict_to_crawl_result(d) for d in state_dict['knowledge_base']] - state.pending_links = [Link(**link_dict) for link_dict in state_dict['pending_links']] - state.query = state_dict['query'] - state.metrics = state_dict['metrics'] - state.term_frequencies = defaultdict(int, 
state_dict['term_frequencies']) - state.document_frequencies = defaultdict(int, state_dict['document_frequencies']) - state.documents_with_terms = defaultdict(set, {k: set(v) for k, v in state_dict['documents_with_terms'].items()}) - state.total_documents = state_dict['total_documents'] - state.new_terms_history = state_dict['new_terms_history'] - state.crawl_order = state_dict['crawl_order'] - - # Load embedding-specific fields (convert lists back to numpy arrays) - import numpy as np - state.kb_embeddings = np.array(state_dict['kb_embeddings']) if state_dict.get('kb_embeddings') is not None else None - state.query_embeddings = np.array(state_dict['query_embeddings']) if state_dict.get('query_embeddings') is not None else None - state.expanded_queries = state_dict.get('expanded_queries', []) - state.semantic_gaps = state_dict.get('semantic_gaps', []) - state.embedding_model = state_dict.get('embedding_model', '') - - return state - - @staticmethod - def _crawl_result_to_dict(cr: CrawlResult) -> Dict: - """Convert CrawlResult to serializable dict""" - # Extract markdown content safely - markdown_content = "" - if hasattr(cr, 'markdown') and cr.markdown: - if hasattr(cr.markdown, 'raw_markdown'): - markdown_content = cr.markdown.raw_markdown - else: - markdown_content = str(cr.markdown) - - return { - 'url': cr.url, - 'content': markdown_content, - 'links': cr.links if hasattr(cr, 'links') else {}, - 'metadata': cr.metadata if hasattr(cr, 'metadata') else {} - } - - @staticmethod - def _dict_to_crawl_result(d: Dict): - """Convert dict back to CrawlResult""" - # Create a mock object that has the minimal interface we need - class MockMarkdown: - def __init__(self, content): - self.raw_markdown = content - - class MockCrawlResult: - def __init__(self, url, content, links, metadata): - self.url = url - self.markdown = MockMarkdown(content) - self.links = links - self.metadata = metadata - - return MockCrawlResult( - url=d['url'], - content=d.get('content', ''), - links=d.get('links', {}), - metadata=d.get('metadata', {}) - ) - - -@dataclass -class AdaptiveConfig: - """Configuration for adaptive crawling""" - confidence_threshold: float = 0.7 - max_depth: int = 5 - max_pages: int = 20 - top_k_links: int = 3 - min_gain_threshold: float = 0.1 - strategy: str = "statistical" # statistical, embedding, llm - - # Advanced parameters - saturation_threshold: float = 0.8 - consistency_threshold: float = 0.7 - coverage_weight: float = 0.4 - consistency_weight: float = 0.3 - saturation_weight: float = 0.3 - - # Link scoring parameters - relevance_weight: float = 0.5 - novelty_weight: float = 0.3 - authority_weight: float = 0.2 - - # Persistence - save_state: bool = False - state_path: Optional[str] = None - - # Embedding strategy parameters - embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2" - embedding_llm_config: Optional[Dict] = None # Separate config for embeddings - n_query_variations: int = 10 - coverage_threshold: float = 0.85 - alpha_shape_alpha: float = 0.5 - - # Embedding confidence calculation parameters - embedding_coverage_radius: float = 0.2 # Distance threshold for "covered" query points - # Example: With radius=0.2, a query point is considered covered if ANY document - # is within cosine distance 0.2 (very similar). Smaller = stricter coverage requirement - - embedding_k_exp: float = 3.0 # Exponential decay factor for distance-to-score mapping - # Example: score = exp(-k_exp * distance). With k_exp=1, distance 0.2 โ†’ score 0.82, - # distance 0.5 โ†’ score 0.61. 
Higher k_exp = steeper decay = more emphasis on very close matches - - embedding_nearest_weight: float = 0.7 # Weight for nearest neighbor in hybrid scoring - embedding_top_k_weight: float = 0.3 # Weight for top-k average in hybrid scoring - # Example: If nearest doc has score 0.9 and top-3 avg is 0.6, final = 0.7*0.9 + 0.3*0.6 = 0.81 - # Higher nearest_weight = more focus on best match vs neighborhood density - - # Embedding link selection parameters - embedding_overlap_threshold: float = 0.85 # Similarity threshold for penalizing redundant links - # Example: Links with >0.85 similarity to existing KB get penalized to avoid redundancy - # Lower = more aggressive deduplication, Higher = allow more similar content - - # Embedding stopping criteria parameters - embedding_min_relative_improvement: float = 0.1 # Minimum relative improvement to continue - # Example: If confidence is 0.6, need improvement > 0.06 per batch to continue crawling - # Lower = more patient crawling, Higher = stop earlier when progress slows - - embedding_validation_min_score: float = 0.4 # Minimum validation score to trust convergence - # Example: Even if learning converged, keep crawling if validation score < 0.4 - # This prevents premature stopping when we haven't truly covered the query space - - # Quality confidence mapping parameters (for display to user) - embedding_quality_min_confidence: float = 0.7 # Minimum confidence for validated systems - embedding_quality_max_confidence: float = 0.95 # Maximum realistic confidence - embedding_quality_scale_factor: float = 0.833 # Scaling factor for confidence mapping - # Example: Validated system with learning_score=0.5 โ†’ confidence = 0.7 + (0.5-0.4)*0.833 = 0.78 - # These control how internal scores map to user-friendly confidence percentages - - def validate(self): - """Validate configuration parameters""" - assert 0 <= self.confidence_threshold <= 1, "confidence_threshold must be between 0 and 1" - assert self.max_depth > 0, "max_depth must be positive" - assert self.max_pages > 0, "max_pages must be positive" - assert self.top_k_links > 0, "top_k_links must be positive" - assert 0 <= self.min_gain_threshold <= 1, "min_gain_threshold must be between 0 and 1" - - # Check weights sum to 1 - weight_sum = self.coverage_weight + self.consistency_weight + self.saturation_weight - assert abs(weight_sum - 1.0) < 0.001, f"Coverage weights must sum to 1, got {weight_sum}" - - weight_sum = self.relevance_weight + self.novelty_weight + self.authority_weight - assert abs(weight_sum - 1.0) < 0.001, f"Link scoring weights must sum to 1, got {weight_sum}" - - # Validate embedding parameters - assert 0 < self.embedding_coverage_radius < 1, "embedding_coverage_radius must be between 0 and 1" - assert self.embedding_k_exp > 0, "embedding_k_exp must be positive" - assert 0 <= self.embedding_nearest_weight <= 1, "embedding_nearest_weight must be between 0 and 1" - assert 0 <= self.embedding_top_k_weight <= 1, "embedding_top_k_weight must be between 0 and 1" - assert abs(self.embedding_nearest_weight + self.embedding_top_k_weight - 1.0) < 0.001, "Embedding weights must sum to 1" - assert 0 <= self.embedding_overlap_threshold <= 1, "embedding_overlap_threshold must be between 0 and 1" - assert 0 < self.embedding_min_relative_improvement < 1, "embedding_min_relative_improvement must be between 0 and 1" - assert 0 <= self.embedding_validation_min_score <= 1, "embedding_validation_min_score must be between 0 and 1" - assert 0 <= self.embedding_quality_min_confidence <= 1, 
"embedding_quality_min_confidence must be between 0 and 1" - assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1" - assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive" - - -class CrawlStrategy(ABC): - """Abstract base class for crawling strategies""" - - @abstractmethod - async def calculate_confidence(self, state: CrawlState) -> float: - """Calculate overall confidence that we have sufficient information""" - pass - - @abstractmethod - async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: - """Rank pending links by expected information gain""" - pass - - @abstractmethod - async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: - """Determine if crawling should stop""" - pass - - @abstractmethod - async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: - """Update state with new crawl results""" - pass - - -class StatisticalStrategy(CrawlStrategy): - """Pure statistical approach - no LLM, no embeddings""" - - def __init__(self): - self.idf_cache = {} - self.bm25_k1 = 1.2 # BM25 parameter - self.bm25_b = 0.75 # BM25 parameter - - async def calculate_confidence(self, state: CrawlState) -> float: - """Calculate confidence using coverage, consistency, and saturation""" - if not state.knowledge_base: - return 0.0 - - coverage = self._calculate_coverage(state) - consistency = self._calculate_consistency(state) - saturation = self._calculate_saturation(state) - - # Store individual metrics - state.metrics['coverage'] = coverage - state.metrics['consistency'] = consistency - state.metrics['saturation'] = saturation - - # Weighted combination (weights from config not accessible here, using defaults) - confidence = 0.4 * coverage + 0.3 * consistency + 0.3 * saturation - - return confidence - - def _calculate_coverage(self, state: CrawlState) -> float: - """Coverage scoring - measures query term presence across knowledge base - - Returns a score between 0 and 1, where: - - 0 means no query terms found - - 1 means excellent coverage of all query terms - """ - if not state.query or state.total_documents == 0: - return 0.0 - - query_terms = self._tokenize(state.query.lower()) - if not query_terms: - return 0.0 - - term_scores = [] - max_tf = max(state.term_frequencies.values()) if state.term_frequencies else 1 - - for term in query_terms: - tf = state.term_frequencies.get(term, 0) - df = state.document_frequencies.get(term, 0) - - if df > 0: - # Document coverage: what fraction of docs contain this term - doc_coverage = df / state.total_documents - - # Frequency signal: normalized log frequency - freq_signal = math.log(1 + tf) / math.log(1 + max_tf) if max_tf > 0 else 0 - - # Combined score: document coverage with frequency boost - term_score = doc_coverage * (1 + 0.5 * freq_signal) - term_scores.append(term_score) - else: - term_scores.append(0.0) - - # Average across all query terms - coverage = sum(term_scores) / len(term_scores) - - # Apply square root curve to make score more intuitive - # This helps differentiate between partial and good coverage - return min(1.0, math.sqrt(coverage)) - - def _calculate_consistency(self, state: CrawlState) -> float: - """Information overlap between pages - high overlap suggests coherent topic coverage""" - if len(state.knowledge_base) < 2: - return 1.0 # Single or no documents are perfectly consistent - - # Calculate pairwise term overlap - overlaps = [] - - 
for i in range(len(state.knowledge_base)): - for j in range(i + 1, len(state.knowledge_base)): - # Get terms from both documents - terms_i = set(self._get_document_terms(state.knowledge_base[i])) - terms_j = set(self._get_document_terms(state.knowledge_base[j])) - - if terms_i and terms_j: - # Jaccard similarity - overlap = len(terms_i & terms_j) / len(terms_i | terms_j) - overlaps.append(overlap) - - if overlaps: - # Average overlap as consistency measure - consistency = sum(overlaps) / len(overlaps) - else: - consistency = 0.0 - - return consistency - - def _calculate_saturation(self, state: CrawlState) -> float: - """Diminishing returns indicator - are we still discovering new information?""" - if not state.new_terms_history: - return 0.0 - - if len(state.new_terms_history) < 2: - return 0.0 # Not enough history - - # Calculate rate of new term discovery - recent_rate = state.new_terms_history[-1] if state.new_terms_history[-1] > 0 else 1 - initial_rate = state.new_terms_history[0] if state.new_terms_history[0] > 0 else 1 - - # Saturation increases as rate decreases - saturation = 1 - (recent_rate / initial_rate) - - return max(0.0, min(saturation, 1.0)) - - async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: - """Rank links by expected information gain""" - scored_links = [] - - for link in state.pending_links: - # Skip already crawled URLs - if link.href in state.crawled_urls: - continue - - # Calculate component scores - relevance = self._calculate_relevance(link, state) - novelty = self._calculate_novelty(link, state) - authority = 1.0 - # authority = self._calculate_authority(link) - - # Combined score - score = (config.relevance_weight * relevance + - config.novelty_weight * novelty + - config.authority_weight * authority) - - scored_links.append((link, score)) - - # Sort by score descending - scored_links.sort(key=lambda x: x[1], reverse=True) - - return scored_links - - def _calculate_relevance(self, link: Link, state: CrawlState) -> float: - """BM25 relevance score between link preview and query""" - if not state.query or not link: - return 0.0 - - # Combine available text from link - link_text = ' '.join(filter(None, [ - link.text or '', - link.title or '', - link.head_data.get('meta', {}).get('title', '') if link.head_data else '', - link.head_data.get('meta', {}).get('description', '') if link.head_data else '', - link.head_data.get('meta', {}).get('keywords', '') if link.head_data else '' - ])).lower() - - if not link_text: - return 0.0 - - # Use contextual score if available (from BM25 scoring during crawl) - # if link.contextual_score is not None: - if link.contextual_score and link.contextual_score > 0: - return link.contextual_score - - # Otherwise, calculate simple term overlap - query_terms = set(self._tokenize(state.query.lower())) - link_terms = set(self._tokenize(link_text)) - - if not query_terms: - return 0.0 - - overlap = len(query_terms & link_terms) / len(query_terms) - return overlap - - def _calculate_novelty(self, link: Link, state: CrawlState) -> float: - """Estimate how much new information this link might provide""" - if not state.knowledge_base: - return 1.0 # First links are maximally novel - - # Get terms from link preview - link_text = ' '.join(filter(None, [ - link.text or '', - link.title or '', - link.head_data.get('title', '') if link.head_data else '', - link.head_data.get('description', '') if link.head_data else '', - link.head_data.get('keywords', '') if link.head_data else '' - ])).lower() - - 
link_terms = set(self._tokenize(link_text)) - if not link_terms: - return 0.5 # Unknown novelty - - # Calculate what percentage of link terms are new - existing_terms = set(state.term_frequencies.keys()) - new_terms = link_terms - existing_terms - - novelty = len(new_terms) / len(link_terms) if link_terms else 0.0 - - return novelty - - def _calculate_authority(self, link: Link) -> float: - """Simple authority score based on URL structure and link attributes""" - score = 0.5 # Base score - - if not link.href: - return 0.0 - - url = link.href.lower() - - # Positive indicators - if '/docs/' in url or '/documentation/' in url: - score += 0.2 - if '/api/' in url or '/reference/' in url: - score += 0.2 - if '/guide/' in url or '/tutorial/' in url: - score += 0.1 - - # Check for file extensions - if url.endswith('.pdf'): - score += 0.1 - elif url.endswith(('.jpg', '.png', '.gif')): - score -= 0.3 # Reduce score for images - - # Use intrinsic score if available - if link.intrinsic_score is not None: - score = 0.7 * score + 0.3 * link.intrinsic_score - - return min(score, 1.0) - - async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: - """Determine if crawling should stop""" - # Check confidence threshold - confidence = state.metrics.get('confidence', 0.0) - if confidence >= config.confidence_threshold: - return True - - # Check resource limits - if len(state.crawled_urls) >= config.max_pages: - return True - - # Check if we have any links left - if not state.pending_links: - return True - - # Check saturation - if state.metrics.get('saturation', 0.0) >= config.saturation_threshold: - return True - - return False - - async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: - """Update state with new crawl results""" - for result in new_results: - # Track new terms - old_term_count = len(state.term_frequencies) - - # Extract and process content - try multiple fields - try: - content = result.markdown.raw_markdown - except AttributeError: - print(f"Warning: CrawlResult {result.url} has no markdown content") - content = "" - # content = "" - # if hasattr(result, 'extracted_content') and result.extracted_content: - # content = result.extracted_content - # elif hasattr(result, 'markdown') and result.markdown: - # content = result.markdown.raw_markdown - # elif hasattr(result, 'cleaned_html') and result.cleaned_html: - # content = result.cleaned_html - # elif hasattr(result, 'html') and result.html: - # # Use raw HTML as last resort - # content = result.html - - - terms = self._tokenize(content.lower()) - - # Update term frequencies - term_set = set() - for term in terms: - state.term_frequencies[term] += 1 - term_set.add(term) - - # Update document frequencies - doc_id = state.total_documents - for term in term_set: - if term not in state.documents_with_terms[term]: - state.document_frequencies[term] += 1 - state.documents_with_terms[term].add(doc_id) - - # Track new terms discovered - new_term_count = len(state.term_frequencies) - new_terms = new_term_count - old_term_count - state.new_terms_history.append(new_terms) - - # Update document count - state.total_documents += 1 - - # Add to crawl order - state.crawl_order.append(result.url) - - def _tokenize(self, text: str) -> List[str]: - """Simple tokenization - can be enhanced""" - # Remove punctuation and split - text = re.sub(r'[^\w\s]', ' ', text) - tokens = text.split() - - # Filter short tokens and stop words (basic) - tokens = [t for t in tokens if len(t) > 2] - - return tokens - - def 
_get_document_terms(self, crawl_result: CrawlResult) -> List[str]: - """Extract terms from a crawl result""" - content = crawl_result.markdown.raw_markdown or "" - return self._tokenize(content.lower()) - - -class EmbeddingStrategy(CrawlStrategy): - """Embedding-based adaptive crawling using semantic space coverage""" - - def __init__(self, embedding_model: str = None, llm_config: Dict = None): - self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2" - self.llm_config = llm_config - self._embedding_cache = {} - self._link_embedding_cache = {} # Cache for link embeddings - self._validation_passed = False # Track if validation passed - - # Performance optimization caches - self._distance_matrix_cache = None # Cache for query-KB distances - self._kb_embeddings_hash = None # Track KB changes - self._validation_embeddings_cache = None # Cache validation query embeddings - self._kb_similarity_threshold = 0.95 # Threshold for deduplication - - async def _get_embeddings(self, texts: List[str]) -> Any: - """Get embeddings using configured method""" - from .utils import get_text_embeddings - embedding_llm_config = { - 'provider': 'openai/text-embedding-3-small', - 'api_token': os.getenv('OPENAI_API_KEY') - } - return await get_text_embeddings( - texts, - embedding_llm_config, - self.embedding_model - ) - - def _compute_distance_matrix(self, query_embeddings: Any, kb_embeddings: Any) -> Any: - """Compute distance matrix using vectorized operations""" - import numpy as np - - if kb_embeddings is None or len(kb_embeddings) == 0: - return None - - # Ensure proper shapes - if len(query_embeddings.shape) == 1: - query_embeddings = query_embeddings.reshape(1, -1) - if len(kb_embeddings.shape) == 1: - kb_embeddings = kb_embeddings.reshape(1, -1) - - # Vectorized cosine distance: 1 - cosine_similarity - # Normalize vectors - query_norm = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True) - kb_norm = kb_embeddings / np.linalg.norm(kb_embeddings, axis=1, keepdims=True) - - # Compute cosine similarity matrix - similarity_matrix = np.dot(query_norm, kb_norm.T) - - # Convert to distance - distance_matrix = 1 - similarity_matrix - - return distance_matrix - - def _get_cached_distance_matrix(self, query_embeddings: Any, kb_embeddings: Any) -> Any: - """Get distance matrix with caching""" - import numpy as np - - if kb_embeddings is None or len(kb_embeddings) == 0: - return None - - # Check if KB has changed - kb_hash = hash(kb_embeddings.tobytes()) if kb_embeddings is not None else None - - if (self._distance_matrix_cache is None or - kb_hash != self._kb_embeddings_hash): - # Recompute matrix - self._distance_matrix_cache = self._compute_distance_matrix(query_embeddings, kb_embeddings) - self._kb_embeddings_hash = kb_hash - - return self._distance_matrix_cache - - async def map_query_semantic_space(self, query: str, n_synthetic: int = 10) -> Any: - """Generate a point cloud representing the semantic neighborhood of the query""" - from .utils import perform_completion_with_backoff - - # Generate more variations than needed for train/val split - n_total = int(n_synthetic * 1.3) # Generate 30% more for validation - - # Generate variations using LLM - prompt = f"""Generate {n_total} variations of this query that explore different aspects: '{query}' - - These should be queries a user might ask when looking for similar information. - Include different phrasings, related concepts, and specific aspects. 
- - Return as a JSON array of strings.""" - - # Use the LLM for query generation - provider = self.llm_config.get('provider', 'openai/gpt-4o-mini') if self.llm_config else 'openai/gpt-4o-mini' - api_token = self.llm_config.get('api_token') if self.llm_config else None - - # response = perform_completion_with_backoff( - # provider=provider, - # prompt_with_variables=prompt, - # api_token=api_token, - # json_response=True - # ) - - # variations = json.loads(response.choices[0].message.content) - - - # # Mock data with more variations for split - variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']} - - - variations = {'queries': [ - 'How do async and await work with coroutines in Python?', - 'What is the role of event loops in asynchronous programming?', - 'Can you explain the differences between async/await and traditional callback methods?', - 'How do coroutines interact with event loops in JavaScript?', - 'What are the benefits of using async await over promises in Node.js?', - # 'How to manage multiple coroutines with an event loop?', - # 'What are some common pitfalls when using async await with coroutines?', - # 'How do different programming languages implement async await and event loops?', - # 'What happens when an async function is called without await?', - # 'How does the event loop handle blocking operations?', - 'Can you nest async functions and how does that affect the event loop?', - 'What is the performance impact of using async/await?' - ]} - - # Split into train and validation - # all_queries = [query] + variations['queries'] - - # Randomly shuffle for proper train/val split (keeping original query in training) - import random - - # Keep original query always in training - other_queries = variations['queries'].copy() - random.shuffle(other_queries) - - # Split: 80% for training, 20% for validation - n_validation = max(2, int(len(other_queries) * 0.2)) # At least 2 for validation - val_queries = other_queries[-n_validation:] - train_queries = [query] + other_queries[:-n_validation] - - # Embed only training queries for now (faster) - train_embeddings = await self._get_embeddings(train_queries) - - # Store validation queries for later (don't embed yet to save time) - self._validation_queries = val_queries - - return train_embeddings, train_queries - - def compute_coverage_shape(self, query_points: Any, alpha: float = 0.5): - """Find the minimal shape that covers all query points using alpha shape""" - try: - import numpy as np - - if len(query_points) < 3: - return None - - # For high-dimensional embeddings (e.g., 384-dim, 768-dim), - # alpha shapes require exponentially more points than available. 
- # Instead, use a statistical coverage model - query_points = np.array(query_points) - - # Store coverage as centroid + radius model - coverage = { - 'center': np.mean(query_points, axis=0), - 'std': np.std(query_points, axis=0), - 'points': query_points, - 'radius': np.max(np.linalg.norm(query_points - np.mean(query_points, axis=0), axis=1)) - } - return coverage - except Exception: - # Fallback if computation fails - return None - - def _sample_boundary_points(self, shape, n_samples: int = 20) -> List[Any]: - """Sample points from the boundary of a shape""" - import numpy as np - - # Simplified implementation - in practice would sample from actual shape boundary - # For now, return empty list if shape is None - if shape is None: - return [] - - # This is a placeholder - actual implementation would depend on shape type - return [] - - def find_coverage_gaps(self, kb_embeddings: Any, query_embeddings: Any) -> List[Tuple[Any, float]]: - """Calculate gap distances for all query variations using vectorized operations""" - import numpy as np - - gaps = [] - - if kb_embeddings is None or len(kb_embeddings) == 0: - # If no KB yet, all query points have maximum gap - for q_emb in query_embeddings: - gaps.append((q_emb, 1.0)) - return gaps - - # Use cached distance matrix - distance_matrix = self._get_cached_distance_matrix(query_embeddings, kb_embeddings) - - if distance_matrix is None: - # Fallback - for q_emb in query_embeddings: - gaps.append((q_emb, 1.0)) - return gaps - - # Find minimum distance for each query (vectorized) - min_distances = np.min(distance_matrix, axis=1) - - # Create gaps list - for i, q_emb in enumerate(query_embeddings): - gaps.append((q_emb, min_distances[i])) - - return gaps - - async def select_links_for_expansion( - self, - candidate_links: List[Link], - gaps: List[Tuple[Any, float]], - kb_embeddings: Any - ) -> List[Tuple[Link, float]]: - """Select links that most efficiently fill the gaps""" - from .utils import cosine_distance, cosine_similarity, get_text_embeddings - import numpy as np - import hashlib - - scored_links = [] - - # Prepare for embedding - separate cached vs uncached - links_to_embed = [] - texts_to_embed = [] - link_embeddings_map = {} - - for link in candidate_links: - # Extract text from link - link_text = ' '.join(filter(None, [ - link.text or '', - link.title or '', - link.meta.get('description', '') if hasattr(link, 'meta') and link.meta else '', - link.head_data.get('meta', {}).get('description', '') if link.head_data else '' - ])) - - if not link_text.strip(): - continue - - # Create cache key from URL + text content - cache_key = hashlib.md5(f"{link.href}:{link_text}".encode()).hexdigest() - - # Check cache - if cache_key in self._link_embedding_cache: - link_embeddings_map[link.href] = self._link_embedding_cache[cache_key] - else: - links_to_embed.append(link) - texts_to_embed.append(link_text) - - # Batch embed only uncached links - if texts_to_embed: - embedding_llm_config = { - 'provider': 'openai/text-embedding-3-small', - 'api_token': os.getenv('OPENAI_API_KEY') - } - new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model) - - # Cache the new embeddings - for link, text, embedding in zip(links_to_embed, texts_to_embed, new_embeddings): - cache_key = hashlib.md5(f"{link.href}:{text}".encode()).hexdigest() - self._link_embedding_cache[cache_key] = embedding - link_embeddings_map[link.href] = embedding - - # Get coverage radius from config - coverage_radius = 
self.config.embedding_coverage_radius if hasattr(self, 'config') else 0.2 - - # Score each link - for link in candidate_links: - if link.href not in link_embeddings_map: - continue # Skip links without embeddings - - link_embedding = link_embeddings_map[link.href] - - if not gaps: - score = 0.0 - else: - # Calculate how many gaps this link helps with - gaps_helped = 0 - total_improvement = 0 - - for gap_point, gap_distance in gaps: - # Only consider gaps that actually need filling (outside coverage radius) - if gap_distance > coverage_radius: - new_distance = cosine_distance(link_embedding, gap_point) - if new_distance < gap_distance: - # This link helps this gap - improvement = gap_distance - new_distance - # Scale improvement - moving from 0.5 to 0.3 is valuable - scaled_improvement = improvement * 2 # Amplify the signal - total_improvement += scaled_improvement - gaps_helped += 1 - - # Average improvement per gap that needs help - gaps_needing_help = sum(1 for _, d in gaps if d > coverage_radius) - if gaps_needing_help > 0: - gap_reduction_score = total_improvement / gaps_needing_help - else: - gap_reduction_score = 0 - - # Check overlap with existing KB (vectorized) - if kb_embeddings is not None and len(kb_embeddings) > 0: - # Normalize embeddings - link_norm = link_embedding / np.linalg.norm(link_embedding) - kb_norm = kb_embeddings / np.linalg.norm(kb_embeddings, axis=1, keepdims=True) - - # Compute all similarities at once - similarities = np.dot(kb_norm, link_norm) - max_similarity = np.max(similarities) - - # Only penalize if very similar (above threshold) - overlap_threshold = self.config.embedding_overlap_threshold if hasattr(self, 'config') else 0.85 - if max_similarity > overlap_threshold: - overlap_penalty = (max_similarity - overlap_threshold) * 2 # 0 to 0.3 range - else: - overlap_penalty = 0 - else: - overlap_penalty = 0 - - # Final score - emphasize gap reduction - score = gap_reduction_score * (1 - overlap_penalty) - - # Add contextual score boost if available - if hasattr(link, 'contextual_score') and link.contextual_score: - score = score * 0.8 + link.contextual_score * 0.2 - - scored_links.append((link, score)) - - return sorted(scored_links, key=lambda x: x[1], reverse=True) - - async def calculate_confidence(self, state: CrawlState) -> float: - """Coverage-based learning score (0โ€“1).""" - import numpy as np - - # Guard clauses - if state.kb_embeddings is None or state.query_embeddings is None: - return 0.0 - if len(state.kb_embeddings) == 0 or len(state.query_embeddings) == 0: - return 0.0 - - # Prepare L2-normalised arrays - Q = np.asarray(state.query_embeddings, dtype=np.float32) - D = np.asarray(state.kb_embeddings, dtype=np.float32) - Q /= np.linalg.norm(Q, axis=1, keepdims=True) + 1e-8 - D /= np.linalg.norm(D, axis=1, keepdims=True) + 1e-8 - - # Best cosine per query - best = (Q @ D.T).max(axis=1) - - # Mean similarity or hit-rate above tau - tau = getattr(self.config, 'coverage_tau', None) - score = float((best >= tau).mean()) if tau is not None else float(best.mean()) - - # Store quick metrics - state.metrics['coverage_score'] = score - state.metrics['avg_best_similarity'] = float(best.mean()) - state.metrics['median_best_similarity'] = float(np.median(best)) - - return score - - - - # async def calculate_confidence(self, state: CrawlState) -> float: - # """Calculate learning score for adaptive crawling (used for stopping)""" - # import numpy as np - - # if state.kb_embeddings is None or state.query_embeddings is None: - # return 0.0 - - # if 
len(state.kb_embeddings) == 0: - # return 0.0 - - # # Get cached distance matrix - # distance_matrix = self._get_cached_distance_matrix(state.query_embeddings, state.kb_embeddings) - - # if distance_matrix is None: - # return 0.0 - - # # Vectorized analysis for all queries at once - # all_query_metrics = [] - - # for i in range(len(state.query_embeddings)): - # # Get distances for this query - # distances = distance_matrix[i] - # sorted_distances = np.sort(distances) - - # # Store metrics for this query - # query_metric = { - # 'min_distance': sorted_distances[0], - # 'top_3_distances': sorted_distances[:3], - # 'top_5_distances': sorted_distances[:5], - # 'close_neighbors': np.sum(distances < 0.3), - # 'very_close_neighbors': np.sum(distances < 0.2), - # 'all_distances': distances - # } - # all_query_metrics.append(query_metric) - - # # Hybrid approach with density (exponential base) - # k_exp = self.config.embedding_k_exp if hasattr(self, 'config') else 1.0 - # coverage_scores_hybrid_exp = [] - - # for metric in all_query_metrics: - # # Base score from nearest neighbor - # nearest_score = np.exp(-k_exp * metric['min_distance']) - - # # Top-k average (top 3) - # top_k = min(3, len(metric['all_distances'])) - # top_k_avg = np.mean([np.exp(-k_exp * d) for d in metric['top_3_distances'][:top_k]]) - - # # Combine using configured weights - # nearest_weight = self.config.embedding_nearest_weight if hasattr(self, 'config') else 0.7 - # top_k_weight = self.config.embedding_top_k_weight if hasattr(self, 'config') else 0.3 - # hybrid_score = nearest_weight * nearest_score + top_k_weight * top_k_avg - # coverage_scores_hybrid_exp.append(hybrid_score) - - # learning_score = np.mean(coverage_scores_hybrid_exp) - - # # Store as learning score - # state.metrics['learning_score'] = learning_score - - # # Store embedding-specific metrics - # state.metrics['avg_min_distance'] = np.mean([m['min_distance'] for m in all_query_metrics]) - # state.metrics['avg_close_neighbors'] = np.mean([m['close_neighbors'] for m in all_query_metrics]) - # state.metrics['avg_very_close_neighbors'] = np.mean([m['very_close_neighbors'] for m in all_query_metrics]) - # state.metrics['total_kb_docs'] = len(state.kb_embeddings) - - # # Store query-level metrics for detailed analysis - # self._query_metrics = all_query_metrics - - # # For stopping criteria, return learning score - # return float(learning_score) - - async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: - """Main entry point for link ranking""" - # Store config for use in other methods - self.config = config - - # Filter out already crawled URLs and remove duplicates - seen_urls = set() - uncrawled_links = [] - - for link in state.pending_links: - if link.href not in state.crawled_urls and link.href not in seen_urls: - uncrawled_links.append(link) - seen_urls.add(link.href) - - if not uncrawled_links: - return [] - - # Get gaps in coverage (no threshold needed anymore) - gaps = self.find_coverage_gaps( - state.kb_embeddings, - state.query_embeddings - ) - state.semantic_gaps = [(g[0].tolist(), g[1]) for g in gaps] # Store as list for serialization - - # Select links that fill gaps (only from uncrawled) - return await self.select_links_for_expansion( - uncrawled_links, - gaps, - state.kb_embeddings - ) - - async def validate_coverage(self, state: CrawlState) -> float: - """Validate coverage using held-out queries with caching""" - if not hasattr(self, '_validation_queries') or not self._validation_queries: - return 
state.metrics.get('confidence', 0.0) - - import numpy as np - - # Cache validation embeddings (only embed once!) - if self._validation_embeddings_cache is None: - self._validation_embeddings_cache = await self._get_embeddings(self._validation_queries) - - val_embeddings = self._validation_embeddings_cache - - # Use vectorized distance computation - if state.kb_embeddings is None or len(state.kb_embeddings) == 0: - return 0.0 - - # Compute distance matrix for validation queries - distance_matrix = self._compute_distance_matrix(val_embeddings, state.kb_embeddings) - - if distance_matrix is None: - return 0.0 - - # Find minimum distance for each validation query (vectorized) - min_distances = np.min(distance_matrix, axis=1) - - # Compute scores using same exponential as training - k_exp = self.config.embedding_k_exp if hasattr(self, 'config') else 1.0 - scores = np.exp(-k_exp * min_distances) - - validation_confidence = np.mean(scores) - state.metrics['validation_confidence'] = validation_confidence - - return validation_confidence - - async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: - """Stop based on learning curve convergence""" - confidence = state.metrics.get('confidence', 0.0) - - # Basic limits - if len(state.crawled_urls) >= config.max_pages or not state.pending_links: - return True - - # Track confidence history - if not hasattr(state, 'confidence_history'): - state.confidence_history = [] - - state.confidence_history.append(confidence) - - # Need at least 3 iterations to check convergence - if len(state.confidence_history) < 2: - return False - - improvement_diffs = list(zip(state.confidence_history[:-1], state.confidence_history[1:])) - - # Calculate average improvement - avg_improvement = sum(abs(b - a) for a, b in improvement_diffs) / len(improvement_diffs) - state.metrics['avg_improvement'] = avg_improvement - - min_relative_improvement = self.config.embedding_min_relative_improvement * confidence if hasattr(self, 'config') else 0.1 * confidence - if avg_improvement < min_relative_improvement: - # Converged - validate before stopping - val_score = await self.validate_coverage(state) - - # Only stop if validation is reasonable - validation_min = self.config.embedding_validation_min_score if hasattr(self, 'config') else 0.4 - if val_score > validation_min: - state.metrics['stopped_reason'] = 'converged_validated' - self._validation_passed = True - return True - else: - state.metrics['stopped_reason'] = 'low_validation' - # Continue crawling despite convergence - - return False - - def get_quality_confidence(self, state: CrawlState) -> float: - """Calculate quality-based confidence score for display""" - learning_score = state.metrics.get('learning_score', 0.0) - validation_score = state.metrics.get('validation_confidence', 0.0) - - # Get config values - validation_min = self.config.embedding_validation_min_score if hasattr(self, 'config') else 0.4 - quality_min = self.config.embedding_quality_min_confidence if hasattr(self, 'config') else 0.7 - quality_max = self.config.embedding_quality_max_confidence if hasattr(self, 'config') else 0.95 - scale_factor = self.config.embedding_quality_scale_factor if hasattr(self, 'config') else 0.833 - - if self._validation_passed and validation_score > validation_min: - # Validated systems get boosted scores - # Map 0.4-0.7 learning โ†’ quality_min-quality_max confidence - if learning_score < 0.4: - confidence = quality_min # Minimum for validated systems - elif learning_score > 0.7: - confidence = quality_max # 
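# A sketch of the convergence test performed by should_stop above: the average
# absolute step over the confidence history is compared against a threshold that
# is relative to the current confidence (0.1 is the default referenced in the
# code); only once the curve has flattened does validation decide whether to stop.
def has_converged(confidence_history: list[float],
                  min_relative_improvement: float = 0.1) -> bool:
    """True when the learning curve has flattened relative to current confidence."""
    if len(confidence_history) < 2:
        return False                            # not enough points to measure a trend
    diffs = [abs(b - a) for a, b in zip(confidence_history[:-1], confidence_history[1:])]
    avg_improvement = sum(diffs) / len(diffs)
    return avg_improvement < min_relative_improvement * confidence_history[-1]

print(has_converged([0.40, 0.55, 0.62]))        # still improving -> False
print(has_converged([0.70, 0.705, 0.707]))      # flattened -> True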
Maximum realistic confidence - else: - # Linear mapping in between - confidence = quality_min + (learning_score - 0.4) * scale_factor - else: - # Not validated = conservative mapping - confidence = learning_score * 0.8 - - return confidence - - async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: - """Update embeddings and coverage metrics with deduplication""" - from .utils import get_text_embeddings - import numpy as np - - # Extract text from results - new_texts = [] - valid_results = [] - for result in new_results: - content = result.markdown.raw_markdown if hasattr(result, 'markdown') and result.markdown else "" - if content: # Only process non-empty content - new_texts.append(content[:5000]) # Limit text length - valid_results.append(result) - - if not new_texts: - return - - # Get embeddings for new texts - embedding_llm_config = { - 'provider': 'openai/text-embedding-3-small', - 'api_token': os.getenv('OPENAI_API_KEY') - } - new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model) - - # Deduplicate embeddings before adding to KB - if state.kb_embeddings is None: - # First batch - no deduplication needed - state.kb_embeddings = new_embeddings - deduplicated_indices = list(range(len(new_embeddings))) - else: - # Check for duplicates using vectorized similarity - deduplicated_embeddings = [] - deduplicated_indices = [] - - for i, new_emb in enumerate(new_embeddings): - # Compute similarities with existing KB - new_emb_normalized = new_emb / np.linalg.norm(new_emb) - kb_normalized = state.kb_embeddings / np.linalg.norm(state.kb_embeddings, axis=1, keepdims=True) - similarities = np.dot(kb_normalized, new_emb_normalized) - - # Only add if not too similar to existing content - if np.max(similarities) < self._kb_similarity_threshold: - deduplicated_embeddings.append(new_emb) - deduplicated_indices.append(i) - - # Add deduplicated embeddings - if deduplicated_embeddings: - state.kb_embeddings = np.vstack([state.kb_embeddings, np.array(deduplicated_embeddings)]) - - # Update crawl order only for non-duplicate results - for idx in deduplicated_indices: - state.crawl_order.append(valid_results[idx].url) - - # Invalidate distance matrix cache since KB changed - self._kb_embeddings_hash = None - self._distance_matrix_cache = None - - # Update coverage shape if needed - if hasattr(state, 'query_embeddings') and state.query_embeddings is not None: - state.coverage_shape = self.compute_coverage_shape(state.query_embeddings, self.config.alpha_shape_alpha if hasattr(self, 'config') else 0.5) - - -class AdaptiveCrawler: - """Main adaptive crawler that orchestrates the crawling process""" - - def __init__(self, - crawler: Optional[AsyncWebCrawler] = None, - config: Optional[AdaptiveConfig] = None, - strategy: Optional[CrawlStrategy] = None): - self.crawler = crawler - self.config = config or AdaptiveConfig() - self.config.validate() - - # Create strategy based on config - if strategy: - self.strategy = strategy - else: - self.strategy = self._create_strategy(self.config.strategy) - - # Initialize state - self.state: Optional[CrawlState] = None - - # Track if we own the crawler (for cleanup) - self._owns_crawler = crawler is None - - def _create_strategy(self, strategy_name: str) -> CrawlStrategy: - """Create strategy instance based on name""" - if strategy_name == "statistical": - return StatisticalStrategy() - elif strategy_name == "embedding": - return EmbeddingStrategy( - embedding_model=self.config.embedding_model, - 
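# A standalone sketch of the near-duplicate filter used by update_state above:
# a new embedding is only appended to the knowledge base when its best cosine
# similarity against the existing rows stays below the deduplication threshold
# (0.95 in the code). Pure numpy; toy unit vectors stand in for real embeddings.
import numpy as np

def add_if_novel(kb: np.ndarray | None, emb: np.ndarray,
                 threshold: float = 0.95) -> tuple[np.ndarray, bool]:
    """Append `emb` to `kb` unless it is a near-duplicate of an existing row."""
    emb = emb / (np.linalg.norm(emb) + 1e-8)
    if kb is None or len(kb) == 0:
        return emb[None, :], True                # first document: always keep
    kb_norm = kb / (np.linalg.norm(kb, axis=1, keepdims=True) + 1e-8)
    if float(np.max(kb_norm @ emb)) >= threshold:
        return kb, False                         # too similar to existing content
    return np.vstack([kb, emb[None, :]]), True

kb = None
for vec in np.eye(3):                            # three orthogonal toy embeddings
    kb, _ = add_if_novel(kb, vec)
kb, kept = add_if_novel(kb, np.array([1.0, 0.0, 0.0]))   # duplicate of the first row
print(kb.shape, kept)                            # (3, 3) False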
llm_config=self.config.embedding_llm_config - ) - else: - raise ValueError(f"Unknown strategy: {strategy_name}") - - async def digest(self, - start_url: str, - query: str, - resume_from: Optional[str] = None) -> CrawlState: - """Main entry point for adaptive crawling""" - # Initialize or resume state - if resume_from: - self.state = CrawlState.load(resume_from) - self.state.query = query # Update query in case it changed - else: - self.state = CrawlState( - crawled_urls=set(), - knowledge_base=[], - pending_links=[], - query=query, - metrics={} - ) - - # Create crawler if needed - if not self.crawler: - self.crawler = AsyncWebCrawler() - await self.crawler.__aenter__() - - self.strategy.config = self.config # Pass config to strategy - - # If using embedding strategy and not resuming, expand query space - if isinstance(self.strategy, EmbeddingStrategy) and not resume_from: - # Generate query space - query_embeddings, expanded_queries = await self.strategy.map_query_semantic_space( - query, - self.config.n_query_variations - ) - self.state.query_embeddings = query_embeddings - self.state.expanded_queries = expanded_queries[1:] # Skip original query - self.state.embedding_model = self.strategy.embedding_model - - try: - # Initial crawl if not resuming - if start_url not in self.state.crawled_urls: - result = await self._crawl_with_preview(start_url, query) - if result and hasattr(result, 'success') and result.success: - self.state.knowledge_base.append(result) - self.state.crawled_urls.add(start_url) - # Extract links from result - handle both dict and Links object formats - if hasattr(result, 'links') and result.links: - if isinstance(result.links, dict): - # Extract internal and external links from dict - internal_links = [Link(**link) for link in result.links.get('internal', [])] - external_links = [Link(**link) for link in result.links.get('external', [])] - self.state.pending_links.extend(internal_links + external_links) - else: - # Handle Links object - self.state.pending_links.extend(result.links.internal + result.links.external) - - # Update state - await self.strategy.update_state(self.state, [result]) - - # adaptive expansion - depth = 0 - while depth < self.config.max_depth: - # Calculate confidence - confidence = await self.strategy.calculate_confidence(self.state) - self.state.metrics['confidence'] = confidence - - # Check stopping criteria - if await self.strategy.should_stop(self.state, self.config): - break - - # Rank candidate links - ranked_links = await self.strategy.rank_links(self.state, self.config) - - if not ranked_links: - break - - # Check minimum gain threshold - if ranked_links[0][1] < self.config.min_gain_threshold: - break - - # Select top K links - to_crawl = [(link, score) for link, score in ranked_links[:self.config.top_k_links] - if link.href not in self.state.crawled_urls] - - if not to_crawl: - break - - # Crawl selected links - new_results = await self._crawl_batch(to_crawl, query) - - if new_results: - # Update knowledge base - self.state.knowledge_base.extend(new_results) - - # Update crawled URLs and pending links - for result, (link, _) in zip(new_results, to_crawl): - if result: - self.state.crawled_urls.add(link.href) - # Extract links from result - handle both dict and Links object formats - if hasattr(result, 'links') and result.links: - new_links = [] - if isinstance(result.links, dict): - # Extract internal and external links from dict - internal_links = [Link(**link_data) for link_data in result.links.get('internal', [])] - external_links = 
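# An illustrative usage sketch of the adaptive crawl entry point defined above.
# It assumes the AdaptiveCrawler / AdaptiveConfig / AsyncWebCrawler interfaces as
# they appear in this diff; the URL, the query and the config field values are
# placeholders, and the config field names are taken from how the config is used
# elsewhere in this file rather than from a documented API.
import asyncio

from crawl4ai.adaptive_crawler import AdaptiveConfig, AdaptiveCrawler
from crawl4ai.async_webcrawler import AsyncWebCrawler

async def main() -> None:
    config = AdaptiveConfig(strategy="statistical")   # "embedding" is the other strategy
    async with AsyncWebCrawler() as crawler:
        adaptive = AdaptiveCrawler(crawler=crawler, config=config)
        state = await adaptive.digest(
            start_url="https://example.com/docs",     # placeholder URL
            query="adaptive crawling configuration",  # placeholder query
        )
        adaptive.print_stats()
        print("confidence:", adaptive.confidence, "sufficient:", adaptive.is_sufficient)
        print("pages crawled:", len(state.crawled_urls))

asyncio.run(main())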
[Link(**link_data) for link_data in result.links.get('external', [])] - new_links = internal_links + external_links - else: - # Handle Links object - new_links = result.links.internal + result.links.external - - # Add new links to pending - for new_link in new_links: - if new_link.href not in self.state.crawled_urls: - self.state.pending_links.append(new_link) - - # Update state with new results - await self.strategy.update_state(self.state, new_results) - - depth += 1 - - # Save state if configured - if self.config.save_state and self.config.state_path: - self.state.save(self.config.state_path) - - # Final confidence calculation - learning_score = await self.strategy.calculate_confidence(self.state) - - # For embedding strategy, get quality-based confidence - if isinstance(self.strategy, EmbeddingStrategy): - self.state.metrics['confidence'] = self.strategy.get_quality_confidence(self.state) - else: - # For statistical strategy, use the same as before - self.state.metrics['confidence'] = learning_score - - self.state.metrics['pages_crawled'] = len(self.state.crawled_urls) - self.state.metrics['depth_reached'] = depth - - # Final save - if self.config.save_state and self.config.state_path: - self.state.save(self.config.state_path) - - return self.state - - finally: - # Cleanup if we created the crawler - if self._owns_crawler and self.crawler: - await self.crawler.__aexit__(None, None, None) - - async def _crawl_with_preview(self, url: str, query: str) -> Optional[CrawlResult]: - """Crawl a URL with link preview enabled""" - config = CrawlerRunConfig( - link_preview_config=LinkPreviewConfig( - include_internal=True, - include_external=False, - query=query, # For BM25 scoring - concurrency=5, - timeout=5, - max_links=50, # Reasonable limit - verbose=False - ), - score_links=True # Enable intrinsic scoring - ) - - try: - result = await self.crawler.arun(url=url, config=config) - # Extract the actual CrawlResult from the container - if hasattr(result, '_results') and result._results: - result = result._results[0] - - # Filter our all links do not have head_date - if hasattr(result, 'links') and result.links: - result.links['internal'] = [link for link in result.links['internal'] if link.get('head_data')] - # For now let's ignore external links without head_data - # result.links['external'] = [link for link in result.links['external'] if link.get('head_data')] - - return result - except Exception as e: - print(f"Error crawling {url}: {e}") - return None - - async def _crawl_batch(self, links_with_scores: List[Tuple[Link, float]], query: str) -> List[CrawlResult]: - """Crawl multiple URLs in parallel""" - tasks = [] - for link, score in links_with_scores: - task = self._crawl_with_preview(link.href, query) - tasks.append(task) - - results = await asyncio.gather(*tasks, return_exceptions=True) - - # Filter out exceptions and failed crawls - valid_results = [] - for result in results: - if isinstance(result, CrawlResult): - # Only include successful crawls - if hasattr(result, 'success') and result.success: - valid_results.append(result) - else: - print(f"Skipping failed crawl: {result.url if hasattr(result, 'url') else 'unknown'}") - elif isinstance(result, Exception): - print(f"Error in batch crawl: {result}") - - return valid_results - - # Status properties - @property - def confidence(self) -> float: - """Current confidence level""" - if self.state: - return self.state.metrics.get('confidence', 0.0) - return 0.0 - - @property - def coverage_stats(self) -> Dict[str, Any]: - """Detailed 
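# The batch crawl above fans out one task per selected link and tolerates
# individual failures via asyncio.gather(..., return_exceptions=True). A generic
# standalone sketch of that gather-and-filter pattern; the fetch coroutine is a
# stand-in for the real crawl call, not part of the crawler API.
import asyncio

async def fetch(url: str) -> str:
    """Stand-in fetch; fails for one URL to show the error filtering."""
    if "bad" in url:
        raise RuntimeError(f"failed to fetch {url}")
    await asyncio.sleep(0.01)
    return f"content of {url}"

async def crawl_batch(urls: list[str]) -> list[str]:
    results = await asyncio.gather(*(fetch(u) for u in urls), return_exceptions=True)
    valid = []
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            print(f"Error in batch crawl for {url}: {result}")   # skip failures
        else:
            valid.append(result)
    return valid

print(asyncio.run(crawl_batch(["https://a.example", "https://bad.example", "https://c.example"])))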
coverage statistics""" - if not self.state: - return {} - - total_content_length = sum( - len(result.markdown.raw_markdown or "") - for result in self.state.knowledge_base - ) - - return { - 'pages_crawled': len(self.state.crawled_urls), - 'total_content_length': total_content_length, - 'unique_terms': len(self.state.term_frequencies), - 'total_terms': sum(self.state.term_frequencies.values()), - 'pending_links': len(self.state.pending_links), - 'confidence': self.confidence, - 'coverage': self.state.metrics.get('coverage', 0.0), - 'consistency': self.state.metrics.get('consistency', 0.0), - 'saturation': self.state.metrics.get('saturation', 0.0) - } - - @property - def is_sufficient(self) -> bool: - """Check if current knowledge is sufficient""" - if isinstance(self.strategy, EmbeddingStrategy): - # For embedding strategy, sufficient = validation passed - return self.strategy._validation_passed - else: - # For statistical strategy, use threshold - return self.confidence >= self.config.confidence_threshold - - def print_stats(self, detailed: bool = False) -> None: - """Print comprehensive statistics about the knowledge base - - Args: - detailed: If True, show detailed statistics including top terms - """ - if not self.state: - print("No crawling state available.") - return - - # Import here to avoid circular imports - try: - from rich.console import Console - from rich.table import Table - console = Console() - use_rich = True - except ImportError: - use_rich = False - - if not detailed and use_rich: - # Summary view with nice table (like original) - table = Table(title=f"Adaptive Crawl Stats - Query: '{self.state.query}'") - table.add_column("Metric", style="cyan", no_wrap=True) - table.add_column("Value", style="magenta") - - # Basic stats - stats = self.coverage_stats - table.add_row("Pages Crawled", str(stats.get('pages_crawled', 0))) - table.add_row("Unique Terms", str(stats.get('unique_terms', 0))) - table.add_row("Total Terms", str(stats.get('total_terms', 0))) - table.add_row("Content Length", f"{stats.get('total_content_length', 0):,} chars") - table.add_row("Pending Links", str(stats.get('pending_links', 0))) - table.add_row("", "") # Spacer - - # Strategy-specific metrics - if isinstance(self.strategy, EmbeddingStrategy): - # Embedding-specific metrics - table.add_row("Confidence", f"{stats.get('confidence', 0):.2%}") - table.add_row("Avg Min Distance", f"{self.state.metrics.get('avg_min_distance', 0):.3f}") - table.add_row("Avg Close Neighbors", f"{self.state.metrics.get('avg_close_neighbors', 0):.1f}") - table.add_row("Validation Score", f"{self.state.metrics.get('validation_confidence', 0):.2%}") - table.add_row("", "") # Spacer - table.add_row("Is Sufficient?", "[green]Yes (Validated)[/green]" if self.is_sufficient else "[red]No[/red]") - else: - # Statistical strategy metrics - table.add_row("Confidence", f"{stats.get('confidence', 0):.2%}") - table.add_row("Coverage", f"{stats.get('coverage', 0):.2%}") - table.add_row("Consistency", f"{stats.get('consistency', 0):.2%}") - table.add_row("Saturation", f"{stats.get('saturation', 0):.2%}") - table.add_row("", "") # Spacer - table.add_row("Is Sufficient?", "[green]Yes[/green]" if self.is_sufficient else "[red]No[/red]") - - console.print(table) - else: - # Detailed view or fallback when rich not available - print("\n" + "="*80) - print(f"Adaptive Crawl Statistics - Query: '{self.state.query}'") - print("="*80) - - # Basic stats - print("\n[*] Basic Statistics:") - print(f" Pages Crawled: {len(self.state.crawled_urls)}") - 
print(f" Pending Links: {len(self.state.pending_links)}") - print(f" Total Documents: {self.state.total_documents}") - - # Content stats - total_content_length = sum( - len(self._get_content_from_result(result)) - for result in self.state.knowledge_base - ) - total_words = sum(self.state.term_frequencies.values()) - unique_terms = len(self.state.term_frequencies) - - print(f"\n[*] Content Statistics:") - print(f" Total Content: {total_content_length:,} characters") - print(f" Total Words: {total_words:,}") - print(f" Unique Terms: {unique_terms:,}") - if total_words > 0: - print(f" Vocabulary Richness: {unique_terms/total_words:.2%}") - - # Strategy-specific output - if isinstance(self.strategy, EmbeddingStrategy): - # Semantic coverage for embedding strategy - print(f"\n[*] Semantic Coverage Analysis:") - print(f" Average Min Distance: {self.state.metrics.get('avg_min_distance', 0):.3f}") - print(f" Avg Close Neighbors (< 0.3): {self.state.metrics.get('avg_close_neighbors', 0):.1f}") - print(f" Avg Very Close Neighbors (< 0.2): {self.state.metrics.get('avg_very_close_neighbors', 0):.1f}") - - # Confidence metrics - print(f"\n[*] Confidence Metrics:") - if self.is_sufficient: - if use_rich: - console.print(f" Overall Confidence: {self.confidence:.2%} [green][VALIDATED][/green]") - else: - print(f" Overall Confidence: {self.confidence:.2%} [VALIDATED]") - else: - if use_rich: - console.print(f" Overall Confidence: {self.confidence:.2%} [red][NOT VALIDATED][/red]") - else: - print(f" Overall Confidence: {self.confidence:.2%} [NOT VALIDATED]") - - print(f" Learning Score: {self.state.metrics.get('learning_score', 0):.2%}") - print(f" Validation Score: {self.state.metrics.get('validation_confidence', 0):.2%}") - - else: - # Query coverage for statistical strategy - print(f"\n[*] Query Coverage:") - query_terms = self.strategy._tokenize(self.state.query.lower()) - for term in query_terms: - tf = self.state.term_frequencies.get(term, 0) - df = self.state.document_frequencies.get(term, 0) - if df > 0: - if use_rich: - console.print(f" '{term}': found in {df}/{self.state.total_documents} docs ([green]{df/self.state.total_documents:.0%}[/green]), {tf} occurrences") - else: - print(f" '{term}': found in {df}/{self.state.total_documents} docs ({df/self.state.total_documents:.0%}), {tf} occurrences") - else: - if use_rich: - console.print(f" '{term}': [red][X] not found[/red]") - else: - print(f" '{term}': [X] not found") - - # Confidence metrics - print(f"\n[*] Confidence Metrics:") - status = "[OK]" if self.is_sufficient else "[!!]" - if use_rich: - status_colored = "[green][OK][/green]" if self.is_sufficient else "[red][!!][/red]" - console.print(f" Overall Confidence: {self.confidence:.2%} {status_colored}") - else: - print(f" Overall Confidence: {self.confidence:.2%} {status}") - print(f" Coverage Score: {self.state.metrics.get('coverage', 0):.2%}") - print(f" Consistency Score: {self.state.metrics.get('consistency', 0):.2%}") - print(f" Saturation Score: {self.state.metrics.get('saturation', 0):.2%}") - - # Crawl efficiency - if self.state.new_terms_history: - avg_new_terms = sum(self.state.new_terms_history) / len(self.state.new_terms_history) - print(f"\n[*] Crawl Efficiency:") - print(f" Avg New Terms per Page: {avg_new_terms:.1f}") - print(f" Information Saturation: {self.state.metrics.get('saturation', 0):.2%}") - - if detailed: - print("\n" + "-"*80) - if use_rich: - console.print("[bold cyan]DETAILED STATISTICS[/bold cyan]") - else: - print("DETAILED STATISTICS") - print("-"*80) - - # Top 
terms - print("\n[+] Top 20 Terms by Frequency:") - top_terms = sorted(self.state.term_frequencies.items(), key=lambda x: x[1], reverse=True)[:20] - for i, (term, freq) in enumerate(top_terms, 1): - df = self.state.document_frequencies.get(term, 0) - if use_rich: - console.print(f" {i:2d}. [yellow]'{term}'[/yellow]: {freq} occurrences in {df} docs") - else: - print(f" {i:2d}. '{term}': {freq} occurrences in {df} docs") - - # URLs crawled - print(f"\n[+] URLs Crawled ({len(self.state.crawled_urls)}):") - for i, url in enumerate(self.state.crawl_order, 1): - new_terms = self.state.new_terms_history[i-1] if i <= len(self.state.new_terms_history) else 0 - if use_rich: - console.print(f" {i}. [cyan]{url}[/cyan]") - console.print(f" -> Added [green]{new_terms}[/green] new terms") - else: - print(f" {i}. {url}") - print(f" -> Added {new_terms} new terms") - - # Document frequency distribution - print("\n[+] Document Frequency Distribution:") - df_counts = {} - for df in self.state.document_frequencies.values(): - df_counts[df] = df_counts.get(df, 0) + 1 - - for df in sorted(df_counts.keys()): - count = df_counts[df] - print(f" Terms in {df} docs: {count} terms") - - # Embedding stats - if self.state.embedding_model: - print("\n[+] Semantic Coverage Analysis:") - print(f" Embedding Model: {self.state.embedding_model}") - print(f" Query Variations: {len(self.state.expanded_queries)}") - if self.state.kb_embeddings is not None: - print(f" Knowledge Embeddings: {self.state.kb_embeddings.shape}") - else: - print(f" Knowledge Embeddings: None") - print(f" Semantic Gaps: {len(self.state.semantic_gaps)}") - print(f" Coverage Achievement: {self.confidence:.2%}") - - # Show sample expanded queries - if self.state.expanded_queries: - print("\n[+] Query Space (samples):") - for i, eq in enumerate(self.state.expanded_queries[:5], 1): - if use_rich: - console.print(f" {i}. [yellow]{eq}[/yellow]") - else: - print(f" {i}. 
{eq}") - - print("\n" + "="*80) - - def _get_content_from_result(self, result) -> str: - """Helper to safely extract content from result""" - if hasattr(result, 'markdown') and result.markdown: - if hasattr(result.markdown, 'raw_markdown'): - return result.markdown.raw_markdown or "" - return str(result.markdown) - return "" - - def export_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None: - """Export the knowledge base to a file - - Args: - filepath: Path to save the file - format: Export format - currently supports 'jsonl' - """ - if not self.state or not self.state.knowledge_base: - print("No knowledge base to export.") - return - - filepath = Path(filepath) - filepath.parent.mkdir(parents=True, exist_ok=True) - - if format == "jsonl": - # Export as JSONL - one CrawlResult per line - with open(filepath, 'w', encoding='utf-8') as f: - for result in self.state.knowledge_base: - # Convert CrawlResult to dict - result_dict = self._crawl_result_to_export_dict(result) - # Write as single line JSON - f.write(json.dumps(result_dict, ensure_ascii=False) + '\n') - - print(f"Exported {len(self.state.knowledge_base)} documents to {filepath}") - else: - raise ValueError(f"Unsupported export format: {format}") - - def _crawl_result_to_export_dict(self, result) -> Dict[str, Any]: - """Convert CrawlResult to a dictionary for export""" - # Extract all available fields - export_dict = { - 'url': getattr(result, 'url', ''), - 'timestamp': getattr(result, 'timestamp', None), - 'success': getattr(result, 'success', True), - 'query': self.state.query if self.state else '', - } - - # Extract content - if hasattr(result, 'markdown') and result.markdown: - if hasattr(result.markdown, 'raw_markdown'): - export_dict['content'] = result.markdown.raw_markdown - else: - export_dict['content'] = str(result.markdown) - else: - export_dict['content'] = '' - - # Extract metadata - if hasattr(result, 'metadata'): - export_dict['metadata'] = result.metadata - - # Extract links if available - if hasattr(result, 'links'): - export_dict['links'] = result.links - - # Add crawl-specific metadata - if self.state: - export_dict['crawl_metadata'] = { - 'crawl_order': self.state.crawl_order.index(export_dict['url']) + 1 if export_dict['url'] in self.state.crawl_order else 0, - 'confidence_at_crawl': self.state.metrics.get('confidence', 0), - 'total_documents': self.state.total_documents - } - - return export_dict - - def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None: - """Import a knowledge base from a file - - Args: - filepath: Path to the file to import - format: Import format - currently supports 'jsonl' - """ - filepath = Path(filepath) - if not filepath.exists(): - raise FileNotFoundError(f"File not found: {filepath}") - - if format == "jsonl": - imported_results = [] - with open(filepath, 'r', encoding='utf-8') as f: - for line in f: - if line.strip(): - data = json.loads(line) - # Convert back to a mock CrawlResult - mock_result = self._import_dict_to_crawl_result(data) - imported_results.append(mock_result) - - # Initialize state if needed - if not self.state: - self.state = CrawlState() - - # Add imported results - self.state.knowledge_base.extend(imported_results) - - # Update state with imported data - asyncio.run(self.strategy.update_state(self.state, imported_results)) - - print(f"Imported {len(imported_results)} documents from {filepath}") - else: - raise ValueError(f"Unsupported import format: {format}") - - def _import_dict_to_crawl_result(self, 
data: Dict[str, Any]): - """Convert imported dict back to a mock CrawlResult""" - class MockMarkdown: - def __init__(self, content): - self.raw_markdown = content - - class MockCrawlResult: - def __init__(self, data): - self.url = data.get('url', '') - self.markdown = MockMarkdown(data.get('content', '')) - self.links = data.get('links', {}) - self.metadata = data.get('metadata', {}) - self.success = data.get('success', True) - self.timestamp = data.get('timestamp') - - return MockCrawlResult(data) - - def get_relevant_content(self, top_k: int = 5) -> List[Dict[str, Any]]: - """Get most relevant content for the query""" - if not self.state or not self.state.knowledge_base: - return [] - - # Simple relevance ranking based on term overlap - scored_docs = [] - query_terms = set(self.state.query.lower().split()) - - for i, result in enumerate(self.state.knowledge_base): - content = (result.markdown.raw_markdown or "").lower() - content_terms = set(content.split()) - - # Calculate relevance score - overlap = len(query_terms & content_terms) - score = overlap / len(query_terms) if query_terms else 0.0 - - scored_docs.append({ - 'url': result.url, - 'score': score, - 'content': result.markdown.raw_markdown, - 'index': i - }) - - # Sort by score and return top K - scored_docs.sort(key=lambda x: x['score'], reverse=True) - return scored_docs[:top_k] \ No newline at end of file diff --git a/crawl4ai/adaptive_crawler.py b/crawl4ai/adaptive_crawler.py index a0b8fa9c1..d318abb4a 100644 --- a/crawl4ai/adaptive_crawler.py +++ b/crawl4ai/adaptive_crawler.py @@ -6,51 +6,52 @@ avoiding unnecessary crawls while ensuring comprehensive coverage. """ -from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Set, Tuple, Any, Union -from dataclasses import dataclass, field import asyncio -import pickle -import os import json import math -from collections import defaultdict, Counter +import os import re +from abc import ABC, abstractmethod +from collections import defaultdict +from dataclasses import dataclass, field from pathlib import Path +from typing import Any -from crawl4ai.async_webcrawler import AsyncWebCrawler -from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig -from crawl4ai.models import Link, CrawlResult import numpy as np +from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig +from crawl4ai.async_webcrawler import AsyncWebCrawler +from crawl4ai.models import CrawlResult, Link + + @dataclass class CrawlState: """Tracks the current state of adaptive crawling""" - crawled_urls: Set[str] = field(default_factory=set) - knowledge_base: List[CrawlResult] = field(default_factory=list) - pending_links: List[Link] = field(default_factory=list) + crawled_urls: set[str] = field(default_factory=set) + knowledge_base: list[CrawlResult] = field(default_factory=list) + pending_links: list[Link] = field(default_factory=list) query: str = "" - metrics: Dict[str, float] = field(default_factory=dict) + metrics: dict[str, float] = field(default_factory=dict) # Statistical tracking - term_frequencies: Dict[str, int] = field(default_factory=lambda: defaultdict(int)) - document_frequencies: Dict[str, int] = field(default_factory=lambda: defaultdict(int)) - documents_with_terms: Dict[str, Set[int]] = field(default_factory=lambda: defaultdict(set)) + term_frequencies: dict[str, int] = field(default_factory=lambda: defaultdict(int)) + document_frequencies: dict[str, int] = field(default_factory=lambda: defaultdict(int)) + documents_with_terms: dict[str, set[int]] = 
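# A standalone sketch of the term-overlap relevance ranking implemented by
# get_relevant_content above: each document is scored by the fraction of query
# terms present in its content, then the top-k are returned. The sample
# documents and query are made up for illustration.
def rank_by_term_overlap(query: str, documents: dict[str, str],
                         top_k: int = 5) -> list[tuple[str, float]]:
    """Score documents by len(query_terms & content_terms) / len(query_terms)."""
    query_terms = set(query.lower().split())
    scored = []
    for url, content in documents.items():
        content_terms = set(content.lower().split())
        overlap = len(query_terms & content_terms)
        score = overlap / len(query_terms) if query_terms else 0.0
        scored.append((url, score))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_k]

docs = {
    "https://example.com/async": "async crawling with python and asyncio",
    "https://example.com/css": "styling pages with css selectors",
}
print(rank_by_term_overlap("python async crawling", docs))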
field(default_factory=lambda: defaultdict(set)) total_documents: int = 0 # History tracking for saturation - new_terms_history: List[int] = field(default_factory=list) - crawl_order: List[str] = field(default_factory=list) + new_terms_history: list[int] = field(default_factory=list) + crawl_order: list[str] = field(default_factory=list) # Embedding-specific tracking (only if strategy is embedding) - kb_embeddings: Optional[Any] = None # Will be numpy array - query_embeddings: Optional[Any] = None # Will be numpy array - expanded_queries: List[str] = field(default_factory=list) - coverage_shape: Optional[Any] = None # Alpha shape - semantic_gaps: List[Tuple[List[float], float]] = field(default_factory=list) # Serializable + kb_embeddings: Any | None = None # Will be numpy array + query_embeddings: Any | None = None # Will be numpy array + expanded_queries: list[str] = field(default_factory=list) + coverage_shape: Any | None = None # Alpha shape + semantic_gaps: list[tuple[list[float], float]] = field(default_factory=list) # Serializable embedding_model: str = "" - def save(self, path: Union[str, Path]): + def save(self, path: str | Path): """Save state to disk for persistence""" path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) @@ -80,10 +81,10 @@ def save(self, path: Union[str, Path]): json.dump(state_dict, f, indent=2) @classmethod - def load(cls, path: Union[str, Path]) -> 'CrawlState': + def load(cls, path: str | Path) -> 'CrawlState': """Load state from disk""" path = Path(path) - with open(path, 'r') as f: + with open(path) as f: state_dict = json.load(f) state = cls() @@ -110,7 +111,7 @@ def load(cls, path: Union[str, Path]) -> 'CrawlState': return state @staticmethod - def _crawl_result_to_dict(cr: CrawlResult) -> Dict: + def _crawl_result_to_dict(cr: CrawlResult) -> dict: """Convert CrawlResult to serializable dict""" # Extract markdown content safely markdown_content = "" @@ -128,7 +129,7 @@ def _crawl_result_to_dict(cr: CrawlResult) -> Dict: } @staticmethod - def _dict_to_crawl_result(d: Dict): + def _dict_to_crawl_result(d: dict): """Convert dict back to CrawlResult""" # Create a mock object that has the minimal interface we need class MockMarkdown: @@ -174,11 +175,11 @@ class AdaptiveConfig: # Persistence save_state: bool = False - state_path: Optional[str] = None + state_path: str | None = None # Embedding strategy parameters embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2" - embedding_llm_config: Optional[Dict] = None # Separate config for embeddings + embedding_llm_config: dict | None = None # Separate config for embeddings n_query_variations: int = 10 coverage_threshold: float = 0.85 alpha_shape_alpha: float = 0.5 @@ -261,7 +262,7 @@ async def calculate_confidence(self, state: CrawlState) -> float: pass @abstractmethod - async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: + async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> list[tuple[Link, float]]: """Rank pending links by expected information gain""" pass @@ -271,7 +272,7 @@ async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: pass @abstractmethod - async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: + async def update_state(self, state: CrawlState, new_results: list[CrawlResult]) -> None: """Update state with new crawl results""" pass @@ -388,7 +389,7 @@ def _calculate_saturation(self, state: CrawlState) -> float: return max(0.0, min(saturation, 1.0)) - async 
def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: + async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> list[tuple[Link, float]]: """Rank links by expected information gain""" scored_links = [] @@ -523,7 +524,7 @@ async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: return False - async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: + async def update_state(self, state: CrawlState, new_results: list[CrawlResult]) -> None: """Update state with new crawl results""" for result in new_results: # Track new terms @@ -573,7 +574,7 @@ async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) # Add to crawl order state.crawl_order.append(result.url) - def _tokenize(self, text: str) -> List[str]: + def _tokenize(self, text: str) -> list[str]: """Simple tokenization - can be enhanced""" # Remove punctuation and split text = re.sub(r'[^\w\s]', ' ', text) @@ -584,7 +585,7 @@ def _tokenize(self, text: str) -> List[str]: return tokens - def _get_document_terms(self, crawl_result: CrawlResult) -> List[str]: + def _get_document_terms(self, crawl_result: CrawlResult) -> list[str]: """Extract terms from a crawl result""" content = crawl_result.markdown.raw_markdown or "" return self._tokenize(content.lower()) @@ -593,7 +594,7 @@ def _get_document_terms(self, crawl_result: CrawlResult) -> List[str]: class EmbeddingStrategy(CrawlStrategy): """Embedding-based adaptive crawling using semantic space coverage""" - def __init__(self, embedding_model: str = None, llm_config: Dict = None): + def __init__(self, embedding_model: str = None, llm_config: dict = None): self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2" self.llm_config = llm_config self._embedding_cache = {} @@ -606,7 +607,7 @@ def __init__(self, embedding_model: str = None, llm_config: Dict = None): self._validation_embeddings_cache = None # Cache validation query embeddings self._kb_similarity_threshold = 0.95 # Threshold for deduplication - async def _get_embeddings(self, texts: List[str]) -> Any: + async def _get_embeddings(self, texts: list[str]) -> Any: """Get embeddings using configured method""" from .utils import get_text_embeddings embedding_llm_config = { @@ -665,7 +666,6 @@ def _get_cached_distance_matrix(self, query_embeddings: Any, kb_embeddings: Any) async def map_query_semantic_space(self, query: str, n_synthetic: int = 10) -> Any: """Generate a point cloud representing the semantic neighborhood of the query""" - from .utils import perform_completion_with_backoff # Generate more variations than needed for train/val split n_total = int(n_synthetic * 1.3) # Generate 30% more for validation @@ -759,7 +759,7 @@ def compute_coverage_shape(self, query_points: Any, alpha: float = 0.5): # Fallback if computation fails return None - def _sample_boundary_points(self, shape, n_samples: int = 20) -> List[Any]: + def _sample_boundary_points(self, shape, n_samples: int = 20) -> list[Any]: """Sample points from the boundary of a shape""" @@ -771,7 +771,7 @@ def _sample_boundary_points(self, shape, n_samples: int = 20) -> List[Any]: # This is a placeholder - actual implementation would depend on shape type return [] - def find_coverage_gaps(self, kb_embeddings: Any, query_embeddings: Any) -> List[Tuple[Any, float]]: + def find_coverage_gaps(self, kb_embeddings: Any, query_embeddings: Any) -> list[tuple[Any, float]]: """Calculate gap distances for all query variations using 
vectorized operations""" @@ -803,14 +803,14 @@ def find_coverage_gaps(self, kb_embeddings: Any, query_embeddings: Any) -> List[ async def select_links_for_expansion( self, - candidate_links: List[Link], - gaps: List[Tuple[Any, float]], + candidate_links: list[Link], + gaps: list[tuple[Any, float]], kb_embeddings: Any - ) -> List[Tuple[Link, float]]: + ) -> list[tuple[Link, float]]: """Select links that most efficiently fill the gaps""" - from .utils import cosine_distance, cosine_similarity, get_text_embeddings - import hashlib + + from .utils import cosine_distance, get_text_embeddings scored_links = [] @@ -1021,7 +1021,7 @@ async def calculate_confidence(self, state: CrawlState) -> float: # # For stopping criteria, return learning score # return float(learning_score) - async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: + async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> list[tuple[Link, float]]: """Main entry point for link ranking""" # Store config for use in other methods self.config = config @@ -1133,8 +1133,7 @@ async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: state.metrics['stopped_reason'] = 'converged_validated' self._validation_passed = True return True - else: - state.metrics['stopped_reason'] = 'low_validation' + state.metrics['stopped_reason'] = 'low_validation' # Continue crawling despite convergence return False @@ -1166,7 +1165,7 @@ def get_quality_confidence(self, state: CrawlState) -> float: return confidence - async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: + async def update_state(self, state: CrawlState, new_results: list[CrawlResult]) -> None: """Update embeddings and coverage metrics with deduplication""" from .utils import get_text_embeddings @@ -1232,9 +1231,9 @@ class AdaptiveCrawler: """Main adaptive crawler that orchestrates the crawling process""" def __init__(self, - crawler: Optional[AsyncWebCrawler] = None, - config: Optional[AdaptiveConfig] = None, - strategy: Optional[CrawlStrategy] = None): + crawler: AsyncWebCrawler | None = None, + config: AdaptiveConfig | None = None, + strategy: CrawlStrategy | None = None): self.crawler = crawler self.config = config or AdaptiveConfig() self.config.validate() @@ -1246,7 +1245,7 @@ def __init__(self, self.strategy = self._create_strategy(self.config.strategy) # Initialize state - self.state: Optional[CrawlState] = None + self.state: CrawlState | None = None # Track if we own the crawler (for cleanup) self._owns_crawler = crawler is None @@ -1255,18 +1254,17 @@ def _create_strategy(self, strategy_name: str) -> CrawlStrategy: """Create strategy instance based on name""" if strategy_name == "statistical": return StatisticalStrategy() - elif strategy_name == "embedding": + if strategy_name == "embedding": return EmbeddingStrategy( embedding_model=self.config.embedding_model, llm_config=self.config.embedding_llm_config ) - else: - raise ValueError(f"Unknown strategy: {strategy_name}") + raise ValueError(f"Unknown strategy: {strategy_name}") async def digest(self, start_url: str, query: str, - resume_from: Optional[str] = None) -> CrawlState: + resume_from: str | None = None) -> CrawlState: """Main entry point for adaptive crawling""" # Initialize or resume state if resume_from: @@ -1409,7 +1407,7 @@ async def digest(self, if self._owns_crawler and self.crawler: await self.crawler.__aexit__(None, None, None) - async def _crawl_with_preview(self, url: str, query: str) -> 
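# The hunks around here apply one mechanical change throughout the module:
# typing.List/Dict/Tuple/Optional/Union annotations are replaced with builtin
# generics and PEP 604 unions. A small before/after illustration of the pattern;
# the function itself is hypothetical and not part of the codebase.
from typing import Dict, List, Optional, Tuple   # legacy style, kept for contrast

def old_rank(links: List[str], scores: Optional[Dict[str, float]] = None) -> List[Tuple[str, float]]:
    scores = scores or {}
    return [(link, scores.get(link, 0.0)) for link in links]

def new_rank(links: list[str], scores: dict[str, float] | None = None) -> list[tuple[str, float]]:
    # Same behaviour, written with builtin generics and `X | None` (Python 3.10+).
    scores = scores or {}
    return [(link, scores.get(link, 0.0)) for link in links]

print(new_rank(["a", "b"], {"a": 1.0}))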
Optional[CrawlResult]: + async def _crawl_with_preview(self, url: str, query: str) -> CrawlResult | None: """Crawl a URL with link preview enabled""" config = CrawlerRunConfig( link_preview_config=LinkPreviewConfig( @@ -1441,7 +1439,7 @@ async def _crawl_with_preview(self, url: str, query: str) -> Optional[CrawlResul print(f"Error crawling {url}: {e}") return None - async def _crawl_batch(self, links_with_scores: List[Tuple[Link, float]], query: str) -> List[CrawlResult]: + async def _crawl_batch(self, links_with_scores: list[tuple[Link, float]], query: str) -> list[CrawlResult]: """Crawl multiple URLs in parallel""" tasks = [] for link, score in links_with_scores: @@ -1473,7 +1471,7 @@ def confidence(self) -> float: return 0.0 @property - def coverage_stats(self) -> Dict[str, Any]: + def coverage_stats(self) -> dict[str, Any]: """Detailed coverage statistics""" if not self.state: return {} @@ -1501,9 +1499,8 @@ def is_sufficient(self) -> bool: if isinstance(self.strategy, EmbeddingStrategy): # For embedding strategy, sufficient = validation passed return self.strategy._validation_passed - else: - # For statistical strategy, use threshold - return self.confidence >= self.config.confidence_threshold + # For statistical strategy, use threshold + return self.confidence >= self.config.confidence_threshold def print_stats(self, detailed: bool = False) -> None: """Print comprehensive statistics about the knowledge base @@ -1578,7 +1575,7 @@ def print_stats(self, detailed: bool = False) -> None: total_words = sum(self.state.term_frequencies.values()) unique_terms = len(self.state.term_frequencies) - print(f"\n[*] Content Statistics:") + print("\n[*] Content Statistics:") print(f" Total Content: {total_content_length:,} characters") print(f" Total Words: {total_words:,}") print(f" Unique Terms: {unique_terms:,}") @@ -1588,13 +1585,13 @@ def print_stats(self, detailed: bool = False) -> None: # Strategy-specific output if isinstance(self.strategy, EmbeddingStrategy): # Semantic coverage for embedding strategy - print(f"\n[*] Semantic Coverage Analysis:") + print("\n[*] Semantic Coverage Analysis:") print(f" Average Min Distance: {self.state.metrics.get('avg_min_distance', 0):.3f}") print(f" Avg Close Neighbors (< 0.3): {self.state.metrics.get('avg_close_neighbors', 0):.1f}") print(f" Avg Very Close Neighbors (< 0.2): {self.state.metrics.get('avg_very_close_neighbors', 0):.1f}") # Confidence metrics - print(f"\n[*] Confidence Metrics:") + print("\n[*] Confidence Metrics:") if self.is_sufficient: if use_rich: console.print(f" Overall Confidence: {self.confidence:.2%} [green][VALIDATED][/green]") @@ -1611,7 +1608,7 @@ def print_stats(self, detailed: bool = False) -> None: else: # Query coverage for statistical strategy - print(f"\n[*] Query Coverage:") + print("\n[*] Query Coverage:") query_terms = self.strategy._tokenize(self.state.query.lower()) for term in query_terms: tf = self.state.term_frequencies.get(term, 0) @@ -1628,7 +1625,7 @@ def print_stats(self, detailed: bool = False) -> None: print(f" '{term}': [X] not found") # Confidence metrics - print(f"\n[*] Confidence Metrics:") + print("\n[*] Confidence Metrics:") status = "[OK]" if self.is_sufficient else "[!!]" if use_rich: status_colored = "[green][OK][/green]" if self.is_sufficient else "[red][!!][/red]" @@ -1642,7 +1639,7 @@ def print_stats(self, detailed: bool = False) -> None: # Crawl efficiency if self.state.new_terms_history: avg_new_terms = sum(self.state.new_terms_history) / len(self.state.new_terms_history) - print(f"\n[*] Crawl 
Efficiency:") + print("\n[*] Crawl Efficiency:") print(f" Avg New Terms per Page: {avg_new_terms:.1f}") print(f" Information Saturation: {self.state.metrics.get('saturation', 0):.2%}") @@ -1693,7 +1690,7 @@ def print_stats(self, detailed: bool = False) -> None: if self.state.kb_embeddings is not None: print(f" Knowledge Embeddings: {self.state.kb_embeddings.shape}") else: - print(f" Knowledge Embeddings: None") + print(" Knowledge Embeddings: None") print(f" Semantic Gaps: {len(self.state.semantic_gaps)}") print(f" Coverage Achievement: {self.confidence:.2%}") @@ -1716,7 +1713,7 @@ def _get_content_from_result(self, result) -> str: return str(result.markdown) return "" - def export_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None: + def export_knowledge_base(self, filepath: str | Path, format: str = "jsonl") -> None: """Export the knowledge base to a file Args: @@ -1743,7 +1740,7 @@ def export_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl else: raise ValueError(f"Unsupported export format: {format}") - def _crawl_result_to_export_dict(self, result) -> Dict[str, Any]: + def _crawl_result_to_export_dict(self, result) -> dict[str, Any]: """Convert CrawlResult to a dictionary for export""" # Extract all available fields export_dict = { @@ -1780,7 +1777,7 @@ def _crawl_result_to_export_dict(self, result) -> Dict[str, Any]: return export_dict - def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None: + def import_knowledge_base(self, filepath: str | Path, format: str = "jsonl") -> None: """Import a knowledge base from a file Args: @@ -1793,7 +1790,7 @@ def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl if format == "jsonl": imported_results = [] - with open(filepath, 'r', encoding='utf-8') as f: + with open(filepath, encoding='utf-8') as f: for line in f: if line.strip(): data = json.loads(line) @@ -1815,7 +1812,7 @@ def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl else: raise ValueError(f"Unsupported import format: {format}") - def _import_dict_to_crawl_result(self, data: Dict[str, Any]): + def _import_dict_to_crawl_result(self, data: dict[str, Any]): """Convert imported dict back to a mock CrawlResult""" class MockMarkdown: def __init__(self, content): @@ -1832,7 +1829,7 @@ def __init__(self, data): return MockCrawlResult(data) - def get_relevant_content(self, top_k: int = 5) -> List[Dict[str, Any]]: + def get_relevant_content(self, top_k: int = 5) -> list[dict[str, Any]]: """Get most relevant content for the query""" if not self.state or not self.state.knowledge_base: return [] diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index a43b50a4a..e21ecf170 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1,47 +1,45 @@ +import inspect import os -from typing import Union +from collections.abc import Callable +from enum import Enum +from typing import Any, Union + +from crawl4ai.cache_client import DEFAULT_CACHE_TTL_SECONDS, CacheClient, NoCacheClient + +from .cache_context import CacheMode +from .chunking_strategy import ChunkingStrategy, RegexChunking from .config import ( DEFAULT_PROVIDER, DEFAULT_PROVIDER_API_KEY, - MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, - PROVIDER_MODELS, + IMAGE_SCORE_THRESHOLD, + MIN_WORD_THRESHOLD, + PAGE_TIMEOUT, PROVIDER_MODELS_PREFIXES, SCREENSHOT_HEIGHT_TRESHOLD, - PAGE_TIMEOUT, - IMAGE_SCORE_THRESHOLD, SOCIAL_MEDIA_DOMAINS, ) - -from .user_agent_generator 
import UAGen, ValidUAGenerator # , OnlineUAGenerator -from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy -from .chunking_strategy import ChunkingStrategy, RegexChunking - -from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy from .deep_crawling import DeepCrawlStrategy -from .table_extraction import TableExtractionStrategy, DefaultTableExtraction - -from .cache_context import CacheMode +from .extraction_strategy import ExtractionStrategy +from .markdown_generation_strategy import ( + DefaultMarkdownGenerator, + MarkdownGenerationStrategy, +) from .proxy_strategy import ProxyRotationStrategy - -from typing import Union, List, Callable -import inspect -from typing import Any, Dict, Optional -from enum import Enum +from .table_extraction import DefaultTableExtraction, TableExtractionStrategy +from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator # Type alias for URL matching -UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]] +UrlMatcher = Union[str, Callable[[str], bool], list[str | Callable[[str], bool]]] class MatchMode(Enum): OR = "or" AND = "and" -# from .proxy_strategy import ProxyConfig - -def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: +def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> dict: """ Recursively convert an object to a serializable dictionary using {type, params} structure for complex objects. @@ -173,7 +171,7 @@ def __init__( self, latitude: float, longitude: float, - accuracy: Optional[float] = 0.0 + accuracy: float | None = 0.0 ): """Configuration class for geolocation settings. @@ -187,7 +185,7 @@ def __init__( self.accuracy = accuracy @staticmethod - def from_dict(geo_dict: Dict) -> "GeolocationConfig": + def from_dict(geo_dict: dict) -> "GeolocationConfig": """Create a GeolocationConfig from a dictionary.""" return GeolocationConfig( latitude=geo_dict.get("latitude"), @@ -195,7 +193,7 @@ def from_dict(geo_dict: Dict) -> "GeolocationConfig": accuracy=geo_dict.get("accuracy", 0.0) ) - def to_dict(self) -> Dict: + def to_dict(self) -> dict: """Convert to dictionary representation.""" return { "latitude": self.latitude, @@ -220,9 +218,9 @@ class ProxyConfig: def __init__( self, server: str, - username: Optional[str] = None, - password: Optional[str] = None, - ip: Optional[str] = None, + username: str | None = None, + password: str | None = None, + ip: str | None = None, ): """Configuration class for a single proxy. 
@@ -239,16 +237,15 @@ def __init__( # Extract IP from server if not explicitly provided self.ip = ip or self._extract_ip_from_server() - def _extract_ip_from_server(self) -> Optional[str]: + def _extract_ip_from_server(self) -> str | None: """Extract IP address from server URL.""" try: # Simple extraction assuming http://ip:port format if "://" in self.server: parts = self.server.split("://")[1].split(":") return parts[0] - else: - parts = self.server.split(":") - return parts[0] + parts = self.server.split(":") + return parts[0] except Exception: return None @@ -264,17 +261,16 @@ def from_string(proxy_str: str) -> "ProxyConfig": password=password, ip=ip ) - elif len(parts) == 2: # ip:port only + if len(parts) == 2: # ip:port only ip, port = parts return ProxyConfig( server=f"http://{ip}:{port}", ip=ip ) - else: - raise ValueError(f"Invalid proxy string format: {proxy_str}") + raise ValueError(f"Invalid proxy string format: {proxy_str}") @staticmethod - def from_dict(proxy_dict: Dict) -> "ProxyConfig": + def from_dict(proxy_dict: dict) -> "ProxyConfig": """Create a ProxyConfig from a dictionary.""" return ProxyConfig( server=proxy_dict.get("server"), @@ -284,7 +280,7 @@ def from_dict(proxy_dict: Dict) -> "ProxyConfig": ) @staticmethod - def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]: + def from_env(env_var: str = "PROXIES") -> list["ProxyConfig"]: """Load proxies from environment variable. Args: @@ -304,7 +300,7 @@ def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]: print(f"Error loading proxies from environment: {e}") return proxies - def to_dict(self) -> Dict: + def to_dict(self) -> dict: """Convert to dictionary representation.""" return { "server": self.server, @@ -407,13 +403,13 @@ def __init__( chrome_channel: str = "chromium", channel: str = "chromium", proxy: str = None, - proxy_config: Union[ProxyConfig, dict, None] = None, + proxy_config: ProxyConfig | dict | None = None, viewport_width: int = 1080, viewport_height: int = 600, viewport: dict = None, accept_downloads: bool = False, downloads_path: str = None, - storage_state: Union[str, dict, None] = None, + storage_state: str | dict | None = None, ignore_https_errors: bool = True, java_script_enabled: bool = True, sleep_on_close: bool = False, @@ -631,7 +627,7 @@ def __init__( self, container_selector: str, scroll_count: int = 10, - scroll_by: Union[str, int] = "container_height", + scroll_by: str | int = "container_height", wait_after_scroll: float = 0.5, ): """ @@ -672,13 +668,13 @@ def __init__( self, include_internal: bool = True, include_external: bool = False, - include_patterns: Optional[List[str]] = None, - exclude_patterns: Optional[List[str]] = None, + include_patterns: list[str] | None = None, + exclude_patterns: list[str] | None = None, concurrency: int = 10, timeout: int = 5, max_links: int = 100, - query: Optional[str] = None, - score_threshold: Optional[float] = None, + query: str | None = None, + score_threshold: float | None = None, verbose: bool = False ): """ @@ -720,7 +716,7 @@ def __init__( raise ValueError("At least one of include_internal or include_external must be True") @staticmethod - def from_dict(config_dict: Dict[str, Any]) -> "LinkPreviewConfig": + def from_dict(config_dict: dict[str, Any]) -> "LinkPreviewConfig": """Create LinkPreviewConfig from dictionary (for backward compatibility).""" if not config_dict: return None @@ -738,7 +734,7 @@ def from_dict(config_dict: Dict[str, Any]) -> "LinkPreviewConfig": verbose=config_dict.get("verbose", False) ) - def to_dict(self) -> 
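# A standalone sketch of the proxy-string parsing handled by ProxyConfig.from_string
# above: either "ip:port:username:password" or plain "ip:port", normalised to an
# http:// server URL. This re-implements the logic for illustration instead of
# calling the class; the sample proxies are placeholders.
def parse_proxy(proxy_str: str) -> dict[str, str | None]:
    """Parse 'ip:port[:username:password]' into server/credential fields."""
    parts = proxy_str.split(":")
    if len(parts) == 4:                           # ip:port:username:password
        ip, port, username, password = parts
        return {"server": f"http://{ip}:{port}", "username": username,
                "password": password, "ip": ip}
    if len(parts) == 2:                           # ip:port only
        ip, port = parts
        return {"server": f"http://{ip}:{port}", "username": None,
                "password": None, "ip": ip}
    raise ValueError(f"Invalid proxy string format: {proxy_str}")

print(parse_proxy("10.0.0.1:8080"))
print(parse_proxy("10.0.0.1:8080:user:secret"))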
Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: """Convert to dictionary format.""" return { "include_internal": self.include_internal, @@ -764,18 +760,18 @@ class HTTPCrawlerConfig: """HTTP-specific crawler configuration""" method: str = "GET" - headers: Optional[Dict[str, str]] = None - data: Optional[Dict[str, Any]] = None - json: Optional[Dict[str, Any]] = None + headers: dict[str, str] | None = None + data: dict[str, Any] | None = None + json: dict[str, Any] | None = None follow_redirects: bool = True verify_ssl: bool = True def __init__( self, method: str = "GET", - headers: Optional[Dict[str, str]] = None, - data: Optional[Dict[str, Any]] = None, - json: Optional[Dict[str, Any]] = None, + headers: dict[str, str] | None = None, + data: dict[str, Any] | None = None, + json: dict[str, Any] | None = None, follow_redirects: bool = True, verify_ssl: bool = True, ): @@ -830,14 +826,7 @@ def load(data: dict) -> "HTTPCrawlerConfig": return config return HTTPCrawlerConfig.from_kwargs(config) -class CrawlerRunConfig(): - _UNWANTED_PROPS = { - 'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED', - 'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS', - 'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY', - 'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY', - } - +class CrawlerRunConfig: """ Configuration class for controlling how the crawler runs each crawl operation. This includes parameters for content extraction, page manipulation, waiting conditions, @@ -903,7 +892,7 @@ class CrawlerRunConfig(): fetch_ssl_certificate: bool = False, # Caching Parameters cache_mode (CacheMode or None): Defines how caching is handled. - If None, defaults to CacheMode.ENABLED internally. + If None, defaults to CacheMode.BYPASS internally. Default: CacheMode.BYPASS. session_id (str or None): Optional session ID to persist the browser context and the created page instance. 
If the ID already exists, the crawler does not @@ -1053,7 +1042,7 @@ def __init__( markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(), only_text: bool = False, css_selector: str = None, - target_elements: List[str] = None, + target_elements: list[str] = None, excluded_tags: list = None, excluded_selector: str = None, keep_data_attributes: bool = False, @@ -1062,21 +1051,19 @@ def __init__( prettiify: bool = False, parser_type: str = "lxml", scraping_strategy: ContentScrapingStrategy = None, - proxy_config: Union[ProxyConfig, dict, None] = None, - proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None, + proxy_config: ProxyConfig | dict | None = None, + proxy_rotation_strategy: ProxyRotationStrategy | None = None, # Browser Location and Identity Parameters - locale: Optional[str] = None, - timezone_id: Optional[str] = None, - geolocation: Optional[GeolocationConfig] = None, + locale: str | None = None, + timezone_id: str | None = None, + geolocation: GeolocationConfig | None = None, # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters cache_mode: CacheMode = CacheMode.BYPASS, + cache_client: CacheClient = NoCacheClient(), + cache_ttl_seconds: int = DEFAULT_CACHE_TTL_SECONDS, session_id: str = None, - bypass_cache: bool = False, - disable_cache: bool = False, - no_cache_read: bool = False, - no_cache_write: bool = False, shared_data: dict = None, # Page Navigation and Timing Parameters wait_until: str = "domcontentloaded", @@ -1089,13 +1076,13 @@ def __init__( max_range: float = 0.3, semaphore_count: int = 5, # Page Interaction Parameters - js_code: Union[str, List[str]] = None, - c4a_script: Union[str, List[str]] = None, + js_code: str | list[str] = None, + c4a_script: str | list[str] = None, js_only: bool = False, ignore_body_visibility: bool = True, scan_full_page: bool = False, scroll_delay: float = 0.2, - max_scroll_steps: Optional[int] = None, + max_scroll_steps: int | None = None, process_iframes: bool = False, remove_overlay_elements: bool = False, simulate_user: bool = False, @@ -1136,16 +1123,16 @@ def __init__( user_agent_mode: str = None, user_agent_generator_config: dict = {}, # Deep Crawl Parameters - deep_crawl_strategy: Optional[DeepCrawlStrategy] = None, + deep_crawl_strategy: DeepCrawlStrategy | None = None, # Link Extraction Parameters - link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None, + link_preview_config: LinkPreviewConfig | dict[str, Any] = None, # Virtual Scroll Parameters - virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None, + virtual_scroll_config: VirtualScrollConfig | dict[str, Any] = None, # URL Matching Parameters - url_matcher: Optional[UrlMatcher] = None, + url_matcher: UrlMatcher | None = None, match_mode: MatchMode = MatchMode.OR, # Experimental Parameters - experimental: Dict[str, Any] = None, + experimental: dict[str, Any] = None, ): # TODO: Planning to set properties dynamically based on the __init__ signature self.url = url @@ -1165,7 +1152,7 @@ def __init__( self.remove_forms = remove_forms self.prettiify = prettiify self.parser_type = parser_type - self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy() + self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy(cache_client=cache_client) self.proxy_config = proxy_config if isinstance(proxy_config, dict): self.proxy_config = ProxyConfig.from_dict(proxy_config) @@ -1184,11 +1171,9 @@ def __init__( # Caching Parameters self.cache_mode = cache_mode + self.cache_client = 
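# The removed bypass_cache/disable_cache/no_cache_read/no_cache_write booleans are
# replaced by cache_mode plus an injectable cache client and TTL. A hedged sketch
# of constructing a run config with the new parameters, using only names that
# appear in this diff (CacheMode, NoCacheClient, DEFAULT_CACHE_TTL_SECONDS); the
# behaviour of the cache backends themselves is not shown here and is assumed.
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.cache_client import DEFAULT_CACHE_TTL_SECONDS, NoCacheClient
from crawl4ai.cache_context import CacheMode

config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,                  # replaces the old bypass_cache=True
    cache_client=NoCacheClient(),                 # pluggable cache backend (new here)
    cache_ttl_seconds=DEFAULT_CACHE_TTL_SECONDS,
)
print(config.cache_mode, config.cache_ttl_seconds)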
cache_client + self.cache_ttl_seconds = cache_ttl_seconds self.session_id = session_id - self.bypass_cache = bypass_cache - self.disable_cache = disable_cache - self.no_cache_read = no_cache_read - self.no_cache_write = no_cache_write self.shared_data = shared_data # Page Navigation and Timing Parameters @@ -1384,12 +1369,12 @@ def is_match(self, url: str) -> bool: # Single function matcher return self.url_matcher(url) - elif isinstance(self.url_matcher, str): + if isinstance(self.url_matcher, str): # Single pattern string from fnmatch import fnmatch return fnmatch(url, self.url_matcher) - elif isinstance(self.url_matcher, list): + if isinstance(self.url_matcher, list): # List of mixed matchers if not self.url_matcher: # Empty list return False @@ -1408,29 +1393,16 @@ def is_match(self, url: str) -> bool: # Apply match mode logic if self.match_mode == MatchMode.OR: return any(results) if results else False - else: # AND mode - return all(results) if results else False + # AND mode + return all(results) if results else False return False def __getattr__(self, name): """Handle attribute access.""" - if name in self._UNWANTED_PROPS: - raise AttributeError(f"Getting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}") raise AttributeError(f"'{self.__class__.__name__}' has no attribute '{name}'") - def __setattr__(self, name, value): - """Handle attribute setting.""" - # TODO: Planning to set properties dynamically based on the __init__ signature - sig = inspect.signature(self.__init__) - all_params = sig.parameters # Dictionary of parameter names and their details - - if name in self._UNWANTED_PROPS and value is not all_params[name].default: - raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}") - - super().__setattr__(name, value) - @staticmethod def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": return CrawlerRunConfig( @@ -1460,11 +1432,9 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), # Caching Parameters cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS), + cache_client=kwargs.get("cache_client", None), + cache_ttl_seconds=kwargs.get("cache_ttl_seconds", DEFAULT_CACHE_TTL_SECONDS), session_id=kwargs.get("session_id"), - bypass_cache=kwargs.get("bypass_cache", False), - disable_cache=kwargs.get("disable_cache", False), - no_cache_read=kwargs.get("no_cache_read", False), - no_cache_write=kwargs.get("no_cache_write", False), shared_data=kwargs.get("shared_data", None), # Page Navigation and Timing Parameters wait_until=kwargs.get("wait_until", "domcontentloaded"), @@ -1579,11 +1549,9 @@ def to_dict(self): "geolocation": self.geolocation, "fetch_ssl_certificate": self.fetch_ssl_certificate, "cache_mode": self.cache_mode, + "cache_client": self.cache_client, + "cache_ttl_seconds": self.cache_ttl_seconds, "session_id": self.session_id, - "bypass_cache": self.bypass_cache, - "disable_cache": self.disable_cache, - "no_cache_read": self.no_cache_read, - "no_cache_write": self.no_cache_write, "shared_data": self.shared_data, "wait_until": self.wait_until, "page_timeout": self.page_timeout, @@ -1671,15 +1639,15 @@ class LLMConfig: def __init__( self, provider: str = DEFAULT_PROVIDER, - api_token: Optional[str] = None, - base_url: Optional[str] = None, - temperature: Optional[float] = None, - max_tokens: Optional[int] = None, - top_p: Optional[float] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - stop: Optional[List[str]] = None, - n: 
Optional[int] = None, + api_token: str | None = None, + base_url: str | None = None, + temperature: float | None = None, + max_tokens: int | None = None, + top_p: float | None = None, + frequency_penalty: float | None = None, + presence_penalty: float | None = None, + stop: list[str] | None = None, + n: int | None = None, ): """Configuaration class for LLM provider and API token.""" self.provider = provider @@ -1758,18 +1726,18 @@ class SeedingConfig: def __init__( self, source: str = "sitemap+cc", - pattern: Optional[str] = "*", + pattern: str | None = "*", live_check: bool = False, extract_head: bool = False, max_urls: int = -1, concurrency: int = 1000, hits_per_sec: int = 5, force: bool = False, - base_directory: Optional[str] = None, - llm_config: Optional[LLMConfig] = None, - verbose: Optional[bool] = None, - query: Optional[str] = None, - score_threshold: Optional[float] = None, + base_directory: str | None = None, + llm_config: LLMConfig | None = None, + verbose: bool | None = None, + query: str | None = None, + score_threshold: float | None = None, scoring_method: str = "bm25", filter_nonsense_urls: bool = True, ): @@ -1825,14 +1793,17 @@ def __init__( self.filter_nonsense_urls = filter_nonsense_urls # Add to_dict, from_kwargs, and clone methods for consistency - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: return {k: v for k, v in self.__dict__.items() if k != 'llm_config' or v is not None} @staticmethod - def from_kwargs(kwargs: Dict[str, Any]) -> 'SeedingConfig': + def from_kwargs(kwargs: dict[str, Any]) -> 'SeedingConfig': return SeedingConfig(**kwargs) def clone(self, **kwargs: Any) -> 'SeedingConfig': config_dict = self.to_dict() config_dict.update(kwargs) return SeedingConfig.from_kwargs(config_dict) + +def create_llm_config(*args, **kwargs) -> LLMConfig: + return LLMConfig(*args, **kwargs) \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 943867d0b..7c4fc60a0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -2,35 +2,42 @@ import asyncio import base64 +import contextlib +import hashlib +import os import time +import uuid from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Union -from typing import Optional, AsyncGenerator, Final -import os -from playwright.async_api import Page, Error -from playwright.async_api import TimeoutError as PlaywrightTimeoutError +from collections.abc import AsyncGenerator, Callable +from functools import partial from io import BytesIO +from types import MappingProxyType +from typing import Any, Final +from urllib.parse import urlparse + +import aiofiles +import aiohttp +import chardet +from aiohttp.client import ClientTimeout from PIL import Image, ImageDraw, ImageFont -import hashlib -import uuid +from playwright.async_api import Error, Page +from playwright.async_api import TimeoutError as PlaywrightTimeoutError + +from .async_configs import ( + BrowserConfig, + CrawlerRunConfig, + HTTPCrawlerConfig, + VirtualScrollConfig, +) +from .async_logger import AsyncLogger +from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter +from .browser_manager import BrowserManager +from .config import SCREENSHOT_HEIGHT_TRESHOLD from .js_snippet import load_js_script from .models import AsyncCrawlResponse -from .config import SCREENSHOT_HEIGHT_TRESHOLD -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig -from .async_logger import 
AsyncLogger from .ssl_certificate import SSLCertificate from .user_agent_generator import ValidUAGenerator -from .browser_manager import BrowserManager -from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter -import aiofiles -import aiohttp -import chardet -from aiohttp.client import ClientTimeout -from urllib.parse import urlparse -from types import MappingProxyType -import contextlib -from functools import partial class AsyncCrawlerStrategy(ABC): """ @@ -200,8 +207,7 @@ async def execute_hook(self, hook_type: str, *args, **kwargs): if hook: if asyncio.iscoroutinefunction(hook): return await hook(*args, **kwargs) - else: - return hook(*args, **kwargs) + return hook(*args, **kwargs) return args[0] if args else None def update_user_agent(self, user_agent: str): @@ -216,7 +222,7 @@ def update_user_agent(self, user_agent: str): """ self.user_agent = user_agent - def set_custom_headers(self, headers: Dict[str, str]): + def set_custom_headers(self, headers: dict[str, str]): """ Set custom headers for the browser. @@ -252,7 +258,7 @@ async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): # Explicitly specified JavaScript js_code = wait_for[3:].strip() return await self.csp_compliant_wait(page, js_code, timeout) - elif wait_for.startswith("css:"): + if wait_for.startswith("css:"): # Explicitly specified CSS selector css_selector = wait_for[4:].strip() try: @@ -262,35 +268,32 @@ async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): raise TimeoutError( f"Timeout after {timeout}ms waiting for selector '{css_selector}'" ) - else: - raise ValueError(f"Invalid CSS selector: '{css_selector}'") + raise ValueError(f"Invalid CSS selector: '{css_selector}'") else: # Auto-detect based on content if wait_for.startswith("()") or wait_for.startswith("function"): # It's likely a JavaScript function return await self.csp_compliant_wait(page, wait_for, timeout) - else: - # Assume it's a CSS selector first + # Assume it's a CSS selector first + try: + await page.wait_for_selector(wait_for, timeout=timeout) + except Error as e: + if "Timeout" in str(e): + raise TimeoutError( + f"Timeout after {timeout}ms waiting for selector '{wait_for}'" + ) + # If it's not a timeout error, it might be an invalid selector + # Let's try to evaluate it as a JavaScript function as a fallback try: - await page.wait_for_selector(wait_for, timeout=timeout) - except Error as e: - if "Timeout" in str(e): - raise TimeoutError( - f"Timeout after {timeout}ms waiting for selector '{wait_for}'" - ) - else: - # If it's not a timeout error, it might be an invalid selector - # Let's try to evaluate it as a JavaScript function as a fallback - try: - return await self.csp_compliant_wait( - page, f"() => {{{wait_for}}}", timeout - ) - except Error: - raise ValueError( - f"Invalid wait_for parameter: '{wait_for}'. " - "It should be either a valid CSS selector, a JavaScript function, " - "or explicitly prefixed with 'js:' or 'css:'." - ) + return await self.csp_compliant_wait( + page, f"() => {{{wait_for}}}", timeout + ) + except Error: + raise ValueError( + f"Invalid wait_for parameter: '{wait_for}'. " + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'." 
+ ) async def csp_compliant_wait( self, page: Page, user_wait_function: str, timeout: float = 30000 @@ -452,7 +455,7 @@ async def crawl( if url.startswith(("http://", "https://", "view-source:")): return await self._crawl_web(url, config) - elif url.startswith("file://"): + if url.startswith("file://"): # initialize empty lists for console messages captured_console = [] @@ -460,7 +463,7 @@ async def crawl( local_file_path = url[7:] # Remove 'file://' prefix if not os.path.exists(local_file_path): raise FileNotFoundError(f"Local file not found: {local_file_path}") - with open(local_file_path, "r", encoding="utf-8") as f: + with open(local_file_path, encoding="utf-8") as f: html = f.read() if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) @@ -482,7 +485,7 @@ async def crawl( # Fix: Check for "raw://" first, then "raw:" # Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:] ##### - elif url.startswith("raw://") or url.startswith("raw:"): + if url.startswith("raw://") or url.startswith("raw:"): # Process raw HTML content # raw_html = url[4:] if url[:4] == "raw:" else url[7:] raw_html = url[6:] if url.startswith("raw://") else url[4:] @@ -496,10 +499,9 @@ async def crawl( screenshot=screenshot_data, get_delayed_content=None, ) - else: - raise ValueError( - "URL must start with 'http://', 'https://', 'file://', or 'raw:'" - ) + raise ValueError( + "URL must start with 'http://', 'https://', 'file://', or 'raw:'" + ) async def _crawl_web( self, url: str, config: CrawlerRunConfig @@ -592,7 +594,7 @@ async def handle_response_capture(response): # body = await response.body() # json_body = await response.json() text_body = await response.text() - except Exception as e: + except Exception: body = None # json_body = None # text_body = None @@ -720,12 +722,6 @@ async def handle_request_failed_capture(request): status_code = first_resp.status response_headers = first_resp.headers - # if response is None: - # status_code = 200 - # response_headers = {} - # else: - # status_code = response.status - # response_headers = response.headers else: status_code = 200 @@ -767,48 +763,6 @@ async def handle_request_failed_capture(request): if not config.ignore_body_visibility: raise Error(f"Body element is hidden: {visibility_info}") - # try: - # await page.wait_for_selector("body", state="attached", timeout=30000) - - # await page.wait_for_function( - # """ - # () => { - # const body = document.body; - # const style = window.getComputedStyle(body); - # return style.display !== 'none' && - # style.visibility !== 'hidden' && - # style.opacity !== '0'; - # } - # """, - # timeout=30000, - # ) - # except Error as e: - # visibility_info = await page.evaluate( - # """ - # () => { - # const body = document.body; - # const style = window.getComputedStyle(body); - # return { - # display: style.display, - # visibility: style.visibility, - # opacity: style.opacity, - # hasContent: body.innerHTML.length, - # classList: Array.from(body.classList) - # } - # } - # """ - # ) - - # if self.config.verbose: - # self.logger.debug( - # message="Body visibility info: {info}", - # tag="DEBUG", - # params={"info": visibility_info}, - # ) - - # if not config.ignore_body_visibility: - # raise Error(f"Body element is hidden: {visibility_info}") - # Handle content loading and viewport adjustment if not self.browser_config.text_mode and ( config.wait_for_images or config.adjust_viewport_to_content @@ -835,12 +789,6 @@ async def 
handle_request_failed_capture(request): dimensions = await self.get_page_dimensions(page) page_height = dimensions["height"] page_width = dimensions["width"] - # page_width = await page.evaluate( - # "document.documentElement.scrollWidth" - # ) - # page_height = await page.evaluate( - # "document.documentElement.scrollHeight" - # ) target_width = self.browser_config.viewport_width target_height = int(target_width * page_width / page_height * 0.95) @@ -876,16 +824,7 @@ async def handle_request_failed_capture(request): if config.virtual_scroll_config: await self._handle_virtual_scroll(page, config.virtual_scroll_config) - # Execute JavaScript if provided - # if config.js_code: - # if isinstance(config.js_code, str): - # await page.evaluate(config.js_code) - # elif isinstance(config.js_code, list): - # for js in config.js_code: - # await page.evaluate(js) - if config.js_code: - # execution_result = await self.execute_user_script(page, config.js_code) execution_result = await self.robust_execute_user_script( page, config.js_code ) @@ -970,14 +909,12 @@ async def handle_request_failed_capture(request): print(f"Warning: Could not get content for selector '{selector}': {str(e)}") # Wrap in a div to create a valid HTML structure - html = f"
\n" + "\n".join(html_parts) + "\n
" + html = "
\n" + "\n".join(html_parts) + "\n
" except Error as e: raise RuntimeError(f"Failed to extract HTML content: {str(e)}") else: html = await page.content() - # # Get final HTML content - # html = await page.content() await self.execute_hook( "before_return_html", page=page, html=html, context=context, config=config ) @@ -1073,7 +1010,7 @@ async def get_delayed_content(delay: float = 5.0) -> str: await page.close() # async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): - async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None): + async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: int | None = None): """ Helper method to handle full page scanning. @@ -1151,7 +1088,7 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, ma # await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await self.safe_scroll(page, 0, total_height) - async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"): + async def _handle_virtual_scroll(self, page: Page, config: VirtualScrollConfig): """ Handle virtual scroll containers (e.g., Twitter-like feeds) by capturing content at different scroll positions and merging unique elements. @@ -1418,7 +1355,7 @@ async def export_pdf(self, page: Page) -> bytes: pdf_data = await page.pdf(print_background=True) return pdf_data - async def capture_mhtml(self, page: Page) -> Optional[str]: + async def capture_mhtml(self, page: Page) -> str | None: """ Captures the current page as MHTML using CDP. @@ -1484,7 +1421,7 @@ async def capture_mhtml(self, page: Page) -> Optional[str]: async def _capture_console_messages( self, page: Page, file_path: str - ) -> List[Dict[str, Union[str, float]]]: + ) -> list[dict[str, str | float]]: """ Captures console messages from the page. Args: @@ -1534,9 +1471,8 @@ async def take_screenshot(self, page, **kwargs) -> str: if not need_scroll: # Page is short enough, just take a screenshot return await self.take_screenshot_naive(page) - else: - # Page is too long, try to take a full-page screenshot - return await self.take_screenshot_scroller(page, **kwargs) + # Page is too long, try to take a full-page screenshot + return await self.take_screenshot_scroller(page, **kwargs) # return await self.take_screenshot_from_pdf(await self.export_pdf(page)) async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str: @@ -1725,15 +1661,14 @@ async def export_storage_state(self, path: str = None) -> dict: params={"path": path}, ) return state - else: - self.logger.warning( - message="No default_context available to export storage state.", - tag="WARNING", - ) + self.logger.warning( + message="No default_context available to export storage state.", + tag="WARNING", + ) async def robust_execute_user_script( - self, page: Page, js_code: Union[str, List[str]] - ) -> Dict[str, Any]: + self, page: Page, js_code: str | list[str] + ) -> dict[str, Any]: """ Executes user-provided JavaScript code with proper error handling and context, supporting both synchronous and async user code, plus navigations. @@ -1882,8 +1817,8 @@ async def robust_execute_user_script( return {"success": False, "error": str(e)} async def execute_user_script( - self, page: Page, js_code: Union[str, List[str]] - ) -> Dict[str, Any]: + self, page: Page, js_code: str | list[str] + ) -> dict[str, Any]: """ Executes user-provided JavaScript code with proper error handling and context. 
@@ -2020,7 +1955,7 @@ async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1): await page.wait_for_timeout(delay * 1000) return result - async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: + async def csp_scroll_to(self, page: Page, x: int, y: int) -> dict[str, Any]: """ Performs a CSP-compliant scroll operation and returns the result status. @@ -2174,8 +2109,8 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): def __init__( self, - browser_config: Optional[HTTPCrawlerConfig] = None, - logger: Optional[AsyncLogger] = None, + browser_config: HTTPCrawlerConfig | None = None, + logger: AsyncLogger | None = None, max_connections: int = DEFAULT_MAX_CONNECTIONS, dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL, chunk_size: int = DEFAULT_CHUNK_SIZE @@ -2186,7 +2121,7 @@ def __init__( self.max_connections = max_connections self.dns_cache_ttl = dns_cache_ttl self.chunk_size = chunk_size - self._session: Optional[aiohttp.ClientSession] = None + self._session: aiohttp.ClientSession | None = None self.hooks = { k: partial(self._execute_hook, k) @@ -2250,7 +2185,7 @@ async def close(self) -> None: if self._session and not self._session.closed: try: await asyncio.wait_for(self._session.close(), timeout=5.0) - except asyncio.TimeoutError: + except TimeoutError: if self.logger: self.logger.warning( message="Session cleanup timed out", @@ -2259,7 +2194,7 @@ async def close(self) -> None: finally: self._session = None - async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]: + async def _stream_file(self, path: str) -> AsyncGenerator[memoryview]: async with aiofiles.open(path, mode='rb') as f: while chunk := await f.read(self.chunk_size): yield memoryview(chunk) @@ -2364,7 +2299,7 @@ async def _handle_http( async def crawl( self, url: str, - config: Optional[CrawlerRunConfig] = None, + config: CrawlerRunConfig | None = None, **kwargs ) -> AsyncCrawlResponse: config = config or CrawlerRunConfig.from_kwargs(kwargs) @@ -2378,10 +2313,10 @@ async def crawl( try: if scheme == 'file': return await self._handle_file(parsed.path) - elif scheme == 'raw': + if scheme == 'raw': return await self._handle_raw(parsed.path) - else: # http or https - return await self._handle_http(url, config) + # http or https + return await self._handle_http(url, config) except Exception as e: if self.logger: diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index a41ca97f0..c8a7d4f3d 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -1,21 +1,23 @@ -import os -from pathlib import Path -import aiosqlite import asyncio -from typing import Optional, Dict +import json +import os from contextlib import asynccontextmanager -import json -from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown -import aiofiles -from .async_logger import AsyncLogger +from typing import Dict, Optional -from .utils import ensure_content_dirs, generate_content_hash -from .utils import VersionManager -from .utils import get_error_context, create_box_message +import aiofiles +import aiosqlite -base_directory = DB_PATH = os.path.join( - os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai" +from .async_logger import AsyncLogger +from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown +from .utils import ( + VersionManager, + ensure_content_dirs, + generate_content_hash, + get_error_context, + get_home_folder, ) + +base_directory = DB_PATH = get_home_folder() os.makedirs(DB_PATH, exist_ok=True) 
DB_PATH = os.path.join(base_directory, "crawl4ai.db") @@ -33,7 +35,7 @@ def __init__(self, pool_size: int = 10, max_retries: int = 3): self._initialized = False self.version_manager = VersionManager() self.logger = AsyncLogger( - log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"), + log_file=os.path.join(base_directory, "crawler_db.log"), verbose=False, tag_width=10, ) diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index 5bb1a47c0..1b46e5e5d 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -1,43 +1,38 @@ -from typing import Dict, Optional, List, Tuple, Union +import asyncio +import random +import time +import uuid +from abc import ABC, abstractmethod +from collections.abc import AsyncGenerator +from urllib.parse import urlparse + +import psutil + from .async_configs import CrawlerRunConfig +from .components.crawler_monitor import CrawlerMonitor from .models import ( - CrawlResult, CrawlerTaskResult, + CrawlResult, CrawlStatus, DomainState, ) - -from .components.crawler_monitor import CrawlerMonitor - from .types import AsyncWebCrawler - -from collections.abc import AsyncGenerator - -import time -import psutil -import asyncio -import uuid - -from urllib.parse import urlparse -import random -from abc import ABC, abstractmethod - from .utils import get_true_memory_usage_percent class RateLimiter: def __init__( self, - base_delay: Tuple[float, float] = (1.0, 3.0), + base_delay: tuple[float, float] = (1.0, 3.0), max_delay: float = 60.0, max_retries: int = 3, - rate_limit_codes: List[int] = None, + rate_limit_codes: list[int] = None, ): self.base_delay = base_delay self.max_delay = max_delay self.max_retries = max_retries self.rate_limit_codes = rate_limit_codes or [429, 503] - self.domains: Dict[str, DomainState] = {} + self.domains: dict[str, DomainState] = {} def get_domain(self, url: str) -> str: return urlparse(url).netloc @@ -89,16 +84,16 @@ def update_delay(self, url: str, status_code: int) -> bool: class BaseDispatcher(ABC): def __init__( self, - rate_limiter: Optional[RateLimiter] = None, - monitor: Optional[CrawlerMonitor] = None, + rate_limiter: RateLimiter | None = None, + monitor: CrawlerMonitor | None = None, ): self.crawler = None - self._domain_last_hit: Dict[str, float] = {} + self._domain_last_hit: dict[str, float] = {} self.concurrent_sessions = 0 self.rate_limiter = rate_limiter self.monitor = monitor - def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]: + def select_config(self, url: str, configs: CrawlerRunConfig | list[CrawlerRunConfig]) -> CrawlerRunConfig | None: """Select the appropriate config for a given URL. 
Args: @@ -128,20 +123,20 @@ def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerR async def crawl_url( self, url: str, - config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], + config: CrawlerRunConfig | list[CrawlerRunConfig], task_id: str, - monitor: Optional[CrawlerMonitor] = None, + monitor: CrawlerMonitor | None = None, ) -> CrawlerTaskResult: pass @abstractmethod async def run_urls( self, - urls: List[str], + urls: list[str], crawler: AsyncWebCrawler, # noqa: F821 - config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], - monitor: Optional[CrawlerMonitor] = None, - ) -> List[CrawlerTaskResult]: + config: CrawlerRunConfig | list[CrawlerRunConfig], + monitor: CrawlerMonitor | None = None, + ) -> list[CrawlerTaskResult]: pass @@ -154,9 +149,9 @@ def __init__( check_interval: float = 1.0, max_session_permit: int = 20, fairness_timeout: float = 600.0, # 10 minutes before prioritizing long-waiting URLs - memory_wait_timeout: Optional[float] = 600.0, - rate_limiter: Optional[RateLimiter] = None, - monitor: Optional[CrawlerMonitor] = None, + memory_wait_timeout: float | None = 600.0, + rate_limiter: RateLimiter | None = None, + monitor: CrawlerMonitor | None = None, ): super().__init__(rate_limiter, monitor) self.memory_threshold_percent = memory_threshold_percent @@ -170,7 +165,7 @@ def __init__( self.task_queue = asyncio.PriorityQueue() # Priority queue for better management self.memory_pressure_mode = False # Flag to indicate when we're in memory pressure mode self.current_memory_percent = 0.0 # Track current memory usage - self._high_memory_start_time: Optional[float] = None + self._high_memory_start_time: float | None = None async def _memory_monitor_task(self): """Background task to continuously monitor memory usage and update state""" @@ -228,7 +223,7 @@ def _get_priority_score(self, wait_time: float, retry_count: int) -> float: async def crawl_url( self, url: str, - config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], + config: CrawlerRunConfig | list[CrawlerRunConfig], task_id: str, retry_count: int = 0, ) -> CrawlerTaskResult: @@ -373,10 +368,10 @@ async def crawl_url( async def run_urls( self, - urls: List[str], + urls: list[str], crawler: AsyncWebCrawler, - config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], - ) -> List[CrawlerTaskResult]: + config: CrawlerRunConfig | list[CrawlerRunConfig], + ) -> list[CrawlerTaskResult]: self.crawler = crawler # Start the memory monitor task @@ -461,6 +456,7 @@ async def run_urls( except Exception as e: if self.monitor: self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}") + raise e finally: # Clean up @@ -499,7 +495,7 @@ async def _update_queue_priorities(self): if self.monitor and task_id in self.monitor.stats: self.monitor.update_task(task_id, wait_time=wait_time) - except asyncio.TimeoutError: + except TimeoutError: # Queue might be empty or very slow break except Exception as e: @@ -529,10 +525,10 @@ async def _update_queue_priorities(self): async def run_urls_stream( self, - urls: List[str], + urls: list[str], crawler: AsyncWebCrawler, - config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], - ) -> AsyncGenerator[CrawlerTaskResult, None]: + config: CrawlerRunConfig | list[CrawlerRunConfig], + ) -> AsyncGenerator[CrawlerTaskResult]: self.crawler = crawler # Start the memory monitor task @@ -625,8 +621,8 @@ def __init__( self, semaphore_count: int = 5, max_session_permit: int = 20, - rate_limiter: Optional[RateLimiter] = None, - monitor: Optional[CrawlerMonitor] = None, + rate_limiter: 
RateLimiter | None = None, + monitor: CrawlerMonitor | None = None, ): super().__init__(rate_limiter, monitor) self.semaphore_count = semaphore_count @@ -635,7 +631,7 @@ def __init__( async def crawl_url( self, url: str, - config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], + config: CrawlerRunConfig | list[CrawlerRunConfig], task_id: str, semaphore: asyncio.Semaphore = None, ) -> CrawlerTaskResult: @@ -746,9 +742,9 @@ async def crawl_url( async def run_urls( self, crawler: AsyncWebCrawler, # noqa: F821 - urls: List[str], - config: Union[CrawlerRunConfig, List[CrawlerRunConfig]], - ) -> List[CrawlerTaskResult]: + urls: list[str], + config: CrawlerRunConfig | list[CrawlerRunConfig], + ) -> list[CrawlerTaskResult]: self.crawler = crawler if self.monitor: self.monitor.start() diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py index d25647979..8dcf2c76b 100644 --- a/crawl4ai/async_url_seeder.py +++ b/crawl4ai/async_url_seeder.py @@ -14,26 +14,29 @@ """ from __future__ import annotations -import aiofiles + import asyncio +import fnmatch import gzip import hashlib -import io import json -import os -import pathlib import re -import time -from datetime import timedelta -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Sequence, Union +from collections.abc import Sequence +from typing import Any from urllib.parse import quote, urljoin import httpx -import fnmatch + +from crawl4ai.cache_client import ( + DEFAULT_CACHE_TTL_SECONDS, + URL_SEEDER_CACHE_KEY_PREFIX, + CacheClient, + NoCacheClient, +) + try: - from lxml import html as lxml_html from lxml import etree + from lxml import html as lxml_html LXML = True except ImportError: LXML = False @@ -52,20 +55,17 @@ # Assuming crawl4ai/async_logger.py defines AsyncLoggerBase # You might need to adjust this import based on your exact file structure # Import AsyncLogger for default if needed -from .async_logger import AsyncLoggerBase, AsyncLogger - # Import SeedingConfig for type hints from typing import TYPE_CHECKING + +from .async_logger import AsyncLoggerBase + if TYPE_CHECKING: from .async_configs import SeedingConfig # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ consts COLLINFO_URL = "https://index.commoncrawl.org/collinfo.json" -# CACHE_DIR = pathlib.Path("~/.crawl4ai").expanduser() # REMOVED: now managed by __init__ -# CACHE_DIR.mkdir(exist_ok=True) # REMOVED: now managed by __init__ -# INDEX_CACHE = CACHE_DIR / "latest_cc_index.txt" # REMOVED: now managed by __init__ -TTL = timedelta(days=7) # Keeping this constant as it's a seeder-specific TTL _meta_rx = re.compile( r']*?(?:name|property|http-equiv)\s*=\s*["\']?([^"\' >]+)[^>]*?content\s*=\s*["\']?([^"\' >]+)[^>]*?)\/?>', @@ -86,7 +86,7 @@ def _match(url: str, pattern: str) -> bool: or (canon.startswith("www.") and fnmatch.fnmatch(canon[4:], pattern))) -def _parse_head(src: str) -> Dict[str, Any]: +def _parse_head(src: str) -> dict[str, Any]: if LXML: try: if isinstance(src, str): @@ -95,7 +95,7 @@ def _parse_head(src: str) -> Dict[str, Any]: doc = lxml_html.fromstring(src) except (ValueError, etree.ParserError): return {} # malformed, bail gracefully - info: Dict[str, Any] = { + info: dict[str, Any] = { "title": (doc.find(".//title").text or "").strip() if doc.find(".//title") is not None else None, "charset": None, @@ -133,7 +133,7 @@ def _parse_head(src: str) 
-> Dict[str, Any]: info["lang"] = html_elem.attrib.get("lang", "") return info # regex fallback - info: Dict[str, Any] = {"title": None, "charset": None, + info: dict[str, Any] = {"title": None, "charset": None, "meta": {}, "link": {}, "jsonld": [], "lang": ""} m = _title_rx.search(src) info["title"] = m.group(1).strip() if m else None @@ -197,36 +197,24 @@ class AsyncUrlSeeder: def __init__( self, - ttl: timedelta = TTL, - client: Optional[httpx.AsyncClient] = None, - logger: Optional[AsyncLoggerBase] = None, # NEW: Add logger parameter - # NEW: Add base_directory - base_directory: Optional[Union[str, pathlib.Path]] = None, - cache_root: Optional[Union[str, Path]] = None, + cache_client: CacheClient = NoCacheClient(), + ttl_seconds: int | None = DEFAULT_CACHE_TTL_SECONDS, + client: httpx.AsyncClient | None = None, + logger: AsyncLoggerBase | None = None, ): - self.ttl = ttl + self.cache_client = cache_client + self.ttl_seconds = ttl_seconds self._owns_client = client is None # Track if we created the client self.client = client or httpx.AsyncClient(http2=True, timeout=20, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36" }) self.logger = logger # Store the logger instance - self.base_directory = pathlib.Path(base_directory or os.getenv( - "CRAWL4_AI_BASE_DIRECTORY", Path.home())) # Resolve base_directory - self.cache_dir = self.base_directory / ".crawl4ai" / \ - "seeder_cache" # NEW: Specific cache dir for seeder - self.cache_dir.mkdir(parents=True, exist_ok=True) # Ensure it exists - self.index_cache_path = self.cache_dir / \ - "latest_cc_index.txt" # NEW: Index cache path + # Common Crawl index cache key + self.cc_index_cache_key = f"{URL_SEEDER_CACHE_KEY_PREFIX}latest_cc_index" # defer โ€“ grabbing the index inside an active loop blows up - self.index_id: Optional[str] = None - self._rate_sem: Optional[asyncio.Semaphore] = None - - # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ cache dirs โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - self.cache_root = Path(os.path.expanduser( - cache_root or "~/.cache/url_seeder")) - (self.cache_root / "live").mkdir(parents=True, exist_ok=True) - (self.cache_root / "head").mkdir(exist_ok=True) + self.index_id: str | None = None + self._rate_sem: asyncio.Semaphore | None = None def _log(self, level: str, message: str, tag: str = "URL_SEED", **kwargs: Any): """Helper to log messages using the provided logger, if available.""" @@ -235,39 +223,31 @@ def _log(self, level: str, message: str, tag: str = "URL_SEED", **kwargs: Any): if log_method: log_method(message=message, tag=tag, params=kwargs.get('params', {})) - # else: # Fallback for unknown level, should not happen with AsyncLoggerBase - # print(f"[{tag}] {level.upper()}: {message.format(**kwargs)}") # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ cache helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - def _cache_path(self, kind: str, url: str) -> Path: + def _get_cache_key(self, kind: str, url: str) -> str: h = hashlib.sha1(url.encode()).hexdigest() - return self.cache_root / kind / f"{h}.json" + return f"{URL_SEEDER_CACHE_KEY_PREFIX}{kind}:{h}" - async def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]: - p = self._cache_path(kind, url) - if not p.exists(): - return None - if time.time()-p.stat().st_mtime > self.ttl.total_seconds(): - return None - try: - async with aiofiles.open(p, "r") as f: - return json.loads(await f.read()) - except Exception: - return None + async def _cache_get(self, kind: str, url: str) -> dict[str, Any] | None: + cached = 
self.cache_client.get( + key=self._get_cache_key(kind, url) + ) + return json.loads(cached) if cached else None - async def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None: - try: - async with aiofiles.open(self._cache_path(kind, url), "w") as f: - await f.write(json.dumps(data, separators=(",", ":"))) - except Exception: - pass + async def _cache_set(self, kind: str, url: str, data: dict[str, Any]) -> None: + self.cache_client.set( + key=self._get_cache_key(kind, url), + value=json.dumps(data, separators=(",", ":")), + ttl_seconds=int(self.ttl_seconds) + ) # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ discovery entry async def urls(self, domain: str, - config: "SeedingConfig", - ) -> List[Dict[str, Any]]: + config: SeedingConfig, + ) -> list[dict[str, Any]]: """ Fetch URLs for a domain using configuration from SeedingConfig. @@ -364,7 +344,7 @@ async def producer(): producer_done.set() self._log("debug", "Producer finished.", tag="URL_SEED") - async def worker(res_list: List[Dict[str, Any]]): + async def worker(res_list: list[dict[str, Any]]): while True: if queue.empty() and producer_done.is_set(): # self._log("debug", "Worker exiting: queue empty and producer done.", tag="URL_SEED") @@ -372,7 +352,7 @@ async def worker(res_list: List[Dict[str, Any]]): try: # Increased timeout slightly url = await asyncio.wait_for(queue.get(), 5) - except asyncio.TimeoutError: + except TimeoutError: continue # Keep checking queue and producer_done status except Exception as e: self._log("error", "Worker failed to get URL from queue: {error}", params={ @@ -412,7 +392,7 @@ async def worker(res_list: List[Dict[str, Any]]): queue.task_done() # Mark task as done for queue.join() if ever used # launch - results: List[Dict[str, Any]] = [] + results: list[dict[str, Any]] = [] prod_task = asyncio.create_task(producer()) workers = [asyncio.create_task(worker(results)) for _ in range(concurrency)] @@ -450,8 +430,8 @@ async def worker(res_list: List[Dict[str, Any]]): async def many_urls( self, domains: Sequence[str], - config: "SeedingConfig", - ) -> Dict[str, List[Dict[str, Any]]]: + config: SeedingConfig, + ) -> dict[str, list[dict[str, Any]]]: """ Fetch URLs for many domains in parallel. @@ -484,11 +464,11 @@ async def many_urls( async def extract_head_for_urls( self, - urls: List[str], - config: Optional["SeedingConfig"] = None, + urls: list[str], + config: SeedingConfig | None = None, concurrency: int = 10, timeout: int = 5 - ) -> List[Dict[str, Any]]: + ) -> list[dict[str, Any]]: """ Extract head content for a custom list of URLs using URLSeeder's parallel processing. 
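Note: `AsyncUrlSeeder` now takes an injected `cache_client` instead of managing its own files under `~/.crawl4ai`. The required interface is only inferred from the call sites in this diff (synchronous `get(key)` returning a string or `None`, and `set(key, value, ttl_seconds)`); the dict-backed client below is a duck-typed illustration, not the `CacheClient` base class added elsewhere in this PR:

```python
import asyncio
import time

from crawl4ai.async_configs import SeedingConfig
from crawl4ai.async_url_seeder import AsyncUrlSeeder


class InMemoryCacheClient:
    """Minimal stand-in matching the get/set calls made by the seeder."""

    def __init__(self) -> None:
        self._store: dict[str, tuple[str, float]] = {}

    def get(self, key: str) -> str | None:
        entry = self._store.get(key)
        if entry is None:
            return None
        value, expires_at = entry
        if time.monotonic() > expires_at:
            # Stale entry: evict and report a miss.
            self._store.pop(key, None)
            return None
        return value

    def set(self, key: str, value: str, ttl_seconds: int) -> None:
        self._store[key] = (value, time.monotonic() + float(ttl_seconds))


async def main() -> None:
    async with AsyncUrlSeeder(cache_client=InMemoryCacheClient(), ttl_seconds=3600) as seeder:
        # source="sitemap" skips the Common Crawl path; pattern/max_urls are placeholders.
        config = SeedingConfig(source="sitemap", pattern="*/docs/*", max_urls=50)
        urls = await seeder.urls("example.com", config)
        print(f"discovered {len(urls)} URLs")


if __name__ == "__main__":
    asyncio.run(main())
```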
@@ -546,7 +526,7 @@ async def extract_head_for_urls( seen: set[str] = set() # Results collection - results: List[Dict[str, Any]] = [] + results: list[dict[str, Any]] = [] async def producer(): """Producer to feed URLs into the queue.""" @@ -563,13 +543,13 @@ async def producer(): finally: producer_done.set() - async def worker(res_list: List[Dict[str, Any]]): + async def worker(res_list: list[dict[str, Any]]): """Worker to process URLs from the queue.""" while True: try: # Wait for URL or producer completion url = await asyncio.wait_for(queue.get(), timeout=1.0) - except asyncio.TimeoutError: + except TimeoutError: if producer_done.is_set() and queue.empty(): break continue @@ -642,7 +622,7 @@ async def worker(res_list: List[Dict[str, Any]]): return results - async def _apply_bm25_scoring(self, results: List[Dict[str, Any]], config: "SeedingConfig") -> List[Dict[str, Any]]: + async def _apply_bm25_scoring(self, results: list[dict[str, Any]], config: SeedingConfig) -> list[dict[str, Any]]: """Apply BM25 scoring to results that have head_data.""" if not HAS_BM25: self._log("warning", "BM25 scoring requested but rank_bm25 not available", tag="URL_SEED") @@ -676,7 +656,7 @@ async def _apply_bm25_scoring(self, results: List[Dict[str, Any]], config: "Seed return results - async def _resolve_head(self, url: str) -> Optional[str]: + async def _resolve_head(self, url: str) -> str | None: """ HEAD-probe a URL. @@ -716,16 +696,16 @@ async def _from_cc(self, domain: str, pattern: str, force: bool): # โ”€โ”€ sanitize only for cache-file name safe = re.sub('[/?#]+', '_', raw) - path = self.cache_dir / f"{self.index_id}_{safe}_{digest}.jsonl" - - if path.exists() and not force: - self._log("info", "Loading CC URLs for {domain} from cache: {path}", - params={"domain": domain, "path": path}, tag="URL_SEED") - async with aiofiles.open(path, "r") as fp: - async for line in fp: - url = line.strip() - if _match(url, pattern): - yield url + cache_key = f"{URL_SEEDER_CACHE_KEY_PREFIX}cc:{self.index_id}_{safe}_{digest}" + cached = self.cache_client.get(key=cache_key) + + if cached and not force: + self._log("info", "Loading CC URLs for {domain} from cache: {key}", + params={"domain": domain, "key": cache_key}, tag="URL_SEED") + async for line in cached.splitlines(): + url = line.strip() + if _match(url, pattern): + yield url return # build CC glob โ€“ if a path is present keep it, else add trailing /* @@ -739,13 +719,18 @@ async def _from_cc(self, domain: str, pattern: str, force: bool): try: async with self.client.stream("GET", url) as r: r.raise_for_status() - async with aiofiles.open(path, "w") as fp: - async for line in r.aiter_lines(): - rec = json.loads(line) - u = rec["url"] - await fp.write(u+"\n") - if _match(u, pattern): - yield u + urls = [] + for line in r.aiter_lines(): + rec = json.loads(line) + u = rec["url"] + urls.append(u) + self.cache_client.set( + key=cache_key, + value="\n".join(urls), + ttl_seconds=self.ttl_seconds + ) + if _match(u, pattern): + yield u return except httpx.HTTPStatusError as e: if e.response.status_code == 503 and i < len(retries): @@ -773,16 +758,16 @@ async def _from_sitemaps(self, domain: str, pattern: str, force: bool = False): host = re.sub(r'^https?://', '', domain).rstrip('/') host = re.sub('[/?#]+', '_', domain) digest = hashlib.md5(pattern.encode()).hexdigest()[:8] - path = self.cache_dir / f"sitemap_{host}_{digest}.jsonl" - - if path.exists() and not force: - self._log("info", "Loading sitemap URLs for {d} from cache: {p}", - params={"d": host, "p": str(path)}, 
tag="URL_SEED") - async with aiofiles.open(path, "r") as fp: - async for line in fp: - url = line.strip() - if _match(url, pattern): - yield url + cache_key = f"{URL_SEEDER_CACHE_KEY_PREFIX}sitemap:{host}_{digest}" + cached = self.cache_client.get(key=cache_key) + + if cached and not force: + self._log("info", "Loading sitemap URLs for {d} from cache: {k}", + params={"d": host, "k": cache_key}, tag="URL_SEED") + for line in cached.splitlines(): + url = line.strip() + if _match(url, pattern): + yield url return # 1๏ธโƒฃ direct sitemap probe @@ -797,11 +782,16 @@ async def _from_sitemaps(self, domain: str, pattern: str, force: bool = False): if sm: self._log("info", "Found sitemap at {url}", params={ "url": sm}, tag="URL_SEED") - async with aiofiles.open(path, "w") as fp: - async for u in self._iter_sitemap(sm): - await fp.write(u + "\n") - if _match(u, pattern): - yield u + urls = [] + async for u in self._iter_sitemap(sm): + urls.append(u) + self.cache_client.set( + key=cache_key, + value="\n".join(urls), + ttl_seconds=self.ttl_seconds + ) + if _match(u, pattern): + yield u return # 2๏ธโƒฃ robots.txt fallback @@ -820,12 +810,17 @@ async def _from_sitemaps(self, domain: str, pattern: str, force: bool = False): return if sitemap_lines: - async with aiofiles.open(path, "w") as fp: - for sm in sitemap_lines: - async for u in self._iter_sitemap(sm): - await fp.write(u + "\n") - if _match(u, pattern): - yield u + urls = [] + for sm in sitemap_lines: + async for u in self._iter_sitemap(sm): + urls.append(u) + self.cache_client.set( + key=cache_key, + value="\n".join(urls), + ttl_seconds=self.ttl_seconds + ) + if _match(u, pattern): + yield u async def _iter_sitemap(self, url: str): try: @@ -956,9 +951,9 @@ async def process_subsitemap(sitemap_url: str): yield u # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ validate helpers - async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool, - extract: bool, timeout: int, verbose: bool, query: Optional[str] = None, - score_threshold: Optional[float] = None, scoring_method: str = "bm25", + async def _validate(self, url: str, res_list: list[dict[str, Any]], live: bool, + extract: bool, timeout: int, verbose: bool, query: str | None = None, + score_threshold: float | None = None, scoring_method: str = "bm25", filter_nonsense: bool = True): # Local verbose parameter for this function is used to decide if intermediate logs should be printed # The main logger's verbose status should be controlled by the caller. 
@@ -1059,11 +1054,10 @@ async def _fetch_head( self._log("debug", "Redirecting from {original_url} to {new_url}", params={"original_url": r.url, "new_url": url}, tag="URL_SEED") continue - else: - self._log("warning", "Redirect status {status_code} but no Location header for {url}", - params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED") - # Return original URL if no new location - return False, "", str(r.url) + self._log("warning", "Redirect status {status_code} but no Location header for {url}", + params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED") + # Return original URL if no new location + return False, "", str(r.url) # For 2xx or other non-redirect codes, proceed to read content # Only allow successful codes, or continue @@ -1140,7 +1134,7 @@ async def _fetch_head( return False, "", url # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ BM25 scoring helpers - def _extract_text_context(self, head_data: Dict[str, Any]) -> str: + def _extract_text_context(self, head_data: dict[str, Any]) -> str: """Extract all relevant text from head metadata for scoring.""" # Priority fields with their weights (for future enhancement) text_parts = [] @@ -1197,7 +1191,6 @@ def _calculate_url_relevance_score(self, query: str, url: str) -> float: """Calculate relevance score between query and URL using string matching.""" # Normalize inputs query_lower = query.lower() - url_lower = url.lower() # Extract URL components from urllib.parse import urlparse @@ -1377,7 +1370,7 @@ def _is_nonsense_url(self, url: str) -> bool: return False - def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]: + def _calculate_bm25_score(self, query: str, documents: list[str]) -> list[float]: """Calculate BM25 scores for documents against a query.""" if not HAS_BM25: self._log( @@ -1441,10 +1434,11 @@ async def __aexit__(self, exc_type, exc_val, exc_tb): # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ index helper async def _latest_index(self) -> str: - if self.index_cache_path.exists() and (time.time()-self.index_cache_path.stat().st_mtime) < self.ttl.total_seconds(): - self._log("info", "Loading latest CC index from cache: {path}", - params={"path": self.index_cache_path}, tag="URL_SEED") - return self.index_cache_path.read_text().strip() + cached_index = self.cache_client.get(key=self.cc_index_cache_key) + if cached_index: + self._log("info", "Loading latest CC index from cache: {key}", + params={"key": self.cc_index_cache_key}, tag="URL_SEED") + return cached_index self._log("info", "Fetching latest Common Crawl index from {url}", params={"url": COLLINFO_URL}, tag="URL_SEED") @@ -1453,7 +1447,7 @@ async def _latest_index(self) -> str: j = await c.get(COLLINFO_URL, timeout=10) j.raise_for_status() # Raise an exception for bad status codes idx = j.json()[0]["id"] - self.index_cache_path.write_text(idx) + self.cache_client.set(key=self.cc_index_cache_key, value=idx, ttl_seconds=self.ttl_seconds) self._log("success", "Successfully fetched and cached CC index: {index_id}", params={"index_id": idx}, tag="URL_SEED") return idx diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ebd2859d2..e01d55427 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -1,52 +1,49 @@ -from .__version__ import __version__ as crawl4ai_version +from __future__ import annotations + +import asyncio +import json import os import sys import time +from 
contextlib import asynccontextmanager from pathlib import Path -from typing import Optional, List -import json -import asyncio +from typing import Any -# from contextlib import nullcontext, asynccontextmanager -from contextlib import asynccontextmanager -from .models import ( - CrawlResult, - MarkdownGenerationResult, - DispatchResult, - ScrapingResult, - CrawlResultContainer, - RunManyReturn -) -from .async_database import async_db_manager -from .chunking_strategy import * # noqa: F403 -from .chunking_strategy import IdentityChunking -from .content_filter_strategy import * # noqa: F403 -from .extraction_strategy import * # noqa: F403 -from .extraction_strategy import NoExtractionStrategy +from crawl4ai.cache_client import HTML_CACHE_KEY_PREFIX, CacheClient, NoCacheClient + +from .__version__ import __version__ as crawl4ai_version +from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig, SeedingConfig from .async_crawler_strategy import ( AsyncCrawlerStrategy, - AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse, + AsyncPlaywrightCrawlerStrategy, ) -from .cache_context import CacheMode, CacheContext +from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter +from .async_logger import AsyncLogger, AsyncLoggerBase +from .async_url_seeder import AsyncUrlSeeder +from .cache_context import CacheContext, CacheMode +from .chunking_strategy import IdentityChunking +from .deep_crawling import DeepCrawlDecorator +from .extraction_strategy import NoExtractionStrategy from .markdown_generation_strategy import ( DefaultMarkdownGenerator, MarkdownGenerationStrategy, ) -from .deep_crawling import DeepCrawlDecorator -from .async_logger import AsyncLogger, AsyncLoggerBase -from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig, SeedingConfig -from .async_dispatcher import * # noqa: F403 -from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter -from .async_url_seeder import AsyncUrlSeeder - +from .models import ( + CrawlResult, + CrawlResultContainer, + DispatchResult, + MarkdownGenerationResult, + RunManyReturn, + ScrapingResult, +) from .utils import ( - sanitize_input_encode, InvalidCSSSelectorError, + RobotsParser, fast_format_html, get_error_context, - RobotsParser, preprocess_html_for_schema, + sanitize_input_encode, ) @@ -109,28 +106,29 @@ class AsyncWebCrawler: def __init__( self, + cache_client: CacheClient = NoCacheClient(), crawler_strategy: AsyncCrawlerStrategy = None, config: BrowserConfig = None, base_directory: str = str( os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), thread_safe: bool = False, logger: AsyncLoggerBase = None, - **kwargs, ): """ Initialize the AsyncWebCrawler. Args: + cache_client: Client for caching crawled pages crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy config: Configuration object for browser settings. 
Default BrowserConfig() base_directory: Base directory for storing cache thread_safe: Whether to use thread-safe operations - **kwargs: Additional arguments for backwards compatibility """ # Handle browser configuration browser_config = config or BrowserConfig() self.browser_config = browser_config + self.cache_client = cache_client # Initialize logger first since other components may need it self.logger = logger or AsyncLogger( @@ -140,12 +138,9 @@ def __init__( ) # Initialize crawler strategy - params = {k: v for k, v in kwargs.items() if k in [ - "browser_config", "logger"]} self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( browser_config=browser_config, logger=self.logger, - **params, # Pass remaining kwargs for backwards compatibility ) # Thread safety setup @@ -157,7 +152,7 @@ def __init__( os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) # Initialize robots parser - self.robots_parser = RobotsParser() + self.robots_parser = RobotsParser(cache_client=cache_client) self.ready = False @@ -165,7 +160,7 @@ def __init__( self._deep_handler = DeepCrawlDecorator(self) self.arun = self._deep_handler(self.arun) - self.url_seeder: Optional[AsyncUrlSeeder] = None + self.url_seeder: AsyncUrlSeeder | None = None async def start(self): """ @@ -210,16 +205,7 @@ async def arun( """ Runs the crawler for a single source: URL (web, local file, or raw HTML). - Migration Guide: - Old way (deprecated): - result = await crawler.arun( - url="https://example.com", - word_count_threshold=200, - screenshot=True, - ... - ) - - New way (recommended): + Example: config = CrawlerRunConfig( word_count_threshold=200, screenshot=True, @@ -248,9 +234,19 @@ async def arun( try: self.logger.verbose = config.verbose - # Default to ENABLED if no cache mode specified + # Update proxy configuration from rotation strategy if available + if config.proxy_rotation_strategy: + next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy() + if next_proxy: + self.logger.info( + message="Switch proxy: {proxy}", + tag="PROXY", + params={"proxy": next_proxy.server} + ) + config.proxy_config = next_proxy + if config.cache_mode is None: - config.cache_mode = CacheMode.ENABLED + config.cache_mode = CacheMode.BYPASS # Create cache context cache_context = CacheContext(url, config.cache_mode, False) @@ -258,155 +254,120 @@ async def arun( # Initialize processing variables async_response: AsyncCrawlResponse = None cached_result: CrawlResult = None - screenshot_data = None - pdf_data = None - extracted_content = None start_time = time.perf_counter() # Try to get cached result if appropriate if cache_context.should_read(): - cached_result = await async_db_manager.aget_cached_url(url) - - if cached_result: - html = sanitize_input_encode(cached_result.html) - extracted_content = sanitize_input_encode( - cached_result.extracted_content or "" - ) - extracted_content = ( - None - if not extracted_content or extracted_content == "[]" - else extracted_content + cached_html = self.cache_client.get( + key=self._get_cache_key(url) ) - # If screenshot is requested but its not in cache, then set cache_result to None - screenshot_data = cached_result.screenshot - pdf_data = cached_result.pdf - # if config.screenshot and not screenshot or config.pdf and not pdf: - if config.screenshot and not screenshot_data: - cached_result = None - - if config.pdf and not pdf_data: - cached_result = None - - self.logger.url_status( - url=cache_context.display_url, - success=bool(html), - timing=time.perf_counter() - 
start_time, - tag="FETCH", - ) - - # Update proxy configuration from rotation strategy if available - if config and config.proxy_rotation_strategy: - next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy() - if next_proxy: - self.logger.info( - message="Switch proxy: {proxy}", - tag="PROXY", - params={"proxy": next_proxy.server} + html = sanitize_input_encode(cached_html) + + if html: + self.logger.url_status( + url=cache_context.display_url, + success=True, + timing=time.perf_counter() - start_time, + tag="COMPLETE" ) - config.proxy_config = next_proxy - # config = config.clone(proxy_config=next_proxy) + crawl_result: CrawlResult = await self.aprocess_html( + url=url, + html=html, + extracted_content=None, + config=config, + screenshot_data=None, + pdf_data=None, + is_raw_html=True if url.startswith("raw:") else False, + **kwargs, + ) + return CrawlResultContainer(crawl_result) # Fetch fresh content if needed - if not cached_result or not html: - t1 = time.perf_counter() - - if config.user_agent: - self.crawler_strategy.update_user_agent( - config.user_agent) - - # Check robots.txt if enabled - if config and config.check_robots_txt: - if not await self.robots_parser.can_fetch( - url, self.browser_config.user_agent - ): - return CrawlResult( - url=url, - html="", - success=False, - status_code=403, - error_message="Access denied by robots.txt", - response_headers={ - "X-Robots-Status": "Blocked by robots.txt" - }, - ) - - ############################## - # Call CrawlerStrategy.crawl # - ############################## - async_response = await self.crawler_strategy.crawl( - url, - config=config, # Pass the entire config object - ) + t1 = time.perf_counter() + + if config.user_agent: + self.crawler_strategy.update_user_agent( + config.user_agent) + + # Check robots.txt if enabled + if config and config.check_robots_txt: + if not await self.robots_parser.can_fetch( + url, self.browser_config.user_agent + ): + return CrawlResult( + url=url, + html="", + success=False, + status_code=403, + error_message="Access denied by robots.txt", + response_headers={ + "X-Robots-Status": "Blocked by robots.txt" + }, + ) - html = sanitize_input_encode(async_response.html) - screenshot_data = async_response.screenshot - pdf_data = async_response.pdf_data - js_execution_result = async_response.js_execution_result + ############################## + # Call CrawlerStrategy.crawl # + ############################## + async_response = await self.crawler_strategy.crawl( + url, + config=config, + ) - t2 = time.perf_counter() - self.logger.url_status( - url=cache_context.display_url, - success=bool(html), - timing=t2 - t1, - tag="FETCH", - ) + html = sanitize_input_encode(async_response.html) + screenshot_data = async_response.screenshot + pdf_data = async_response.pdf_data + js_execution_result = async_response.js_execution_result - ############################################################### - # Process the HTML content, Call CrawlerStrategy.process_html # - ############################################################### - crawl_result: CrawlResult = await self.aprocess_html( - url=url, - html=html, - extracted_content=extracted_content, - config=config, # Pass the config object instead of individual parameters - screenshot_data=screenshot_data, - pdf_data=pdf_data, - verbose=config.verbose, - is_raw_html=True if url.startswith("raw:") else False, - redirected_url=async_response.redirected_url, - **kwargs, - ) + t2 = time.perf_counter() + self.logger.url_status( + 
url=cache_context.display_url, + success=async_response.status_code == 200 and bool(html), + timing=t2 - t1, + tag="FETCH", + ) - crawl_result.status_code = async_response.status_code - crawl_result.redirected_url = async_response.redirected_url or url - crawl_result.response_headers = async_response.response_headers - crawl_result.downloaded_files = async_response.downloaded_files - crawl_result.js_execution_result = js_execution_result - crawl_result.mhtml = async_response.mhtml_data - crawl_result.ssl_certificate = async_response.ssl_certificate - # Add captured network and console data if available - crawl_result.network_requests = async_response.network_requests - crawl_result.console_messages = async_response.console_messages - - crawl_result.success = bool(html) - crawl_result.session_id = getattr( - config, "session_id", None) - - self.logger.url_status( - url=cache_context.display_url, - success=crawl_result.success, - timing=time.perf_counter() - start_time, - tag="COMPLETE", - ) + ############################################################### + # Process the HTML content, Call CrawlerStrategy.process_html # + ############################################################### + crawl_result: CrawlResult = await self.aprocess_html( + url=url, + html=html, + extracted_content=None, + config=config, # Pass the config object instead of individual parameters + screenshot_data=screenshot_data, + pdf_data=pdf_data, + is_raw_html=True if url.startswith("raw:") else False, + redirected_url=async_response.redirected_url, + **kwargs, + ) - # Update cache if appropriate - if cache_context.should_write() and not bool(cached_result): - await async_db_manager.acache_url(crawl_result) + crawl_result.status_code = async_response.status_code + crawl_result.redirected_url = async_response.redirected_url or url + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + crawl_result.js_execution_result = js_execution_result + crawl_result.mhtml = async_response.mhtml_data + crawl_result.ssl_certificate = async_response.ssl_certificate + # Add captured network and console data if available + crawl_result.network_requests = async_response.network_requests + crawl_result.console_messages = async_response.console_messages - return CrawlResultContainer(crawl_result) + self.logger.url_status( + url=cache_context.display_url, + success=crawl_result.success, + timing=time.perf_counter() - start_time, + tag="COMPLETE", + ) - else: - self.logger.url_status( - url=cache_context.display_url, - success=True, - timing=time.perf_counter() - start_time, - tag="COMPLETE" + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + self.cache_client.set( + key=self._get_cache_key(url), + value=crawl_result.html, + ttl_seconds=config.cache_ttl_seconds, ) - cached_result.success = bool(html) - cached_result.session_id = getattr( - config, "session_id", None) - cached_result.redirected_url = cached_result.redirected_url or url - return CrawlResultContainer(cached_result) + + return CrawlResultContainer(crawl_result) except Exception as e: error_context = get_error_context(sys.exc_info()) @@ -429,6 +390,9 @@ async def arun( url=url, html="", success=False, error_message=error_message ) ) + + def _get_cache_key(self, url: str) -> str: + return f"{HTML_CACHE_KEY_PREFIX}{url}" async def aprocess_html( self, @@ -438,7 +402,6 @@ async def aprocess_html( config: CrawlerRunConfig, screenshot_data: str, pdf_data: str, - 
verbose: bool, **kwargs, ) -> CrawlResult: """ @@ -502,9 +465,6 @@ async def aprocess_html( metadata = result.get("metadata", {}) else: cleaned_html = sanitize_input_encode(result.cleaned_html) - # media = result.media.model_dump() - # tables = media.pop("tables", []) - # links = result.links.model_dump() media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media tables = media.pop("tables", []) if isinstance(media, dict) else [] links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links @@ -515,7 +475,7 @@ async def aprocess_html( ################################ # Generate Markdown # ################################ - markdown_generator: Optional[MarkdownGenerationStrategy] = ( + markdown_generator: MarkdownGenerationStrategy | None = ( config.markdown_generator or DefaultMarkdownGenerator() ) @@ -561,7 +521,7 @@ async def aprocess_html( markdown_result: MarkdownGenerationResult = ( markdown_generator.generate_markdown( input_html=markdown_input_html, - base_url=params.get("redirected_url", url) + base_url=params.get("redirected_url", url) or url # html2text_options=kwargs.get('html2text', {}) ) ) @@ -573,11 +533,6 @@ async def aprocess_html( timing=int((time.perf_counter() - t1) * 1000) / 1000, tag="SCRAPE" ) - # self.logger.info( - # message="{url:.50}... | Time: {timing}s", - # tag="SCRAPE", - # params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, - # ) ################################ # Structured Content Extraction # @@ -648,25 +603,14 @@ async def aprocess_html( extracted_content=extracted_content, success=True, error_message="", + session_id=getattr(config, "session_id", None) ) async def arun_many( self, - urls: List[str], - config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None, - dispatcher: Optional[BaseDispatcher] = None, - # Legacy parameters maintained for backwards compatibility - # word_count_threshold=MIN_WORD_THRESHOLD, - # extraction_strategy: ExtractionStrategy = None, - # chunking_strategy: ChunkingStrategy = RegexChunking(), - # content_filter: RelevantContentFilter = None, - # cache_mode: Optional[CacheMode] = None, - # bypass_cache: bool = False, - # css_selector: str = None, - # screenshot: bool = False, - # pdf: bool = False, - # user_agent: str = None, - # verbose=True, + urls: list[str], + config: CrawlerRunConfig | list[CrawlerRunConfig] | None = None, + dispatcher: BaseDispatcher | None = None, **kwargs, ) -> RunManyReturn: """ @@ -702,20 +646,6 @@ async def arun_many( print(f"Processed {result.url}: {len(result.markdown)} chars") """ config = config or CrawlerRunConfig() - # if config is None: - # config = CrawlerRunConfig( - # word_count_threshold=word_count_threshold, - # extraction_strategy=extraction_strategy, - # chunking_strategy=chunking_strategy, - # content_filter=content_filter, - # cache_mode=cache_mode, - # bypass_cache=bypass_cache, - # css_selector=css_selector, - # screenshot=screenshot, - # pdf=pdf, - # verbose=verbose, - # **kwargs, - # ) if dispatcher is None: dispatcher = MemoryAdaptiveDispatcher( @@ -756,16 +686,15 @@ async def result_transformer(): yield transform_result(task_result) return result_transformer() - else: - _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config) - return [transform_result(res) for res in _results] + _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config) + return [transform_result(res) for res in _results] async def aseed_urls( self, - domain_or_domains: 
Union[str, List[str]], - config: Optional[SeedingConfig] = None, + domain_or_domains: str | list[str], + config: SeedingConfig | None = None, **kwargs - ) -> Union[List[str], Dict[str, List[Union[str, Dict[str, Any]]]]]: + ) -> list[str] | dict[str, list[str, dict[str, Any]]]: """ Discovers, filters, and optionally validates URLs for a given domain(s) using sitemaps and Common Crawl archives. @@ -807,9 +736,10 @@ async def aseed_urls( # Pass the crawler's base_directory for seeder's cache management # Pass the crawler's logger for consistent logging self.url_seeder = AsyncUrlSeeder( + cache_client=self.cache_client, base_directory=self.crawl4ai_folder, logger=self.logger - ) + ) # Merge config object with direct kwargs, giving kwargs precedence seeding_config = config.clone(**kwargs) if config else SeedingConfig.from_kwargs(kwargs) @@ -837,7 +767,7 @@ async def aseed_urls( domain_or_domains, seeding_config ) - elif isinstance(domain_or_domains, (list, tuple)): + if isinstance(domain_or_domains, (list, tuple)): self.logger.info( message="Starting URL seeding for {count} domains", tag="SEED", @@ -848,5 +778,4 @@ async def aseed_urls( domain_or_domains, seeding_config ) - else: - raise ValueError("`domain_or_domains` must be a string or a list of strings.") \ No newline at end of file + raise ValueError("`domain_or_domains` must be a string or a list of strings.") diff --git a/crawl4ai/cache_client.py b/crawl4ai/cache_client.py new file mode 100644 index 000000000..3916ca359 --- /dev/null +++ b/crawl4ai/cache_client.py @@ -0,0 +1,32 @@ +from abc import ABC, abstractmethod + +DEFAULT_CACHE_TTL_SECONDS = 2 * 60 # 2 hours + +_CRAWL4AI_CACHE_KEY_PREFIX = "c4ai:" +HTML_CACHE_KEY_PREFIX = f"{_CRAWL4AI_CACHE_KEY_PREFIX}html:" +ROBOTS_CACHE_KEY_PREFIX = f"{_CRAWL4AI_CACHE_KEY_PREFIX}robots:" +URL_SEEDER_CACHE_KEY_PREFIX = f"{_CRAWL4AI_CACHE_KEY_PREFIX}url_seeder:" + +class CacheClient(ABC): + @abstractmethod + def get(self, key: str) -> str | None: + pass + + @abstractmethod + def set(self, key: str, value: str, ttl_seconds: int) -> None: + pass + + @abstractmethod + def clear(self, prefix: str) -> None: + pass + + +class NoCacheClient(CacheClient): + def get(self, key: str) -> str | None: + return None + + def set(self, key: str, value: str, ttl_seconds: int) -> None: + pass + + def clear(self, prefix: str) -> None: + pass diff --git a/crawl4ai/cache_context.py b/crawl4ai/cache_context.py index 75914b5bf..d15804069 100644 --- a/crawl4ai/cache_context.py +++ b/crawl4ai/cache_context.py @@ -91,27 +91,3 @@ def display_url(self) -> str: """Returns the URL in display format.""" return self._url_display - -def _legacy_to_cache_mode( - disable_cache: bool = False, - bypass_cache: bool = False, - no_cache_read: bool = False, - no_cache_write: bool = False, -) -> CacheMode: - """ - Converts legacy cache parameters to the new CacheMode enum. - - This is an internal function to help transition from the old boolean flags - to the new CacheMode system. 
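For context on how the new CacheClient abstraction is meant to be used: the sketch below is a minimal in-memory implementation with per-key TTL, written only to illustrate the contract. The class name InMemoryCacheClient and its eviction details are assumptions and are not part of this change; NoCacheClient above remains the shipped no-op default, and callers who want cached HTML (or robots.txt / URL-seeder caching) inject their own implementation, typically backed by a shared store such as Redis.

import time

from crawl4ai.cache_client import CacheClient


class InMemoryCacheClient(CacheClient):
    # Illustrative only: this class is not part of the change above.
    # A shared deployment would more likely back the same interface
    # with Redis or another external store.

    def __init__(self) -> None:
        # key -> (value, absolute expiry time)
        self._store: dict[str, tuple[str, float]] = {}

    def get(self, key: str) -> str | None:
        entry = self._store.get(key)
        if entry is None:
            return None
        value, expires_at = entry
        if time.monotonic() >= expires_at:
            # Lazily evict expired entries on read.
            self._store.pop(key, None)
            return None
        return value

    def set(self, key: str, value: str, ttl_seconds: int) -> None:
        self._store[key] = (value, time.monotonic() + ttl_seconds)

    def clear(self, prefix: str) -> None:
        # Drop an entire namespace, e.g. everything under HTML_CACHE_KEY_PREFIX.
        for key in [k for k in self._store if k.startswith(prefix)]:
            del self._store[key]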
- """ - if disable_cache: - return CacheMode.DISABLED - if bypass_cache: - return CacheMode.BYPASS - if no_cache_read and no_cache_write: - return CacheMode.DISABLED - if no_cache_read: - return CacheMode.WRITE_ONLY - if no_cache_write: - return CacheMode.READ_ONLY - return CacheMode.ENABLED diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 9ef0e616c..3fcbe4a57 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1,36 +1,35 @@ +import asyncio +import copy import re -from itertools import chain from abc import ABC, abstractmethod -from typing import Dict, Any, Optional -from bs4 import BeautifulSoup -import asyncio +from itertools import chain +from typing import Any +from urllib.parse import urljoin + import requests +from lxml import etree +from lxml import html as lhtml +from requests.exceptions import InvalidSchema + +from crawl4ai.cache_client import CacheClient, NoCacheClient + from .config import ( - MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, - ONLY_TEXT_ELIGIBLE_TAGS, IMPORTANT_ATTRS, + MIN_WORD_THRESHOLD, + ONLY_TEXT_ELIGIBLE_TAGS, SOCIAL_MEDIA_DOMAINS, ) -from bs4 import NavigableString, Comment -from bs4 import PageElement, Tag -from urllib.parse import urljoin -from requests.exceptions import InvalidSchema +from .models import Link, Links, Media, MediaItem, ScrapingResult from .utils import ( - extract_metadata, - normalize_url, - is_external_url, - get_base_domain, + calculate_link_intrinsic_score, extract_metadata_using_lxml, extract_page_context, - calculate_link_intrinsic_score, + get_base_domain, + is_external_url, + normalize_url, ) -from lxml import etree -from lxml import html as lhtml -from typing import List -from .models import ScrapingResult, MediaItem, Link, Media, Links -import copy # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r"^og:") @@ -39,7 +38,7 @@ # Function to parse srcset -def parse_srcset(s: str) -> List[Dict]: +def parse_srcset(s: str) -> list[dict]: if not s: return [] variants = [] @@ -51,7 +50,7 @@ def parse_srcset(s: str) -> List[Dict]: if len(parts) >= 1: url = parts[0] width = ( - parts[1].rstrip("w").split('.')[0] + parts[1].rstrip("w").split(".")[0] if len(parts) > 1 and parts[1].endswith("w") else None ) @@ -79,13 +78,11 @@ def fetch_image_file_size(img, base_url): response = requests.head(img_url) if response.status_code == 200: return response.headers.get("Content-Length", None) - else: - print(f"Failed to retrieve file size for {img_url}") - return None + return None except InvalidSchema: return None finally: - return + return None class ContentScrapingStrategy(ABC): @@ -101,14 +98,16 @@ async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult: class LXMLWebScrapingStrategy(ContentScrapingStrategy): """ LXML-based implementation for fast web content scraping. - + This is the primary scraping strategy in Crawl4AI, providing high-performance HTML parsing and content extraction using the lxml library. - + Note: WebScrapingStrategy is now an alias for this class to maintain backward compatibility. 
""" - def __init__(self, logger=None): + + def __init__(self, cache_client: CacheClient = NoCacheClient(), logger=None): + self.cache_client = cache_client self.logger = logger self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)") self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)') @@ -131,7 +130,7 @@ def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult: Returns: ScrapingResult: A structured result containing the scraped content. """ - actual_url = kwargs.get("redirected_url", url) + actual_url = kwargs.get("redirected_url", url) or url raw_result = self._scrap(actual_url, html, **kwargs) if raw_result is None: return ScrapingResult( @@ -159,7 +158,7 @@ def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult: for aud in raw_result.get("media", {}).get("audios", []) if aud ], - tables=raw_result.get("media", {}).get("tables", []) + tables=raw_result.get("media", {}).get("tables", []), ) # Convert links @@ -198,7 +197,9 @@ async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult: """ return await asyncio.to_thread(self.scrap, url, html, **kwargs) - def process_element(self, url, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]: + def process_element( + self, url, element: lhtml.HtmlElement, **kwargs + ) -> dict[str, Any]: """ Process an HTML element. @@ -232,9 +233,9 @@ def _process_element( self, url: str, element: lhtml.HtmlElement, - media: Dict[str, List], - internal_links_dict: Dict[str, Any], - external_links_dict: Dict[str, Any], + media: dict[str, list], + internal_links_dict: dict[str, Any], + external_links_dict: dict[str, Any], page_context: dict = None, **kwargs, ) -> bool: @@ -247,7 +248,7 @@ def _process_element( if base_element: base_href = base_element[0].get("href", "").strip() if base_href: - url = base_href + url = normalize_url(base_href, url) except Exception as e: self._log("error", f"Error extracting base URL: {str(e)}", "SCRAPE") pass @@ -265,7 +266,7 @@ def _process_element( "title": link.get("title", "").strip(), "base_domain": base_domain, } - + # Add intrinsic scoring if enabled if kwargs.get("score_links", False) and page_context is not None: try: @@ -275,7 +276,7 @@ def _process_element( title_attr=link_data["title"], class_attr=link.get("class", ""), rel_attr=link.get("rel", ""), - page_context=page_context + page_context=page_context, ) link_data["intrinsic_score"] = intrinsic_score except Exception: @@ -340,7 +341,7 @@ def _process_element( for media_type in ["video", "audio"]: for elem in element.xpath(f".//{media_type}"): media_info = { - "src": elem.get("src"), + "src": normalize_url(elem.get("src"), url), "alt": elem.get("alt"), "type": media_type, "description": self.find_closest_parent_with_useful_text( @@ -352,7 +353,9 @@ def _process_element( # Process source tags within media elements for source in elem.xpath(".//source"): if src := source.get("src"): - media[f"{media_type}s"].append({**media_info, "src": src}) + media[f"{media_type}s"].append( + {**media_info, "src": normalize_url(src, url)} + ) # Clean up unwanted elements if kwargs.get("remove_forms", False): @@ -375,7 +378,7 @@ def _process_element( def find_closest_parent_with_useful_text( self, element: lhtml.HtmlElement, **kwargs - ) -> Optional[str]: + ) -> str | None: image_description_min_word_threshold = kwargs.get( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ) @@ -405,7 +408,7 @@ def flatten_nested_elements(self, element: lhtml.HtmlElement) -> lhtml.HtmlEleme def process_image( self, img: 
lhtml.HtmlElement, url: str, index: int, total_images: int, **kwargs - ) -> Optional[List[Dict]]: + ) -> list[dict] | None: # Quick validation checks style = img.get("style", "") alt = img.get("alt", "") @@ -423,7 +426,8 @@ def process_image( parent_classes = parent.get("class", "").split() if any( - "button" in cls or "icon" in cls or "logo" in cls for cls in parent_classes + "button" in parent_class or "icon" in parent_class or "logo" in parent_class + for parent_class in parent_classes ): return None @@ -446,13 +450,15 @@ def process_image( # Check formats in all possible sources image_formats = {"jpg", "jpeg", "png", "webp", "avif", "gif"} detected_format = None - for url in [src, data_src, srcset, data_srcset]: - if url: - format_matches = [fmt for fmt in image_formats if fmt in url.lower()] - if format_matches: - detected_format = format_matches[0] - score += 1 - break + for parts in [src, data_src, srcset, data_srcset]: + if not parts: + continue + format_matches = [fmt for fmt in image_formats if fmt in url.lower()] + if not format_matches: + continue + detected_format = format_matches[0] + score += 1 + break if srcset or data_srcset: score += 1 @@ -475,10 +481,10 @@ def process_image( "format": detected_format, } - def add_variant(src: str, width: Optional[str] = None): + def add_variant(src: str, width: str | None = None): if src and not src.startswith("data:") and src not in unique_urls: unique_urls.add(src) - variant = {**base_info, "src": src} + variant = {**base_info, "src": normalize_url(src, url)} if width: variant["width"] = width image_variants.append(variant) @@ -586,16 +592,15 @@ def remove_unwanted_attributes_fast( return root - def _scrap( self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, - target_elements: List[str] = None, + target_elements: list[str] = None, **kwargs, - ) -> Dict[str, Any]: + ) -> dict[str, Any]: if not html: return None @@ -607,38 +612,46 @@ def _scrap( body = doc base_domain = get_base_domain(url) - + # Extract page context for link scoring (if enabled) - do this BEFORE any removals page_context = None if kwargs.get("score_links", False): try: # Extract title - title_elements = doc.xpath('//title') - page_title = title_elements[0].text_content() if title_elements else "" - + title_elements = doc.xpath("//title") + page_title = ( + title_elements[0].text_content() if title_elements else "" + ) + # Extract headlines headlines = [] - for tag in ['h1', 'h2', 'h3']: - elements = doc.xpath(f'//{tag}') + for tag in ["h1", "h2", "h3"]: + elements = doc.xpath(f"//{tag}") for el in elements: text = el.text_content().strip() if text: headlines.append(text) - headlines_text = ' '.join(headlines) - + headlines_text = " ".join(headlines) + # Extract meta description - meta_desc_elements = doc.xpath('//meta[@name="description"]/@content') - meta_description = meta_desc_elements[0] if meta_desc_elements else "" - + meta_desc_elements = doc.xpath( + '//meta[@name="description"]/@content' + ) + meta_description = ( + meta_desc_elements[0] if meta_desc_elements else "" + ) + # Create page context - page_context = extract_page_context(page_title, headlines_text, meta_description, url) + page_context = extract_page_context( + page_title, headlines_text, meta_description, url + ) except Exception: page_context = {} # Fail gracefully - + # Early removal of all images if exclude_all_images is set # This is more efficient in lxml as we remove elements before any processing if kwargs.get("exclude_all_images", False): - 
for img in body.xpath('//img'): + for img in body.xpath("//img"): if img.getparent() is not None: img.getparent().remove(img) @@ -682,11 +695,17 @@ def _scrap( try: for_content_targeted_element = [] for target_element in target_elements: - for_content_targeted_element.extend(body.cssselect(target_element)) + for_content_targeted_element.extend( + body.cssselect(target_element) + ) content_element = lhtml.Element("div") content_element.extend(copy.deepcopy(for_content_targeted_element)) except Exception as e: - self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + self._log( + "error", + f"Error with target element detection: {str(e)}", + "SCRAPE", + ) return None else: content_element = body @@ -729,8 +748,8 @@ def _scrap( ) # Extract tables using the table extraction strategy if provided - if 'table' not in excluded_tags: - table_extraction = kwargs.get('table_extraction') + if "table" not in excluded_tags: + table_extraction = kwargs.get("table_extraction") if table_extraction: # Pass logger to the strategy if it doesn't have one if not table_extraction.logger: @@ -765,86 +784,125 @@ def _scrap( # Generate output HTML cleaned_html = lhtml.tostring( - # body, + # body, content_element, encoding="unicode", pretty_print=True, method="html", with_tail=False, ).strip() - + # Create links dictionary in the format expected by LinkPreview links = { "internal": list(internal_links_dict.values()), "external": list(external_links_dict.values()), } - + # Extract head content for links if configured link_preview_config = kwargs.get("link_preview_config") if link_preview_config is not None: try: import asyncio + from .link_preview import LinkPreview - from .models import Links, Link - + from .models import Link, Links + verbose = link_preview_config.verbose - + if verbose: - self._log("info", "Starting link head extraction for {internal} internal and {external} external links", - params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT") - + self._log( + "info", + "Starting link head extraction for {internal} internal and {external} external links", + params={ + "internal": len(links["internal"]), + "external": len(links["external"]), + }, + tag="LINK_EXTRACT", + ) + # Convert dict links to Link objects - internal_links = [Link(**link_data) for link_data in links["internal"]] - external_links = [Link(**link_data) for link_data in links["external"]] + internal_links = [ + Link(**link_data) for link_data in links["internal"] + ] + external_links = [ + Link(**link_data) for link_data in links["external"] + ] links_obj = Links(internal=internal_links, external=external_links) - + # Create a config object for LinkPreview class TempCrawlerRunConfig: def __init__(self, link_config, score_links): self.link_preview_config = link_config self.score_links = score_links - - config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False)) - + + config = TempCrawlerRunConfig( + link_preview_config, kwargs.get("score_links", False) + ) + # Extract head content (run async operation in sync context) async def extract_links(): - async with LinkPreview(self.logger) as extractor: + async with LinkPreview(self.cache_client, self.logger) as extractor: return await extractor.extract_link_heads(links_obj, config) - + # Run the async operation try: # Check if we're already in an async context loop = asyncio.get_running_loop() # If we're in an async context, we need to run in a thread import concurrent.futures + with 
concurrent.futures.ThreadPoolExecutor() as executor: future = executor.submit(asyncio.run, extract_links()) updated_links = future.result() except RuntimeError: # No running loop, we can use asyncio.run directly updated_links = asyncio.run(extract_links()) - + # Convert back to dict format - links["internal"] = [link.dict() for link in updated_links.internal] - links["external"] = [link.dict() for link in updated_links.external] - + links["internal"] = [link.model_dump() for link in updated_links.internal] + links["external"] = [link.model_dump() for link in updated_links.external] + if verbose: - successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"]) - successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"]) - self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external", - params={ - "internal_success": successful_internal, - "internal_total": len(updated_links.internal), - "external_success": successful_external, - "external_total": len(updated_links.external) - }, tag="LINK_EXTRACT") + successful_internal = len( + [ + l + for l in updated_links.internal + if l.head_extraction_status == "valid" + ] + ) + successful_external = len( + [ + l + for l in updated_links.external + if l.head_extraction_status == "valid" + ] + ) + self._log( + "info", + "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external", + params={ + "internal_success": successful_internal, + "internal_total": len(updated_links.internal), + "external_success": successful_external, + "external_total": len(updated_links.external), + }, + tag="LINK_EXTRACT", + ) else: - self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT") - + self._log( + "info", + "Link head extraction completed successfully", + tag="LINK_EXTRACT", + ) + except Exception as e: - self._log("error", f"Error during link head extraction: {str(e)}", tag="LINK_EXTRACT") + self._log( + "error", + f"Error during link head extraction: {str(e)}", + tag="LINK_EXTRACT", + ) # Continue with original links if head extraction fails - + return { "cleaned_html": cleaned_html, "success": success, @@ -881,12 +939,7 @@ async def extract_links(): return { "cleaned_html": cleaned_html, "success": False, - "media": { - "images": [], - "videos": [], - "audios": [], - "tables": [] - }, + "media": {"images": [], "videos": [], "audios": [], "tables": []}, "links": {"internal": [], "external": []}, "metadata": {}, } diff --git a/crawl4ai/crawlers/amazon_product/crawler.py b/crawl4ai/crawlers/amazon_product/crawler.py deleted file mode 100644 index 45cc9d6ad..000000000 --- a/crawl4ai/crawlers/amazon_product/crawler.py +++ /dev/null @@ -1,20 +0,0 @@ -from crawl4ai.hub import BaseCrawler - -__meta__ = { - "version": "1.2.0", - "tested_on": ["amazon.com"], - "rate_limit": "50 RPM", - "schema": {"product": ["name", "price"]} -} - -class AmazonProductCrawler(BaseCrawler): - async def run(self, url: str, **kwargs) -> str: - try: - self.logger.info(f"Crawling {url}") - return '{"product": {"name": "Test Amazon Product"}}' - except Exception as e: - self.logger.error(f"Crawl failed: {str(e)}") - return json.dumps({ - "error": str(e), - "metadata": self.meta # Include meta in error response - }) \ No newline at end of file diff --git a/crawl4ai/crawlers/google_search/crawler.py b/crawl4ai/crawlers/google_search/crawler.py 
deleted file mode 100644 index 182334171..000000000 --- a/crawl4ai/crawlers/google_search/crawler.py +++ /dev/null @@ -1,131 +0,0 @@ -from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.hub import BaseCrawler -from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema -from crawl4ai import JsonCssExtractionStrategy -from pathlib import Path -import json -import os -from typing import Dict - - -class GoogleSearchCrawler(BaseCrawler): - __meta__ = { - "version": "1.0.0", - "tested_on": ["google.com/search*"], - "rate_limit": "10 RPM", - "description": "Crawls Google Search results (text + images)", - } - - def __init__(self): - super().__init__() - self.js_script = (Path(__file__).parent / - "script.js").read_text() - - async def run(self, url="", query: str = "", search_type: str = "text", schema_cache_path = None, **kwargs) -> str: - """Crawl Google Search results for a query""" - url = f"https://www.google.com/search?q={query}&gl=sg&hl=en" if search_type == "text" else f"https://www.google.com/search?q={query}&gl=sg&hl=en&tbs=qdr:d&udm=2" - if kwargs.get("page_start", 1) > 1: - url = f"{url}&start={kwargs['page_start'] * 10}" - if kwargs.get("page_length", 1) > 1: - url = f"{url}&num={kwargs['page_length']}" - - browser_config = BrowserConfig(headless=True, verbose=True) - async with AsyncWebCrawler(config=browser_config) as crawler: - config = CrawlerRunConfig( - cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS), - keep_attrs=["id", "class"], - keep_data_attributes=True, - delay_before_return_html=kwargs.get( - "delay", 2 if search_type == "image" else 1), - js_code=self.js_script if search_type == "image" else None, - ) - - result = await crawler.arun(url=url, config=config) - if not result.success: - return json.dumps({"error": result.error}) - - if search_type == "image": - if result.js_execution_result.get("success", False) is False: - return json.dumps({"error": result.js_execution_result.get("error", "Unknown error")}) - if "results" in result.js_execution_result: - image_result = result.js_execution_result['results'][0] - if image_result.get("success", False) is False: - return json.dumps({"error": image_result.get("error", "Unknown error")}) - return json.dumps(image_result["result"], indent=4) - - # For text search, extract structured data - schemas = await self._build_schemas(result.cleaned_html, schema_cache_path) - extracted = { - key: JsonCssExtractionStrategy(schema=schemas[key]).run( - url=url, sections=[result.html] - ) - for key in schemas - } - return json.dumps(extracted, indent=4) - - async def _build_schemas(self, html: str, schema_cache_path: str = None) -> Dict[str, Dict]: - """Build extraction schemas (organic, top stories, etc.)""" - home_dir = get_home_folder() if not schema_cache_path else schema_cache_path - os.makedirs(f"{home_dir}/schema", exist_ok=True) - - # cleaned_html = optimize_html(html, threshold=100) - cleaned_html = preprocess_html_for_schema(html) - - organic_schema = None - if os.path.exists(f"{home_dir}/schema/organic_schema.json"): - with open(f"{home_dir}/schema/organic_schema.json", "r") as f: - organic_schema = json.load(f) - else: - organic_schema = JsonCssExtractionStrategy.generate_schema( - html=cleaned_html, - target_json_example="""{ - "title": "...", - "link": "...", - "snippet": "...", - "date": "1 hour ago", - }""", - query="""The given html is the crawled html from Google search result. 
Please find the schema for organic search item in the given html, I am interested in title, link, snippet text. date.""" - ) - - with open(f"{home_dir}/schema/organic_schema.json", "w") as f: - f.write(json.dumps(organic_schema)) - - top_stories_schema = None - if os.path.exists(f"{home_dir}/schema/top_stories_schema.json"): - with open(f"{home_dir}/schema/top_stories_schema.json", "r") as f: - top_stories_schema = json.load(f) - else: - top_stories_schema = JsonCssExtractionStrategy.generate_schema( - html=cleaned_html, - target_json_example="""{ - "title": "...", - "link": "...", - "source": "Insider Monkey", - "date": "1 hour ago", - }""", - query="""The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl.""" - ) - - with open(f"{home_dir}/schema/top_stories_schema.json", "w") as f: - f.write(json.dumps(top_stories_schema)) - - suggested_query_schema = None - if os.path.exists(f"{home_dir}/schema/suggested_query_schema.json"): - with open(f"{home_dir}/schema/suggested_query_schema.json", "r") as f: - suggested_query_schema = json.load(f) - else: - suggested_query_schema = JsonCssExtractionStrategy.generate_schema( - html=cleaned_html, - target_json_example="""{ - "query": "A for Apple", - }""", - query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "People also search for" within the given HTML. I am interested in the queries only.""" - ) - with open(f"{home_dir}/schema/suggested_query_schema.json", "w") as f: - f.write(json.dumps(suggested_query_schema)) - - return { - "organic_schema": organic_schema, - "top_stories_schema": top_stories_schema, - "suggested_query_schema": suggested_query_schema, - } diff --git a/crawl4ai/crawlers/google_search/script.js b/crawl4ai/crawlers/google_search/script.js deleted file mode 100644 index 332574654..000000000 --- a/crawl4ai/crawlers/google_search/script.js +++ /dev/null @@ -1,115 +0,0 @@ -(() => { - // Function to extract image data from Google Images page - function extractImageData() { - const keys = Object.keys(window.W_jd); - let allImageData = []; - let currentPosition = 0; - - // Get the symbol we'll use (from first valid entry) - let targetSymbol; - for (let key of keys) { - try { - const symbols = Object.getOwnPropertySymbols(window.W_jd[key]); - if (symbols.length > 0) { - targetSymbol = symbols[0]; - break; - } - } catch (e) { - continue; - } - } - - if (!targetSymbol) return []; - - // Iterate through ALL keys - for (let key of keys) { - try { - const o1 = window.W_jd[key][targetSymbol] - if (!o1) continue; - const data = Object.values(o1)[0] - // const data = window.W_jd[key][targetSymbol]?.Ws; - // Check if this is a valid image data entry - if (data && Array.isArray(data[1])) { - const processedData = processImageEntry(data, currentPosition); - if (processedData) { - allImageData.push(processedData); - currentPosition++; - } - } - } catch (e) { - continue; - } - } - - return allImageData; - } - - function processImageEntry(entry, position) { - const imageData = entry[1]; - if (!Array.isArray(imageData)) return null; - - // Extract the image ID - const imageId = imageData[1]; - if (!imageId) return null; - - // Find the corresponding DOM element - const domElement = document.querySelector(`[data-docid="${imageId}"]`); - if (!domElement) return null; - - // Extract data from the array structure - const [ - _, - id, - 
thumbnailInfo, - imageInfo, - __, - ___, - rgb, - ____, - _____, - metadata - ] = imageData; - - // Ensure we have the required data - if (!thumbnailInfo || !imageInfo) return null; - - // Extract metadata from DOM - const title = domElement?.querySelector('.toI8Rb')?.textContent?.trim(); - const source = domElement?.querySelector('.guK3rf')?.textContent?.trim(); - const link = domElement?.querySelector('a.EZAeBe')?.href; - - if (!link) return null; - - // Build Google Image URL - const googleUrl = buildGoogleImageUrl(imageInfo[0], link, imageId, imageInfo[1], imageInfo[2]); - - return { - title, - imageUrl: imageInfo[0], - imageWidth: imageInfo[2], - imageHeight: imageInfo[1], - thumbnailUrl: thumbnailInfo[0], - thumbnailWidth: thumbnailInfo[2], - thumbnailHeight: thumbnailInfo[1], - source, - domain: metadata['2000']?.[1] || new URL(link).hostname, - link, - googleUrl, - position: position + 1 - }; - } - - function buildGoogleImageUrl(imgUrl, refUrl, tbnid, height, width) { - const params = new URLSearchParams({ - imgurl: imgUrl, - tbnid: tbnid, - imgrefurl: refUrl, - docid: tbnid, - w: width.toString(), - h: height.toString(), - }); - - return `https://www.google.com/imgres?${params.toString()}`; - } - return extractImageData(); -})(); \ No newline at end of file diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 380f83b43..66b447725 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -1,50 +1,57 @@ -from abc import ABC, abstractmethod +from __future__ import annotations + import inspect -from typing import Any, List, Dict, Optional, Tuple, Pattern, Union -from concurrent.futures import ThreadPoolExecutor, as_completed import json +import re import time +from abc import ABC, abstractmethod +from concurrent.futures import ThreadPoolExecutor, as_completed from enum import IntFlag, auto +from functools import partial +from re import Pattern +from typing import Any, TYPE_CHECKING + +import numpy as np +from bs4 import BeautifulSoup +from lxml import etree, html + +if TYPE_CHECKING: + from .async_configs import LLMConfig -from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .config import ( + CHUNK_TOKEN_THRESHOLD, DEFAULT_PROVIDER, DEFAULT_PROVIDER_API_KEY, - CHUNK_TOKEN_THRESHOLD, OVERLAP_RATE, WORD_TOKEN_RATE, ) +from .model_loader import * # noqa: F403 +from .model_loader import ( + calculate_batch_size, + get_device, + load_HF_embedding_model, + load_text_multilabel_classifier, +) +from .models import * # noqa: F403 +from .models import TokenUsage +from .prompts import ( + JSON_SCHEMA_BUILDER_XPATH, + PROMPT_EXTRACT_BLOCKS, + PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, + PROMPT_EXTRACT_INFERRED_SCHEMA, + PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, +) from .utils import * # noqa: F403 - from .utils import ( - sanitize_html, escape_json_string, - perform_completion_with_backoff, extract_xml_data, - split_and_parse_json_objects, - sanitize_input_encode, merge_chunks, -) -from .models import * # noqa: F403 - -from .models import TokenUsage - -from .model_loader import * # noqa: F403 -from .model_loader import ( - get_device, - load_HF_embedding_model, - load_text_multilabel_classifier, - calculate_batch_size + perform_completion_with_backoff, + sanitize_html, + sanitize_input_encode, + split_and_parse_json_objects, ) -from .types import LLMConfig, create_llm_config - -from functools import partial 
-import numpy as np -import re -from bs4 import BeautifulSoup -from lxml import html, etree - class ExtractionStrategy(ABC): """ @@ -66,7 +73,7 @@ def __init__(self, input_format: str = "markdown", **kwargs): self.verbose = kwargs.get("verbose", False) @abstractmethod - def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + def extract(self, url: str, html: str, *q, **kwargs) -> list[dict[str, Any]]: """ Extract meaningful blocks or chunks from the given HTML. @@ -76,7 +83,7 @@ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: """ pass - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + def run(self, url: str, sections: list[str], *q, **kwargs) -> list[dict[str, Any]]: """ Process sections of text in parallel by default. @@ -100,13 +107,13 @@ class NoExtractionStrategy(ExtractionStrategy): A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block. """ - def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + def extract(self, url: str, html: str, *q, **kwargs) -> list[dict[str, Any]]: """ Extract meaningful blocks or chunks from the given HTML. """ return [{"index": 0, "content": html}] - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + def run(self, url: str, sections: list[str], *q, **kwargs) -> list[dict[str, Any]]: return [ {"index": i, "tags": [], "content": section} for i, section in enumerate(sections) @@ -226,8 +233,8 @@ def __init__( ) def filter_documents_embeddings( - self, documents: List[str], semantic_filter: str, at_least_k: int = 20 - ) -> List[str]: + self, documents: list[str], semantic_filter: str, at_least_k: int = 20 + ) -> list[str]: """ Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding. @@ -282,7 +289,7 @@ def filter_documents_embeddings( return filtered_docs[:at_least_k] def get_embeddings( - self, sentences: List[str], batch_size=None, bypass_buffer=False + self, sentences: list[str], batch_size=None, bypass_buffer=False ): """ Get BERT embeddings for a list of sentences. @@ -336,7 +343,7 @@ def get_embeddings( self.buffer_embeddings = np.vstack(all_embeddings) return self.buffer_embeddings - def hierarchical_clustering(self, sentences: List[str], embeddings=None): + def hierarchical_clustering(self, sentences: list[str], embeddings=None): """ Perform hierarchical clustering on sentences and return cluster labels. @@ -347,7 +354,7 @@ def hierarchical_clustering(self, sentences: List[str], embeddings=None): NumPy array of cluster labels. """ # Get embeddings - from scipy.cluster.hierarchy import linkage, fcluster + from scipy.cluster.hierarchy import fcluster, linkage from scipy.spatial.distance import pdist self.timer = time.time() @@ -362,8 +369,8 @@ def hierarchical_clustering(self, sentences: List[str], embeddings=None): return labels def filter_clusters_by_word_count( - self, clusters: Dict[int, List[str]] - ) -> Dict[int, List[str]]: + self, clusters: dict[int, list[str]] + ) -> dict[int, list[str]]: """ Filter clusters to remove those with a word count below the threshold. @@ -386,7 +393,7 @@ def filter_clusters_by_word_count( return filtered_clusters - def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + def extract(self, url: str, html: str, *q, **kwargs) -> list[dict[str, Any]]: """ Extract clusters from HTML content using hierarchical clustering. 
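The import reorganization above guards LLMConfig behind typing.TYPE_CHECKING and defers create_llm_config to its call sites, presumably to avoid a circular import with async_configs and to stop building a default LLM configuration at import time. A minimal sketch of that pattern, with the class name ExampleStrategy invented for illustration:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; never executed at runtime, so it
    # cannot introduce an import cycle with the config module.
    from crawl4ai.async_configs import LLMConfig


class ExampleStrategy:
    def __init__(self, llm_config: LLMConfig | None = None) -> None:
        if llm_config is None:
            # Deferred import: async_configs is loaded only when a
            # default configuration is actually needed.
            from crawl4ai.async_configs import create_llm_config
            llm_config = create_llm_config()
        self.llm_config = llm_config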
@@ -458,7 +465,7 @@ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: return cluster_list - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + def run(self, url: str, sections: list[str], *q, **kwargs) -> list[dict[str, Any]]: """ Process sections using hierarchical clustering. @@ -501,9 +508,9 @@ class LLMExtractionStrategy(ExtractionStrategy): } def __init__( self, - llm_config: 'LLMConfig' = None, + llm_config: LLMConfig = None, instruction: str = None, - schema: Dict = None, + schema: dict = None, extraction_type="block", chunk_token_threshold=CHUNK_TOKEN_THRESHOLD, overlap_rate=OVERLAP_RATE, @@ -514,7 +521,7 @@ def __init__( verbose=False, # Deprecated arguments provider: str = DEFAULT_PROVIDER, - api_token: Optional[str] = None, + api_token: str | None = None, base_url: str = None, api_base: str = None, **kwargs, @@ -546,6 +553,7 @@ def __init__( super().__init__( input_format=input_format, **kwargs) self.llm_config = llm_config if not self.llm_config: + from .async_configs import create_llm_config self.llm_config = create_llm_config( provider=DEFAULT_PROVIDER, api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY), @@ -584,7 +592,7 @@ def __setattr__(self, name, value): super().__setattr__(name, value) - def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]: + def extract(self, url: str, ix: int, html: str) -> list[dict[str, Any]]: """ Extract meaningful blocks or chunks from the given HTML using an LLM. @@ -711,7 +719,7 @@ def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]: } ] - def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]: + def _merge(self, documents, chunk_token_threshold, overlap) -> list[str]: """ Merge documents into sections based on chunk_token_threshold and overlap. """ @@ -723,7 +731,7 @@ def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]: ) return sections - def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: + def run(self, url: str, sections: list[str]) -> list[dict[str, Any]]: """ Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. @@ -835,7 +843,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy): DEL = "\n" - def __init__(self, schema: Dict[str, Any], **kwargs): + def __init__(self, schema: dict[str, Any], **kwargs): """ Initialize the JSON element extraction strategy with a schema. @@ -848,7 +856,7 @@ def __init__(self, schema: Dict[str, Any], **kwargs): def extract( self, url: str, html_content: str, *q, **kwargs - ) -> List[Dict[str, Any]]: + ) -> list[dict[str, Any]]: """ Extract structured data from HTML content. 
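This diff also replaces eager signature defaults such as llm_config=create_llm_config() with None plus an explicit check in the body, matching the deferred construction in LLMExtractionStrategy.__init__ above. The motivation is that Python evaluates a default-argument expression once, at definition time, so an eager default would be built on import and shared by every call. A small self-contained illustration of that behaviour (toy functions, not project code):

import time


def eager(stamp: float = time.time()) -> float:
    # The default is computed once, when this function is defined,
    # and the same value is reused for every later call.
    return stamp


def lazy(stamp: float | None = None) -> float:
    # The None-sentinel pattern defers the work to call time.
    if stamp is None:
        stamp = time.time()
    return stamp


if __name__ == "__main__":
    print(eager(), eager())  # identical values
    time.sleep(0.01)
    print(lazy(), lazy())    # freshly computed on each call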
@@ -1027,9 +1035,9 @@ def _apply_transform(self, value, transform): if transform == "lowercase": return value.lower() - elif transform == "uppercase": + if transform == "uppercase": return value.upper() - elif transform == "strip": + if transform == "strip": return value.strip() return value @@ -1037,14 +1045,14 @@ def _compute_field(self, item, field): try: if "expression" in field: return eval(field["expression"], {}, item) - elif "function" in field: + if "function" in field: return field["function"](item) except Exception as e: if self.verbose: print(f"Error computing field {field['name']}: {str(e)}") return field.get("default") - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + def run(self, url: str, sections: list[str], *q, **kwargs) -> list[dict[str, Any]]: """ Run the extraction strategy on a combined HTML content. @@ -1091,7 +1099,7 @@ def generate_schema( schema_type: str = "CSS", # or XPATH query: str = None, target_json_example: str = None, - llm_config: 'LLMConfig' = create_llm_config(), + llm_config: LLMConfig = None, provider: str = None, api_token: str = None, **kwargs @@ -1111,6 +1119,11 @@ def generate_schema( Returns: dict: Generated schema following the JsonElementExtractionStrategy format """ + # Create default LLMConfig if not provided + if llm_config is None: + from .async_configs import create_llm_config + llm_config = create_llm_config() + from .prompts import JSON_SCHEMA_BUILDER from .utils import perform_completion_with_backoff for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items(): @@ -1216,7 +1229,7 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element. """ - def __init__(self, schema: Dict[str, Any], **kwargs): + def __init__(self, schema: dict[str, Any], **kwargs): kwargs["input_format"] = "html" # Force HTML input super().__init__(schema, **kwargs) @@ -1242,7 +1255,7 @@ def _get_element_attribute(self, element, attribute: str): return element.get(attribute) class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy): - def __init__(self, schema: Dict[str, Any], **kwargs): + def __init__(self, schema: dict[str, Any], **kwargs): kwargs["input_format"] = "html" super().__init__(schema, **kwargs) self._selector_cache = {} @@ -1513,7 +1526,7 @@ def _clear_caches(self): self._result_cache.clear() class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy): - def __init__(self, schema: Dict[str, Any], **kwargs): + def __init__(self, schema: dict[str, Any], **kwargs): kwargs["input_format"] = "html" # Force HTML input super().__init__(schema, **kwargs) self._selector_cache = {} @@ -1564,8 +1577,7 @@ def select_func(element): sub_selector = selector_str.split(')', 1)[-1].strip() if sub_selector: return element.xpath(f".//td[{col_num}]//{sub_selector}") - else: - return element.xpath(f".//td[{col_num}]") + return element.xpath(f".//td[{col_num}]") # Last resort: try each part of the selector separately parts = selector_str.split() @@ -1632,7 +1644,7 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element. 
""" - def __init__(self, schema: Dict[str, Any], **kwargs): + def __init__(self, schema: dict[str, Any], **kwargs): kwargs["input_format"] = "html" # Force HTML input super().__init__(schema, **kwargs) @@ -1683,7 +1695,7 @@ def _get_element_attribute(self, element, attribute: str): _WB_FIX = re.compile(r"\x08") # stray back-space โ†’ word-boundary _NEEDS_ESCAPE = re.compile(r"(? Dict[str, str]: +def _sanitize_schema(schema: dict[str, str]) -> dict[str, str]: """Fix common JSON-escape goofs coming from LLMs or manual edits.""" safe = {} for label, pat in schema.items(): @@ -1781,7 +1793,7 @@ class _B(IntFlag): # ------------------------------------------------------------------ # # Built-in pattern catalog # ------------------------------------------------------------------ # - DEFAULT_PATTERNS: Dict[str, str] = { + DEFAULT_PATTERNS: dict[str, str] = { # Communication "email": r"[\w.+-]+@[\w-]+\.[\w.-]+", "phone_intl": r"\+?\d[\d .()-]{7,}\d", @@ -1822,9 +1834,9 @@ class _B(IntFlag): # ------------------------------------------------------------------ # def __init__( self, - pattern: "_B" = _B.NOTHING, + pattern: _B = _B.NOTHING, *, - custom: Optional[Union[Dict[str, str], List[Tuple[str, str]]]] = None, + custom: dict[str, str] | list[tuple[str, str]] | None = None, input_format: str = "fit_html", **kwargs, ) -> None: @@ -1838,7 +1850,7 @@ def __init__( super().__init__(input_format=input_format, **kwargs) # 1๏ธโƒฃ take only the requested built-ins - merged: Dict[str, str] = { + merged: dict[str, str] = { key: rx for key, rx in self.DEFAULT_PATTERNS.items() if getattr(self._B, key.upper()).value & pattern @@ -1851,16 +1863,16 @@ def __init__( else: # iterable of (label, regex) merged.update({lbl: rx for lbl, rx in custom}) - self._compiled: Dict[str, Pattern] = { + self._compiled: dict[str, Pattern] = { lbl: re.compile(rx, self._FLAGS) for lbl, rx in merged.items() } # ------------------------------------------------------------------ # # Extraction # ------------------------------------------------------------------ # - def extract(self, url: str, content: str, *q, **kw) -> List[Dict[str, Any]]: + def extract(self, url: str, content: str, *q, **kw) -> list[dict[str, Any]]: # text = self._plain_text(html) - out: List[Dict[str, Any]] = [] + out: list[dict[str, Any]] = [] for label, cre in self._compiled.items(): for m in cre.finditer(content): @@ -1893,11 +1905,11 @@ def generate_pattern( label: str, html: str, *, - query: Optional[str] = None, - examples: Optional[List[str]] = None, - llm_config: Optional[LLMConfig] = None, + query: str | None = None, + examples: list[str] | None = None, + llm_config: LLMConfig | None = None, **kwargs, - ) -> Dict[str, str]: + ) -> dict[str, str]: """ Ask an LLM for a single page-specific regex and return {label: pattern} โ”€โ”€ ready for RegexExtractionStrategy(custom=โ€ฆ) @@ -1912,6 +1924,7 @@ def generate_pattern( # โ”€โ”€ default LLM config if llm_config is None: + from .async_configs import create_llm_config llm_config = create_llm_config() # โ”€โ”€ system prompt โ€“ hardened diff --git a/crawl4ai/hub.py b/crawl4ai/hub.py deleted file mode 100644 index 75056df77..000000000 --- a/crawl4ai/hub.py +++ /dev/null @@ -1,69 +0,0 @@ -# crawl4ai/hub.py -from abc import ABC, abstractmethod -from typing import Dict, Type, Union -import logging -import importlib -from pathlib import Path -import inspect - -logger = logging.getLogger(__name__) - - -class BaseCrawler(ABC): - def __init__(self): - self.logger = logging.getLogger(self.__class__.__name__) - - 
@abstractmethod - async def run(self, url: str = "", **kwargs) -> str: - """ - Implement this method to return JSON string. - Must accept URL + arbitrary kwargs for flexibility. - """ - pass - - def __init_subclass__(cls, **kwargs): - """Enforce interface validation on subclassing""" - super().__init_subclass__(**kwargs) - - # Verify run method signature - run_method = cls.run - if not run_method.__code__.co_argcount >= 2: # self + url - raise TypeError(f"{cls.__name__} must implement 'run(self, url: str, **kwargs)'") - - # Verify async nature - if not inspect.iscoroutinefunction(run_method): - raise TypeError(f"{cls.__name__}.run must be async") - -class CrawlerHub: - _crawlers: Dict[str, Type[BaseCrawler]] = {} - - @classmethod - def _discover_crawlers(cls): - """Dynamically load crawlers from /crawlers in 3 lines""" - base_path = Path(__file__).parent / "crawlers" - for crawler_dir in base_path.iterdir(): - if crawler_dir.is_dir(): - try: - module = importlib.import_module( - f"crawl4ai.crawlers.{crawler_dir.name}.crawler" - ) - for attr in dir(module): - cls._maybe_register_crawler( - getattr(module, attr), crawler_dir.name - ) - except Exception as e: - logger.warning(f"Failed {crawler_dir.name}: {str(e)}") - - @classmethod - def _maybe_register_crawler(cls, obj, name: str): - """Brilliant one-liner registration""" - if isinstance(obj, type) and issubclass(obj, BaseCrawler) and obj != BaseCrawler: - module = importlib.import_module(obj.__module__) - obj.meta = getattr(module, "__meta__", {}) - cls._crawlers[name] = obj - - @classmethod - def get(cls, name: str) -> Union[Type[BaseCrawler], None]: - if not cls._crawlers: - cls._discover_crawlers() - return cls._crawlers.get(name) \ No newline at end of file diff --git a/crawl4ai/legacy/__init__.py b/crawl4ai/legacy/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/crawl4ai/legacy/cli.py b/crawl4ai/legacy/cli.py deleted file mode 100644 index b2d2199ec..000000000 --- a/crawl4ai/legacy/cli.py +++ /dev/null @@ -1,123 +0,0 @@ -import click -import sys -import asyncio -from typing import List -from .docs_manager import DocsManager -from .async_logger import AsyncLogger - -logger = AsyncLogger(verbose=True) -docs_manager = DocsManager(logger) - - -def print_table(headers: List[str], rows: List[List[str]], padding: int = 2): - """Print formatted table with headers and rows""" - widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)] - border = "+" + "+".join("-" * (w + 2 * padding) for w in widths) + "+" - - def format_row(row): - return ( - "|" - + "|".join( - f"{' ' * padding}{str(cell):<{w}}{' ' * padding}" - for cell, w in zip(row, widths) - ) - + "|" - ) - - click.echo(border) - click.echo(format_row(headers)) - click.echo(border) - for row in rows: - click.echo(format_row(row)) - click.echo(border) - - -@click.group() -def cli(): - """Crawl4AI Command Line Interface""" - pass - - -@cli.group() -def docs(): - """Documentation operations""" - pass - - -@docs.command() -@click.argument("sections", nargs=-1) -@click.option( - "--mode", type=click.Choice(["extended", "condensed"]), default="extended" -) -def combine(sections: tuple, mode: str): - """Combine documentation sections""" - try: - asyncio.run(docs_manager.ensure_docs_exist()) - click.echo(docs_manager.generate(sections, mode)) - except Exception as e: - logger.error(str(e), tag="ERROR") - sys.exit(1) - - -@docs.command() -@click.argument("query") -@click.option("--top-k", "-k", default=5) -@click.option("--build-index", is_flag=True, 
help="Build index if missing") -def search(query: str, top_k: int, build_index: bool): - """Search documentation""" - try: - result = docs_manager.search(query, top_k) - if result == "No search index available. Call build_search_index() first.": - if build_index or click.confirm("No search index found. Build it now?"): - asyncio.run(docs_manager.llm_text.generate_index_files()) - result = docs_manager.search(query, top_k) - click.echo(result) - except Exception as e: - click.echo(f"Error: {str(e)}", err=True) - sys.exit(1) - - -@docs.command() -def update(): - """Update docs from GitHub""" - try: - asyncio.run(docs_manager.fetch_docs()) - click.echo("Documentation updated successfully") - except Exception as e: - click.echo(f"Error: {str(e)}", err=True) - sys.exit(1) - - -@docs.command() -@click.option("--force-facts", is_flag=True, help="Force regenerate fact files") -@click.option("--clear-cache", is_flag=True, help="Clear BM25 cache") -def index(force_facts: bool, clear_cache: bool): - """Build or rebuild search indexes""" - try: - asyncio.run(docs_manager.ensure_docs_exist()) - asyncio.run( - docs_manager.llm_text.generate_index_files( - force_generate_facts=force_facts, clear_bm25_cache=clear_cache - ) - ) - click.echo("Search indexes built successfully") - except Exception as e: - click.echo(f"Error: {str(e)}", err=True) - sys.exit(1) - - -# Add docs list command -@docs.command() -def list(): - """List available documentation sections""" - try: - sections = docs_manager.list() - print_table(["Sections"], [[section] for section in sections]) - - except Exception as e: - click.echo(f"Error: {str(e)}", err=True) - sys.exit(1) - - -if __name__ == "__main__": - cli() diff --git a/crawl4ai/legacy/crawler_strategy.py b/crawl4ai/legacy/crawler_strategy.py deleted file mode 100644 index 34e20ecd8..000000000 --- a/crawl4ai/legacy/crawler_strategy.py +++ /dev/null @@ -1,394 +0,0 @@ -from abc import ABC, abstractmethod -from selenium import webdriver -from selenium.webdriver.chrome.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.chrome.options import Options -from selenium.common.exceptions import InvalidArgumentException, WebDriverException -# from selenium.webdriver.chrome.service import Service as ChromeService -# from webdriver_manager.chrome import ChromeDriverManager -# from urllib3.exceptions import MaxRetryError - -from .config import * -import logging, time -import base64 -from PIL import Image, ImageDraw, ImageFont -from io import BytesIO -from typing import Callable -import requests -import os -from pathlib import Path -from .utils import * - -logger = logging.getLogger("selenium.webdriver.remote.remote_connection") -logger.setLevel(logging.WARNING) - -logger_driver = logging.getLogger("selenium.webdriver.common.service") -logger_driver.setLevel(logging.WARNING) - -urllib3_logger = logging.getLogger("urllib3.connectionpool") -urllib3_logger.setLevel(logging.WARNING) - -# Disable http.client logging -http_client_logger = logging.getLogger("http.client") -http_client_logger.setLevel(logging.WARNING) - -# Disable driver_finder and service logging -driver_finder_logger = logging.getLogger("selenium.webdriver.common.driver_finder") -driver_finder_logger.setLevel(logging.WARNING) - - -class CrawlerStrategy(ABC): - @abstractmethod - def crawl(self, url: str, **kwargs) -> str: - pass - - @abstractmethod - def 
take_screenshot(self, save_path: str): - pass - - @abstractmethod - def update_user_agent(self, user_agent: str): - pass - - @abstractmethod - def set_hook(self, hook_type: str, hook: Callable): - pass - - -class CloudCrawlerStrategy(CrawlerStrategy): - def __init__(self, use_cached_html=False): - super().__init__() - self.use_cached_html = use_cached_html - - def crawl(self, url: str) -> str: - data = { - "urls": [url], - "include_raw_html": True, - "forced": True, - "extract_blocks": False, - } - - response = requests.post("http://crawl4ai.uccode.io/crawl", json=data) - response = response.json() - html = response["results"][0]["html"] - return sanitize_input_encode(html) - - -class LocalSeleniumCrawlerStrategy(CrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, **kwargs): - super().__init__() - print("[LOG] ๐Ÿš€ Initializing LocalSeleniumCrawlerStrategy") - self.options = Options() - self.options.headless = True - if kwargs.get("proxy"): - self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy"))) - if kwargs.get("user_agent"): - self.options.add_argument("--user-agent=" + kwargs.get("user_agent")) - else: - user_agent = kwargs.get( - "user_agent", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - ) - self.options.add_argument(f"--user-agent={user_agent}") - self.options.add_argument( - "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - ) - - self.options.headless = kwargs.get("headless", True) - if self.options.headless: - self.options.add_argument("--headless") - - self.options.add_argument("--disable-gpu") - self.options.add_argument("--window-size=1920,1080") - self.options.add_argument("--no-sandbox") - self.options.add_argument("--disable-dev-shm-usage") - self.options.add_argument("--disable-blink-features=AutomationControlled") - - # self.options.add_argument("--disable-dev-shm-usage") - self.options.add_argument("--disable-gpu") - # self.options.add_argument("--disable-extensions") - # self.options.add_argument("--disable-infobars") - # self.options.add_argument("--disable-logging") - # self.options.add_argument("--disable-popup-blocking") - # self.options.add_argument("--disable-translate") - # self.options.add_argument("--disable-default-apps") - # self.options.add_argument("--disable-background-networking") - # self.options.add_argument("--disable-sync") - # self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess") - # self.options.add_argument("--disable-browser-side-navigation") - # self.options.add_argument("--dns-prefetch-disable") - # self.options.add_argument("--disable-web-security") - self.options.add_argument("--log-level=3") - self.use_cached_html = use_cached_html - self.use_cached_html = use_cached_html - self.js_code = js_code - self.verbose = kwargs.get("verbose", False) - - # Hooks - self.hooks = { - "on_driver_created": None, - "on_user_agent_updated": None, - "before_get_url": None, - "after_get_url": None, - "before_return_html": None, - } - - # chromedriver_autoinstaller.install() - # import chromedriver_autoinstaller - # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") - # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options) - # chromedriver_path = chromedriver_autoinstaller.install() - # chromedriver_path = 
chromedriver_autoinstaller.utils.download_chromedriver() - # self.service = Service(chromedriver_autoinstaller.install()) - - # chromedriver_path = ChromeDriverManager().install() - # self.service = Service(chromedriver_path) - # self.service.log_path = "NUL" - # self.driver = webdriver.Chrome(service=self.service, options=self.options) - - # Use selenium-manager (built into Selenium 4.10.0+) - self.service = Service() - self.driver = webdriver.Chrome(options=self.options) - - self.driver = self.execute_hook("on_driver_created", self.driver) - - if kwargs.get("cookies"): - for cookie in kwargs.get("cookies"): - self.driver.add_cookie(cookie) - - def set_hook(self, hook_type: str, hook: Callable): - if hook_type in self.hooks: - self.hooks[hook_type] = hook - else: - raise ValueError(f"Invalid hook type: {hook_type}") - - def execute_hook(self, hook_type: str, *args): - hook = self.hooks.get(hook_type) - if hook: - result = hook(*args) - if result is not None: - if isinstance(result, webdriver.Chrome): - return result - else: - raise TypeError( - f"Hook {hook_type} must return an instance of webdriver.Chrome or None." - ) - # If the hook returns None or there is no hook, return self.driver - return self.driver - - def update_user_agent(self, user_agent: str): - self.options.add_argument(f"user-agent={user_agent}") - self.driver.quit() - self.driver = webdriver.Chrome(service=self.service, options=self.options) - self.driver = self.execute_hook("on_user_agent_updated", self.driver) - - def set_custom_headers(self, headers: dict): - # Enable Network domain for sending headers - self.driver.execute_cdp_cmd("Network.enable", {}) - # Set extra HTTP headers - self.driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": headers}) - - def _ensure_page_load(self, max_checks=6, check_interval=0.01): - initial_length = len(self.driver.page_source) - - for ix in range(max_checks): - # print(f"Checking page load: {ix}") - time.sleep(check_interval) - current_length = len(self.driver.page_source) - - if current_length != initial_length: - break - - return self.driver.page_source - - def crawl(self, url: str, **kwargs) -> str: - # Create md5 hash of the URL - import hashlib - - url_hash = hashlib.md5(url.encode()).hexdigest() - - if self.use_cached_html: - cache_file_path = os.path.join( - os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), - ".crawl4ai", - "cache", - url_hash, - ) - if os.path.exists(cache_file_path): - with open(cache_file_path, "r") as f: - return sanitize_input_encode(f.read()) - - try: - self.driver = self.execute_hook("before_get_url", self.driver) - if self.verbose: - print(f"[LOG] ๐Ÿ•ธ๏ธ Crawling {url} using LocalSeleniumCrawlerStrategy...") - self.driver.get(url) # - - WebDriverWait(self.driver, 20).until( - lambda d: d.execute_script("return document.readyState") == "complete" - ) - WebDriverWait(self.driver, 10).until( - EC.presence_of_all_elements_located((By.TAG_NAME, "body")) - ) - - self.driver.execute_script( - "window.scrollTo(0, document.body.scrollHeight);" - ) - - self.driver = self.execute_hook("after_get_url", self.driver) - html = sanitize_input_encode( - self._ensure_page_load() - ) # self.driver.page_source - can_not_be_done_headless = ( - False # Look at my creativity for naming variables - ) - - # TODO: Very ugly approach, but promise to change it! - if ( - kwargs.get("bypass_headless", False) - or html == "" - ): - print( - "[LOG] ๐Ÿ™Œ Page could not be loaded in headless mode. Trying non-headless mode..." 
- ) - can_not_be_done_headless = True - options = Options() - options.headless = False - # set window size very small - options.add_argument("--window-size=5,5") - driver = webdriver.Chrome(service=self.service, options=options) - driver.get(url) - self.driver = self.execute_hook("after_get_url", driver) - html = sanitize_input_encode(driver.page_source) - driver.quit() - - # Execute JS code if provided - self.js_code = kwargs.get("js_code", self.js_code) - if self.js_code and type(self.js_code) == str: - self.driver.execute_script(self.js_code) - # Optionally, wait for some condition after executing the JS code - WebDriverWait(self.driver, 10).until( - lambda driver: driver.execute_script("return document.readyState") - == "complete" - ) - elif self.js_code and type(self.js_code) == list: - for js in self.js_code: - self.driver.execute_script(js) - WebDriverWait(self.driver, 10).until( - lambda driver: driver.execute_script( - "return document.readyState" - ) - == "complete" - ) - - # Optionally, wait for some condition after executing the JS code : Contributed by (https://github.com/jonymusky) - wait_for = kwargs.get("wait_for", False) - if wait_for: - if callable(wait_for): - print("[LOG] ๐Ÿ”„ Waiting for condition...") - WebDriverWait(self.driver, 20).until(wait_for) - else: - print("[LOG] ๐Ÿ”„ Waiting for condition...") - WebDriverWait(self.driver, 20).until( - EC.presence_of_element_located((By.CSS_SELECTOR, wait_for)) - ) - - if not can_not_be_done_headless: - html = sanitize_input_encode(self.driver.page_source) - self.driver = self.execute_hook("before_return_html", self.driver, html) - - # Store in cache - cache_file_path = os.path.join( - os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), - ".crawl4ai", - "cache", - url_hash, - ) - with open(cache_file_path, "w", encoding="utf-8") as f: - f.write(html) - - if self.verbose: - print(f"[LOG] โœ… Crawled {url} successfully!") - - return html - except InvalidArgumentException as e: - if not hasattr(e, "msg"): - e.msg = sanitize_input_encode(str(e)) - raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}") - except WebDriverException as e: - # If e does nlt have msg attribute create it and set it to str(e) - if not hasattr(e, "msg"): - e.msg = sanitize_input_encode(str(e)) - raise WebDriverException(f"Failed to crawl {url}: {e.msg}") - except Exception as e: - if not hasattr(e, "msg"): - e.msg = sanitize_input_encode(str(e)) - raise Exception(f"Failed to crawl {url}: {e.msg}") - - def take_screenshot(self) -> str: - try: - # Get the dimensions of the page - total_width = self.driver.execute_script("return document.body.scrollWidth") - total_height = self.driver.execute_script( - "return document.body.scrollHeight" - ) - - # Set the window size to the dimensions of the page - self.driver.set_window_size(total_width, total_height) - - # Take screenshot - screenshot = self.driver.get_screenshot_as_png() - - # Open the screenshot with PIL - image = Image.open(BytesIO(screenshot)) - - # Convert image to RGB mode (this will handle both RGB and RGBA images) - rgb_image = image.convert("RGB") - - # Convert to JPEG and compress - buffered = BytesIO() - rgb_image.save(buffered, format="JPEG", quality=85) - img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") - - if self.verbose: - print("[LOG] ๐Ÿ“ธ Screenshot taken and converted to base64") - - return img_base64 - except Exception as e: - error_message = sanitize_input_encode( - f"Failed to take screenshot: {str(e)}" - ) - print(error_message) - - # Generate an image 
with black background - img = Image.new("RGB", (800, 600), color="black") - draw = ImageDraw.Draw(img) - - # Load a font - try: - font = ImageFont.truetype("arial.ttf", 40) - except IOError: - font = ImageFont.load_default() - - # Define text color and wrap the text - text_color = (255, 255, 255) - max_width = 780 - wrapped_text = wrap_text(draw, error_message, font, max_width) - - # Calculate text position - text_position = (10, 10) - - # Draw the text on the image - draw.text(text_position, wrapped_text, fill=text_color, font=font) - - # Convert to base64 - buffered = BytesIO() - img.save(buffered, format="JPEG") - img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") - - return img_base64 - - def quit(self): - self.driver.quit() diff --git a/crawl4ai/legacy/database.py b/crawl4ai/legacy/database.py deleted file mode 100644 index 815b6b051..000000000 --- a/crawl4ai/legacy/database.py +++ /dev/null @@ -1,180 +0,0 @@ -import os -from pathlib import Path -import sqlite3 -from typing import Optional, Tuple - -DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") -os.makedirs(DB_PATH, exist_ok=True) -DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") - - -def init_db(): - global DB_PATH - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - cursor.execute( - """ - CREATE TABLE IF NOT EXISTS crawled_data ( - url TEXT PRIMARY KEY, - html TEXT, - cleaned_html TEXT, - markdown TEXT, - extracted_content TEXT, - success BOOLEAN, - media TEXT DEFAULT "{}", - links TEXT DEFAULT "{}", - metadata TEXT DEFAULT "{}", - screenshot TEXT DEFAULT "" - ) - """ - ) - conn.commit() - conn.close() - - -def alter_db_add_screenshot(new_column: str = "media"): - check_db_path() - try: - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - cursor.execute( - f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""' - ) - conn.commit() - conn.close() - except Exception as e: - print(f"Error altering database to add screenshot column: {e}") - - -def check_db_path(): - if not DB_PATH: - raise ValueError("Database path is not set or is empty.") - - -def get_cached_url( - url: str, -) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: - check_db_path() - try: - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - cursor.execute( - "SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?", - (url,), - ) - result = cursor.fetchone() - conn.close() - return result - except Exception as e: - print(f"Error retrieving cached URL: {e}") - return None - - -def cache_url( - url: str, - html: str, - cleaned_html: str, - markdown: str, - extracted_content: str, - success: bool, - media: str = "{}", - links: str = "{}", - metadata: str = "{}", - screenshot: str = "", -): - check_db_path() - try: - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - cursor.execute( - """ - INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
- ON CONFLICT(url) DO UPDATE SET - html = excluded.html, - cleaned_html = excluded.cleaned_html, - markdown = excluded.markdown, - extracted_content = excluded.extracted_content, - success = excluded.success, - media = excluded.media, - links = excluded.links, - metadata = excluded.metadata, - screenshot = excluded.screenshot - """, - ( - url, - html, - cleaned_html, - markdown, - extracted_content, - success, - media, - links, - metadata, - screenshot, - ), - ) - conn.commit() - conn.close() - except Exception as e: - print(f"Error caching URL: {e}") - - -def get_total_count() -> int: - check_db_path() - try: - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - cursor.execute("SELECT COUNT(*) FROM crawled_data") - result = cursor.fetchone() - conn.close() - return result[0] - except Exception as e: - print(f"Error getting total count: {e}") - return 0 - - -def clear_db(): - check_db_path() - try: - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - cursor.execute("DELETE FROM crawled_data") - conn.commit() - conn.close() - except Exception as e: - print(f"Error clearing database: {e}") - - -def flush_db(): - check_db_path() - try: - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - cursor.execute("DROP TABLE crawled_data") - conn.commit() - conn.close() - except Exception as e: - print(f"Error flushing database: {e}") - - -def update_existing_records(new_column: str = "media", default_value: str = "{}"): - check_db_path() - try: - conn = sqlite3.connect(DB_PATH) - cursor = conn.cursor() - cursor.execute( - f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL' - ) - conn.commit() - conn.close() - except Exception as e: - print(f"Error updating existing records: {e}") - - -if __name__ == "__main__": - # Delete the existing database file - if os.path.exists(DB_PATH): - os.remove(DB_PATH) - init_db() - # alter_db_add_screenshot("COL_NAME") diff --git a/crawl4ai/legacy/docs_manager.py b/crawl4ai/legacy/docs_manager.py deleted file mode 100644 index 9a6096a5e..000000000 --- a/crawl4ai/legacy/docs_manager.py +++ /dev/null @@ -1,75 +0,0 @@ -import requests -import shutil -from pathlib import Path -from crawl4ai.async_logger import AsyncLogger -from crawl4ai.llmtxt import AsyncLLMTextManager - - -class DocsManager: - def __init__(self, logger=None): - self.docs_dir = Path.home() / ".crawl4ai" / "docs" - self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt" - self.docs_dir.mkdir(parents=True, exist_ok=True) - self.logger = logger or AsyncLogger(verbose=True) - self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger) - - async def ensure_docs_exist(self): - """Fetch docs if not present""" - if not any(self.docs_dir.iterdir()): - await self.fetch_docs() - - async def fetch_docs(self) -> bool: - """Copy from local docs or download from GitHub""" - try: - # Try local first - if self.local_docs.exists() and ( - any(self.local_docs.glob("*.md")) - or any(self.local_docs.glob("*.tokens")) - ): - # Empty the local docs directory - for file_path in self.docs_dir.glob("*.md"): - file_path.unlink() - # for file_path in self.docs_dir.glob("*.tokens"): - # file_path.unlink() - for file_path in self.local_docs.glob("*.md"): - shutil.copy2(file_path, self.docs_dir / file_path.name) - # for file_path in self.local_docs.glob("*.tokens"): - # shutil.copy2(file_path, self.docs_dir / file_path.name) - return True - - # Fallback to GitHub - response = requests.get( - "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt", - 
headers={"Accept": "application/vnd.github.v3+json"}, - ) - response.raise_for_status() - - for item in response.json(): - if item["type"] == "file" and item["name"].endswith(".md"): - content = requests.get(item["download_url"]).text - with open(self.docs_dir / item["name"], "w", encoding="utf-8") as f: - f.write(content) - return True - - except Exception as e: - self.logger.error(f"Failed to fetch docs: {str(e)}") - raise - - def list(self) -> list[str]: - """List available topics""" - names = [file_path.stem for file_path in self.docs_dir.glob("*.md")] - # Remove [0-9]+_ prefix - names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names] - # Exclude those end with .xs.md and .q.md - names = [ - name - for name in names - if not name.endswith(".xs") and not name.endswith(".q") - ] - return names - - def generate(self, sections, mode="extended"): - return self.llm_text.generate(sections, mode) - - def search(self, query: str, top_k: int = 5): - return self.llm_text.search(query, top_k) diff --git a/crawl4ai/legacy/llmtxt.py b/crawl4ai/legacy/llmtxt.py deleted file mode 100644 index 302564165..000000000 --- a/crawl4ai/legacy/llmtxt.py +++ /dev/null @@ -1,546 +0,0 @@ -import os -from pathlib import Path -import re -from typing import Dict, List, Tuple, Optional, Any -import json -from tqdm import tqdm -import time -import psutil -import numpy as np -from rank_bm25 import BM25Okapi -from nltk.tokenize import word_tokenize -from nltk.corpus import stopwords -from nltk.stem import WordNetLemmatizer -from litellm import batch_completion -from .async_logger import AsyncLogger -import litellm -import pickle -import hashlib # <--- ADDED for file-hash -import glob - -litellm.set_verbose = False - - -def _compute_file_hash(file_path: Path) -> str: - """Compute MD5 hash for the file's entire content.""" - hash_md5 = hashlib.md5() - with file_path.open("rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() - - -class AsyncLLMTextManager: - def __init__( - self, - docs_dir: Path, - logger: Optional[AsyncLogger] = None, - max_concurrent_calls: int = 5, - batch_size: int = 3, - ) -> None: - self.docs_dir = docs_dir - self.logger = logger - self.max_concurrent_calls = max_concurrent_calls - self.batch_size = batch_size - self.bm25_index = None - self.document_map: Dict[str, Any] = {} - self.tokenized_facts: List[str] = [] - self.bm25_index_file = self.docs_dir / "bm25_index.pkl" - - async def _process_document_batch(self, doc_batch: List[Path]) -> None: - """Process a batch of documents in parallel""" - contents = [] - for file_path in doc_batch: - try: - with open(file_path, "r", encoding="utf-8") as f: - contents.append(f.read()) - except Exception as e: - self.logger.error(f"Error reading {file_path}: {str(e)}") - contents.append("") # Add empty content to maintain batch alignment - - prompt = """Given a documentation file, generate a list of atomic facts where each fact: -1. Represents a single piece of knowledge -2. Contains variations in terminology for the same concept -3. References relevant code patterns if they exist -4. 
Is written in a way that would match natural language queries - -Each fact should follow this format: -: | | - -Example Facts: -browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True) -redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0) -pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5] - -Wrap your response in ... tags. -""" - - # Prepare messages for batch processing - messages_list = [ - [ - { - "role": "user", - "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}", - } - ] - for content in contents - if content - ] - - try: - responses = batch_completion( - model="anthropic/claude-3-5-sonnet-latest", - messages=messages_list, - logger_fn=None, - ) - - # Process responses and save index files - for response, file_path in zip(responses, doc_batch): - try: - index_content_match = re.search( - r"(.*?)", - response.choices[0].message.content, - re.DOTALL, - ) - if not index_content_match: - self.logger.warning( - f"No ... content found for {file_path}" - ) - continue - - index_content = re.sub( - r"\n\s*\n", "\n", index_content_match.group(1) - ).strip() - if index_content: - index_file = file_path.with_suffix(".q.md") - with open(index_file, "w", encoding="utf-8") as f: - f.write(index_content) - self.logger.info(f"Created index file: {index_file}") - else: - self.logger.warning( - f"No index content found in response for {file_path}" - ) - - except Exception as e: - self.logger.error( - f"Error processing response for {file_path}: {str(e)}" - ) - - except Exception as e: - self.logger.error(f"Error in batch completion: {str(e)}") - - def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]: - if "|" not in line: - return False, "Missing separator '|'" - - parts = [p.strip() for p in line.split("|")] - if len(parts) != 3: - return False, f"Expected 3 parts, got {len(parts)}" - - concept_part = parts[0] - if ":" not in concept_part: - return False, "Missing ':' in concept definition" - - return True, None - - def _load_or_create_token_cache(self, fact_file: Path) -> Dict: - """ - Load token cache from .q.tokens if present and matching file hash. - Otherwise return a new structure with updated file-hash. 
- """ - cache_file = fact_file.with_suffix(".q.tokens") - current_hash = _compute_file_hash(fact_file) - - if cache_file.exists(): - try: - with open(cache_file, "r") as f: - cache = json.load(f) - # If the hash matches, return it directly - if cache.get("content_hash") == current_hash: - return cache - # Otherwise, we signal that it's changed - self.logger.info(f"Hash changed for {fact_file}, reindex needed.") - except json.JSONDecodeError: - self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.") - except Exception as e: - self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}") - - # Return a fresh cache - return {"facts": {}, "content_hash": current_hash} - - def _save_token_cache(self, fact_file: Path, cache: Dict) -> None: - cache_file = fact_file.with_suffix(".q.tokens") - # Always ensure we're saving the correct file-hash - cache["content_hash"] = _compute_file_hash(fact_file) - with open(cache_file, "w") as f: - json.dump(cache, f) - - def preprocess_text(self, text: str) -> List[str]: - parts = [x.strip() for x in text.split("|")] if "|" in text else [text] - # Remove : after the first word of parts[0] - parts[0] = re.sub(r"^(.*?):", r"\1", parts[0]) - - lemmatizer = WordNetLemmatizer() - stop_words = set(stopwords.words("english")) - { - "how", - "what", - "when", - "where", - "why", - "which", - } - - tokens = [] - for part in parts: - if "(" in part and ")" in part: - code_tokens = re.findall( - r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part - ) - tokens.extend(code_tokens) - - words = word_tokenize(part.lower()) - tokens.extend( - [ - lemmatizer.lemmatize(token) - for token in words - if token not in stop_words - ] - ) - - return tokens - - def maybe_load_bm25_index(self, clear_cache=False) -> bool: - """ - Load existing BM25 index from disk, if present and clear_cache=False. - """ - if not clear_cache and os.path.exists(self.bm25_index_file): - self.logger.info("Loading existing BM25 index from disk.") - with open(self.bm25_index_file, "rb") as f: - data = pickle.load(f) - self.tokenized_facts = data["tokenized_facts"] - self.bm25_index = data["bm25_index"] - return True - return False - - def build_search_index(self, clear_cache=False) -> None: - """ - Checks for new or modified .q.md files by comparing file-hash. - If none need reindexing and clear_cache is False, loads existing index if available. - Otherwise, reindexes only changed/new files and merges or creates a new index. 
- """ - # If clear_cache is True, we skip partial logic: rebuild everything from scratch - if clear_cache: - self.logger.info("Clearing cache and rebuilding full search index.") - if self.bm25_index_file.exists(): - self.bm25_index_file.unlink() - - process = psutil.Process() - self.logger.info("Checking which .q.md files need (re)indexing...") - - # Gather all .q.md files - q_files = [ - self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md") - ] - - # We'll store known (unchanged) facts in these lists - existing_facts: List[str] = [] - existing_tokens: List[List[str]] = [] - - # Keep track of invalid lines for logging - invalid_lines = [] - needSet = [] # files that must be (re)indexed - - for qf in q_files: - token_cache_file = qf.with_suffix(".q.tokens") - - # If no .q.tokens or clear_cache is True โ†’ definitely reindex - if clear_cache or not token_cache_file.exists(): - needSet.append(qf) - continue - - # Otherwise, load the existing cache and compare hash - cache = self._load_or_create_token_cache(qf) - # If the .q.tokens was out of date (i.e. changed hash), we reindex - if len(cache["facts"]) == 0 or cache.get( - "content_hash" - ) != _compute_file_hash(qf): - needSet.append(qf) - else: - # File is unchanged โ†’ retrieve cached token data - for line, cache_data in cache["facts"].items(): - existing_facts.append(line) - existing_tokens.append(cache_data["tokens"]) - self.document_map[line] = qf # track the doc for that fact - - if not needSet and not clear_cache: - # If no file needs reindexing, try loading existing index - if self.maybe_load_bm25_index(clear_cache=False): - self.logger.info( - "No new/changed .q.md files found. Using existing BM25 index." - ) - return - else: - # If there's no existing index, we must build a fresh index from the old caches - self.logger.info( - "No existing BM25 index found. Building from cached facts." - ) - if existing_facts: - self.logger.info( - f"Building BM25 index with {len(existing_facts)} cached facts." - ) - self.bm25_index = BM25Okapi(existing_tokens) - self.tokenized_facts = existing_facts - with open(self.bm25_index_file, "wb") as f: - pickle.dump( - { - "bm25_index": self.bm25_index, - "tokenized_facts": self.tokenized_facts, - }, - f, - ) - else: - self.logger.warning("No facts found at all. Index remains empty.") - return - - # ----------------------------------------------------- /Users/unclecode/.crawl4ai/docs/14_proxy_security.q.q.tokens '/Users/unclecode/.crawl4ai/docs/14_proxy_security.q.md' - # If we reach here, we have new or changed .q.md files - # We'll parse them, reindex them, and then combine with existing_facts - # ----------------------------------------------------- - - self.logger.info(f"{len(needSet)} file(s) need reindexing. 
Parsing now...") - - # 1) Parse the new or changed .q.md files - new_facts = [] - new_tokens = [] - with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar: - for file in needSet: - # We'll build up a fresh cache - fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)} - try: - with open(file, "r", encoding="utf-8") as f_obj: - content = f_obj.read().strip() - lines = [l.strip() for l in content.split("\n") if l.strip()] - - for line in lines: - is_valid, error = self._validate_fact_line(line) - if not is_valid: - invalid_lines.append((file, line, error)) - continue - - tokens = self.preprocess_text(line) - fresh_cache["facts"][line] = { - "tokens": tokens, - "added": time.time(), - } - new_facts.append(line) - new_tokens.append(tokens) - self.document_map[line] = file - - # Save the new .q.tokens with updated hash - self._save_token_cache(file, fresh_cache) - - mem_usage = process.memory_info().rss / 1024 / 1024 - self.logger.debug( - f"Memory usage after {file.name}: {mem_usage:.2f}MB" - ) - - except Exception as e: - self.logger.error(f"Error processing {file}: {str(e)}") - - file_pbar.update(1) - - if invalid_lines: - self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:") - for file, line, error in invalid_lines: - self.logger.warning(f"{file}: {error} in line: {line[:50]}...") - - # 2) Merge newly tokenized facts with the existing ones - all_facts = existing_facts + new_facts - all_tokens = existing_tokens + new_tokens - - # 3) Build BM25 index from combined facts - self.logger.info( - f"Building BM25 index with {len(all_facts)} total facts (old + new)." - ) - self.bm25_index = BM25Okapi(all_tokens) - self.tokenized_facts = all_facts - - # 4) Save the updated BM25 index to disk - with open(self.bm25_index_file, "wb") as f: - pickle.dump( - { - "bm25_index": self.bm25_index, - "tokenized_facts": self.tokenized_facts, - }, - f, - ) - - final_mem = process.memory_info().rss / 1024 / 1024 - self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB") - - async def generate_index_files( - self, force_generate_facts: bool = False, clear_bm25_cache: bool = False - ) -> None: - """ - Generate index files for all documents in parallel batches - - Args: - force_generate_facts (bool): If True, regenerate indexes even if they exist - clear_bm25_cache (bool): If True, clear existing BM25 index cache - """ - self.logger.info("Starting index generation for documentation files.") - - md_files = [ - self.docs_dir / f - for f in os.listdir(self.docs_dir) - if f.endswith(".md") and not any(f.endswith(x) for x in [".q.md", ".xs.md"]) - ] - - # Filter out files that already have .q files unless force=True - if not force_generate_facts: - md_files = [ - f - for f in md_files - if not (self.docs_dir / f.name.replace(".md", ".q.md")).exists() - ] - - if not md_files: - self.logger.info("All index files exist. 
Use force=True to regenerate.") - else: - # Process documents in batches - for i in range(0, len(md_files), self.batch_size): - batch = md_files[i : i + self.batch_size] - self.logger.info( - f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}" - ) - await self._process_document_batch(batch) - - self.logger.info("Index generation complete, building/updating search index.") - self.build_search_index(clear_cache=clear_bm25_cache) - - def generate(self, sections: List[str], mode: str = "extended") -> str: - # Get all markdown files - all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + glob.glob( - str(self.docs_dir / "[0-9]*.xs.md") - ) - - # Extract base names without extensions - base_docs = { - Path(f).name.split(".")[0] - for f in all_files - if not Path(f).name.endswith(".q.md") - } - - # Filter by sections if provided - if sections: - base_docs = { - doc - for doc in base_docs - if any(section.lower() in doc.lower() for section in sections) - } - - # Get file paths based on mode - files = [] - for doc in sorted( - base_docs, - key=lambda x: int(x.split("_")[0]) if x.split("_")[0].isdigit() else 999999, - ): - if mode == "condensed": - xs_file = self.docs_dir / f"{doc}.xs.md" - regular_file = self.docs_dir / f"{doc}.md" - files.append(str(xs_file if xs_file.exists() else regular_file)) - else: - files.append(str(self.docs_dir / f"{doc}.md")) - - # Read and format content - content = [] - for file in files: - try: - with open(file, "r", encoding="utf-8") as f: - fname = Path(file).name - content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}") - except Exception as e: - self.logger.error(f"Error reading {file}: {str(e)}") - - return "\n\n---\n\n".join(content) if content else "" - - def search(self, query: str, top_k: int = 5) -> str: - if not self.bm25_index: - return "No search index available. Call build_search_index() first." 
- - query_tokens = self.preprocess_text(query) - doc_scores = self.bm25_index.get_scores(query_tokens) - - mean_score = np.mean(doc_scores) - std_score = np.std(doc_scores) - score_threshold = mean_score + (0.25 * std_score) - - file_data = self._aggregate_search_scores( - doc_scores=doc_scores, - score_threshold=score_threshold, - query_tokens=query_tokens, - ) - - ranked_files = sorted( - file_data.items(), - key=lambda x: ( - x[1]["code_match_score"] * 2.0 - + x[1]["match_count"] * 1.5 - + x[1]["total_score"] - ), - reverse=True, - )[:top_k] - - results = [] - for file, _ in ranked_files: - main_doc = str(file).replace(".q.md", ".md") - if os.path.exists(self.docs_dir / main_doc): - with open(self.docs_dir / main_doc, "r", encoding="utf-8") as f: - only_file_name = main_doc.split("/")[-1] - content = ["#" * 20, f"# {only_file_name}", "#" * 20, "", f.read()] - results.append("\n".join(content)) - - return "\n\n---\n\n".join(results) - - def _aggregate_search_scores( - self, doc_scores: List[float], score_threshold: float, query_tokens: List[str] - ) -> Dict: - file_data = {} - - for idx, score in enumerate(doc_scores): - if score <= score_threshold: - continue - - fact = self.tokenized_facts[idx] - file_path = self.document_map[fact] - - if file_path not in file_data: - file_data[file_path] = { - "total_score": 0, - "match_count": 0, - "code_match_score": 0, - "matched_facts": [], - } - - components = fact.split("|") if "|" in fact else [fact] - - code_match_score = 0 - if len(components) == 3: - code_ref = components[2].strip() - code_tokens = self.preprocess_text(code_ref) - code_match_score = len(set(query_tokens) & set(code_tokens)) / len( - query_tokens - ) - - file_data[file_path]["total_score"] += score - file_data[file_path]["match_count"] += 1 - file_data[file_path]["code_match_score"] = max( - file_data[file_path]["code_match_score"], code_match_score - ) - file_data[file_path]["matched_facts"].append(fact) - - return file_data - - def refresh_index(self) -> None: - """Convenience method for a full rebuild.""" - self.build_search_index(clear_cache=True) diff --git a/crawl4ai/legacy/version_manager.py b/crawl4ai/legacy/version_manager.py deleted file mode 100644 index 17d73faaf..000000000 --- a/crawl4ai/legacy/version_manager.py +++ /dev/null @@ -1,29 +0,0 @@ -# version_manager.py -from pathlib import Path -from packaging import version -from . 
import __version__ - - -class VersionManager: - def __init__(self): - self.home_dir = Path.home() / ".crawl4ai" - self.version_file = self.home_dir / "version.txt" - - def get_installed_version(self): - """Get the version recorded in home directory""" - if not self.version_file.exists(): - return None - try: - return version.parse(self.version_file.read_text().strip()) - except: - return None - - def update_version(self): - """Update the version file to current library version""" - self.version_file.write_text(__version__.__version__) - - def needs_update(self): - """Check if database needs update based on version""" - installed = self.get_installed_version() - current = version.parse(__version__.__version__) - return installed is None or installed < current diff --git a/crawl4ai/legacy/web_crawler.py b/crawl4ai/legacy/web_crawler.py deleted file mode 100644 index 7e5230b8b..000000000 --- a/crawl4ai/legacy/web_crawler.py +++ /dev/null @@ -1,294 +0,0 @@ -import os, time - -os.environ["TOKENIZERS_PARALLELISM"] = "false" -from pathlib import Path - -from .models import UrlModel, CrawlResult -from .database import init_db, get_cached_url, cache_url -from .utils import * -from .chunking_strategy import * -from .extraction_strategy import * -from .crawler_strategy import * -from typing import List -from concurrent.futures import ThreadPoolExecutor -from ..content_scraping_strategy import LXMLWebScrapingStrategy as WebScrapingStrategy -from .config import * -import warnings -import json - -warnings.filterwarnings( - "ignore", - message='Field "model_name" has conflict with protected namespace "model_".', -) - - -class WebCrawler: - def __init__( - self, - crawler_strategy: CrawlerStrategy = None, - always_by_pass_cache: bool = False, - verbose: bool = False, - ): - self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy( - verbose=verbose - ) - self.always_by_pass_cache = always_by_pass_cache - self.crawl4ai_folder = os.path.join( - os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai" - ) - os.makedirs(self.crawl4ai_folder, exist_ok=True) - os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) - init_db() - self.ready = False - - def warmup(self): - print("[LOG] ๐ŸŒค๏ธ Warming up the WebCrawler") - self.run( - url="https://google.com/", - word_count_threshold=5, - extraction_strategy=NoExtractionStrategy(), - bypass_cache=False, - verbose=False, - ) - self.ready = True - print("[LOG] ๐ŸŒž WebCrawler is ready to crawl") - - def fetch_page( - self, - url_model: UrlModel, - provider: str = DEFAULT_PROVIDER, - api_token: str = None, - extract_blocks_flag: bool = True, - word_count_threshold=MIN_WORD_THRESHOLD, - css_selector: str = None, - screenshot: bool = False, - use_cached_html: bool = False, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - **kwargs, - ) -> CrawlResult: - return self.run( - url_model.url, - word_count_threshold, - extraction_strategy or NoExtractionStrategy(), - chunking_strategy, - bypass_cache=url_model.forced, - css_selector=css_selector, - screenshot=screenshot, - **kwargs, - ) - pass - - def fetch_pages( - self, - url_models: List[UrlModel], - provider: str = DEFAULT_PROVIDER, - api_token: str = None, - extract_blocks_flag: bool = True, - word_count_threshold=MIN_WORD_THRESHOLD, - use_cached_html: bool = False, - css_selector: str = None, - screenshot: bool = False, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - 
**kwargs, - ) -> List[CrawlResult]: - extraction_strategy = extraction_strategy or NoExtractionStrategy() - - def fetch_page_wrapper(url_model, *args, **kwargs): - return self.fetch_page(url_model, *args, **kwargs) - - with ThreadPoolExecutor() as executor: - results = list( - executor.map( - fetch_page_wrapper, - url_models, - [provider] * len(url_models), - [api_token] * len(url_models), - [extract_blocks_flag] * len(url_models), - [word_count_threshold] * len(url_models), - [css_selector] * len(url_models), - [screenshot] * len(url_models), - [use_cached_html] * len(url_models), - [extraction_strategy] * len(url_models), - [chunking_strategy] * len(url_models), - *[kwargs] * len(url_models), - ) - ) - - return results - - def run( - self, - url: str, - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - user_agent: str = None, - verbose=True, - **kwargs, - ) -> CrawlResult: - try: - extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) - - cached = None - screenshot_data = None - extracted_content = None - if not bypass_cache and not self.always_by_pass_cache: - cached = get_cached_url(url) - - if kwargs.get("warmup", True) and not self.ready: - return None - - if cached: - html = sanitize_input_encode(cached[1]) - extracted_content = sanitize_input_encode(cached[4]) - if screenshot: - screenshot_data = cached[9] - if not screenshot_data: - cached = None - - if not cached or not html: - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - t1 = time.time() - html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs)) - t2 = time.time() - if verbose: - print( - f"[LOG] ๐Ÿš€ Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" - ) - if screenshot: - screenshot_data = self.crawler_strategy.take_screenshot() - - crawl_result = self.process_html( - url, - html, - extracted_content, - word_count_threshold, - extraction_strategy, - chunking_strategy, - css_selector, - screenshot_data, - verbose, - bool(cached), - **kwargs, - ) - crawl_result.success = bool(html) - return crawl_result - except Exception as e: - if not hasattr(e, "msg"): - e.msg = str(e) - print(f"[ERROR] ๐Ÿšซ Failed to crawl {url}, error: {e.msg}") - return CrawlResult(url=url, html="", success=False, error_message=e.msg) - - def process_html( - self, - url: str, - html: str, - extracted_content: str, - word_count_threshold: int, - extraction_strategy: ExtractionStrategy, - chunking_strategy: ChunkingStrategy, - css_selector: str, - screenshot: bool, - verbose: bool, - is_cached: bool, - **kwargs, - ) -> CrawlResult: - t = time.time() - # Extract content from HTML - try: - t1 = time.time() - scrapping_strategy = WebScrapingStrategy() - extra_params = { - k: v - for k, v in kwargs.items() - if k not in ["only_text", "image_description_min_word_threshold"] - } - result = scrapping_strategy.scrap( - url, - html, - word_count_threshold=word_count_threshold, - css_selector=css_selector, - only_text=kwargs.get("only_text", False), - 
image_description_min_word_threshold=kwargs.get( - "image_description_min_word_threshold", - IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, - ), - **extra_params, - ) - - # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) - if verbose: - print( - f"[LOG] ๐Ÿš€ Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds" - ) - - if result is None: - raise ValueError(f"Failed to extract content from the website: {url}") - except InvalidCSSSelectorError as e: - raise ValueError(str(e)) - - cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) - markdown = sanitize_input_encode(result.get("markdown", "")) - media = result.get("media", []) - links = result.get("links", []) - metadata = result.get("metadata", {}) - - if extracted_content is None: - if verbose: - print( - f"[LOG] ๐Ÿ”ฅ Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}" - ) - - sections = chunking_strategy.chunk(markdown) - extracted_content = extraction_strategy.run(url, sections) - extracted_content = json.dumps( - extracted_content, indent=4, default=str, ensure_ascii=False - ) - - if verbose: - print( - f"[LOG] ๐Ÿš€ Extraction done for {url}, time taken: {time.time() - t:.2f} seconds." - ) - - screenshot = None if not screenshot else screenshot - - if not is_cached: - cache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - True, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=screenshot, - ) - - return CrawlResult( - url=url, - html=html, - cleaned_html=format_html(cleaned_html), - markdown=markdown, - media=media, - links=links, - metadata=metadata, - screenshot=screenshot, - extracted_content=extracted_content, - success=True, - error_message="", - ) diff --git a/crawl4ai/link_preview.py b/crawl4ai/link_preview.py index 13d32d58f..327566510 100644 --- a/crawl4ai/link_preview.py +++ b/crawl4ai/link_preview.py @@ -5,13 +5,15 @@ efficient parallel processing and caching infrastructure. """ -import asyncio import fnmatch -from typing import Dict, List, Optional, Any +from typing import Any + +from crawl4ai.cache_client import CacheClient, NoCacheClient + +from .async_configs import CrawlerRunConfig, SeedingConfig from .async_logger import AsyncLogger from .async_url_seeder import AsyncUrlSeeder -from .async_configs import SeedingConfig, CrawlerRunConfig -from .models import Links, Link +from .models import Link, Links from .utils import calculate_total_score @@ -27,15 +29,16 @@ class LinkPreview: - Memory-safe processing for large link sets """ - def __init__(self, logger: Optional[AsyncLogger] = None): + def __init__(self, cache_client: CacheClient = NoCacheClient(), logger: AsyncLogger | None = None): """ Initialize the LinkPreview. 
Args: logger: Optional logger instance for recording events """ + self.cache_client = cache_client self.logger = logger - self.seeder: Optional[AsyncUrlSeeder] = None + self.seeder: AsyncUrlSeeder | None = None self._owns_seeder = False async def __aenter__(self): @@ -50,7 +53,7 @@ async def __aexit__(self, exc_type, exc_val, exc_tb): async def start(self): """Initialize the URLSeeder instance.""" if not self.seeder: - self.seeder = AsyncUrlSeeder(logger=self.logger) + self.seeder = AsyncUrlSeeder(cache_client=self.cache_client, logger=self.logger) await self.seeder.__aenter__() self._owns_seeder = True @@ -109,7 +112,7 @@ async def extract_link_heads( return updated_links - def _filter_links(self, links: Links, link_config: Dict[str, Any]) -> List[str]: + def _filter_links(self, links: Links, link_config: dict[str, Any]) -> list[str]: """ Filter links based on configuration parameters. @@ -176,9 +179,9 @@ def _filter_links(self, links: Links, link_config: Dict[str, Any]) -> List[str]: async def _extract_heads_parallel( self, - urls: List[str], - link_config: Dict[str, Any] - ) -> List[Dict[str, Any]]: + urls: list[str], + link_config: dict[str, Any] + ) -> list[dict[str, Any]]: """ Extract head content for URLs using URLSeeder's parallel processing. @@ -223,10 +226,10 @@ async def _extract_heads_parallel( async def _extract_with_progress( self, - urls: List[str], + urls: list[str], seeding_config: SeedingConfig, - link_config: Dict[str, Any] - ) -> List[Dict[str, Any]]: + link_config: dict[str, Any] + ) -> list[dict[str, Any]]: """Extract head content with progress reporting.""" total_urls = len(urls) @@ -276,7 +279,7 @@ async def _extract_with_progress( def _merge_head_data( self, original_links: Links, - head_results: List[Dict[str, Any]], + head_results: list[dict[str, Any]], config: CrawlerRunConfig ) -> Links: """ diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py index d6da292f3..821e2e7a7 100644 --- a/crawl4ai/migrations.py +++ b/crawl4ai/migrations.py @@ -1,12 +1,15 @@ -import os import asyncio -from pathlib import Path -import aiosqlite -from typing import Optional -import xxhash -import aiofiles +import os import shutil from datetime import datetime +from typing import Optional + +import aiofiles +import aiosqlite +import xxhash + +from crawl4ai.utils import get_home_folder + from .async_logger import AsyncLogger, LogLevel # Initialize logger @@ -162,7 +165,7 @@ async def backup_database(db_path: str) -> str: async def run_migration(db_path: Optional[str] = None): """Run database migration""" if db_path is None: - db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") + db_path = os.path.join(get_home_folder(), "crawl4ai.db") if not os.path.exists(db_path): logger.info("No existing database found. Skipping migration.", tag="INIT") diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 640c2f2db..78a2f00e2 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -227,16 +227,6 @@ def fit_markdown(self): "Please use 'markdown.fit_markdown' instead." ) - @property - def fit_html(self): - """ - Deprecated property that raises an AttributeError when accessed. - """ - raise AttributeError( - "The 'fit_html' attribute is deprecated and has been removed. " - "Please use 'markdown.fit_html' instead." - ) - def model_dump(self, *args, **kwargs): """ Override model_dump to include the _markdown private attribute in serialization. 
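
The link_preview.py hunk above changes LinkPreview so a cache client is injected at construction time and forwarded to AsyncUrlSeeder inside start(). Below is a minimal usage sketch, assuming only the names visible in the diff (CacheClient/NoCacheClient from crawl4ai.cache_client, the defaults in the new signature); the no-op client is passed explicitly here purely for illustration, not as part of the patch.

```python
import asyncio

from crawl4ai.cache_client import NoCacheClient
from crawl4ai.link_preview import LinkPreview


async def main() -> None:
    # NoCacheClient is the declared default; a concrete CacheClient
    # implementation from crawl4ai.cache_client could be injected instead
    # so head-request results are shared with the rest of the crawler.
    preview = LinkPreview(cache_client=NoCacheClient(), logger=None)

    # start() builds AsyncUrlSeeder(cache_client=..., logger=...) exactly as
    # the hunk above shows; the async-with form on the class presumably does
    # the same via __aenter__.
    await preview.start()


asyncio.run(main())
```

Injecting the client rather than constructing caches inside the seeder appears intended to keep LinkPreview testable with the no-op client while letting callers share a single cache across components.
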
diff --git a/crawl4ai/table_extraction.py b/crawl4ai/table_extraction.py index b2f1992b8..d65ddf51a 100644 --- a/crawl4ai/table_extraction.py +++ b/crawl4ai/table_extraction.py @@ -5,17 +5,18 @@ The strategy pattern allows for flexible table extraction methods while maintaining a consistent interface. """ -from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Any, Union, Tuple -from lxml import etree -import re import json -from .types import LLMConfig, create_llm_config -from .utils import perform_completion_with_backoff, sanitize_html import os -from concurrent.futures import ThreadPoolExecutor, as_completed import time +from abc import ABC, abstractmethod +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Any + import tiktoken +from lxml import etree + +from .types import LLMConfig +from .utils import perform_completion_with_backoff, sanitize_html class TableExtractionStrategy(ABC): @@ -37,7 +38,7 @@ def __init__(self, **kwargs): self.logger = kwargs.get("logger", None) @abstractmethod - def extract_tables(self, element: etree.Element, **kwargs) -> List[Dict[str, Any]]: + def extract_tables(self, element: etree.Element, **kwargs) -> list[dict[str, Any]]: """ Extract tables from the given HTML element. @@ -87,7 +88,7 @@ def __init__(self, **kwargs): self.min_rows = kwargs.get("min_rows", 0) self.min_cols = kwargs.get("min_cols", 0) - def extract_tables(self, element: etree.Element, **kwargs) -> List[Dict[str, Any]]: + def extract_tables(self, element: etree.Element, **kwargs) -> list[dict[str, Any]]: """ Extract all data tables from the HTML element. @@ -210,7 +211,7 @@ def is_data_table(self, table: etree.Element, **kwargs) -> bool: threshold = kwargs.get("table_score_threshold", self.table_score_threshold) return score >= threshold - def extract_table_data(self, table: etree.Element) -> Dict[str, Any]: + def extract_table_data(self, table: etree.Element) -> dict[str, Any]: """ Extract structured data from a table element. @@ -303,7 +304,7 @@ class NoTableExtraction(TableExtractionStrategy): This can be used to explicitly disable table extraction when needed. """ - def extract_tables(self, element: etree.Element, **kwargs) -> List[Dict[str, Any]]: + def extract_tables(self, element: etree.Element, **kwargs) -> list[dict[str, Any]]: """ Return an empty list (no tables extracted). @@ -688,8 +689,8 @@ class LLMTableExtraction(TableExtractionStrategy): **CRITICAL**: Your response must be a valid JSON object that conforms to this schema. The entire purpose of using an LLM for this task is to handle complex HTML tables that standard parsers cannot process correctly. 
Your value lies in intelligently interpreting complex structures and returning complete, clean, tabulated data in valid JSON format.""" def __init__(self, - llm_config: Optional[LLMConfig] = None, - css_selector: Optional[str] = None, + llm_config: LLMConfig | None = None, + css_selector: str | None = None, max_tries: int = 3, enable_chunking: bool = True, chunk_token_threshold: int = 3000, @@ -717,6 +718,7 @@ def __init__(self, self.llm_config = llm_config if not self.llm_config: # Use default configuration if not provided + from .async_configs import create_llm_config self.llm_config = create_llm_config( provider=os.getenv("DEFAULT_PROVIDER", "openai/gpt-4o-mini"), api_token=os.getenv("OPENAI_API_KEY"), @@ -730,7 +732,7 @@ def __init__(self, self.max_parallel_chunks = max(1, max_parallel_chunks) self.extra_args = kwargs.get("extra_args", {}) - def extract_tables(self, element: etree.Element, **kwargs) -> List[Dict[str, Any]]: + def extract_tables(self, element: etree.Element, **kwargs) -> list[dict[str, Any]]: """ Extract tables from HTML using LLM. @@ -760,7 +762,7 @@ def extract_tables(self, element: etree.Element, **kwargs) -> List[Dict[str, Any # Check if there are any tables in the content if ' tags found in HTML content") + self._log("info", "No tags found in HTML content") return [] if self.verbose: @@ -857,18 +859,16 @@ def extract_tables(self, element: etree.Element, **kwargs) -> List[Dict[str, Any if self.verbose: self._log("warning", f"No valid tables extracted on attempt {attempt}, retrying...") continue - else: - if self.verbose: - self._log("warning", f"No valid tables extracted after {self.max_tries} attempts") - return [] + if self.verbose: + self._log("warning", f"No valid tables extracted after {self.max_tries} attempts") + return [] except json.JSONDecodeError as e: if self.verbose: self._log("error", f"JSON parsing error on attempt {attempt}: {str(e)}") if attempt < self.max_tries: continue - else: - return [] + return [] except Exception as e: if self.verbose: @@ -883,8 +883,7 @@ def extract_tables(self, element: etree.Element, **kwargs) -> List[Dict[str, Any import time time.sleep(1) continue - else: - return [] + return [] # Should not reach here, but return empty list as fallback return [] @@ -920,7 +919,7 @@ def _needs_chunking(self, html_content: str) -> bool: return needs_chunk - def _extract_table_structure(self, html_content: str) -> Tuple[List[etree.Element], List[etree.Element], List[etree.Element], bool]: + def _extract_table_structure(self, html_content: str) -> tuple[list[etree.Element], list[etree.Element], list[etree.Element], bool]: """ Extract headers, body rows, and footer from table HTML. @@ -981,7 +980,7 @@ def _extract_table_structure(self, html_content: str) -> Tuple[List[etree.Elemen return header_rows, body_rows, footer_rows, has_headers - def _create_smart_chunks(self, html_content: str) -> Tuple[List[str], bool]: + def _create_smart_chunks(self, html_content: str) -> tuple[list[str], bool]: """ Create smart chunks of table HTML, preserving headers in each chunk. @@ -1053,7 +1052,7 @@ def _create_smart_chunks(self, html_content: str) -> Tuple[List[str], bool]: return chunks, has_headers - def _create_chunk_html(self, header_html: str, body_rows: List[str], footer_html: Optional[str]) -> str: + def _create_chunk_html(self, header_html: str, body_rows: list[str], footer_html: str | None) -> str: """ Create a complete table HTML chunk with headers, body rows, and optional footer. 
""" @@ -1073,7 +1072,7 @@ def _create_chunk_html(self, header_html: str, body_rows: List[str], footer_html return ''.join(html_parts) - def _rebalance_chunks(self, chunks: List[str], min_rows: int) -> List[str]: + def _rebalance_chunks(self, chunks: list[str], min_rows: int) -> list[str]: """ Rebalance chunks to ensure minimum rows per chunk. Merge small chunks if necessary. @@ -1082,7 +1081,7 @@ def _rebalance_chunks(self, chunks: List[str], min_rows: int) -> List[str]: # In production, you'd want more sophisticated rebalancing return chunks - def _process_chunk(self, chunk_html: str, chunk_index: int, total_chunks: int, has_headers: bool = True) -> Dict[str, Any]: + def _process_chunk(self, chunk_html: str, chunk_index: int, total_chunks: int, has_headers: bool = True) -> dict[str, Any]: """ Process a single chunk with the LLM. """ @@ -1164,12 +1163,11 @@ def _process_chunk(self, chunk_html: str, chunk_index: int, total_chunks: int, h if attempt < self.max_tries: time.sleep(1) continue - else: - return {'chunk_index': chunk_index, 'table': None, 'error': str(e)} + return {'chunk_index': chunk_index, 'table': None, 'error': str(e)} return {'chunk_index': chunk_index, 'table': None} - def _merge_chunk_results(self, chunk_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + def _merge_chunk_results(self, chunk_results: list[dict[str, Any]]) -> list[dict[str, Any]]: """ Merge results from multiple chunks into a single table. """ @@ -1205,7 +1203,7 @@ def _merge_chunk_results(self, chunk_results: List[Dict[str, Any]]) -> List[Dict return [merged_table] - def _extract_with_chunking(self, html_content: str) -> List[Dict[str, Any]]: + def _extract_with_chunking(self, html_content: str) -> list[dict[str, Any]]: """ Extract tables using chunking and parallel processing. """ @@ -1251,12 +1249,12 @@ def _extract_with_chunking(self, html_content: str) -> List[Dict[str, Any]]: chunk_results.append({'chunk_index': chunk_index, 'table': None, 'error': str(e)}) if self.verbose: - self._log("info", f"All chunks processed, merging results...") + self._log("info", "All chunks processed, merging results...") # Merge results return self._merge_chunk_results(chunk_results) - def _css_to_xpath_select(self, element: etree.Element, css_selector: str) -> List[etree.Element]: + def _css_to_xpath_select(self, element: etree.Element, css_selector: str) -> list[etree.Element]: """ Convert CSS selector to XPath and select elements. This is a basic implementation - for complex CSS selectors, @@ -1301,7 +1299,7 @@ def _css_to_xpath_select(self, element: etree.Element, css_selector: str) -> Lis self._log("warning", f"XPath conversion failed for selector '{css_selector}': {str(e)}") return [] - def _validate_table_structure(self, table: Dict) -> bool: + def _validate_table_structure(self, table: dict) -> bool: """ Validate that the table has the required structure. @@ -1350,7 +1348,7 @@ def _validate_table_structure(self, table: Dict) -> bool: return True - def _ensure_table_format(self, table: Dict) -> Dict[str, Any]: + def _ensure_table_format(self, table: dict) -> dict[str, Any]: """ Ensure the table has all required fields with proper defaults. 
diff --git a/crawl4ai/types.py b/crawl4ai/types.py index 72a0828e9..782ebda17 100644 --- a/crawl4ai/types.py +++ b/crawl4ai/types.py @@ -8,7 +8,6 @@ AsyncWebCrawler = Union['AsyncWebCrawlerType'] CacheMode = Union['CacheModeType'] CrawlResult = Union['CrawlResultType'] -CrawlerHub = Union['CrawlerHubType'] BrowserProfiler = Union['BrowserProfilerType'] # NEW: Add AsyncUrlSeederType AsyncUrlSeeder = Union['AsyncUrlSeederType'] @@ -86,110 +85,181 @@ # Only import types during type checking to avoid circular imports if TYPE_CHECKING: # Logger imports - from .async_logger import ( - AsyncLoggerBase as AsyncLoggerBaseType, - AsyncLogger as AsyncLoggerType, - ) - - # Crawler core imports - from .async_webcrawler import ( - AsyncWebCrawler as AsyncWebCrawlerType, - CacheMode as CacheModeType, - ) - from .models import CrawlResult as CrawlResultType - from .hub import CrawlerHub as CrawlerHubType - from .browser_profiler import BrowserProfiler as BrowserProfilerType - # NEW: Import AsyncUrlSeeder for type checking - from .async_url_seeder import AsyncUrlSeeder as AsyncUrlSeederType - # Configuration imports from .async_configs import ( BrowserConfig as BrowserConfigType, + ) + from .async_configs import ( CrawlerRunConfig as CrawlerRunConfigType, + ) + from .async_configs import ( HTTPCrawlerConfig as HTTPCrawlerConfigType, + ) + from .async_configs import ( LLMConfig as LLMConfigType, + ) + from .async_configs import ( # NEW: Import SeedingConfig for type checking SeedingConfig as SeedingConfigType, ) - - # Content scraping imports - from .content_scraping_strategy import ( - ContentScrapingStrategy as ContentScrapingStrategyType, - LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType, + + # Dispatcher imports + from .async_dispatcher import ( + BaseDispatcher as BaseDispatcherType, ) - - # Proxy imports - from .proxy_strategy import ( - ProxyRotationStrategy as ProxyRotationStrategyType, - RoundRobinProxyStrategy as RoundRobinProxyStrategyType, + from .async_dispatcher import ( + CrawlerMonitor as CrawlerMonitorType, ) - - # Extraction imports - from .extraction_strategy import ( - ExtractionStrategy as ExtractionStrategyType, - LLMExtractionStrategy as LLMExtractionStrategyType, - CosineStrategy as CosineStrategyType, - JsonCssExtractionStrategy as JsonCssExtractionStrategyType, - JsonXPathExtractionStrategy as JsonXPathExtractionStrategyType, + from .async_dispatcher import ( + DisplayMode as DisplayModeType, + ) + from .async_dispatcher import ( + MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType, + ) + from .async_dispatcher import ( + RateLimiter as RateLimiterType, ) - + from .async_dispatcher import ( + RunManyReturn as RunManyReturnType, + ) + from .async_dispatcher import ( + SemaphoreDispatcher as SemaphoreDispatcherType, + ) + from .async_logger import ( + AsyncLogger as AsyncLoggerType, + ) + from .async_logger import ( + AsyncLoggerBase as AsyncLoggerBaseType, + ) + + # NEW: Import AsyncUrlSeeder for type checking + from .async_url_seeder import AsyncUrlSeeder as AsyncUrlSeederType + + # Crawler core imports + from .async_webcrawler import ( + AsyncWebCrawler as AsyncWebCrawlerType, + ) + from .async_webcrawler import ( + CacheMode as CacheModeType, + ) + from .browser_profiler import BrowserProfiler as BrowserProfilerType + # Chunking imports from .chunking_strategy import ( ChunkingStrategy as ChunkingStrategyType, + ) + from .chunking_strategy import ( RegexChunking as RegexChunkingType, ) - - # Markdown generation imports - from .markdown_generation_strategy import ( - 
DefaultMarkdownGenerator as DefaultMarkdownGeneratorType, + from .content_filter_strategy import ( + BM25ContentFilter as BM25ContentFilterType, ) - from .models import MarkdownGenerationResult as MarkdownGenerationResultType - + from .content_filter_strategy import ( + LLMContentFilter as LLMContentFilterType, + ) + from .content_filter_strategy import ( + PruningContentFilter as PruningContentFilterType, + ) + # Content filter imports from .content_filter_strategy import ( RelevantContentFilter as RelevantContentFilterType, - PruningContentFilter as PruningContentFilterType, - BM25ContentFilter as BM25ContentFilterType, - LLMContentFilter as LLMContentFilterType, ) - - # Dispatcher imports - from .async_dispatcher import ( - BaseDispatcher as BaseDispatcherType, - MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType, - SemaphoreDispatcher as SemaphoreDispatcherType, - RateLimiter as RateLimiterType, - CrawlerMonitor as CrawlerMonitorType, - DisplayMode as DisplayModeType, - RunManyReturn as RunManyReturnType, + + # Content scraping imports + from .content_scraping_strategy import ( + ContentScrapingStrategy as ContentScrapingStrategyType, + ) + from .content_scraping_strategy import ( + LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType, + ) + from .deep_crawling import ( + BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType, ) - - # Docker client - from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType - - # Deep crawling imports from .deep_crawling import ( - DeepCrawlStrategy as DeepCrawlStrategyType, BFSDeepCrawlStrategy as BFSDeepCrawlStrategyType, - FilterChain as FilterChainType, + ) + from .deep_crawling import ( + CompositeScorer as CompositeScorerType, + ) + from .deep_crawling import ( ContentTypeFilter as ContentTypeFilterType, + ) + from .deep_crawling import ( + DeepCrawlDecorator as DeepCrawlDecoratorType, + ) + + # Deep crawling imports + from .deep_crawling import ( + DeepCrawlStrategy as DeepCrawlStrategyType, + ) + from .deep_crawling import ( + DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType, + ) + from .deep_crawling import ( + DomainAuthorityScorer as DomainAuthorityScorerType, + ) + from .deep_crawling import ( DomainFilter as DomainFilterType, - URLFilter as URLFilterType, + ) + from .deep_crawling import ( + FilterChain as FilterChainType, + ) + from .deep_crawling import ( FilterStats as FilterStatsType, - SEOFilter as SEOFilterType, - KeywordRelevanceScorer as KeywordRelevanceScorerType, - URLScorer as URLScorerType, - CompositeScorer as CompositeScorerType, - DomainAuthorityScorer as DomainAuthorityScorerType, + ) + from .deep_crawling import ( FreshnessScorer as FreshnessScorerType, + ) + from .deep_crawling import ( + KeywordRelevanceScorer as KeywordRelevanceScorerType, + ) + from .deep_crawling import ( PathDepthScorer as PathDepthScorerType, - BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType, - DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType, - DeepCrawlDecorator as DeepCrawlDecoratorType, + ) + from .deep_crawling import ( + SEOFilter as SEOFilterType, + ) + from .deep_crawling import ( + URLFilter as URLFilterType, + ) + from .deep_crawling import ( + URLScorer as URLScorerType, + ) + + # Docker client + from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType + from .extraction_strategy import ( + CosineStrategy as CosineStrategyType, + ) + + # Extraction imports + from .extraction_strategy import ( + ExtractionStrategy as ExtractionStrategyType, + ) + from .extraction_strategy import ( 
+ JsonCssExtractionStrategy as JsonCssExtractionStrategyType, + ) + from .extraction_strategy import ( + JsonXPathExtractionStrategy as JsonXPathExtractionStrategyType, + ) + from .extraction_strategy import ( + LLMExtractionStrategy as LLMExtractionStrategyType, ) + # Markdown generation imports + from .markdown_generation_strategy import ( + DefaultMarkdownGenerator as DefaultMarkdownGeneratorType, + ) + from .models import CrawlResult as CrawlResultType + from .models import MarkdownGenerationResult as MarkdownGenerationResultType + + # Proxy imports + from .proxy_strategy import ( + ProxyRotationStrategy as ProxyRotationStrategyType, + ) + from .proxy_strategy import ( + RoundRobinProxyStrategy as RoundRobinProxyStrategyType, + ) -def create_llm_config(*args, **kwargs) -> 'LLMConfigType': - from .async_configs import LLMConfig - return LLMConfig(*args, **kwargs) \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 73f1d2a31..f3e8ec7b4 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1,57 +1,61 @@ -import time -from concurrent.futures import ThreadPoolExecutor, as_completed -from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString -import json +import asyncio +import cProfile import html -import lxml -import re +import json +import linecache import os -import subprocess import platform -from .prompts import PROMPT_EXTRACT_BLOCKS +import pstats +import re +import subprocess +import textwrap +import time +import traceback from array import array -from .html2text import html2text, CustomHTML2Text -# from .config import * -from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS -import httpx -from socket import gaierror +from collections import deque +from collections.abc import Callable, Generator, Iterable, Sequence +from concurrent.futures import ThreadPoolExecutor, as_completed +from functools import lru_cache, wraps +from itertools import chain from pathlib import Path -from typing import Dict, Any, List, Optional, Callable, Generator, Tuple, Iterable -from urllib.parse import urljoin -import requests -from requests.exceptions import InvalidSchema -import xxhash -import textwrap -import cProfile -import pstats -from functools import wraps -import asyncio -from lxml import etree, html as lhtml -import sqlite3 -import hashlib - -from urllib.robotparser import RobotFileParser -import aiohttp -from functools import lru_cache +from socket import gaierror +from urllib.parse import ( + parse_qs, + parse_qsl, + quote, + unquote, + urlencode, + urljoin, + urlparse, + urlunparse, +) -from packaging import version -from . import __version__ -from typing import Sequence +# Monkey patch to fix wildcard handling in urllib.robotparser +from urllib.robotparser import RobotFileParser, RuleLine -from itertools import chain -from collections import deque -import psutil +import aiohttp +import httpx +import lxml import numpy as np +import psutil +import xxhash +from bs4 import BeautifulSoup +from lxml import etree +from lxml import html as lhtml +from packaging import version -from urllib.parse import ( - urljoin, urlparse, urlunparse, - parse_qsl, urlencode, quote, unquote +from crawl4ai.cache_client import ( + DEFAULT_CACHE_TTL_SECONDS, + ROBOTS_CACHE_KEY_PREFIX, + CacheClient, ) - -# Monkey patch to fix wildcard handling in urllib.robotparser -from urllib.robotparser import RuleLine -import re +from . 
import __version__ +from .config import ( + DEFAULT_PROVIDER, + PROVIDER_MODELS, +) +from .prompts import PROMPT_EXTRACT_BLOCKS original_applies_to = RuleLine.applies_to @@ -77,8 +81,8 @@ def chunk_documents( chunk_token_threshold: int, overlap: int, word_token_rate: float = 0.75, - tokenizer: Optional[Callable[[str], List[str]]] = None, -) -> Generator[str, None, None]: + tokenizer: Callable[[str], list[str]] | None = None, +) -> Generator[str]: """ Efficiently chunks documents into token-limited sections with overlap between chunks. @@ -164,7 +168,7 @@ def merge_chunks( overlap: int = 0, word_token_ratio: float = 1.0, splitter: Callable = None -) -> List[str]: +) -> list[str]: """ Merges a sequence of documents into chunks based on a target token count, with optional overlap. @@ -183,7 +187,7 @@ def merge_chunks( # Pre-tokenize all docs and store token counts splitter = splitter or str.split token_counts = array('I') - all_tokens: List[List[str]] = [] + all_tokens: list[list[str]] = [] total_tokens = 0 for doc in docs: @@ -199,7 +203,7 @@ def merge_chunks( # Pre-allocate chunks num_chunks = max(1, (total_tokens + target_size - 1) // target_size) - chunks: List[List[str]] = [[] for _ in range(num_chunks)] + chunks: list[list[str]] = [[] for _ in range(num_chunks)] curr_chunk = 0 curr_size = 0 @@ -225,8 +229,7 @@ def merge_chunks( class VersionManager: def __init__(self): - self.home_dir = Path.home() / ".crawl4ai" - self.version_file = self.home_dir / "version.txt" + self.version_file = Path(get_home_folder()) / "version.txt" def get_installed_version(self): """Get the version recorded in home directory""" @@ -249,65 +252,22 @@ def needs_update(self): class RobotsParser: - # Default 7 days cache TTL - CACHE_TTL = 7 * 24 * 60 * 60 - - def __init__(self, cache_dir=None, cache_ttl=None): - self.cache_dir = cache_dir or os.path.join(get_home_folder(), ".crawl4ai", "robots") - self.cache_ttl = cache_ttl or self.CACHE_TTL - os.makedirs(self.cache_dir, exist_ok=True) - self.db_path = os.path.join(self.cache_dir, "robots_cache.db") - self._init_db() - - def _init_db(self): - # Use WAL mode for better concurrency and performance - with sqlite3.connect(self.db_path) as conn: - conn.execute("PRAGMA journal_mode=WAL") - conn.execute(""" - CREATE TABLE IF NOT EXISTS robots_cache ( - domain TEXT PRIMARY KEY, - rules TEXT NOT NULL, - fetch_time INTEGER NOT NULL, - hash TEXT NOT NULL - ) - """) - conn.execute("CREATE INDEX IF NOT EXISTS idx_domain ON robots_cache(domain)") - - def _get_cached_rules(self, domain: str) -> tuple[str, bool]: - """Get cached rules. 
Returns (rules, is_fresh)""" - with sqlite3.connect(self.db_path) as conn: - cursor = conn.execute( - "SELECT rules, fetch_time, hash FROM robots_cache WHERE domain = ?", - (domain,) - ) - result = cursor.fetchone() - - if not result: - return None, False - - rules, fetch_time, _ = result - # Check if cache is still fresh based on TTL - return rules, (time.time() - fetch_time) < self.cache_ttl + def __init__(self, cache_client: CacheClient, cache_ttl=None): + self.cache_client = cache_client + self.cache_ttl = cache_ttl or DEFAULT_CACHE_TTL_SECONDS + + def _get_cache_key(self, domain: str) -> str: + return f"{ROBOTS_CACHE_KEY_PREFIX}{domain}" + + def _get_cached_rules(self, domain: str) -> str: + return self.cache_client.get(key=self._get_cache_key(domain)) def _cache_rules(self, domain: str, content: str): - """Cache robots.txt content with hash for change detection""" - hash_val = hashlib.md5(content.encode()).hexdigest() - with sqlite3.connect(self.db_path) as conn: - # Check if content actually changed - cursor = conn.execute( - "SELECT hash FROM robots_cache WHERE domain = ?", - (domain,) - ) - result = cursor.fetchone() - - # Only update if hash changed or no previous entry - if not result or result[0] != hash_val: - conn.execute( - """INSERT OR REPLACE INTO robots_cache - (domain, rules, fetch_time, hash) - VALUES (?, ?, ?, ?)""", - (domain, content, int(time.time()), hash_val) - ) + self.cache_client.set( + key=self._get_cache_key(domain), + value=content, + ttl_seconds=self.cache_ttl + ) async def can_fetch(self, url: str, user_agent: str = "*") -> bool: """ @@ -330,10 +290,10 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool: return True # Fast path - check cache first - rules, is_fresh = self._get_cached_rules(domain) + rules = self._get_cached_rules(domain) # If rules not found or stale, fetch new ones - if not is_fresh: + if not rules: try: # Ensure we use the same scheme as the input URL scheme = parsed.scheme or 'http' @@ -350,9 +310,6 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool: # On any error (timeout, connection failed, etc), allow access return True - if not rules: - return True - # Create parser for this check parser = RobotFileParser() parser.parse(rules.splitlines()) @@ -363,18 +320,6 @@ async def can_fetch(self, url: str, user_agent: str = "*") -> bool: return parser.can_fetch(user_agent, url) - def clear_cache(self): - """Clear all cached robots.txt entries""" - with sqlite3.connect(self.db_path) as conn: - conn.execute("DELETE FROM robots_cache") - - def clear_expired(self): - """Remove only expired entries from cache""" - with sqlite3.connect(self.db_path) as conn: - expire_time = int(time.time()) - self.cache_ttl - conn.execute("DELETE FROM robots_cache WHERE fetch_time < ?", (expire_time,)) - - class InvalidCSSSelectorError(Exception): pass @@ -569,7 +514,7 @@ def get_system_memory(): system = platform.system() if system == "Linux": - with open("/proc/meminfo", "r") as mem: + with open("/proc/meminfo") as mem: for line in mem: if line.startswith("MemTotal:"): return int(line.split()[1]) * 1024 # Convert KB to bytes @@ -656,7 +601,7 @@ async def get_chromium_path(browser_type) -> str: home_folder = get_home_folder() path_file = os.path.join(home_folder, f"{browser_type.lower()}.path") if os.path.exists(path_file): - with open(path_file, "r") as f: + with open(path_file) as f: return f.read() from playwright.async_api import async_playwright @@ -773,7 +718,7 @@ def sanitize_html(html): return sanitized_html -def 
sanitize_input_encode(text: str) -> str: +def sanitize_input_encode(text: str | None) -> str: """Sanitize input to handle potential encoding issues.""" try: try: @@ -816,683 +761,11 @@ def escape_json_string(s): # Additional problematic characters # Unicode control characters - s = re.sub(r"[\x00-\x1f\x7f-\x9f]", lambda x: "\\u{:04x}".format(ord(x.group())), s) + s = re.sub(r"[\x00-\x1f\x7f-\x9f]", lambda x: f"\\u{ord(x.group()):04x}", s) return s -def replace_inline_tags(soup, tags, only_text=False): - """ - Replace inline HTML tags with Markdown-style equivalents. - - How it works: - 1. Maps specific tags (e.g., , ) to Markdown syntax. - 2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object. - 3. Optionally replaces tags with their text content only. - - Args: - soup (BeautifulSoup): Parsed HTML content. - tags (List[str]): List of tags to replace. - only_text (bool): Whether to replace tags with plain text. Defaults to False. - - Returns: - BeautifulSoup: Updated BeautifulSoup object with replaced tags. - """ - - tag_replacements = { - "b": lambda tag: f"**{tag.text}**", - "i": lambda tag: f"*{tag.text}*", - "u": lambda tag: f"__{tag.text}__", - "span": lambda tag: f"{tag.text}", - "del": lambda tag: f"~~{tag.text}~~", - "ins": lambda tag: f"++{tag.text}++", - "sub": lambda tag: f"~{tag.text}~", - "sup": lambda tag: f"^^{tag.text}^^", - "strong": lambda tag: f"**{tag.text}**", - "em": lambda tag: f"*{tag.text}*", - "code": lambda tag: f"`{tag.text}`", - "kbd": lambda tag: f"`{tag.text}`", - "var": lambda tag: f"_{tag.text}_", - "s": lambda tag: f"~~{tag.text}~~", - "q": lambda tag: f'"{tag.text}"', - "abbr": lambda tag: f"{tag.text} ({tag.get('title', '')})", - "cite": lambda tag: f"_{tag.text}_", - "dfn": lambda tag: f"_{tag.text}_", - "time": lambda tag: f"{tag.text}", - "small": lambda tag: f"{tag.text}", - "mark": lambda tag: f"=={tag.text}==", - } - - replacement_data = [ - (tag, tag_replacements.get(tag, lambda t: t.text)) for tag in tags - ] - - for tag_name, replacement_func in replacement_data: - for tag in soup.find_all(tag_name): - replacement_text = tag.text if only_text else replacement_func(tag) - tag.replace_with(replacement_text) - - return soup - - # for tag_name in tags: - # for tag in soup.find_all(tag_name): - # if not only_text: - # replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag) - # tag.replace_with(replacement_text) - # else: - # tag.replace_with(tag.text) - - # return soup - - -def get_content_of_website( - url, html, word_count_threshold=MIN_WORD_THRESHOLD, css_selector=None, **kwargs -): - """ - Extract structured content, media, and links from website HTML. - - How it works: - 1. Parses the HTML content using BeautifulSoup. - 2. Extracts internal/external links and media (images, videos, audios). - 3. Cleans the content by removing unwanted tags and attributes. - 4. Converts cleaned HTML to Markdown. - 5. Collects metadata and returns the extracted information. - - Args: - url (str): The website URL. - html (str): The HTML content of the website. - word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD. - css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None. - - Returns: - Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata. 
- """ - - try: - if not html: - return None - # Parse HTML content with BeautifulSoup - soup = BeautifulSoup(html, "html.parser") - - # Get the content within the tag - body = soup.body - - # If css_selector is provided, extract content based on the selector - if css_selector: - selected_elements = body.select(css_selector) - if not selected_elements: - raise InvalidCSSSelectorError( - f"Invalid CSS selector , No elements found for CSS selector: {css_selector}" - ) - div_tag = soup.new_tag("div") - for el in selected_elements: - div_tag.append(el) - body = div_tag - - links = {"internal": [], "external": []} - - # Extract all internal and external links - for a in body.find_all("a", href=True): - href = a["href"] - url_base = url.split("/")[2] - if href.startswith("http") and url_base not in href: - links["external"].append({"href": href, "text": a.get_text()}) - else: - links["internal"].append({"href": href, "text": a.get_text()}) - - # Remove script, style, and other tags that don't carry useful content from body - for tag in body.find_all(["script", "style", "link", "meta", "noscript"]): - tag.decompose() - - # Remove all attributes from remaining tags in body, except for img tags - for tag in body.find_all(): - if tag.name != "img": - tag.attrs = {} - - # Extract all img tgas int0 [{src: '', alt: ''}] - media = {"images": [], "videos": [], "audios": []} - for img in body.find_all("img"): - media["images"].append( - {"src": img.get("src"), "alt": img.get("alt"), "type": "image"} - ) - - # Extract all video tags into [{src: '', alt: ''}] - for video in body.find_all("video"): - media["videos"].append( - {"src": video.get("src"), "alt": video.get("alt"), "type": "video"} - ) - - # Extract all audio tags into [{src: '', alt: ''}] - for audio in body.find_all("audio"): - media["audios"].append( - {"src": audio.get("src"), "alt": audio.get("alt"), "type": "audio"} - ) - - # Replace images with their alt text or remove them if no alt text is available - for img in body.find_all("img"): - alt_text = img.get("alt") - if alt_text: - img.replace_with(soup.new_string(alt_text)) - else: - img.decompose() - - # Create a function that replace content of all"pre" tag with its inner text - def replace_pre_tags_with_text(node): - for child in node.find_all("pre"): - # set child inner html to its text - child.string = child.get_text() - return node - - # Replace all "pre" tags with their inner text - body = replace_pre_tags_with_text(body) - - # Replace inline tags with their text content - body = replace_inline_tags( - body, - [ - "b", - "i", - "u", - "span", - "del", - "ins", - "sub", - "sup", - "strong", - "em", - "code", - "kbd", - "var", - "s", - "q", - "abbr", - "cite", - "dfn", - "time", - "small", - "mark", - ], - only_text=kwargs.get("only_text", False), - ) - - # Recursively remove empty elements, their parent elements, and elements with word count below threshold - def remove_empty_and_low_word_count_elements(node, word_count_threshold): - for child in node.contents: - if isinstance(child, element.Tag): - remove_empty_and_low_word_count_elements( - child, word_count_threshold - ) - word_count = len(child.get_text(strip=True).split()) - if ( - len(child.contents) == 0 and not child.get_text(strip=True) - ) or word_count < word_count_threshold: - child.decompose() - return node - - body = remove_empty_and_low_word_count_elements(body, word_count_threshold) - - def remove_small_text_tags( - body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD - ): - # We'll use a list to collect all tags 
that don't meet the word count requirement - tags_to_remove = [] - - # Traverse all tags in the body - for tag in body.find_all(True): # True here means all tags - # Check if the tag contains text and if it's not just whitespace - if tag.string and tag.string.strip(): - # Split the text by spaces and count the words - word_count = len(tag.string.strip().split()) - # If the word count is less than the threshold, mark the tag for removal - if word_count < word_count_threshold: - tags_to_remove.append(tag) - - # Remove all marked tags from the tree - for tag in tags_to_remove: - tag.decompose() # or tag.extract() to remove and get the element - - return body - - # Remove small text tags - body = remove_small_text_tags(body, word_count_threshold) - - def is_empty_or_whitespace(tag: Tag): - if isinstance(tag, NavigableString): - return not tag.strip() - # Check if the tag itself is empty or all its children are empty/whitespace - if not tag.contents: - return True - return all(is_empty_or_whitespace(child) for child in tag.contents) - - def remove_empty_tags(body: Tag): - # Continue processing until no more changes are made - changes = True - while changes: - changes = False - # Collect all tags that are empty or contain only whitespace - empty_tags = [ - tag for tag in body.find_all(True) if is_empty_or_whitespace(tag) - ] - for tag in empty_tags: - # If a tag is empty, decompose it - tag.decompose() - changes = True # Mark that a change was made - - return body - - # Remove empty tags - body = remove_empty_tags(body) - - # Flatten nested elements with only one child of the same type - def flatten_nested_elements(node): - for child in node.contents: - if isinstance(child, element.Tag): - flatten_nested_elements(child) - if ( - len(child.contents) == 1 - and child.contents[0].name == child.name - ): - # print('Flattening:', child.name) - child_content = child.contents[0] - child.replace_with(child_content) - - return node - - body = flatten_nested_elements(body) - - # Remove comments - for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): - comment.extract() - - # Remove consecutive empty newlines and replace multiple spaces with a single space - cleaned_html = str(body).replace("\n\n", "\n").replace(" ", " ") - - # Sanitize the cleaned HTML content - cleaned_html = sanitize_html(cleaned_html) - # sanitized_html = escape_json_string(cleaned_html) - - # Convert cleaned HTML to Markdown - h = html2text.HTML2Text() - h = CustomHTML2Text() - h.ignore_links = True - markdown = h.handle(cleaned_html) - markdown = markdown.replace(" ```", "```") - - try: - meta = extract_metadata(html, soup) - except Exception as e: - print("Error extracting metadata:", str(e)) - meta = {} - - # Return the Markdown content - return { - "markdown": markdown, - "cleaned_html": cleaned_html, - "success": True, - "media": media, - "links": links, - "metadata": meta, - } - - except Exception as e: - print("Error processing HTML content:", str(e)) - raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e - - -def get_content_of_website_optimized( - url: str, - html: str, - word_count_threshold: int = MIN_WORD_THRESHOLD, - css_selector: str = None, - **kwargs, -) -> Dict[str, Any]: - """ - Extracts and cleans content from website HTML, optimizing for useful media and contextual information. 
- - Parses the provided HTML to extract internal and external links, filters and scores images for usefulness, gathers contextual descriptions for media, removes unwanted or low-value elements, and converts the cleaned HTML to Markdown. Also extracts metadata and returns all structured content in a dictionary. - - Args: - url: The URL of the website being processed. - html: The raw HTML content to extract from. - word_count_threshold: Minimum word count for elements to be retained. - css_selector: Optional CSS selector to restrict extraction to specific elements. - - Returns: - A dictionary containing Markdown content, cleaned HTML, extraction success status, media and link lists, and metadata. - - Raises: - InvalidCSSSelectorError: If a provided CSS selector does not match any elements. - """ - if not html: - return None - - soup = BeautifulSoup(html, "html.parser") - body = soup.body - - image_description_min_word_threshold = kwargs.get( - "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD - ) - - for tag in kwargs.get("excluded_tags", []) or []: - for el in body.select(tag): - el.decompose() - - if css_selector: - selected_elements = body.select(css_selector) - if not selected_elements: - raise InvalidCSSSelectorError( - f"Invalid CSS selector, No elements found for CSS selector: {css_selector}" - ) - body = soup.new_tag("div") - for el in selected_elements: - body.append(el) - - links = {"internal": [], "external": []} - media = {"images": [], "videos": [], "audios": []} - - # Extract meaningful text for media files from closest parent - def find_closest_parent_with_useful_text(tag): - current_tag = tag - while current_tag: - current_tag = current_tag.parent - # Get the text content from the parent tag - if current_tag: - text_content = current_tag.get_text(separator=" ", strip=True) - # Check if the text content has at least word_count_threshold - if len(text_content.split()) >= image_description_min_word_threshold: - return text_content - return None - - def process_image(img, url, index, total_images): - # Check if an image has valid display and inside undesired html elements - """ - Processes an HTML image element to determine its relevance and extract metadata. - - Evaluates an image's visibility, context, and usefulness based on its attributes and parent elements. If the image passes validation and exceeds a usefulness score threshold, returns a dictionary with its source, alt text, contextual description, score, and type. Otherwise, returns None. - - Args: - img: The BeautifulSoup image tag to process. - url: The base URL of the page containing the image. - index: The index of the image in the list of images on the page. - total_images: The total number of images on the page. - - Returns: - A dictionary with image metadata if the image is considered useful, or None otherwise. - """ - def is_valid_image(img, parent, parent_classes): - style = img.get("style", "") - src = img.get("src", "") - classes_to_check = ["button", "icon", "logo"] - tags_to_check = ["button", "input"] - return all( - [ - "display:none" not in style, - src, - not any( - s in var - for var in [src, img.get("alt", ""), *parent_classes] - for s in classes_to_check - ), - parent.name not in tags_to_check, - ] - ) - - # Score an image for it's usefulness - def score_image_for_usefulness(img, base_url, index, images_count): - # Function to parse image height/width value and units - """ - Scores an HTML image element for usefulness based on size, format, attributes, and position. 
- - The function evaluates an image's dimensions, file format, alt text, and its position among all images on the page to assign a usefulness score. Higher scores indicate images that are likely more relevant or informative for content extraction or summarization. - - Args: - img: The HTML image element to score. - base_url: The base URL used to resolve relative image sources. - index: The position of the image in the list of images on the page (zero-based). - images_count: The total number of images on the page. - - Returns: - An integer usefulness score for the image. - """ - def parse_dimension(dimension): - if dimension: - match = re.match(r"(\d+)(\D*)", dimension) - if match: - number = int(match.group(1)) - unit = ( - match.group(2) or "px" - ) # Default unit is 'px' if not specified - return number, unit - return None, None - - # Fetch image file metadata to extract size and extension - def fetch_image_file_size(img, base_url): - # If src is relative path construct full URL, if not it may be CDN URL - """ - Fetches the file size of an image by sending a HEAD request to its URL. - - Args: - img: A BeautifulSoup tag representing the image element. - base_url: The base URL to resolve relative image sources. - - Returns: - The value of the "Content-Length" header as a string if available, otherwise None. - """ - img_url = urljoin(base_url, img.get("src")) - try: - response = requests.head(img_url) - if response.status_code == 200: - return response.headers.get("Content-Length", None) - else: - print(f"Failed to retrieve file size for {img_url}") - return None - except InvalidSchema: - return None - - image_height = img.get("height") - height_value, height_unit = parse_dimension(image_height) - image_width = img.get("width") - width_value, width_unit = parse_dimension(image_width) - image_size = 0 # int(fetch_image_file_size(img,base_url) or 0) - image_format = os.path.splitext(img.get("src", ""))[1].lower() - # Remove . 
from format - image_format = image_format.strip(".") - score = 0 - if height_value: - if height_unit == "px" and height_value > 150: - score += 1 - if height_unit in ["%", "vh", "vmin", "vmax"] and height_value > 30: - score += 1 - if width_value: - if width_unit == "px" and width_value > 150: - score += 1 - if width_unit in ["%", "vh", "vmin", "vmax"] and width_value > 30: - score += 1 - if image_size > 10000: - score += 1 - if img.get("alt") != "": - score += 1 - if any(image_format == format for format in ["jpg", "png", "webp"]): - score += 1 - if index / images_count < 0.5: - score += 1 - return score - - if not is_valid_image(img, img.parent, img.parent.get("class", [])): - return None - score = score_image_for_usefulness(img, url, index, total_images) - if score <= IMAGE_SCORE_THRESHOLD: - return None - return { - "src": img.get("src", "").replace('\\"', '"').strip(), - "alt": img.get("alt", ""), - "desc": find_closest_parent_with_useful_text(img), - "score": score, - "type": "image", - } - - def process_element(element: element.PageElement) -> bool: - try: - if isinstance(element, NavigableString): - if isinstance(element, Comment): - element.extract() - return False - - if element.name in ["script", "style", "link", "meta", "noscript"]: - element.decompose() - return False - - keep_element = False - - if element.name == "a" and element.get("href"): - href = element["href"] - url_base = url.split("/")[2] - link_data = {"href": href, "text": element.get_text()} - if href.startswith("http") and url_base not in href: - links["external"].append(link_data) - else: - links["internal"].append(link_data) - keep_element = True - - elif element.name == "img": - return True # Always keep image elements - - elif element.name in ["video", "audio"]: - media[f"{element.name}s"].append( - { - "src": element.get("src"), - "alt": element.get("alt"), - "type": element.name, - "description": find_closest_parent_with_useful_text(element), - } - ) - source_tags = element.find_all("source") - for source_tag in source_tags: - media[f"{element.name}s"].append( - { - "src": source_tag.get("src"), - "alt": element.get("alt"), - "type": element.name, - "description": find_closest_parent_with_useful_text( - element - ), - } - ) - return True # Always keep video and audio elements - - if element.name != "pre": - if element.name in [ - "b", - "i", - "u", - "span", - "del", - "ins", - "sub", - "sup", - "strong", - "em", - "code", - "kbd", - "var", - "s", - "q", - "abbr", - "cite", - "dfn", - "time", - "small", - "mark", - ]: - if kwargs.get("only_text", False): - element.replace_with(element.get_text()) - else: - element.unwrap() - elif element.name != "img": - element.attrs = {} - - # Process children - for child in list(element.children): - if isinstance(child, NavigableString) and not isinstance( - child, Comment - ): - if len(child.strip()) > 0: - keep_element = True - else: - if process_element(child): - keep_element = True - - # Check word count - if not keep_element: - word_count = len(element.get_text(strip=True).split()) - keep_element = word_count >= word_count_threshold - - if not keep_element: - element.decompose() - - return keep_element - except Exception as e: - print("Error processing element:", str(e)) - return False - - # process images by filtering and extracting contextual text from the page - imgs = body.find_all("img") - media["images"] = [ - result - for result in ( - process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs) - ) - if result is not None - ] - - process_element(body) 
- - def flatten_nested_elements(node): - if isinstance(node, NavigableString): - return node - if ( - len(node.contents) == 1 - and isinstance(node.contents[0], element.Tag) - and node.contents[0].name == node.name - ): - return flatten_nested_elements(node.contents[0]) - node.contents = [flatten_nested_elements(child) for child in node.contents] - return node - - body = flatten_nested_elements(body) - base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') - for img in imgs: - try: - src = img.get("src", "") - if base64_pattern.match(src): - img["src"] = base64_pattern.sub("", src) - except Exception as _ex: - pass - - cleaned_html = str(body).replace("\n\n", "\n").replace(" ", " ") - cleaned_html = sanitize_html(cleaned_html) - - h = CustomHTML2Text() - h.ignore_links = True - markdown = h.handle(cleaned_html) - markdown = markdown.replace(" ```", "```") - - try: - meta = extract_metadata(html, soup) - except Exception as e: - print("Error extracting metadata:", str(e)) - meta = {} - - return { - "markdown": markdown, - "cleaned_html": cleaned_html, - "success": True, - "media": media, - "links": links, - "metadata": meta, - } - - def extract_metadata_using_lxml(html, doc=None): """ Extract metadata from HTML using lxml for better performance. @@ -2119,34 +1392,9 @@ def fast_format_html(html_string): return "\n".join(formatted) -def normalize_url(href, base_url): - """Normalize URLs to ensure consistent format""" - from urllib.parse import urljoin, urlparse - - # Parse base URL to get components - parsed_base = urlparse(base_url) - if not parsed_base.scheme or not parsed_base.netloc: - raise ValueError(f"Invalid base URL format: {base_url}") - - if parsed_base.scheme.lower() not in ["http", "https"]: - # Handle special protocols - raise ValueError(f"Invalid base URL format: {base_url}") - cleaned_href = href.strip() - - # Use urljoin to handle all cases - return urljoin(base_url, cleaned_href) - - - - def normalize_url( href: str, base_url: str, - *, - drop_query_tracking=True, - sort_query=True, - keep_fragment=False, - extra_drop_params=None ): """ Extended URL normalizer @@ -2157,14 +1405,6 @@ def normalize_url( The raw link extracted from a page. base_url : str The pageโ€™s canonical URL (used to resolve relative links). - drop_query_tracking : bool (default True) - Remove common tracking query parameters. - sort_query : bool (default True) - Alphabetically sort query keys for deterministic output. - keep_fragment : bool (default False) - Preserve the hash fragment (#section) if you need in-page links. - extra_drop_params : Iterable[str] | None - Additional query keys to strip (case-insensitive). 
Returns ------- @@ -2192,43 +1432,22 @@ def normalize_url( # โ”€โ”€ query โ”€โ”€ query = parsed.query if query: - # explode, mutate, then rebuild params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)] - - if drop_query_tracking: - default_tracking = { - 'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', - 'utm_content', 'gclid', 'fbclid', 'ref', 'ref_src' - } - if extra_drop_params: - default_tracking |= {p.lower() for p in extra_drop_params} - params = [(k, v) for k, v in params if k not in default_tracking] - - if sort_query: - params.sort(key=lambda kv: kv[0]) - query = urlencode(params, doseq=True) if params else '' - # โ”€โ”€ fragment โ”€โ”€ - fragment = parsed.fragment if keep_fragment else '' - # Re-assemble - normalized = urlunparse(( + return urlunparse(( parsed.scheme, netloc, path, parsed.params, query, - fragment + '' )) - return normalized - def normalize_url_for_deep_crawl(href, base_url): """Normalize URLs to ensure consistent format""" - from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode - # Handle None or empty values if not href: return None @@ -2300,43 +1519,6 @@ def efficient_normalize_url_for_deep_crawl(href, base_url): return normalized - -def normalize_url_tmp(href, base_url): - """Normalize URLs to ensure consistent format""" - # Extract protocol and domain from base URL - try: - base_parts = base_url.split("/") - protocol = base_parts[0] - domain = base_parts[2] - except IndexError: - raise ValueError(f"Invalid base URL format: {base_url}") - - # Handle special protocols - special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"} - if any(href.lower().startswith(proto) for proto in special_protocols): - return href.strip() - - # Handle anchor links - if href.startswith("#"): - return f"{base_url}{href}" - - # Handle protocol-relative URLs - if href.startswith("//"): - return f"{protocol}{href}" - - # Handle root-relative URLs - if href.startswith("/"): - return f"{protocol}//{domain}{href}" - - # Handle relative URLs - if not href.startswith(("http://", "https://")): - # Remove leading './' if present - href = href.lstrip("./") - return f"{protocol}//{domain}/{href}" - - return href.strip() - - def get_base_domain(url: str) -> str: """ Extract the base domain from a given URL, handling common edge cases. 
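The hunk above strips the tracking-parameter, query-sorting, and fragment options out of normalize_url, leaving a single fixed behavior. A small behavior sketch follows; the expected outputs are inferred from the remaining code in the hunk rather than verified against it.

# normalize_url after this change: relative links are resolved against base_url,
# query keys are lowercased (blank values kept), and the fragment is always dropped.
# Tracking parameters such as utm_source are no longer removed.
from crawl4ai.utils import normalize_url

normalize_url("docs/intro#setup", "https://example.com/guide/")
# expected: "https://example.com/guide/docs/intro"

normalize_url("/p?UTM_Source=newsletter&q=", "https://example.com")
# expected: "https://example.com/p?utm_source=newsletter&q="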
@@ -2704,7 +1886,7 @@ def generate_content_hash(content: str) -> str: # return hashlib.sha256(content.encode()).hexdigest() -def ensure_content_dirs(base_path: str) -> Dict[str, str]: +def ensure_content_dirs(base_path: str) -> dict[str, str]: """Create content directories if they don't exist""" dirs = { "html": "html_content", @@ -2755,10 +1937,6 @@ def get_error_context(exc_info, context_lines: int = 5): Returns: dict: Error context information """ - import traceback - import linecache - import os - # Get the full traceback tb = traceback.extract_tb(exc_info[2]) @@ -3021,7 +2199,7 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre return result - except Exception as e: + except Exception: # Fallback for parsing errors return html_content[:max_size] if len(html_content) > max_size else html_content @@ -3038,8 +2216,6 @@ def start_colab_display_server(): except ImportError: raise RuntimeError("This function must be run in Google Colab environment.") - import os, time, subprocess - os.environ["DISPLAY"] = ":99" # Xvfb @@ -3256,8 +2432,8 @@ def calculate_link_intrinsic_score( def calculate_total_score( - intrinsic_score: Optional[float] = None, - contextual_score: Optional[float] = None, + intrinsic_score: float | None = None, + contextual_score: float | None = None, score_links_enabled: bool = False, query_provided: bool = False ) -> float: @@ -3305,8 +2481,8 @@ def calculate_total_score( # Embedding utilities async def get_text_embeddings( - texts: List[str], - llm_config: Optional[Dict] = None, + texts: list[str], + llm_config: dict | None = None, model_name: str = "sentence-transformers/all-MiniLM-L6-v2", batch_size: int = 32 ) -> np.ndarray: @@ -3360,44 +2536,42 @@ async def get_text_embeddings( return np.array(embeddings) # Default: use sentence-transformers - else: - # Lazy load to avoid importing heavy libraries unless needed - try: - from sentence_transformers import SentenceTransformer - except ImportError: - raise ImportError( - "sentence-transformers is required for local embeddings. " - "Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers" - ) - - # Cache the model in function attribute to avoid reloading - if not hasattr(get_text_embeddings, '_models'): - get_text_embeddings._models = {} - - if model_name not in get_text_embeddings._models: - get_text_embeddings._models[model_name] = SentenceTransformer(model_name) - - encoder = get_text_embeddings._models[model_name] - - # Batch encode for efficiency - embeddings = encoder.encode( - texts, - batch_size=batch_size, - show_progress_bar=False, - convert_to_numpy=True + # Lazy load to avoid importing heavy libraries unless needed + try: + from sentence_transformers import SentenceTransformer + except ImportError: + raise ImportError( + "sentence-transformers is required for local embeddings. 
" + "Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers" ) - - return embeddings + + # Cache the model in function attribute to avoid reloading + if not hasattr(get_text_embeddings, '_models'): + get_text_embeddings._models = {} + + if model_name not in get_text_embeddings._models: + get_text_embeddings._models[model_name] = SentenceTransformer(model_name) + + encoder = get_text_embeddings._models[model_name] + + # Batch encode for efficiency + embeddings = encoder.encode( + texts, + batch_size=batch_size, + show_progress_bar=False, + convert_to_numpy=True + ) + + return embeddings def get_text_embeddings_sync( - texts: List[str], - llm_config: Optional[Dict] = None, + texts: list[str], + llm_config: dict | None = None, model_name: str = "sentence-transformers/all-MiniLM-L6-v2", batch_size: int = 32 ) -> np.ndarray: """Synchronous wrapper for get_text_embeddings""" - import numpy as np return asyncio.run(get_text_embeddings(texts, llm_config, model_name, batch_size)) @@ -3476,7 +2650,7 @@ def get_true_memory_usage_percent() -> float: return max(0.0, min(100.0, used_percent)) -def get_memory_stats() -> Tuple[float, float, float]: +def get_memory_stats() -> tuple[float, float, float]: """ Get comprehensive memory statistics. diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md index eb29b94ce..400d50a15 100644 --- a/deploy/docker/c4ai-code-context.md +++ b/deploy/docker/c4ai-code-context.md @@ -3571,8 +3571,6 @@ from .model_loader import ( calculate_batch_size ) -from .types import LLMConfig, create_llm_config - from functools import partial import numpy as np import re diff --git a/docs/examples/serp_api_project_11_feb.py b/docs/examples/serp_api_project_11_feb.py deleted file mode 100644 index df0768ed7..000000000 --- a/docs/examples/serp_api_project_11_feb.py +++ /dev/null @@ -1,305 +0,0 @@ -import asyncio -import json -from typing import Any, Dict, List, Optional - -from regex import P -from crawl4ai import ( - AsyncWebCrawler, - BrowserConfig, - CrawlerRunConfig, - CacheMode, - LLMExtractionStrategy, - JsonCssExtractionStrategy, - CrawlerHub, - CrawlResult, - DefaultMarkdownGenerator, - PruningContentFilter, -) -from pathlib import Path -from pydantic import BaseModel - -__current_dir = Path(__file__).parent - -# Crawl4ai Hello Web -async def little_hello_web(): - async with AsyncWebCrawler() as crawler: - result : CrawlResult = await crawler.arun( - url="https://www.helloworld.org" - ) - print(result.markdown.raw_markdown[:500]) - -async def hello_web(): - browser_config = BrowserConfig(headless=True, verbose=True) - async with AsyncWebCrawler(config=browser_config) as crawler: - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) - ), - ) - result : CrawlResult = await crawler.arun( - url="https://www.helloworld.org", config=crawler_config - ) - print(result.markdown.fit_markdown[:500]) - -# Naive Approach Using Large Language Models -async def extract_using_llm(): - print("Extracting using Large Language Models") - - browser_config = BrowserConfig(headless=True, verbose=True) - crawler = AsyncWebCrawler(config=browser_config) - - await crawler.start() - try: - class Sitelink(BaseModel): - title: str - link: str - - class GoogleSearchResult(BaseModel): - title: str - link: str - snippet: str - sitelinks: Optional[List[Sitelink]] = None - - 
llm_extraction_strategy = LLMExtractionStrategy( - provider = "openai/gpt-4o", - schema = GoogleSearchResult.model_json_schema(), - instruction="""I want to extract the title, link, snippet, and sitelinks from a Google search result. I shared here the content of div#search from the search result page. We are just interested in organic search results. - Example: - { - "title": "Google", - "link": "https://www.google.com", - "snippet": "Google is a search engine.", - "sitelinks": [ - { - "title": "Gmail", - "link": "https://mail.google.com" - }, - { - "title": "Google Drive", - "link": "https://drive.google.com" - } - ] - }""", - # apply_chunking=False, - chunk_token_threshold=2 ** 12, # 2^12 = 4096 - verbose=True, - # input_format="html", # html, markdown, cleaned_html - input_format="cleaned_html" - ) - - - crawl_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - keep_attrs=["id", "class"], - keep_data_attributes=True, - delay_before_return_html=2, - extraction_strategy=llm_extraction_strategy, - css_selector="div#search", - ) - - result : CrawlResult = await crawler.arun( - url="https://www.google.com/search?q=apple%20inc&start=0&num=10", - config=crawl_config, - ) - - search_result = {} - if result.success: - search_result = json.loads(result.extracted_content) - - # save search result to file - with open(__current_dir / "search_result_using_llm.json", "w") as f: - f.write(json.dumps(search_result, indent=4)) - print(json.dumps(search_result, indent=4)) - - finally: - await crawler.close() - -# Example of using CrawlerHub -async def schema_generator(): - print("Generating schema") - html = "" - - # Load html from file - with open(__current_dir / "google_search_item.html", "r") as f: - html = f.read() - - organic_schema = JsonCssExtractionStrategy.generate_schema( - html=html, - target_json_example="""{ - "title": "...", - "link": "...", - "snippet": "...", - "date": "1 hour ago", - "sitelinks": [ - { - "title": "...", - "link": "..." - } - ] - }""", - query="""The given HTML is the crawled HTML from the Google search result, which refers to one HTML element representing one organic Google search result. Please find the schema for the organic search item based on the given HTML. I am interested in the title, link, snippet text, sitelinks, and date.""", - ) - - print(json.dumps(organic_schema, indent=4)) - pass - -# Golden Standard -async def build_schema(html:str, force: bool = False) -> Dict[str, Any]: - print("Building schema") - schemas = {} - if (__current_dir / "organic_schema.json").exists() and not force: - with open(__current_dir / "organic_schema.json", "r") as f: - schemas["organic"] = json.loads(f.read()) - else: - # Extract schema from html - organic_schema = JsonCssExtractionStrategy.generate_schema( - html=html, - target_json_example="""{ - "title": "...", - "link": "...", - "snippet": "...", - "date": "1 hour ago", - "sitelinks": [ - { - "title": "...", - "link": "..." - } - ] - }""", - query="""The given html is the crawled html from Google search result. Please find the schema for organic search item in the given html, I am interested in title, link, snippet text, sitelinks and date. 
Usually they are all inside a div#search.""", - ) - - # Save schema to file current_dir/organic_schema.json - with open(__current_dir / "organic_schema.json", "w") as f: - f.write(json.dumps(organic_schema, indent=4)) - - schemas["organic"] = organic_schema - - # Repeat the same for top_stories_schema - if (__current_dir / "top_stories_schema.json").exists(): - with open(__current_dir / "top_stories_schema.json", "r") as f: - schemas["top_stories"] = json.loads(f.read()) - else: - top_stories_schema = JsonCssExtractionStrategy.generate_schema( - html=html, - target_json_example="""{ - "title": "...", - "link": "...", - "source": "Insider Monkey", - "date": "1 hour ago", - }""", - query="""The given HTML is the crawled HTML from the Google search result. Please find the schema for the Top Stories item in the given HTML. I am interested in the title, link, source, and date.""", - ) - - with open(__current_dir / "top_stories_schema.json", "w") as f: - f.write(json.dumps(top_stories_schema, indent=4)) - - schemas["top_stories"] = top_stories_schema - - # Repeat the same for suggested_queries_schema - if (__current_dir / "suggested_queries_schema.json").exists(): - with open(__current_dir / "suggested_queries_schema.json", "r") as f: - schemas["suggested_queries"] = json.loads(f.read()) - else: - suggested_queries_schema = JsonCssExtractionStrategy.generate_schema( - html=html, - target_json_example="""{ - "query": "A for Apple", - }""", - query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "relatedSearches" at the bottom of the page. I am interested in the queries only.""", - ) - - with open(__current_dir / "suggested_queries_schema.json", "w") as f: - f.write(json.dumps(suggested_queries_schema, indent=4)) - - schemas["suggested_queries"] = suggested_queries_schema - - return schemas - -async def search(q: str = "apple inc") -> Dict[str, Any]: - print("Searching for:", q) - - browser_config = BrowserConfig(headless=True, verbose=True) - crawler = AsyncWebCrawler(config=browser_config) - search_result: Dict[str, List[Dict[str, Any]]] = {} - - await crawler.start() - try: - crawl_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - keep_attrs=["id", "class"], - keep_data_attributes=True, - delay_before_return_html=2, - ) - from urllib.parse import quote - result: CrawlResult = await crawler.arun( - f"https://www.google.com/search?q={quote(q)}&start=0&num=10", - config=crawl_config - ) - - if result.success: - schemas : Dict[str, Any] = await build_schema(result.html) - - for schema in schemas.values(): - schema_key = schema["name"].lower().replace(' ', '_') - search_result[schema_key] = JsonCssExtractionStrategy( - schema=schema - ).run( - url="", - sections=[result.html], - ) - - # save search result to file - with open(__current_dir / "search_result.json", "w") as f: - f.write(json.dumps(search_result, indent=4)) - print(json.dumps(search_result, indent=4)) - - finally: - await crawler.close() - - return search_result - -# Example of using CrawlerHub -async def hub_example(query: str = "apple inc"): - print("Using CrawlerHub") - crawler_cls = CrawlerHub.get("google_search") - crawler = crawler_cls() - - # Text search - text_results = await crawler.run( - query=query, - search_type="text", - schema_cache_path="/Users/unclecode/.crawl4ai" - ) - # Save search result to file - with open(__current_dir / "search_result_using_hub.json", "w") as f: - f.write(json.dumps(json.loads(text_results), indent=4)) 
- - print(json.dumps(json.loads(text_results), indent=4)) - - -async def demo(): - # Step 1: Introduction & Overview - # await little_hello_web() - # await hello_web() - - # Step 2: Demo end result, using hub - # await hub_example() - - # Step 3: Using LLm for extraction - # await extract_using_llm() - - # Step 4: GEt familiar with schema generation - # await schema_generator() - - # Step 5: Golden Standard - # await search() - - # Step 6: Introduction to CrawlerHub - await hub_example() - -if __name__ == "__main__": - asyncio.run(demo()) diff --git a/pyproject.toml b/pyproject.toml index 9b00bd28e..ead301b46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "Crawl4AI" dynamic = ["version"] description = "๐Ÿš€๐Ÿค– Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.13" license = "Apache-2.0" authors = [ {name = "Unclecode", email = "unclecode@kidocode.com"} @@ -17,9 +17,9 @@ dependencies = [ "aiohttp>=3.11.11", "aiosqlite~=0.20", "anyio>=4.0.0", - "lxml~=5.3", + "lxml~=6.0", "litellm>=1.53.1", - "numpy>=1.26.0,<3", + "numpy==2.3.5", "pillow>=10.4", "playwright>=1.49.0", "patchright>=1.49.0", @@ -44,15 +44,12 @@ dependencies = [ "brotli>=1.1.0", "humanize>=4.10.0", "lark>=1.2.2", - "alphashape>=1.3.1", - "shapely>=2.0.0" + "shapely>=2.0.0", ] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", @@ -96,5 +93,7 @@ crawl4ai = { workspace = true } [dependency-groups] dev = [ - "crawl4ai", + "pytest>=8.4.2", + "pytest-asyncio>=1.2.0", + "ruff==0.14.3", ] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 20f4df4ff..000000000 --- a/requirements.txt +++ /dev/null @@ -1,35 +0,0 @@ -# Note: These requirements are also specified in pyproject.toml -# This file is kept for development environment setup and compatibility -aiofiles>=24.1.0 -aiohttp>=3.11.11 -aiosqlite~=0.20 -anyio>=4.0.0 -lxml~=5.3 -litellm>=1.53.1 -numpy>=1.26.0,<3 -pillow>=10.4 -playwright>=1.49.0 -patchright>=1.49.0 -python-dotenv~=1.0 -requests~=2.26 -beautifulsoup4~=4.12 -tf-playwright-stealth>=1.1.0 -xxhash~=3.4 -rank-bm25~=0.2 -colorama~=0.4 -snowballstemmer~=2.2 -pydantic>=2.10 -pyOpenSSL>=24.3.0 -psutil>=6.1.1 -PyYAML>=6.0 -nltk>=3.9.1 -rich>=13.9.4 -chardet>=5.2.0 -brotli>=1.1.0 -httpx[http2]>=0.27.2 -alphashape>=1.3.1 -shapely>=2.0.0 - -fake-useragent>=2.2.0 -pdf2image>=1.17.0 -PyPDF2>=3.0.1 \ No newline at end of file diff --git a/setup.py b/setup.py index a0b910412..6bf831795 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,8 @@ -from setuptools import setup, find_packages import os -from pathlib import Path import shutil +from pathlib import Path + +from setuptools import find_packages, setup # Note: Most configuration is now in pyproject.toml # This setup.py is kept for backwards compatibility @@ -56,8 +57,6 @@ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", diff --git a/tests/async/test_0.4.2_browser_manager.py 
b/tests/async/test_0.4.2_browser_manager.py deleted file mode 100644 index 21b4be11b..000000000 --- a/tests/async/test_0.4.2_browser_manager.py +++ /dev/null @@ -1,160 +0,0 @@ -import os -import sys -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator - -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - - -# Assuming that the changes made allow different configurations -# for managed browser, persistent context, and so forth. - - -async def test_default_headless(): - async with AsyncWebCrawler( - headless=True, - verbose=True, - user_agent_mode="random", - user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, - use_managed_browser=False, - use_persistent_context=False, - ignore_https_errors=True, - # Testing normal ephemeral context - ) as crawler: - result = await crawler.arun( - url="https://www.kidocode.com/degrees/technology", - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("[test_default_headless] success:", result.success) - print("HTML length:", len(result.html if result.html else "")) - - -async def test_managed_browser_persistent(): - # Treating use_persistent_context=True as managed_browser scenario. - async with AsyncWebCrawler( - headless=False, - verbose=True, - user_agent_mode="random", - user_agent_generator_config={"device_type": "desktop", "os_type": "mac"}, - use_managed_browser=True, - use_persistent_context=True, # now should behave same as managed browser - user_data_dir="./outpu/test_profile", - # This should store and reuse profile data across runs - ) as crawler: - result = await crawler.arun( - url="https://www.google.com", - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("[test_managed_browser_persistent] success:", result.success) - print("HTML length:", len(result.html if result.html else "")) - - -async def test_session_reuse(): - # Test creating a session, using it for multiple calls - session_id = "my_session" - async with AsyncWebCrawler( - headless=False, - verbose=True, - user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", - # Fixed user-agent for consistency - use_managed_browser=False, - use_persistent_context=False, - ) as crawler: - # First call: create session - result1 = await crawler.arun( - url="https://www.example.com", - cache_mode=CacheMode.BYPASS, - session_id=session_id, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("[test_session_reuse first call] success:", result1.success) - - # Second call: same session, possibly cookie retained - result2 = await crawler.arun( - url="https://www.example.com/about", - cache_mode=CacheMode.BYPASS, - session_id=session_id, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("[test_session_reuse second call] success:", result2.success) - - -async def test_magic_mode(): - # Test magic mode with override_navigator and simulate_user - async with AsyncWebCrawler( - headless=False, - verbose=True, - user_agent_mode="random", - user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}, - use_managed_browser=False, - use_persistent_context=False, - magic=True, - override_navigator=True, - simulate_user=True, - ) 
as crawler: - result = await crawler.arun( - url="https://www.kidocode.com/degrees/business", - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("[test_magic_mode] success:", result.success) - print("HTML length:", len(result.html if result.html else "")) - - -async def test_proxy_settings(): - # Test with a proxy (if available) to ensure code runs with proxy - async with AsyncWebCrawler( - headless=True, - verbose=False, - user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", - proxy="http://127.0.0.1:8080", # Assuming local proxy server for test - use_managed_browser=False, - use_persistent_context=False, - ) as crawler: - result = await crawler.arun( - url="https://httpbin.org/ip", - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("[test_proxy_settings] success:", result.success) - if result.success: - print("HTML preview:", result.html[:200] if result.html else "") - - -async def test_ignore_https_errors(): - # Test ignore HTTPS errors with a self-signed or invalid cert domain - # This is just conceptual, the domain should be one that triggers SSL error. - # Using a hypothetical URL that fails SSL: - async with AsyncWebCrawler( - headless=True, - verbose=True, - user_agent="Mozilla/5.0", - ignore_https_errors=True, - use_managed_browser=False, - use_persistent_context=False, - ) as crawler: - result = await crawler.arun( - url="https://self-signed.badssl.com/", - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("[test_ignore_https_errors] success:", result.success) - - -async def main(): - print("Running tests...") - # await test_default_headless() - # await test_managed_browser_persistent() - # await test_session_reuse() - # await test_magic_mode() - # await test_proxy_settings() - await test_ignore_https_errors() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/async/test_0.4.2_config_params.py b/tests/async/test_0.4.2_config_params.py deleted file mode 100644 index bb2113d08..000000000 --- a/tests/async/test_0.4.2_config_params.py +++ /dev/null @@ -1,211 +0,0 @@ -import os, sys - -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai import JsonCssExtractionStrategy -from crawl4ai.chunking_strategy import RegexChunking - - -# Category 1: Browser Configuration Tests -async def test_browser_config_object(): - """Test the new BrowserConfig object with various browser settings""" - browser_config = BrowserConfig( - browser_type="chromium", - headless=False, - viewport_width=1920, - viewport_height=1080, - use_managed_browser=True, - user_agent_mode="random", - user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}, - ) - - async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler: - result = await crawler.arun("https://example.com", cache_mode=CacheMode.BYPASS) - assert result.success, "Browser config crawl failed" - assert len(result.html) > 0, "No HTML content retrieved" - - -async def test_browser_performance_config(): - """Test browser configurations 
focused on performance""" - browser_config = BrowserConfig( - text_mode=True, - light_mode=True, - extra_args=["--disable-gpu", "--disable-software-rasterizer"], - ignore_https_errors=True, - java_script_enabled=False, - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun("https://example.com") - assert result.success, "Performance optimized crawl failed" - assert result.status_code == 200, "Unexpected status code" - - -# Category 2: Content Processing Tests -async def test_content_extraction_config(): - """Test content extraction with various strategies""" - crawler_config = CrawlerRunConfig( - word_count_threshold=300, - extraction_strategy=JsonCssExtractionStrategy( - schema={ - "name": "article", - "baseSelector": "div", - "fields": [{"name": "title", "selector": "h1", "type": "text"}], - } - ), - chunking_strategy=RegexChunking(), - content_filter=PruningContentFilter(), - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://example.com/article", config=crawler_config - ) - assert result.extracted_content is not None, "Content extraction failed" - assert "title" in result.extracted_content, "Missing expected content field" - - -# Category 3: Cache and Session Management Tests -async def test_cache_and_session_management(): - """Test different cache modes and session handling""" - browser_config = BrowserConfig(use_persistent_context=True) - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.WRITE_ONLY, - process_iframes=True, - remove_overlay_elements=True, - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - # First request - should write to cache - result1 = await crawler.arun("https://example.com", config=crawler_config) - - # Second request - should use fresh fetch due to WRITE_ONLY mode - result2 = await crawler.arun("https://example.com", config=crawler_config) - - assert result1.success and result2.success, "Cache mode crawl failed" - assert result1.html == result2.html, "Inconsistent results between requests" - - -# Category 4: Media Handling Tests -async def test_media_handling_config(): - """Test configurations related to media handling""" - # Get the base path for home directroy ~/.crawl4ai/downloads, make sure it exists - os.makedirs(os.path.expanduser("~/.crawl4ai/downloads"), exist_ok=True) - browser_config = BrowserConfig( - viewport_width=1920, - viewport_height=1080, - accept_downloads=True, - downloads_path=os.path.expanduser("~/.crawl4ai/downloads"), - ) - crawler_config = CrawlerRunConfig( - screenshot=True, - pdf=True, - adjust_viewport_to_content=True, - wait_for_images=True, - screenshot_height_threshold=20000, - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun("https://example.com", config=crawler_config) - assert result.screenshot is not None, "Screenshot capture failed" - assert result.pdf is not None, "PDF generation failed" - - -# Category 5: Anti-Bot and Site Interaction Tests -async def test_antibot_config(): - """Test configurations for handling anti-bot measures""" - crawler_config = CrawlerRunConfig( - simulate_user=True, - override_navigator=True, - magic=True, - wait_for="js:()=>document.querySelector('body')", - delay_before_return_html=1.0, - log_console=True, - cache_mode=CacheMode.BYPASS, - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://example.com", config=crawler_config) - assert result.success, "Anti-bot measure handling failed" - - -# Category 6: 
Parallel Processing Tests -async def test_parallel_processing(): - """Test parallel processing capabilities""" - crawler_config = CrawlerRunConfig(mean_delay=0.5, max_range=1.0, semaphore_count=5) - - urls = ["https://example.com/1", "https://example.com/2", "https://example.com/3"] - - async with AsyncWebCrawler() as crawler: - results = await crawler.arun_many(urls, config=crawler_config) - assert len(results) == len(urls), "Not all URLs were processed" - assert all(r.success for r in results), "Some parallel requests failed" - - -# Category 7: Backwards Compatibility Tests -async def test_legacy_parameter_support(): - """Test that legacy parameters still work""" - async with AsyncWebCrawler( - headless=True, browser_type="chromium", viewport_width=1024, viewport_height=768 - ) as crawler: - result = await crawler.arun( - "https://example.com", - screenshot=True, - word_count_threshold=200, - bypass_cache=True, - css_selector=".main-content", - ) - assert result.success, "Legacy parameter support failed" - - -# Category 8: Mixed Configuration Tests -async def test_mixed_config_usage(): - """Test mixing new config objects with legacy parameters""" - browser_config = BrowserConfig(headless=True) - crawler_config = CrawlerRunConfig(screenshot=True) - - async with AsyncWebCrawler( - config=browser_config, - verbose=True, # legacy parameter - ) as crawler: - result = await crawler.arun( - "https://example.com", - config=crawler_config, - cache_mode=CacheMode.BYPASS, # legacy parameter - css_selector="body", # legacy parameter - ) - assert result.success, "Mixed configuration usage failed" - - -if __name__ == "__main__": - - async def run_tests(): - test_functions = [ - test_browser_config_object, - # test_browser_performance_config, - # test_content_extraction_config, - # test_cache_and_session_management, - # test_media_handling_config, - # test_antibot_config, - # test_parallel_processing, - # test_legacy_parameter_support, - # test_mixed_config_usage - ] - - for test in test_functions: - print(f"\nRunning {test.__name__}...") - try: - await test() - print(f"โœ“ {test.__name__} passed") - except AssertionError as e: - print(f"โœ— {test.__name__} failed: {str(e)}") - except Exception as e: - print(f"โœ— {test.__name__} error: {str(e)}") - - asyncio.run(run_tests()) diff --git a/tests/async/test_caching.py b/tests/async/test_caching.py deleted file mode 100644 index d7f6efb54..000000000 --- a/tests/async/test_caching.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -import sys -import pytest -import asyncio - -# Add the parent directory to the Python path -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - -from crawl4ai.async_webcrawler import AsyncWebCrawler - - -@pytest.mark.asyncio -async def test_caching(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - - # First crawl (should not use cache) - start_time = asyncio.get_event_loop().time() - result1 = await crawler.arun(url=url, bypass_cache=True) - end_time = asyncio.get_event_loop().time() - time_taken1 = end_time - start_time - - assert result1.success - - # Second crawl (should use cache) - start_time = asyncio.get_event_loop().time() - result2 = await crawler.arun(url=url, bypass_cache=False) - end_time = asyncio.get_event_loop().time() - time_taken2 = end_time - start_time - - assert result2.success - assert time_taken2 < time_taken1 # Cached result should be faster - - -@pytest.mark.asyncio -async def test_bypass_cache(): - async 
with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - - # First crawl - result1 = await crawler.arun(url=url, bypass_cache=False) - assert result1.success - - # Second crawl with bypass_cache=True - result2 = await crawler.arun(url=url, bypass_cache=True) - assert result2.success - - # Content should be different (or at least, not guaranteed to be the same) - assert result1.html != result2.html or result1.markdown != result2.markdown - - -@pytest.mark.asyncio -async def test_clear_cache(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - - # Crawl and cache - await crawler.arun(url=url, bypass_cache=False) - - # Clear cache - await crawler.aclear_cache() - - # Check cache size - cache_size = await crawler.aget_cache_size() - assert cache_size == 0 - - -@pytest.mark.asyncio -async def test_flush_cache(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - - # Crawl and cache - await crawler.arun(url=url, bypass_cache=False) - - # Flush cache - await crawler.aflush_cache() - - # Check cache size - cache_size = await crawler.aget_cache_size() - assert cache_size == 0 - - -# Entry point for debugging -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/async/test_content_scraper_strategy.py b/tests/async/test_content_scraper_strategy.py deleted file mode 100644 index 00022cd6d..000000000 --- a/tests/async/test_content_scraper_strategy.py +++ /dev/null @@ -1,216 +0,0 @@ -import os -import sys -import time -import csv -from tabulate import tabulate -from dataclasses import dataclass -from typing import List - -parent_dir = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -) -sys.path.append(parent_dir) -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - -from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy -# This test compares the same strategy with itself now since WebScrapingStrategy is deprecated - - -@dataclass -class TestResult: - name: str - success: bool - images: int - internal_links: int - external_links: int - markdown_length: int - execution_time: float - - -class StrategyTester: - def __init__(self): - self.new_scraper = LXMLWebScrapingStrategy() - self.current_scraper = LXMLWebScrapingStrategy() # Same strategy now - with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f: - self.WIKI_HTML = f.read() - self.results = {"new": [], "current": []} - - def run_test(self, name: str, **kwargs) -> tuple[TestResult, TestResult]: - results = [] - for scraper in [self.new_scraper, self.current_scraper]: - start_time = time.time() - result = scraper._get_content_of_website_optimized( - url="https://en.wikipedia.org/wiki/Test", html=self.WIKI_HTML, **kwargs - ) - execution_time = time.time() - start_time - - test_result = TestResult( - name=name, - success=result["success"], - images=len(result["media"]["images"]), - internal_links=len(result["links"]["internal"]), - external_links=len(result["links"]["external"]), - markdown_length=len(result["markdown"]), - execution_time=execution_time, - ) - results.append(test_result) - - return results[0], results[1] # new, current - - def run_all_tests(self): - test_cases = [ - ("Basic Extraction", {}), - ("Exclude Tags", {"excluded_tags": ["table", "div.infobox", "div.navbox"]}), - ("Word Threshold", {"word_count_threshold": 50}), - ("CSS Selector", {"css_selector": "div.mw-parser-output 
> p"}), - ( - "Link Exclusions", - { - "exclude_external_links": True, - "exclude_social_media_links": True, - "exclude_domains": ["facebook.com", "twitter.com"], - }, - ), - ( - "Media Handling", - { - "exclude_external_images": True, - "image_description_min_word_threshold": 20, - }, - ), - ("Text Only", {"only_text": True, "remove_forms": True}), - ("HTML Cleaning", {"clean_html": True, "keep_data_attributes": True}), - ( - "HTML2Text Options", - { - "html2text": { - "skip_internal_links": True, - "single_line_break": True, - "mark_code": True, - "preserve_tags": ["pre", "code"], - } - }, - ), - ] - - all_results = [] - for name, kwargs in test_cases: - try: - new_result, current_result = self.run_test(name, **kwargs) - all_results.append((name, new_result, current_result)) - except Exception as e: - print(f"Error in {name}: {str(e)}") - - self.save_results_to_csv(all_results) - self.print_comparison_table(all_results) - - def save_results_to_csv(self, all_results: List[tuple]): - csv_file = os.path.join(__location__, "strategy_comparison_results.csv") - with open(csv_file, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow( - [ - "Test Name", - "Strategy", - "Success", - "Images", - "Internal Links", - "External Links", - "Markdown Length", - "Execution Time", - ] - ) - - for name, new_result, current_result in all_results: - writer.writerow( - [ - name, - "New", - new_result.success, - new_result.images, - new_result.internal_links, - new_result.external_links, - new_result.markdown_length, - f"{new_result.execution_time:.3f}", - ] - ) - writer.writerow( - [ - name, - "Current", - current_result.success, - current_result.images, - current_result.internal_links, - current_result.external_links, - current_result.markdown_length, - f"{current_result.execution_time:.3f}", - ] - ) - - def print_comparison_table(self, all_results: List[tuple]): - table_data = [] - headers = [ - "Test Name", - "Strategy", - "Success", - "Images", - "Internal Links", - "External Links", - "Markdown Length", - "Time (s)", - ] - - for name, new_result, current_result in all_results: - # Check for differences - differences = [] - if new_result.images != current_result.images: - differences.append("images") - if new_result.internal_links != current_result.internal_links: - differences.append("internal_links") - if new_result.external_links != current_result.external_links: - differences.append("external_links") - if new_result.markdown_length != current_result.markdown_length: - differences.append("markdown") - - # Add row for new strategy - new_row = [ - name, - "New", - new_result.success, - new_result.images, - new_result.internal_links, - new_result.external_links, - new_result.markdown_length, - f"{new_result.execution_time:.3f}", - ] - table_data.append(new_row) - - # Add row for current strategy - current_row = [ - "", - "Current", - current_result.success, - current_result.images, - current_result.internal_links, - current_result.external_links, - current_result.markdown_length, - f"{current_result.execution_time:.3f}", - ] - table_data.append(current_row) - - # Add difference summary if any - if differences: - table_data.append( - ["", "โš ๏ธ Differences", ", ".join(differences), "", "", "", "", ""] - ) - - # Add empty row for better readability - table_data.append([""] * len(headers)) - - print("\nStrategy Comparison Results:") - print(tabulate(table_data, headers=headers, tablefmt="grid")) - - -if __name__ == "__main__": - tester = StrategyTester() - tester.run_all_tests() diff --git 
a/tests/async/test_database_operations.py b/tests/async/test_database_operations.py deleted file mode 100644 index db0d328ed..000000000 --- a/tests/async/test_database_operations.py +++ /dev/null @@ -1,90 +0,0 @@ -import os -import sys -import pytest - -# Add the parent directory to the Python path -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - -from crawl4ai.async_webcrawler import AsyncWebCrawler - - -@pytest.mark.asyncio -async def test_cache_url(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.example.com" - # First run to cache the URL - result1 = await crawler.arun(url=url, bypass_cache=True) - assert result1.success - - # Second run to retrieve from cache - result2 = await crawler.arun(url=url, bypass_cache=False) - assert result2.success - assert result2.html == result1.html - - -@pytest.mark.asyncio -async def test_bypass_cache(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.python.org" - # First run to cache the URL - result1 = await crawler.arun(url=url, bypass_cache=True) - assert result1.success - - # Second run bypassing cache - result2 = await crawler.arun(url=url, bypass_cache=True) - assert result2.success - assert ( - result2.html != result1.html - ) # Content might be different due to dynamic nature of websites - - -@pytest.mark.asyncio -async def test_cache_size(): - async with AsyncWebCrawler(verbose=True) as crawler: - initial_size = await crawler.aget_cache_size() - - url = "https://www.nbcnews.com/business" - await crawler.arun(url=url, bypass_cache=True) - - new_size = await crawler.aget_cache_size() - assert new_size == initial_size + 1 - - -@pytest.mark.asyncio -async def test_clear_cache(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.example.org" - await crawler.arun(url=url, bypass_cache=True) - - initial_size = await crawler.aget_cache_size() - assert initial_size > 0 - - await crawler.aclear_cache() - new_size = await crawler.aget_cache_size() - assert new_size == 0 - - -@pytest.mark.asyncio -async def test_flush_cache(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.example.net" - await crawler.arun(url=url, bypass_cache=True) - - initial_size = await crawler.aget_cache_size() - assert initial_size > 0 - - await crawler.aflush_cache() - new_size = await crawler.aget_cache_size() - assert new_size == 0 - - # Try to retrieve the previously cached URL - result = await crawler.arun(url=url, bypass_cache=False) - assert ( - result.success - ) # The crawler should still succeed, but it will fetch the content anew - - -# Entry point for debugging -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/async/test_parameters_and_options.py b/tests/async/test_parameters_and_options.py deleted file mode 100644 index e153fbd3f..000000000 --- a/tests/async/test_parameters_and_options.py +++ /dev/null @@ -1,116 +0,0 @@ -import os -import sys -import pytest - -# Add the parent directory to the Python path -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - -from crawl4ai.async_webcrawler import AsyncWebCrawler - - -@pytest.mark.asyncio -async def test_word_count_threshold(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - result_no_threshold = await crawler.arun( - url=url, word_count_threshold=0, bypass_cache=True - ) - result_with_threshold = await crawler.arun( - 
url=url, word_count_threshold=50, bypass_cache=True - ) - - assert len(result_no_threshold.markdown) > len(result_with_threshold.markdown) - - -@pytest.mark.asyncio -async def test_css_selector(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - css_selector = "h1, h2, h3" - result = await crawler.arun( - url=url, css_selector=css_selector, bypass_cache=True - ) - - assert result.success - assert ( - " button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ] - result_with_more = await crawler.arun(url=url, js=js_code, bypass_cache=True) - - assert result_with_more.success - assert len(result_with_more.markdown) > len(result_without_more.markdown) - - -@pytest.mark.asyncio -async def test_screenshot(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - result = await crawler.arun(url=url, screenshot=True, bypass_cache=True) - - assert result.success - assert result.screenshot - assert isinstance(result.screenshot, str) # Should be a base64 encoded string - - -@pytest.mark.asyncio -async def test_custom_user_agent(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - custom_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Crawl4AI/1.0" - result = await crawler.arun( - url=url, user_agent=custom_user_agent, bypass_cache=True - ) - - assert result.success - # Note: We can't directly verify the user agent in the result, but we can check if the crawl was successful - - -@pytest.mark.asyncio -async def test_extract_media_and_links(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - result = await crawler.arun(url=url, bypass_cache=True) - - assert result.success - assert result.media - assert isinstance(result.media, dict) - assert "images" in result.media - assert result.links - assert isinstance(result.links, dict) - assert "internal" in result.links and "external" in result.links - - -@pytest.mark.asyncio -async def test_metadata_extraction(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://www.nbcnews.com/business" - result = await crawler.arun(url=url, bypass_cache=True) - - assert result.success - assert result.metadata - assert isinstance(result.metadata, dict) - # Check for common metadata fields - assert any( - key in result.metadata for key in ["title", "description", "keywords"] - ) - - -# Entry point for debugging -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/async_assistant/test_extract_pipeline.py b/tests/async_assistant/test_extract_pipeline.py deleted file mode 100644 index 719d6ea14..000000000 --- a/tests/async_assistant/test_extract_pipeline.py +++ /dev/null @@ -1,381 +0,0 @@ -""" -Test implementation of AI Assistant extract pipeline using only Crawl4AI capabilities. -This follows the exact flow discussed: query enhancement, classification, HTML skimming, -parent extraction, schema generation, and extraction. 
-""" - -import asyncio -import json -import os -from typing import List, Dict, Any, Optional, Union -from lxml import html as lxml_html -import re - -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LLMConfig -from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy -from crawl4ai.utils import perform_completion_with_backoff - - -async def extract_pipeline( - base_url: str, - urls: Union[str, List[str], None], - query: str, - target_json_example: Optional[str] = None, - force_llm: bool = False, - verbose: bool = True -) -> Union[Dict, List[Dict]]: - """ - Full implementation of the AI-powered extraction pipeline using only Crawl4AI. - - Pipeline: - 1. Quick crawl & HTML skimming - 2. Classification (structural vs semantic) using LLM - 3. Parent element extraction using LLM (for structural) - 4. Schema generation using Crawl4AI's generate_schema - 5. Extraction execution using Crawl4AI strategies - """ - - # Normalize URLs - if urls is None: - urls = base_url - target_urls = [urls] if isinstance(urls, str) else urls - single_result = isinstance(urls, str) or urls is None - - # LLM configs for different tasks - llm_small = LLMConfig( - provider="openai/gpt-4o-mini", - api_token=os.getenv("OPENAI_API_KEY") - ) - llm_small.temperature = 0.3 - - llm_strong = LLMConfig( - provider="openai/gpt-4o", - api_token=os.getenv("OPENAI_API_KEY") - ) - llm_strong.temperature = 0.5 - - def vprint(msg: str): - if verbose: - print(f"๐Ÿ” {msg}") - - # Step 1: Starting - vprint(f"Query: '{query}'") - - # Step 2: Quick crawl for analysis - async with AsyncWebCrawler(verbose=False) as crawler: - vprint(f"Quick crawl: {base_url}") - quick_result = await crawler.arun( - url=base_url, - config=CrawlerRunConfig( - cache_mode="bypass", - delay_before_return_html=2.0 - ) - ) - - if not quick_result.success: - raise Exception(f"Failed to crawl {base_url}") - - # Step 3: HTML Skimming using lxml - def skim_html(html: str) -> str: - """Remove non-structural elements using lxml.""" - parser = lxml_html.HTMLParser(remove_comments=True) - tree = lxml_html.fromstring(html, parser=parser) - - # Remove head section entirely - for head in tree.xpath('//head'): - head.getparent().remove(head) - - # Remove non-structural elements including SVGs - for element in tree.xpath('//script | //style | //noscript | //meta | //link | //svg'): - parent = element.getparent() - if parent is not None: - parent.remove(element) - - # Remove base64 images - for img in tree.xpath('//img[@src]'): - src = img.get('src', '') - if 'base64' in src: - img.set('src', 'BASE64_IMAGE') - - # Remove long class/id attributes - for element in tree.xpath('//*[@class or @id]'): - if element.get('class') and len(element.get('class')) > 100: - element.set('class', 'LONG_CLASS') - if element.get('id') and len(element.get('id')) > 50: - element.set('id', 'LONG_ID') - - # Truncate text nodes - for text_node in tree.xpath('//text()'): - if text_node.strip() and len(text_node) > 100: - parent = text_node.getparent() - if parent is not None: - new_text = text_node[:50] + "..." 
+ text_node[-20:] - if text_node.is_text: - parent.text = new_text - elif text_node.is_tail: - parent.tail = new_text - - return lxml_html.tostring(tree, encoding='unicode') - - skimmed_html = skim_html(quick_result.html) - vprint(f"Skimmed HTML from {len(quick_result.html)} to {len(skimmed_html)} chars") - - # Step 4: Classification using LLM - classification = 'semantic' # Default - - if not force_llm: - classification_prompt = f""" - Analyze this HTML to determine extraction strategy. - - Query: "{query}" - - HTML sample: - <<<>> - {skimmed_html} - <<<>> - - Determine if this can be extracted using CSS/XPath patterns (structural) - or requires semantic understanding (semantic). - - Look for: - - Repeating patterns (lists, cards, tables) โ†’ structural - - Consistent HTML structure โ†’ structural - - Need for inference or understanding โ†’ semantic - - Return JSON: - {{ - "strategy": "structural" or "semantic", - "confidence": 0.0-1.0, - "reasoning": "..." - }} - """ - - response = perform_completion_with_backoff( - provider=llm_small.provider, - prompt_with_variables=classification_prompt, - api_token=llm_small.api_token, - json_response=True, - temperature=llm_small.temperature - ) - - classification_result = json.loads(response.choices[0].message.content) - classification = classification_result['strategy'] - vprint(f"Classification: {classification} (confidence: {classification_result['confidence']})") - vprint(f"Reasoning: {classification_result['reasoning']}") - - if force_llm: - classification = 'semantic' - vprint("Forced LLM extraction") - - # Step 5 & 6: Execute appropriate extraction strategy - if classification == 'structural': - # Extract parent element using LLM with proper explanation - parent_prompt = f""" - Identify the CSS selector for the BASE ELEMENT TEMPLATE containing the data to extract. - - IMPORTANT: The base element template is a repeating pattern in the HTML where each instance - contains one item of data (like a product card, article card, issue card, etc.). - - The selector should: - - Not be too specific (avoid selecting just one item) - - Not be too general (avoid selecting unrelated elements) - - Select ALL instances of the repeating pattern - - Point to the container that holds ONE complete data item - - For example: - - On Amazon: div.s-result-item (each product card) - - On GitHub issues: div[id^="issue_"] (each issue card) - - On a blog: article.post-card (each article) - - User query: "{query}" - """ - - if target_json_example: - parent_prompt += f""" - - The user expects to extract data in this format: - {target_json_example} - - Find the base element that contains all these fields. - """ - else: - parent_prompt += """ - - Also provide a JSON example of what data can be extracted from one instance of this base element. 
- """ - - parent_prompt += f""" - - HTML (first 8000 chars): - <<<>> - {skimmed_html} - <<<>> - - Return JSON: - {{ - "parent_selector": "css_selector_here", - "explanation": "why this selector is appropriate",""" - - if not target_json_example: - parent_prompt += """ - "suggested_json_example": { - "field1": "example value", - "field2": "example value" - }""" - - parent_prompt += """ - }} - """ - - response = perform_completion_with_backoff( - provider=llm_small.provider, - prompt_with_variables=parent_prompt, - api_token=llm_small.api_token, - json_response=True, - temperature=llm_small.temperature - ) - - parent_data = json.loads(response.choices[0].message.content) - parent_selector = parent_data['parent_selector'] - vprint(f"Parent selector: {parent_selector}") - vprint(f"Explanation: {parent_data['explanation']}") - - # Use suggested JSON example if no target provided - if not target_json_example and 'suggested_json_example' in parent_data: - target_json_example = json.dumps(parent_data['suggested_json_example']) - vprint(f"Using LLM suggested example: {target_json_example}") - - # Get the actual parent HTML for schema generation - tree = lxml_html.fromstring(quick_result.html) - parent_elements = tree.cssselect(parent_selector) - - if not parent_elements: - vprint("Parent selector not found, falling back to semantic") - classification = 'semantic' - else: - # Use the first instance as sample - sample_html = lxml_html.tostring(parent_elements[0], encoding='unicode') - vprint(f"Generating schema from sample HTML ({len(sample_html)} chars)") - - # Generate schema using Crawl4AI - schema_params = { - "html": sample_html, - "query": query, - "llm_config": llm_strong - } - - if target_json_example: - schema_params["target_json_example"] = target_json_example - - schema = JsonCssExtractionStrategy.generate_schema(**schema_params) - - vprint(f"Generated schema with {len(schema.get('fields', []))} fields") - - # Extract from all URLs - extraction_strategy = JsonCssExtractionStrategy(schema) - results = [] - - for url in target_urls: - vprint(f"Extracting from: {url}") - result = await crawler.arun( - url=url, - config=CrawlerRunConfig( - extraction_strategy=extraction_strategy, - cache_mode="bypass" - ) - ) - - if result.success and result.extracted_content: - data = json.loads(result.extracted_content) - results.append({ - 'url': url, - 'data': data, - 'count': len(data) if isinstance(data, list) else 1, - 'method': 'JsonCssExtraction', - 'schema': schema - }) - - return results[0] if single_result else results - - # Semantic extraction (LLM) - if classification == 'semantic': - vprint("Using LLM extraction") - - # Build instruction from query - instruction = f""" - {query} - - Return structured JSON data. 
- """ - - extraction_strategy = LLMExtractionStrategy( - llm_config=llm_strong, - instruction=instruction - ) - - results = [] - for url in target_urls: - vprint(f"LLM extracting from: {url}") - result = await crawler.arun( - url=url, - config=CrawlerRunConfig( - extraction_strategy=extraction_strategy, - cache_mode="bypass" - ) - ) - - if result.success and result.extracted_content: - data = json.loads(result.extracted_content) - results.append({ - 'url': url, - 'data': data, - 'count': len(data) if isinstance(data, list) else 1, - 'method': 'LLMExtraction' - }) - - return results[0] if single_result else results - - -async def main(): - """Test the extraction pipeline.""" - - print("\n๐Ÿš€ CRAWL4AI EXTRACTION PIPELINE TEST") - print("="*50) - - # Test structural extraction - try: - result = await extract_pipeline( - base_url="https://github.com/unclecode/crawl4ai/issues", - urls=None, - query="I want to extract all issue titles, numbers, and who opened them", - verbose=True - ) - - print(f"\nโœ… Success! Extracted {result.get('count', 0)} items") - print(f"Method used: {result.get('method')}") - - if result.get('data'): - print("\nFirst few items:") - data = result['data'] - items_to_show = data[:3] if isinstance(data, list) else data - print(json.dumps(items_to_show, indent=2)) - - if result.get('schema'): - print(f"\nGenerated schema fields: {[f['name'] for f in result['schema'].get('fields', [])]}") - - except Exception as e: - print(f"\nโŒ Error: {e}") - import traceback - traceback.print_exc() - - -if __name__ == "__main__": - # Check for API key - if not os.getenv("OPENAI_API_KEY"): - print("โš ๏ธ Error: OPENAI_API_KEY environment variable not set") - exit(1) - - asyncio.run(main()) - - diff --git a/tests/async_assistant/test_extract_pipeline_v2.py b/tests/async_assistant/test_extract_pipeline_v2.py deleted file mode 100644 index bb65df8d2..000000000 --- a/tests/async_assistant/test_extract_pipeline_v2.py +++ /dev/null @@ -1,386 +0,0 @@ -""" -Test implementation v2: Combined classification and preparation in one LLM call. -More efficient approach that reduces token usage and LLM calls. -""" - -import asyncio -import json -import os -from typing import List, Dict, Any, Optional, Union -from lxml import html as lxml_html -import re - -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LLMConfig -from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy -from crawl4ai.utils import perform_completion_with_backoff - - -async def extract_pipeline_v2( - base_url: str, - urls: Union[str, List[str], None], - query: str, - target_json_example: Optional[str] = None, - force_llm: bool = False, - verbose: bool = True -) -> Union[Dict, List[Dict]]: - """ - Improved extraction pipeline with combined classification and preparation. - - Pipeline: - 1. Quick crawl & HTML skimming - 2. Combined LLM call for classification + preparation - 3. 
Execute appropriate extraction strategy - """ - - # Normalize URLs - if urls is None: - urls = base_url - target_urls = [urls] if isinstance(urls, str) else urls - single_result = isinstance(urls, str) or urls is None - - # LLM configs - llm_small = LLMConfig( - provider="openai/gpt-4o-mini", - api_token=os.getenv("OPENAI_API_KEY") - ) - llm_small.temperature = 0.3 - - llm_strong = LLMConfig( - provider="openai/gpt-4o", - api_token=os.getenv("OPENAI_API_KEY") - ) - llm_strong.temperature = 0.5 - - def vprint(msg: str): - if verbose: - print(f"๐Ÿ” {msg}") - - vprint(f"Query: '{query}'") - if target_json_example: - vprint(f"Target format provided: {target_json_example[:100]}...") - - # Step 1: Quick crawl for analysis - async with AsyncWebCrawler(verbose=False) as crawler: - vprint(f"Quick crawl: {base_url}") - quick_result = await crawler.arun( - url=base_url, - config=CrawlerRunConfig( - cache_mode="bypass", - delay_before_return_html=2.0 - ) - ) - - if not quick_result.success: - raise Exception(f"Failed to crawl {base_url}") - - # HTML Skimming - def skim_html(html: str) -> str: - """Remove non-structural elements using lxml.""" - parser = lxml_html.HTMLParser(remove_comments=True) - tree = lxml_html.fromstring(html, parser=parser) - - # Remove head section entirely - for head in tree.xpath('//head'): - head.getparent().remove(head) - - # Remove non-structural elements including SVGs - for element in tree.xpath('//script | //style | //noscript | //meta | //link | //svg'): - parent = element.getparent() - if parent is not None: - parent.remove(element) - - # Remove base64 images - for img in tree.xpath('//img[@src]'): - src = img.get('src', '') - if 'base64' in src: - img.set('src', 'BASE64_IMAGE') - - # Remove long class/id attributes - for element in tree.xpath('//*[@class or @id]'): - if element.get('class') and len(element.get('class')) > 100: - element.set('class', 'LONG_CLASS') - if element.get('id') and len(element.get('id')) > 50: - element.set('id', 'LONG_ID') - - # Truncate text nodes - for text_node in tree.xpath('//text()'): - if text_node.strip() and len(text_node) > 100: - parent = text_node.getparent() - if parent is not None: - new_text = text_node[:50] + "..." + text_node[-20:] - if text_node.is_text: - parent.text = new_text - elif text_node.is_tail: - parent.tail = new_text - - return lxml_html.tostring(tree, encoding='unicode') - - skimmed_html = skim_html(quick_result.html) - vprint(f"Skimmed HTML from {len(quick_result.html)} to {len(skimmed_html)} chars") - - # Step 2: Combined classification and preparation - if force_llm: - classification_data = {"classification": "semantic"} - vprint("Forced LLM extraction") - else: - combined_prompt = f""" - Analyze this HTML and prepare for data extraction. 
- - User query: "{query}" - """ - - if target_json_example: - combined_prompt += f""" - Target format: {target_json_example} - """ - - combined_prompt += f""" - - HTML: - <<<>>> - {skimmed_html} - <<<>>> - - STEP 1: Determine extraction strategy - - If data follows repeating HTML patterns (lists, tables, cards) โ†’ "structural" - - If data requires understanding/inference โ†’ "semantic" - - STEP 2A: If STRUCTURAL extraction is appropriate: - - Find the CSS selector for the BASE ELEMENT (repeating pattern) - - Base element = container holding ONE data item (e.g., product card, table row) - - Selector should select ALL instances, not too specific, not too general - - Count approximate number of these elements - """ - - if not target_json_example: - combined_prompt += """ - - Suggest what JSON structure can be extracted from one element - """ - - combined_prompt += """ - - STEP 2B: If SEMANTIC extraction is needed: - - Write a detailed instruction for what to extract - - Be specific about the data needed - """ - - if not target_json_example: - combined_prompt += """ - - Suggest expected JSON output structure - """ - - combined_prompt += """ - - Return JSON with ONLY the relevant fields based on classification: - { - "classification": "structural" or "semantic", - "confidence": 0.0-1.0, - "reasoning": "brief explanation", - - // Include ONLY if classification is "structural": - "base_selector": "css selector", - "element_count": approximate number, - - // Include ONLY if classification is "semantic": - "extraction_instruction": "detailed instruction", - - // Include if no target_json_example was provided: - "suggested_json_example": { ... } - } - """ - - response = perform_completion_with_backoff( - provider=llm_small.provider, - prompt_with_variables=combined_prompt, - api_token=llm_small.api_token, - json_response=True, - temperature=llm_small.temperature - ) - - classification_data = json.loads(response.choices[0].message.content) - vprint(f"Classification: {classification_data['classification']} (confidence: {classification_data['confidence']})") - vprint(f"Reasoning: {classification_data['reasoning']}") - - # Use suggested JSON example if needed - if not target_json_example and 'suggested_json_example' in classification_data: - target_json_example = json.dumps(classification_data['suggested_json_example']) - vprint(f"Using suggested example: {target_json_example}") - - # Step 3: Execute extraction based on classification - if classification_data['classification'] == 'structural': - vprint(f"Base selector: {classification_data['base_selector']}") - vprint(f"Found ~{classification_data['element_count']} elements") - - # Get sample HTML for schema generation - tree = lxml_html.fromstring(quick_result.html) - parent_elements = tree.cssselect(classification_data['base_selector']) - - if not parent_elements: - vprint("Base selector not found, falling back to semantic") - classification_data['classification'] = 'semantic' - else: - # Use first element as sample - sample_html = lxml_html.tostring(parent_elements[0], encoding='unicode') - vprint(f"Generating schema from sample ({len(sample_html)} chars)") - - # Generate schema - schema_params = { - "html": sample_html, - "query": query, - "llm_config": llm_strong - } - - if target_json_example: - schema_params["target_json_example"] = target_json_example - - schema = JsonCssExtractionStrategy.generate_schema(**schema_params) - vprint(f"Generated schema with {len(schema.get('fields', []))} fields") - - # Extract from all URLs - extraction_strategy 
= JsonCssExtractionStrategy(schema) - results = [] - - for idx, url in enumerate(target_urls): - vprint(f"Extracting from: {url}") - - # Use already crawled HTML for base_url, crawl others - if idx == 0 and url == base_url: - # We already have this HTML, use raw:// to avoid re-crawling - raw_url = f"raw://{quick_result.html}" - vprint("Using cached HTML with raw:// scheme") - else: - # Need to crawl this URL - raw_url = url - - result = await crawler.arun( - url=raw_url, - config=CrawlerRunConfig( - extraction_strategy=extraction_strategy, - cache_mode="bypass" - ) - ) - - if result.success and result.extracted_content: - data = json.loads(result.extracted_content) - results.append({ - 'url': url, # Keep original URL for reference - 'data': data, - 'count': len(data) if isinstance(data, list) else 1, - 'method': 'JsonCssExtraction', - 'schema': schema - }) - - return results[0] if single_result else results - - # Semantic extraction - if classification_data['classification'] == 'semantic': - vprint("Using LLM extraction") - - # Use generated instruction or create simple one - if 'extraction_instruction' in classification_data: - instruction = classification_data['extraction_instruction'] - vprint(f"Generated instruction: {instruction[:100]}...") - else: - instruction = f"{query}\n\nReturn structured JSON data." - - extraction_strategy = LLMExtractionStrategy( - llm_config=llm_strong, - instruction=instruction - ) - - results = [] - for idx, url in enumerate(target_urls): - vprint(f"LLM extracting from: {url}") - - # Use already crawled HTML for base_url, crawl others - if idx == 0 and url == base_url: - # We already have this HTML, use raw:// to avoid re-crawling - raw_url = f"raw://{quick_result.html}" - vprint("Using cached HTML with raw:// scheme") - else: - # Need to crawl this URL - raw_url = url - - result = await crawler.arun( - url=raw_url, - config=CrawlerRunConfig( - extraction_strategy=extraction_strategy, - cache_mode="bypass" - ) - ) - - if result.success and result.extracted_content: - data = json.loads(result.extracted_content) - results.append({ - 'url': url, # Keep original URL for reference - 'data': data, - 'count': len(data) if isinstance(data, list) else 1, - 'method': 'LLMExtraction' - }) - - return results[0] if single_result else results - - -async def main(): - """Test the improved extraction pipeline.""" - - print("\n๐Ÿš€ CRAWL4AI EXTRACTION PIPELINE V2 TEST") - print("="*50) - - try: - # Test 1: Structural extraction (GitHub issues) - print("\nTest 1: GitHub Issues (should use structural)") - result = await extract_pipeline_v2( - base_url="https://github.com/unclecode/crawl4ai/issues", - urls=None, - query="Extract all issue titles, numbers, and authors", - verbose=True - ) - - print(f"\nโœ… Extracted {result.get('count', 0)} items using {result.get('method')}") - if result.get('data'): - print("Sample:", json.dumps(result['data'][:2] if isinstance(result['data'], list) else result['data'], indent=2)) - - # Test 2: With target JSON example - print("\n\nTest 2: With target JSON example") - target_example = json.dumps({ - "title": "Issue title here", - "number": "#123", - "author": "username" - }) - - result2 = await extract_pipeline_v2( - base_url="https://github.com/unclecode/crawl4ai/issues", - urls=None, - query="Extract GitHub issues", - target_json_example=target_example, - verbose=True - ) - - print(f"\nโœ… Extracted {result2.get('count', 0)} items") - - # Test 3: Semantic extraction (force LLM) - print("\n\nTest 3: Force semantic extraction") - result3 = 
await extract_pipeline_v2( - base_url="https://en.wikipedia.org/wiki/Artificial_intelligence", - urls=None, - query="Extract key concepts and their relationships in AI field", - force_llm=True, - verbose=True - ) - - print(f"\nโœ… Extracted using {result3.get('method')}") - - except Exception as e: - print(f"\nโŒ Error: {e}") - import traceback - traceback.print_exc() - - -if __name__ == "__main__": - if not os.getenv("OPENAI_API_KEY"): - print("โš ๏ธ Error: OPENAI_API_KEY environment variable not set") - exit(1) - - asyncio.run(main()) \ No newline at end of file diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py deleted file mode 100644 index 2ec64a6b5..000000000 --- a/tests/browser/docker/test_docker_browser.py +++ /dev/null @@ -1,651 +0,0 @@ -"""Test examples for Docker Browser Strategy. - -These examples demonstrate the functionality of Docker Browser Strategy -and serve as functional tests. -""" - -import asyncio -import os -import sys -import shutil -import uuid - -# Add the project root to Python path if running directly -if __name__ == "__main__": - sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..'))) - -from crawl4ai.browser import BrowserManager -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.async_logger import AsyncLogger -from crawl4ai.browser import DockerConfig -from crawl4ai.browser import DockerRegistry -from crawl4ai.browser import DockerUtils - -# Create a logger for clear terminal output -logger = AsyncLogger(verbose=True, log_file=None) - -# Global Docker utils instance -docker_utils = DockerUtils(logger) - -async def test_docker_components(): - """Test Docker utilities, registry, and image building. - - This function tests the core Docker components before running the browser tests. - It validates DockerRegistry, DockerUtils, and builds test images to ensure - everything is functioning correctly. - """ - logger.info("Testing Docker components", tag="SETUP") - - # Create a test registry directory - registry_dir = os.path.join(os.path.dirname(__file__), "test_registry") - registry_file = os.path.join(registry_dir, "test_registry.json") - os.makedirs(registry_dir, exist_ok=True) - - try: - # 1. Test DockerRegistry - logger.info("Testing DockerRegistry...", tag="SETUP") - registry = DockerRegistry(registry_file) - - # Test saving and loading registry - test_container_id = "test-container-123" - registry.register_container(test_container_id, 9876, "test-hash-123") - registry.save() - - # Create a new registry instance that loads from the file - registry2 = DockerRegistry(registry_file) - port = registry2.get_container_host_port(test_container_id) - hash_value = registry2.get_container_config_hash(test_container_id) - - if port != 9876 or hash_value != "test-hash-123": - logger.error("DockerRegistry persistence failed", tag="SETUP") - return False - - # Clean up test container from registry - registry2.unregister_container(test_container_id) - logger.success("DockerRegistry works correctly", tag="SETUP") - - # 2. 
Test DockerUtils - logger.info("Testing DockerUtils...", tag="SETUP") - - # Test port detection - in_use = docker_utils.is_port_in_use(22) # SSH port is usually in use - logger.info(f"Port 22 in use: {in_use}", tag="SETUP") - - # Get next available port - available_port = docker_utils.get_next_available_port(9000) - logger.info(f"Next available port: {available_port}", tag="SETUP") - - # Test config hash generation - config_dict = {"mode": "connect", "headless": True} - config_hash = docker_utils.generate_config_hash(config_dict) - logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP") - - # 3. Test Docker is available - logger.info("Checking Docker availability...", tag="SETUP") - if not await check_docker_available(): - logger.error("Docker is not available - cannot continue tests", tag="SETUP") - return False - - # 4. Test building connect image - logger.info("Building connect mode Docker image...", tag="SETUP") - connect_image = await docker_utils.ensure_docker_image_exists(None, "connect") - if not connect_image: - logger.error("Failed to build connect mode image", tag="SETUP") - return False - logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP") - - # 5. Test building launch image - logger.info("Building launch mode Docker image...", tag="SETUP") - launch_image = await docker_utils.ensure_docker_image_exists(None, "launch") - if not launch_image: - logger.error("Failed to build launch mode image", tag="SETUP") - return False - logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP") - - # 6. Test creating and removing container - logger.info("Testing container creation and removal...", tag="SETUP") - container_id = await docker_utils.create_container( - image_name=launch_image, - host_port=available_port, - container_name="crawl4ai-test-container" - ) - - if not container_id: - logger.error("Failed to create test container", tag="SETUP") - return False - - logger.info(f"Created test container: {container_id[:12]}", tag="SETUP") - - # Verify container is running - running = await docker_utils.is_container_running(container_id) - if not running: - logger.error("Test container is not running", tag="SETUP") - await docker_utils.remove_container(container_id) - return False - - # Test commands in container - logger.info("Testing command execution in container...", tag="SETUP") - returncode, stdout, stderr = await docker_utils.exec_in_container( - container_id, ["ls", "-la", "/"] - ) - - if returncode != 0: - logger.error(f"Command execution failed: {stderr}", tag="SETUP") - await docker_utils.remove_container(container_id) - return False - - # Verify Chrome is installed in the container - returncode, stdout, stderr = await docker_utils.exec_in_container( - container_id, ["which", "chromium"] - ) - - if returncode != 0: - logger.error("Chrome not found in container", tag="SETUP") - await docker_utils.remove_container(container_id) - return False - - chrome_path = stdout.strip() - logger.info(f"Chrome found at: {chrome_path}", tag="SETUP") - - # Test Chrome version - returncode, stdout, stderr = await docker_utils.exec_in_container( - container_id, ["chromium", "--version"] - ) - - if returncode != 0: - logger.error(f"Failed to get Chrome version: {stderr}", tag="SETUP") - await docker_utils.remove_container(container_id) - return False - - logger.info(f"Chrome version: {stdout.strip()}", tag="SETUP") - - # Remove test container - removed = await docker_utils.remove_container(container_id) - if not removed: - 
logger.error("Failed to remove test container", tag="SETUP") - return False - - logger.success("Test container removed successfully", tag="SETUP") - - # All components tested successfully - logger.success("All Docker components tested successfully", tag="SETUP") - return True - - except Exception as e: - logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP") - return False - finally: - # Clean up registry test directory - if os.path.exists(registry_dir): - shutil.rmtree(registry_dir) - -async def test_docker_connect_mode(): - """Test Docker browser in connect mode. - - This tests the basic functionality of creating a browser in Docker - connect mode and using it for navigation. - """ - logger.info("Testing Docker browser in connect mode", tag="TEST") - - # Create temp directory for user data - temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data") - os.makedirs(temp_dir, exist_ok=True) - - try: - # Create Docker configuration - docker_config = DockerConfig( - mode="connect", - persistent=False, - remove_on_exit=True, - user_data_dir=temp_dir - ) - - # Create browser configuration - browser_config = BrowserConfig( - browser_mode="docker", - headless=True, - docker_config=docker_config - ) - - # Create browser manager - manager = BrowserManager(browser_config=browser_config, logger=logger) - - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Create crawler config - crawler_config = CrawlerRunConfig(url="https://example.com") - - # Get a page - page, context = await manager.get_page(crawler_config) - logger.info("Got page successfully", tag="TEST") - - # Navigate to a website - await page.goto("https://example.com") - logger.info("Navigated to example.com", tag="TEST") - - # Get page title - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - return True - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - await manager.close() - except: - pass - return False - finally: - # Clean up the temp directory - if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) - -async def test_docker_launch_mode(): - """Test Docker browser in launch mode. - - This tests launching a Chrome browser within a Docker container - on demand with custom settings. 
- """ - logger.info("Testing Docker browser in launch mode", tag="TEST") - - # Create temp directory for user data - temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch") - os.makedirs(temp_dir, exist_ok=True) - - try: - # Create Docker configuration - docker_config = DockerConfig( - mode="launch", - persistent=False, - remove_on_exit=True, - user_data_dir=temp_dir - ) - - # Create browser configuration - browser_config = BrowserConfig( - browser_mode="docker", - headless=True, - text_mode=True, # Enable text mode for faster operation - docker_config=docker_config - ) - - # Create browser manager - manager = BrowserManager(browser_config=browser_config, logger=logger) - - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Create crawler config - crawler_config = CrawlerRunConfig(url="https://example.com") - - # Get a page - page, context = await manager.get_page(crawler_config) - logger.info("Got page successfully", tag="TEST") - - # Navigate to a website - await page.goto("https://example.com") - logger.info("Navigated to example.com", tag="TEST") - - # Get page title - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - return True - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - await manager.close() - except: - pass - return False - finally: - # Clean up the temp directory - if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) - -async def test_docker_persistent_storage(): - """Test Docker browser with persistent storage. - - This tests creating localStorage data in one session and verifying - it persists to another session when using persistent storage. 
- """ - logger.info("Testing Docker browser with persistent storage", tag="TEST") - - # Create a unique temp directory - test_id = uuid.uuid4().hex[:8] - temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}") - os.makedirs(temp_dir, exist_ok=True) - - manager1 = None - manager2 = None - - try: - # Create Docker configuration with persistence - docker_config = DockerConfig( - mode="connect", - persistent=True, # Keep container running between sessions - user_data_dir=temp_dir, - container_user_data_dir="/data" - ) - - # Create browser configuration - browser_config = BrowserConfig( - browser_mode="docker", - headless=True, - docker_config=docker_config - ) - - # Create first browser manager - manager1 = BrowserManager(browser_config=browser_config, logger=logger) - - # Start the browser - await manager1.start() - logger.info("First browser started successfully", tag="TEST") - - # Create crawler config - crawler_config = CrawlerRunConfig() - - # Get a page - page1, context1 = await manager1.get_page(crawler_config) - - # Navigate to example.com - await page1.goto("https://example.com") - - # Set localStorage item - test_value = f"test_value_{test_id}" - await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')") - logger.info(f"Set localStorage test_key = {test_value}", tag="TEST") - - # Close the first browser manager - await manager1.close() - logger.info("First browser closed", tag="TEST") - - # Create second browser manager with same config - manager2 = BrowserManager(browser_config=browser_config, logger=logger) - - # Start the browser - await manager2.start() - logger.info("Second browser started successfully", tag="TEST") - - # Get a page - page2, context2 = await manager2.get_page(crawler_config) - - # Navigate to same site - await page2.goto("https://example.com") - - # Get localStorage item - value = await page2.evaluate("localStorage.getItem('test_key')") - logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST") - - # Check if persistence worked - if value == test_value: - logger.success("Storage persistence verified!", tag="TEST") - else: - logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST") - - # Clean up - await manager2.close() - logger.info("Second browser closed successfully", tag="TEST") - - return value == test_value - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - if manager1: - await manager1.close() - if manager2: - await manager2.close() - except: - pass - return False - finally: - # Clean up the temp directory - if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) - -async def test_docker_parallel_pages(): - """Test Docker browser with parallel page creation. - - This tests the ability to create and use multiple pages in parallel - from a single Docker browser instance. 
- """ - logger.info("Testing Docker browser with parallel pages", tag="TEST") - - try: - # Create Docker configuration - docker_config = DockerConfig( - mode="connect", - persistent=False, - remove_on_exit=True - ) - - # Create browser configuration - browser_config = BrowserConfig( - browser_mode="docker", - headless=True, - docker_config=docker_config - ) - - # Create browser manager - manager = BrowserManager(browser_config=browser_config, logger=logger) - - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Create crawler config - crawler_config = CrawlerRunConfig() - - # Get multiple pages - page_count = 3 - pages = await manager.get_pages(crawler_config, count=page_count) - logger.info(f"Got {len(pages)} pages successfully", tag="TEST") - - if len(pages) != page_count: - logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST") - await manager.close() - return False - - # Navigate to different sites with each page - tasks = [] - for i, (page, _) in enumerate(pages): - tasks.append(page.goto(f"https://example.com?page={i}")) - - # Wait for all navigations to complete - await asyncio.gather(*tasks) - logger.info("All pages navigated successfully", tag="TEST") - - # Get titles from all pages - titles = [] - for i, (page, _) in enumerate(pages): - title = await page.title() - titles.append(title) - logger.info(f"Page {i+1} title: {title}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - return True - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - await manager.close() - except: - pass - return False - -async def test_docker_registry_reuse(): - """Test Docker container reuse via registry. - - This tests that containers with matching configurations - are reused rather than creating new ones. 
- """ - logger.info("Testing Docker container reuse via registry", tag="TEST") - - # Create registry for this test - registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test") - registry_file = os.path.join(registry_dir, "registry.json") - os.makedirs(registry_dir, exist_ok=True) - - manager1 = None - manager2 = None - container_id1 = None - - try: - # Create identical Docker configurations with custom registry - docker_config1 = DockerConfig( - mode="connect", - persistent=True, # Keep container running after closing - registry_file=registry_file - ) - - # Create first browser configuration - browser_config1 = BrowserConfig( - browser_mode="docker", - headless=True, - docker_config=docker_config1 - ) - - # Create first browser manager - manager1 = BrowserManager(browser_config=browser_config1, logger=logger) - - # Start the first browser - await manager1.start() - logger.info("First browser started successfully", tag="TEST") - - # Get container ID from the strategy - docker_strategy1 = manager1.strategy - container_id1 = docker_strategy1.container_id - logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST") - - # Close the first manager but keep container running - await manager1.close() - logger.info("First browser closed", tag="TEST") - - # Create second Docker configuration identical to first - docker_config2 = DockerConfig( - mode="connect", - persistent=True, - registry_file=registry_file - ) - - # Create second browser configuration - browser_config2 = BrowserConfig( - browser_mode="docker", - headless=True, - docker_config=docker_config2 - ) - - # Create second browser manager - manager2 = BrowserManager(browser_config=browser_config2, logger=logger) - - # Start the second browser - should reuse existing container - await manager2.start() - logger.info("Second browser started successfully", tag="TEST") - - # Get container ID from the second strategy - docker_strategy2 = manager2.strategy - container_id2 = docker_strategy2.container_id - logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST") - - # Verify container reuse - if container_id1 == container_id2: - logger.success("Container reuse successful - using same container!", tag="TEST") - else: - logger.error("Container reuse failed - new container created!", tag="TEST") - - # Clean up - docker_strategy2.docker_config.persistent = False - docker_strategy2.docker_config.remove_on_exit = True - await manager2.close() - logger.info("Second browser closed and container removed", tag="TEST") - - return container_id1 == container_id2 - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - if manager1: - await manager1.close() - if manager2: - await manager2.close() - # Make sure container is removed - if container_id1: - await docker_utils.remove_container(container_id1, force=True) - except: - pass - return False - finally: - # Clean up registry directory - if os.path.exists(registry_dir): - shutil.rmtree(registry_dir) - -async def run_tests(): - """Run all tests sequentially.""" - results = [] - - logger.info("Starting Docker Browser Strategy tests", tag="TEST") - - # Check if Docker is available - if not await check_docker_available(): - logger.error("Docker is not available - skipping tests", tag="TEST") - return - - # First test Docker components - # setup_result = await test_docker_components() - # if not setup_result: - # logger.error("Docker component tests failed - skipping browser tests", tag="TEST") - # return - - # 
Run browser tests - results.append(await test_docker_connect_mode()) - results.append(await test_docker_launch_mode()) - results.append(await test_docker_persistent_storage()) - results.append(await test_docker_parallel_pages()) - results.append(await test_docker_registry_reuse()) - - # Print summary - total = len(results) - passed = sum(1 for r in results if r) - logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") - - if passed == total: - logger.success("All tests passed!", tag="SUMMARY") - else: - logger.error(f"{total - passed} tests failed", tag="SUMMARY") - -async def check_docker_available() -> bool: - """Check if Docker is available on the system. - - Returns: - bool: True if Docker is available, False otherwise - """ - try: - proc = await asyncio.create_subprocess_exec( - "docker", "--version", - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE - ) - stdout, _ = await proc.communicate() - return proc.returncode == 0 and stdout - except: - return False - -if __name__ == "__main__": - asyncio.run(run_tests()) \ No newline at end of file diff --git a/tests/browser/test_browser_manager.py b/tests/browser/test_browser_manager.py deleted file mode 100644 index d8f9376d7..000000000 --- a/tests/browser/test_browser_manager.py +++ /dev/null @@ -1,190 +0,0 @@ -"""Test examples for BrowserManager. - -These examples demonstrate the functionality of BrowserManager -and serve as functional tests. -""" - -import asyncio -import os -import sys -from typing import List - -# Add the project root to Python path if running directly -if __name__ == "__main__": - sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) - -from crawl4ai.browser import BrowserManager -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.async_logger import AsyncLogger - -# Create a logger for clear terminal output -logger = AsyncLogger(verbose=True, log_file=None) - -async def test_basic_browser_manager(): - """Test basic BrowserManager functionality with default configuration.""" - logger.info("Starting test_basic_browser_manager", tag="TEST") - - try: - # Create a browser manager with default config - manager = BrowserManager(logger=logger) - - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Get a page - crawler_config = CrawlerRunConfig(url="https://example.com") - page, context = await manager.get_page(crawler_config) - logger.info("Page created successfully", tag="TEST") - - # Navigate to a website - await page.goto("https://example.com") - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Clean up - await manager.close() - logger.success("test_basic_browser_manager completed successfully", tag="TEST") - return True - except Exception as e: - logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST") - return False - -async def test_custom_browser_config(): - """Test BrowserManager with custom browser configuration.""" - logger.info("Starting test_custom_browser_config", tag="TEST") - - try: - # Create a custom browser config - browser_config = BrowserConfig( - browser_type="chromium", - headless=True, - viewport_width=1280, - viewport_height=800, - light_mode=True - ) - - # Create browser manager with the config - manager = BrowserManager(browser_config=browser_config, logger=logger) - - # Start the browser - await manager.start() - logger.info("Browser started successfully with custom config", tag="TEST") - - # Get a 
page - crawler_config = CrawlerRunConfig(url="https://example.com") - page, context = await manager.get_page(crawler_config) - - # Navigate to a website - await page.goto("https://example.com") - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Verify viewport size - viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })") - logger.info(f"Viewport size: {viewport_size}", tag="TEST") - - # Clean up - await manager.close() - logger.success("test_custom_browser_config completed successfully", tag="TEST") - return True - except Exception as e: - logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST") - return False - -async def test_multiple_pages(): - """Test BrowserManager with multiple pages.""" - logger.info("Starting test_multiple_pages", tag="TEST") - - try: - # Create browser manager - manager = BrowserManager(logger=logger) - - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Create multiple pages - pages = [] - urls = ["https://example.com", "https://example.org", "https://mozilla.org"] - - for i, url in enumerate(urls): - crawler_config = CrawlerRunConfig(url=url) - page, context = await manager.get_page(crawler_config) - await page.goto(url) - pages.append((page, url)) - logger.info(f"Created page {i+1} for {url}", tag="TEST") - - # Verify all pages are loaded correctly - for i, (page, url) in enumerate(pages): - title = await page.title() - logger.info(f"Page {i+1} title: {title}", tag="TEST") - - # Clean up - await manager.close() - logger.success("test_multiple_pages completed successfully", tag="TEST") - return True - except Exception as e: - logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST") - return False - -async def test_session_management(): - """Test session management in BrowserManager.""" - logger.info("Starting test_session_management", tag="TEST") - - try: - # Create browser manager - manager = BrowserManager(logger=logger) - - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Create a session - session_id = "test_session_1" - crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id) - page1, context1 = await manager.get_page(crawler_config) - await page1.goto("https://example.com") - logger.info(f"Created session with ID: {session_id}", tag="TEST") - - # Get the same session again - page2, context2 = await manager.get_page(crawler_config) - - # Verify it's the same page/context - is_same_page = page1 == page2 - is_same_context = context1 == context2 - logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST") - - # Kill the session - await manager.kill_session(session_id) - logger.info(f"Killed session with ID: {session_id}", tag="TEST") - - # Clean up - await manager.close() - logger.success("test_session_management completed successfully", tag="TEST") - return True - except Exception as e: - logger.error(f"test_session_management failed: {str(e)}", tag="TEST") - return False - -async def run_tests(): - """Run all tests sequentially.""" - results = [] - - results.append(await test_basic_browser_manager()) - results.append(await test_custom_browser_config()) - results.append(await test_multiple_pages()) - results.append(await test_session_management()) - - # Print summary - total = len(results) - passed = sum(results) - logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") 
- - if passed == total: - logger.success("All tests passed!", tag="SUMMARY") - else: - logger.error(f"{total - passed} tests failed", tag="SUMMARY") - -if __name__ == "__main__": - asyncio.run(run_tests()) diff --git a/tests/browser/test_builtin_browser.py b/tests/browser/test_builtin_browser.py deleted file mode 100644 index 4797648c6..000000000 --- a/tests/browser/test_builtin_browser.py +++ /dev/null @@ -1,809 +0,0 @@ -""" -Test script for builtin browser functionality in the browser module. - -This script tests: -1. Creating a builtin browser -2. Getting browser information -3. Killing the browser -4. Restarting the browser -5. Testing operations with different browser strategies -6. Testing edge cases -""" - -import asyncio -import os -import sys -import time -from typing import List, Dict, Any -from colorama import Fore, Style, init - -# Add the project root to the path for imports -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) - -from rich.console import Console -from rich.table import Table -from rich.panel import Panel -from rich.text import Text -from rich.box import Box, SIMPLE - -from crawl4ai.browser import BrowserManager -from crawl4ai.browser.strategies import BuiltinBrowserStrategy -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.async_logger import AsyncLogger - -# Initialize colorama for cross-platform colored terminal output -init() - -# Define colors for pretty output -SUCCESS = Fore.GREEN -WARNING = Fore.YELLOW -ERROR = Fore.RED -INFO = Fore.CYAN -RESET = Fore.RESET - -# Create logger -logger = AsyncLogger(verbose=True) - - -async def test_builtin_browser_creation(): - """Test creating a builtin browser using the BrowserManager with BuiltinBrowserStrategy""" - print(f"\n{INFO}========== Testing Builtin Browser Creation =========={RESET}") - - # Step 1: Create a BrowserManager with builtin mode - print(f"\n{INFO}1. Creating BrowserManager with builtin mode{RESET}") - browser_config = BrowserConfig(browser_mode="builtin", headless=True, verbose=True) - manager = BrowserManager(browser_config=browser_config, logger=logger) - - # Step 2: Check if we have a BuiltinBrowserStrategy - print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}") - if isinstance(manager.strategy, BuiltinBrowserStrategy): - print( - f"{SUCCESS}Correct strategy type: {manager.strategy.__class__.__name__}{RESET}" - ) - else: - print( - f"{ERROR}Wrong strategy type: {manager.strategy.__class__.__name__}{RESET}" - ) - return None - - # Step 3: Start the manager to launch or connect to builtin browser - print(f"\n{INFO}3. Starting the browser manager{RESET}") - try: - await manager.start() - print(f"{SUCCESS}Browser manager started successfully{RESET}") - except Exception as e: - print(f"{ERROR}Failed to start browser manager: {str(e)}{RESET}") - return None - - # Step 4: Get browser info from the strategy - print(f"\n{INFO}4. 
Getting browser information{RESET}") - browser_info = manager.strategy.get_browser_info() - if browser_info: - print(f"{SUCCESS}Browser info retrieved:{RESET}") - for key, value in browser_info.items(): - if key != "config": # Skip the verbose config section - print(f" {key}: {value}") - - cdp_url = browser_info.get("cdp_url") - print(f"{SUCCESS}CDP URL: {cdp_url}{RESET}") - else: - print(f"{ERROR}Failed to get browser information{RESET}") - cdp_url = None - - # Save manager for later tests - return manager, cdp_url - - -async def test_page_operations(manager: BrowserManager): - """Test page operations with the builtin browser""" - print( - f"\n{INFO}========== Testing Page Operations with Builtin Browser =========={RESET}" - ) - - # Step 1: Get a single page - print(f"\n{INFO}1. Getting a single page{RESET}") - try: - crawler_config = CrawlerRunConfig() - page, context = await manager.get_page(crawler_config) - print(f"{SUCCESS}Got page successfully{RESET}") - - # Navigate to a test URL - await page.goto("https://example.com") - title = await page.title() - print(f"{SUCCESS}Page title: {title}{RESET}") - - # Close the page - await page.close() - print(f"{SUCCESS}Page closed successfully{RESET}") - except Exception as e: - print(f"{ERROR}Page operation failed: {str(e)}{RESET}") - return False - - # Step 2: Get multiple pages - print(f"\n{INFO}2. Getting multiple pages with get_pages(){RESET}") - try: - # Request 3 pages - crawler_config = CrawlerRunConfig() - pages = await manager.get_pages(crawler_config, count=3) - print(f"{SUCCESS}Got {len(pages)} pages{RESET}") - - # Test each page - for i, (page, context) in enumerate(pages): - await page.goto(f"https://example.com?test={i}") - title = await page.title() - print(f"{SUCCESS}Page {i + 1} title: {title}{RESET}") - await page.close() - - print(f"{SUCCESS}All pages tested and closed successfully{RESET}") - except Exception as e: - print(f"{ERROR}Multiple page operation failed: {str(e)}{RESET}") - return False - - return True - - -async def test_browser_status_management(manager: BrowserManager): - """Test browser status and management operations""" - print(f"\n{INFO}========== Testing Browser Status and Management =========={RESET}") - - # Step 1: Get browser status - print(f"\n{INFO}1. Getting browser status{RESET}") - try: - status = await manager.strategy.get_builtin_browser_status() - print(f"{SUCCESS}Browser status:{RESET}") - print(f" Running: {status['running']}") - print(f" CDP URL: {status['cdp_url']}") - except Exception as e: - print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}") - return False - - # Step 2: Test killing the browser - print(f"\n{INFO}2. Testing killing the browser{RESET}") - try: - result = await manager.strategy.kill_builtin_browser() - if result: - print(f"{SUCCESS}Browser killed successfully{RESET}") - else: - print(f"{ERROR}Failed to kill browser{RESET}") - except Exception as e: - print(f"{ERROR}Browser kill operation failed: {str(e)}{RESET}") - return False - - # Step 3: Check status after kill - print(f"\n{INFO}3. Checking status after kill{RESET}") - try: - status = await manager.strategy.get_builtin_browser_status() - if not status["running"]: - print(f"{SUCCESS}Browser is correctly reported as not running{RESET}") - else: - print(f"{ERROR}Browser is incorrectly reported as still running{RESET}") - except Exception as e: - print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}") - return False - - # Step 4: Launch a new browser - print(f"\n{INFO}4. 
Launching a new browser{RESET}") - try: - cdp_url = await manager.strategy.launch_builtin_browser( - browser_type="chromium", headless=True - ) - if cdp_url: - print(f"{SUCCESS}New browser launched at: {cdp_url}{RESET}") - else: - print(f"{ERROR}Failed to launch new browser{RESET}") - return False - except Exception as e: - print(f"{ERROR}Browser launch failed: {str(e)}{RESET}") - return False - - return True - - -async def test_multiple_managers(): - """Test creating multiple BrowserManagers that use the same builtin browser""" - print(f"\n{INFO}========== Testing Multiple Browser Managers =========={RESET}") - - # Step 1: Create first manager - print(f"\n{INFO}1. Creating first browser manager{RESET}") - browser_config1 = BrowserConfig(browser_mode="builtin", headless=True) - manager1 = BrowserManager(browser_config=browser_config1, logger=logger) - - # Step 2: Create second manager - print(f"\n{INFO}2. Creating second browser manager{RESET}") - browser_config2 = BrowserConfig(browser_mode="builtin", headless=True) - manager2 = BrowserManager(browser_config=browser_config2, logger=logger) - - # Step 3: Start both managers (should connect to the same builtin browser) - print(f"\n{INFO}3. Starting both managers{RESET}") - try: - await manager1.start() - print(f"{SUCCESS}First manager started{RESET}") - - await manager2.start() - print(f"{SUCCESS}Second manager started{RESET}") - - # Check if they got the same CDP URL - cdp_url1 = manager1.strategy.config.cdp_url - cdp_url2 = manager2.strategy.config.cdp_url - - if cdp_url1 == cdp_url2: - print( - f"{SUCCESS}Both managers connected to the same browser: {cdp_url1}{RESET}" - ) - else: - print( - f"{WARNING}Managers connected to different browsers: {cdp_url1} and {cdp_url2}{RESET}" - ) - except Exception as e: - print(f"{ERROR}Failed to start managers: {str(e)}{RESET}") - return False - - # Step 4: Test using both managers - print(f"\n{INFO}4. Testing operations with both managers{RESET}") - try: - # First manager creates a page - page1, ctx1 = await manager1.get_page(CrawlerRunConfig()) - await page1.goto("https://example.com") - title1 = await page1.title() - print(f"{SUCCESS}Manager 1 page title: {title1}{RESET}") - - # Second manager creates a page - page2, ctx2 = await manager2.get_page(CrawlerRunConfig()) - await page2.goto("https://example.org") - title2 = await page2.title() - print(f"{SUCCESS}Manager 2 page title: {title2}{RESET}") - - # Clean up - await page1.close() - await page2.close() - except Exception as e: - print(f"{ERROR}Failed to use both managers: {str(e)}{RESET}") - return False - - # Step 5: Close both managers - print(f"\n{INFO}5. Closing both managers{RESET}") - try: - await manager1.close() - print(f"{SUCCESS}First manager closed{RESET}") - - await manager2.close() - print(f"{SUCCESS}Second manager closed{RESET}") - except Exception as e: - print(f"{ERROR}Failed to close managers: {str(e)}{RESET}") - return False - - return True - - -async def test_edge_cases(): - """Test edge cases like multiple starts, killing browser during operations, etc.""" - print(f"\n{INFO}========== Testing Edge Cases =========={RESET}") - - # Step 1: Test multiple starts with the same manager - print(f"\n{INFO}1. 
Testing multiple starts with the same manager{RESET}") - browser_config = BrowserConfig(browser_mode="builtin", headless=True) - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - await manager.start() - print(f"{SUCCESS}First start successful{RESET}") - - # Try to start again - await manager.start() - print(f"{SUCCESS}Second start completed without errors{RESET}") - - # Test if it's still functional - page, context = await manager.get_page(CrawlerRunConfig()) - await page.goto("https://example.com") - title = await page.title() - print( - f"{SUCCESS}Page operations work after multiple starts. Title: {title}{RESET}" - ) - await page.close() - except Exception as e: - print(f"{ERROR}Multiple starts test failed: {str(e)}{RESET}") - return False - finally: - await manager.close() - - # Step 2: Test killing the browser while manager is active - print(f"\n{INFO}2. Testing killing the browser while manager is active{RESET}") - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - await manager.start() - print(f"{SUCCESS}Manager started{RESET}") - - # Kill the browser directly - print(f"{INFO}Killing the browser...{RESET}") - await manager.strategy.kill_builtin_browser() - print(f"{SUCCESS}Browser killed{RESET}") - - # Try to get a page (should fail or launch a new browser) - try: - page, context = await manager.get_page(CrawlerRunConfig()) - print( - f"{WARNING}Page request succeeded despite killed browser (might have auto-restarted){RESET}" - ) - title = await page.title() - print(f"{SUCCESS}Got page title: {title}{RESET}") - await page.close() - except Exception as e: - print( - f"{SUCCESS}Page request failed as expected after browser was killed: {str(e)}{RESET}" - ) - except Exception as e: - print(f"{ERROR}Kill during operation test failed: {str(e)}{RESET}") - return False - finally: - await manager.close() - - return True - - -async def cleanup_browsers(): - """Clean up any remaining builtin browsers""" - print(f"\n{INFO}========== Cleaning Up Builtin Browsers =========={RESET}") - - browser_config = BrowserConfig(browser_mode="builtin", headless=True) - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - # No need to start, just access the strategy directly - strategy = manager.strategy - if isinstance(strategy, BuiltinBrowserStrategy): - result = await strategy.kill_builtin_browser() - if result: - print(f"{SUCCESS}Successfully killed all builtin browsers{RESET}") - else: - print(f"{WARNING}No builtin browsers found to kill{RESET}") - else: - print(f"{ERROR}Wrong strategy type: {strategy.__class__.__name__}{RESET}") - except Exception as e: - print(f"{ERROR}Cleanup failed: {str(e)}{RESET}") - finally: - # Just to be safe - try: - await manager.close() - except: - pass - - -async def test_performance_scaling(): - """Test performance with multiple browsers and pages. - - This test creates multiple browsers on different ports, - spawns multiple pages per browser, and measures performance metrics. 
- """ - print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}") - - # Configuration parameters - num_browsers = 10 - pages_per_browser = 10 - total_pages = num_browsers * pages_per_browser - base_port = 9222 - - # Set up a measuring mechanism for memory - import psutil - import gc - - # Force garbage collection before starting - gc.collect() - process = psutil.Process() - initial_memory = process.memory_info().rss / 1024 / 1024 # in MB - peak_memory = initial_memory - - # Report initial configuration - print( - f"{INFO}Test configuration: {num_browsers} browsers ร— {pages_per_browser} pages = {total_pages} total crawls{RESET}" - ) - - # List to track managers - managers: List[BrowserManager] = [] - all_pages = [] - - - - # Get crawl4ai home directory - crawl4ai_home = os.path.expanduser("~/.crawl4ai") - temp_dir = os.path.join(crawl4ai_home, "temp") - os.makedirs(temp_dir, exist_ok=True) - - # Create all managers but don't start them yet - manager_configs = [] - for i in range(num_browsers): - port = base_port + i - browser_config = BrowserConfig( - browser_mode="builtin", - headless=True, - debugging_port=port, - user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"), - ) - manager = BrowserManager(browser_config=browser_config, logger=logger) - manager.strategy.shutting_down = True - manager_configs.append((manager, i, port)) - - # Define async function to start a single manager - async def start_manager(manager, index, port): - try: - await manager.start() - return manager - except Exception as e: - print( - f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}" - ) - return None - - # Start all managers in parallel - start_tasks = [ - start_manager(manager, i, port) for manager, i, port in manager_configs - ] - started_managers = await asyncio.gather(*start_tasks) - - # Filter out None values (failed starts) and add to managers list - managers = [m for m in started_managers if m is not None] - - if len(managers) == 0: - print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}") - return False - - if len(managers) < num_browsers: - print( - f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}" - ) - - # Create pages for each browser - for i, manager in enumerate(managers): - try: - pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser) - all_pages.extend(pages) - except Exception as e: - print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}") - - # Check memory after page creation - gc.collect() - current_memory = process.memory_info().rss / 1024 / 1024 - peak_memory = max(peak_memory, current_memory) - - # Ask for confirmation before loading - confirmation = input( - f"{WARNING}Do you want to proceed with loading pages? 
(y/n): {RESET}" - ) - # Step 1: Create and start multiple browser managers in parallel - start_time = time.time() - - if confirmation.lower() == "y": - load_start_time = time.time() - - # Function to load a single page - async def load_page(page_ctx, index): - page, _ = page_ctx - try: - await page.goto(f"https://example.com/page{index}", timeout=30000) - title = await page.title() - return title - except Exception as e: - return f"Error: {str(e)}" - - # Load all pages concurrently - load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)] - load_results = await asyncio.gather(*load_tasks, return_exceptions=True) - - # Count successes and failures - successes = sum( - 1 for r in load_results if isinstance(r, str) and not r.startswith("Error") - ) - failures = len(load_results) - successes - - load_time = time.time() - load_start_time - total_test_time = time.time() - start_time - - # Check memory after loading (peak memory) - gc.collect() - current_memory = process.memory_info().rss / 1024 / 1024 - peak_memory = max(peak_memory, current_memory) - - # Calculate key metrics - memory_per_page = peak_memory / successes if successes > 0 else 0 - time_per_crawl = total_test_time / successes if successes > 0 else 0 - crawls_per_second = successes / total_test_time if total_test_time > 0 else 0 - crawls_per_minute = crawls_per_second * 60 - crawls_per_hour = crawls_per_minute * 60 - - # Print simplified performance summary - from rich.console import Console - from rich.table import Table - - console = Console() - - # Create a simple summary table - table = Table(title="CRAWL4AI PERFORMANCE SUMMARY") - - table.add_column("Metric", style="cyan") - table.add_column("Value", style="green") - - table.add_row("Total Crawls Completed", f"{successes}") - table.add_row("Total Time", f"{total_test_time:.2f} seconds") - table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds") - table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second") - table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls") - table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls") - table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB") - table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB") - - # Display the table - console.print(table) - - # Ask confirmation before cleanup - confirmation = input( - f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}" - ) - if confirmation.lower() != "y": - print(f"{WARNING}Cleanup aborted by user{RESET}") - return False - - # Close all pages - for page, _ in all_pages: - try: - await page.close() - except: - pass - - # Close all managers - for manager in managers: - try: - await manager.close() - except: - pass - - # Remove the temp directory - import shutil - - if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) - - return True - - -async def test_performance_scaling_lab( num_browsers: int = 10, pages_per_browser: int = 10): - """Test performance with multiple browsers and pages. - - This test creates multiple browsers on different ports, - spawns multiple pages per browser, and measures performance metrics. 
- """ - print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}") - - # Configuration parameters - num_browsers = num_browsers - pages_per_browser = pages_per_browser - total_pages = num_browsers * pages_per_browser - base_port = 9222 - - # Set up a measuring mechanism for memory - import psutil - import gc - - # Force garbage collection before starting - gc.collect() - process = psutil.Process() - initial_memory = process.memory_info().rss / 1024 / 1024 # in MB - peak_memory = initial_memory - - # Report initial configuration - print( - f"{INFO}Test configuration: {num_browsers} browsers ร— {pages_per_browser} pages = {total_pages} total crawls{RESET}" - ) - - # List to track managers - managers: List[BrowserManager] = [] - all_pages = [] - - # Get crawl4ai home directory - crawl4ai_home = os.path.expanduser("~/.crawl4ai") - temp_dir = os.path.join(crawl4ai_home, "temp") - os.makedirs(temp_dir, exist_ok=True) - - # Create all managers but don't start them yet - manager_configs = [] - for i in range(num_browsers): - port = base_port + i - browser_config = BrowserConfig( - browser_mode="builtin", - headless=True, - debugging_port=port, - user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"), - ) - manager = BrowserManager(browser_config=browser_config, logger=logger) - manager.strategy.shutting_down = True - manager_configs.append((manager, i, port)) - - # Define async function to start a single manager - async def start_manager(manager, index, port): - try: - await manager.start() - return manager - except Exception as e: - print( - f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}" - ) - return None - - # Start all managers in parallel - start_tasks = [ - start_manager(manager, i, port) for manager, i, port in manager_configs - ] - started_managers = await asyncio.gather(*start_tasks) - - # Filter out None values (failed starts) and add to managers list - managers = [m for m in started_managers if m is not None] - - if len(managers) == 0: - print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}") - return False - - if len(managers) < num_browsers: - print( - f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}" - ) - - # Create pages for each browser - for i, manager in enumerate(managers): - try: - pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser) - all_pages.extend(pages) - except Exception as e: - print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}") - - # Check memory after page creation - gc.collect() - current_memory = process.memory_info().rss / 1024 / 1024 - peak_memory = max(peak_memory, current_memory) - - # Ask for confirmation before loading - confirmation = input( - f"{WARNING}Do you want to proceed with loading pages? 
(y/n): {RESET}" - ) - # Step 1: Create and start multiple browser managers in parallel - start_time = time.time() - - if confirmation.lower() == "y": - load_start_time = time.time() - - # Function to load a single page - async def load_page(page_ctx, index): - page, _ = page_ctx - try: - await page.goto(f"https://example.com/page{index}", timeout=30000) - title = await page.title() - return title - except Exception as e: - return f"Error: {str(e)}" - - # Load all pages concurrently - load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)] - load_results = await asyncio.gather(*load_tasks, return_exceptions=True) - - # Count successes and failures - successes = sum( - 1 for r in load_results if isinstance(r, str) and not r.startswith("Error") - ) - failures = len(load_results) - successes - - load_time = time.time() - load_start_time - total_test_time = time.time() - start_time - - # Check memory after loading (peak memory) - gc.collect() - current_memory = process.memory_info().rss / 1024 / 1024 - peak_memory = max(peak_memory, current_memory) - - # Calculate key metrics - memory_per_page = peak_memory / successes if successes > 0 else 0 - time_per_crawl = total_test_time / successes if successes > 0 else 0 - crawls_per_second = successes / total_test_time if total_test_time > 0 else 0 - crawls_per_minute = crawls_per_second * 60 - crawls_per_hour = crawls_per_minute * 60 - - # Print simplified performance summary - from rich.console import Console - from rich.table import Table - - console = Console() - - # Create a simple summary table - table = Table(title="CRAWL4AI PERFORMANCE SUMMARY") - - table.add_column("Metric", style="cyan") - table.add_column("Value", style="green") - - table.add_row("Total Crawls Completed", f"{successes}") - table.add_row("Total Time", f"{total_test_time:.2f} seconds") - table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds") - table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second") - table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls") - table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls") - table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB") - table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB") - - # Display the table - console.print(table) - - # Ask confirmation before cleanup - confirmation = input( - f"{WARNING}Do you want to proceed with cleanup? 
(y/n): {RESET}" - ) - if confirmation.lower() != "y": - print(f"{WARNING}Cleanup aborted by user{RESET}") - return False - - # Close all pages - for page, _ in all_pages: - try: - await page.close() - except: - pass - - # Close all managers - for manager in managers: - try: - await manager.close() - except: - pass - - # Remove the temp directory - import shutil - - if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) - - return True - - - -async def main(): - """Run all tests""" - try: - print(f"{INFO}Starting builtin browser tests with browser module{RESET}") - - # # Run browser creation test - # manager, cdp_url = await test_builtin_browser_creation() - # if not manager: - # print(f"{ERROR}Browser creation failed, cannot continue tests{RESET}") - # return - - # # Run page operations test - # await test_page_operations(manager) - - # # Run browser status and management test - # await test_browser_status_management(manager) - - # # Close manager before multiple manager test - # await manager.close() - - # Run multiple managers test - await test_multiple_managers() - - # Run performance scaling test - await test_performance_scaling() - - # Run cleanup test - await cleanup_browsers() - - # Run edge cases test - await test_edge_cases() - - print(f"\n{SUCCESS}All tests completed!{RESET}") - - except Exception as e: - print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}") - import traceback - - traceback.print_exc() - finally: - # Clean up: kill any remaining builtin browsers - await cleanup_browsers() - print(f"{SUCCESS}Test cleanup complete{RESET}") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/browser/test_builtin_strategy.py b/tests/browser/test_builtin_strategy.py deleted file mode 100644 index 7c435b3de..000000000 --- a/tests/browser/test_builtin_strategy.py +++ /dev/null @@ -1,160 +0,0 @@ -"""Test examples for BuiltinBrowserStrategy. - -These examples demonstrate the functionality of BuiltinBrowserStrategy -and serve as functional tests. 
-""" - -import asyncio -import os -import sys - -# Add the project root to Python path if running directly -if __name__ == "__main__": - sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) - -from crawl4ai.browser import BrowserManager -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.async_logger import AsyncLogger - -# Create a logger for clear terminal output -logger = AsyncLogger(verbose=True, log_file=None) - -async def test_builtin_browser(): - """Test using a builtin browser that persists between sessions.""" - logger.info("Testing builtin browser", tag="TEST") - - browser_config = BrowserConfig( - browser_mode="builtin", - headless=True - ) - - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - # Start should connect to existing builtin browser or create one - await manager.start() - logger.info("Connected to builtin browser", tag="TEST") - - # Test page creation - crawler_config = CrawlerRunConfig() - page, context = await manager.get_page(crawler_config) - - # Test navigation - await page.goto("https://example.com") - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Close manager (should not close the builtin browser) - await manager.close() - logger.info("First session closed", tag="TEST") - - # Create a second manager to verify browser persistence - logger.info("Creating second session to verify persistence", tag="TEST") - manager2 = BrowserManager(browser_config=browser_config, logger=logger) - - await manager2.start() - logger.info("Connected to existing builtin browser", tag="TEST") - - page2, context2 = await manager2.get_page(crawler_config) - await page2.goto("https://example.org") - title2 = await page2.title() - logger.info(f"Second session page title: {title2}", tag="TEST") - - await manager2.close() - logger.info("Second session closed successfully", tag="TEST") - - return True - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - try: - await manager.close() - except: - pass - return False - -async def test_builtin_browser_status(): - """Test getting status of the builtin browser.""" - logger.info("Testing builtin browser status", tag="TEST") - - from crawl4ai.browser.strategies import BuiltinBrowserStrategy - - browser_config = BrowserConfig( - browser_mode="builtin", - headless=True - ) - - # Create strategy directly to access its status methods - strategy = BuiltinBrowserStrategy(browser_config, logger) - - try: - # Get status before starting (should be not running) - status_before = await strategy.get_builtin_browser_status() - logger.info(f"Initial status: {status_before}", tag="TEST") - - # Start the browser - await strategy.start() - logger.info("Browser started successfully", tag="TEST") - - # Get status after starting - status_after = await strategy.get_builtin_browser_status() - logger.info(f"Status after start: {status_after}", tag="TEST") - - # Create a page to verify functionality - crawler_config = CrawlerRunConfig() - page, context = await strategy.get_page(crawler_config) - await page.goto("https://example.com") - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Close strategy (should not kill the builtin browser) - await strategy.close() - logger.info("Strategy closed successfully", tag="TEST") - - # Create a new strategy object - strategy2 = BuiltinBrowserStrategy(browser_config, logger) - - # Get status again (should still be running) - status_final = await 
strategy2.get_builtin_browser_status() - logger.info(f"Final status: {status_final}", tag="TEST") - - # Verify that the status shows the browser is running - is_running = status_final.get('running', False) - logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST") - - # Kill the builtin browser to clean up - logger.info("Killing builtin browser", tag="TEST") - success = await strategy2.kill_builtin_browser() - logger.info(f"Killed builtin browser successfully: {success}", tag="TEST") - - return is_running and success - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - try: - await strategy.close() - - # Try to kill the builtin browser to clean up - strategy2 = BuiltinBrowserStrategy(browser_config, logger) - await strategy2.kill_builtin_browser() - except: - pass - return False - -async def run_tests(): - """Run all tests sequentially.""" - results = [] - - results.append(await test_builtin_browser()) - results.append(await test_builtin_browser_status()) - - # Print summary - total = len(results) - passed = sum(results) - logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") - - if passed == total: - logger.success("All tests passed!", tag="SUMMARY") - else: - logger.error(f"{total - passed} tests failed", tag="SUMMARY") - -if __name__ == "__main__": - asyncio.run(run_tests()) diff --git a/tests/browser/test_cdp_strategy.py b/tests/browser/test_cdp_strategy.py deleted file mode 100644 index 1df089a54..000000000 --- a/tests/browser/test_cdp_strategy.py +++ /dev/null @@ -1,228 +0,0 @@ -"""Test examples for CDPBrowserStrategy. - -These examples demonstrate the functionality of CDPBrowserStrategy -and serve as functional tests. -""" - -import asyncio -import os -import sys - -# Add the project root to Python path if running directly -if __name__ == "__main__": - sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) - -from crawl4ai.browser import BrowserManager -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.async_logger import AsyncLogger - -# Create a logger for clear terminal output -logger = AsyncLogger(verbose=True, log_file=None) - -async def test_cdp_launch_connect(): - """Test launching a browser and connecting via CDP.""" - logger.info("Testing launch and connect via CDP", tag="TEST") - - browser_config = BrowserConfig( - use_managed_browser=True, - browser_mode="cdp", - headless=True - ) - - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - await manager.start() - logger.info("Browser launched and connected via CDP", tag="TEST") - - # Test with multiple pages - pages = [] - for i in range(3): - crawler_config = CrawlerRunConfig() - page, context = await manager.get_page(crawler_config) - await page.goto(f"https://example.com?test={i}") - pages.append(page) - logger.info(f"Created page {i+1}", tag="TEST") - - # Verify all pages are working - for i, page in enumerate(pages): - title = await page.title() - logger.info(f"Page {i+1} title: {title}", tag="TEST") - - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - return True - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - try: - await manager.close() - except: - pass - return False - -async def test_cdp_with_user_data_dir(): - """Test CDP browser with a user data directory.""" - logger.info("Testing CDP browser with user data directory", tag="TEST") - - # Create a temporary user data directory - import 
tempfile - user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-") - logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST") - - browser_config = BrowserConfig( - headless=True, - browser_mode="cdp", - user_data_dir=user_data_dir - ) - - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - await manager.start() - logger.info("Browser launched with user data directory", tag="TEST") - - # Navigate to a page and store some data - crawler_config = CrawlerRunConfig() - page, context = await manager.get_page(crawler_config) - - # Set a cookie - await context.add_cookies([{ - "name": "test_cookie", - "value": "test_value", - "url": "https://example.com" - }]) - - # Visit the site - await page.goto("https://example.com") - - # Verify cookie was set - cookies = await context.cookies(["https://example.com"]) - has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies) - logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST") - - # Close the browser - await manager.close() - logger.info("First browser session closed", tag="TEST") - - # Start a new browser with the same user data directory - logger.info("Starting second browser session with same user data directory", tag="TEST") - manager2 = BrowserManager(browser_config=browser_config, logger=logger) - await manager2.start() - - # Get a new page and check if the cookie persists - page2, context2 = await manager2.get_page(crawler_config) - await page2.goto("https://example.com") - - # Verify cookie persisted - cookies2 = await context2.cookies(["https://example.com"]) - has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2) - logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST") - - # Clean up - await manager2.close() - - # Remove temporary directory - import shutil - shutil.rmtree(user_data_dir, ignore_errors=True) - logger.info(f"Removed temporary user data directory", tag="TEST") - - return has_test_cookie and has_test_cookie2 - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - try: - await manager.close() - except: - pass - - # Clean up temporary directory - try: - import shutil - shutil.rmtree(user_data_dir, ignore_errors=True) - except: - pass - - return False - -async def test_cdp_session_management(): - """Test session management with CDP browser.""" - logger.info("Testing session management with CDP browser", tag="TEST") - - browser_config = BrowserConfig( - use_managed_browser=True, - headless=True - ) - - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - await manager.start() - logger.info("Browser launched successfully", tag="TEST") - - # Create two sessions - session1_id = "test_session_1" - session2_id = "test_session_2" - - # Set up first session - crawler_config1 = CrawlerRunConfig(session_id=session1_id) - page1, context1 = await manager.get_page(crawler_config1) - await page1.goto("https://example.com") - await page1.evaluate("localStorage.setItem('session1_data', 'test_value')") - logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST") - - # Set up second session - crawler_config2 = CrawlerRunConfig(session_id=session2_id) - page2, context2 = await manager.get_page(crawler_config2) - await page2.goto("https://example.org") - await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')") - logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST") - - # Get first session again - page1_again, _ = await 
manager.get_page(crawler_config1) - - # Verify it's the same page and data persists - is_same_page = page1 == page1_again - data1 = await page1_again.evaluate("localStorage.getItem('session1_data')") - logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST") - - # Kill first session - await manager.kill_session(session1_id) - logger.info(f"Killed session 1", tag="TEST") - - # Verify second session still works - data2 = await page2.evaluate("localStorage.getItem('session2_data')") - logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - return is_same_page and data1 == "test_value" and data2 == "test_value2" - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - try: - await manager.close() - except: - pass - return False - -async def run_tests(): - """Run all tests sequentially.""" - results = [] - - # results.append(await test_cdp_launch_connect()) - results.append(await test_cdp_with_user_data_dir()) - results.append(await test_cdp_session_management()) - - # Print summary - total = len(results) - passed = sum(results) - logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") - - if passed == total: - logger.success("All tests passed!", tag="SUMMARY") - else: - logger.error(f"{total - passed} tests failed", tag="SUMMARY") - -if __name__ == "__main__": - asyncio.run(run_tests()) diff --git a/tests/browser/test_parallel_crawling.py b/tests/browser/test_parallel_crawling.py deleted file mode 100644 index 9e72f06e3..000000000 --- a/tests/browser/test_parallel_crawling.py +++ /dev/null @@ -1,902 +0,0 @@ -""" -Test examples for parallel crawling with the browser module. - -These examples demonstrate the functionality of parallel page creation -and serve as functional tests for multi-page crawling performance. 
-""" - -import asyncio -import os -import sys -import time -from typing import List - -# Add the project root to Python path if running directly -if __name__ == "__main__": - sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) - -from crawl4ai.browser import BrowserManager -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.async_logger import AsyncLogger - -# Create a logger for clear terminal output -logger = AsyncLogger(verbose=True, log_file=None) - -async def test_get_pages_basic(): - """Test basic functionality of get_pages method.""" - logger.info("Testing basic get_pages functionality", tag="TEST") - - browser_config = BrowserConfig(headless=True) - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - await manager.start() - - # Request 3 pages - crawler_config = CrawlerRunConfig() - pages = await manager.get_pages(crawler_config, count=3) - - # Verify we got the correct number of pages - assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}" - - # Verify each page is valid - for i, (page, context) in enumerate(pages): - await page.goto("https://example.com") - title = await page.title() - logger.info(f"Page {i+1} title: {title}", tag="TEST") - assert title, f"Page {i+1} has no title" - - await manager.close() - logger.success("Basic get_pages test completed successfully", tag="TEST") - return True - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - try: - await manager.close() - except: - pass - return False - -async def test_parallel_approaches_comparison(): - """Compare two parallel crawling approaches: - 1. Create a page for each URL on-demand (get_page + gather) - 2. Get all pages upfront with get_pages, then use them (get_pages + gather) - """ - logger.info("Comparing different parallel crawling approaches", tag="TEST") - - urls = [ - "https://example.com/page1", - "https://crawl4ai.com", - "https://kidocode.com", - "https://bbc.com", - # "https://example.com/page1", - # "https://example.com/page2", - # "https://example.com/page3", - # "https://example.com/page4", - ] - - browser_config = BrowserConfig(headless=False) - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - await manager.start() - - # Approach 1: Create a page for each URL on-demand and run in parallel - logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST") - start_time = time.time() - - async def fetch_title_approach1(url): - """Create a new page for each URL, go to the URL, and get title""" - crawler_config = CrawlerRunConfig(url=url) - page, context = await manager.get_page(crawler_config) - try: - await page.goto(url) - title = await page.title() - return title - finally: - await page.close() - - # Run fetch_title_approach1 for each URL in parallel - tasks = [fetch_title_approach1(url) for url in urls] - approach1_results = await asyncio.gather(*tasks) - - approach1_time = time.time() - start_time - logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST") - - # Approach 2: Get all pages upfront with get_pages, then use them in parallel - logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST") - start_time = time.time() - - # Get all pages upfront - crawler_config = CrawlerRunConfig() - pages = await manager.get_pages(crawler_config, count=len(urls)) - - async def fetch_title_approach2(page_ctx, url): - """Use a pre-created page to go to URL and get title""" - page, _ = page_ctx - 
try: - await page.goto(url) - title = await page.title() - return title - finally: - await page.close() - - # Use the pre-created pages to fetch titles in parallel - tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls)] - approach2_results = await asyncio.gather(*tasks) - - approach2_time = time.time() - start_time - logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST") - - # Compare results and performance - speedup = approach1_time / approach2_time if approach2_time > 0 else 0 - if speedup > 1: - logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST") - else: - logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST") - - # Verify same content was retrieved in both approaches - assert len(approach1_results) == len(approach2_results), "Result count mismatch" - - # Sort results for comparison since parallel execution might complete in different order - assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch" - - await manager.close() - return True - - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - try: - await manager.close() - except: - pass - return False - -async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5): - """Test performance with multiple browsers and pages per browser. - Compares two approaches: - 1. On-demand page creation (get_page + gather) - 2. Pre-created pages (get_pages + gather) - """ - logger.info(f"Testing multi-browser scaling with {num_browsers} browsers ร— {pages_per_browser} pages", tag="TEST") - - # Generate test URLs - total_pages = num_browsers * pages_per_browser - urls = [f"https://example.com/page_{i}" for i in range(total_pages)] - - # Create browser managers - managers = [] - base_port = 9222 - - try: - # Start all browsers in parallel - start_tasks = [] - for i in range(num_browsers): - browser_config = BrowserConfig( - headless=True # Using default browser mode like in test_parallel_approaches_comparison - ) - manager = BrowserManager(browser_config=browser_config, logger=logger) - start_tasks.append(manager.start()) - managers.append(manager) - - await asyncio.gather(*start_tasks) - - # Distribute URLs among managers - urls_per_manager = {} - for i, manager in enumerate(managers): - start_idx = i * pages_per_browser - end_idx = min(start_idx + pages_per_browser, len(urls)) - urls_per_manager[manager] = urls[start_idx:end_idx] - - # Approach 1: Create a page for each URL on-demand and run in parallel - logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST") - start_time = time.time() - - async def fetch_title_approach1(manager, url): - """Create a new page for the URL, go to the URL, and get title""" - crawler_config = CrawlerRunConfig(url=url) - page, context = await manager.get_page(crawler_config) - try: - await page.goto(url) - title = await page.title() - return title - finally: - await page.close() - - # Run fetch_title_approach1 for each URL in parallel - tasks = [] - for manager, manager_urls in urls_per_manager.items(): - for url in manager_urls: - tasks.append(fetch_title_approach1(manager, url)) - - approach1_results = await asyncio.gather(*tasks) - - approach1_time = time.time() - start_time - logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST") - - # Approach 2: Get all pages upfront with get_pages, then use them in parallel - logger.info("Testing approach 2: get_pages upfront + 
gather", tag="TEST") - start_time = time.time() - - # Get all pages upfront for each manager - all_pages = [] - for manager, manager_urls in urls_per_manager.items(): - crawler_config = CrawlerRunConfig() - pages = await manager.get_pages(crawler_config, count=len(manager_urls)) - all_pages.extend(zip(pages, manager_urls)) - - async def fetch_title_approach2(page_ctx, url): - """Use a pre-created page to go to URL and get title""" - page, _ = page_ctx - try: - await page.goto(url) - title = await page.title() - return title - finally: - await page.close() - - # Use the pre-created pages to fetch titles in parallel - tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages] - approach2_results = await asyncio.gather(*tasks) - - approach2_time = time.time() - start_time - logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST") - - # Compare results and performance - speedup = approach1_time / approach2_time if approach2_time > 0 else 0 - pages_per_second = total_pages / approach2_time - - # Show a simple summary - logger.info(f"๐Ÿ“Š Summary: {num_browsers} browsers ร— {pages_per_browser} pages = {total_pages} total crawls", tag="TEST") - logger.info(f"โšก Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST") - logger.info(f"๐Ÿš€ Total crawl time: {approach2_time:.2f} seconds", tag="TEST") - - if speedup > 1: - logger.success(f"โœ… Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST") - else: - logger.info(f"โœ… Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST") - - # Close all managers - for manager in managers: - await manager.close() - - return True - - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Clean up - for manager in managers: - try: - await manager.close() - except: - pass - return False - -async def grid_search_optimal_configuration(total_urls=50): - """Perform a grid search to find the optimal balance between number of browsers and pages per browser. - - This function tests different combinations of browser count and pages per browser, - while keeping the total number of URLs constant. It measures performance metrics - for each configuration to find the "sweet spot" that provides the best speed - with reasonable memory usage. 
- - Args: - total_urls: Total number of URLs to crawl (default: 50) - """ - logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST") - - # Generate test URLs once - urls = [f"https://example.com/page_{i}" for i in range(total_urls)] - - # Define grid search configurations - # We'll use more flexible approach: test all browser counts from 1 to min(20, total_urls) - # and distribute pages evenly (some browsers may have 1 more page than others) - configurations = [] - - # Maximum number of browsers to test - max_browsers_to_test = min(20, total_urls) - - # Try configurations with 1 to max_browsers_to_test browsers - for num_browsers in range(1, max_browsers_to_test + 1): - base_pages_per_browser = total_urls // num_browsers - remainder = total_urls % num_browsers - - # Generate exact page distribution array - if remainder > 0: - # First 'remainder' browsers get one more page - page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder) - pages_distribution = f"{base_pages_per_browser+1} pages ร— {remainder} browsers, {base_pages_per_browser} pages ร— {num_browsers - remainder} browsers" - else: - # All browsers get the same number of pages - page_distribution = [base_pages_per_browser] * num_browsers - pages_distribution = f"{base_pages_per_browser} pages ร— {num_browsers} browsers" - - # Format the distribution as a tuple string like (4, 4, 3, 3) - distribution_str = str(tuple(page_distribution)) - - configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str)) - - # Track results - results = [] - - # Test each configuration - for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations: - logger.info("-" * 80, tag="TEST") - logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST") - logger.info(f"Details: {pages_distribution}", tag="TEST") - # Sleep a bit for randomness - await asyncio.sleep(0.5) - - try: - # Import psutil for memory tracking - try: - import psutil - process = psutil.Process() - initial_memory = process.memory_info().rss / (1024 * 1024) # MB - except ImportError: - logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST") - initial_memory = 0 - - # Create and start browser managers - managers = [] - start_time = time.time() - - # Start all browsers in parallel - start_tasks = [] - for i in range(num_browsers): - browser_config = BrowserConfig( - headless=True - ) - manager = BrowserManager(browser_config=browser_config, logger=logger) - start_tasks.append(manager.start()) - managers.append(manager) - - await asyncio.gather(*start_tasks) - browser_startup_time = time.time() - start_time - - # Measure memory after browser startup - if initial_memory > 0: - browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory - else: - browser_memory = 0 - - # Distribute URLs among managers using the exact page distribution - urls_per_manager = {} - total_assigned = 0 - - for i, manager in enumerate(managers): - if i < len(page_distribution): - # Get the exact number of pages for this browser from our distribution - manager_pages = page_distribution[i] - - # Get the URL slice for this manager - start_idx = total_assigned - end_idx = start_idx + manager_pages - urls_per_manager[manager] = urls[start_idx:end_idx] - total_assigned += manager_pages - else: - # If we have more managers 
than our distribution (should never happen) - urls_per_manager[manager] = [] - - # Use the more efficient approach (pre-created pages) - logger.info("Running page crawling test...", tag="TEST") - crawl_start_time = time.time() - - # Get all pages upfront for each manager - all_pages = [] - for manager, manager_urls in urls_per_manager.items(): - if not manager_urls: # Skip managers with no URLs - continue - crawler_config = CrawlerRunConfig() - pages = await manager.get_pages(crawler_config, count=len(manager_urls)) - all_pages.extend(zip(pages, manager_urls)) - - # Measure memory after page creation - if initial_memory > 0: - pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory - else: - pages_memory = 0 - - # Function to crawl a URL with a pre-created page - async def fetch_title(page_ctx, url): - page, _ = page_ctx - try: - await page.goto(url) - title = await page.title() - return title - finally: - await page.close() - - # Use the pre-created pages to fetch titles in parallel - tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages] - crawl_results = await asyncio.gather(*tasks) - - crawl_time = time.time() - crawl_start_time - total_time = time.time() - start_time - - # Final memory measurement - if initial_memory > 0: - peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory) - else: - peak_memory = 0 - - # Close all managers - for manager in managers: - await manager.close() - - # Calculate metrics - pages_per_second = total_urls / crawl_time - - # Store result metrics - result = { - "num_browsers": num_browsers, - "pages_per_browser": pages_per_browser, - "page_distribution": page_distribution, - "distribution_str": distribution_str, - "total_urls": total_urls, - "browser_startup_time": browser_startup_time, - "crawl_time": crawl_time, - "total_time": total_time, - "browser_memory": browser_memory, - "pages_memory": pages_memory, - "peak_memory": peak_memory, - "pages_per_second": pages_per_second, - # Calculate efficiency score (higher is better) - # This balances speed vs memory usage - "efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second, - } - - results.append(result) - - # Log the results - logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST") - logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST") - logger.info(f"Total time: {total_time:.2f}s", tag="TEST") - logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST") - - if peak_memory > 0: - logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST") - logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST") - logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST") - logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST") - - except Exception as e: - logger.error(f"Error testing configuration: {str(e)}", tag="TEST") - import traceback - traceback.print_exc() - - # Clean up - for manager in managers: - try: - await manager.close() - except: - pass - - # Print summary of all configurations - logger.info("=" * 100, tag="TEST") - logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST") - logger.info("=" * 100, tag="TEST") - - # Rank configurations by efficiency score - ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True) - - # Also determine rankings by different metrics - fastest = sorted(results, key=lambda x: x["crawl_time"])[0] - lowest_memory = sorted(results, key=lambda 
x: x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0] - most_efficient = ranked_results[0] - - # Print top performers by category - logger.info("๐Ÿ† TOP PERFORMERS BY CATEGORY:", tag="TEST") - logger.info(f"โšก Fastest: {fastest['num_browsers']} browsers ร— ~{fastest['pages_per_browser']} pages " + - f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST") - - if lowest_memory["peak_memory"] > 0: - logger.info(f"๐Ÿ’พ Lowest memory: {lowest_memory['num_browsers']} browsers ร— ~{lowest_memory['pages_per_browser']} pages " + - f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST") - - logger.info(f"๐ŸŒŸ Most efficient: {most_efficient['num_browsers']} browsers ร— ~{most_efficient['pages_per_browser']} pages " + - f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST") - - # Print result table header - logger.info("\n๐Ÿ“Š COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST") - logger.info("-" * 120, tag="TEST") - - # Define table header - header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}" - logger.info(header, tag="TEST") - logger.info("-" * 120, tag="TEST") - - # Print each configuration in ranked order - for rank, result in enumerate(ranked_results, 1): - # Add special notes for top performers - notes = [] - if result == fastest: - notes.append("โšก Fastest") - if result == lowest_memory: - notes.append("๐Ÿ’พ Lowest Memory") - if result == most_efficient: - notes.append("๐ŸŒŸ Most Efficient") - - notes_str = " | ".join(notes) if notes else "" - - # Format memory if available - memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A" - - # Get the distribution string - dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers']))) - - # Build the row - row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | " - row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}" - - logger.info(row, tag="TEST") - - logger.info("-" * 120, tag="TEST") - - # Generate visualization if matplotlib is available - try: - import matplotlib.pyplot as plt - import numpy as np - - # Extract data for plotting from ranked results - browser_counts = [r["num_browsers"] for r in ranked_results] - efficiency_scores = [r["efficiency_score"] for r in ranked_results] - crawl_times = [r["crawl_time"] for r in ranked_results] - total_times = [r["total_time"] for r in ranked_results] - - # Filter results with memory data - memory_results = [r for r in ranked_results if r["peak_memory"] > 0] - memory_browser_counts = [r["num_browsers"] for r in memory_results] - peak_memories = [r["peak_memory"] for r in memory_results] - - # Create figure with clean design - plt.figure(figsize=(14, 12), facecolor='white') - plt.style.use('ggplot') - - # Create grid for subplots - gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3) - - # Plot 1: Efficiency Score (higher is better) - ax1 = plt.subplot(gs[0]) - bar_colors = ['#3498db'] * len(browser_counts) - - # Highlight the most efficient - most_efficient_idx = browser_counts.index(most_efficient["num_browsers"]) - bar_colors[most_efficient_idx] = '#e74c3c' # Red for most efficient - - bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors) - ax1.set_xticks(range(len(browser_counts))) - 
ax1.set_xticklabels([f"{bc}" for bc in browser_counts], rotation=45) - ax1.set_xlabel('Number of Browsers') - ax1.set_ylabel('Efficiency Score (higher is better)') - ax1.set_title('Browser Configuration Efficiency (higher is better)') - - # Add value labels on top of bars - for bar, score in zip(bars, efficiency_scores): - height = bar.get_height() - ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores), - f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8) - - # Highlight best configuration - ax1.text(0.02, 0.90, f"๐ŸŒŸ Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages", - transform=ax1.transAxes, fontsize=12, verticalalignment='top', - bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3)) - - # Plot 2: Time Performance - ax2 = plt.subplot(gs[1]) - - # Plot both total time and crawl time - ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2) - ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6) - - # Mark the fastest configuration - fastest_idx = browser_counts.index(fastest["num_browsers"]) - ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10, - label=f'Fastest: {fastest["num_browsers"]} browsers') - - ax2.set_xlabel('Number of Browsers') - ax2.set_ylabel('Time (seconds)') - ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count') - ax2.grid(True, linestyle='--', alpha=0.7) - ax2.legend(loc='upper right') - - # Plot pages per second on second y-axis - pages_per_second = [total_urls/t for t in crawl_times] - ax2_twin = ax2.twinx() - ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5) - ax2_twin.set_ylabel('Pages per second') - - # Add note about the fastest configuration - ax2.text(0.02, 0.90, f"โšก Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" + - f"\n {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)", - transform=ax2.transAxes, fontsize=12, verticalalignment='top', - bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3)) - - # Plot 3: Memory Usage (if available) - if memory_results: - ax3 = plt.subplot(gs[2]) - - # Prepare data for grouped bar chart - memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)] - memory_per_page = [m/(n*p) for m, n, p in zip( - [r["peak_memory"] for r in memory_results], - [r["num_browsers"] for r in memory_results], - [r["pages_per_browser"] for r in memory_results])] - - x = np.arange(len(memory_browser_counts)) - width = 0.35 - - # Create grouped bars - ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6') - ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db') - - # Configure axis - ax3.set_xticks(x) - ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45) - ax3.set_xlabel('Number of Browsers') - ax3.set_ylabel('Memory (MB)') - ax3.set_title('Memory Usage by Browser Configuration') - ax3.legend(loc='upper left') - ax3.grid(True, linestyle='--', alpha=0.7) - - # Add second y-axis for memory per page - ax3_twin = ax3.twinx() - ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)') - ax3_twin.set_ylabel('Memory per Page (MB)') - - # Get lowest memory configuration - lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"]) - - # Add note about lowest memory 
configuration - ax3.text(0.02, 0.90, f"๐Ÿ’พ Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" + - f"\n {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)", - transform=ax3.transAxes, fontsize=12, verticalalignment='top', - bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3)) - - # Add overall title - plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98) - - # Add timestamp and info at the bottom - plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}", - ha="center", fontsize=10, style='italic') - - # Get current directory and save the figure there - import os - __current_file = os.path.abspath(__file__) - current_dir = os.path.dirname(__current_file) - output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png') - - # Adjust layout and save figure with high DPI - plt.tight_layout(rect=[0, 0.03, 1, 0.97]) - plt.savefig(output_file, dpi=200, bbox_inches='tight') - logger.success(f"Visualization saved to {output_file}", tag="TEST") - - except ImportError: - logger.warning("matplotlib not available, skipping visualization", tag="TEST") - - return most_efficient["num_browsers"], most_efficient["pages_per_browser"] - -async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2): - """Find optimal browser configuration for crawling a specific number of URLs. - - Args: - total_urls: Number of URLs to crawl - verbose: Whether to print progress - rate_limit_delay: Delay between page loads to avoid rate limiting - - Returns: - dict: Contains fastest, lowest_memory, and optimal configurations - """ - if verbose: - print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n") - - # Generate test URLs with timestamp to avoid caching - timestamp = int(time.time()) - urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)] - - # Limit browser configurations to test (1 browser to max 10) - max_browsers = min(10, total_urls) - configs_to_test = [] - - # Generate configurations (browser count, pages distribution) - for num_browsers in range(1, max_browsers + 1): - base_pages = total_urls // num_browsers - remainder = total_urls % num_browsers - - # Create distribution array like [3, 3, 2, 2] (some browsers get one more page) - if remainder > 0: - distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder) - else: - distribution = [base_pages] * num_browsers - - configs_to_test.append((num_browsers, distribution)) - - results = [] - - # Test each configuration - for browser_count, page_distribution in configs_to_test: - if verbose: - print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}") - - try: - # Track memory if possible - try: - import psutil - process = psutil.Process() - start_memory = process.memory_info().rss / (1024 * 1024) # MB - except ImportError: - if verbose: - print("Memory tracking not available (psutil not installed)") - start_memory = 0 - - # Start browsers in parallel - managers = [] - start_tasks = [] - start_time = time.time() - - for i in range(browser_count): - config = BrowserConfig(headless=True) - manager = BrowserManager(browser_config=config, logger=logger) - start_tasks.append(manager.start()) - managers.append(manager) - - await asyncio.gather(*start_tasks) - - # Distribute URLs among browsers - urls_per_manager = {} - url_index = 0 - - for 
i, manager in enumerate(managers): - pages_for_this_browser = page_distribution[i] - end_index = url_index + pages_for_this_browser - urls_per_manager[manager] = urls[url_index:end_index] - url_index = end_index - - # Create pages for each browser - all_pages = [] - for manager, manager_urls in urls_per_manager.items(): - if not manager_urls: - continue - pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls)) - all_pages.extend(zip(pages, manager_urls)) - - # Crawl pages with delay to avoid rate limiting - async def crawl_page(page_ctx, url): - page, _ = page_ctx - try: - await page.goto(url) - if rate_limit_delay > 0: - await asyncio.sleep(rate_limit_delay) - title = await page.title() - return title - finally: - await page.close() - - crawl_start = time.time() - crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages] - await asyncio.gather(*crawl_tasks) - crawl_time = time.time() - crawl_start - total_time = time.time() - start_time - - # Measure final memory usage - if start_memory > 0: - end_memory = process.memory_info().rss / (1024 * 1024) - memory_used = end_memory - start_memory - else: - memory_used = 0 - - # Close all browsers - for manager in managers: - await manager.close() - - # Calculate metrics - pages_per_second = total_urls / crawl_time - - # Calculate efficiency score (higher is better) - # This balances speed vs memory - if memory_used > 0: - efficiency = pages_per_second / (memory_used + 1) - else: - efficiency = pages_per_second - - # Store result - result = { - "browser_count": browser_count, - "distribution": tuple(page_distribution), - "crawl_time": crawl_time, - "total_time": total_time, - "memory_used": memory_used, - "pages_per_second": pages_per_second, - "efficiency": efficiency - } - - results.append(result) - - if verbose: - print(f" โœ“ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)") - if memory_used > 0: - print(f" โœ“ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)") - print(f" โœ“ Efficiency score: {efficiency:.4f}") - - except Exception as e: - if verbose: - print(f" โœ— Error: {str(e)}") - - # Clean up - for manager in managers: - try: - await manager.close() - except: - pass - - # If no successful results, return None - if not results: - return None - - # Find best configurations - fastest = sorted(results, key=lambda x: x["crawl_time"])[0] - - # Only consider memory if available - memory_results = [r for r in results if r["memory_used"] > 0] - if memory_results: - lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0] - else: - lowest_memory = fastest - - # Find most efficient (balanced speed vs memory) - optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0] - - # Print summary - if verbose: - print("\n=== OPTIMAL CONFIGURATIONS ===") - print(f"โšก Fastest: {fastest['browser_count']} browsers {fastest['distribution']}") - print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec") - - print(f"๐Ÿ’พ Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}") - if lowest_memory["memory_used"] > 0: - print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page") - - print(f"๐ŸŒŸ Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}") - print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}") - - return { - "fastest": fastest, 
- "lowest_memory": lowest_memory, - "optimal": optimal, - "all_configs": results - } - -async def run_tests(): - """Run all tests sequentially.""" - results = [] - - # Find optimal configuration using our utility function - configs = await find_optimal_browser_config( - total_urls=20, # Use a small number for faster testing - verbose=True, - rate_limit_delay=0.2 # 200ms delay between page loads to avoid rate limiting - ) - - if configs: - # Show the optimal configuration - optimal = configs["optimal"] - print(f"\n๐ŸŽฏ Recommended configuration for production use:") - print(f" {optimal['browser_count']} browsers with distribution {optimal['distribution']}") - print(f" Estimated performance: {optimal['pages_per_second']:.1f} pages/second") - results.append(True) - else: - print("\nโŒ Failed to find optimal configuration") - results.append(False) - - # Print summary - total = len(results) - passed = sum(results) - print(f"\nTests complete: {passed}/{total} passed") - - if passed == total: - print("All tests passed!") - else: - print(f"{total - passed} tests failed") - -if __name__ == "__main__": - asyncio.run(run_tests()) \ No newline at end of file diff --git a/tests/browser/test_playwright_strategy.py b/tests/browser/test_playwright_strategy.py deleted file mode 100644 index 94003b533..000000000 --- a/tests/browser/test_playwright_strategy.py +++ /dev/null @@ -1,316 +0,0 @@ -"""Test examples for PlaywrightBrowserStrategy. - -These examples demonstrate the functionality of PlaywrightBrowserStrategy -and serve as functional tests. -""" - -import asyncio -import os -import re -import sys - -# Add the project root to Python path if running directly -if __name__ == "__main__": - sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) - -from crawl4ai.browser import BrowserManager -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from crawl4ai.async_logger import AsyncLogger - -# Create a logger for clear terminal output -logger = AsyncLogger(verbose=True, log_file=None) - - - -async def test_start_close(): - # Create browser config for standard Playwright - browser_config = BrowserConfig( - headless=True, - viewport_width=1280, - viewport_height=800 - ) - - # Create browser manager with the config - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - for _ in range(4): - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Get a page - page, context = await manager.get_page(CrawlerRunConfig()) - logger.info("Got page successfully", tag="TEST") - - # Navigate to a website - await page.goto("https://example.com") - logger.info("Navigated to example.com", tag="TEST") - - # Get page title - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - await asyncio.sleep(1) # Wait for a moment before restarting - - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - await manager.close() - except: - pass - return False - return True - -async def test_playwright_basic(): - """Test basic Playwright browser functionality.""" - logger.info("Testing standard Playwright browser", tag="TEST") - - # Create browser config for standard Playwright - browser_config = BrowserConfig( - headless=True, - viewport_width=1280, - viewport_height=800 - ) - - # Create browser manager with the config - manager = 
BrowserManager(browser_config=browser_config, logger=logger) - - try: - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Create crawler config - crawler_config = CrawlerRunConfig(url="https://example.com") - - # Get a page - page, context = await manager.get_page(crawler_config) - logger.info("Got page successfully", tag="TEST") - - # Navigate to a website - await page.goto("https://example.com") - logger.info("Navigated to example.com", tag="TEST") - - # Get page title - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - return True - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - await manager.close() - except: - pass - return False - -async def test_playwright_text_mode(): - """Test Playwright browser in text-only mode.""" - logger.info("Testing Playwright text mode", tag="TEST") - - # Create browser config with text mode enabled - browser_config = BrowserConfig( - headless=True, - text_mode=True # Enable text-only mode - ) - - # Create browser manager with the config - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - # Start the browser - await manager.start() - logger.info("Browser started successfully in text mode", tag="TEST") - - # Get a page - crawler_config = CrawlerRunConfig(url="https://example.com") - page, context = await manager.get_page(crawler_config) - - # Navigate to a website - await page.goto("https://example.com") - logger.info("Navigated to example.com", tag="TEST") - - # Get page title - title = await page.title() - logger.info(f"Page title: {title}", tag="TEST") - - # Check if images are blocked in text mode - # We'll check if any image requests were made - has_images = False - async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info: - try: - # Try to load a page with images - await page.goto("https://picsum.photos/", wait_until="domcontentloaded") - request = await request_info.value - has_images = True - except: - # Timeout without image requests means text mode is working - has_images = False - - logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - return True - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - await manager.close() - except: - pass - return False - -async def test_playwright_context_reuse(): - """Test context caching and reuse with identical configurations.""" - logger.info("Testing context reuse with identical configurations", tag="TEST") - - # Create browser config - browser_config = BrowserConfig(headless=True) - - # Create browser manager - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - # Start the browser - await manager.start() - logger.info("Browser started successfully", tag="TEST") - - # Create identical crawler configs - crawler_config1 = CrawlerRunConfig( - css_selector="body", - ) - - crawler_config2 = CrawlerRunConfig( - css_selector="body", - ) - - # Get pages with these configs - page1, context1 = await manager.get_page(crawler_config1) - page2, context2 = await manager.get_page(crawler_config2) - - # Check if contexts are reused - is_same_context = context1 == context2 - logger.info(f"Contexts reused: 
{is_same_context}", tag="TEST") - - # Now try with a different config - crawler_config3 = CrawlerRunConfig() - - page3, context3 = await manager.get_page(crawler_config3) - - # This should be a different context - is_different_context = context1 != context3 - logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - # Both tests should pass for success - return is_same_context and is_different_context - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - # Ensure cleanup - try: - await manager.close() - except: - pass - return False - -async def test_playwright_session_management(): - """Test session management with Playwright browser.""" - logger.info("Testing session management with Playwright browser", tag="TEST") - - browser_config = BrowserConfig( - headless=True - ) - - manager = BrowserManager(browser_config=browser_config, logger=logger) - - try: - await manager.start() - logger.info("Browser launched successfully", tag="TEST") - - # Create two sessions - session1_id = "playwright_session_1" - session2_id = "playwright_session_2" - - # Set up first session - crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com") - page1, context1 = await manager.get_page(crawler_config1) - await page1.goto("https://example.com") - await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')") - logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST") - - # Set up second session - crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org") - page2, context2 = await manager.get_page(crawler_config2) - await page2.goto("https://example.org") - await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')") - logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST") - - # Get first session again - page1_again, context1_again = await manager.get_page(crawler_config1) - - # Verify it's the same page and data persists - is_same_page = page1 == page1_again - is_same_context = context1 == context1_again - data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')") - logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST") - - # Kill first session - await manager.kill_session(session1_id) - logger.info(f"Killed session 1", tag="TEST") - - # Verify second session still works - data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')") - logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST") - - # Clean up - await manager.close() - logger.info("Browser closed successfully", tag="TEST") - - return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2" - except Exception as e: - logger.error(f"Test failed: {str(e)}", tag="TEST") - try: - await manager.close() - except: - pass - return False - -async def run_tests(): - """Run all tests sequentially.""" - results = [] - - # results.append(await test_start_close()) - # results.append(await test_playwright_basic()) - # results.append(await test_playwright_text_mode()) - # results.append(await test_playwright_context_reuse()) - results.append(await test_playwright_session_management()) - - # Print summary - total = len(results) - passed = sum(results) - logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY") - - if passed 
== total: - logger.success("All tests passed!", tag="SUMMARY") - else: - logger.error(f"{total - passed} tests failed", tag="SUMMARY") - -if __name__ == "__main__": - asyncio.run(run_tests()) diff --git a/tests/check_dependencies.py b/tests/check_dependencies.py deleted file mode 100755 index e47ec372a..000000000 --- a/tests/check_dependencies.py +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env python3 -""" -Dependency checker for Crawl4AI -Analyzes imports in the codebase and shows which files use them -""" - -import ast -import os -import sys -from pathlib import Path -from typing import Set, Dict, List, Tuple -from collections import defaultdict -import re -import toml - -# Standard library modules to ignore -STDLIB_MODULES = { - 'abc', 'argparse', 'asyncio', 'base64', 'collections', 'concurrent', 'contextlib', - 'copy', 'datetime', 'decimal', 'email', 'enum', 'functools', 'glob', 'hashlib', - 'http', 'importlib', 'io', 'itertools', 'json', 'logging', 'math', 'mimetypes', - 'multiprocessing', 'os', 'pathlib', 'pickle', 'platform', 'pprint', 'random', - 're', 'shutil', 'signal', 'socket', 'sqlite3', 'string', 'subprocess', 'sys', - 'tempfile', 'threading', 'time', 'traceback', 'typing', 'unittest', 'urllib', - 'uuid', 'warnings', 'weakref', 'xml', 'zipfile', 'dataclasses', 'secrets', - 'statistics', 'textwrap', 'queue', 'csv', 'gzip', 'tarfile', 'configparser', - 'inspect', 'operator', 'struct', 'binascii', 'codecs', 'locale', 'gc', - 'atexit', 'builtins', 'html', 'errno', 'fcntl', 'pwd', 'grp', 'resource', - 'termios', 'tty', 'pty', 'select', 'selectors', 'ssl', 'zlib', 'bz2', - 'lzma', 'types', 'copy', 'pydoc', 'profile', 'cProfile', 'timeit', - 'trace', 'doctest', 'pdb', 'contextvars', 'dataclasses', 'graphlib', - 'zoneinfo', 'tomllib', 'cgi', 'wsgiref', 'fileinput', 'linecache', - 'tokenize', 'tabnanny', 'compileall', 'dis', 'pickletools', 'formatter', - '__future__', 'array', 'ctypes', 'heapq', 'bisect', 'array', 'weakref', - 'types', 'copy', 'pprint', 'repr', 'numbers', 'cmath', 'fractions', - 'statistics', 'itertools', 'functools', 'operator', 'pathlib', 'fileinput', - 'stat', 'filecmp', 'tempfile', 'glob', 'fnmatch', 'linecache', 'shutil', - 'pickle', 'copyreg', 'shelve', 'marshal', 'dbm', 'sqlite3', 'zlib', 'gzip', - 'bz2', 'lzma', 'zipfile', 'tarfile', 'configparser', 'netrc', 'xdrlib', - 'plistlib', 'hashlib', 'hmac', 'secrets', 'os', 'io', 'time', 'argparse', - 'getopt', 'logging', 'getpass', 'curses', 'platform', 'errno', 'ctypes', - 'threading', 'multiprocessing', 'concurrent', 'subprocess', 'sched', 'queue', - 'contextvars', 'asyncio', 'socket', 'ssl', 'email', 'json', 'mailcap', - 'mailbox', 'mimetypes', 'base64', 'binhex', 'binascii', 'quopri', 'uu', - 'html', 'xml', 'webbrowser', 'cgi', 'cgitb', 'wsgiref', 'urllib', 'http', - 'ftplib', 'poplib', 'imaplib', 'nntplib', 'smtplib', 'smtpd', 'telnetlib', - 'uuid', 'socketserver', 'xmlrpc', 'ipaddress', 'audioop', 'aifc', 'sunau', - 'wave', 'chunk', 'colorsys', 'imghdr', 'sndhdr', 'ossaudiodev', 'gettext', - 'locale', 'turtle', 'cmd', 'shlex', 'tkinter', 'typing', 'pydoc', 'doctest', - 'unittest', 'test', '2to3', 'distutils', 'venv', 'ensurepip', 'zipapp', - 'py_compile', 'compileall', 'dis', 'pickletools', 'pdb', 'timeit', 'trace', - 'tracemalloc', 'warnings', 'faulthandler', 'pdb', 'dataclasses', 'cgi', - 'cgitb', 'chunk', 'crypt', 'imghdr', 'mailcap', 'nis', 'nntplib', 'optparse', - 'ossaudiodev', 'pipes', 'smtpd', 'sndhdr', 'spwd', 'sunau', 'telnetlib', - 'uu', 'xdrlib', 'msilib', 'pstats', 'rlcompleter', 'tkinter', 'ast' 
-} - -# Known package name mappings (import name -> package name) -PACKAGE_MAPPINGS = { - 'bs4': 'beautifulsoup4', - 'PIL': 'pillow', - 'cv2': 'opencv-python', - 'sklearn': 'scikit-learn', - 'yaml': 'PyYAML', - 'OpenSSL': 'pyOpenSSL', - 'sqlalchemy': 'SQLAlchemy', - 'playwright': 'playwright', - 'patchright': 'patchright', - 'dotenv': 'python-dotenv', - 'fake_useragent': 'fake-useragent', - 'playwright_stealth': 'tf-playwright-stealth', - 'sentence_transformers': 'sentence-transformers', - 'rank_bm25': 'rank-bm25', - 'snowballstemmer': 'snowballstemmer', - 'PyPDF2': 'PyPDF2', - 'pdf2image': 'pdf2image', -} - - -class ImportVisitor(ast.NodeVisitor): - """AST visitor to extract imports from Python files""" - - def __init__(self): - self.imports = {} # Changed to dict to store line numbers - self.from_imports = {} - - def visit_Import(self, node): - for alias in node.names: - module_name = alias.name.split('.')[0] - if module_name not in self.imports: - self.imports[module_name] = [] - self.imports[module_name].append(node.lineno) - - def visit_ImportFrom(self, node): - if node.module and node.level == 0: # absolute imports only - module_name = node.module.split('.')[0] - if module_name not in self.from_imports: - self.from_imports[module_name] = [] - self.from_imports[module_name].append(node.lineno) - - -def extract_imports_from_file(filepath: Path) -> Dict[str, List[int]]: - """Extract all imports from a Python file with line numbers""" - all_imports = {} - - try: - with open(filepath, 'r', encoding='utf-8') as f: - content = f.read() - - tree = ast.parse(content) - visitor = ImportVisitor() - visitor.visit(tree) - - # Merge imports and from_imports - for module, lines in visitor.imports.items(): - if module not in all_imports: - all_imports[module] = [] - all_imports[module].extend(lines) - - for module, lines in visitor.from_imports.items(): - if module not in all_imports: - all_imports[module] = [] - all_imports[module].extend(lines) - - except Exception as e: - # Silently skip files that can't be parsed - pass - - return all_imports - - -def get_codebase_imports_with_files(root_dir: Path) -> Dict[str, List[Tuple[str, List[int]]]]: - """Get all imports from the crawl4ai library and docs folders with file locations and line numbers""" - import_to_files = defaultdict(list) - - # Only scan crawl4ai library folder and docs folder - target_dirs = [ - root_dir / 'crawl4ai', - root_dir / 'docs' - ] - - for target_dir in target_dirs: - if not target_dir.exists(): - continue - - for py_file in target_dir.rglob('*.py'): - # Skip __pycache__ directories - if '__pycache__' in py_file.parts: - continue - - # Skip setup.py and similar files - if py_file.name in ['setup.py', 'setup.cfg', 'conf.py']: - continue - - imports = extract_imports_from_file(py_file) - - # Map each import to the file and line numbers - for imp, line_numbers in imports.items(): - relative_path = py_file.relative_to(root_dir) - import_to_files[imp].append((str(relative_path), sorted(line_numbers))) - - return dict(import_to_files) - - -def get_declared_dependencies() -> Set[str]: - """Get declared dependencies from pyproject.toml and requirements.txt""" - declared = set() - - # Read from pyproject.toml - if Path('pyproject.toml').exists(): - with open('pyproject.toml', 'r') as f: - data = toml.load(f) - - # Get main dependencies - deps = data.get('project', {}).get('dependencies', []) - for dep in deps: - # Parse dependency string (e.g., "numpy>=1.26.0,<3") - match = re.match(r'^([a-zA-Z0-9_-]+)', dep) - if match: - pkg_name = 
match.group(1).lower() - declared.add(pkg_name) - - # Get optional dependencies - optional = data.get('project', {}).get('optional-dependencies', {}) - for group, deps in optional.items(): - for dep in deps: - match = re.match(r'^([a-zA-Z0-9_-]+)', dep) - if match: - pkg_name = match.group(1).lower() - declared.add(pkg_name) - - # Also check requirements.txt as backup - if Path('requirements.txt').exists(): - with open('requirements.txt', 'r') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - match = re.match(r'^([a-zA-Z0-9_-]+)', line) - if match: - pkg_name = match.group(1).lower() - declared.add(pkg_name) - - return declared - - -def normalize_package_name(name: str) -> str: - """Normalize package name for comparison""" - # Handle known mappings first - if name in PACKAGE_MAPPINGS: - return PACKAGE_MAPPINGS[name].lower() - - # Basic normalization - return name.lower().replace('_', '-') - - -def check_missing_dependencies(): - """Main function to check for missing dependencies""" - print("๐Ÿ” Analyzing crawl4ai library and docs folders...\n") - - # Get all imports with their file locations - root_dir = Path('.') - import_to_files = get_codebase_imports_with_files(root_dir) - - # Get declared dependencies - declared_deps = get_declared_dependencies() - - # Normalize declared dependencies - normalized_declared = {normalize_package_name(dep) for dep in declared_deps} - - # Categorize imports - external_imports = {} - local_imports = {} - - # Known local packages - local_packages = {'crawl4ai'} - - for imp, file_info in import_to_files.items(): - # Skip standard library - if imp in STDLIB_MODULES: - continue - - # Check if it's a local import - if any(imp.startswith(local) for local in local_packages): - local_imports[imp] = file_info - else: - external_imports[imp] = file_info - - # Check which external imports are not declared - not_declared = {} - declared_imports = {} - - for imp, file_info in external_imports.items(): - normalized_imp = normalize_package_name(imp) - - # Check if import is covered by declared dependencies - found = False - for declared in normalized_declared: - if normalized_imp == declared or normalized_imp.startswith(declared + '.') or declared.startswith(normalized_imp): - found = True - break - - if found: - declared_imports[imp] = file_info - else: - not_declared[imp] = file_info - - # Print results - print(f"๐Ÿ“Š Summary:") - print(f" - Total unique imports: {len(import_to_files)}") - print(f" - External imports: {len(external_imports)}") - print(f" - Declared dependencies: {len(declared_deps)}") - print(f" - External imports NOT in dependencies: {len(not_declared)}\n") - - if not_declared: - print("โŒ External imports NOT declared in pyproject.toml or requirements.txt:\n") - - # Sort by import name - for imp in sorted(not_declared.keys()): - file_info = not_declared[imp] - print(f" ๐Ÿ“ฆ {imp}") - if imp in PACKAGE_MAPPINGS: - print(f" โ†’ Package name: {PACKAGE_MAPPINGS[imp]}") - - # Show up to 3 files that use this import - for i, (file_path, line_numbers) in enumerate(file_info[:3]): - # Format line numbers for clickable output - if len(line_numbers) == 1: - print(f" - {file_path}:{line_numbers[0]}") - else: - # Show first few line numbers - line_str = ','.join(str(ln) for ln in line_numbers[:3]) - if len(line_numbers) > 3: - line_str += f"... ({len(line_numbers)} imports)" - print(f" - {file_path}: lines {line_str}") - - if len(file_info) > 3: - print(f" ... 
and {len(file_info) - 3} more files") - print() - - # Check for potentially unused dependencies - print("\n๐Ÿ”Ž Checking declared dependencies usage...\n") - - # Get all used external packages - used_packages = set() - for imp in external_imports.keys(): - normalized = normalize_package_name(imp) - used_packages.add(normalized) - - # Find unused - unused = [] - for dep in declared_deps: - normalized_dep = normalize_package_name(dep) - - # Check if any import uses this dependency - found_usage = False - for used in used_packages: - if used == normalized_dep or used.startswith(normalized_dep) or normalized_dep.startswith(used): - found_usage = True - break - - if not found_usage: - # Some packages are commonly unused directly - indirect_deps = {'wheel', 'setuptools', 'pip', 'colorama', 'certifi', 'packaging', 'urllib3'} - if normalized_dep not in indirect_deps: - unused.append(dep) - - if unused: - print("โš ๏ธ Declared dependencies with NO imports found:") - for dep in sorted(unused): - print(f" - {dep}") - print("\n Note: These might be used indirectly or by other dependencies") - else: - print("โœ… All declared dependencies have corresponding imports") - - print("\n" + "="*60) - print("๐Ÿ’ก How to use this report:") - print(" 1. Check each โŒ import to see if it's legitimate") - print(" 2. If legitimate, add the package to pyproject.toml") - print(" 3. If it's an internal module or typo, fix the import") - print(" 4. Review unused dependencies - remove if truly not needed") - print("="*60) - - -if __name__ == '__main__': - check_missing_dependencies() \ No newline at end of file diff --git a/tests/deep_crwaling/test_filter.py b/tests/deep_crwaling/test_filter.py deleted file mode 100644 index 29ada087e..000000000 --- a/tests/deep_crwaling/test_filter.py +++ /dev/null @@ -1,75 +0,0 @@ -# // File: tests/deep_crawling/test_filters.py -import pytest -from urllib.parse import urlparse -from crawl4ai import ContentTypeFilter, URLFilter - -# Minimal URLFilter base class stub if not already importable directly for tests -# In a real scenario, this would be imported from the library -if not hasattr(URLFilter, '_update_stats'): # Check if it's a basic stub - class URLFilter: # Basic stub for testing if needed - def __init__(self, name=None): self.name = name - def apply(self, url: str) -> bool: raise NotImplementedError - def _update_stats(self, passed: bool): pass # Mock implementation - -# Assume ContentTypeFilter is structured as discussed. If its definition is not fully -# available for direct import in the test environment, a more elaborate stub or direct -# instantiation of the real class (if possible) would be needed. -# For this example, we assume ContentTypeFilter can be imported and used. 
- -class TestContentTypeFilter: - @pytest.mark.parametrize( - "url, allowed_types, expected", - [ - # Existing tests (examples) - ("http://example.com/page.html", ["text/html"], True), - ("http://example.com/page.json", ["application/json"], True), - ("http://example.com/image.png", ["text/html"], False), - ("http://example.com/document.pdf", ["application/pdf"], True), - ("http://example.com/page", ["text/html"], True), # No extension, allowed - ("http://example.com/page", ["text/html"], False), # No extension, disallowed - ("http://example.com/page.unknown", ["text/html"], False), # Unknown extension - - # Tests for PHP extensions - ("http://example.com/index.php", ["application/x-httpd-php"], True), - ("http://example.com/script.php3", ["application/x-httpd-php"], True), - ("http://example.com/legacy.php4", ["application/x-httpd-php"], True), - ("http://example.com/main.php5", ["application/x-httpd-php"], True), - ("http://example.com/api.php7", ["application/x-httpd-php"], True), - ("http://example.com/index.phtml", ["application/x-httpd-php"], True), - ("http://example.com/source.phps", ["application/x-httpd-php-source"], True), - - # Test rejection of PHP extensions - ("http://example.com/index.php", ["text/html"], False), - ("http://example.com/script.php3", ["text/plain"], False), - ("http://example.com/source.phps", ["application/x-httpd-php"], False), # Mismatch MIME - ("http://example.com/source.php", ["application/x-httpd-php-source"], False), # Mismatch MIME for .php - - # Test case-insensitivity of extensions in URL - ("http://example.com/PAGE.HTML", ["text/html"], True), - ("http://example.com/INDEX.PHP", ["application/x-httpd-php"], True), - ("http://example.com/SOURCE.PHPS", ["application/x-httpd-php-source"], True), - - # Test case-insensitivity of allowed_types - ("http://example.com/index.php", ["APPLICATION/X-HTTPD-PHP"], True), - ], - ) - def test_apply(self, url, allowed_types, expected): - content_filter = ContentTypeFilter( - allowed_types=allowed_types - ) - assert content_filter.apply(url) == expected - - @pytest.mark.parametrize( - "url, expected_extension", - [ - ("http://example.com/file.html", "html"), - ("http://example.com/file.tar.gz", "gz"), - ("http://example.com/path/", ""), - ("http://example.com/nodot", ""), - ("http://example.com/.config", "config"), # hidden file with extension - ("http://example.com/path/to/archive.BIG.zip", "zip"), # Case test - ] - ) - def test_extract_extension(self, url, expected_extension): - # Test the static method directly - assert ContentTypeFilter._extract_extension(url) == expected_extension diff --git a/tests/docker/test_config_object.py b/tests/docker/test_config_object.py deleted file mode 100644 index 94a30f058..000000000 --- a/tests/docker/test_config_object.py +++ /dev/null @@ -1,113 +0,0 @@ -import json -from crawl4ai import ( - CrawlerRunConfig, - DefaultMarkdownGenerator, - RegexChunking, - JsonCssExtractionStrategy, - BM25ContentFilter, - CacheMode -) -from crawl4ai.deep_crawling import BFSDeepCrawlStrategy -from crawl4ai.deep_crawling.filters import FastFilterChain -from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter -from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer - -def create_test_config() -> CrawlerRunConfig: - # Set up content filtering and markdown generation - content_filter = BM25ContentFilter( - user_query="technology articles", - ) - - markdown_generator = DefaultMarkdownGenerator( - content_filter=content_filter, - options={"ignore_links": False, 
"body_width": 0} - ) - - # Set up extraction strategy - extraction_schema = { - "name": "ArticleExtractor", - "baseSelector": "article.content", - "fields": [ - {"name": "title", "selector": "h1", "type": "text"}, - {"name": "content", "selector": ".article-body", "type": "html"} - ] - } - extraction_strategy = JsonCssExtractionStrategy(schema=extraction_schema) - - # Set up deep crawling - filter_chain = FastFilterChain([ - FastContentTypeFilter(["text/html"]), - FastDomainFilter(blocked_domains=["ads.*"]) - ]) - - url_scorer = FastKeywordRelevanceScorer( - keywords=["article", "blog"], - weight=1.0 - ) - - deep_crawl_strategy = BFSDeepCrawlStrategy( - max_depth=3, - filter_chain=filter_chain, - url_scorer=url_scorer - ) - - # Create the config - config = CrawlerRunConfig( - word_count_threshold=200, - extraction_strategy=extraction_strategy, - chunking_strategy=RegexChunking(patterns=[r"\n\n"]), - markdown_generator=markdown_generator, - css_selector="main.content", - excluded_tags=["nav", "footer"], - keep_attrs=["href", "src"], - cache_mode=CacheMode.BYPASS, - wait_until="networkidle", - page_timeout=30000, - scan_full_page=True, - deep_crawl_strategy=deep_crawl_strategy, - verbose=True, - stream=True - ) - - return config - -def test_config_serialization_cycle(): - # Create original config - original_config = create_test_config() - - # Dump to serializable dictionary - serialized = original_config.dump() - - print(json.dumps(serialized, indent=2)) - - # Load back into config object - deserialized_config = CrawlerRunConfig.load(serialized) - - # Verify core attributes - assert deserialized_config.word_count_threshold == original_config.word_count_threshold - assert deserialized_config.css_selector == original_config.css_selector - assert deserialized_config.excluded_tags == original_config.excluded_tags - assert deserialized_config.keep_attrs == original_config.keep_attrs - assert deserialized_config.cache_mode == original_config.cache_mode - assert deserialized_config.wait_until == original_config.wait_until - assert deserialized_config.page_timeout == original_config.page_timeout - assert deserialized_config.scan_full_page == original_config.scan_full_page - assert deserialized_config.verbose == original_config.verbose - assert deserialized_config.stream == original_config.stream - - # Verify complex objects - assert isinstance(deserialized_config.extraction_strategy, JsonCssExtractionStrategy) - assert isinstance(deserialized_config.chunking_strategy, RegexChunking) - assert isinstance(deserialized_config.markdown_generator, DefaultMarkdownGenerator) - assert isinstance(deserialized_config.markdown_generator.content_filter, BM25ContentFilter) - assert isinstance(deserialized_config.deep_crawl_strategy, BFSDeepCrawlStrategy) - - # Verify deep crawl strategy configuration - assert deserialized_config.deep_crawl_strategy.max_depth == 3 - assert isinstance(deserialized_config.deep_crawl_strategy.filter_chain, FastFilterChain) - assert isinstance(deserialized_config.deep_crawl_strategy.url_scorer, FastKeywordRelevanceScorer) - - print("Serialization cycle test passed successfully!") - -if __name__ == "__main__": - test_config_serialization_cycle() \ No newline at end of file diff --git a/tests/docker_example.py b/tests/docker_example.py deleted file mode 100644 index f661ecc1a..000000000 --- a/tests/docker_example.py +++ /dev/null @@ -1,397 +0,0 @@ -import requests -import json -import time -import sys -import base64 -import os -from typing import Dict, Any - -class Crawl4AiTester: - def 
__init__(self, base_url: str = "http://localhost:11235"): - self.base_url = base_url - - - def submit_and_wait( - self, request_data: Dict[str, Any], timeout: int = 300 - ) -> Dict[str, Any]: - # Submit crawl job using async endpoint - response = requests.post( - f"{self.base_url}/crawl/job", json=request_data - ) - response.raise_for_status() - job_response = response.json() - task_id = job_response["task_id"] - print(f"Submitted job with task_id: {task_id}") - - # Poll for result - start_time = time.time() - while True: - if time.time() - start_time > timeout: - raise TimeoutError( - f"Task {task_id} did not complete within {timeout} seconds" - ) - - result = requests.get( - f"{self.base_url}/crawl/job/{task_id}" - ) - result.raise_for_status() - status = result.json() - - if status["status"] == "failed": - print("Task failed:", status.get("error")) - raise Exception(f"Task failed: {status.get('error')}") - - if status["status"] == "completed": - return status - - time.sleep(2) - - def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: - # Use synchronous crawl endpoint - response = requests.post( - f"{self.base_url}/crawl", - json=request_data, - timeout=60, - ) - if response.status_code == 408: - raise TimeoutError("Task did not complete within server timeout") - response.raise_for_status() - return response.json() - - -def test_docker_deployment(version="basic"): - tester = Crawl4AiTester( - base_url="http://localhost:11235", - #base_url="https://crawl4ai-sby74.ondigitalocean.app", - ) - print(f"Testing Crawl4AI Docker {version} version") - - # Health check with timeout and retry - max_retries = 5 - for i in range(max_retries): - try: - health = requests.get(f"{tester.base_url}/health", timeout=10) - print("Health check:", health.json()) - break - except requests.exceptions.RequestException: - if i == max_retries - 1: - print(f"Failed to connect after {max_retries} attempts") - sys.exit(1) - print(f"Waiting for service to start (attempt {i+1}/{max_retries})...") - time.sleep(5) - - # Test cases based on version - test_basic_crawl(tester) - test_basic_crawl_sync(tester) - - if version in ["full", "transformer"]: - test_cosine_extraction(tester) - - test_js_execution(tester) - test_css_selector(tester) - test_structured_extraction(tester) - test_llm_extraction(tester) - test_llm_with_ollama(tester) - test_screenshot(tester) - - -def test_basic_crawl(tester: Crawl4AiTester): - print("\n=== Testing Basic Crawl (Async) ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - } - - result = tester.submit_and_wait(request) - print(f"Basic crawl result count: {len(result['result']['results'])}") - assert result["result"]["success"] - assert len(result["result"]["results"]) > 0 - assert len(result["result"]["results"][0]["markdown"]) > 0 - - -def test_basic_crawl_sync(tester: Crawl4AiTester): - print("\n=== Testing Basic Crawl (Sync) ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - } - - result = tester.submit_sync(request) - print(f"Basic crawl result count: {len(result['results'])}") - assert result["success"] - assert len(result["results"]) > 0 - assert len(result["results"][0]["markdown"]) > 0 - - -def test_js_execution(tester: Crawl4AiTester): - print("\n=== Testing JS Execution ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - "browser_config": {"headless": True}, - "crawler_config": { - "js_code": [ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => 
button.textContent.includes('Load More')); if(loadMoreButton) loadMoreButton.click();" - ], - "wait_for": "wide-tease-item__wrapper df flex-column flex-row-m flex-nowrap-m enable-new-sports-feed-mobile-design(10)" - } - } - - result = tester.submit_and_wait(request) - print(f"JS execution result count: {len(result['result']['results'])}") - assert result["result"]["success"] - - -def test_css_selector(tester: Crawl4AiTester): - print("\n=== Testing CSS Selector ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - "browser_config": {"headless": True}, - "crawler_config": { - "css_selector": ".wide-tease-item__description", - "word_count_threshold": 10 - } - } - - result = tester.submit_and_wait(request) - print(f"CSS selector result count: {len(result['result']['results'])}") - assert result["result"]["success"] - - -def test_structured_extraction(tester: Crawl4AiTester): - print("\n=== Testing Structured Extraction ===") - schema = { - "name": "Cryptocurrency Prices", - "baseSelector": "table[data-testid=\"prices-table\"] tbody tr", - "fields": [ - { - "name": "asset_name", - "selector": "td:nth-child(2) p.cds-headline-h4steop", - "type": "text" - }, - { - "name": "asset_symbol", - "selector": "td:nth-child(2) p.cds-label2-l1sm09ec", - "type": "text" - }, - { - "name": "asset_image_url", - "selector": "td:nth-child(2) img[alt=\"Asset Symbol\"]", - "type": "attribute", - "attribute": "src" - }, - { - "name": "asset_url", - "selector": "td:nth-child(2) a[aria-label^=\"Asset page for\"]", - "type": "attribute", - "attribute": "href" - }, - { - "name": "price", - "selector": "td:nth-child(3) div.cds-typographyResets-t6muwls.cds-body-bwup3gq", - "type": "text" - }, - { - "name": "change", - "selector": "td:nth-child(7) p.cds-body-bwup3gq", - "type": "text" - } - ] -} - - - request = { - "urls": ["https://www.coinbase.com/explore"], - "crawler_config": { - "type": "CrawlerRunConfig", - "params": { - "extraction_strategy": { - "type": "JsonCssExtractionStrategy", - "params": {"schema": schema} - } - } - } - } - - result = tester.submit_and_wait(request) - extracted = json.loads(result["result"]["results"][0]["extracted_content"]) - print(f"Extracted {len(extracted)} items") - if extracted: - print("Sample item:", json.dumps(extracted[0], indent=2)) - assert result["result"]["success"] - assert len(extracted) > 0 - - -def test_llm_extraction(tester: Crawl4AiTester): - print("\n=== Testing LLM Extraction ===") - schema = { - "type": "object", - "properties": { - "asset_name": { - "type": "string", - "description": "Name of the asset.", - }, - "price": { - "type": "string", - "description": "Price of the asset.", - }, - "change": { - "type": "string", - "description": "Change in price of the asset.", - }, - }, - "required": ["asset_name", "price", "change"], - } - - request = { - "urls": ["https://www.coinbase.com/en-in/explore"], - "browser_config": {}, - "crawler_config": { - "type": "CrawlerRunConfig", - "params": { - "extraction_strategy": { - "type": "LLMExtractionStrategy", - "params": { - "llm_config": { - "type": "LLMConfig", - "params": { - "provider": "gemini/gemini-2.5-flash", - "api_token": os.getenv("GEMINI_API_KEY") - } - }, - "schema": schema, - "extraction_type": "schema", - "instruction": "From the crawled content tioned asset names along with their prices and change in price.", - } - }, - "word_count_threshold": 1 - } - } - } - - try: - result = tester.submit_and_wait(request) - extracted = json.loads(result["result"]["results"][0]["extracted_content"]) - 
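        # Response shape exercised by this test: submit_and_wait() returns the job-status
        # payload, which nests the crawl outcome under "result"; its "results" list holds one
        # entry per crawled URL, and "extracted_content" is a JSON string produced by the
        # extraction strategy, hence the json.loads() above.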
print(f"Extracted {len(extracted)} model pricing entries") - if extracted: - print("Sample entry:", json.dumps(extracted[0], indent=2)) - assert result["result"]["success"] - except Exception as e: - print(f"LLM extraction test failed (might be due to missing API key): {str(e)}") - - -def test_llm_with_ollama(tester: Crawl4AiTester): - print("\n=== Testing LLM with Ollama ===") - schema = { - "type": "object", - "properties": { - "article_title": { - "type": "string", - "description": "The main title of the news article", - }, - "summary": { - "type": "string", - "description": "A brief summary of the article content", - }, - "main_topics": { - "type": "array", - "items": {"type": "string"}, - "description": "Main topics or themes discussed in the article", - }, - }, - } - - request = { - "urls": ["https://www.nbcnews.com/business"], - "browser_config": {"verbose": True}, - "crawler_config": { - "type": "CrawlerRunConfig", - "params": { - "extraction_strategy": { - "type": "LLMExtractionStrategy", - "params": { - "llm_config": { - "type": "LLMConfig", - "params": { - "provider": "ollama/llama3.2:latest", - } - }, - "schema": schema, - "extraction_type": "schema", - "instruction": "Extract the main article information including title, summary, and main topics.", - } - }, - "word_count_threshold": 1 - } - } - } - - try: - result = tester.submit_and_wait(request) - extracted = json.loads(result["result"]["results"][0]["extracted_content"]) - print("Extracted content:", json.dumps(extracted, indent=2)) - assert result["result"]["success"] - except Exception as e: - print(f"Ollama extraction test failed: {str(e)}") - - -def test_cosine_extraction(tester: Crawl4AiTester): - print("\n=== Testing Cosine Extraction ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - "browser_config": {}, - "crawler_config": { - "type": "CrawlerRunConfig", - "params": { - "extraction_strategy": { - "type": "CosineStrategy", - "params": { - "semantic_filter": "business finance economy", - "word_count_threshold": 10, - "max_dist": 0.2, - "top_k": 3, - } - } - } - } - } - - try: - result = tester.submit_and_wait(request) - extracted = json.loads(result["result"]["results"][0]["extracted_content"]) - print(f"Extracted {len(extracted)} text clusters") - if extracted: - print("First cluster tags:", extracted[0]["tags"]) - assert result["result"]["success"] - except Exception as e: - print(f"Cosine extraction test failed: {str(e)}") - - -def test_screenshot(tester: Crawl4AiTester): - print("\n=== Testing Screenshot ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - "browser_config": {"headless": True}, - "crawler_config": { - "type": "CrawlerRunConfig", - "params": { - "screenshot": True - } - } - } - - result = tester.submit_and_wait(request) - screenshot_data = result["result"]["results"][0]["screenshot"] - print("Screenshot captured:", bool(screenshot_data)) - - if screenshot_data: - # Save screenshot - screenshot_bytes = base64.b64decode(screenshot_data) - with open("test_screenshot.jpg", "wb") as f: - f.write(screenshot_bytes) - print("Screenshot saved as test_screenshot.jpg") - - assert result["result"]["success"] - - -if __name__ == "__main__": - version = sys.argv[1] if len(sys.argv) > 1 else "basic" - # version = "full" - test_docker_deployment(version) diff --git a/tests/general/test_crawlers.py b/tests/general/test_crawlers.py deleted file mode 100644 index 45fb8fcb3..000000000 --- a/tests/general/test_crawlers.py +++ /dev/null @@ -1,17 +0,0 @@ - -# 
example_usageexample_usageexample_usage# example_usage.py -import asyncio -from crawl4ai.crawlers import get_crawler - -async def main(): - # Get the registered crawler - example_crawler = get_crawler("example_site.content") - - # Crawl example.com - result = await example_crawler(url="https://example.com") - - print(result) - - -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file diff --git a/tests/general/test_llm_filter.py b/tests/general/test_llm_filter.py deleted file mode 100644 index 6211c4295..000000000 --- a/tests/general/test_llm_filter.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai import LLMConfig -from crawl4ai.content_filter_strategy import LLMContentFilter - -async def test_llm_filter(): - # Create an HTML source that needs intelligent filtering - url = "https://docs.python.org/3/tutorial/classes.html" - - browser_config = BrowserConfig( - headless=True, - verbose=True - ) - - # run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED) - - async with AsyncWebCrawler(config=browser_config) as crawler: - # First get the raw HTML - result = await crawler.arun(url, config=run_config) - html = result.cleaned_html - - # Initialize LLM filter with focused instruction - filter = LLMContentFilter( - llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), - instruction=""" - Focus on extracting the core educational content about Python classes. - Include: - - Key concepts and their explanations - - Important code examples - - Essential technical details - Exclude: - - Navigation elements - - Sidebars - - Footer content - - Version information - - Any non-essential UI elements - - Format the output as clean markdown with proper code blocks and headers. - """, - verbose=True - ) - - filter = LLMContentFilter( - llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), - chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 - instruction=""" - Extract the main educational content while preserving its original wording and substance completely. Your task is to: - - 1. Maintain the exact language and terminology used in the main content - 2. Keep all technical explanations, examples, and educational content intact - 3. Preserve the original flow and structure of the core content - 4. Remove only clearly irrelevant elements like: - - Navigation menus - - Advertisement sections - - Cookie notices - - Footers with site information - - Sidebars with external links - - Any UI elements that don't contribute to learning - - The goal is to create a clean markdown version that reads exactly like the original article, - keeping all valuable content but free from distracting elements. Imagine you're creating - a perfect reading experience where nothing valuable is lost, but all noise is removed. 
- """, - verbose=True - ) - - # Apply filtering - filtered_content = filter.filter_content(html, ignore_cache = True) - - # Show results - print("\nFiltered Content Length:", len(filtered_content)) - print("\nFirst 500 chars of filtered content:") - if filtered_content: - print(filtered_content[0][:500]) - - # Save on disc the markdown version - with open("filtered_content.md", "w", encoding="utf-8") as f: - f.write("\n".join(filtered_content)) - - # Show token usage - filter.show_usage() - -if __name__ == "__main__": - asyncio.run(test_llm_filter()) \ No newline at end of file diff --git a/tests/general/test_robot_parser.py b/tests/general/test_robot_parser.py deleted file mode 100644 index a2fc30f1a..000000000 --- a/tests/general/test_robot_parser.py +++ /dev/null @@ -1,159 +0,0 @@ -from crawl4ai.utils import RobotsParser - -import asyncio -import aiohttp -from aiohttp import web -import tempfile -import shutil -import os, sys, time, json - - -async def test_robots_parser(): - print("\n=== Testing RobotsParser ===\n") - - # Setup temporary directory for testing - temp_dir = tempfile.mkdtemp() - try: - # 1. Basic setup test - print("1. Testing basic initialization...") - parser = RobotsParser(cache_dir=temp_dir) - assert os.path.exists(parser.db_path), "Database file not created" - print("โœ“ Basic initialization passed") - - # 2. Test common cases - print("\n2. Testing common cases...") - allowed = await parser.can_fetch("https://www.example.com", "MyBot/1.0") - print(f"โœ“ Regular website fetch: {'allowed' if allowed else 'denied'}") - - # Test caching - print("Testing cache...") - start = time.time() - await parser.can_fetch("https://www.example.com", "MyBot/1.0") - duration = time.time() - start - print(f"โœ“ Cached lookup took: {duration*1000:.2f}ms") - assert duration < 0.03, "Cache lookup too slow" - - # 3. Edge cases - print("\n3. Testing edge cases...") - - # Empty URL - result = await parser.can_fetch("", "MyBot/1.0") - print(f"โœ“ Empty URL handled: {'allowed' if result else 'denied'}") - - # Invalid URL - result = await parser.can_fetch("not_a_url", "MyBot/1.0") - print(f"โœ“ Invalid URL handled: {'allowed' if result else 'denied'}") - - # URL without scheme - result = await parser.can_fetch("example.com/page", "MyBot/1.0") - print(f"โœ“ URL without scheme handled: {'allowed' if result else 'denied'}") - - # 4. Test with local server - async def start_test_server(): - app = web.Application() - - async def robots_txt(request): - return web.Response(text="""User-agent: * -Disallow: /private/ -Allow: /public/ -""") - - async def malformed_robots(request): - return web.Response(text="<<>>") - - async def timeout_robots(request): - await asyncio.sleep(5) - return web.Response(text="Should timeout") - - async def empty_robots(request): - return web.Response(text="") - - async def giant_robots(request): - return web.Response(text="User-agent: *\nDisallow: /\n" * 10000) - - # Mount all handlers at root level - app.router.add_get('/robots.txt', robots_txt) - app.router.add_get('/malformed/robots.txt', malformed_robots) - app.router.add_get('/timeout/robots.txt', timeout_robots) - app.router.add_get('/empty/robots.txt', empty_robots) - app.router.add_get('/giant/robots.txt', giant_robots) - - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, 'localhost', 8080) - await site.start() - return runner - - runner = await start_test_server() - try: - print("\n4. 
Testing robots.txt rules...") - base_url = "http://localhost:8080" - - # Test public access - result = await parser.can_fetch(f"{base_url}/public/page", "bot") - print(f"Public access (/public/page): {'allowed' if result else 'denied'}") - assert result, "Public path should be allowed" - - # Test private access - result = await parser.can_fetch(f"{base_url}/private/secret", "bot") - print(f"Private access (/private/secret): {'allowed' if result else 'denied'}") - assert not result, "Private path should be denied" - - # Test malformed - result = await parser.can_fetch("http://localhost:8080/malformed/page", "bot") - print(f"โœ“ Malformed robots.txt handled: {'allowed' if result else 'denied'}") - - # Test timeout - start = time.time() - result = await parser.can_fetch("http://localhost:8080/timeout/page", "bot") - duration = time.time() - start - print(f"โœ“ Timeout handled (took {duration:.2f}s): {'allowed' if result else 'denied'}") - assert duration < 3, "Timeout not working" - - # Test empty - result = await parser.can_fetch("http://localhost:8080/empty/page", "bot") - print(f"โœ“ Empty robots.txt handled: {'allowed' if result else 'denied'}") - - # Test giant file - start = time.time() - result = await parser.can_fetch("http://localhost:8080/giant/page", "bot") - duration = time.time() - start - print(f"โœ“ Giant robots.txt handled (took {duration:.2f}s): {'allowed' if result else 'denied'}") - - finally: - await runner.cleanup() - - # 5. Cache manipulation - print("\n5. Testing cache manipulation...") - - # Clear expired - parser.clear_expired() - print("โœ“ Clear expired entries completed") - - # Clear all - parser.clear_cache() - print("โœ“ Clear all cache completed") - - # Test with custom TTL - custom_parser = RobotsParser(cache_dir=temp_dir, cache_ttl=1) # 1 second TTL - await custom_parser.can_fetch("https://www.example.com", "bot") - print("โœ“ Custom TTL fetch completed") - await asyncio.sleep(1.1) - start = time.time() - await custom_parser.can_fetch("https://www.example.com", "bot") - print(f"โœ“ TTL expiry working (refetched after {time.time() - start:.2f}s)") - - finally: - # Cleanup - shutil.rmtree(temp_dir) - print("\nTest cleanup completed") - -async def main(): - try: - await test_robots_parser() - except Exception as e: - print(f"Test failed: {str(e)}") - raise - -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file diff --git a/tests/general/test_schema_builder.py b/tests/general/test_schema_builder.py deleted file mode 100644 index 929632247..000000000 --- a/tests/general/test_schema_builder.py +++ /dev/null @@ -1,112 +0,0 @@ -# https://claude.ai/chat/c4bbe93d-fb54-44ce-92af-76b4c8086c6b -# https://claude.ai/chat/c24a768c-d8b2-478a-acc7-d76d42a308da -import os, sys - -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - -import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator -from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy -from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy -import json - -# Test HTML - A complex job board with companies, departments, and positions -test_html = """ -
-    <!-- Job-board sample markup; element tags were lost in this copy. Recoverable content:
-         Google (10,000+ employees, Technology, Careers Page)
-         Engineering: Senior Software Engineer, $150,000 - $250,000, Mountain View, CA, Full-time, 5+ years,
-                      skills: Python, Kubernetes, Machine Learning, "Join our core engineering team...", Posted: 2024-03-15
-         Marketing: Growth Marketing Manager, $120,000 - $180,000, New York, NY, Full-time, 3+ years,
-                    skills: SEO, Analytics, Content Strategy, "Drive our growth initiatives...", Posted: 2024-03-14 -->
-""" - -# Test cases -def test_schema_generation(): - # Test 1: No query (should extract everything) - print("\nTest 1: No Query (Full Schema)") - schema1 = JsonCssExtractionStrategy.generate_schema(test_html) - print(json.dumps(schema1, indent=2)) - - # Test 2: Query for just basic job info - print("\nTest 2: Basic Job Info Query") - query2 = "I only need job titles, salaries, and locations" - schema2 = JsonCssExtractionStrategy.generate_schema(test_html, query2) - print(json.dumps(schema2, indent=2)) - - # Test 3: Query for company and department structure - print("\nTest 3: Organizational Structure Query") - query3 = "Extract company details and department names, without position details" - schema3 = JsonCssExtractionStrategy.generate_schema(test_html, query3) - print(json.dumps(schema3, indent=2)) - - # Test 4: Query for specific skills tracking - print("\nTest 4: Skills Analysis Query") - query4 = "I want to analyze required skills across all positions" - schema4 = JsonCssExtractionStrategy.generate_schema(test_html, query4) - print(json.dumps(schema4, indent=2)) - -if __name__ == "__main__": - test_schema_generation() \ No newline at end of file diff --git a/tests/general/tets_robot.py b/tests/general/tets_robot.py deleted file mode 100644 index 9bb30bb9e..000000000 --- a/tests/general/tets_robot.py +++ /dev/null @@ -1,62 +0,0 @@ -import asyncio -from crawl4ai import * - -async def test_real_websites(): - print("\n=== Testing Real Website Robots.txt Compliance ===\n") - - browser_config = BrowserConfig(headless=True, verbose=True) - async with AsyncWebCrawler(config=browser_config) as crawler: - - # Test cases with URLs - test_cases = [ - # Public sites that should be allowed - ("https://example.com", True), # Simple public site - ("https://httpbin.org/get", True), # API endpoint - - # Sites with known strict robots.txt - ("https://www.facebook.com/robots.txt", False), # Social media - ("https://www.google.com/search", False), # Search pages - - # Edge cases - ("https://api.github.com", True), # API service - ("https://raw.githubusercontent.com", True), # Content delivery - - # Non-existent/error cases - ("https://thisisnotarealwebsite.com", True), # Non-existent domain - ("https://localhost:12345", True), # Invalid port - ] - - for url, expected in test_cases: - print(f"\nTesting: {url}") - try: - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - check_robots_txt=True, # Enable robots.txt checking - verbose=True - ) - - result = await crawler.arun(url=url, config=config) - allowed = result.success and not result.error_message - - print(f"Expected: {'allowed' if expected else 'denied'}") - print(f"Actual: {'allowed' if allowed else 'denied'}") - print(f"Status Code: {result.status_code}") - if result.error_message: - print(f"Error: {result.error_message}") - - # Optional: Print robots.txt content if available - if result.metadata and 'robots_txt' in result.metadata: - print(f"Robots.txt rules:\n{result.metadata['robots_txt']}") - - except Exception as e: - print(f"Test failed with error: {str(e)}") - -async def main(): - try: - await test_real_websites() - except Exception as e: - print(f"Test suite failed: {str(e)}") - raise - -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 000000000..83aa37c74 --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,62 @@ +import os +import shutil +import tempfile +import time + +from crawl4ai.cache_client import CacheClient + +EXAMPLE_URL = 
"https://www.example.com" +EXAMPLE_RAW_HTML = """Example Domain
<html><head><title>Example Domain</title></head><body><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.</p><p><a>Learn more</a></p></body></html>
\n""" + +class TestCacheClient(CacheClient): + """ + A simple local file-based cache client. + File content format: \n---CACHE_DELIMITER---\n + """ + CACHE_DELIMITER = "\n---CACHE_DELIMITER---\n" + + def __init__(self): + self.base_directory = tempfile.mkdtemp(prefix="crawl4ai_test_cache_") + + def _get_file_path(self, key: str) -> str: + safe_key = key.replace(":", "_").replace("/", "_") + return os.path.join(self.base_directory, safe_key) + + def get(self, key: str) -> str | None: + file_path = self._get_file_path(key) + if not os.path.exists(file_path): + return None + + with open(file_path, encoding="utf-8") as f: + content = f.read() + + cached = content.split(self.CACHE_DELIMITER, 1) + expiration_time = float(cached[0]) + + if time.time() > expiration_time: + os.remove(file_path) + return None + + return cached[1] + + def set(self, key: str, value: str, ttl_seconds: int) -> None: + file_path = self._get_file_path(key) + expiration_time = time.time() + ttl_seconds + content = f"{expiration_time}{self.CACHE_DELIMITER}{value}" + with open(file_path, "w+", encoding="utf-8") as f: + f.write(content) + + def clear(self, prefix: str) -> None: + for filename in os.listdir(self.base_directory): + if filename.startswith(prefix.replace(":", "_")): + file_path = os.path.join(self.base_directory, filename) + os.remove(file_path) + + # === UTILITY METHODS FOR TESTING === + + def count(self) -> int: + return len(os.listdir(self.base_directory)) + + def cleanup(self): + shutil.rmtree(self.base_directory) + diff --git a/tests/hub/test_simple.py b/tests/hub/test_simple.py deleted file mode 100644 index a970d683c..000000000 --- a/tests/hub/test_simple.py +++ /dev/null @@ -1,34 +0,0 @@ -# test.py -from crawl4ai import CrawlerHub -import json - -async def amazon_example(): - if (crawler_cls := CrawlerHub.get("amazon_product")) : - crawler = crawler_cls() - print(f"Crawler version: {crawler_cls.meta['version']}") - print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}") - print(await crawler.run("https://amazon.com/test")) - else: - print("Crawler not found!") - -async def google_example(): - # Get crawler dynamically - crawler_cls = CrawlerHub.get("google_search") - crawler = crawler_cls() - - # Text search - text_results = await crawler.run( - query="apple inc", - search_type="text", - schema_cache_path="/Users/unclecode/.crawl4ai" - ) - print(json.dumps(json.loads(text_results), indent=4)) - - # Image search - # image_results = await crawler.run(query="apple inc", search_type="image") - # print(image_results) - -if __name__ == "__main__": - import asyncio - # asyncio.run(amazon_example()) - asyncio.run(google_example()) \ No newline at end of file diff --git a/tests/mcp/test_mcp_socket.py b/tests/mcp/test_mcp_socket.py deleted file mode 100644 index 32456b311..000000000 --- a/tests/mcp/test_mcp_socket.py +++ /dev/null @@ -1,119 +0,0 @@ -# pip install "mcp-sdk[ws]" anyio -import anyio, json -from mcp.client.websocket import websocket_client -from mcp.client.session import ClientSession - -async def test_list(): - async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w): - async with ClientSession(r, w) as s: - await s.initialize() - - print("tools :", [t.name for t in (await s.list_tools()).tools]) - print("resources :", [r.name for r in (await s.list_resources()).resources]) - print("templates :", [t.name for t in (await s.list_resource_templates()).resource_templates]) - - -async def test_crawl(s: ClientSession) -> None: - """Hit the @mcp_tool('crawl') endpoint.""" - res = 
await s.call_tool( - "crawl", - { - "urls": ["https://example.com"], - "browser_config": {}, - "crawler_config": {}, - }, - ) - print("crawl โ†’", json.loads(res.content[0].text)) - - -async def test_md(s: ClientSession) -> None: - """Hit the @mcp_tool('md') endpoint.""" - res = await s.call_tool( - "md", - { - "url": "https://example.com", - "f": "fit", # or RAW, BM25, LLM - "q": None, - "c": "0", - }, - ) - result = json.loads(res.content[0].text) - print("md โ†’", result['markdown'][:100], "...") - -async def test_screenshot(s: ClientSession): - res = await s.call_tool( - "screenshot", - { - "url": "https://example.com", - "screenshot_wait_for": 1.0, - }, - ) - png_b64 = json.loads(res.content[0].text)["screenshot"] - print("screenshot โ†’", png_b64[:60], "โ€ฆ (base64)") - - -async def test_pdf(s: ClientSession): - res = await s.call_tool( - "pdf", - { - "url": "https://example.com", - }, - ) - pdf_b64 = json.loads(res.content[0].text)["pdf"] - print("pdf โ†’", pdf_b64[:60], "โ€ฆ (base64)") - -async def test_execute_js(s: ClientSession): - # click the โ€œMoreโ€ link on Hackerย News front page and wait 1ย s - res = await s.call_tool( - "execute_js", - { - "url": "https://news.ycombinator.com/news", - "js_code": [ - "await page.click('a.morelink')", - "await page.waitForTimeout(1000)", - ], - }, - ) - crawl_result = json.loads(res.content[0].text) - print("execute_js โ†’ status", crawl_result["success"], "| html len:", len(crawl_result["html"])) - -async def test_html(s: ClientSession): - # click the โ€œMoreโ€ link on Hackerย News front page and wait 1ย s - res = await s.call_tool( - "html", - { - "url": "https://news.ycombinator.com/news", - }, - ) - crawl_result = json.loads(res.content[0].text) - print("execute_js โ†’ status", crawl_result["success"], "| html len:", len(crawl_result["html"])) - -async def test_context(s: ClientSession): - # click the โ€œMoreโ€ link on Hackerย News front page and wait 1ย s - res = await s.call_tool( - "ask", - { - "query": "I hv a question about Crawl4ai library, how to extract internal links when crawling a page?" - }, - ) - crawl_result = json.loads(res.content[0].text) - print("execute_js โ†’ status", crawl_result["success"], "| html len:", len(crawl_result["html"])) - - -async def main() -> None: - async with websocket_client("ws://localhost:11235/mcp/ws") as (r, w): - async with ClientSession(r, w) as s: - await s.initialize() # handshake - tools = (await s.list_tools()).tools - print("tools:", [t.name for t in tools]) - - # await test_list() - await test_crawl(s) - await test_md(s) - await test_screenshot(s) - await test_pdf(s) - await test_execute_js(s) - await test_html(s) - await test_context(s) - -anyio.run(main) diff --git a/tests/mcp/test_mcp_sse.py b/tests/mcp/test_mcp_sse.py deleted file mode 100644 index d9eee5574..000000000 --- a/tests/mcp/test_mcp_sse.py +++ /dev/null @@ -1,11 +0,0 @@ -from mcp.client.sse import sse_client -from mcp.client.session import ClientSession - -async def main(): - async with sse_client("http://127.0.0.1:8020/mcp") as (r, w): - async with ClientSession(r, w) as sess: - print(await sess.list_tools()) # now works - -if __name__ == "__main__": - import asyncio - asyncio.run(main()) diff --git a/tests/memory/test_docker_config_gen.py b/tests/memory/test_docker_config_gen.py deleted file mode 100644 index ae6e533c4..000000000 --- a/tests/memory/test_docker_config_gen.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 -""" -Quick sanityโ€‘check for /config/dump endpoint. 
- -Usage: - python test_config_dump.py [http://localhost:8020] - -If the server isnโ€™t running, start it first: - uvicorn deploy.docker.server:app --port 8020 -""" - -import sys, json, textwrap, requests - -# BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020" -BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235" -URL = f"{BASE.rstrip('/')}/config/dump" - -CASES = [ - # --- CrawlRunConfig variants --- - "CrawlerRunConfig()", - "CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)", - "CrawlerRunConfig(js_only=True, wait_until='networkidle')", - - # --- BrowserConfig variants --- - "BrowserConfig()", - "BrowserConfig(headless=False, extra_args=['--disable-gpu'])", - "BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')", -] - -for code in CASES: - print("\n=== POST:", code) - resp = requests.post(URL, json={"code": code}, timeout=15) - if resp.ok: - print(json.dumps(resp.json(), indent=2)[:400] + "...") - else: - print("ERROR", resp.status_code, resp.text[:200]) diff --git a/tests/memory/test_stress_api_xs.py b/tests/memory/test_stress_api_xs.py deleted file mode 100644 index 272488833..000000000 --- a/tests/memory/test_stress_api_xs.py +++ /dev/null @@ -1,203 +0,0 @@ -"""Lite Crawl4AI API stressโ€‘tester. - -โœ” batch or stream mode (single unified path) -โœ” global stats + JSON summary -โœ” rich table progress -โœ” Typer CLI with presets (quick / soak) - -Usage examples: - python api_stress_test.py # uses quick preset - python api_stress_test.py soak # 5โ€ฏK URLs stress run - python api_stress_test.py --urls 200 --concurrent 10 --chunk 20 -""" - -from __future__ import annotations - -import asyncio, json, time, uuid, pathlib, statistics -from typing import List, Dict, Optional - -import httpx, typer -from rich.console import Console -from rich.table import Table - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ defaults / presets โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -PRESETS = { - "quick": dict(urls=1, concurrent=1, chunk=1, stream=False), - "debug": dict(urls=10, concurrent=2, chunk=5, stream=False), - "soak": dict(urls=5000, concurrent=20, chunk=50, stream=True), -} - -API_HEALTH_ENDPOINT = "/health" -REQUEST_TIMEOUT = 180.0 - -console = Console() -app = typer.Typer(add_completion=False, rich_markup_mode="rich") - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -async def _check_health(client: httpx.AsyncClient) -> None: - resp = await client.get(API_HEALTH_ENDPOINT, timeout=10) - resp.raise_for_status() - console.print(f"[green]Server healthy โ€” version {resp.json().get('version','?')}[/]") - -async def _iter_results(resp: httpx.Response, stream: bool): - """Yield result dicts from batch JSON or NDโ€‘JSON stream.""" - if stream: - async for line in resp.aiter_lines(): - if not line: - continue - rec = json.loads(line) - if rec.get("status") == "completed": - break - yield rec - else: - data = resp.json() - for rec in data.get("results", []): - yield rec, data # rec + whole payload for memory delta/peak - -async def _consume_stream(resp: httpx.Response) -> Dict: - stats = {"success_urls": 0, "failed_urls": 0, "mem_metric": 0.0} - async for line in resp.aiter_lines(): - if not line: - continue - rec = json.loads(line) - if rec.get("status") == "completed": - break - if rec.get("success"): - stats["success_urls"] 
+= 1 - else: - stats["failed_urls"] += 1 - mem = rec.get("server_memory_mb") - if mem is not None: - stats["mem_metric"] = max(stats["mem_metric"], float(mem)) - return stats - -def _consume_batch(body: Dict) -> Dict: - stats = {"success_urls": 0, "failed_urls": 0} - for rec in body.get("results", []): - if rec.get("success"): - stats["success_urls"] += 1 - else: - stats["failed_urls"] += 1 - stats["mem_metric"] = body.get("server_memory_delta_mb") - stats["peak"] = body.get("server_peak_memory_mb") - return stats - -async def _fetch_chunk( - client: httpx.AsyncClient, - urls: List[str], - stream: bool, - semaphore: asyncio.Semaphore, -) -> Dict: - endpoint = "/crawl/stream" if stream else "/crawl" - payload = { - "urls": urls, - "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - "crawler_config": {"type": "CrawlerRunConfig", - "params": {"cache_mode": "BYPASS", "stream": stream}}, - } - - async with semaphore: - start = time.perf_counter() - - if stream: - # ---- streaming request ---- - async with client.stream("POST", endpoint, json=payload) as resp: - resp.raise_for_status() - stats = await _consume_stream(resp) - else: - # ---- batch request ---- - resp = await client.post(endpoint, json=payload) - resp.raise_for_status() - stats = _consume_batch(resp.json()) - - stats["elapsed"] = time.perf_counter() - start - return stats - - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ core runner โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -async def _run(api: str, urls: int, concurrent: int, chunk: int, stream: bool, report: pathlib.Path): - client = httpx.AsyncClient(base_url=api, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=concurrent+5)) - await _check_health(client) - - url_list = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(urls)] - chunks = [url_list[i:i+chunk] for i in range(0, len(url_list), chunk)] - sem = asyncio.Semaphore(concurrent) - - table = Table(show_header=True, header_style="bold magenta") - table.add_column("Batch", style="dim", width=6) - table.add_column("Success/Fail", width=12) - table.add_column("Mem", width=14) - table.add_column("Time (s)") - - agg_success = agg_fail = 0 - deltas, peaks = [], [] - - start = time.perf_counter() - tasks = [asyncio.create_task(_fetch_chunk(client, c, stream, sem)) for c in chunks] - for idx, coro in enumerate(asyncio.as_completed(tasks), 1): - res = await coro - agg_success += res["success_urls"] - agg_fail += res["failed_urls"] - if res["mem_metric"] is not None: - deltas.append(res["mem_metric"]) - if res["peak"] is not None: - peaks.append(res["peak"]) - - mem_txt = f"{res['mem_metric']:.1f}" if res["mem_metric"] is not None else "โ€‘" - if res["peak"] is not None: - mem_txt = f"{res['peak']:.1f}/{mem_txt}" - - table.add_row(str(idx), f"{res['success_urls']}/{res['failed_urls']}", mem_txt, f"{res['elapsed']:.2f}") - - console.print(table) - total_time = time.perf_counter() - start - - summary = { - "urls": urls, - "concurrent": concurrent, - "chunk": chunk, - "stream": stream, - "success_urls": agg_success, - "failed_urls": agg_fail, - "elapsed_sec": round(total_time, 2), - "avg_mem": round(statistics.mean(deltas), 2) if deltas else None, - "max_mem": max(deltas) if deltas else None, - "avg_peak": round(statistics.mean(peaks), 2) if peaks else None, - "max_peak": max(peaks) if peaks else None, - } - console.print("\n[bold green]Done:[/]" , summary) - - report.mkdir(parents=True, 
exist_ok=True) - path = report / f"api_test_{int(time.time())}.json" - path.write_text(json.dumps(summary, indent=2)) - console.print(f"[green]Summary โ†’ {path}") - - await client.aclose() - -# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Typer CLI โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ -@app.command() -def main( - preset: str = typer.Argument("quick", help="quick / debug / soak or custom"), - api_url: str = typer.Option("http://localhost:8020", show_default=True), - urls: int = typer.Option(None, help="Total URLs to crawl"), - concurrent: int = typer.Option(None, help="Concurrent API requests"), - chunk: int = typer.Option(None, help="URLs per request"), - stream: bool = typer.Option(None, help="Use /crawl/stream"), - report: pathlib.Path = typer.Option("reports_api", help="Where to save JSON summary"), -): - """Run a stress test against a running Crawl4AI API server.""" - if preset not in PRESETS and any(v is None for v in (urls, concurrent, chunk, stream)): - console.print(f"[red]Unknown preset '{preset}' and custom params missing[/]") - raise typer.Exit(1) - - cfg = PRESETS.get(preset, {}) - urls = urls or cfg.get("urls") - concurrent = concurrent or cfg.get("concurrent") - chunk = chunk or cfg.get("chunk") - stream = stream if stream is not None else cfg.get("stream", False) - - console.print(f"[cyan]API:[/] {api_url}ย | URLs: {urls}ย | Concurrency: {concurrent} | Chunk: {chunk} | Stream: {stream}") - asyncio.run(_run(api_url, urls, concurrent, chunk, stream, report)) - -if __name__ == "__main__": - app() diff --git a/tests/profiler/test_create_profile.py b/tests/profiler/test_create_profile.py deleted file mode 100644 index e441ea4a2..000000000 --- a/tests/profiler/test_create_profile.py +++ /dev/null @@ -1,32 +0,0 @@ -from crawl4ai import BrowserProfiler -import asyncio - - -if __name__ == "__main__": - # Example usage - profiler = BrowserProfiler() - - # Create a new profile - import os - from pathlib import Path - home_dir = Path.home() - profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile"))) - - print(f"Profile created at: {profile_path}") - - - - # # Launch a standalone browser - # asyncio.run(profiler.launch_standalone_browser()) - - # # List profiles - # profiles = profiler.list_profiles() - # for profile in profiles: - # print(f"Profile: {profile['name']}, Path: {profile['path']}") - - # # Delete a profile - # success = profiler.delete_profile("my-profile") - # if success: - # print("Profile deleted successfully") - # else: - # print("Failed to delete profile") \ No newline at end of file diff --git a/tests/profiler/test_keyboard_handle.py b/tests/profiler/test_keyboard_handle.py deleted file mode 100644 index 8845c1058..000000000 --- a/tests/profiler/test_keyboard_handle.py +++ /dev/null @@ -1,55 +0,0 @@ -import sys -import pytest -import asyncio -from unittest.mock import patch, MagicMock -from crawl4ai.browser_profiler import BrowserProfiler - -@pytest.mark.asyncio -@pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific msvcrt test") -async def test_keyboard_input_handling(): - # Mock sequence of keystrokes: arrow key followed by 'q' - mock_keys = [b'\x00K', b'q'] - mock_kbhit = MagicMock(side_effect=[True, True, False]) - mock_getch = MagicMock(side_effect=mock_keys) - - with patch('msvcrt.kbhit', mock_kbhit), patch('msvcrt.getch', mock_getch): - # profiler = BrowserProfiler() - user_done_event = asyncio.Event() - 
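        # The mocks above replay a two-byte arrow-key sequence (b'\x00K') followed by a plain
        # 'q' keystroke. The listener below should skip the multi-character, non-printable
        # input and set user_done_event only once 'q' is read.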
- # Create a local async function to simulate the keyboard input handling - async def test_listen_for_quit_command(): - if sys.platform == "win32": - while True: - try: - if mock_kbhit(): - raw = mock_getch() - try: - key = raw.decode("utf-8") - except UnicodeDecodeError: - continue - - if len(key) != 1 or not key.isprintable(): - continue - - if key.lower() == "q": - user_done_event.set() - return - - await asyncio.sleep(0.1) - except Exception as e: - continue - - # Run the listener - listener_task = asyncio.create_task(test_listen_for_quit_command()) - - # Wait for the event to be set - try: - await asyncio.wait_for(user_done_event.wait(), timeout=1.0) - assert user_done_event.is_set() - finally: - if not listener_task.done(): - listener_task.cancel() - try: - await listener_task - except asyncio.CancelledError: - pass \ No newline at end of file diff --git a/tests/releases/test_release_0.6.4.py b/tests/releases/test_release_0.6.4.py deleted file mode 100644 index 06bd8f9e9..000000000 --- a/tests/releases/test_release_0.6.4.py +++ /dev/null @@ -1,151 +0,0 @@ -import pytest -import asyncio -import time -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, CacheMode - - -@pytest.mark.asyncio -async def test_wait_for_timeout_separate_from_page_timeout(): - """Test that wait_for has its own timeout separate from page_timeout""" - browser_config = BrowserConfig(headless=True) - - # Test with short wait_for_timeout but longer page_timeout - config = CrawlerRunConfig( - wait_for="css:.nonexistent-element", - wait_for_timeout=2000, # 2 seconds - page_timeout=10000, # 10 seconds - cache_mode=CacheMode.BYPASS - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - start_time = time.time() - result = await crawler.arun("https://example.com", config=config) - elapsed = time.time() - start_time - - # Should timeout after ~2 seconds (wait_for_timeout), not 10 seconds - assert elapsed < 5, f"Expected timeout around 2s, but took {elapsed:.2f}s" - assert result.success, "Crawl should still succeed even if wait_for times out" - - -@pytest.mark.asyncio -async def test_wait_for_timeout_with_existing_element(): - """Test that wait_for_timeout works correctly when element exists""" - browser_config = BrowserConfig(headless=True) - - config = CrawlerRunConfig( - wait_for="css:body", # This should exist quickly - wait_for_timeout=5000, - cache_mode=CacheMode.BYPASS - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - start_time = time.time() - result = await crawler.arun("https://example.com", config=config) - elapsed = time.time() - start_time - - # Should complete quickly since body element exists - assert elapsed < 3, f"Expected quick completion, but took {elapsed:.2f}s" - assert result.success - assert " - - - Test GA Integration - - - - - -
-    <!-- GA test page markup; tags and script elements were lost in this copy. Recoverable body
-         text: "Test Page" heading and "Testing Google Analytics integration" paragraph. -->
- - - """ - - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(f"raw://{html_content}", config=config) - - assert result.success - # Check that GA scripts are preserved in the HTML - assert "googletagmanager.com/gtag/js" in result.html - assert "dataLayer" in result.html - assert "gtag('config'" in result.html - - -@pytest.mark.asyncio -async def test_mkdocs_no_duplicate_gtag(): - """Test that there are no duplicate gtag.js entries in documentation""" - browser_config = BrowserConfig(headless=True) - config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - - # Simulate MkDocs-like HTML structure - html_content = """ - - - - Crawl4AI Documentation - - - - -
-    <!-- MkDocs-style page markup; tags and script elements were lost in this copy. Recoverable
-         body text: "Crawl4AI Documentation" heading and "Welcome to the documentation" paragraph. -->
- - - """ - - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(f"raw://{html_content}", config=config) - - assert result.success - # Count occurrences of gtag.js to ensure no duplicates - gtag_count = result.html.count("googletagmanager.com/gtag/js") - assert gtag_count <= 1, f"Found {gtag_count} gtag.js scripts, expected at most 1" - - # Ensure the analytics functionality is still there - if gtag_count == 1: - assert "dataLayer" in result.html - assert "gtag('config'" in result.html - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/releases/test_release_0.7.0.py b/tests/releases/test_release_0.7.0.py deleted file mode 100644 index a0885a705..000000000 --- a/tests/releases/test_release_0.7.0.py +++ /dev/null @@ -1,317 +0,0 @@ -#!/usr/bin/env python3 - -import asyncio -import pytest -import os -import json -import tempfile -from pathlib import Path -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy, LLMConfig -from crawl4ai.content_filter_strategy import BM25ContentFilter -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator -from crawl4ai.async_url_seeder import AsyncUrlSeeder -from crawl4ai.utils import RobotsParser - - -class TestCrawl4AIv070: - """Test suite for Crawl4AI v0.7.0 changes""" - - @pytest.mark.asyncio - async def test_raw_url_parsing(self): - """Test raw:// URL parsing logic fix""" - html_content = "
<html><body><h1>Test Content</h1><p>This is a test paragraph.</p></body></html>
" - - async with AsyncWebCrawler() as crawler: - # Test raw:// prefix - result1 = await crawler.arun(f"raw://{html_content}") - assert result1.success - assert "Test Content" in result1.markdown - - # Test raw: prefix - result2 = await crawler.arun(f"raw:{html_content}") - assert result2.success - assert "Test Content" in result2.markdown - - @pytest.mark.asyncio - async def test_max_pages_limit_batch_processing(self): - """Test max_pages limit is respected during batch processing""" - urls = [ - "https://httpbin.org/html", - "https://httpbin.org/json", - "https://httpbin.org/xml" - ] - - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - max_pages=2 - ) - - async with AsyncWebCrawler() as crawler: - results = await crawler.arun_many(urls, config=config) - # Should only process 2 pages due to max_pages limit - successful_results = [r for r in results if r.success] - assert len(successful_results) <= 2 - - @pytest.mark.asyncio - async def test_navigation_abort_handling(self): - """Test handling of navigation aborts during file downloads""" - async with AsyncWebCrawler() as crawler: - # Test with a URL that might cause navigation issues - result = await crawler.arun( - "https://httpbin.org/status/404", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - ) - # Should not crash even with navigation issues - assert result is not None - - @pytest.mark.asyncio - async def test_screenshot_capture_fix(self): - """Test screenshot capture improvements""" - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - screenshot=True - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - assert result.success - assert result.screenshot is not None - assert len(result.screenshot) > 0 - - @pytest.mark.asyncio - async def test_redirect_status_codes(self): - """Test that real redirect status codes are surfaced""" - async with AsyncWebCrawler() as crawler: - # Test with a redirect URL - result = await crawler.arun( - "https://httpbin.org/redirect/1", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - ) - assert result.success - # Should have redirect information - assert result.status_code in [200, 301, 302, 303, 307, 308] - - @pytest.mark.asyncio - async def test_local_file_processing(self): - """Test local file processing with captured_console initialization""" - with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: - f.write("
<html><body><h1>Local File Test</h1></body></html>
") - temp_file = f.name - - try: - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(f"file://{temp_file}") - assert result.success - assert "Local File Test" in result.markdown - finally: - os.unlink(temp_file) - - @pytest.mark.asyncio - async def test_robots_txt_wildcard_support(self): - """Test robots.txt wildcard rules support""" - parser = RobotsParser() - - # Test wildcard patterns - robots_content = "User-agent: *\nDisallow: /admin/*\nDisallow: *.pdf" - - # This should work without throwing exceptions - assert parser is not None - - @pytest.mark.asyncio - async def test_exclude_external_images(self): - """Test exclude_external_images flag""" - html_with_images = ''' - - Local - External - - ''' - - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - exclude_external_images=True - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(f"raw://{html_with_images}", config=config) - assert result.success - # External images should be excluded - assert "external.com" not in result.cleaned_html - - @pytest.mark.asyncio - async def test_llm_extraction_strategy_fix(self): - """Test LLM extraction strategy choices error fix""" - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("OpenAI API key not available") - - llm_config = LLMConfig( - provider="openai/gpt-4o-mini", - api_token=os.getenv("OPENAI_API_KEY") - ) - - strategy = LLMExtractionStrategy( - llm_config=llm_config, - instruction="Extract the main heading", - extraction_type="block" - ) - - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - extraction_strategy=strategy - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - assert result.success - # Should not throw 'str' object has no attribute 'choices' error - assert result.extracted_content is not None - - @pytest.mark.asyncio - async def test_wait_for_timeout(self): - """Test separate timeout for wait_for condition""" - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - wait_for="css:non-existent-element", - wait_for_timeout=1000 # 1 second timeout - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - # Should timeout gracefully and still return result - assert result is not None - - @pytest.mark.asyncio - async def test_bm25_content_filter_language_parameter(self): - """Test BM25 filter with language parameter for stemming""" - content_filter = BM25ContentFilter( - user_query="test content", - language="english", - use_stemming=True - ) - - markdown_generator = DefaultMarkdownGenerator( - content_filter=content_filter - ) - - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - markdown_generator=markdown_generator - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - assert result.success - assert result.markdown is not None - - @pytest.mark.asyncio - async def test_url_normalization(self): - """Test URL normalization for invalid schemes and trailing slashes""" - async with AsyncWebCrawler() as crawler: - # Test with trailing slash - result = await crawler.arun( - "https://httpbin.org/html/", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - ) - assert result.success - - @pytest.mark.asyncio - async def test_max_scroll_steps(self): - """Test max_scroll_steps parameter for full page scanning""" - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - scan_full_page=True, - 
max_scroll_steps=3 - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - assert result.success - - @pytest.mark.asyncio - async def test_async_url_seeder(self): - """Test AsyncUrlSeeder functionality""" - seeder = AsyncUrlSeeder( - base_url="https://httpbin.org", - max_depth=1, - max_urls=5 - ) - - async with AsyncWebCrawler() as crawler: - urls = await seeder.seed(crawler) - assert isinstance(urls, list) - assert len(urls) <= 5 - - @pytest.mark.asyncio - async def test_pdf_processing_timeout(self): - """Test PDF processing with timeout""" - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - pdf=True, - pdf_timeout=10000 # 10 seconds - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - assert result.success - # PDF might be None for HTML pages, but should not hang - assert result.pdf is not None or result.pdf is None - - @pytest.mark.asyncio - async def test_browser_session_management(self): - """Test improved browser session management""" - browser_config = BrowserConfig( - headless=True, - use_persistent_context=True - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun( - "https://httpbin.org/html", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - ) - assert result.success - - @pytest.mark.asyncio - async def test_memory_management(self): - """Test memory management features""" - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - memory_threshold_percent=80.0, - check_interval=1.0, - memory_wait_timeout=600 # 10 minutes default - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - assert result.success - - @pytest.mark.asyncio - async def test_virtual_scroll_support(self): - """Test virtual scroll support for modern web scraping""" - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - scan_full_page=True, - virtual_scroll=True - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - assert result.success - - @pytest.mark.asyncio - async def test_adaptive_crawling(self): - """Test adaptive crawling feature""" - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - adaptive_crawling=True - ) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun("https://httpbin.org/html", config=config) - assert result.success - - -if __name__ == "__main__": - # Run the tests - pytest.main([__file__, "-v"]) diff --git a/tests/test_arun_many.py b/tests/test_arun_many.py deleted file mode 100644 index 2a315a2a3..000000000 --- a/tests/test_arun_many.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -Test example for multiple crawler configs feature -""" -import asyncio -import sys -from pathlib import Path - -# Add parent directory to path for imports -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.processors.pdf import PDFContentScrapingStrategy - - -async def test_run_many(): - default_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - # scraping_strategy=PDFContentScrapingStrategy() - ) - - test_urls = [ - # "https://blog.python.org/", # Blog URL - "https://www.python.org/", # Generic HTTPS page - "https://www.kidocode.com/", # Generic HTTPS page - "https://www.example.com/", # Generic HTTPS page - # 
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", - ] - - async with AsyncWebCrawler() as crawler: - # Single config - traditional usage still works - print("Test 1: Single config (backwards compatible)") - result = await crawler.arun_many( - urls=test_urls[:2], - config=default_config - ) - print(f"Crawled {len(result)} URLs with single config\n") - for item in result: - print(f" {item.url} -> {item.status_code}") - - -if __name__ == "__main__": - asyncio.run(test_run_many()) diff --git a/tests/general/test_async_webcrawler.py b/tests/test_async_web_crawler.py similarity index 64% rename from tests/general/test_async_webcrawler.py rename to tests/test_async_web_crawler.py index 4d7aa815f..895944b04 100644 --- a/tests/general/test_async_webcrawler.py +++ b/tests/test_async_web_crawler.py @@ -1,14 +1,41 @@ -import asyncio import pytest -from typing import List + from crawl4ai import ( AsyncWebCrawler, - BrowserConfig, + BrowserConfig, CrawlerRunConfig, MemoryAdaptiveDispatcher, RateLimiter, - CacheMode ) +from tests.helpers import EXAMPLE_URL + + +@pytest.mark.asyncio +async def test_arun(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=EXAMPLE_URL + ) + assert result.status_code == 200 + assert result.url == EXAMPLE_URL + assert result.markdown is not None + +@pytest.mark.asyncio +async def test_arun_many(): + test_urls = [ + "https://www.python.org/", + EXAMPLE_URL, + ] + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=test_urls[:2], + ) + assert len(results) == len(test_urls) + for item in results: + assert item.status_code == 200 + assert item.markdown is not None + assert item.url in test_urls @pytest.mark.asyncio @pytest.mark.parametrize("viewport", [ @@ -28,9 +55,8 @@ async def test_viewport_config(viewport): async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://example.com", + url=EXAMPLE_URL, config=CrawlerRunConfig( - # cache_mode=CacheMode.BYPASS, page_timeout=30000 # 30 seconds ) ) @@ -52,7 +78,7 @@ async def test_memory_management(): max_session_permit=5 ) - urls = ["https://example.com"] * 3 # Test with multiple identical URLs + urls = [EXAMPLE_URL] * 3 # Test with multiple identical URLs async with AsyncWebCrawler(config=browser_config) as crawler: results = await crawler.arun_many( @@ -80,7 +106,7 @@ async def test_rate_limiting(): ) urls = [ - "https://example.com", + EXAMPLE_URL, "https://example.org", "https://example.net" ] @@ -109,13 +135,14 @@ async def test_javascript_execution(): async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://example.com", + url=EXAMPLE_URL, config=CrawlerRunConfig( js_code=js_code, page_timeout=30000 ) ) - assert result.success + + assert result.success @pytest.mark.asyncio @pytest.mark.parametrize("error_url", [ @@ -135,15 +162,35 @@ async def test_error_handling(error_url): result = await crawler.arun( url=error_url, config=CrawlerRunConfig( - page_timeout=10000, # Short timeout for error cases - cache_mode=CacheMode.BYPASS + page_timeout=10000 ) ) - assert not result.success - assert result.error_message is not None - -if __name__ == "__main__": - asyncio.run(test_viewport_config((1024, 768))) - asyncio.run(test_memory_management()) - asyncio.run(test_rate_limiting()) - asyncio.run(test_javascript_execution()) \ No newline at end of file + + assert not result.success + assert result.error_message is not None + + +@pytest.mark.asyncio +async 
def test_extract_media(): + async with AsyncWebCrawler() as crawler: + url = "https://www.nbcnews.com/business" + result = await crawler.arun(url=url) + + assert result.success + assert result.media + assert result.media["images"] + assert any(img["src"] for img in result.media["images"]) + assert any(img["alt"] for img in result.media["images"]) + assert any(img["score"] for img in result.media["images"]) + +@pytest.mark.asyncio +async def test_extract_metadata(): + async with AsyncWebCrawler() as crawler: + url = "https://www.nbcnews.com/business" + result = await crawler.arun(url=url) + + assert result.success + assert result.metadata + assert all( + key in result.metadata for key in ["title", "description", "keywords"] + ) diff --git a/tests/test_caching.py b/tests/test_caching.py new file mode 100644 index 000000000..7d10f0f63 --- /dev/null +++ b/tests/test_caching.py @@ -0,0 +1,146 @@ +import pytest + +from unittest.mock import Mock, patch + +import pytest_asyncio + +from crawl4ai.async_configs import CrawlerRunConfig +from crawl4ai.async_webcrawler import AsyncWebCrawler +from crawl4ai.cache_context import CacheMode +from crawl4ai.models import AsyncCrawlResponse +from tests.helpers import EXAMPLE_RAW_HTML, EXAMPLE_URL, TestCacheClient + + +@pytest_asyncio.fixture +async def mock_async_crawl_response(monkeypatch): + mock_crawl_response = AsyncCrawlResponse( + html=EXAMPLE_RAW_HTML, + response_headers={ + 'accept-ranges': 'bytes', + 'alt-svc': 'h3=":443"; ma=93600', + 'cache-control': 'max-age=86000', + 'content-length': '513', + 'content-type': 'text/html', + 'date': 'Wed, 19 Nov 2025 20:09:52 GMT', + 'etag': '"bc2473a18e003bdb249eba5ce893033f:1760028122.592274"', + 'last-modified': 'Thu, 09 Oct 2025 16:42:02 GMT' + }, + js_execution_result=None, + status_code=200, + screenshot=None, + pdf_data=None, + mhtml_data=None, + downloaded_files=None, + ssl_certificate=None, + redirected_url=EXAMPLE_URL, + network_requests=None, + console_messages=None + ) + async def mock_crawl(self, url, **kwargs): + return mock_crawl_response + monkeypatch.setattr("crawl4ai.async_crawler_strategy.AsyncPlaywrightCrawlerStrategy.crawl", mock_crawl) + + +@pytest.mark.asyncio +async def test_caching(): + cache_client = TestCacheClient() + + async with AsyncWebCrawler(cache_client=cache_client) as crawler: + # First crawl (should not use cache) + result1 = await crawler.arun(url=EXAMPLE_URL, config=CrawlerRunConfig( + cache_mode=CacheMode.ENABLED + )) + cache_size = cache_client.count() + + assert result1.success + assert cache_size == 1 + + # Second crawl (should use cache) + result2 = await crawler.arun(url=EXAMPLE_URL, config=CrawlerRunConfig( + cache_mode=CacheMode.ENABLED + )) + final_cache_size = cache_client.count() + + assert result2.success + assert result2.html == result1.html + assert final_cache_size == 1 + + cache_client.cleanup() + + +@pytest.mark.asyncio +async def test_cache_excluded_tags(mock_async_crawl_response): + cache_client = TestCacheClient() + + async with AsyncWebCrawler(cache_client=cache_client) as crawler: + result1 = await crawler.arun(url=EXAMPLE_URL, config=CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + excluded_tags=["p"] + )) + cache_size = cache_client.count() + + assert result1.success + assert result1.markdown == "# Example Domain\n" + assert cache_size == 1 + + result2 = await crawler.arun(url=EXAMPLE_URL, config=CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + excluded_tags=["h1"] + )) + cache_size2 = cache_client.count() + + assert result2.success + assert 
result2.markdown == "This domain is for use in documentation examples without needing permission. Avoid use in operations.\n[Learn more](https://iana.org/domains/example)\n" + assert cache_size2 == cache_size + + result3 = await crawler.arun(url=EXAMPLE_URL, config=CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + )) + cache_size3 = cache_client.count() + + assert result3.success + assert result3.markdown == "# Example Domain\nThis domain is for use in documentation examples without needing permission. Avoid use in operations.\n[Learn more](https://iana.org/domains/example)\n" + assert cache_size3 == cache_size + + cache_client.cleanup() + + +@pytest.mark.asyncio +async def test_bypass_cache(): + cache_client = TestCacheClient() + get_spy = Mock(wraps=cache_client.get) + + with patch.object(TestCacheClient, 'get', get_spy): + async with AsyncWebCrawler(cache_client=cache_client) as crawler: + result1 = await crawler.arun(url=EXAMPLE_URL, config=CrawlerRunConfig( + cache_mode=CacheMode.ENABLED + )) + + assert result1.success + assert get_spy.call_count == 1 + + result2 = await crawler.arun(url=EXAMPLE_URL, config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS + )) + + assert result2.success + assert get_spy.call_count == 1 + + cache_client.cleanup() + + +@pytest.mark.asyncio +async def test_clear_cache(): + cache_client = TestCacheClient() + + async with AsyncWebCrawler(cache_client=cache_client) as crawler: + await crawler.arun(url=EXAMPLE_URL, config=CrawlerRunConfig( + cache_mode=CacheMode.ENABLED + )) + assert cache_client.count() == 1 + + cache_client.clear(prefix="") + assert cache_client.count() == 0 + + cache_client.cleanup() + diff --git a/tests/test_cli_docs.py b/tests/test_cli_docs.py deleted file mode 100644 index 6941f20db..000000000 --- a/tests/test_cli_docs.py +++ /dev/null @@ -1,44 +0,0 @@ -import asyncio -from crawl4ai.docs_manager import DocsManager -from click.testing import CliRunner -from crawl4ai.cli import cli - - -def test_cli(): - """Test all CLI commands""" - runner = CliRunner() - - print("\n1. Testing docs update...") - # Use sync version for testing - docs_manager = DocsManager() - loop = asyncio.get_event_loop() - loop.run_until_complete(docs_manager.fetch_docs()) - - # print("\n2. Testing listing...") - # result = runner.invoke(cli, ['docs', 'list']) - # print(f"Status: {'โœ…' if result.exit_code == 0 else 'โŒ'}") - # print(result.output) - - # print("\n2. Testing index building...") - # result = runner.invoke(cli, ['docs', 'index']) - # print(f"Status: {'โœ…' if result.exit_code == 0 else 'โŒ'}") - # print(f"Output: {result.output}") - - # print("\n3. Testing search...") - # result = runner.invoke(cli, ['docs', 'search', 'how to use crawler', '--build-index']) - # print(f"Status: {'โœ…' if result.exit_code == 0 else 'โŒ'}") - # print(f"First 200 chars: {result.output[:200]}...") - - # print("\n4. Testing combine with sections...") - # result = runner.invoke(cli, ['docs', 'combine', 'chunking_strategies', 'extraction_strategies', '--mode', 'extended']) - # print(f"Status: {'โœ…' if result.exit_code == 0 else 'โŒ'}") - # print(f"First 200 chars: {result.output[:200]}...") - - print("\n5. 
Testing combine all sections...") - result = runner.invoke(cli, ["docs", "combine", "--mode", "condensed"]) - print(f"Status: {'โœ…' if result.exit_code == 0 else 'โŒ'}") - print(f"First 200 chars: {result.output[:200]}...") - - -if __name__ == "__main__": - test_cli() diff --git a/tests/test_config_matching_only.py b/tests/test_config_matching_only.py deleted file mode 100644 index 0f21666e5..000000000 --- a/tests/test_config_matching_only.py +++ /dev/null @@ -1,131 +0,0 @@ -""" -Test only the config matching logic without running crawler -""" -import sys -from pathlib import Path - -# Add parent directory to path for imports -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from crawl4ai.async_configs import CrawlerRunConfig, MatchMode - -def test_all_matching_scenarios(): - print("Testing CrawlerRunConfig.is_match() method") - print("=" * 50) - - # Test 1: Single string pattern - print("\n1. Single string pattern (glob style)") - config = CrawlerRunConfig( - url_matcher="*.pdf", - # For example we can set this => scraping_strategy=PDFContentScrapingStrategy() - ) - test_urls = [ - ("https://example.com/file.pdf", True), - ("https://example.com/doc.PDF", False), # Case sensitive - ("https://example.com/file.txt", False), - ("file.pdf", True), - ] - for url, expected in test_urls: - result = config.is_match(url) - status = "โœ“" if result == expected else "โœ—" - print(f" {status} {url} -> {result}") - - # Test 2: List of patterns with OR - print("\n2. List of patterns with OR (default)") - config = CrawlerRunConfig( - url_matcher=["*/article/*", "*/blog/*", "*.html"], - match_mode=MatchMode.OR - ) - test_urls = [ - ("https://example.com/article/news", True), - ("https://example.com/blog/post", True), - ("https://example.com/page.html", True), - ("https://example.com/page.php", False), - ] - for url, expected in test_urls: - result = config.is_match(url) - status = "โœ“" if result == expected else "โœ—" - print(f" {status} {url} -> {result}") - - # Test 3: Custom function - print("\n3. Custom function matcher") - config = CrawlerRunConfig( - url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml')) - ) - test_urls = [ - ("https://api.example.com/data.json", True), - ("https://api.example.com/data.xml", True), - ("https://api.example.com/data.html", False), - ("https://example.com/data.json", False), # No 'api' - ] - for url, expected in test_urls: - result = config.is_match(url) - status = "โœ“" if result == expected else "โœ—" - print(f" {status} {url} -> {result}") - - # Test 4: Mixed list with AND - print("\n4. Mixed patterns and functions with AND") - config = CrawlerRunConfig( - url_matcher=[ - "https://*", # Must be HTTPS - lambda url: '.com' in url, # Must have .com - lambda url: len(url) < 50 # Must be short - ], - match_mode=MatchMode.AND - ) - test_urls = [ - ("https://example.com/page", True), - ("http://example.com/page", False), # Not HTTPS - ("https://example.org/page", False), # No .com - ("https://example.com/" + "x" * 50, False), # Too long - ] - for url, expected in test_urls: - result = config.is_match(url) - status = "โœ“" if result == expected else "โœ—" - print(f" {status} {url} -> {result}") - - # Test 5: Complex real-world scenario - print("\n5. 
Complex pattern combinations") - config = CrawlerRunConfig( - url_matcher=[ - "*/api/v[0-9]/*", # API versioned endpoints - lambda url: 'graphql' in url, # GraphQL endpoints - "*.json" # JSON files - ], - match_mode=MatchMode.OR - ) - test_urls = [ - ("https://example.com/api/v1/users", True), - ("https://example.com/api/v2/posts", True), - ("https://example.com/graphql", True), - ("https://example.com/data.json", True), - ("https://example.com/api/users", False), # No version - ] - for url, expected in test_urls: - result = config.is_match(url) - status = "โœ“" if result == expected else "โœ—" - print(f" {status} {url} -> {result}") - - # Test 6: Edge cases - print("\n6. Edge cases") - - # No matcher - config = CrawlerRunConfig() - result = config.is_match("https://example.com") - print(f" {'โœ“' if not result else 'โœ—'} No matcher -> {result}") - - # Empty list - config = CrawlerRunConfig(url_matcher=[]) - result = config.is_match("https://example.com") - print(f" {'โœ“' if not result else 'โœ—'} Empty list -> {result}") - - # None in list (should be skipped) - config = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"]) - result = config.is_match("test.pdf") - print(f" {'โœ“' if result else 'โœ—'} List with None -> {result}") - - print("\n" + "=" * 50) - print("All matching tests completed!") - -if __name__ == "__main__": - test_all_matching_scenarios() \ No newline at end of file diff --git a/tests/test_config_selection.py b/tests/test_config_selection.py deleted file mode 100644 index 97245f9f1..000000000 --- a/tests/test_config_selection.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Test config selection logic in dispatchers -""" -import asyncio -import sys -from pathlib import Path -from unittest.mock import AsyncMock, MagicMock - -# Add parent directory to path for imports -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from crawl4ai.async_configs import CrawlerRunConfig, MatchMode -from crawl4ai.async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher - -class TestDispatcher(BaseDispatcher): - """Simple test dispatcher to verify config selection""" - - async def crawl_url(self, url, config, task_id, **kwargs): - # Just return which config was selected - selected = self.select_config(url, config) - return {"url": url, "config_id": id(selected)} - - async def run_urls(self, urls, crawler, config): - results = [] - for url in urls: - result = await self.crawl_url(url, config, "test") - results.append(result) - return results - -async def test_dispatcher_config_selection(): - print("Testing dispatcher config selection") - print("=" * 50) - - # Create test configs with different matchers - pdf_config = CrawlerRunConfig(url_matcher="*.pdf") - api_config = CrawlerRunConfig(url_matcher=lambda url: 'api' in url) - default_config = CrawlerRunConfig() # No matcher - - configs = [pdf_config, api_config, default_config] - - # Create test dispatcher - dispatcher = TestDispatcher() - - # Test single config - print("\nTest 1: Single config") - result = await dispatcher.crawl_url("https://example.com/file.pdf", pdf_config, "test1") - assert result["config_id"] == id(pdf_config) - print("โœ“ Single config works") - - # Test config list selection - print("\nTest 2: Config list selection") - test_cases = [ - ("https://example.com/file.pdf", id(pdf_config)), - ("https://api.example.com/data", id(api_config)), - ("https://example.com/page", id(configs[0])), # No match, uses first - ] - - for url, expected_id in test_cases: - result = await dispatcher.crawl_url(url, configs, "test") - 
assert result["config_id"] == expected_id, f"URL {url} got wrong config" - print(f"โœ“ {url} -> correct config selected") - - # Test with MemoryAdaptiveDispatcher - print("\nTest 3: MemoryAdaptiveDispatcher config selection") - mem_dispatcher = MemoryAdaptiveDispatcher() - - # Test select_config method directly - selected = mem_dispatcher.select_config("https://example.com/doc.pdf", configs) - assert selected == pdf_config - print("โœ“ MemoryAdaptiveDispatcher.select_config works") - - # Test empty config list - print("\nTest 4: Edge cases") - selected = mem_dispatcher.select_config("https://example.com", []) - assert isinstance(selected, CrawlerRunConfig) # Should return default - print("โœ“ Empty config list returns default config") - - # Test None config - selected = mem_dispatcher.select_config("https://example.com", None) - assert isinstance(selected, CrawlerRunConfig) # Should return default - print("โœ“ None config returns default config") - - print("\n" + "=" * 50) - print("All dispatcher tests passed! โœ“") - -if __name__ == "__main__": - asyncio.run(test_dispatcher_config_selection()) \ No newline at end of file diff --git a/tests/test_crawler_run_config.py b/tests/test_crawler_run_config.py new file mode 100644 index 000000000..bc550c036 --- /dev/null +++ b/tests/test_crawler_run_config.py @@ -0,0 +1,165 @@ +import pytest + +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LinkPreviewConfig +from crawl4ai.async_configs import MatchMode +from tests.helpers import EXAMPLE_URL + + +def test_is_match_single_pattern(): + config = CrawlerRunConfig( + url_matcher="*.pdf" + ) + test_urls = [ + (f"{EXAMPLE_URL}/file.pdf", True), + (f"{EXAMPLE_URL}/doc.PDF", False), # Case sensitive + (f"{EXAMPLE_URL}/file.txt", False), + ("file.pdf", True), + ] + + for url, expected in test_urls: + assert config.is_match(url) == expected + +def test_is_match_with_or_mode(): + config = CrawlerRunConfig( + url_matcher=["*/article/*", "*/blog/*", "*.html"], + match_mode=MatchMode.OR + ) + test_urls = [ + (f"{EXAMPLE_URL}/article/news", True), + (f"{EXAMPLE_URL}/blog/post", True), + (f"{EXAMPLE_URL}/page.html", True), + (f"{EXAMPLE_URL}/page.php", False), + ] + for url, expected in test_urls: + assert config.is_match(url) == expected + +def test_is_match_custom_function(): + config = CrawlerRunConfig( + url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml')) + ) + test_urls = [ + (f"{EXAMPLE_URL}/api/data.json", True), + (f"{EXAMPLE_URL}/api/data.xml", True), + (f"{EXAMPLE_URL}/data.html", False), + (f"{EXAMPLE_URL}/data.json", False), # No 'api' + ] + for url, expected in test_urls: + assert config.is_match(url) == expected + +def test_is_match_mixed_with_and_mode(): + config = CrawlerRunConfig( + url_matcher=[ + "https://*", # Must be HTTPS + lambda url: '.com' in url, # Must have .com + lambda url: len(url) < 50 # Must be short + ], + match_mode=MatchMode.AND + ) + test_urls = [ + (f"{EXAMPLE_URL}/page", True), + ("http://example.com/page", False), # Not HTTPS + ("example.org/page", False), # No .com + (f"{EXAMPLE_URL}/" + "x" * 50, False), # Too long + ] + for url, expected in test_urls: + assert config.is_match(url) == expected + +def test_is_match_mixed_with_or_mode(): + config = CrawlerRunConfig( + url_matcher=[ + "*/api/v[0-9]/*", # API versioned endpoints + lambda url: 'graphql' in url, # GraphQL endpoints + "*.json" # JSON files + ], + match_mode=MatchMode.OR + ) + test_urls = [ + (f"{EXAMPLE_URL}/api/v1/users", True), + (f"{EXAMPLE_URL}/api/v2/posts", True), 
+ (f"{EXAMPLE_URL}/graphql", True), + (f"{EXAMPLE_URL}/data.json", True), + (f"{EXAMPLE_URL}/api/users", False), # No version + ] + for url, expected in test_urls: + assert config.is_match(url) == expected + +def test_is_match_no_matcher(): + config = CrawlerRunConfig() + assert config.is_match(EXAMPLE_URL) + +def test_is_match_empty_list(): + config = CrawlerRunConfig(url_matcher=[]) + assert not config.is_match(EXAMPLE_URL) + +@pytest.mark.asyncio +async def test_link_preview(): + config = CrawlerRunConfig( + link_preview_config=LinkPreviewConfig(), + score_links=True, + exclude_external_links=True, + ) + test_url = "https://docs.python.org/3/" + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(test_url, config=config) + + assert result.success + assert result.links + assert result.links["external"] == [] + + internal_links = result.links["internal"] + + assert any(il["total_score"] for il in internal_links) + assert all(il["href"] for il in internal_links) + assert all("title" in il for il in internal_links) + assert all("text" in il for il in internal_links) + assert all("head_data" in il for il in internal_links) + assert all("head_extraction_status" in il for il in internal_links) + assert all("head_extraction_error" in il for il in internal_links) + assert all("total_score" in il for il in internal_links) + +@pytest.mark.asyncio +async def test_css_selector(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=EXAMPLE_URL, config=CrawlerRunConfig( + css_selector="h1, title" + ) + ) + + assert result.success + assert ( + "Example Domain" in result.markdown + ) + +@pytest.mark.asyncio +async def test_js_code(): + # Clicking "Load More" via js_code should produce more content than a plain crawl. + url = "https://www.nbcnews.com/business" + async with AsyncWebCrawler() as crawler: + result_without_more = await crawler.arun(url=url) + result_with_more = await crawler.arun(url=url, config=CrawlerRunConfig( + js_code=[ + "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" + ] + )) + + assert result_with_more.success + assert len(result_with_more.markdown) > len(result_without_more.markdown) + + +@pytest.mark.asyncio +async def test_user_agent(): + url = "https://www.nbcnews.com/business" + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Crawl4AI/1.0" + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=url, config=CrawlerRunConfig( + user_agent=user_agent + ) + ) + + assert result.success + assert crawler.browser_config.user_agent == user_agent diff --git a/tests/test_docker.py b/tests/test_docker.py deleted file mode 100644 index c507ae565..000000000 --- a/tests/test_docker.py +++ /dev/null @@ -1,299 +0,0 @@ -import requests -import json -import time -import sys -import base64 -import os -from typing import Dict, Any - - -class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:11235"): - self.base_url = base_url - - def submit_and_wait( - self, request_data: Dict[str, Any], timeout: int = 300 - ) -> Dict[str, Any]: - # Submit crawl job - response = requests.post(f"{self.base_url}/crawl", json=request_data) - task_id = response.json()["task_id"] - print(f"Task ID: {task_id}") - - # Poll for result - start_time = time.time() - while True: - if time.time() - start_time > timeout: - raise TimeoutError( - f"Task {task_id} did not complete within {timeout} seconds" - ) - - result = requests.get(f"{self.base_url}/task/{task_id}") - status = result.json() - - if status["status"] == "failed": - print("Task failed:", status.get("error")) - raise Exception(f"Task failed: {status.get('error')}") - - if status["status"] == "completed": - return status - - time.sleep(2) - - -def test_docker_deployment(version="basic"): - tester = Crawl4AiTester() - print(f"Testing Crawl4AI Docker 
{version} version") - - # Health check with timeout and retry - max_retries = 5 - for i in range(max_retries): - try: - health = requests.get(f"{tester.base_url}/health", timeout=10) - print("Health check:", health.json()) - break - except requests.exceptions.RequestException: - if i == max_retries - 1: - print(f"Failed to connect after {max_retries} attempts") - sys.exit(1) - print(f"Waiting for service to start (attempt {i+1}/{max_retries})...") - time.sleep(5) - - # Test cases based on version - test_basic_crawl(tester) - - # if version in ["full", "transformer"]: - # test_cosine_extraction(tester) - - # test_js_execution(tester) - # test_css_selector(tester) - # test_structured_extraction(tester) - # test_llm_extraction(tester) - # test_llm_with_ollama(tester) - # test_screenshot(tester) - - -def test_basic_crawl(tester: Crawl4AiTester): - print("\n=== Testing Basic Crawl ===") - request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10} - - result = tester.submit_and_wait(request) - print(f"Basic crawl result length: {len(result['result']['markdown'])}") - assert result["result"]["success"] - assert len(result["result"]["markdown"]) > 0 - - -def test_js_execution(tester: Crawl4AiTester): - print("\n=== Testing JS Execution ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 8, - "js_code": [ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ], - "wait_for": "article.tease-card:nth-child(10)", - "crawler_params": {"headless": True}, - } - - result = tester.submit_and_wait(request) - print(f"JS execution result length: {len(result['result']['markdown'])}") - assert result["result"]["success"] - - -def test_css_selector(tester: Crawl4AiTester): - print("\n=== Testing CSS Selector ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 7, - "css_selector": ".wide-tease-item__description", - "crawler_params": {"headless": True}, - "extra": {"word_count_threshold": 10}, - } - - result = tester.submit_and_wait(request) - print(f"CSS selector result length: {len(result['result']['markdown'])}") - assert result["result"]["success"] - - -def test_structured_extraction(tester: Crawl4AiTester): - print("\n=== Testing Structured Extraction ===") - schema = { - "name": "Coinbase Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "symbol", - "selector": "td:nth-child(1) p", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", - }, - ], - } - - request = { - "urls": ["https://www.coinbase.com/explore"], - "priority": 9, - "extraction_config": {"type": "json_css", "params": {"schema": schema}}, - } - - result = tester.submit_and_wait(request) - extracted = json.loads(result["result"]["extracted_content"]) - print(f"Extracted {len(extracted)} items") - print("Sample item:", json.dumps(extracted[0], indent=2)) - assert result["result"]["success"] - assert len(extracted) > 0 - - -def test_llm_extraction(tester: Crawl4AiTester): - print("\n=== Testing LLM Extraction ===") - schema = { - "type": "object", - "properties": { - "model_name": { - "type": "string", - "description": "Name of the OpenAI model.", - }, - "input_fee": { - "type": "string", - "description": "Fee for input token for the OpenAI model.", - }, - "output_fee": { - "type": "string", - 
"description": "Fee for output token for the OpenAI model.", - }, - }, - "required": ["model_name", "input_fee", "output_fee"], - } - - request = { - "urls": ["https://openai.com/api/pricing"], - "priority": 8, - "extraction_config": { - "type": "llm", - "params": { - "provider": "openai/gpt-4o-mini", - "api_token": os.getenv("OPENAI_API_KEY"), - "schema": schema, - "extraction_type": "schema", - "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""", - }, - }, - "crawler_params": {"word_count_threshold": 1}, - } - - try: - result = tester.submit_and_wait(request) - extracted = json.loads(result["result"]["extracted_content"]) - print(f"Extracted {len(extracted)} model pricing entries") - print("Sample entry:", json.dumps(extracted[0], indent=2)) - assert result["result"]["success"] - except Exception as e: - print(f"LLM extraction test failed (might be due to missing API key): {str(e)}") - - -def test_llm_with_ollama(tester: Crawl4AiTester): - print("\n=== Testing LLM with Ollama ===") - schema = { - "type": "object", - "properties": { - "article_title": { - "type": "string", - "description": "The main title of the news article", - }, - "summary": { - "type": "string", - "description": "A brief summary of the article content", - }, - "main_topics": { - "type": "array", - "items": {"type": "string"}, - "description": "Main topics or themes discussed in the article", - }, - }, - } - - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 8, - "extraction_config": { - "type": "llm", - "params": { - "provider": "ollama/llama2", - "schema": schema, - "extraction_type": "schema", - "instruction": "Extract the main article information including title, summary, and main topics.", - }, - }, - "extra": {"word_count_threshold": 1}, - "crawler_params": {"verbose": True}, - } - - try: - result = tester.submit_and_wait(request) - extracted = json.loads(result["result"]["extracted_content"]) - print("Extracted content:", json.dumps(extracted, indent=2)) - assert result["result"]["success"] - except Exception as e: - print(f"Ollama extraction test failed: {str(e)}") - - -def test_cosine_extraction(tester: Crawl4AiTester): - print("\n=== Testing Cosine Extraction ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 8, - "extraction_config": { - "type": "cosine", - "params": { - "semantic_filter": "business finance economy", - "word_count_threshold": 10, - "max_dist": 0.2, - "top_k": 3, - }, - }, - } - - try: - result = tester.submit_and_wait(request) - extracted = json.loads(result["result"]["extracted_content"]) - print(f"Extracted {len(extracted)} text clusters") - print("First cluster tags:", extracted[0]["tags"]) - assert result["result"]["success"] - except Exception as e: - print(f"Cosine extraction test failed: {str(e)}") - - -def test_screenshot(tester: Crawl4AiTester): - print("\n=== Testing Screenshot ===") - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 5, - "screenshot": True, - "crawler_params": {"headless": True}, - } - - result = tester.submit_and_wait(request) - print("Screenshot captured:", bool(result["result"]["screenshot"])) - - if result["result"]["screenshot"]: - # Save screenshot - screenshot_data = base64.b64decode(result["result"]["screenshot"]) - with open("test_screenshot.jpg", "wb") as f: - f.write(screenshot_data) - print("Screenshot saved as test_screenshot.jpg") - - assert result["result"]["success"] - - -if __name__ == 
"__main__": - version = sys.argv[1] if len(sys.argv) > 1 else "basic" - # version = "full" - test_docker_deployment(version) diff --git a/tests/test_docker_api_with_llm_provider.py b/tests/test_docker_api_with_llm_provider.py deleted file mode 100644 index f17368ae3..000000000 --- a/tests/test_docker_api_with_llm_provider.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python3 -"""Test script to verify Docker API with LLM provider configuration.""" - -import requests -import json -import time - -BASE_URL = "http://localhost:11235" - -def test_health(): - """Test health endpoint.""" - print("1. Testing health endpoint...") - response = requests.get(f"{BASE_URL}/health") - print(f" Status: {response.status_code}") - print(f" Response: {response.json()}") - print() - -def test_schema(): - """Test schema endpoint to see configuration.""" - print("2. Testing schema endpoint...") - response = requests.get(f"{BASE_URL}/schema") - print(f" Status: {response.status_code}") - # Print only browser config to keep output concise - print(f" Browser config keys: {list(response.json().get('browser', {}).keys())[:5]}...") - print() - -def test_markdown_with_llm_filter(): - """Test markdown endpoint with LLM filter (should use configured provider).""" - print("3. Testing markdown endpoint with LLM filter...") - print(" This should use the Groq provider from LLM_PROVIDER env var") - - # Note: This will fail with dummy API keys, but we can see if it tries to use Groq - payload = { - "url": "https://httpbin.org/html", - "f": "llm", - "q": "Extract the main content" - } - - response = requests.post(f"{BASE_URL}/md", json=payload) - print(f" Status: {response.status_code}") - - if response.status_code != 200: - print(f" Error: {response.text[:200]}...") - else: - print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars") - print() - -def test_markdown_with_provider_override(): - """Test markdown endpoint with provider override in request.""" - print("4. Testing markdown endpoint with provider override...") - print(" This should use OpenAI provider from request parameter") - - payload = { - "url": "https://httpbin.org/html", - "f": "llm", - "q": "Extract the main content", - "provider": "openai/gpt-4" # Override to use OpenAI - } - - response = requests.post(f"{BASE_URL}/md", json=payload) - print(f" Status: {response.status_code}") - - if response.status_code != 200: - print(f" Error: {response.text[:200]}...") - else: - print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars") - print() - -def test_simple_crawl(): - """Test simple crawl without LLM.""" - print("5. Testing simple crawl (no LLM required)...") - - payload = { - "urls": ["https://httpbin.org/html"], - "browser_config": { - "type": "BrowserConfig", - "params": {"headless": True} - }, - "crawler_config": { - "type": "CrawlerRunConfig", - "params": {"cache_mode": "bypass"} - } - } - - response = requests.post(f"{BASE_URL}/crawl", json=payload) - print(f" Status: {response.status_code}") - - if response.status_code == 200: - result = response.json() - print(f" Success: {result.get('success')}") - print(f" Results count: {len(result.get('results', []))}") - if result.get('results'): - print(f" First result success: {result['results'][0].get('success')}") - else: - print(f" Error: {response.text[:200]}...") - print() - -def test_playground(): - """Test if playground is accessible.""" - print("6. 
Testing playground interface...") - response = requests.get(f"{BASE_URL}/playground") - print(f" Status: {response.status_code}") - print(f" Content-Type: {response.headers.get('content-type')}") - print() - -if __name__ == "__main__": - print("=== Crawl4AI Docker API Tests ===\n") - print(f"Testing API at {BASE_URL}\n") - - # Wait a bit for server to be fully ready - time.sleep(2) - - test_health() - test_schema() - test_simple_crawl() - test_playground() - - print("\nTesting LLM functionality (these may fail with dummy API keys):\n") - test_markdown_with_llm_filter() - test_markdown_with_provider_override() - - print("\nTests completed!") \ No newline at end of file diff --git a/tests/test_link_extractor.py b/tests/test_link_extractor.py deleted file mode 100644 index 1482ce015..000000000 --- a/tests/test_link_extractor.py +++ /dev/null @@ -1,262 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script for Link Extractor functionality -""" - -from crawl4ai.models import Link -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai import LinkPreviewConfig -import asyncio -import sys -import os - -# Add the crawl4ai directory to the path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'crawl4ai')) - - -async def test_link_extractor(): - """Test the link extractor functionality""" - - print("๐Ÿ”— Testing Link Extractor Functionality") - print("=" * 50) - - # Test configuration with link extraction AND scoring enabled - config = CrawlerRunConfig( - link_preview_config=LinkPreviewConfig( - include_internal=True, - include_external=False, # Only internal links for this test - # No include/exclude patterns for first test - let's see what we get - query="API documentation reference guide", - score_threshold=0.3, - concurrency=5, - timeout=10, - max_links=5, # Just test with 5 links first - verbose=True # Show detailed progress - ), - score_links=True, # Enable intrinsic link scoring - only_text=True, - verbose=True - ) - - # Test URLs - test_urls = [ - "https://docs.python.org/3/", # Python docs - should have many internal links - "https://httpbin.org/", # Simple site for testing - ] - - async with AsyncWebCrawler() as crawler: - for url in test_urls: - print(f"\n๐ŸŒ Testing URL: {url}") - print("-" * 40) - - try: - result = await crawler.arun(url, config=config) - - # Debug: Check if link extraction config is being passed - print(f"๐Ÿ” Debug - Link extraction config: {config.link_preview_config.to_dict() if config.link_preview_config else None}") - print(f"๐Ÿ” Debug - Score links: {config.score_links}") - - if result.success: - print(f"โœ… Crawl successful!") - print( - f"๐Ÿ“„ Page title: {result.metadata.get('title', 'No title')}") - - # Check links - handle both dict and Links object structure - if isinstance(result.links, dict): - internal_links = [ - Link(**link) for link in result.links.get('internal', [])] - external_links = [ - Link(**link) for link in result.links.get('external', [])] - else: - internal_links = result.links.internal - external_links = result.links.external - - print(f"๐Ÿ”— Found {len(internal_links)} internal links") - print(f"๐ŸŒ Found {len(external_links)} external links") - - # Show links with head data - links_with_head = [link for link in internal_links + external_links - if hasattr(link, 'head_data') and link.head_data] - - print( - f"๐Ÿง  Links with head data extracted: {len(links_with_head)}") - - # Show all score types for all links (first 3) - all_links = internal_links + external_links - if all_links: - print(f"\n๐Ÿ”ข Sample link 
scores (first 3 links):") - for i, link in enumerate(all_links[:3]): - print(f"\n {i+1}. {link.href}") - - # Show intrinsic score - if hasattr(link, 'intrinsic_score') and link.intrinsic_score is not None: - if link.intrinsic_score == float('inf'): - print(f" Intrinsic Score: โˆž (scoring disabled)") - else: - print(f" Intrinsic Score: {link.intrinsic_score:.2f}/10.0") - else: - print(f" Intrinsic Score: Not available") - - # Show contextual score (BM25) - if hasattr(link, 'contextual_score') and link.contextual_score is not None: - print(f" Contextual Score: {link.contextual_score:.3f}") - else: - print(f" Contextual Score: Not available") - - # Show total score - if hasattr(link, 'total_score') and link.total_score is not None: - print(f" Total Score: {link.total_score:.3f}") - else: - print(f" Total Score: Not available") - - print(f" Text: '{link.text[:50]}...' " if link.text else " Text: (no text)") - - if links_with_head: - print("\n๐Ÿ“Š Sample links with head data:") - # Show top 3 - for i, link in enumerate(links_with_head[:3]): - print(f"\n {i+1}. {link.href}") - print( - f" Status: {link.head_extraction_status}") - - # Show all three score types - print(f" ๐Ÿ“Š Scoring Summary:") - if hasattr(link, 'intrinsic_score') and link.intrinsic_score is not None: - if link.intrinsic_score == float('inf'): - print(f" โ€ข Intrinsic Score: โˆž (scoring disabled)") - else: - print(f" โ€ข Intrinsic Score: {link.intrinsic_score:.2f}/10.0") - else: - print(f" โ€ข Intrinsic Score: Not available") - - if hasattr(link, 'contextual_score') and link.contextual_score is not None: - print(f" โ€ข Contextual Score: {link.contextual_score:.3f}") - else: - print(f" โ€ข Contextual Score: Not available") - - if hasattr(link, 'total_score') and link.total_score is not None: - print(f" โ€ข Total Score: {link.total_score:.3f}") - else: - print(f" โ€ข Total Score: Not available") - - if link.head_data: - title = link.head_data.get('title', 'No title') - if title: - print(f" Title: {title[:60]}...") - - meta = link.head_data.get('meta', {}) - if 'description' in meta and meta['description']: - desc = meta['description'] - print(f" Description: {desc[:80]}...") - - # Show link metadata keys (should now be properly formatted) - link_data = link.head_data.get('link', {}) - if link_data: - keys = list(link_data.keys())[:3] - print(f" Link types: {keys}") - - # Show failed extractions - failed_links = [link for link in internal_links + external_links - if hasattr(link, 'head_extraction_status') and - link.head_extraction_status == 'failed'] - - if failed_links: - print( - f"\nโŒ Failed head extractions: {len(failed_links)}") - for link in failed_links[:2]: # Show first 2 failures - print(f" - {link.href}") - if hasattr(link, 'head_extraction_error') and link.head_extraction_error: - print( - f" Error: {link.head_extraction_error}") - - else: - print(f"โŒ Crawl failed: {result.error_message}") - - except Exception as e: - print(f"๐Ÿ’ฅ Error testing {url}: {str(e)}") - import traceback - traceback.print_exc() - - -def test_config_examples(): - """Show example configurations""" - - print("\n๐Ÿ“š Example Configurations") - print("=" * 50) - - examples = [ - { - "name": "BM25 Scored Documentation Links", - "config": LinkPreviewConfig( - include_internal=True, - include_external=False, - include_patterns=["*/docs/*", "*/api/*", "*/reference/*"], - query="API documentation reference guide", - score_threshold=0.3, - max_links=30, - verbose=True - ) - }, - { - "name": "Internal Links Only", - "config": LinkPreviewConfig( - 
include_internal=True, - include_external=False, - max_links=50, - verbose=True - ) - }, - { - "name": "External Links with Patterns", - "config": LinkPreviewConfig( - include_internal=False, - include_external=True, - include_patterns=["*github.com*", "*stackoverflow.com*"], - max_links=20, - concurrency=10 - ) - }, - { - "name": "High-Performance Mode", - "config": LinkPreviewConfig( - include_internal=True, - include_external=False, - concurrency=20, - timeout=3, - max_links=100, - verbose=False - ) - } - ] - - for example in examples: - print(f"\n๐Ÿ“ {example['name']}:") - print(" Configuration:") - config_dict = example['config'].to_dict() - for key, value in config_dict.items(): - print(f" {key}: {value}") - - print(" Usage:") - print(" from crawl4ai import LinkPreviewConfig") - print(" config = CrawlerRunConfig(") - print(" link_preview_config=LinkPreviewConfig(") - for key, value in config_dict.items(): - if isinstance(value, str): - print(f" {key}='{value}',") - elif isinstance(value, list) and value: - print(f" {key}={value},") - elif value is not None: - print(f" {key}={value},") - print(" )") - print(" )") - - -if __name__ == "__main__": - # Show configuration examples first - test_config_examples() - - # Run the actual test - print("\n๐Ÿš€ Running Link Extractor Tests...") - asyncio.run(test_link_extractor()) - - print("\nโœจ Test completed!") diff --git a/tests/test_llm_simple_url.py b/tests/test_llm_simple_url.py deleted file mode 100644 index bb31434c4..000000000 --- a/tests/test_llm_simple_url.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 -""" -Test LLMTableExtraction with controlled HTML -""" - -import os -import sys -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -import asyncio -from crawl4ai import ( - AsyncWebCrawler, - CrawlerRunConfig, - LLMConfig, - LLMTableExtraction, - DefaultTableExtraction, - CacheMode -) - -async def test_controlled_html(): - """Test with controlled HTML content.""" - print("\n" + "=" * 60) - print("LLM TABLE EXTRACTION TEST") - print("=" * 60) - - url = "https://en.wikipedia.org/wiki/List_of_chemical_elements" - # url = "https://en.wikipedia.org/wiki/List_of_prime_ministers_of_India" - - # Configure LLM - llm_config = LLMConfig( - # provider="openai/gpt-4.1-mini", - # api_token=os.getenv("OPENAI_API_KEY"), - provider="groq/llama-3.3-70b-versatile", - api_token="GROQ_API_TOKEN", - temperature=0.1, - max_tokens=32000 - ) - - print("\n1. 
Testing LLMTableExtraction:") - - # Create LLM extraction strategy - llm_strategy = LLMTableExtraction( - llm_config=llm_config, - verbose=True, - # css_selector="div.w3-example" - css_selector="div.mw-content-ltr", - # css_selector="table.wikitable", - max_tries=2, - - enable_chunking=True, - chunk_token_threshold=5000, # Lower threshold to force chunking - min_rows_per_chunk=10, - max_parallel_chunks=3 - ) - - config_llm = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - table_extraction=llm_strategy - ) - - async with AsyncWebCrawler() as crawler: - # Test with LLM extraction - result_llm = await crawler.arun( - # url=f"raw:{test_html}", - url=url, - config=config_llm - ) - - if result_llm.success: - print(f"\n โœ“ LLM Extraction: Found {len(result_llm.tables)} table(s)") - - for i, table in enumerate(result_llm.tables, 1): - print(f"\n Table {i}:") - print(f" - Caption: {table.get('caption', 'No caption')}") - print(f" - Headers: {table['headers']}") - print(f" - Rows: {len(table['rows'])}") - - # Show how colspan/rowspan were handled - print(f" - Sample rows:") - for j, row in enumerate(table['rows'][:2], 1): - print(f" Row {j}: {row}") - - metadata = table.get('metadata', {}) - print(f" - Metadata:") - print(f" โ€ข Has merged cells: {metadata.get('has_merged_cells', False)}") - print(f" โ€ข Table type: {metadata.get('table_type', 'unknown')}") - - # # Compare with default extraction - # print("\n2. Comparing with DefaultTableExtraction:") - - # default_strategy = DefaultTableExtraction( - # table_score_threshold=3, - # verbose=False - # ) - - # config_default = CrawlerRunConfig( - # cache_mode=CacheMode.BYPASS, - # table_extraction=default_strategy - # ) - - # result_default = await crawler.arun( - # # url=f"raw:{test_html}", - # url=url, - # config=config_default - # ) - - # if result_default.success: - # print(f" โœ“ Default Extraction: Found {len(result_default.tables)} table(s)") - - # # Compare handling of complex structures - # print("\n3. Comparison Summary:") - # print(f" LLM found: {len(result_llm.tables)} tables") - # print(f" Default found: {len(result_default.tables)} tables") - - # if result_llm.tables and result_default.tables: - # llm_first = result_llm.tables[0] - # default_first = result_default.tables[0] - - # print(f"\n First table comparison:") - # print(f" LLM headers: {len(llm_first['headers'])} columns") - # print(f" Default headers: {len(default_first['headers'])} columns") - - # # Check if LLM better handled the complex structure - # if llm_first.get('metadata', {}).get('has_merged_cells'): - # print(" โœ“ LLM correctly identified merged cells") - - # # Test pandas compatibility - # try: - # import pandas as pd - - # print("\n4. Testing Pandas compatibility:") - - # # Create DataFrame from LLM extraction - # df_llm = pd.DataFrame( - # llm_first['rows'], - # columns=llm_first['headers'] - # ) - # print(f" โœ“ LLM table -> DataFrame: Shape {df_llm.shape}") - - # # Create DataFrame from default extraction - # df_default = pd.DataFrame( - # default_first['rows'], - # columns=default_first['headers'] - # ) - # print(f" โœ“ Default table -> DataFrame: Shape {df_default.shape}") - - # print("\n LLM DataFrame preview:") - # print(df_llm.head(2).to_string()) - - # except ImportError: - # print("\n4. Pandas not installed, skipping DataFrame test") - - print("\nโœ… Test completed successfully!") - -async def main(): - """Run the test.""" - - # Check for API key - if not os.getenv("OPENAI_API_KEY"): - print("โš ๏ธ OPENAI_API_KEY not set. 
Please set it to test LLM extraction.") - print(" You can set it with: export OPENAI_API_KEY='your-key-here'") - return - - await test_controlled_html() - -if __name__ == "__main__": - asyncio.run(main()) - - - \ No newline at end of file diff --git a/tests/test_llmtxt.py b/tests/test_llmtxt.py deleted file mode 100644 index 2cdb02715..000000000 --- a/tests/test_llmtxt.py +++ /dev/null @@ -1,52 +0,0 @@ -from crawl4ai.llmtxt import AsyncLLMTextManager # Changed to AsyncLLMTextManager -from crawl4ai.async_logger import AsyncLogger -from pathlib import Path -import asyncio - - -async def main(): - current_file = Path(__file__).resolve() - # base_dir = current_file.parent.parent / "local/_docs/llm.txt/test_docs" - base_dir = current_file.parent.parent / "local/_docs/llm.txt" - docs_dir = base_dir - - # Create directory if it doesn't exist - docs_dir.mkdir(parents=True, exist_ok=True) - - # Initialize logger - logger = AsyncLogger() - # Updated initialization with default batching params - # manager = AsyncLLMTextManager(docs_dir, logger, max_concurrent_calls=3, batch_size=2) - manager = AsyncLLMTextManager(docs_dir, logger, batch_size=2) - - # Let's first check what files we have - print("\nAvailable files:") - for f in docs_dir.glob("*.md"): - print(f"- {f.name}") - - # Generate index files - print("\nGenerating index files...") - await manager.generate_index_files( - force_generate_facts=False, clear_bm25_cache=False - ) - - # Test some relevant queries about Crawl4AI - test_queries = [ - "How is using the `arun_many` method?", - ] - - print("\nTesting search functionality:") - for query in test_queries: - print(f"\nQuery: {query}") - results = manager.search(query, top_k=2) - print(f"Results length: {len(results)} characters") - if results: - print( - "First 200 chars of results:", results[:200].replace("\n", " "), "..." 
- ) - else: - print("No results found") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/test_main.py b/tests/test_main.py deleted file mode 100644 index b32b68f0e..000000000 --- a/tests/test_main.py +++ /dev/null @@ -1,276 +0,0 @@ -import asyncio -import aiohttp -import json -import time -import os -from typing import Dict, Any - - -class NBCNewsAPITest: - def __init__(self, base_url: str = "http://localhost:8000"): - self.base_url = base_url - self.session = None - - async def __aenter__(self): - self.session = aiohttp.ClientSession() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - if self.session: - await self.session.close() - - async def submit_crawl(self, request_data: Dict[str, Any]) -> str: - async with self.session.post( - f"{self.base_url}/crawl", json=request_data - ) as response: - result = await response.json() - return result["task_id"] - - async def get_task_status(self, task_id: str) -> Dict[str, Any]: - async with self.session.get(f"{self.base_url}/task/{task_id}") as response: - return await response.json() - - async def wait_for_task( - self, task_id: str, timeout: int = 300, poll_interval: int = 2 - ) -> Dict[str, Any]: - start_time = time.time() - while True: - if time.time() - start_time > timeout: - raise TimeoutError( - f"Task {task_id} did not complete within {timeout} seconds" - ) - - status = await self.get_task_status(task_id) - if status["status"] in ["completed", "failed"]: - return status - - await asyncio.sleep(poll_interval) - - async def check_health(self) -> Dict[str, Any]: - async with self.session.get(f"{self.base_url}/health") as response: - return await response.json() - - -async def test_basic_crawl(): - print("\n=== Testing Basic Crawl ===") - async with NBCNewsAPITest() as api: - request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10} - task_id = await api.submit_crawl(request) - result = await api.wait_for_task(task_id) - print(f"Basic crawl result length: {len(result['result']['markdown'])}") - assert result["status"] == "completed" - assert "result" in result - assert result["result"]["success"] - - -async def test_js_execution(): - print("\n=== Testing JS Execution ===") - async with NBCNewsAPITest() as api: - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 8, - "js_code": [ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ], - "wait_for": "article.tease-card:nth-child(10)", - "crawler_params": {"headless": True}, - } - task_id = await api.submit_crawl(request) - result = await api.wait_for_task(task_id) - print(f"JS execution result length: {len(result['result']['markdown'])}") - assert result["status"] == "completed" - assert result["result"]["success"] - - -async def test_css_selector(): - print("\n=== Testing CSS Selector ===") - async with NBCNewsAPITest() as api: - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 7, - "css_selector": ".wide-tease-item__description", - } - task_id = await api.submit_crawl(request) - result = await api.wait_for_task(task_id) - print(f"CSS selector result length: {len(result['result']['markdown'])}") - assert result["status"] == "completed" - assert result["result"]["success"] - - -async def test_structured_extraction(): - print("\n=== Testing Structured Extraction ===") - async with NBCNewsAPITest() as api: - schema = { - "name": "NBC News Articles", - 
"baseSelector": "article.tease-card", - "fields": [ - {"name": "title", "selector": "h2", "type": "text"}, - { - "name": "description", - "selector": ".tease-card__description", - "type": "text", - }, - { - "name": "link", - "selector": "a", - "type": "attribute", - "attribute": "href", - }, - ], - } - - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 9, - "extraction_config": {"type": "json_css", "params": {"schema": schema}}, - } - task_id = await api.submit_crawl(request) - result = await api.wait_for_task(task_id) - extracted = json.loads(result["result"]["extracted_content"]) - print(f"Extracted {len(extracted)} articles") - assert result["status"] == "completed" - assert result["result"]["success"] - assert len(extracted) > 0 - - -async def test_batch_crawl(): - print("\n=== Testing Batch Crawl ===") - async with NBCNewsAPITest() as api: - request = { - "urls": [ - "https://www.nbcnews.com/business", - "https://www.nbcnews.com/business/consumer", - "https://www.nbcnews.com/business/economy", - ], - "priority": 6, - "crawler_params": {"headless": True}, - } - task_id = await api.submit_crawl(request) - result = await api.wait_for_task(task_id) - print(f"Batch crawl completed, got {len(result['results'])} results") - assert result["status"] == "completed" - assert "results" in result - assert len(result["results"]) == 3 - - -async def test_llm_extraction(): - print("\n=== Testing LLM Extraction with Ollama ===") - async with NBCNewsAPITest() as api: - schema = { - "type": "object", - "properties": { - "article_title": { - "type": "string", - "description": "The main title of the news article", - }, - "summary": { - "type": "string", - "description": "A brief summary of the article content", - }, - "main_topics": { - "type": "array", - "items": {"type": "string"}, - "description": "Main topics or themes discussed in the article", - }, - }, - "required": ["article_title", "summary", "main_topics"], - } - - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 8, - "extraction_config": { - "type": "llm", - "params": { - "provider": "openai/gpt-4o-mini", - "api_key": os.getenv("OLLAMA_API_KEY"), - "schema": schema, - "extraction_type": "schema", - "instruction": """Extract the main article information including title, a brief summary, and main topics discussed. 
- Focus on the primary business news article on the page.""", - }, - }, - "crawler_params": {"headless": True, "word_count_threshold": 1}, - } - - task_id = await api.submit_crawl(request) - result = await api.wait_for_task(task_id) - - if result["status"] == "completed": - extracted = json.loads(result["result"]["extracted_content"]) - print("Extracted article analysis:") - print(json.dumps(extracted, indent=2)) - - assert result["status"] == "completed" - assert result["result"]["success"] - - -async def test_screenshot(): - print("\n=== Testing Screenshot ===") - async with NBCNewsAPITest() as api: - request = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 5, - "screenshot": True, - "crawler_params": {"headless": True}, - } - task_id = await api.submit_crawl(request) - result = await api.wait_for_task(task_id) - print("Screenshot captured:", bool(result["result"]["screenshot"])) - assert result["status"] == "completed" - assert result["result"]["success"] - assert result["result"]["screenshot"] is not None - - -async def test_priority_handling(): - print("\n=== Testing Priority Handling ===") - async with NBCNewsAPITest() as api: - # Submit low priority task first - low_priority = { - "urls": ["https://www.nbcnews.com/business"], - "priority": 1, - "crawler_params": {"headless": True}, - } - low_task_id = await api.submit_crawl(low_priority) - - # Submit high priority task - high_priority = { - "urls": ["https://www.nbcnews.com/business/consumer"], - "priority": 10, - "crawler_params": {"headless": True}, - } - high_task_id = await api.submit_crawl(high_priority) - - # Get both results - high_result = await api.wait_for_task(high_task_id) - low_result = await api.wait_for_task(low_task_id) - - print("Both tasks completed") - assert high_result["status"] == "completed" - assert low_result["status"] == "completed" - - -async def main(): - try: - # Start with health check - async with NBCNewsAPITest() as api: - health = await api.check_health() - print("Server health:", health) - - # Run all tests - # await test_basic_crawl() - # await test_js_execution() - # await test_css_selector() - # await test_structured_extraction() - await test_llm_extraction() - # await test_batch_crawl() - # await test_screenshot() - # await test_priority_handling() - - except Exception as e: - print(f"Test failed: {str(e)}") - raise - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/test_memory_macos.py b/tests/test_memory_macos.py deleted file mode 100755 index 7019ff035..000000000 --- a/tests/test_memory_macos.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -"""Test script to verify macOS memory calculation accuracy.""" - -import psutil -import platform -import time -from crawl4ai.utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb - - -def test_memory_calculation(): - """Test and compare memory calculations.""" - print(f"Platform: {platform.system()}") - print(f"Python version: {platform.python_version()}") - print("-" * 60) - - # Get psutil's view - vm = psutil.virtual_memory() - psutil_percent = vm.percent - psutil_available_gb = vm.available / (1024**3) - total_gb = vm.total / (1024**3) - - # Get our corrected view - true_percent = get_true_memory_usage_percent() - true_available_gb = get_true_available_memory_gb() - true_percent_calc, available_calc, total_calc = get_memory_stats() - - print("Memory Statistics Comparison:") - print(f"Total Memory: {total_gb:.2f} GB") - print() - - print("PSUtil (Standard) 
Calculation:") - print(f" - Memory Used: {psutil_percent:.1f}%") - print(f" - Available: {psutil_available_gb:.2f} GB") - print() - - print("Platform-Aware Calculation:") - print(f" - Memory Used: {true_percent:.1f}%") - print(f" - Available: {true_available_gb:.2f} GB") - print(f" - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory") - print() - - # Show the impact on dispatcher behavior - print("Impact on MemoryAdaptiveDispatcher:") - thresholds = { - "Normal": 90.0, - "Critical": 95.0, - "Recovery": 85.0 - } - - for name, threshold in thresholds.items(): - psutil_triggered = psutil_percent >= threshold - true_triggered = true_percent >= threshold - print(f" - {name} Threshold ({threshold}%):") - print(f" PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}") - print(f" Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}") - if psutil_triggered != true_triggered: - print(f" โ†’ Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}") - print() - - # Monitor for a few seconds - print("Monitoring memory for 10 seconds...") - for i in range(10): - vm = psutil.virtual_memory() - true_pct = get_true_memory_usage_percent() - print(f" {i+1}s - PSUtil: {vm.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r") - time.sleep(1) - print("\n") - - -if __name__ == "__main__": - test_memory_calculation() \ No newline at end of file diff --git a/tests/test_multi_config.py b/tests/test_multi_config.py deleted file mode 100644 index 09dd5283d..000000000 --- a/tests/test_multi_config.py +++ /dev/null @@ -1,117 +0,0 @@ -""" -Test example for multiple crawler configs feature -""" -import asyncio -import sys -from pathlib import Path - -# Add parent directory to path for imports -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode - -async def test_multi_config(): - # Create different configs for different URL patterns - - # Config for PDF files - pdf_config = CrawlerRunConfig( - url_matcher="*.pdf", - ) - - # Config for articles (using multiple patterns with OR logic) - article_config = CrawlerRunConfig( - url_matcher=["*/news/*", "*blog*", "*/article/*"], - match_mode=MatchMode.OR, - screenshot=True, - ) - - # Config using custom matcher function - api_config = CrawlerRunConfig( - url_matcher=lambda url: 'api' in url or 'json' in url, - ) - - # Config combining patterns and functions with AND logic - secure_docs_config = CrawlerRunConfig( - url_matcher=[ - "*.doc*", # Matches .doc, .docx - lambda url: url.startswith('https://') # Must be HTTPS - ], - match_mode=MatchMode.AND, - ) - - # Default config (no url_matcher means it won't match anything unless it's the fallback) - default_config = CrawlerRunConfig( - # cache_mode=CacheMode.BYPASS, - ) - - # List of configs - order matters! 
First match wins - configs = [ - pdf_config, - article_config, - api_config, - secure_docs_config, - default_config # Fallback - ] - - # Test URLs - using real URLs that exist - test_urls = [ - "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # Real PDF - "https://www.bbc.com/news/articles/c5y3e3glnldo", # News article - "https://blog.python.org/", # Blog URL - "https://api.github.com/users/github", # GitHub API (returns JSON) - "https://httpbin.org/json", # API endpoint that returns JSON - "https://www.python.org/", # Generic HTTPS page - "http://info.cern.ch/", # HTTP (not HTTPS) page - "https://example.com/", # โ†’ Default config - ] - - # Test the matching logic - print("Config matching test:") - print("-" * 50) - for url in test_urls: - for i, config in enumerate(configs): - if config.is_match(url): - print(f"{url} -> Config {i} matches") - break - else: - print(f"{url} -> No match, will use fallback (first config)") - - print("\n" + "=" * 50 + "\n") - - # Now test with actual crawler - async with AsyncWebCrawler() as crawler: - # Single config - traditional usage still works - print("Test 1: Single config (backwards compatible)") - result = await crawler.arun_many( - urls=["https://www.python.org/"], - config=default_config - ) - print(f"Crawled {len(result)} URLs with single config\n") - - # Multiple configs - new feature - print("Test 2: Multiple configs") - # Just test with 2 URLs to avoid timeout - results = await crawler.arun_many( - urls=test_urls[:2], # Just test first 2 URLs - config=configs # Pass list of configs - ) - print(f"Crawled {len(results)} URLs with multiple configs") - - # Using custom matcher inline - print("\nTest 3: Inline custom matcher") - custom_config = CrawlerRunConfig( - url_matcher=lambda url: len(url) > 50 and 'python' in url.lower(), - verbose=False - ) - results = await crawler.arun_many( - urls=[ - "https://docs.python.org/3/library/asyncio.html", # Long URL with 'python' - "https://python.org/", # Short URL with 'python' - won't match - "https://www.google.com/" # No 'python' - won't match - ], - config=[custom_config, default_config] - ) - print(f"Crawled {len(results)} URLs with custom matcher") - -if __name__ == "__main__": - asyncio.run(test_multi_config()) \ No newline at end of file diff --git a/tests/test_normalize_url.py b/tests/test_normalize_url.py deleted file mode 100644 index b1f1cc7d0..000000000 --- a/tests/test_normalize_url.py +++ /dev/null @@ -1,91 +0,0 @@ -import unittest -from crawl4ai.utils import normalize_url - -class TestNormalizeUrl(unittest.TestCase): - - def test_basic_relative_path(self): - self.assertEqual(normalize_url("path/to/page.html", "http://example.com/base/"), "http://example.com/base/path/to/page.html") - - def test_base_url_with_trailing_slash(self): - self.assertEqual(normalize_url("page.html", "http://example.com/base/"), "http://example.com/base/page.html") - - def test_base_url_without_trailing_slash(self): - # If normalize_url correctly uses urljoin, "base" is treated as a file. 
- self.assertEqual(normalize_url("page.html", "http://example.com/base"), "http://example.com/page.html") - - def test_absolute_url_as_href(self): - self.assertEqual(normalize_url("http://another.com/page.html", "http://example.com/"), "http://another.com/page.html") - - def test_href_with_leading_trailing_spaces(self): - self.assertEqual(normalize_url(" page.html ", "http://example.com/"), "http://example.com/page.html") - - def test_empty_href(self): - # urljoin with an empty href and base ending in '/' returns the base. - self.assertEqual(normalize_url("", "http://example.com/base/"), "http://example.com/base/") - # urljoin with an empty href and base not ending in '/' also returns base. - self.assertEqual(normalize_url("", "http://example.com/base"), "http://example.com/base") - - def test_href_with_query_parameters(self): - self.assertEqual(normalize_url("page.html?query=test", "http://example.com/"), "http://example.com/page.html?query=test") - - def test_href_with_fragment(self): - self.assertEqual(normalize_url("page.html#section", "http://example.com/"), "http://example.com/page.html#section") - - def test_different_scheme_in_href(self): - self.assertEqual(normalize_url("https://secure.example.com/page.html", "http://example.com/"), "https://secure.example.com/page.html") - - def test_parent_directory_in_href(self): - self.assertEqual(normalize_url("../otherpage.html", "http://example.com/base/current/"), "http://example.com/base/otherpage.html") - - def test_root_relative_href(self): - self.assertEqual(normalize_url("/otherpage.html", "http://example.com/base/current/"), "http://example.com/otherpage.html") - - def test_base_url_with_path_and_no_trailing_slash(self): - # If normalize_url correctly uses urljoin, "path" is treated as a file. - self.assertEqual(normalize_url("file.html", "http://example.com/path"), "http://example.com/file.html") - - def test_base_url_is_just_domain(self): - self.assertEqual(normalize_url("page.html", "http://example.com"), "http://example.com/page.html") - - def test_href_is_only_query(self): - self.assertEqual(normalize_url("?query=true", "http://example.com/page.html"), "http://example.com/page.html?query=true") - - def test_href_is_only_fragment(self): - self.assertEqual(normalize_url("#fragment", "http://example.com/page.html"), "http://example.com/page.html#fragment") - - def test_relative_link_from_base_file_url(self): - """ - Tests the specific bug report: relative links from a base URL that is a file. 
- Example: - Page URL: http://example.com/path/to/document.html - Link on page: - Expected: http://example.com/path/to/file.xlsx - """ - base_url_file = "http://example.com/zwgk/fdzdgk/zdxx/spaq/t19360680.shtml" - href_relative_current_dir = "./P020241203375994691134.xlsx" - expected_url1 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/P020241203375994691134.xlsx" - self.assertEqual(normalize_url(href_relative_current_dir, base_url_file), expected_url1) - - # Test with a relative link that doesn't start with "./" - href_relative_no_dot_slash = "another.doc" - expected_url2 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/another.doc" - self.assertEqual(normalize_url(href_relative_no_dot_slash, base_url_file), expected_url2) - - def test_invalid_base_url_scheme(self): - with self.assertRaises(ValueError) as context: - normalize_url("page.html", "ftp://example.com/") - self.assertIn("Invalid base URL format", str(context.exception)) - - def test_invalid_base_url_netloc(self): - with self.assertRaises(ValueError) as context: - normalize_url("page.html", "http:///path/") - self.assertIn("Invalid base URL format", str(context.exception)) - - def test_base_url_with_port(self): - self.assertEqual(normalize_url("path/file.html", "http://example.com:8080/base/"), "http://example.com:8080/base/path/file.html") - - def test_href_with_special_characters(self): - self.assertEqual(normalize_url("path%20with%20spaces/file.html", "http://example.com/"), "http://example.com/path%20with%20spaces/file.html") - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/test_scraping_strategy.py b/tests/test_scraping_strategy.py deleted file mode 100644 index df4628540..000000000 --- a/tests/test_scraping_strategy.py +++ /dev/null @@ -1,26 +0,0 @@ -import nest_asyncio - -nest_asyncio.apply() - -import asyncio -from crawl4ai import ( - AsyncWebCrawler, - CrawlerRunConfig, - LXMLWebScrapingStrategy, - CacheMode, -) - - -async def main(): - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - scraping_strategy=LXMLWebScrapingStrategy(), # Faster alternative to default BeautifulSoup - ) - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://example.com", config=config) - print(f"Success: {result.success}") - print(f"Markdown length: {len(result.markdown.raw_markdown)}") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 000000000..cdb07a980 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,42 @@ +import os + +from crawl4ai.utils import get_home_folder, normalize_url + + +def test_get_home_folder(monkeypatch, tmp_path): + base_dir = tmp_path / "custom_base" + monkeypatch.setenv("CRAWL4_AI_BASE_DIRECTORY", str(base_dir)) + + home_folder = get_home_folder() + assert home_folder == f"{base_dir}/.crawl4ai" + assert os.path.exists(home_folder) + assert os.path.exists(f"{home_folder}/cache") + assert os.path.exists(f"{home_folder}/models") + +class TestNormalizeUrl: + def test_relative_path_href(self): + assert normalize_url("path/to/page.html", "http://example.com/base/") == "http://example.com/base/path/to/page.html" + + def test_base_url_without_trailing_slash(self): + assert normalize_url("page.html", "http://example.com/base") == "http://example.com/page.html" + + def test_absolute_url_as_href(self): + assert normalize_url("http://another.com/page.html", "http://example.com/") == "http://another.com/page.html" + + def test_leading_trailing_spaces(self): + assert 
normalize_url(" page.html ", "http://example.com/") == "http://example.com/page.html" + + def test_href_with_query_parameters(self): + assert normalize_url("page.html?query=test", "http://example.com/") == "http://example.com/page.html?query=test" + + def test_parent_directory_in_href(self): + assert normalize_url("../otherpage.html", "http://example.com/base/current/") == "http://example.com/base/otherpage.html" + + def test_root_relative_href(self): + assert normalize_url("/otherpage.html", "http://example.com/base/current/") == "http://example.com/otherpage.html" + + def test_href_is_only_query(self): + assert normalize_url("?query=true", "http://example.com/page.html") == "http://example.com/page.html?query=true" + + def test_href_with_special_characters(self): + assert normalize_url("path with spaces/file.html", "http://example.com/") == "http://example.com/path%20with%20spaces/file.html" \ No newline at end of file diff --git a/tests/test_virtual_scroll.py b/tests/test_virtual_scroll.py index 1e7a7890e..1dad86667 100644 --- a/tests/test_virtual_scroll.py +++ b/tests/test_virtual_scroll.py @@ -1,13 +1,34 @@ + +import pytest + +import asyncio +import http.server +import os +import random +import re +import socketserver +import tempfile +import threading +from typing import Final + +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CacheMode, + CrawlerRunConfig, + VirtualScrollConfig, +) + """ -Test virtual scroll implementation according to the design: +Test virtual scroll implementation: - Create a page with virtual scroll that replaces content -- Verify all 1000 items are captured +- Verify all 400 items are captured """ -import asyncio -import os -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig +NUM_ITEMS: Final = 400 +NUM_ITEMS_PER_PAGE: Final = 10 +@pytest.mark.asyncio async def test_virtual_scroll(): """Test virtual scroll with content replacement (true virtual scroll)""" @@ -29,12 +50,12 @@ async def test_virtual_scroll(): -
Virtual Scroll Test - 1000 Items
+ Virtual Scroll Test - 400 Items