diff --git a/Cargo.toml b/Cargo.toml index a464f6d..99e23b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,10 +11,7 @@ repository = "https://github.com/blocklessnetwork/sdk-rust" [dependencies] base64 = { version = "0.13", default-features = false, features = ["alloc"], optional = true } -htmd = { version = "0.2.2", default-features = false, optional = true } json = { version = "0.12", default-features = false } -kuchikiki = { version = "0.8", default-features = false, optional = true } -regex = { version = "1.11.1", default-features = false, features = ["unicode-case"], optional = true } serde = { version = "1.0", features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["alloc"] } url = { version = "2.5", default-features = false } @@ -23,7 +20,6 @@ url = { version = "2.5", default-features = false } default = [ "http", "llm", - "bless-crawl", "cgi", "socket", "memory", @@ -32,7 +28,6 @@ default = [ mock-ffi = [] http = ["rpc", "dep:base64", "dep:serde"] llm = ["dep:serde"] -bless-crawl = ["http", "dep:htmd", "dep:kuchikiki", "dep:regex", "dep:serde"] cgi = [] socket = [] memory = [] diff --git a/examples/web-scrape.rs b/examples/web-scrape.rs deleted file mode 100644 index 552aa39..0000000 --- a/examples/web-scrape.rs +++ /dev/null @@ -1,93 +0,0 @@ -use blockless_sdk::bless_crawl::*; - -/// This example demonstrates how to use the Blockless SDK to perform web scraping -/// using the BlessCrawl functionality. -/// -/// It shows how to: -/// - Create a BlessCrawl instance with default configuration -/// - Scrape content from a single URL with custom configuration overrides -/// - Map links from a webpage to discover available URLs -/// - Handle errors and responses appropriately -fn main() { - println!("=== Blockless Web Scraping SDK Example ===\n"); - - example_scraping(); - example_mapping(); - example_crawling(); -} - -fn example_scraping() { - println!("--- Example 1: Basic Web Scraping ---"); - - let url = "https://example.com"; - println!("scraping: {}...", url); - - // First scrape with default config - let response = BlessCrawl::default() - .scrape(url, None) - .expect("Failed to scrape"); - println!("response with default config: {:?}", response); - println!(); - println!( - "---------- markdown ----------\n{}\n------------------------------", - response.data.content - ); -} - -fn example_mapping() { - println!("--- Example 2: Link Mapping/Discovery ---"); - - let url = "https://example.com"; - println!("Mapping links from: {}", url); - - let options = MapOptions::new() - .with_link_types(vec!["internal".to_string(), "external".to_string()]) - .with_base_url(url.to_string()) - .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]); - - let response = BlessCrawl::default() - .map(url, Some(options)) - .expect("Failed to map"); - println!("response: {:?}", response); - println!(); - println!( - "------------ links ------------\n{:?}\n------------------------------", - response.data.links - ); - println!(); - println!( - "------------ total links ------------\n{}\n------------------------------", - response.data.total_links - ); -} - -fn example_crawling() { - println!("--- Example 3: Recursive Website Crawling ---"); - - let url = "https://example.com"; - println!("Crawling website: {}", url); - - let options = CrawlOptions::new() - .with_max_depth(2) - .with_limit(10) - .with_include_paths(vec!["/".to_string()]) - .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()]) - 
.with_follow_external(false) - .with_delay_between_requests(1000) - .with_parallel_requests(3); - - let response = BlessCrawl::default() - .crawl(url, Some(options)) - .expect("Failed to crawl"); - println!("response: {:?}", response); - println!(); - println!( - "------------ pages ------------\n{:?}\n------------------------------", - response.data.pages - ); - println!(); - println!( - "------------ total pages ------------\n{}\n------------------------------", - response.data.total_pages - ); -} diff --git a/src/bless_crawl/html_to_markdown.rs b/src/bless_crawl/html_to_markdown.rs deleted file mode 100644 index 9137634..0000000 --- a/src/bless_crawl/html_to_markdown.rs +++ /dev/null @@ -1,119 +0,0 @@ -use htmd::HtmlToMarkdown; -use regex::Regex; - -/// Parses HTML content and converts it to Markdown -/// -/// This function replicates the behavior of the JavaScript parseMarkdown function: -/// - Converts HTML to Markdown using htmd -/// - Processes multi-line links by escaping newlines inside link content -/// - Removes "Skip to Content" links -/// - Returns empty string for empty/null input -pub fn parse_markdown(html: &str) -> String { - if html.is_empty() { - return String::new(); - } - - // Convert HTML to Markdown using htmd - let markdown = match HtmlToMarkdown::new().convert(html) { - Ok(md) => md, - Err(_) => { - // Return empty string if conversion fails - return String::new(); - } - }; - - // Process the markdown content - let processed_markdown = process_multiline_links(&markdown); - remove_skip_to_content_links(&processed_markdown) -} - -/// Processes multi-line links by escaping newlines inside link content -/// -/// This function replicates the JavaScript processMultiLineLinks function: -/// - Tracks when we're inside link content (between [ and ]) -/// - Escapes newlines with backslash when inside links -fn process_multiline_links(markdown_content: &str) -> String { - let mut new_markdown_content = String::new(); - let mut link_open_count: usize = 0; - - for ch in markdown_content.chars() { - match ch { - '[' => { - link_open_count += 1; - } - ']' => { - link_open_count = link_open_count.saturating_sub(1); - } - _ => {} - } - - let inside_link_content = link_open_count > 0; - - if inside_link_content && ch == '\n' { - new_markdown_content.push('\\'); - new_markdown_content.push('\n'); - } else { - new_markdown_content.push(ch); - } - } - - new_markdown_content -} - -/// Removes "Skip to Content" links from the markdown content -/// -/// This function replicates the JavaScript removeSkipToContentLinks function: -/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns -/// - Case-insensitive matching -fn remove_skip_to_content_links(markdown_content: &str) -> String { - let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap(); - re.replace_all(markdown_content, "").to_string() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_markdown_simple() { - let html = "
<div>
<p>Hello, world!</p>
</div>
"; - let result = parse_markdown(html); - assert_eq!(result.trim(), "Hello, world!"); - } - - #[test] - fn test_parse_markdown_complex() { - let html = - "
<div>
<p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul>
</div>
"; - let result = parse_markdown(html); - assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item"); - } - - #[test] - fn test_parse_markdown_empty() { - let html = ""; - let result = parse_markdown(html); - assert_eq!(result, ""); - } - - #[test] - fn test_process_multiline_links() { - let markdown = "[Link\nwith newline](http://example.com)"; - let result = process_multiline_links(markdown); - assert_eq!(result, "[Link\\\nwith newline](http://example.com)"); - } - - #[test] - fn test_remove_skip_to_content_links() { - let markdown = "Some content [Skip to Content](#page) more content"; - let result = remove_skip_to_content_links(markdown); - assert_eq!(result, "Some content more content"); - } - - #[test] - fn test_remove_skip_to_content_links_case_insensitive() { - let markdown = "Some content [Skip to content](#skip) more content"; - let result = remove_skip_to_content_links(markdown); - assert_eq!(result, "Some content more content"); - } -} diff --git a/src/bless_crawl/html_transform.rs b/src/bless_crawl/html_transform.rs deleted file mode 100644 index 8c56ebe..0000000 --- a/src/bless_crawl/html_transform.rs +++ /dev/null @@ -1,374 +0,0 @@ -use kuchikiki::{parse_html, traits::TendrilSink}; -use serde::{Deserialize, Serialize}; -use url::Url; - -const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [ - "header", - "footer", - "nav", - "aside", - ".header", - ".top", - ".navbar", - "#header", - ".footer", - ".bottom", - "#footer", - ".sidebar", - ".side", - ".aside", - "#sidebar", - ".modal", - ".popup", - "#modal", - ".overlay", - ".ad", - ".ads", - ".advert", - "#ad", - ".lang-selector", - ".language", - "#language-selector", - ".social", - ".social-media", - ".social-links", - "#social", - ".menu", - ".navigation", - "#nav", - ".breadcrumbs", - "#breadcrumbs", - ".share", - "#share", - ".widget", - "#widget", - ".cookie", - "#cookie", -]; - -const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [ - "#main", - // swoogo event software as .widget in all of their content - ".swoogo-cols", - ".swoogo-text", - ".swoogo-table-div", - ".swoogo-space", - ".swoogo-alert", - ".swoogo-sponsors", - ".swoogo-title", - ".swoogo-tabs", - ".swoogo-logo", - ".swoogo-image", - ".swoogo-button", - ".swoogo-agenda", -]; - -#[derive(Debug, Clone, Deserialize, Serialize)] -pub struct TransformHtmlOptions { - pub html: String, - pub url: String, - pub include_tags: Vec, - pub exclude_tags: Vec, - pub only_main_content: bool, -} - -#[derive(Debug)] -struct ImageSource { - url: String, - size: i32, - is_x: bool, -} - -#[derive(Debug)] -pub enum HtmlTransformError { - ParseError, - UrlParseError, - SelectError, -} - -impl std::fmt::Display for HtmlTransformError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - HtmlTransformError::ParseError => write!(f, "Failed to parse HTML"), - HtmlTransformError::UrlParseError => write!(f, "Failed to parse URL"), - HtmlTransformError::SelectError => write!(f, "Failed to select HTML elements"), - } - } -} - -impl std::error::Error for HtmlTransformError {} - -/// Transforms HTML by removing unwanted elements, filtering tags, and processing URLs -pub fn transform_html(opts: TransformHtmlOptions) -> Result { - let mut document = parse_html().one(opts.html); - - // If include_tags is specified, only include those tags - if !opts.include_tags.is_empty() { - let new_document = parse_html().one("
"); - let root = new_document - .select_first("div") - .map_err(|_| HtmlTransformError::SelectError)?; - - for tag_selector in opts.include_tags.iter() { - let matching_nodes: Vec<_> = document - .select(tag_selector) - .map_err(|_| HtmlTransformError::SelectError)? - .collect(); - for tag in matching_nodes { - root.as_node().append(tag.as_node().clone()); - } - } - - document = new_document; - } - - // Remove unwanted elements - let unwanted_selectors = ["head", "meta", "noscript", "style", "script"]; - for selector in &unwanted_selectors { - while let Ok(element) = document.select_first(selector) { - element.as_node().detach(); - } - } - - // Remove excluded tags - for tag_selector in opts.exclude_tags.iter() { - while let Ok(element) = document.select_first(tag_selector) { - element.as_node().detach(); - } - } - - // Remove non-main content if requested - if opts.only_main_content { - for selector in EXCLUDE_NON_MAIN_TAGS.iter() { - let elements: Vec<_> = document - .select(selector) - .map_err(|_| HtmlTransformError::SelectError)? - .collect(); - for element in elements { - // Check if this element contains any force-include tags - let should_keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|force_selector| { - element - .as_node() - .select(force_selector) - .map(|mut iter| iter.next().is_some()) - .unwrap_or(false) - }); - - if !should_keep { - element.as_node().detach(); - } - } - } - } - - // Process images with srcset attributes - let srcset_images: Vec<_> = document - .select("img[srcset]") - .map_err(|_| HtmlTransformError::SelectError)? - .collect(); - - for img in srcset_images { - let srcset = img.attributes.borrow().get("srcset").map(|s| s.to_string()); - if let Some(srcset) = srcset { - let mut sizes: Vec = srcset - .split(',') - .filter_map(|entry| { - let tokens: Vec<&str> = entry.trim().split(' ').collect(); - if tokens.is_empty() { - return None; - } - - let size_token = if tokens.len() > 1 && !tokens[1].is_empty() { - tokens[1] - } else { - "1x" - }; - - if let Ok(parsed_size) = size_token[..size_token.len() - 1].parse() { - Some(ImageSource { - url: tokens[0].to_string(), - size: parsed_size, - is_x: size_token.ends_with('x'), - }) - } else { - None - } - }) - .collect(); - - // Add src attribute as 1x if all sizes are x-based - if sizes.iter().all(|s| s.is_x) { - let src = img.attributes.borrow().get("src").map(|s| s.to_string()); - if let Some(src) = src { - sizes.push(ImageSource { - url: src, - size: 1, - is_x: true, - }); - } - } - - // Sort by size (largest first) and use the biggest image - sizes.sort_by(|a, b| b.size.cmp(&a.size)); - if let Some(biggest) = sizes.first() { - img.attributes - .borrow_mut() - .insert("src", biggest.url.clone()); - } - } - } - - // Convert relative URLs to absolute URLs - let base_url = Url::parse(&opts.url).map_err(|_| HtmlTransformError::UrlParseError)?; - - // Process image src attributes - let src_images: Vec<_> = document - .select("img[src]") - .map_err(|_| HtmlTransformError::SelectError)? - .collect(); - for img in src_images { - let old_src = img.attributes.borrow().get("src").map(|s| s.to_string()); - if let Some(old_src) = old_src { - if let Ok(new_url) = base_url.join(&old_src) { - img.attributes - .borrow_mut() - .insert("src", new_url.to_string()); - } - } - } - - // Process anchor href attributes - let href_anchors: Vec<_> = document - .select("a[href]") - .map_err(|_| HtmlTransformError::SelectError)? 
- .collect(); - for anchor in href_anchors { - let old_href = anchor - .attributes - .borrow() - .get("href") - .map(|s| s.to_string()); - if let Some(old_href) = old_href { - if let Ok(new_url) = base_url.join(&old_href) { - anchor - .attributes - .borrow_mut() - .insert("href", new_url.to_string()); - } - } - } - - Ok(document.to_string()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_transform_html_removes_unwanted_elements() { - let opts = TransformHtmlOptions { - html: "Test

Content

".to_string(), - url: "https://example.com".to_string(), - include_tags: vec![], - exclude_tags: vec![], - only_main_content: false, - }; - - let result = transform_html(opts).unwrap(); - let expected = "

Content

"; - assert_eq!(result, expected); - } - - #[test] - fn test_transform_html_include_tags() { - let opts = TransformHtmlOptions { - html: "
Keep this
Remove this
".to_string(), - url: "https://example.com".to_string(), - include_tags: vec![".content".to_string()], - exclude_tags: vec![], - only_main_content: false, - }; - - let result = transform_html(opts).unwrap(); - let expected = - "
Keep this
"; - assert_eq!(result, expected); - } - - #[test] - fn test_transform_html_exclude_tags() { - let opts = TransformHtmlOptions { - html: "
Keep this
Remove this
".to_string(), - url: "https://example.com".to_string(), - include_tags: vec![], - exclude_tags: vec![".ad".to_string()], - only_main_content: false, - }; - - let result = transform_html(opts).unwrap(); - let expected = "
Keep this
"; - assert_eq!(result, expected); - } - - #[test] - fn test_transform_html_relative_urls() { - let opts = TransformHtmlOptions { - html: r#"Link"# - .to_string(), - url: "https://example.com/subdir/".to_string(), - include_tags: vec![], - exclude_tags: vec![], - only_main_content: false, - }; - - let result = transform_html(opts).unwrap(); - let expected = r#"Link"#; - assert_eq!(result, expected); - } - - #[test] - fn test_transform_html_only_main_content() { - let opts = TransformHtmlOptions { - html: "
Header

Main content

Footer
".to_string(), - url: "https://example.com".to_string(), - include_tags: vec![], - exclude_tags: vec![], - only_main_content: true, - }; - - let result = transform_html(opts).unwrap(); - let expected = "

Main content

"; - assert_eq!(result, expected); - } - - #[test] - fn test_transform_html_srcset_processing() { - let opts = TransformHtmlOptions { - html: r#""#.to_string(), - url: "https://example.com".to_string(), - include_tags: vec![], - exclude_tags: vec![], - only_main_content: false, - }; - - let result = transform_html(opts).unwrap(); - let expected = r#""#; - assert_eq!(result, expected); - } - - #[test] - fn test_transform_html_force_include_tags() { - let opts = TransformHtmlOptions { - html: r#"

Important content

"#.to_string(), - url: "https://example.com".to_string(), - include_tags: vec![], - exclude_tags: vec![], - only_main_content: true, - }; - - let result = transform_html(opts).unwrap(); - let expected = r#"

Important content

"#; - assert_eq!(result, expected); - } -} diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs deleted file mode 100644 index 2a6e542..0000000 --- a/src/bless_crawl/mod.rs +++ /dev/null @@ -1,642 +0,0 @@ -//! # BlessCrawl - Distributed Web Scraping SDK -//! -//! Provides distributed web scraping across the BLESS network's browser nodes. -//! -//! ## Features -//! -//! - **scrape()**: Extract content from a URL as markdown -//! - **map()**: Discover and extract all links from a webpage -//! - **crawl()**: Recursively crawl websites with depth controls -//! -//! ## Limits -//! -//! - Timeout: 15s default, 120s max -//! - Wait time: 3s default, 20s max -//! - Buffer sizes: 2MB (scrape), 128KB (map), 8MB (crawl) - -mod html_to_markdown; -mod html_transform; - -use html_to_markdown::parse_markdown; -pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions}; -use std::collections::HashMap; - -type Handle = u32; -type ExitCode = u8; - -#[cfg(not(feature = "mock-ffi"))] -#[link(wasm_import_module = "bless_crawl")] -extern "C" { - /// Scrape webpage content and return as markdown - #[allow(clippy::too_many_arguments)] - fn scrape( - h: *mut Handle, - url_ptr: *const u8, - url_len: usize, - options_ptr: *const u8, - options_len: usize, - result_ptr: *mut u8, - result_len: usize, - bytes_written: *mut usize, - ) -> ExitCode; - - /// Close and cleanup a web scraper instance - fn close(h: Handle) -> ExitCode; -} - -#[cfg(feature = "mock-ffi")] -#[allow(unused_variables)] -mod mock_ffi { - use super::{ExitCode, Handle}; - - #[allow(clippy::too_many_arguments)] - pub unsafe fn scrape( - h: *mut Handle, - _url_ptr: *const u8, - _url_len: usize, - _options_ptr: *const u8, - _options_len: usize, - result_ptr: *mut u8, - result_len: usize, - bytes_written: *mut usize, - ) -> ExitCode { - 1 - } - - pub unsafe fn close(_h: Handle) -> ExitCode { - 1 - } -} - -#[cfg(feature = "mock-ffi")] -use mock_ffi::*; - -#[derive(Debug, Clone, PartialEq, serde::Serialize)] -pub struct ScrapeOptions { - pub timeout: u32, - pub wait_time: u32, - #[serde(skip_serializing_if = "Option::is_none")] - pub include_tags: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub exclude_tags: Option>, - pub only_main_content: bool, - pub format: Format, - #[serde(skip_serializing_if = "Option::is_none")] - pub viewport: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub user_agent: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub headers: Option>, -} - -impl Default for ScrapeOptions { - fn default() -> Self { - Self { - timeout: BlessCrawl::DEFAULT_TIMEOUT_MS, - wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS, - include_tags: None, - exclude_tags: None, - only_main_content: false, - format: Format::Markdown, - viewport: None, - user_agent: None, - headers: None, - } - } -} - -#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)] -pub enum Format { - #[default] - #[serde(rename = "markdown")] - Markdown, - #[serde(rename = "html")] - Html, - #[serde(rename = "json")] - Json, -} - -impl std::str::FromStr for Format { - type Err = (); - fn from_str(s: &str) -> Result { - match s { - "markdown" => Ok(Format::Markdown), - "html" => Ok(Format::Html), - "json" => Ok(Format::Json), - _ => Err(()), - } - } -} - -#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] -pub struct Viewport { - #[serde(skip_serializing_if = "Option::is_none")] - pub width: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub height: Option, -} - 
-#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] -pub struct MapOptions { - #[serde(skip_serializing_if = "Option::is_none")] - pub link_types: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub base_url: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub filter_extensions: Option>, -} - -#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] -pub struct CrawlOptions { - #[serde(skip_serializing_if = "Option::is_none")] - pub limit: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub max_depth: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub exclude_paths: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub include_paths: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub follow_external: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub delay_between_requests: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub parallel_requests: Option, -} - -#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)] -pub struct PageMetadata { - #[serde(skip_serializing_if = "Option::is_none")] - pub title: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub description: Option, - pub url: String, - pub status_code: u16, - #[serde(skip_serializing_if = "Option::is_none")] - pub language: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub keywords: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub robots: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub author: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub creator: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub publisher: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub og_title: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub og_description: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub og_image: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub og_url: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub og_site_name: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub og_type: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub twitter_title: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub twitter_description: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub twitter_image: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub twitter_card: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub twitter_site: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub twitter_creator: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub favicon: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub viewport: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub referrer: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub content_type: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub scrape_id: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub source_url: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub proxy_used: Option, -} - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct ScrapeData { - pub success: bool, - pub timestamp: u64, - pub format: Format, - pub content: String, - pub metadata: PageMetadata, -} - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct Response { - pub 
success: bool, - #[serde(skip_serializing_if = "Option::is_none")] - pub error: Option, - pub data: T, -} - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct LinkInfo { - pub url: String, - // TODO: use enum instead of string - pub link_type: String, // "internal", "external", "anchor" -} - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct MapData { - pub url: String, - pub links: Vec, - pub total_links: usize, - pub timestamp: u64, -} - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct CrawlError { - pub url: String, - pub error: String, - pub depth: u32, -} - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct CrawlData { - pub root_url: String, - pub pages: Vec, - #[serde(skip_serializing_if = "Option::is_none")] - pub link_map: Option, - pub depth_reached: u8, - pub total_pages: usize, - pub errors: Vec, -} - -impl ScrapeOptions { - pub fn new() -> Self { - Self::default() - } - - pub fn with_include_tags(mut self, tags: Vec) -> Self { - self.include_tags = Some(tags); - self - } - - pub fn with_exclude_tags(mut self, tags: Vec) -> Self { - self.exclude_tags = Some(tags); - self - } - - pub fn with_format(mut self, format: Format) -> Self { - self.format = format; - self - } - - pub fn with_viewport(mut self, width: u32, height: u32) -> Self { - self.viewport = Some(Viewport { - width: Some(width), - height: Some(height), - }); - self - } - - pub fn with_user_agent(mut self, user_agent: String) -> Self { - self.user_agent = Some(user_agent); - self - } - - pub fn with_headers(mut self, headers: HashMap) -> Self { - self.headers = Some(headers); - self - } -} - -impl MapOptions { - pub fn new() -> Self { - Self::default() - } - - pub fn with_link_types(mut self, link_types: Vec) -> Self { - self.link_types = Some(link_types); - self - } - - pub fn with_base_url(mut self, base_url: String) -> Self { - self.base_url = Some(base_url); - self - } - - pub fn with_filter_extensions(mut self, extensions: Vec) -> Self { - self.filter_extensions = Some(extensions); - self - } -} - -impl CrawlOptions { - pub fn new() -> Self { - Self::default() - } - - pub fn with_limit(mut self, limit: u32) -> Self { - self.limit = Some(limit); - self - } - - pub fn with_max_depth(mut self, max_depth: u8) -> Self { - self.max_depth = Some(max_depth); - self - } - - pub fn with_exclude_paths(mut self, paths: Vec) -> Self { - self.exclude_paths = Some(paths); - self - } - - pub fn with_include_paths(mut self, paths: Vec) -> Self { - self.include_paths = Some(paths); - self - } - - pub fn with_follow_external(mut self, follow: bool) -> Self { - self.follow_external = Some(follow); - self - } - - pub fn with_delay_between_requests(mut self, delay: u32) -> Self { - self.delay_between_requests = Some(delay); - self - } - - pub fn with_parallel_requests(mut self, parallel: u32) -> Self { - self.parallel_requests = Some(parallel); - self - } -} - -/// BlessCrawl client for distributed web scraping operations. 
-#[derive(Debug, Clone, Default)] -pub struct BlessCrawl { - inner: Handle, - config: ScrapeOptions, -} - -impl BlessCrawl { - /// Default timeout in milliseconds (15 seconds) - pub const DEFAULT_TIMEOUT_MS: u32 = 15000; - /// Default wait time in milliseconds (3 seconds) - pub const DEFAULT_WAIT_TIME_MS: u32 = 3000; - - /// Maximum timeout in milliseconds (2 minutes) - pub const MAX_TIMEOUT_MS: u32 = 120000; - /// Maximum wait time in milliseconds (20 seconds) - pub const MAX_WAIT_TIME_MS: u32 = 20000; - - /// Maximum result buffer size in bytes (2MB) - pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024; - - /// Maximum result buffer size in bytes (1MB) - pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024; - - /// Maximum result buffer size in bytes (8MB) - pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024; - - /// Creates a new BlessCrawl instance with the given configuration. - pub fn with_config(config: ScrapeOptions) -> Result { - let instance = Self { inner: 0, config }; - instance.validate_config(&instance.config)?; - Ok(instance) - } - - fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> { - if config.timeout > Self::MAX_TIMEOUT_MS { - return Err(WebScrapeErrorKind::InvalidTimeout); - } - if config.wait_time > Self::MAX_WAIT_TIME_MS { - return Err(WebScrapeErrorKind::InvalidWaitTime); - } - Ok(()) - } - - /// Returns a reference to the current configuration. - pub fn get_config(&self) -> &ScrapeOptions { - &self.config - } - - pub fn handle(&self) -> Handle { - self.inner - } - - /// Scrapes webpage content and returns it as markdown with metadata. - pub fn scrape( - &self, - url: &str, - options: Option, - ) -> Result, WebScrapeErrorKind> { - // Use provided options or fall back to instance config - let config = if let Some(opts) = options { - self.validate_config(&opts)?; - opts - } else { - self.config.clone() - }; - - let options_json = serde_json::to_vec(&config).unwrap(); - - let mut handle = self.inner; - let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE]; - let mut bytes_written: usize = 0; - - let code = unsafe { - scrape( - &mut handle, - url.as_ptr(), - url.len(), - options_json.as_ptr(), - options_json.len(), - result_buf.as_mut_ptr(), - result_buf.len(), - &mut bytes_written, - ) - }; - - if code != 0 { - return Err(code.into()); - } - if bytes_written == 0 { - return Err(WebScrapeErrorKind::EmptyResponse); - } - if bytes_written > result_buf.len() { - return Err(WebScrapeErrorKind::MemoryError); - } - - let result_bytes = - unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; - - // deserialize the result to host ScrapeResponse - let mut scrape_response = serde_json::from_slice::>(result_bytes) - .map_err(|e| { - eprintln!("error: {:?}", e); - WebScrapeErrorKind::ParseError - })?; - - if let Some(error) = scrape_response.error { - return Err(WebScrapeErrorKind::RuntimeError(error)); - } - - // post-process html - scrape_response.data.content = transform_html(TransformHtmlOptions { - html: scrape_response.data.content, - url: scrape_response.data.metadata.url.clone(), - include_tags: config.include_tags.unwrap_or_default(), - exclude_tags: config.exclude_tags.unwrap_or_default(), - only_main_content: config.only_main_content, - }) - .map_err(|e| { - eprintln!("error: {:?}", e); - WebScrapeErrorKind::TransformError - })?; - - // if the format is markdown, set the data to the markdown of the html - match config.format { - Format::Markdown => { - scrape_response.data.content = 
parse_markdown(&scrape_response.data.content); - } - Format::Html => (), // no need to do anything - Format::Json => unimplemented!(), - } - - // convert the host ScrapeResponse to the user ScrapeResponse - Ok(scrape_response) - } - - /// Extracts all links from a webpage, categorized by type. - pub fn map( - &self, - url: &str, - options: Option, - ) -> Result, WebScrapeErrorKind> { - let _map_options = options.unwrap_or_default(); - - // let scrape_response = self.scrape(url, None)?; - // TODO: implement map by post-processing the scrape response or using fetch - - Ok(Response { - success: true, - error: None, - data: MapData { - url: url.to_string(), - links: vec![], - total_links: 0, - timestamp: 0, - }, - }) - } - - /// Recursively crawls a website with configurable depth and filtering. - pub fn crawl( - &self, - url: &str, - options: Option, - ) -> Result>, WebScrapeErrorKind> { - let _crawl_options = options.unwrap_or_default(); - - // TODO: implement crawl by post-processing the scrape response or using fetch - - Ok(Response { - success: true, - error: None, - data: CrawlData { - root_url: url.to_string(), - pages: vec![], - link_map: None, - depth_reached: 0, - total_pages: 0, - errors: vec![], - }, - }) - } -} - -impl Drop for BlessCrawl { - fn drop(&mut self) { - // if the handle is 0, it means the instance was never initialized on the host - if self.inner == 0 { - return; - } - let code = unsafe { close(self.inner) }; - if code != 0 { - eprintln!("Error closing web scraper: {}", code); - } - } -} - -#[derive(Debug)] -pub enum WebScrapeErrorKind { - InvalidUrl, - Timeout, - NetworkError, - RenderingError, - MemoryError, - DepthExceeded, - RateLimited, - TransformError, - Utf8Error, - ParseError, - ScrapeFailed, - MapFailed, - CrawlFailed, - EmptyResponse, - InvalidTimeout, - InvalidWaitTime, - RuntimeError(String), -} - -impl From for WebScrapeErrorKind { - fn from(code: u8) -> Self { - match code { - 1 => WebScrapeErrorKind::InvalidUrl, - 2 => WebScrapeErrorKind::Timeout, - 3 => WebScrapeErrorKind::NetworkError, - 4 => WebScrapeErrorKind::RenderingError, - 5 => WebScrapeErrorKind::MemoryError, - 6 => WebScrapeErrorKind::DepthExceeded, - 7 => WebScrapeErrorKind::RateLimited, - 8 => WebScrapeErrorKind::TransformError, - 9 => WebScrapeErrorKind::RuntimeError(String::from("Invalid timeout")), - 10 => WebScrapeErrorKind::RuntimeError(String::from("Invalid wait time")), - _ => WebScrapeErrorKind::RuntimeError(String::from("Unknown error")), - } - } -} - -impl std::fmt::Display for WebScrapeErrorKind { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"), - WebScrapeErrorKind::Timeout => write!(f, "Request timeout"), - WebScrapeErrorKind::NetworkError => write!(f, "Network error"), - WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"), - WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"), - WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"), - WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"), - WebScrapeErrorKind::TransformError => write!(f, "Transform error"), - WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"), - WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"), - WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"), - WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"), - WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation 
failed"), - WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"), - WebScrapeErrorKind::InvalidTimeout => { - write!(f, "Timeout exceeds maximum allowed (120s)") - } - WebScrapeErrorKind::InvalidWaitTime => { - write!(f, "Wait time exceeds maximum allowed (20s)") - } - WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error), - } - } -} - -impl std::error::Error for WebScrapeErrorKind {} diff --git a/src/lib.rs b/src/lib.rs index 80bfe1d..f399128 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,3 @@ pub mod socket; #[cfg(feature = "http")] pub mod http; - -#[cfg(feature = "bless-crawl")] -pub mod bless_crawl;
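
Note for downstream users: the markdown post-processing that `src/bless_crawl/html_to_markdown.rs` performed after the htmd conversion (escaping newlines inside link text and dropping "Skip to Content" links) can be reproduced with the standard library alone. The sketch below mirrors the removed logic; `escape_newlines_in_links` and `strip_skip_links` are illustrative names, not SDK APIs, and the skip-link pass uses a plain scan in place of the removed `regex` dependency.

```rust
/// Mirrors `process_multiline_links` from the removed module: escape
/// newlines that fall inside `[...]` link text.
fn escape_newlines_in_links(markdown: &str) -> String {
    let mut out = String::with_capacity(markdown.len());
    let mut link_open_count: usize = 0;

    for ch in markdown.chars() {
        match ch {
            '[' => link_open_count += 1,
            ']' => link_open_count = link_open_count.saturating_sub(1),
            _ => {}
        }
        if link_open_count > 0 && ch == '\n' {
            out.push('\\');
        }
        out.push(ch);
    }
    out
}

/// Regex-free stand-in for `remove_skip_to_content_links`: drop
/// `[Skip to Content](#...)` links, matching case-insensitively.
fn strip_skip_links(markdown: &str) -> String {
    const PREFIX: &str = "[skip to content](#";
    let mut out = String::with_capacity(markdown.len());
    let mut i = 0;
    while i < markdown.len() {
        let rest = &markdown[i..];
        let is_match = rest.len() >= PREFIX.len()
            && rest.is_char_boundary(PREFIX.len())
            && rest[..PREFIX.len()].eq_ignore_ascii_case(PREFIX);
        if is_match {
            if let Some(close) = rest.find(')') {
                i += close + 1; // skip the whole link, keep scanning after `)`
                continue;
            }
        }
        let ch = rest.chars().next().expect("i is on a char boundary");
        out.push(ch);
        i += ch.len_utf8();
    }
    out
}

fn main() {
    assert_eq!(
        escape_newlines_in_links("[Link\nwith newline](http://example.com)"),
        "[Link\\\nwith newline](http://example.com)"
    );
    // Note the double space left behind where the link used to be.
    assert_eq!(
        strip_skip_links("Some content [Skip to Content](#page) more content"),
        "Some content  more content"
    );
    println!("post-processing sketch OK");
}
```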
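
Similarly, the `srcset` handling removed from `src/bless_crawl/html_transform.rs` boiled down to parsing the candidate list and keeping the largest descriptor. A minimal sketch of that selection step, assuming ASCII `Nw`/`Nx` descriptors as in the removed code (struct and function names are illustrative, not SDK APIs):

```rust
// Parse "url1 480w, url2 2x" style candidates and keep the largest one.
#[derive(Debug)]
struct Candidate {
    url: String,
    size: i32,
    is_x: bool, // density descriptor ("2x") vs width descriptor ("480w")
}

fn parse_srcset(srcset: &str) -> Vec<Candidate> {
    srcset
        .split(',')
        .filter_map(|entry| {
            let mut tokens = entry.trim().split_whitespace();
            let url = tokens.next()?.to_string();
            // Default to "1x" when no descriptor is given, as the removed code did.
            let descriptor = tokens.next().unwrap_or("1x");
            let size: i32 = descriptor[..descriptor.len() - 1].parse().ok()?;
            Some(Candidate {
                url,
                size,
                is_x: descriptor.ends_with('x'),
            })
        })
        .collect()
}

fn pick_largest(mut candidates: Vec<Candidate>) -> Option<Candidate> {
    // Largest descriptor wins, mirroring the sort in the removed transform.
    candidates.sort_by(|a, b| b.size.cmp(&a.size));
    candidates.into_iter().next()
}

fn main() {
    let srcset = "/img/small.png 480w, /img/large.png 1024w";
    let best = pick_largest(parse_srcset(srcset)).expect("at least one candidate");
    assert_eq!(best.url, "/img/large.png");
    println!("chose {} ({}{})", best.url, best.size, if best.is_x { "x" } else { "w" });
}
```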
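
Finally, the removed host calls used a caller-allocated result buffer plus a `bytes_written` out-parameter; the guest then sliced the buffer to that length before JSON-decoding it. A hedged sketch of that handshake with a stand-in `mock_host_scrape` (not a real import):

```rust
// Caller-allocated buffer handshake, as used by the removed `bless_crawl.scrape`.
// `mock_host_scrape` is a stand-in for the host import, not a real function.
fn mock_host_scrape(result: &mut [u8], bytes_written: &mut usize) -> u8 {
    let payload = br#"{"success":true,"data":"..."}"#;
    if payload.len() > result.len() {
        *bytes_written = 0;
        return 5; // maps to MemoryError in the removed error mapping
    }
    result[..payload.len()].copy_from_slice(payload);
    *bytes_written = payload.len();
    0
}

fn main() {
    const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024; // 2MB, as in the removed code
    let mut buf = vec![0u8; MAX_SCRAPE_BUFFER_SIZE];
    let mut written = 0usize;

    let code = mock_host_scrape(&mut buf, &mut written);
    assert_eq!(code, 0, "non-zero exit codes map to WebScrapeErrorKind");
    assert!(written > 0 && written <= buf.len());

    let body = &buf[..written];
    println!("host returned {} bytes: {}", written, String::from_utf8_lossy(body));
}
```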