From 85f6b015b6d0629bacb9e62746d0aef01f1b1cf5 Mon Sep 17 00:00:00 2001
From: z
Date: Wed, 25 Jun 2025 16:47:39 +1200
Subject: [PATCH 01/12] upd cargo.toml deps

---
 Cargo.toml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index a9e2388..48b0121 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,11 +10,13 @@ license = "MIT/Apache-2.0"
 repository = "https://github.com/blocklessnetwork/sdk-rust"
 
 [dependencies]
+htmd = { version = "0.2.2", default-features = false }
 json = { version = "0.12", default-features = false }
+kuchikiki = { version = "0.8", default-features = false }
+regex = { version = "1.11.1", default-features = false, features = ["unicode-case"] }
 serde = { version = "1.0", features = ["derive"], optional = true }
-
-[dev-dependencies]
-serde_json = "1.0"
+serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
+url = { version = "2.5", default-features = false }
 
 [features]
 default = ["serde"]

From ea2d17d646a69a0d56e36bb4eb2af8596c77d431 Mon Sep 17 00:00:00 2001
From: z
Date: Wed, 25 Jun 2025 16:48:28 +1200
Subject: [PATCH 02/12] bless-crawl plugin impl

---
 src/bless_crawl/mod.rs | 751 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 751 insertions(+)
 create mode 100644 src/bless_crawl/mod.rs

diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs
new file mode 100644
index 0000000..9c35b9b
--- /dev/null
+++ b/src/bless_crawl/mod.rs
@@ -0,0 +1,751 @@
+//! # BlessCrawl - Distributed Web Scraping SDK
+//!
+//! Provides distributed web scraping across the BLESS network's browser nodes.
+//!
+//! ## Features
+//!
+//! - **scrape()**: Extract content from a URL as markdown
+//! - **map()**: Discover and extract all links from a webpage
+//! - **crawl()**: Recursively crawl websites with depth controls
+//!
+//! ## Limits
+//!
+//! - Timeout: 15s default, 120s max
+//! - Wait time: 3s default, 20s max
+//! - Buffer sizes: 2MB (scrape), 128KB (map), 8MB (crawl)
+//!
+//! ## Example
+//!
+//! ```rust
+//! use blockless_sdk::*;
+//!
+//! // Create with default config
+//! let crawler = BlessCrawl::default();
+//! let result = crawler.scrape("https://example.com", None).unwrap();
+//!
+//! // Or override config per request
+//! let custom_config = ScrapeOptions { timeout: 30000, wait_time: 5000, ..Default::default() };
+//! let result = crawler.scrape("https://example.com", Some(custom_config)).unwrap();
+//! println!("Content: {}", result.data.content);
+//!
``` + +mod html_to_markdown; +mod html_transform; + +use html_to_markdown::parse_markdown; +pub use html_transform::{transform_html, TransformHtmlOptions, HtmlTransformError}; +use std::collections::HashMap; + +type Handle = u32; +type ExitCode = u8; + +#[cfg(not(feature = "mock-ffi"))] +#[link(wasm_import_module = "bless_crawl")] +extern "C" { + /// Scrape webpage content and return as markdown + fn scrape( + h: *mut Handle, + url_ptr: *const u8, + url_len: usize, + options_ptr: *const u8, + options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode; + + /// Extract and return all discoverable links from webpage + fn map( + h: *mut Handle, + url_ptr: *const u8, + url_len: usize, + options_ptr: *const u8, + options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode; + + /// Recursively crawl website starting from given URL + fn crawl( + h: *mut Handle, + url_ptr: *const u8, + url_len: usize, + options_ptr: *const u8, + options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode; + + /// Close and cleanup a web scraper instance + fn close(h: Handle) -> ExitCode; +} + +#[cfg(feature = "mock-ffi")] +#[allow(unused_variables)] +mod mock_ffi { + use super::{ExitCode, Handle}; + + pub unsafe fn scrape( + h: *mut Handle, + _url_ptr: *const u8, + _url_len: usize, + _options_ptr: *const u8, + _options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode { + unimplemented!() + } + + pub unsafe fn map( + h: *mut Handle, + _url_ptr: *const u8, + _url_len: usize, + _options_ptr: *const u8, + _options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode { + unimplemented!() + } + + pub unsafe fn crawl( + h: *mut Handle, + _url_ptr: *const u8, + _url_len: usize, + _options_ptr: *const u8, + _options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode { + unimplemented!() + } + + pub unsafe fn close(_h: Handle) -> ExitCode { + unimplemented!() + } +} + +#[cfg(feature = "mock-ffi")] +use mock_ffi::*; + +#[derive(Debug, Clone, PartialEq, serde::Serialize)] +pub struct ScrapeOptions { + pub timeout: u32, + pub wait_time: u32, + pub include_tags: Option>, + pub exclude_tags: Option>, + pub only_main_content: bool, + pub format: Format, + pub viewport: Option, + pub user_agent: Option, + pub headers: Option>, +} + +impl Default for ScrapeOptions { + fn default() -> Self { + Self { + timeout: BlessCrawl::DEFAULT_TIMEOUT_MS, + wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS, + include_tags: None, + exclude_tags: None, + only_main_content: false, + format: Format::Markdown, + viewport: None, + user_agent: None, + headers: None, + } + } +} + +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)] +pub enum Format { + #[default] + #[serde(rename = "markdown")] + Markdown, + #[serde(rename = "html")] + Html, + #[serde(rename = "json")] + Json, +} + +impl std::fmt::Display for Format { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string().to_lowercase()) + } +} + +impl std::str::FromStr for Format { + type Err = (); + fn from_str(s: &str) -> Result { + match s { + "markdown" => Ok(Format::Markdown), + "html" => Ok(Format::Html), + "json" => Ok(Format::Json), + _ => Err(()), + } + } +} + +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] +pub 
struct Viewport { + pub width: Option, + pub height: Option, +} + +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] +pub struct MapOptions { + pub link_types: Option>, + pub base_url: Option, + pub filter_extensions: Option>, +} + +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] +pub struct CrawlOptions { + pub limit: Option, + pub max_depth: Option, + pub exclude_paths: Option>, + pub include_paths: Option>, + pub follow_external: Option, + pub delay_between_requests: Option, + pub parallel_requests: Option, +} + +#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)] +pub struct PageMetadata { + pub title: Option, + pub description: Option, + pub url: String, + pub status_code: u16, + pub language: Option, + pub keywords: Option, + pub robots: Option, + pub author: Option, + pub creator: Option, + pub publisher: Option, + pub og_title: Option, + pub og_description: Option, + pub og_image: Option, + pub og_url: Option, + pub og_site_name: Option, + pub og_type: Option, + pub twitter_title: Option, + pub twitter_description: Option, + pub twitter_image: Option, + pub twitter_card: Option, + pub twitter_site: Option, + pub twitter_creator: Option, + pub favicon: Option, + pub viewport: Option, + pub referrer: Option, + pub content_type: Option, + pub scrape_id: Option, + pub source_url: Option, + pub proxy_used: Option, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct ScrapeData { + pub success: bool, + pub timestamp: u64, + pub format: Format, + pub content: String, + pub metadata: PageMetadata, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct Response { + pub success: bool, + pub error: Option, + pub data: T, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct LinkInfo { + pub url: String, + // TODO: use enum instead of string + pub link_type: String, // "internal", "external", "anchor" +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct MapData { + pub url: String, + pub links: Vec, + pub total_links: usize, + pub timestamp: u64, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct CrawlError { + pub url: String, + pub error: String, + pub depth: u32, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct CrawlData { + pub root_url: String, + pub pages: Vec, + pub link_map: Option, + pub depth_reached: u8, + pub total_pages: usize, + pub errors: Vec, +} + +impl ScrapeOptions { + pub fn new() -> Self { + Self::default() + } + + pub fn with_include_tags(mut self, tags: Vec) -> Self { + self.include_tags = Some(tags); + self + } + + pub fn with_exclude_tags(mut self, tags: Vec) -> Self { + self.exclude_tags = Some(tags); + self + } + + pub fn with_format(mut self, format: Format) -> Self { + self.format = format; + self + } + + pub fn with_viewport(mut self, width: u32, height: u32) -> Self { + self.viewport = Some(Viewport { width: Some(width), height: Some(height) }); + self + } + + pub fn with_user_agent(mut self, user_agent: String) -> Self { + self.user_agent = Some(user_agent); + self + } + + pub fn with_headers(mut self, headers: HashMap) -> Self { + self.headers = Some(headers); + self + } +} + +impl MapOptions { + pub fn new() -> Self { + Self::default() + } + + pub fn with_link_types(mut self, link_types: Vec) -> Self { + self.link_types = Some(link_types); + self + } + + pub fn with_base_url(mut self, base_url: String) -> Self { + self.base_url = Some(base_url); + self + 
} + + pub fn with_filter_extensions(mut self, extensions: Vec) -> Self { + self.filter_extensions = Some(extensions); + self + } +} + +impl CrawlOptions { + pub fn new() -> Self { + Self::default() + } + + pub fn with_limit(mut self, limit: u32) -> Self { + self.limit = Some(limit); + self + } + + pub fn with_max_depth(mut self, max_depth: u8) -> Self { + self.max_depth = Some(max_depth); + self + } + + pub fn with_exclude_paths(mut self, paths: Vec) -> Self { + self.exclude_paths = Some(paths); + self + } + + pub fn with_include_paths(mut self, paths: Vec) -> Self { + self.include_paths = Some(paths); + self + } + + pub fn with_follow_external(mut self, follow: bool) -> Self { + self.follow_external = Some(follow); + self + } + + pub fn with_delay_between_requests(mut self, delay: u32) -> Self { + self.delay_between_requests = Some(delay); + self + } + + pub fn with_parallel_requests(mut self, parallel: u32) -> Self { + self.parallel_requests = Some(parallel); + self + } +} + +/// BlessCrawl client for distributed web scraping operations. +#[derive(Debug, Clone, Default)] +pub struct BlessCrawl { + inner: Handle, + config: ScrapeOptions, +} + +impl BlessCrawl { + /// Default timeout in milliseconds (15 seconds) + pub const DEFAULT_TIMEOUT_MS: u32 = 15000; + /// Default wait time in milliseconds (3 seconds) + pub const DEFAULT_WAIT_TIME_MS: u32 = 3000; + + /// Maximum timeout in milliseconds (2 minutes) + pub const MAX_TIMEOUT_MS: u32 = 120000; + /// Maximum wait time in milliseconds (20 seconds) + pub const MAX_WAIT_TIME_MS: u32 = 20000; + + /// Maximum result buffer size in bytes (2MB) + pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024; + + /// Maximum result buffer size in bytes (1MB) + pub const MAX_MAP_BUFFER_SIZE: usize = 1 * 1024 * 1024; + + /// Maximum result buffer size in bytes (8MB) + pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024; + + /// Creates a new BlessCrawl instance with the given configuration. + pub fn with_config(config: ScrapeOptions) -> Result { + let instance = Self { inner: 0, config }; + instance.validate_config(&instance.config)?; + Ok(instance) + } + + fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> { + if config.timeout > Self::MAX_TIMEOUT_MS { + return Err(WebScrapeErrorKind::InvalidTimeout); + } + if config.wait_time > Self::MAX_WAIT_TIME_MS { + return Err(WebScrapeErrorKind::InvalidWaitTime); + } + Ok(()) + } + + /// Returns a reference to the current configuration. + pub fn get_config(&self) -> &ScrapeOptions { + &self.config + } + + pub fn handle(&self) -> Handle { + self.inner + } + + /// Scrapes webpage content and returns it as markdown with metadata. 
+ pub fn scrape(&self, url: &str, options: Option) -> Result, WebScrapeErrorKind> { + // Use provided options or fall back to instance config + let config = if let Some(opts) = options { + self.validate_config(&opts)?; + opts + } else { + self.config.clone() + }; + + let options_json = serde_json::to_vec(&config).unwrap(); + + let mut handle = self.inner; + let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE]; + let mut bytes_written: usize = 0; + + let code = unsafe { + scrape( + &mut handle, + url.as_ptr(), + url.len(), + options_json.as_ptr(), + options_json.len(), + result_buf.as_mut_ptr(), + result_buf.len(), + &mut bytes_written, + ) + }; + + if code != 0 { + return Err(code.into()); + } + if bytes_written == 0 { + return Err(WebScrapeErrorKind::EmptyResponse); + } + if bytes_written > result_buf.len() { + return Err(WebScrapeErrorKind::MemoryError); + } + + let result_bytes = + unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; + + // deserialize the result to host ScrapeResponse + let mut scrape_response = serde_json::from_slice::>( + result_bytes, + ) + .map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; + + if let Some(error) = scrape_response.error { + return Err(WebScrapeErrorKind::RuntimeError(error)); + } + + // post-process html + scrape_response.data.content = transform_html(TransformHtmlOptions { + html: scrape_response.data.content, + url: scrape_response.data.metadata.url.clone(), + include_tags: config.include_tags.unwrap_or_default(), + exclude_tags: config.exclude_tags.unwrap_or_default(), + only_main_content: config.only_main_content, + }).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::TransformError + })?; + + // if the format is markdown, set the data to the markdown of the html + match config.format { + Format::Markdown => { + scrape_response.data.content = parse_markdown(&scrape_response.data.content); + } + Format::Html => (), // no need to do anything + Format::Json => unimplemented!(), + } + + // convert the host ScrapeResponse to the user ScrapeResponse + Ok(scrape_response) + } + + /// Extracts all links from a webpage, categorized by type. 
+ pub fn map( + &self, + url: &str, + options: Option, + ) -> Result, WebScrapeErrorKind> { + let mut combined_options = serde_json::to_value(&self.config).unwrap(); + if let Some(map_opts) = options { + combined_options["map_options"] = serde_json::to_value(map_opts).unwrap(); + } + let options_json = serde_json::to_vec(&combined_options).unwrap(); + + let mut result_buf = vec![0u8; Self::MAX_MAP_BUFFER_SIZE]; + let mut bytes_written: usize = 0; + + let mut handle = self.inner; + let code = unsafe { + map( + &mut handle, + url.as_ptr(), + url.len(), + options_json.as_ptr(), + options_json.len(), + result_buf.as_mut_ptr(), + result_buf.len(), + &mut bytes_written, + ) + }; + + if code != 0 { + return Err(code.into()); + } + + if bytes_written == 0 { + return Err(WebScrapeErrorKind::EmptyResponse); + } + + if bytes_written > result_buf.len() { + return Err(WebScrapeErrorKind::MemoryError); + } + + let result_bytes = + unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; + + // deserialize the result to MapResponse + let map_response = serde_json::from_slice::>(result_bytes).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; + + if let Some(error) = map_response.error { + return Err(WebScrapeErrorKind::RuntimeError(error)); + } + + Ok(map_response) + } + + /// Recursively crawls a website with configurable depth and filtering. + pub fn crawl( + &self, + url: &str, + options: Option, + ) -> Result>, WebScrapeErrorKind> { + let mut combined_options = serde_json::to_value(&self.config).unwrap(); + if let Some(crawl_opts) = options { + combined_options["crawl_options"] = serde_json::to_value(crawl_opts).unwrap(); + } + let options_json = serde_json::to_vec(&combined_options).unwrap(); + + let mut result_buf = vec![0u8; Self::MAX_CRAWL_BUFFER_SIZE]; + let mut bytes_written: usize = 0; + + let mut handle = self.inner; + let code = unsafe { + crawl( + &mut handle, + url.as_ptr(), + url.len(), + options_json.as_ptr(), + options_json.len(), + result_buf.as_mut_ptr(), + result_buf.len(), + &mut bytes_written, + ) + }; + + if code != 0 { + return Err(code.into()); + } + + if bytes_written == 0 { + return Err(WebScrapeErrorKind::EmptyResponse); + } + + if bytes_written > result_buf.len() { + return Err(WebScrapeErrorKind::MemoryError); + } + + let result_bytes = + unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; + + // deserialize the result to CrawlResponse + let mut host_crawl_response = + serde_json::from_slice::>>(result_bytes).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; + + if let Some(error) = host_crawl_response.error { + return Err(WebScrapeErrorKind::RuntimeError(error)); + } + + // post-process html + for page in host_crawl_response.data.pages.iter_mut() { + page.content = transform_html(TransformHtmlOptions { + html: page.content.clone(), + url: page.metadata.url.clone(), + include_tags: self.config.include_tags.clone().unwrap_or_default(), + exclude_tags: self.config.exclude_tags.clone().unwrap_or_default(), + only_main_content: self.config.only_main_content, + }).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::TransformError + })?; + + // if the format is markdown, set the content to the markdown of the html + match self.config.format { + Format::Markdown => { + page.content = parse_markdown(&page.content); + } + Format::Html => (), // no need to do anything + Format::Json => unimplemented!(), + } + } + + // convert the host CrawlResponse to the user 
CrawlResponse + Ok(host_crawl_response) + } +} + +impl Drop for BlessCrawl { + fn drop(&mut self) { + // if the handle is 0, it means the instance was never initialized on the host + if self.inner == 0 { + return; + } + let code = unsafe { close(self.inner) }; + if code != 0 { + eprintln!("Error closing web scraper: {}", code); + } + } +} + +#[derive(Debug)] +pub enum WebScrapeErrorKind { + InvalidUrl, + Timeout, + NetworkError, + RenderingError, + MemoryError, + DepthExceeded, + RateLimited, + TransformError, + Utf8Error, + ParseError, + ScrapeFailed, + MapFailed, + CrawlFailed, + EmptyResponse, + InvalidTimeout, + InvalidWaitTime, + RuntimeError(String), +} + +impl From for WebScrapeErrorKind { + fn from(code: u8) -> Self { + match code { + 1 => WebScrapeErrorKind::InvalidUrl, + 2 => WebScrapeErrorKind::Timeout, + 3 => WebScrapeErrorKind::NetworkError, + 4 => WebScrapeErrorKind::RenderingError, + 5 => WebScrapeErrorKind::MemoryError, + 6 => WebScrapeErrorKind::DepthExceeded, + 7 => WebScrapeErrorKind::RateLimited, + 8 => WebScrapeErrorKind::TransformError, + 9 => WebScrapeErrorKind::RuntimeError(String::from("Invalid timeout")), + 10 => WebScrapeErrorKind::RuntimeError(String::from("Invalid wait time")), + _ => WebScrapeErrorKind::RuntimeError(String::from("Unknown error")), + } + } +} + +impl std::fmt::Display for WebScrapeErrorKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"), + WebScrapeErrorKind::Timeout => write!(f, "Request timeout"), + WebScrapeErrorKind::NetworkError => write!(f, "Network error"), + WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"), + WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"), + WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"), + WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"), + WebScrapeErrorKind::TransformError => write!(f, "Transform error"), + WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"), + WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"), + WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"), + WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"), + WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation failed"), + WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"), + WebScrapeErrorKind::InvalidTimeout => { + write!(f, "Timeout exceeds maximum allowed (120s)") + } + WebScrapeErrorKind::InvalidWaitTime => { + write!(f, "Wait time exceeds maximum allowed (20s)") + } + WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error), + } + } +} + +impl std::error::Error for WebScrapeErrorKind {} From deefc87f6a086de494a270ae998dda7d83a4c3fe Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:48:37 +1200 Subject: [PATCH 03/12] html to markdown impl --- src/bless_crawl/html_to_markdown.rs | 121 ++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 src/bless_crawl/html_to_markdown.rs diff --git a/src/bless_crawl/html_to_markdown.rs b/src/bless_crawl/html_to_markdown.rs new file mode 100644 index 0000000..f8eb03f --- /dev/null +++ b/src/bless_crawl/html_to_markdown.rs @@ -0,0 +1,121 @@ +use htmd::HtmlToMarkdown; +use regex::Regex; + +/// Parses HTML content and converts it to Markdown +/// +/// This function replicates the behavior of the JavaScript parseMarkdown function: +/// - Converts HTML to Markdown 
using htmd +/// - Processes multi-line links by escaping newlines inside link content +/// - Removes "Skip to Content" links +/// - Returns empty string for empty/null input +pub fn parse_markdown(html: &str) -> String { + if html.is_empty() { + return String::new(); + } + + // Convert HTML to Markdown using htmd + let markdown = match HtmlToMarkdown::new().convert(html) { + Ok(md) => md, + Err(_) => { + // Return empty string if conversion fails + return String::new(); + } + }; + + // Process the markdown content + let processed_markdown = process_multiline_links(&markdown); + let final_markdown = remove_skip_to_content_links(&processed_markdown); + + final_markdown +} + +/// Processes multi-line links by escaping newlines inside link content +/// +/// This function replicates the JavaScript processMultiLineLinks function: +/// - Tracks when we're inside link content (between [ and ]) +/// - Escapes newlines with backslash when inside links +fn process_multiline_links(markdown_content: &str) -> String { + let mut new_markdown_content = String::new(); + let mut link_open_count: usize = 0; + + for ch in markdown_content.chars() { + match ch { + '[' => { + link_open_count += 1; + } + ']' => { + link_open_count = link_open_count.saturating_sub(1); + } + _ => {} + } + + let inside_link_content = link_open_count > 0; + + if inside_link_content && ch == '\n' { + new_markdown_content.push('\\'); + new_markdown_content.push('\n'); + } else { + new_markdown_content.push(ch); + } + } + + new_markdown_content +} + +/// Removes "Skip to Content" links from the markdown content +/// +/// This function replicates the JavaScript removeSkipToContentLinks function: +/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns +/// - Case-insensitive matching +fn remove_skip_to_content_links(markdown_content: &str) -> String { + let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap(); + re.replace_all(markdown_content, "").to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_markdown_simple() { + let html = "

<html><body><p>Hello, world!</p></body></html>";
+        let result = parse_markdown(html);
+        assert_eq!(result.trim(), "Hello, world!");
+    }
+
+    #[test]
+    fn test_parse_markdown_complex() {
+        let html =
+            "

<html><body><p>Hello <b>bold</b> world!</p><ul><li>List item</li></ul></body></html>
"; + let result = parse_markdown(html); + assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item"); + } + + #[test] + fn test_parse_markdown_empty() { + let html = ""; + let result = parse_markdown(html); + assert_eq!(result, ""); + } + + #[test] + fn test_process_multiline_links() { + let markdown = "[Link\nwith newline](http://example.com)"; + let result = process_multiline_links(markdown); + assert_eq!(result, "[Link\\\nwith newline](http://example.com)"); + } + + #[test] + fn test_remove_skip_to_content_links() { + let markdown = "Some content [Skip to Content](#page) more content"; + let result = remove_skip_to_content_links(markdown); + assert_eq!(result, "Some content more content"); + } + + #[test] + fn test_remove_skip_to_content_links_case_insensitive() { + let markdown = "Some content [Skip to content](#skip) more content"; + let result = remove_skip_to_content_links(markdown); + assert_eq!(result, "Some content more content"); + } +} From dd01f92b1db6c3111a49a6129fa03ab355be472a Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:48:49 +1200 Subject: [PATCH 04/12] html transformation impl for include and exclude tags --- src/bless_crawl/html_transform.rs | 349 ++++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 src/bless_crawl/html_transform.rs diff --git a/src/bless_crawl/html_transform.rs b/src/bless_crawl/html_transform.rs new file mode 100644 index 0000000..0a0a596 --- /dev/null +++ b/src/bless_crawl/html_transform.rs @@ -0,0 +1,349 @@ +use kuchikiki::{parse_html, traits::TendrilSink}; +use serde::{Deserialize, Serialize}; +use url::Url; + +const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [ + "header", + "footer", + "nav", + "aside", + ".header", + ".top", + ".navbar", + "#header", + ".footer", + ".bottom", + "#footer", + ".sidebar", + ".side", + ".aside", + "#sidebar", + ".modal", + ".popup", + "#modal", + ".overlay", + ".ad", + ".ads", + ".advert", + "#ad", + ".lang-selector", + ".language", + "#language-selector", + ".social", + ".social-media", + ".social-links", + "#social", + ".menu", + ".navigation", + "#nav", + ".breadcrumbs", + "#breadcrumbs", + ".share", + "#share", + ".widget", + "#widget", + ".cookie", + "#cookie", +]; + +const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [ + "#main", + // swoogo event software as .widget in all of their content + ".swoogo-cols", + ".swoogo-text", + ".swoogo-table-div", + ".swoogo-space", + ".swoogo-alert", + ".swoogo-sponsors", + ".swoogo-title", + ".swoogo-tabs", + ".swoogo-logo", + ".swoogo-image", + ".swoogo-button", + ".swoogo-agenda", +]; + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TransformHtmlOptions { + pub html: String, + pub url: String, + pub include_tags: Vec, + pub exclude_tags: Vec, + pub only_main_content: bool, +} + +#[derive(Debug)] +struct ImageSource { + url: String, + size: i32, + is_x: bool, +} + +#[derive(Debug)] +pub enum HtmlTransformError { + ParseError, + UrlParseError, + SelectError, +} + +impl std::fmt::Display for HtmlTransformError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + HtmlTransformError::ParseError => write!(f, "Failed to parse HTML"), + HtmlTransformError::UrlParseError => write!(f, "Failed to parse URL"), + HtmlTransformError::SelectError => write!(f, "Failed to select HTML elements"), + } + } +} + +impl std::error::Error for HtmlTransformError {} + +/// Transforms HTML by removing unwanted elements, filtering tags, and processing URLs +pub fn transform_html(opts: TransformHtmlOptions) -> 
Result { + let mut document = parse_html().one(opts.html); + + // If include_tags is specified, only include those tags + if !opts.include_tags.is_empty() { + let new_document = parse_html().one("
"); + let root = new_document.select_first("div").map_err(|_| HtmlTransformError::SelectError)?; + + for tag_selector in opts.include_tags.iter() { + let matching_nodes: Vec<_> = document.select(tag_selector) + .map_err(|_| HtmlTransformError::SelectError)? + .collect(); + for tag in matching_nodes { + root.as_node().append(tag.as_node().clone()); + } + } + + document = new_document; + } + + // Remove unwanted elements + let unwanted_selectors = ["head", "meta", "noscript", "style", "script"]; + for selector in &unwanted_selectors { + while let Ok(element) = document.select_first(selector) { + element.as_node().detach(); + } + } + + // Remove excluded tags + for tag_selector in opts.exclude_tags.iter() { + while let Ok(element) = document.select_first(tag_selector) { + element.as_node().detach(); + } + } + + // Remove non-main content if requested + if opts.only_main_content { + for selector in EXCLUDE_NON_MAIN_TAGS.iter() { + let elements: Vec<_> = document.select(selector) + .map_err(|_| HtmlTransformError::SelectError)? + .collect(); + for element in elements { + // Check if this element contains any force-include tags + let should_keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|force_selector| { + element.as_node().select(force_selector) + .map(|mut iter| iter.next().is_some()) + .unwrap_or(false) + }); + + if !should_keep { + element.as_node().detach(); + } + } + } + } + + // Process images with srcset attributes + let srcset_images: Vec<_> = document.select("img[srcset]") + .map_err(|_| HtmlTransformError::SelectError)? + .collect(); + + for img in srcset_images { + let srcset = img.attributes.borrow().get("srcset").map(|s| s.to_string()); + if let Some(srcset) = srcset { + let mut sizes: Vec = srcset.split(',').filter_map(|entry| { + let tokens: Vec<&str> = entry.trim().split(' ').collect(); + if tokens.is_empty() { + return None; + } + + let size_token = if tokens.len() > 1 && !tokens[1].is_empty() { + tokens[1] + } else { + "1x" + }; + + if let Ok(parsed_size) = size_token[..size_token.len()-1].parse() { + Some(ImageSource { + url: tokens[0].to_string(), + size: parsed_size, + is_x: size_token.ends_with('x') + }) + } else { + None + } + }).collect(); + + // Add src attribute as 1x if all sizes are x-based + if sizes.iter().all(|s| s.is_x) { + let src = img.attributes.borrow().get("src").map(|s| s.to_string()); + if let Some(src) = src { + sizes.push(ImageSource { + url: src, + size: 1, + is_x: true, + }); + } + } + + // Sort by size (largest first) and use the biggest image + sizes.sort_by(|a, b| b.size.cmp(&a.size)); + if let Some(biggest) = sizes.first() { + img.attributes.borrow_mut().insert("src", biggest.url.clone()); + } + } + } + + // Convert relative URLs to absolute URLs + let base_url = Url::parse(&opts.url).map_err(|_| HtmlTransformError::UrlParseError)?; + + // Process image src attributes + let src_images: Vec<_> = document.select("img[src]") + .map_err(|_| HtmlTransformError::SelectError)? + .collect(); + for img in src_images { + let old_src = img.attributes.borrow().get("src").map(|s| s.to_string()); + if let Some(old_src) = old_src { + if let Ok(new_url) = base_url.join(&old_src) { + img.attributes.borrow_mut().insert("src", new_url.to_string()); + } + } + } + + // Process anchor href attributes + let href_anchors: Vec<_> = document.select("a[href]") + .map_err(|_| HtmlTransformError::SelectError)? 
+ .collect(); + for anchor in href_anchors { + let old_href = anchor.attributes.borrow().get("href").map(|s| s.to_string()); + if let Some(old_href) = old_href { + if let Ok(new_url) = base_url.join(&old_href) { + anchor.attributes.borrow_mut().insert("href", new_url.to_string()); + } + } + } + + Ok(document.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_transform_html_removes_unwanted_elements() { + let opts = TransformHtmlOptions { + html: "Test

Content

".to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = "

Content

"; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_include_tags() { + let opts = TransformHtmlOptions { + html: "
Keep this
Remove this
".to_string(), + url: "https://example.com".to_string(), + include_tags: vec![".content".to_string()], + exclude_tags: vec![], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = "
Keep this
"; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_exclude_tags() { + let opts = TransformHtmlOptions { + html: "
Keep this
Remove this
".to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![".ad".to_string()], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = "
Keep this
"; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_relative_urls() { + let opts = TransformHtmlOptions { + html: r#"Link"#.to_string(), + url: "https://example.com/subdir/".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = r#"Link"#; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_only_main_content() { + let opts = TransformHtmlOptions { + html: "
Header

Main content

Footer
".to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: true, + }; + + let result = transform_html(opts).unwrap(); + let expected = "

Main content

"; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_srcset_processing() { + let opts = TransformHtmlOptions { + html: r#""#.to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = r#""#; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_force_include_tags() { + let opts = TransformHtmlOptions { + html: r#"

Important content

"#.to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: true, + }; + + let result = transform_html(opts).unwrap(); + let expected = r#"

Important content

"#; + assert_eq!(result, expected); + } +} \ No newline at end of file From 3f82c4093dd9d5cae1adc9e420d06bd06a649fb0 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:48:59 +1200 Subject: [PATCH 05/12] bless-crawl plugin impl - lib --- src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index d67e81e..b60c611 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +mod bless_crawl; mod cgi; mod error; mod http; @@ -5,6 +6,7 @@ mod llm; mod memory; mod socket; +pub use bless_crawl::*; pub use cgi::*; pub use error::*; pub use http::*; From 057835fe9bf2f9e8afd336564fe86884884d83e7 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:49:32 +1200 Subject: [PATCH 06/12] webscrape example --- examples/web-scrape.rs | 72 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 examples/web-scrape.rs diff --git a/examples/web-scrape.rs b/examples/web-scrape.rs new file mode 100644 index 0000000..63f25a7 --- /dev/null +++ b/examples/web-scrape.rs @@ -0,0 +1,72 @@ +use blockless_sdk::*; + +/// This example demonstrates how to use the Blockless SDK to perform web scraping +/// using the BlessCrawl functionality. +/// +/// It shows how to: +/// - Create a BlessCrawl instance with default configuration +/// - Scrape content from a single URL with custom configuration overrides +/// - Map links from a webpage to discover available URLs +/// - Handle errors and responses appropriately +fn main() { + println!("=== Blockless Web Scraping SDK Example ===\n"); + + example_scraping(); + example_mapping(); + example_crawling(); +} + +fn example_scraping() { + println!("--- Example 1: Basic Web Scraping ---"); + + let url = "https://example.com"; + println!("scraping: {}...", url); + + // First scrape with default config + let response = BlessCrawl::default().scrape(url, None).expect("Failed to scrape"); + println!("response with default config: {:?}", response); + println!(); + println!("---------- markdown ----------\n{}\n------------------------------", response.data.content); +} + +fn example_mapping() { + println!("--- Example 2: Link Mapping/Discovery ---"); + + let url = "https://example.com"; + println!("Mapping links from: {}", url); + + let options = MapOptions::new() + .with_link_types(vec!["internal".to_string(), "external".to_string()]) + .with_base_url(url.to_string()) + .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]); + + let response = BlessCrawl::default().map(url, Some(options)).expect("Failed to map"); + println!("response: {:?}", response); + println!(); + println!("------------ links ------------\n{:?}\n------------------------------", response.data.links); + println!(); + println!("------------ total links ------------\n{}\n------------------------------", response.data.total_links); +} + +fn example_crawling() { + println!("--- Example 3: Recursive Website Crawling ---"); + + let url = "https://example.com"; + println!("Crawling website: {}", url); + + let options = CrawlOptions::new() + .with_max_depth(2) + .with_limit(10) + .with_include_paths(vec!["/".to_string()]) + .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()]) + .with_follow_external(false) + .with_delay_between_requests(1000) + .with_parallel_requests(3); + + let response = BlessCrawl::default().crawl(url, Some(options)).expect("Failed to crawl"); + println!("response: {:?}", response); + println!(); + println!("------------ pages ------------\n{:?}\n------------------------------", 
response.data.pages); + println!(); + println!("------------ total pages ------------\n{}\n------------------------------", response.data.total_pages); +} From 02316d3c948fa5994a5385b14742905f3be39645 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:49:40 +1200 Subject: [PATCH 07/12] readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e7d9dc..46a799e 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ cargo build --release --target wasm32-wasip1 --example llm-mcp | [httpbin](./examples/httpbin.rs) | HTTP to query anything from httpbin | ✅ | ✅ | | [llm](./examples/llm.rs) | LLM to chat with `Llama-3.1-8B-Instruct-q4f32_1-MLC` and `SmolLM2-1.7B-Instruct-q4f16_1-MLC` models | ✅ | ✅ | | [llm-mcp](./examples/llm-mcp.rs) | LLM with MCP (Model Control Protocol) demonstrating tool integration using SSE endpoints | ✅ | ✅ | - +| [web-scrape](./examples/web-scrape.rs) | Web Scraping to scrape content from a single URL with custom configuration overrides | ✅ | ❌ | ## Testing From f161018353a45ee8183792a312d295d28b769331 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:53:35 +1200 Subject: [PATCH 08/12] cargo fmt --all --- examples/web-scrape.rs | 37 ++++++++--- src/bless_crawl/html_transform.rs | 105 ++++++++++++++++++------------ src/bless_crawl/mod.rs | 54 ++++++++------- 3 files changed, 126 insertions(+), 70 deletions(-) diff --git a/examples/web-scrape.rs b/examples/web-scrape.rs index 63f25a7..d4165a3 100644 --- a/examples/web-scrape.rs +++ b/examples/web-scrape.rs @@ -23,10 +23,15 @@ fn example_scraping() { println!("scraping: {}...", url); // First scrape with default config - let response = BlessCrawl::default().scrape(url, None).expect("Failed to scrape"); + let response = BlessCrawl::default() + .scrape(url, None) + .expect("Failed to scrape"); println!("response with default config: {:?}", response); println!(); - println!("---------- markdown ----------\n{}\n------------------------------", response.data.content); + println!( + "---------- markdown ----------\n{}\n------------------------------", + response.data.content + ); } fn example_mapping() { @@ -40,12 +45,20 @@ fn example_mapping() { .with_base_url(url.to_string()) .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]); - let response = BlessCrawl::default().map(url, Some(options)).expect("Failed to map"); + let response = BlessCrawl::default() + .map(url, Some(options)) + .expect("Failed to map"); println!("response: {:?}", response); println!(); - println!("------------ links ------------\n{:?}\n------------------------------", response.data.links); + println!( + "------------ links ------------\n{:?}\n------------------------------", + response.data.links + ); println!(); - println!("------------ total links ------------\n{}\n------------------------------", response.data.total_links); + println!( + "------------ total links ------------\n{}\n------------------------------", + response.data.total_links + ); } fn example_crawling() { @@ -63,10 +76,18 @@ fn example_crawling() { .with_delay_between_requests(1000) .with_parallel_requests(3); - let response = BlessCrawl::default().crawl(url, Some(options)).expect("Failed to crawl"); + let response = BlessCrawl::default() + .crawl(url, Some(options)) + .expect("Failed to crawl"); println!("response: {:?}", response); println!(); - println!("------------ pages ------------\n{:?}\n------------------------------", response.data.pages); + println!( + "------------ pages 
------------\n{:?}\n------------------------------", + response.data.pages + ); println!(); - println!("------------ total pages ------------\n{}\n------------------------------", response.data.total_pages); + println!( + "------------ total pages ------------\n{}\n------------------------------", + response.data.total_pages + ); } diff --git a/src/bless_crawl/html_transform.rs b/src/bless_crawl/html_transform.rs index 0a0a596..8c56ebe 100644 --- a/src/bless_crawl/html_transform.rs +++ b/src/bless_crawl/html_transform.rs @@ -101,14 +101,17 @@ impl std::error::Error for HtmlTransformError {} /// Transforms HTML by removing unwanted elements, filtering tags, and processing URLs pub fn transform_html(opts: TransformHtmlOptions) -> Result { let mut document = parse_html().one(opts.html); - + // If include_tags is specified, only include those tags if !opts.include_tags.is_empty() { let new_document = parse_html().one("
"); - let root = new_document.select_first("div").map_err(|_| HtmlTransformError::SelectError)?; + let root = new_document + .select_first("div") + .map_err(|_| HtmlTransformError::SelectError)?; for tag_selector in opts.include_tags.iter() { - let matching_nodes: Vec<_> = document.select(tag_selector) + let matching_nodes: Vec<_> = document + .select(tag_selector) .map_err(|_| HtmlTransformError::SelectError)? .collect(); for tag in matching_nodes { @@ -137,17 +140,20 @@ pub fn transform_html(opts: TransformHtmlOptions) -> Result = document.select(selector) + let elements: Vec<_> = document + .select(selector) .map_err(|_| HtmlTransformError::SelectError)? .collect(); for element in elements { // Check if this element contains any force-include tags let should_keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|force_selector| { - element.as_node().select(force_selector) + element + .as_node() + .select(force_selector) .map(|mut iter| iter.next().is_some()) .unwrap_or(false) }); - + if !should_keep { element.as_node().detach(); } @@ -156,35 +162,39 @@ pub fn transform_html(opts: TransformHtmlOptions) -> Result = document.select("img[srcset]") + let srcset_images: Vec<_> = document + .select("img[srcset]") .map_err(|_| HtmlTransformError::SelectError)? .collect(); - + for img in srcset_images { let srcset = img.attributes.borrow().get("srcset").map(|s| s.to_string()); if let Some(srcset) = srcset { - let mut sizes: Vec = srcset.split(',').filter_map(|entry| { - let tokens: Vec<&str> = entry.trim().split(' ').collect(); - if tokens.is_empty() { - return None; - } - - let size_token = if tokens.len() > 1 && !tokens[1].is_empty() { - tokens[1] - } else { - "1x" - }; - - if let Ok(parsed_size) = size_token[..size_token.len()-1].parse() { - Some(ImageSource { - url: tokens[0].to_string(), - size: parsed_size, - is_x: size_token.ends_with('x') - }) - } else { - None - } - }).collect(); + let mut sizes: Vec = srcset + .split(',') + .filter_map(|entry| { + let tokens: Vec<&str> = entry.trim().split(' ').collect(); + if tokens.is_empty() { + return None; + } + + let size_token = if tokens.len() > 1 && !tokens[1].is_empty() { + tokens[1] + } else { + "1x" + }; + + if let Ok(parsed_size) = size_token[..size_token.len() - 1].parse() { + Some(ImageSource { + url: tokens[0].to_string(), + size: parsed_size, + is_x: size_token.ends_with('x'), + }) + } else { + None + } + }) + .collect(); // Add src attribute as 1x if all sizes are x-based if sizes.iter().all(|s| s.is_x) { @@ -201,36 +211,49 @@ pub fn transform_html(opts: TransformHtmlOptions) -> Result = document.select("img[src]") + let src_images: Vec<_> = document + .select("img[src]") .map_err(|_| HtmlTransformError::SelectError)? .collect(); for img in src_images { let old_src = img.attributes.borrow().get("src").map(|s| s.to_string()); if let Some(old_src) = old_src { if let Ok(new_url) = base_url.join(&old_src) { - img.attributes.borrow_mut().insert("src", new_url.to_string()); + img.attributes + .borrow_mut() + .insert("src", new_url.to_string()); } } } // Process anchor href attributes - let href_anchors: Vec<_> = document.select("a[href]") + let href_anchors: Vec<_> = document + .select("a[href]") .map_err(|_| HtmlTransformError::SelectError)? 
.collect(); for anchor in href_anchors { - let old_href = anchor.attributes.borrow().get("href").map(|s| s.to_string()); + let old_href = anchor + .attributes + .borrow() + .get("href") + .map(|s| s.to_string()); if let Some(old_href) = old_href { if let Ok(new_url) = base_url.join(&old_href) { - anchor.attributes.borrow_mut().insert("href", new_url.to_string()); + anchor + .attributes + .borrow_mut() + .insert("href", new_url.to_string()); } } } @@ -268,7 +291,8 @@ mod tests { }; let result = transform_html(opts).unwrap(); - let expected = "
Keep this
"; + let expected = + "
Keep this
"; assert_eq!(result, expected); } @@ -290,7 +314,8 @@ mod tests { #[test] fn test_transform_html_relative_urls() { let opts = TransformHtmlOptions { - html: r#"Link"#.to_string(), + html: r#"Link"# + .to_string(), url: "https://example.com/subdir/".to_string(), include_tags: vec![], exclude_tags: vec![], @@ -346,4 +371,4 @@ mod tests { let expected = r#"

Important content

"#; assert_eq!(result, expected); } -} \ No newline at end of file +} diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index 9c35b9b..1d1e509 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -22,7 +22,7 @@ //! // Create with default config //! let crawler = BlessCrawl::default(); //! let result = crawler.scrape("https://example.com", None).unwrap(); -//! +//! //! // Or override config per request //! let custom_config = ScrapeOptions { timeout: 30000, wait_time: 5000, ..Default::default() }; //! let result = crawler.scrape("https://example.com", Some(custom_config)).unwrap(); @@ -33,7 +33,7 @@ mod html_to_markdown; mod html_transform; use html_to_markdown::parse_markdown; -pub use html_transform::{transform_html, TransformHtmlOptions, HtmlTransformError}; +pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions}; use std::collections::HashMap; type Handle = u32; @@ -318,7 +318,10 @@ impl ScrapeOptions { } pub fn with_viewport(mut self, width: u32, height: u32) -> Self { - self.viewport = Some(Viewport { width: Some(width), height: Some(height) }); + self.viewport = Some(Viewport { + width: Some(width), + height: Some(height), + }); self } @@ -449,7 +452,11 @@ impl BlessCrawl { } /// Scrapes webpage content and returns it as markdown with metadata. - pub fn scrape(&self, url: &str, options: Option) -> Result, WebScrapeErrorKind> { + pub fn scrape( + &self, + url: &str, + options: Option, + ) -> Result, WebScrapeErrorKind> { // Use provided options or fall back to instance config let config = if let Some(opts) = options { self.validate_config(&opts)?; @@ -491,13 +498,11 @@ impl BlessCrawl { unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; // deserialize the result to host ScrapeResponse - let mut scrape_response = serde_json::from_slice::>( - result_bytes, - ) - .map_err(|e| { - eprintln!("error: {:?}", e); - WebScrapeErrorKind::ParseError - })?; + let mut scrape_response = serde_json::from_slice::>(result_bytes) + .map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; if let Some(error) = scrape_response.error { return Err(WebScrapeErrorKind::RuntimeError(error)); @@ -510,7 +515,8 @@ impl BlessCrawl { include_tags: config.include_tags.unwrap_or_default(), exclude_tags: config.exclude_tags.unwrap_or_default(), only_main_content: config.only_main_content, - }).map_err(|e| { + }) + .map_err(|e| { eprintln!("error: {:?}", e); WebScrapeErrorKind::TransformError })?; @@ -573,10 +579,11 @@ impl BlessCrawl { unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; // deserialize the result to MapResponse - let map_response = serde_json::from_slice::>(result_bytes).map_err(|e| { - eprintln!("error: {:?}", e); - WebScrapeErrorKind::ParseError - })?; + let map_response = + serde_json::from_slice::>(result_bytes).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; if let Some(error) = map_response.error { return Err(WebScrapeErrorKind::RuntimeError(error)); @@ -630,11 +637,13 @@ impl BlessCrawl { unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; // deserialize the result to CrawlResponse - let mut host_crawl_response = - serde_json::from_slice::>>(result_bytes).map_err(|e| { - eprintln!("error: {:?}", e); - WebScrapeErrorKind::ParseError - })?; + let mut host_crawl_response = serde_json::from_slice::>>( + result_bytes, + ) + .map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; if let 
Some(error) = host_crawl_response.error { return Err(WebScrapeErrorKind::RuntimeError(error)); @@ -648,7 +657,8 @@ impl BlessCrawl { include_tags: self.config.include_tags.clone().unwrap_or_default(), exclude_tags: self.config.exclude_tags.clone().unwrap_or_default(), only_main_content: self.config.only_main_content, - }).map_err(|e| { + }) + .map_err(|e| { eprintln!("error: {:?}", e); WebScrapeErrorKind::TransformError })?; From 260340a9f92b4f380d60507b7ab61d95977a1c9d Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:56:05 +1200 Subject: [PATCH 09/12] fixed clippy errors --- src/bless_crawl/html_to_markdown.rs | 4 +--- src/bless_crawl/mod.rs | 8 +------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/bless_crawl/html_to_markdown.rs b/src/bless_crawl/html_to_markdown.rs index f8eb03f..9137634 100644 --- a/src/bless_crawl/html_to_markdown.rs +++ b/src/bless_crawl/html_to_markdown.rs @@ -24,9 +24,7 @@ pub fn parse_markdown(html: &str) -> String { // Process the markdown content let processed_markdown = process_multiline_links(&markdown); - let final_markdown = remove_skip_to_content_links(&processed_markdown); - - final_markdown + remove_skip_to_content_links(&processed_markdown) } /// Processes multi-line links by escaping newlines inside link content diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index 1d1e509..bbc0858 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -174,12 +174,6 @@ pub enum Format { Json, } -impl std::fmt::Display for Format { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.to_string().to_lowercase()) - } -} - impl std::str::FromStr for Format { type Err = (); fn from_str(s: &str) -> Result { @@ -420,7 +414,7 @@ impl BlessCrawl { pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024; /// Maximum result buffer size in bytes (1MB) - pub const MAX_MAP_BUFFER_SIZE: usize = 1 * 1024 * 1024; + pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024; /// Maximum result buffer size in bytes (8MB) pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024; From 7460e59f72e0807b0500db76286951142b8576ca Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:58:42 +1200 Subject: [PATCH 10/12] fixed clippy warnings --- src/bless_crawl/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index bbc0858..78d0ed1 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -43,6 +43,7 @@ type ExitCode = u8; #[link(wasm_import_module = "bless_crawl")] extern "C" { /// Scrape webpage content and return as markdown + #[allow(clippy::too_many_arguments)] fn scrape( h: *mut Handle, url_ptr: *const u8, @@ -55,6 +56,7 @@ extern "C" { ) -> ExitCode; /// Extract and return all discoverable links from webpage + #[allow(clippy::too_many_arguments)] fn map( h: *mut Handle, url_ptr: *const u8, @@ -67,6 +69,7 @@ extern "C" { ) -> ExitCode; /// Recursively crawl website starting from given URL + #[allow(clippy::too_many_arguments)] fn crawl( h: *mut Handle, url_ptr: *const u8, @@ -87,6 +90,7 @@ extern "C" { mod mock_ffi { use super::{ExitCode, Handle}; + #[allow(clippy::too_many_arguments)] pub unsafe fn scrape( h: *mut Handle, _url_ptr: *const u8, @@ -100,6 +104,7 @@ mod mock_ffi { unimplemented!() } + #[allow(clippy::too_many_arguments)] pub unsafe fn map( h: *mut Handle, _url_ptr: *const u8, @@ -113,6 +118,7 @@ mod mock_ffi { unimplemented!() } + #[allow(clippy::too_many_arguments)] pub unsafe fn crawl( h: *mut Handle, 
_url_ptr: *const u8, From eb02b5f54850d9a05767a83c1fc5080c0e434645 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 17:01:10 +1200 Subject: [PATCH 11/12] return 1 as exitcode for mock-ffi impl --- src/bless_crawl/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index 78d0ed1..159f67d 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -101,7 +101,7 @@ mod mock_ffi { result_len: usize, bytes_written: *mut usize, ) -> ExitCode { - unimplemented!() + 1 } #[allow(clippy::too_many_arguments)] @@ -115,7 +115,7 @@ mod mock_ffi { result_len: usize, bytes_written: *mut usize, ) -> ExitCode { - unimplemented!() + 1 } #[allow(clippy::too_many_arguments)] @@ -129,11 +129,11 @@ mod mock_ffi { result_len: usize, bytes_written: *mut usize, ) -> ExitCode { - unimplemented!() + 1 } pub unsafe fn close(_h: Handle) -> ExitCode { - unimplemented!() + 1 } } From 1e9295fb36baf8a4b151cdf507831fd1756dde86 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 17:09:03 +1200 Subject: [PATCH 12/12] fixed doc tests --- src/bless_crawl/mod.rs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index 159f67d..8a3ffc2 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -13,21 +13,6 @@ //! - Timeout: 15s default, 120s max //! - Wait time: 3s default, 20s max //! - Buffer sizes: 2MB (scrape), 128KB (map), 8MB (crawl) -//! -//! ## Example -//! -//! ```rust -//! use blockless_sdk::*; -//! -//! // Create with default config -//! let crawler = BlessCrawl::default(); -//! let result = crawler.scrape("https://example.com", None).unwrap(); -//! -//! // Or override config per request -//! let custom_config = ScrapeOptions { timeout: 30000, wait_time: 5000, ..Default::default() }; -//! let result = crawler.scrape("https://example.com", Some(custom_config)).unwrap(); -//! println!("Content: {}", result.data.content); -//! ``` mod html_to_markdown; mod html_transform;
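
For reference, a minimal usage sketch of the `BlessCrawl` API introduced in this series, mirroring `examples/web-scrape.rs`. It assumes the module is built for `wasm32-wasip1` and runs on a Blockless runtime that exposes the `bless_crawl` host module (under the `mock-ffi` feature the host calls simply return an error code). The exclude-tag selectors below are illustrative and not part of the patches.

```rust
use blockless_sdk::*;

fn main() {
    // Default client: 15s timeout, 3s wait time, markdown output.
    let crawler = BlessCrawl::default();

    // Per-request override via the builder (selectors here are only examples).
    let options = ScrapeOptions::new()
        .with_format(Format::Markdown)
        .with_exclude_tags(vec!["nav".to_string(), ".ad".to_string()]);

    match crawler.scrape("https://example.com", Some(options)) {
        Ok(response) => println!("scraped markdown:\n{}", response.data.content),
        Err(err) => eprintln!("scrape failed: {}", err),
    }
}
```

The builder keeps the 15s timeout and 3s wait-time defaults unless overridden, and `validate_config` rejects values above the 120s / 20s maximums before the host call is made.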