diff --git a/Cargo.toml b/Cargo.toml
index a9e2388..48b0121 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,11 +10,13 @@ license = "MIT/Apache-2.0"
repository = "https://github.com/blocklessnetwork/sdk-rust"
[dependencies]
+htmd = { version = "0.2.2", default-features = false }
json = { version = "0.12", default-features = false }
+kuchikiki = { version = "0.8", default-features = false }
+regex = { version = "1.11.1", default-features = false, features = ["unicode-case"] }
serde = { version = "1.0", features = ["derive"], optional = true }
-
-[dev-dependencies]
-serde_json = "1.0"
+serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
+url = { version = "2.5", default-features = false }
[features]
default = ["serde"]
diff --git a/README.md b/README.md
index 8e7d9dc..46a799e 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ cargo build --release --target wasm32-wasip1 --example llm-mcp
| [httpbin](./examples/httpbin.rs) | HTTP to query anything from httpbin | ✅ | ✅ |
| [llm](./examples/llm.rs) | LLM to chat with `Llama-3.1-8B-Instruct-q4f32_1-MLC` and `SmolLM2-1.7B-Instruct-q4f16_1-MLC` models | ✅ | ✅ |
| [llm-mcp](./examples/llm-mcp.rs) | LLM with MCP (Model Control Protocol) demonstrating tool integration using SSE endpoints | ✅ | ✅ |
+| [web-scrape](./examples/web-scrape.rs) | Web Scraping to scrape, map links, and crawl pages from a URL using BlessCrawl | ✅ | ❌ |
 
## Testing
diff --git a/examples/web-scrape.rs b/examples/web-scrape.rs
new file mode 100644
index 0000000..d4165a3
--- /dev/null
+++ b/examples/web-scrape.rs
@@ -0,0 +1,93 @@
+use blockless_sdk::*;
+
+/// This example demonstrates how to use the Blockless SDK to perform web scraping
+/// using the BlessCrawl functionality.
+///
+/// It shows how to:
+/// - Create a BlessCrawl instance with default configuration
+/// - Scrape content from a single URL and print the result as markdown
+/// - Map links from a webpage to discover available URLs
+/// - Recursively crawl a website with depth and path filtering options
+/// - Handle errors and responses appropriately
+fn main() {
+ println!("=== Blockless Web Scraping SDK Example ===\n");
+
+ example_scraping();
+ example_mapping();
+ example_crawling();
+}
+
+fn example_scraping() {
+ println!("--- Example 1: Basic Web Scraping ---");
+
+ let url = "https://example.com";
+ println!("scraping: {}...", url);
+
+    // Scrape with the default configuration
+ let response = BlessCrawl::default()
+ .scrape(url, None)
+ .expect("Failed to scrape");
+ println!("response with default config: {:?}", response);
+ println!();
+ println!(
+ "---------- markdown ----------\n{}\n------------------------------",
+ response.data.content
+ );
+}
+
+fn example_mapping() {
+ println!("--- Example 2: Link Mapping/Discovery ---");
+
+ let url = "https://example.com";
+ println!("Mapping links from: {}", url);
+
+ let options = MapOptions::new()
+ .with_link_types(vec!["internal".to_string(), "external".to_string()])
+ .with_base_url(url.to_string())
+ .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]);
+
+ let response = BlessCrawl::default()
+ .map(url, Some(options))
+ .expect("Failed to map");
+ println!("response: {:?}", response);
+ println!();
+ println!(
+ "------------ links ------------\n{:?}\n------------------------------",
+ response.data.links
+ );
+ println!();
+ println!(
+ "------------ total links ------------\n{}\n------------------------------",
+ response.data.total_links
+ );
+}
+
+fn example_crawling() {
+ println!("--- Example 3: Recursive Website Crawling ---");
+
+ let url = "https://example.com";
+ println!("Crawling website: {}", url);
+
+ let options = CrawlOptions::new()
+ .with_max_depth(2)
+ .with_limit(10)
+ .with_include_paths(vec!["/".to_string()])
+ .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()])
+ .with_follow_external(false)
+ .with_delay_between_requests(1000)
+ .with_parallel_requests(3);
+
+ let response = BlessCrawl::default()
+ .crawl(url, Some(options))
+ .expect("Failed to crawl");
+ println!("response: {:?}", response);
+ println!();
+ println!(
+ "------------ pages ------------\n{:?}\n------------------------------",
+ response.data.pages
+ );
+ println!();
+ println!(
+ "------------ total pages ------------\n{}\n------------------------------",
+ response.data.total_pages
+ );
+}
diff --git a/src/bless_crawl/html_to_markdown.rs b/src/bless_crawl/html_to_markdown.rs
new file mode 100644
index 0000000..9137634
--- /dev/null
+++ b/src/bless_crawl/html_to_markdown.rs
@@ -0,0 +1,119 @@
+use htmd::HtmlToMarkdown;
+use regex::Regex;
+
+/// Parses HTML content and converts it to Markdown
+///
+/// This function replicates the behavior of the JavaScript parseMarkdown function:
+/// - Converts HTML to Markdown using htmd
+/// - Processes multi-line links by escaping newlines inside link content
+/// - Removes "Skip to Content" links
+/// - Returns an empty string for empty input
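+///
+/// A rough illustration of the expected behavior (shown as an `ignore` block since this
+/// module is private and not compiled as a doctest):
+///
+/// ```ignore
+/// // Simple HTML becomes plain markdown text; empty input stays empty.
+/// assert_eq!(parse_markdown("<p>Hello, world!</p>").trim(), "Hello, world!");
+/// assert_eq!(parse_markdown(""), "");
+/// ```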
+pub fn parse_markdown(html: &str) -> String {
+ if html.is_empty() {
+ return String::new();
+ }
+
+ // Convert HTML to Markdown using htmd
+ let markdown = match HtmlToMarkdown::new().convert(html) {
+ Ok(md) => md,
+ Err(_) => {
+ // Return empty string if conversion fails
+ return String::new();
+ }
+ };
+
+ // Process the markdown content
+ let processed_markdown = process_multiline_links(&markdown);
+ remove_skip_to_content_links(&processed_markdown)
+}
+
+/// Processes multi-line links by escaping newlines inside link content
+///
+/// This function replicates the JavaScript processMultiLineLinks function:
+/// - Tracks when we're inside link content (between [ and ])
+/// - Escapes newlines with backslash when inside links
+fn process_multiline_links(markdown_content: &str) -> String {
+ let mut new_markdown_content = String::new();
+ let mut link_open_count: usize = 0;
+
+ for ch in markdown_content.chars() {
+ match ch {
+ '[' => {
+ link_open_count += 1;
+ }
+ ']' => {
+ link_open_count = link_open_count.saturating_sub(1);
+ }
+ _ => {}
+ }
+
+ let inside_link_content = link_open_count > 0;
+
+ if inside_link_content && ch == '\n' {
+ new_markdown_content.push('\\');
+ new_markdown_content.push('\n');
+ } else {
+ new_markdown_content.push(ch);
+ }
+ }
+
+ new_markdown_content
+}
+
+/// Removes "Skip to Content" links from the markdown content
+///
+/// This function replicates the JavaScript removeSkipToContentLinks function:
+/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns
+/// - Case-insensitive matching
+fn remove_skip_to_content_links(markdown_content: &str) -> String {
+ let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap();
+ re.replace_all(markdown_content, "").to_string()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_parse_markdown_simple() {
+ let html = "
Hello, world!
";
+ let result = parse_markdown(html);
+ assert_eq!(result.trim(), "Hello, world!");
+ }
+
+ #[test]
+ fn test_parse_markdown_complex() {
+        let html = "<p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul>";
+ let result = parse_markdown(html);
+ assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item");
+ }
+
+ #[test]
+ fn test_parse_markdown_empty() {
+ let html = "";
+ let result = parse_markdown(html);
+ assert_eq!(result, "");
+ }
+
+ #[test]
+ fn test_process_multiline_links() {
+ let markdown = "[Link\nwith newline](http://example.com)";
+ let result = process_multiline_links(markdown);
+ assert_eq!(result, "[Link\\\nwith newline](http://example.com)");
+ }
+
+ #[test]
+ fn test_remove_skip_to_content_links() {
+ let markdown = "Some content [Skip to Content](#page) more content";
+ let result = remove_skip_to_content_links(markdown);
+ assert_eq!(result, "Some content more content");
+ }
+
+ #[test]
+ fn test_remove_skip_to_content_links_case_insensitive() {
+ let markdown = "Some content [Skip to content](#skip) more content";
+ let result = remove_skip_to_content_links(markdown);
+ assert_eq!(result, "Some content more content");
+ }
+}
diff --git a/src/bless_crawl/html_transform.rs b/src/bless_crawl/html_transform.rs
new file mode 100644
index 0000000..8c56ebe
--- /dev/null
+++ b/src/bless_crawl/html_transform.rs
@@ -0,0 +1,374 @@
+use kuchikiki::{parse_html, traits::TendrilSink};
+use serde::{Deserialize, Serialize};
+use url::Url;
+
+const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
+ "header",
+ "footer",
+ "nav",
+ "aside",
+ ".header",
+ ".top",
+ ".navbar",
+ "#header",
+ ".footer",
+ ".bottom",
+ "#footer",
+ ".sidebar",
+ ".side",
+ ".aside",
+ "#sidebar",
+ ".modal",
+ ".popup",
+ "#modal",
+ ".overlay",
+ ".ad",
+ ".ads",
+ ".advert",
+ "#ad",
+ ".lang-selector",
+ ".language",
+ "#language-selector",
+ ".social",
+ ".social-media",
+ ".social-links",
+ "#social",
+ ".menu",
+ ".navigation",
+ "#nav",
+ ".breadcrumbs",
+ "#breadcrumbs",
+ ".share",
+ "#share",
+ ".widget",
+ "#widget",
+ ".cookie",
+ "#cookie",
+];
+
+const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [
+ "#main",
+    // swoogo event software marks all of its content with .widget classes
+ ".swoogo-cols",
+ ".swoogo-text",
+ ".swoogo-table-div",
+ ".swoogo-space",
+ ".swoogo-alert",
+ ".swoogo-sponsors",
+ ".swoogo-title",
+ ".swoogo-tabs",
+ ".swoogo-logo",
+ ".swoogo-image",
+ ".swoogo-button",
+ ".swoogo-agenda",
+];
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TransformHtmlOptions {
+ pub html: String,
+ pub url: String,
+    pub include_tags: Vec<String>,
+    pub exclude_tags: Vec<String>,
+ pub only_main_content: bool,
+}
+
+#[derive(Debug)]
+struct ImageSource {
+ url: String,
+ size: i32,
+ is_x: bool,
+}
+
+#[derive(Debug)]
+pub enum HtmlTransformError {
+ ParseError,
+ UrlParseError,
+ SelectError,
+}
+
+impl std::fmt::Display for HtmlTransformError {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ HtmlTransformError::ParseError => write!(f, "Failed to parse HTML"),
+ HtmlTransformError::UrlParseError => write!(f, "Failed to parse URL"),
+ HtmlTransformError::SelectError => write!(f, "Failed to select HTML elements"),
+ }
+ }
+}
+
+impl std::error::Error for HtmlTransformError {}
+
+/// Transforms HTML by removing unwanted elements, filtering tags, and processing URLs
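+///
+/// A minimal sketch of how the options fit together (an `ignore` block; the exact output
+/// string depends on how the document is re-serialized):
+///
+/// ```ignore
+/// let cleaned = transform_html(TransformHtmlOptions {
+///     html: "<div><script>var x = 1;</script><a href=\"/docs\">Docs</a></div>".to_string(),
+///     url: "https://example.com".to_string(),
+///     include_tags: vec![],
+///     exclude_tags: vec![],
+///     only_main_content: false,
+/// })
+/// .unwrap();
+/// // The <script> element is stripped and the relative href is resolved
+/// // against https://example.com before the document is serialized back to HTML.
+/// ```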
+pub fn transform_html(opts: TransformHtmlOptions) -> Result<String, HtmlTransformError> {
+ let mut document = parse_html().one(opts.html);
+
+ // If include_tags is specified, only include those tags
+ if !opts.include_tags.is_empty() {
+        let new_document = parse_html().one("<div></div>");
+ let root = new_document
+ .select_first("div")
+ .map_err(|_| HtmlTransformError::SelectError)?;
+
+ for tag_selector in opts.include_tags.iter() {
+ let matching_nodes: Vec<_> = document
+ .select(tag_selector)
+ .map_err(|_| HtmlTransformError::SelectError)?
+ .collect();
+ for tag in matching_nodes {
+ root.as_node().append(tag.as_node().clone());
+ }
+ }
+
+ document = new_document;
+ }
+
+ // Remove unwanted elements
+ let unwanted_selectors = ["head", "meta", "noscript", "style", "script"];
+ for selector in &unwanted_selectors {
+ while let Ok(element) = document.select_first(selector) {
+ element.as_node().detach();
+ }
+ }
+
+ // Remove excluded tags
+ for tag_selector in opts.exclude_tags.iter() {
+ while let Ok(element) = document.select_first(tag_selector) {
+ element.as_node().detach();
+ }
+ }
+
+ // Remove non-main content if requested
+ if opts.only_main_content {
+ for selector in EXCLUDE_NON_MAIN_TAGS.iter() {
+ let elements: Vec<_> = document
+ .select(selector)
+ .map_err(|_| HtmlTransformError::SelectError)?
+ .collect();
+ for element in elements {
+ // Check if this element contains any force-include tags
+ let should_keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|force_selector| {
+ element
+ .as_node()
+ .select(force_selector)
+ .map(|mut iter| iter.next().is_some())
+ .unwrap_or(false)
+ });
+
+ if !should_keep {
+ element.as_node().detach();
+ }
+ }
+ }
+ }
+
+ // Process images with srcset attributes
+ let srcset_images: Vec<_> = document
+ .select("img[srcset]")
+ .map_err(|_| HtmlTransformError::SelectError)?
+ .collect();
+
+ for img in srcset_images {
+ let srcset = img.attributes.borrow().get("srcset").map(|s| s.to_string());
+ if let Some(srcset) = srcset {
+            let mut sizes: Vec<ImageSource> = srcset
+ .split(',')
+ .filter_map(|entry| {
+ let tokens: Vec<&str> = entry.trim().split(' ').collect();
+ if tokens.is_empty() {
+ return None;
+ }
+
+ let size_token = if tokens.len() > 1 && !tokens[1].is_empty() {
+ tokens[1]
+ } else {
+ "1x"
+ };
+
+ if let Ok(parsed_size) = size_token[..size_token.len() - 1].parse() {
+ Some(ImageSource {
+ url: tokens[0].to_string(),
+ size: parsed_size,
+ is_x: size_token.ends_with('x'),
+ })
+ } else {
+ None
+ }
+ })
+ .collect();
+
+ // Add src attribute as 1x if all sizes are x-based
+ if sizes.iter().all(|s| s.is_x) {
+ let src = img.attributes.borrow().get("src").map(|s| s.to_string());
+ if let Some(src) = src {
+ sizes.push(ImageSource {
+ url: src,
+ size: 1,
+ is_x: true,
+ });
+ }
+ }
+
+ // Sort by size (largest first) and use the biggest image
+ sizes.sort_by(|a, b| b.size.cmp(&a.size));
+ if let Some(biggest) = sizes.first() {
+ img.attributes
+ .borrow_mut()
+ .insert("src", biggest.url.clone());
+ }
+ }
+ }
+
+ // Convert relative URLs to absolute URLs
+ let base_url = Url::parse(&opts.url).map_err(|_| HtmlTransformError::UrlParseError)?;
+
+ // Process image src attributes
+ let src_images: Vec<_> = document
+ .select("img[src]")
+ .map_err(|_| HtmlTransformError::SelectError)?
+ .collect();
+ for img in src_images {
+ let old_src = img.attributes.borrow().get("src").map(|s| s.to_string());
+ if let Some(old_src) = old_src {
+ if let Ok(new_url) = base_url.join(&old_src) {
+ img.attributes
+ .borrow_mut()
+ .insert("src", new_url.to_string());
+ }
+ }
+ }
+
+ // Process anchor href attributes
+ let href_anchors: Vec<_> = document
+ .select("a[href]")
+ .map_err(|_| HtmlTransformError::SelectError)?
+ .collect();
+ for anchor in href_anchors {
+ let old_href = anchor
+ .attributes
+ .borrow()
+ .get("href")
+ .map(|s| s.to_string());
+ if let Some(old_href) = old_href {
+ if let Ok(new_url) = base_url.join(&old_href) {
+ anchor
+ .attributes
+ .borrow_mut()
+ .insert("href", new_url.to_string());
+ }
+ }
+ }
+
+ Ok(document.to_string())
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_transform_html_removes_unwanted_elements() {
+ let opts = TransformHtmlOptions {
+ html: "TestContent
".to_string(),
+ url: "https://example.com".to_string(),
+ include_tags: vec![],
+ exclude_tags: vec![],
+ only_main_content: false,
+ };
+
+ let result = transform_html(opts).unwrap();
+ let expected = "Content
";
+ assert_eq!(result, expected);
+ }
+
+ #[test]
+ fn test_transform_html_include_tags() {
+ let opts = TransformHtmlOptions {
+ html: "Keep this
".to_string(),
+ url: "https://example.com".to_string(),
+ include_tags: vec![".content".to_string()],
+ exclude_tags: vec![],
+ only_main_content: false,
+ };
+
+ let result = transform_html(opts).unwrap();
+        let expected = "<html><body><div><div class=\"content\">Keep this</div></div></body></html>";
+ assert_eq!(result, expected);
+ }
+
+ #[test]
+ fn test_transform_html_exclude_tags() {
+ let opts = TransformHtmlOptions {
+ html: "Keep this
Remove this
".to_string(),
+ url: "https://example.com".to_string(),
+ include_tags: vec![],
+ exclude_tags: vec![".ad".to_string()],
+ only_main_content: false,
+ };
+
+ let result = transform_html(opts).unwrap();
+ let expected = "Keep this
";
+ assert_eq!(result, expected);
+ }
+
+ #[test]
+ fn test_transform_html_relative_urls() {
+ let opts = TransformHtmlOptions {
+ html: r#"
Link"#
+ .to_string(),
+ url: "https://example.com/subdir/".to_string(),
+ include_tags: vec![],
+ exclude_tags: vec![],
+ only_main_content: false,
+ };
+
+ let result = transform_html(opts).unwrap();
+ let expected = r#"
Link"#;
+ assert_eq!(result, expected);
+ }
+
+ #[test]
+ fn test_transform_html_only_main_content() {
+ let opts = TransformHtmlOptions {
+ html: "Main content
".to_string(),
+ url: "https://example.com".to_string(),
+ include_tags: vec![],
+ exclude_tags: vec![],
+ only_main_content: true,
+ };
+
+ let result = transform_html(opts).unwrap();
+ let expected = "Main content
";
+ assert_eq!(result, expected);
+ }
+
+ #[test]
+ fn test_transform_html_srcset_processing() {
+ let opts = TransformHtmlOptions {
+ html: r#"
"#.to_string(),
+ url: "https://example.com".to_string(),
+ include_tags: vec![],
+ exclude_tags: vec![],
+ only_main_content: false,
+ };
+
+ let result = transform_html(opts).unwrap();
+ let expected = r#"
"#;
+ assert_eq!(result, expected);
+ }
+
+ #[test]
+ fn test_transform_html_force_include_tags() {
+ let opts = TransformHtmlOptions {
+ html: r#""#.to_string(),
+ url: "https://example.com".to_string(),
+ include_tags: vec![],
+ exclude_tags: vec![],
+ only_main_content: true,
+ };
+
+ let result = transform_html(opts).unwrap();
+ let expected = r#""#;
+ assert_eq!(result, expected);
+ }
+}
diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs
new file mode 100644
index 0000000..8a3ffc2
--- /dev/null
+++ b/src/bless_crawl/mod.rs
@@ -0,0 +1,746 @@
+//! # BlessCrawl - Distributed Web Scraping SDK
+//!
+//! Provides distributed web scraping across the BLESS network's browser nodes.
+//!
+//! ## Features
+//!
+//! - **scrape()**: Extract content from a URL as markdown
+//! - **map()**: Discover and extract all links from a webpage
+//! - **crawl()**: Recursively crawl websites with depth controls
+//!
+//! ## Limits
+//!
+//! - Timeout: 15s default, 120s max
+//! - Wait time: 3s default, 20s max
+//! - Buffer sizes: 2MB (scrape), 1MB (map), 8MB (crawl)
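+//!
+//! ## Example
+//!
+//! A minimal usage sketch (marked `ignore` because the host imports backing these calls are
+//! only available inside the Blockless runtime):
+//!
+//! ```ignore
+//! use blockless_sdk::{BlessCrawl, Format, ScrapeOptions};
+//!
+//! // Scrape a page as markdown, overriding the output format explicitly.
+//! let options = ScrapeOptions::new().with_format(Format::Markdown);
+//! let response = BlessCrawl::default()
+//!     .scrape("https://example.com", Some(options))
+//!     .expect("Failed to scrape");
+//! println!("{}", response.data.content);
+//! ```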
+
+mod html_to_markdown;
+mod html_transform;
+
+use html_to_markdown::parse_markdown;
+pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions};
+use std::collections::HashMap;
+
+type Handle = u32;
+type ExitCode = u8;
+
+#[cfg(not(feature = "mock-ffi"))]
+#[link(wasm_import_module = "bless_crawl")]
+extern "C" {
+ /// Scrape webpage content and return as markdown
+ #[allow(clippy::too_many_arguments)]
+ fn scrape(
+ h: *mut Handle,
+ url_ptr: *const u8,
+ url_len: usize,
+ options_ptr: *const u8,
+ options_len: usize,
+ result_ptr: *mut u8,
+ result_len: usize,
+ bytes_written: *mut usize,
+ ) -> ExitCode;
+
+ /// Extract and return all discoverable links from webpage
+ #[allow(clippy::too_many_arguments)]
+ fn map(
+ h: *mut Handle,
+ url_ptr: *const u8,
+ url_len: usize,
+ options_ptr: *const u8,
+ options_len: usize,
+ result_ptr: *mut u8,
+ result_len: usize,
+ bytes_written: *mut usize,
+ ) -> ExitCode;
+
+ /// Recursively crawl website starting from given URL
+ #[allow(clippy::too_many_arguments)]
+ fn crawl(
+ h: *mut Handle,
+ url_ptr: *const u8,
+ url_len: usize,
+ options_ptr: *const u8,
+ options_len: usize,
+ result_ptr: *mut u8,
+ result_len: usize,
+ bytes_written: *mut usize,
+ ) -> ExitCode;
+
+ /// Close and cleanup a web scraper instance
+ fn close(h: Handle) -> ExitCode;
+}
+
+#[cfg(feature = "mock-ffi")]
+#[allow(unused_variables)]
+mod mock_ffi {
+ use super::{ExitCode, Handle};
+
+ #[allow(clippy::too_many_arguments)]
+ pub unsafe fn scrape(
+ h: *mut Handle,
+ _url_ptr: *const u8,
+ _url_len: usize,
+ _options_ptr: *const u8,
+ _options_len: usize,
+ result_ptr: *mut u8,
+ result_len: usize,
+ bytes_written: *mut usize,
+ ) -> ExitCode {
+ 1
+ }
+
+ #[allow(clippy::too_many_arguments)]
+ pub unsafe fn map(
+ h: *mut Handle,
+ _url_ptr: *const u8,
+ _url_len: usize,
+ _options_ptr: *const u8,
+ _options_len: usize,
+ result_ptr: *mut u8,
+ result_len: usize,
+ bytes_written: *mut usize,
+ ) -> ExitCode {
+ 1
+ }
+
+ #[allow(clippy::too_many_arguments)]
+ pub unsafe fn crawl(
+ h: *mut Handle,
+ _url_ptr: *const u8,
+ _url_len: usize,
+ _options_ptr: *const u8,
+ _options_len: usize,
+ result_ptr: *mut u8,
+ result_len: usize,
+ bytes_written: *mut usize,
+ ) -> ExitCode {
+ 1
+ }
+
+ pub unsafe fn close(_h: Handle) -> ExitCode {
+ 1
+ }
+}
+
+#[cfg(feature = "mock-ffi")]
+use mock_ffi::*;
+
+#[derive(Debug, Clone, PartialEq, serde::Serialize)]
+pub struct ScrapeOptions {
+ pub timeout: u32,
+ pub wait_time: u32,
+    pub include_tags: Option<Vec<String>>,
+    pub exclude_tags: Option<Vec<String>>,
+    pub only_main_content: bool,
+    pub format: Format,
+    pub viewport: Option<Viewport>,
+    pub user_agent: Option<String>,
+    pub headers: Option<HashMap<String, String>>,
+}
+
+impl Default for ScrapeOptions {
+ fn default() -> Self {
+ Self {
+ timeout: BlessCrawl::DEFAULT_TIMEOUT_MS,
+ wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS,
+ include_tags: None,
+ exclude_tags: None,
+ only_main_content: false,
+ format: Format::Markdown,
+ viewport: None,
+ user_agent: None,
+ headers: None,
+ }
+ }
+}
+
+#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
+pub enum Format {
+ #[default]
+ #[serde(rename = "markdown")]
+ Markdown,
+ #[serde(rename = "html")]
+ Html,
+ #[serde(rename = "json")]
+ Json,
+}
+
+impl std::str::FromStr for Format {
+ type Err = ();
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+ match s {
+ "markdown" => Ok(Format::Markdown),
+ "html" => Ok(Format::Html),
+ "json" => Ok(Format::Json),
+ _ => Err(()),
+ }
+ }
+}
+
+#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
+pub struct Viewport {
+    pub width: Option<u32>,
+    pub height: Option<u32>,
+}
+
+#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
+pub struct MapOptions {
+    pub link_types: Option<Vec<String>>,
+    pub base_url: Option<String>,
+    pub filter_extensions: Option<Vec<String>>,
+}
+
+#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
+pub struct CrawlOptions {
+    pub limit: Option<u32>,
+    pub max_depth: Option<u8>,
+    pub exclude_paths: Option<Vec<String>>,
+    pub include_paths: Option<Vec<String>>,
+    pub follow_external: Option<bool>,
+    pub delay_between_requests: Option<u32>,
+    pub parallel_requests: Option<u32>,
+}
+
+#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
+pub struct PageMetadata {
+    pub title: Option<String>,
+    pub description: Option<String>,
+    pub url: String,
+    pub status_code: u16,
+    pub language: Option<String>,
+    pub keywords: Option<String>,
+    pub robots: Option<String>,
+    pub author: Option<String>,
+    pub creator: Option<String>,
+    pub publisher: Option<String>,
+    pub og_title: Option<String>,
+    pub og_description: Option<String>,
+    pub og_image: Option<String>,
+    pub og_url: Option<String>,
+    pub og_site_name: Option<String>,
+    pub og_type: Option<String>,
+    pub twitter_title: Option<String>,
+    pub twitter_description: Option<String>,
+    pub twitter_image: Option<String>,
+    pub twitter_card: Option<String>,
+    pub twitter_site: Option<String>,
+    pub twitter_creator: Option<String>,
+    pub favicon: Option<String>,
+    pub viewport: Option<String>,
+    pub referrer: Option<String>,
+    pub content_type: Option<String>,
+    pub scrape_id: Option<String>,
+    pub source_url: Option<String>,
+    pub proxy_used: Option<String>,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct ScrapeData {
+ pub success: bool,
+ pub timestamp: u64,
+ pub format: Format,
+ pub content: String,
+ pub metadata: PageMetadata,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct Response<T> {
+    pub success: bool,
+    pub error: Option<String>,
+    pub data: T,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct LinkInfo {
+ pub url: String,
+ // TODO: use enum instead of string
+ pub link_type: String, // "internal", "external", "anchor"
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct MapData {
+ pub url: String,
+    pub links: Vec<LinkInfo>,
+ pub total_links: usize,
+ pub timestamp: u64,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct CrawlError {
+ pub url: String,
+ pub error: String,
+ pub depth: u32,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct CrawlData {
+ pub root_url: String,
+    pub pages: Vec<ScrapeData>,
+    pub link_map: Option<HashMap<String, Vec<String>>>, // page URL -> discovered links (assumed shape)
+ pub depth_reached: u8,
+ pub total_pages: usize,
+ pub errors: Vec,
+}
+
+impl ScrapeOptions {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+    pub fn with_include_tags(mut self, tags: Vec<String>) -> Self {
+ self.include_tags = Some(tags);
+ self
+ }
+
+    pub fn with_exclude_tags(mut self, tags: Vec<String>) -> Self {
+ self.exclude_tags = Some(tags);
+ self
+ }
+
+ pub fn with_format(mut self, format: Format) -> Self {
+ self.format = format;
+ self
+ }
+
+ pub fn with_viewport(mut self, width: u32, height: u32) -> Self {
+ self.viewport = Some(Viewport {
+ width: Some(width),
+ height: Some(height),
+ });
+ self
+ }
+
+ pub fn with_user_agent(mut self, user_agent: String) -> Self {
+ self.user_agent = Some(user_agent);
+ self
+ }
+
+    pub fn with_headers(mut self, headers: HashMap<String, String>) -> Self {
+ self.headers = Some(headers);
+ self
+ }
+}
+
+impl MapOptions {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+    pub fn with_link_types(mut self, link_types: Vec<String>) -> Self {
+ self.link_types = Some(link_types);
+ self
+ }
+
+ pub fn with_base_url(mut self, base_url: String) -> Self {
+ self.base_url = Some(base_url);
+ self
+ }
+
+    pub fn with_filter_extensions(mut self, extensions: Vec<String>) -> Self {
+ self.filter_extensions = Some(extensions);
+ self
+ }
+}
+
+impl CrawlOptions {
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ pub fn with_limit(mut self, limit: u32) -> Self {
+ self.limit = Some(limit);
+ self
+ }
+
+ pub fn with_max_depth(mut self, max_depth: u8) -> Self {
+ self.max_depth = Some(max_depth);
+ self
+ }
+
+    pub fn with_exclude_paths(mut self, paths: Vec<String>) -> Self {
+ self.exclude_paths = Some(paths);
+ self
+ }
+
+    pub fn with_include_paths(mut self, paths: Vec<String>) -> Self {
+ self.include_paths = Some(paths);
+ self
+ }
+
+ pub fn with_follow_external(mut self, follow: bool) -> Self {
+ self.follow_external = Some(follow);
+ self
+ }
+
+ pub fn with_delay_between_requests(mut self, delay: u32) -> Self {
+ self.delay_between_requests = Some(delay);
+ self
+ }
+
+ pub fn with_parallel_requests(mut self, parallel: u32) -> Self {
+ self.parallel_requests = Some(parallel);
+ self
+ }
+}
+
+/// BlessCrawl client for distributed web scraping operations.
+#[derive(Debug, Clone, Default)]
+pub struct BlessCrawl {
+ inner: Handle,
+ config: ScrapeOptions,
+}
+
+impl BlessCrawl {
+ /// Default timeout in milliseconds (15 seconds)
+ pub const DEFAULT_TIMEOUT_MS: u32 = 15000;
+ /// Default wait time in milliseconds (3 seconds)
+ pub const DEFAULT_WAIT_TIME_MS: u32 = 3000;
+
+ /// Maximum timeout in milliseconds (2 minutes)
+ pub const MAX_TIMEOUT_MS: u32 = 120000;
+ /// Maximum wait time in milliseconds (20 seconds)
+ pub const MAX_WAIT_TIME_MS: u32 = 20000;
+
+    /// Maximum scrape result buffer size in bytes (2MB)
+    pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024;
+
+    /// Maximum map result buffer size in bytes (1MB)
+    pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024;
+
+    /// Maximum crawl result buffer size in bytes (8MB)
+    pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024;
+
+ /// Creates a new BlessCrawl instance with the given configuration.
+    pub fn with_config(config: ScrapeOptions) -> Result<Self, WebScrapeErrorKind> {
+ let instance = Self { inner: 0, config };
+ instance.validate_config(&instance.config)?;
+ Ok(instance)
+ }
+
+ fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> {
+ if config.timeout > Self::MAX_TIMEOUT_MS {
+ return Err(WebScrapeErrorKind::InvalidTimeout);
+ }
+ if config.wait_time > Self::MAX_WAIT_TIME_MS {
+ return Err(WebScrapeErrorKind::InvalidWaitTime);
+ }
+ Ok(())
+ }
+
+ /// Returns a reference to the current configuration.
+ pub fn get_config(&self) -> &ScrapeOptions {
+ &self.config
+ }
+
+ pub fn handle(&self) -> Handle {
+ self.inner
+ }
+
+ /// Scrapes webpage content and returns it as markdown with metadata.
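+    ///
+    /// A short usage sketch (an `ignore` block; it needs the BLESS runtime to run):
+    ///
+    /// ```ignore
+    /// let response = BlessCrawl::default()
+    ///     .scrape("https://example.com", None)
+    ///     .expect("Failed to scrape");
+    /// println!("{}", response.data.content);
+    /// ```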
+ pub fn scrape(
+ &self,
+ url: &str,
+        options: Option<ScrapeOptions>,
+    ) -> Result<Response<ScrapeData>, WebScrapeErrorKind> {
+ // Use provided options or fall back to instance config
+ let config = if let Some(opts) = options {
+ self.validate_config(&opts)?;
+ opts
+ } else {
+ self.config.clone()
+ };
+
+ let options_json = serde_json::to_vec(&config).unwrap();
+
+ let mut handle = self.inner;
+ let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE];
+ let mut bytes_written: usize = 0;
+
+ let code = unsafe {
+ scrape(
+ &mut handle,
+ url.as_ptr(),
+ url.len(),
+ options_json.as_ptr(),
+ options_json.len(),
+ result_buf.as_mut_ptr(),
+ result_buf.len(),
+ &mut bytes_written,
+ )
+ };
+
+ if code != 0 {
+ return Err(code.into());
+ }
+ if bytes_written == 0 {
+ return Err(WebScrapeErrorKind::EmptyResponse);
+ }
+ if bytes_written > result_buf.len() {
+ return Err(WebScrapeErrorKind::MemoryError);
+ }
+
+ let result_bytes =
+ unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) };
+
+ // deserialize the result to host ScrapeResponse
+        let mut scrape_response = serde_json::from_slice::<Response<ScrapeData>>(result_bytes)
+ .map_err(|e| {
+ eprintln!("error: {:?}", e);
+ WebScrapeErrorKind::ParseError
+ })?;
+
+ if let Some(error) = scrape_response.error {
+ return Err(WebScrapeErrorKind::RuntimeError(error));
+ }
+
+ // post-process html
+ scrape_response.data.content = transform_html(TransformHtmlOptions {
+ html: scrape_response.data.content,
+ url: scrape_response.data.metadata.url.clone(),
+ include_tags: config.include_tags.unwrap_or_default(),
+ exclude_tags: config.exclude_tags.unwrap_or_default(),
+ only_main_content: config.only_main_content,
+ })
+ .map_err(|e| {
+ eprintln!("error: {:?}", e);
+ WebScrapeErrorKind::TransformError
+ })?;
+
+ // if the format is markdown, set the data to the markdown of the html
+ match config.format {
+ Format::Markdown => {
+ scrape_response.data.content = parse_markdown(&scrape_response.data.content);
+ }
+ Format::Html => (), // no need to do anything
+ Format::Json => unimplemented!(),
+ }
+
+ // convert the host ScrapeResponse to the user ScrapeResponse
+ Ok(scrape_response)
+ }
+
+ /// Extracts all links from a webpage, categorized by type.
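+    ///
+    /// A short usage sketch (an `ignore` block; it needs the BLESS runtime to run):
+    ///
+    /// ```ignore
+    /// let options = MapOptions::new().with_link_types(vec!["internal".to_string()]);
+    /// let response = BlessCrawl::default()
+    ///     .map("https://example.com", Some(options))
+    ///     .expect("Failed to map");
+    /// println!("found {} links", response.data.total_links);
+    /// ```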
+ pub fn map(
+ &self,
+ url: &str,
+        options: Option<MapOptions>,
+    ) -> Result<Response<MapData>, WebScrapeErrorKind> {
+ let mut combined_options = serde_json::to_value(&self.config).unwrap();
+ if let Some(map_opts) = options {
+ combined_options["map_options"] = serde_json::to_value(map_opts).unwrap();
+ }
+ let options_json = serde_json::to_vec(&combined_options).unwrap();
+
+ let mut result_buf = vec![0u8; Self::MAX_MAP_BUFFER_SIZE];
+ let mut bytes_written: usize = 0;
+
+ let mut handle = self.inner;
+ let code = unsafe {
+ map(
+ &mut handle,
+ url.as_ptr(),
+ url.len(),
+ options_json.as_ptr(),
+ options_json.len(),
+ result_buf.as_mut_ptr(),
+ result_buf.len(),
+ &mut bytes_written,
+ )
+ };
+
+ if code != 0 {
+ return Err(code.into());
+ }
+
+ if bytes_written == 0 {
+ return Err(WebScrapeErrorKind::EmptyResponse);
+ }
+
+ if bytes_written > result_buf.len() {
+ return Err(WebScrapeErrorKind::MemoryError);
+ }
+
+ let result_bytes =
+ unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) };
+
+ // deserialize the result to MapResponse
+ let map_response =
+            serde_json::from_slice::<Response<MapData>>(result_bytes).map_err(|e| {
+ eprintln!("error: {:?}", e);
+ WebScrapeErrorKind::ParseError
+ })?;
+
+ if let Some(error) = map_response.error {
+ return Err(WebScrapeErrorKind::RuntimeError(error));
+ }
+
+ Ok(map_response)
+ }
+
+ /// Recursively crawls a website with configurable depth and filtering.
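+    ///
+    /// A short usage sketch (an `ignore` block; it needs the BLESS runtime to run):
+    ///
+    /// ```ignore
+    /// let options = CrawlOptions::new().with_max_depth(2).with_limit(10);
+    /// let response = BlessCrawl::default()
+    ///     .crawl("https://example.com", Some(options))
+    ///     .expect("Failed to crawl");
+    /// println!("crawled {} pages", response.data.total_pages);
+    /// ```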
+ pub fn crawl(
+ &self,
+ url: &str,
+        options: Option<CrawlOptions>,
+    ) -> Result<Response<CrawlData>, WebScrapeErrorKind> {
+ let mut combined_options = serde_json::to_value(&self.config).unwrap();
+ if let Some(crawl_opts) = options {
+ combined_options["crawl_options"] = serde_json::to_value(crawl_opts).unwrap();
+ }
+ let options_json = serde_json::to_vec(&combined_options).unwrap();
+
+ let mut result_buf = vec![0u8; Self::MAX_CRAWL_BUFFER_SIZE];
+ let mut bytes_written: usize = 0;
+
+ let mut handle = self.inner;
+ let code = unsafe {
+ crawl(
+ &mut handle,
+ url.as_ptr(),
+ url.len(),
+ options_json.as_ptr(),
+ options_json.len(),
+ result_buf.as_mut_ptr(),
+ result_buf.len(),
+ &mut bytes_written,
+ )
+ };
+
+ if code != 0 {
+ return Err(code.into());
+ }
+
+ if bytes_written == 0 {
+ return Err(WebScrapeErrorKind::EmptyResponse);
+ }
+
+ if bytes_written > result_buf.len() {
+ return Err(WebScrapeErrorKind::MemoryError);
+ }
+
+ let result_bytes =
+ unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) };
+
+ // deserialize the result to CrawlResponse
+        let mut host_crawl_response = serde_json::from_slice::<Response<CrawlData>>(result_bytes)
+            .map_err(|e| {
+                eprintln!("error: {:?}", e);
+                WebScrapeErrorKind::ParseError
+            })?;
+
+ if let Some(error) = host_crawl_response.error {
+ return Err(WebScrapeErrorKind::RuntimeError(error));
+ }
+
+ // post-process html
+ for page in host_crawl_response.data.pages.iter_mut() {
+ page.content = transform_html(TransformHtmlOptions {
+ html: page.content.clone(),
+ url: page.metadata.url.clone(),
+ include_tags: self.config.include_tags.clone().unwrap_or_default(),
+ exclude_tags: self.config.exclude_tags.clone().unwrap_or_default(),
+ only_main_content: self.config.only_main_content,
+ })
+ .map_err(|e| {
+ eprintln!("error: {:?}", e);
+ WebScrapeErrorKind::TransformError
+ })?;
+
+ // if the format is markdown, set the content to the markdown of the html
+ match self.config.format {
+ Format::Markdown => {
+ page.content = parse_markdown(&page.content);
+ }
+ Format::Html => (), // no need to do anything
+ Format::Json => unimplemented!(),
+ }
+ }
+
+ // convert the host CrawlResponse to the user CrawlResponse
+ Ok(host_crawl_response)
+ }
+}
+
+impl Drop for BlessCrawl {
+ fn drop(&mut self) {
+ // if the handle is 0, it means the instance was never initialized on the host
+ if self.inner == 0 {
+ return;
+ }
+ let code = unsafe { close(self.inner) };
+ if code != 0 {
+ eprintln!("Error closing web scraper: {}", code);
+ }
+ }
+}
+
+#[derive(Debug)]
+pub enum WebScrapeErrorKind {
+ InvalidUrl,
+ Timeout,
+ NetworkError,
+ RenderingError,
+ MemoryError,
+ DepthExceeded,
+ RateLimited,
+ TransformError,
+ Utf8Error,
+ ParseError,
+ ScrapeFailed,
+ MapFailed,
+ CrawlFailed,
+ EmptyResponse,
+ InvalidTimeout,
+ InvalidWaitTime,
+ RuntimeError(String),
+}
+
+impl From<u8> for WebScrapeErrorKind {
+ fn from(code: u8) -> Self {
+ match code {
+ 1 => WebScrapeErrorKind::InvalidUrl,
+ 2 => WebScrapeErrorKind::Timeout,
+ 3 => WebScrapeErrorKind::NetworkError,
+ 4 => WebScrapeErrorKind::RenderingError,
+ 5 => WebScrapeErrorKind::MemoryError,
+ 6 => WebScrapeErrorKind::DepthExceeded,
+ 7 => WebScrapeErrorKind::RateLimited,
+ 8 => WebScrapeErrorKind::TransformError,
+            9 => WebScrapeErrorKind::InvalidTimeout,
+            10 => WebScrapeErrorKind::InvalidWaitTime,
+ _ => WebScrapeErrorKind::RuntimeError(String::from("Unknown error")),
+ }
+ }
+}
+
+impl std::fmt::Display for WebScrapeErrorKind {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"),
+ WebScrapeErrorKind::Timeout => write!(f, "Request timeout"),
+ WebScrapeErrorKind::NetworkError => write!(f, "Network error"),
+ WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"),
+ WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"),
+ WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"),
+ WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"),
+ WebScrapeErrorKind::TransformError => write!(f, "Transform error"),
+ WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"),
+ WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"),
+ WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"),
+ WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"),
+ WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation failed"),
+ WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"),
+ WebScrapeErrorKind::InvalidTimeout => {
+ write!(f, "Timeout exceeds maximum allowed (120s)")
+ }
+ WebScrapeErrorKind::InvalidWaitTime => {
+ write!(f, "Wait time exceeds maximum allowed (20s)")
+ }
+ WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error),
+ }
+ }
+}
+
+impl std::error::Error for WebScrapeErrorKind {}
diff --git a/src/lib.rs b/src/lib.rs
index d67e81e..b60c611 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+mod bless_crawl;
mod cgi;
mod error;
mod http;
@@ -5,6 +6,7 @@ mod llm;
mod memory;
mod socket;
+pub use bless_crawl::*;
pub use cgi::*;
pub use error::*;
pub use http::*;