From 85f6b015b6d0629bacb9e62746d0aef01f1b1cf5 Mon Sep 17 00:00:00 2001
From: z
Date: Wed, 25 Jun 2025 16:47:39 +1200
Subject: [PATCH 01/12] upd cargo.toml deps

---
 Cargo.toml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index a9e2388..48b0121 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,11 +10,13 @@ license = "MIT/Apache-2.0"
 repository = "https://github.com/blocklessnetwork/sdk-rust"
 
 [dependencies]
+htmd = { version = "0.2.2", default-features = false }
 json = { version = "0.12", default-features = false }
+kuchikiki = { version = "0.8", default-features = false }
+regex = { version = "1.11.1", default-features = false, features = ["unicode-case"] }
 serde = { version = "1.0", features = ["derive"], optional = true }
-
-[dev-dependencies]
-serde_json = "1.0"
+serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
+url = { version = "2.5", default-features = false }
 
 [features]
 default = ["serde"]

From ea2d17d646a69a0d56e36bb4eb2af8596c77d431 Mon Sep 17 00:00:00 2001
From: z
Date: Wed, 25 Jun 2025 16:48:28 +1200
Subject: [PATCH 02/12] bless-crawl plugin impl

---
 src/bless_crawl/mod.rs | 751 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 751 insertions(+)
 create mode 100644 src/bless_crawl/mod.rs

diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs
new file mode 100644
index 0000000..9c35b9b
--- /dev/null
+++ b/src/bless_crawl/mod.rs
@@ -0,0 +1,751 @@
+//! # BlessCrawl - Distributed Web Scraping SDK
+//!
+//! Provides distributed web scraping across the BLESS network's browser nodes.
+//!
+//! ## Features
+//!
+//! - **scrape()**: Extract content from a URL as markdown
+//! - **map()**: Discover and extract all links from a webpage
+//! - **crawl()**: Recursively crawl websites with depth controls
+//!
+//! ## Limits
+//!
+//! - Timeout: 15s default, 120s max
+//! - Wait time: 3s default, 20s max
+//! - Buffer sizes: 2MB (scrape), 128KB (map), 8MB (crawl)
+//!
+//! ## Example
+//!
+//! ```rust
+//! use blockless_sdk::*;
+//!
+//! // Create with default config
+//! let crawler = BlessCrawl::default();
+//! let result = crawler.scrape("https://example.com", None).unwrap();
+//!
+//! // Or override config per request
+//! let custom_config = ScrapeOptions { timeout: 30000, wait_time: 5000, ..Default::default() };
+//! let result = crawler.scrape("https://example.com", Some(custom_config)).unwrap();
+//! println!("Content: {}", result.data.content);
+//!
``` + +mod html_to_markdown; +mod html_transform; + +use html_to_markdown::parse_markdown; +pub use html_transform::{transform_html, TransformHtmlOptions, HtmlTransformError}; +use std::collections::HashMap; + +type Handle = u32; +type ExitCode = u8; + +#[cfg(not(feature = "mock-ffi"))] +#[link(wasm_import_module = "bless_crawl")] +extern "C" { + /// Scrape webpage content and return as markdown + fn scrape( + h: *mut Handle, + url_ptr: *const u8, + url_len: usize, + options_ptr: *const u8, + options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode; + + /// Extract and return all discoverable links from webpage + fn map( + h: *mut Handle, + url_ptr: *const u8, + url_len: usize, + options_ptr: *const u8, + options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode; + + /// Recursively crawl website starting from given URL + fn crawl( + h: *mut Handle, + url_ptr: *const u8, + url_len: usize, + options_ptr: *const u8, + options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode; + + /// Close and cleanup a web scraper instance + fn close(h: Handle) -> ExitCode; +} + +#[cfg(feature = "mock-ffi")] +#[allow(unused_variables)] +mod mock_ffi { + use super::{ExitCode, Handle}; + + pub unsafe fn scrape( + h: *mut Handle, + _url_ptr: *const u8, + _url_len: usize, + _options_ptr: *const u8, + _options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode { + unimplemented!() + } + + pub unsafe fn map( + h: *mut Handle, + _url_ptr: *const u8, + _url_len: usize, + _options_ptr: *const u8, + _options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode { + unimplemented!() + } + + pub unsafe fn crawl( + h: *mut Handle, + _url_ptr: *const u8, + _url_len: usize, + _options_ptr: *const u8, + _options_len: usize, + result_ptr: *mut u8, + result_len: usize, + bytes_written: *mut usize, + ) -> ExitCode { + unimplemented!() + } + + pub unsafe fn close(_h: Handle) -> ExitCode { + unimplemented!() + } +} + +#[cfg(feature = "mock-ffi")] +use mock_ffi::*; + +#[derive(Debug, Clone, PartialEq, serde::Serialize)] +pub struct ScrapeOptions { + pub timeout: u32, + pub wait_time: u32, + pub include_tags: Option>, + pub exclude_tags: Option>, + pub only_main_content: bool, + pub format: Format, + pub viewport: Option, + pub user_agent: Option, + pub headers: Option>, +} + +impl Default for ScrapeOptions { + fn default() -> Self { + Self { + timeout: BlessCrawl::DEFAULT_TIMEOUT_MS, + wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS, + include_tags: None, + exclude_tags: None, + only_main_content: false, + format: Format::Markdown, + viewport: None, + user_agent: None, + headers: None, + } + } +} + +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)] +pub enum Format { + #[default] + #[serde(rename = "markdown")] + Markdown, + #[serde(rename = "html")] + Html, + #[serde(rename = "json")] + Json, +} + +impl std::fmt::Display for Format { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string().to_lowercase()) + } +} + +impl std::str::FromStr for Format { + type Err = (); + fn from_str(s: &str) -> Result { + match s { + "markdown" => Ok(Format::Markdown), + "html" => Ok(Format::Html), + "json" => Ok(Format::Json), + _ => Err(()), + } + } +} + +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] +pub 
struct Viewport { + pub width: Option, + pub height: Option, +} + +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] +pub struct MapOptions { + pub link_types: Option>, + pub base_url: Option, + pub filter_extensions: Option>, +} + +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)] +pub struct CrawlOptions { + pub limit: Option, + pub max_depth: Option, + pub exclude_paths: Option>, + pub include_paths: Option>, + pub follow_external: Option, + pub delay_between_requests: Option, + pub parallel_requests: Option, +} + +#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)] +pub struct PageMetadata { + pub title: Option, + pub description: Option, + pub url: String, + pub status_code: u16, + pub language: Option, + pub keywords: Option, + pub robots: Option, + pub author: Option, + pub creator: Option, + pub publisher: Option, + pub og_title: Option, + pub og_description: Option, + pub og_image: Option, + pub og_url: Option, + pub og_site_name: Option, + pub og_type: Option, + pub twitter_title: Option, + pub twitter_description: Option, + pub twitter_image: Option, + pub twitter_card: Option, + pub twitter_site: Option, + pub twitter_creator: Option, + pub favicon: Option, + pub viewport: Option, + pub referrer: Option, + pub content_type: Option, + pub scrape_id: Option, + pub source_url: Option, + pub proxy_used: Option, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct ScrapeData { + pub success: bool, + pub timestamp: u64, + pub format: Format, + pub content: String, + pub metadata: PageMetadata, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct Response { + pub success: bool, + pub error: Option, + pub data: T, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct LinkInfo { + pub url: String, + // TODO: use enum instead of string + pub link_type: String, // "internal", "external", "anchor" +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct MapData { + pub url: String, + pub links: Vec, + pub total_links: usize, + pub timestamp: u64, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct CrawlError { + pub url: String, + pub error: String, + pub depth: u32, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct CrawlData { + pub root_url: String, + pub pages: Vec, + pub link_map: Option, + pub depth_reached: u8, + pub total_pages: usize, + pub errors: Vec, +} + +impl ScrapeOptions { + pub fn new() -> Self { + Self::default() + } + + pub fn with_include_tags(mut self, tags: Vec) -> Self { + self.include_tags = Some(tags); + self + } + + pub fn with_exclude_tags(mut self, tags: Vec) -> Self { + self.exclude_tags = Some(tags); + self + } + + pub fn with_format(mut self, format: Format) -> Self { + self.format = format; + self + } + + pub fn with_viewport(mut self, width: u32, height: u32) -> Self { + self.viewport = Some(Viewport { width: Some(width), height: Some(height) }); + self + } + + pub fn with_user_agent(mut self, user_agent: String) -> Self { + self.user_agent = Some(user_agent); + self + } + + pub fn with_headers(mut self, headers: HashMap) -> Self { + self.headers = Some(headers); + self + } +} + +impl MapOptions { + pub fn new() -> Self { + Self::default() + } + + pub fn with_link_types(mut self, link_types: Vec) -> Self { + self.link_types = Some(link_types); + self + } + + pub fn with_base_url(mut self, base_url: String) -> Self { + self.base_url = Some(base_url); + self + 
} + + pub fn with_filter_extensions(mut self, extensions: Vec) -> Self { + self.filter_extensions = Some(extensions); + self + } +} + +impl CrawlOptions { + pub fn new() -> Self { + Self::default() + } + + pub fn with_limit(mut self, limit: u32) -> Self { + self.limit = Some(limit); + self + } + + pub fn with_max_depth(mut self, max_depth: u8) -> Self { + self.max_depth = Some(max_depth); + self + } + + pub fn with_exclude_paths(mut self, paths: Vec) -> Self { + self.exclude_paths = Some(paths); + self + } + + pub fn with_include_paths(mut self, paths: Vec) -> Self { + self.include_paths = Some(paths); + self + } + + pub fn with_follow_external(mut self, follow: bool) -> Self { + self.follow_external = Some(follow); + self + } + + pub fn with_delay_between_requests(mut self, delay: u32) -> Self { + self.delay_between_requests = Some(delay); + self + } + + pub fn with_parallel_requests(mut self, parallel: u32) -> Self { + self.parallel_requests = Some(parallel); + self + } +} + +/// BlessCrawl client for distributed web scraping operations. +#[derive(Debug, Clone, Default)] +pub struct BlessCrawl { + inner: Handle, + config: ScrapeOptions, +} + +impl BlessCrawl { + /// Default timeout in milliseconds (15 seconds) + pub const DEFAULT_TIMEOUT_MS: u32 = 15000; + /// Default wait time in milliseconds (3 seconds) + pub const DEFAULT_WAIT_TIME_MS: u32 = 3000; + + /// Maximum timeout in milliseconds (2 minutes) + pub const MAX_TIMEOUT_MS: u32 = 120000; + /// Maximum wait time in milliseconds (20 seconds) + pub const MAX_WAIT_TIME_MS: u32 = 20000; + + /// Maximum result buffer size in bytes (2MB) + pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024; + + /// Maximum result buffer size in bytes (1MB) + pub const MAX_MAP_BUFFER_SIZE: usize = 1 * 1024 * 1024; + + /// Maximum result buffer size in bytes (8MB) + pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024; + + /// Creates a new BlessCrawl instance with the given configuration. + pub fn with_config(config: ScrapeOptions) -> Result { + let instance = Self { inner: 0, config }; + instance.validate_config(&instance.config)?; + Ok(instance) + } + + fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> { + if config.timeout > Self::MAX_TIMEOUT_MS { + return Err(WebScrapeErrorKind::InvalidTimeout); + } + if config.wait_time > Self::MAX_WAIT_TIME_MS { + return Err(WebScrapeErrorKind::InvalidWaitTime); + } + Ok(()) + } + + /// Returns a reference to the current configuration. + pub fn get_config(&self) -> &ScrapeOptions { + &self.config + } + + pub fn handle(&self) -> Handle { + self.inner + } + + /// Scrapes webpage content and returns it as markdown with metadata. 
+ pub fn scrape(&self, url: &str, options: Option) -> Result, WebScrapeErrorKind> { + // Use provided options or fall back to instance config + let config = if let Some(opts) = options { + self.validate_config(&opts)?; + opts + } else { + self.config.clone() + }; + + let options_json = serde_json::to_vec(&config).unwrap(); + + let mut handle = self.inner; + let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE]; + let mut bytes_written: usize = 0; + + let code = unsafe { + scrape( + &mut handle, + url.as_ptr(), + url.len(), + options_json.as_ptr(), + options_json.len(), + result_buf.as_mut_ptr(), + result_buf.len(), + &mut bytes_written, + ) + }; + + if code != 0 { + return Err(code.into()); + } + if bytes_written == 0 { + return Err(WebScrapeErrorKind::EmptyResponse); + } + if bytes_written > result_buf.len() { + return Err(WebScrapeErrorKind::MemoryError); + } + + let result_bytes = + unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; + + // deserialize the result to host ScrapeResponse + let mut scrape_response = serde_json::from_slice::>( + result_bytes, + ) + .map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; + + if let Some(error) = scrape_response.error { + return Err(WebScrapeErrorKind::RuntimeError(error)); + } + + // post-process html + scrape_response.data.content = transform_html(TransformHtmlOptions { + html: scrape_response.data.content, + url: scrape_response.data.metadata.url.clone(), + include_tags: config.include_tags.unwrap_or_default(), + exclude_tags: config.exclude_tags.unwrap_or_default(), + only_main_content: config.only_main_content, + }).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::TransformError + })?; + + // if the format is markdown, set the data to the markdown of the html + match config.format { + Format::Markdown => { + scrape_response.data.content = parse_markdown(&scrape_response.data.content); + } + Format::Html => (), // no need to do anything + Format::Json => unimplemented!(), + } + + // convert the host ScrapeResponse to the user ScrapeResponse + Ok(scrape_response) + } + + /// Extracts all links from a webpage, categorized by type. 
+ pub fn map( + &self, + url: &str, + options: Option, + ) -> Result, WebScrapeErrorKind> { + let mut combined_options = serde_json::to_value(&self.config).unwrap(); + if let Some(map_opts) = options { + combined_options["map_options"] = serde_json::to_value(map_opts).unwrap(); + } + let options_json = serde_json::to_vec(&combined_options).unwrap(); + + let mut result_buf = vec![0u8; Self::MAX_MAP_BUFFER_SIZE]; + let mut bytes_written: usize = 0; + + let mut handle = self.inner; + let code = unsafe { + map( + &mut handle, + url.as_ptr(), + url.len(), + options_json.as_ptr(), + options_json.len(), + result_buf.as_mut_ptr(), + result_buf.len(), + &mut bytes_written, + ) + }; + + if code != 0 { + return Err(code.into()); + } + + if bytes_written == 0 { + return Err(WebScrapeErrorKind::EmptyResponse); + } + + if bytes_written > result_buf.len() { + return Err(WebScrapeErrorKind::MemoryError); + } + + let result_bytes = + unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; + + // deserialize the result to MapResponse + let map_response = serde_json::from_slice::>(result_bytes).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; + + if let Some(error) = map_response.error { + return Err(WebScrapeErrorKind::RuntimeError(error)); + } + + Ok(map_response) + } + + /// Recursively crawls a website with configurable depth and filtering. + pub fn crawl( + &self, + url: &str, + options: Option, + ) -> Result>, WebScrapeErrorKind> { + let mut combined_options = serde_json::to_value(&self.config).unwrap(); + if let Some(crawl_opts) = options { + combined_options["crawl_options"] = serde_json::to_value(crawl_opts).unwrap(); + } + let options_json = serde_json::to_vec(&combined_options).unwrap(); + + let mut result_buf = vec![0u8; Self::MAX_CRAWL_BUFFER_SIZE]; + let mut bytes_written: usize = 0; + + let mut handle = self.inner; + let code = unsafe { + crawl( + &mut handle, + url.as_ptr(), + url.len(), + options_json.as_ptr(), + options_json.len(), + result_buf.as_mut_ptr(), + result_buf.len(), + &mut bytes_written, + ) + }; + + if code != 0 { + return Err(code.into()); + } + + if bytes_written == 0 { + return Err(WebScrapeErrorKind::EmptyResponse); + } + + if bytes_written > result_buf.len() { + return Err(WebScrapeErrorKind::MemoryError); + } + + let result_bytes = + unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; + + // deserialize the result to CrawlResponse + let mut host_crawl_response = + serde_json::from_slice::>>(result_bytes).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; + + if let Some(error) = host_crawl_response.error { + return Err(WebScrapeErrorKind::RuntimeError(error)); + } + + // post-process html + for page in host_crawl_response.data.pages.iter_mut() { + page.content = transform_html(TransformHtmlOptions { + html: page.content.clone(), + url: page.metadata.url.clone(), + include_tags: self.config.include_tags.clone().unwrap_or_default(), + exclude_tags: self.config.exclude_tags.clone().unwrap_or_default(), + only_main_content: self.config.only_main_content, + }).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::TransformError + })?; + + // if the format is markdown, set the content to the markdown of the html + match self.config.format { + Format::Markdown => { + page.content = parse_markdown(&page.content); + } + Format::Html => (), // no need to do anything + Format::Json => unimplemented!(), + } + } + + // convert the host CrawlResponse to the user 
CrawlResponse + Ok(host_crawl_response) + } +} + +impl Drop for BlessCrawl { + fn drop(&mut self) { + // if the handle is 0, it means the instance was never initialized on the host + if self.inner == 0 { + return; + } + let code = unsafe { close(self.inner) }; + if code != 0 { + eprintln!("Error closing web scraper: {}", code); + } + } +} + +#[derive(Debug)] +pub enum WebScrapeErrorKind { + InvalidUrl, + Timeout, + NetworkError, + RenderingError, + MemoryError, + DepthExceeded, + RateLimited, + TransformError, + Utf8Error, + ParseError, + ScrapeFailed, + MapFailed, + CrawlFailed, + EmptyResponse, + InvalidTimeout, + InvalidWaitTime, + RuntimeError(String), +} + +impl From for WebScrapeErrorKind { + fn from(code: u8) -> Self { + match code { + 1 => WebScrapeErrorKind::InvalidUrl, + 2 => WebScrapeErrorKind::Timeout, + 3 => WebScrapeErrorKind::NetworkError, + 4 => WebScrapeErrorKind::RenderingError, + 5 => WebScrapeErrorKind::MemoryError, + 6 => WebScrapeErrorKind::DepthExceeded, + 7 => WebScrapeErrorKind::RateLimited, + 8 => WebScrapeErrorKind::TransformError, + 9 => WebScrapeErrorKind::RuntimeError(String::from("Invalid timeout")), + 10 => WebScrapeErrorKind::RuntimeError(String::from("Invalid wait time")), + _ => WebScrapeErrorKind::RuntimeError(String::from("Unknown error")), + } + } +} + +impl std::fmt::Display for WebScrapeErrorKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"), + WebScrapeErrorKind::Timeout => write!(f, "Request timeout"), + WebScrapeErrorKind::NetworkError => write!(f, "Network error"), + WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"), + WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"), + WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"), + WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"), + WebScrapeErrorKind::TransformError => write!(f, "Transform error"), + WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"), + WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"), + WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"), + WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"), + WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation failed"), + WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"), + WebScrapeErrorKind::InvalidTimeout => { + write!(f, "Timeout exceeds maximum allowed (120s)") + } + WebScrapeErrorKind::InvalidWaitTime => { + write!(f, "Wait time exceeds maximum allowed (20s)") + } + WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error), + } + } +} + +impl std::error::Error for WebScrapeErrorKind {} From deefc87f6a086de494a270ae998dda7d83a4c3fe Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:48:37 +1200 Subject: [PATCH 03/12] html to markdown impl --- src/bless_crawl/html_to_markdown.rs | 121 ++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 src/bless_crawl/html_to_markdown.rs diff --git a/src/bless_crawl/html_to_markdown.rs b/src/bless_crawl/html_to_markdown.rs new file mode 100644 index 0000000..f8eb03f --- /dev/null +++ b/src/bless_crawl/html_to_markdown.rs @@ -0,0 +1,121 @@ +use htmd::HtmlToMarkdown; +use regex::Regex; + +/// Parses HTML content and converts it to Markdown +/// +/// This function replicates the behavior of the JavaScript parseMarkdown function: +/// - Converts HTML to Markdown 
using htmd +/// - Processes multi-line links by escaping newlines inside link content +/// - Removes "Skip to Content" links +/// - Returns empty string for empty/null input +pub fn parse_markdown(html: &str) -> String { + if html.is_empty() { + return String::new(); + } + + // Convert HTML to Markdown using htmd + let markdown = match HtmlToMarkdown::new().convert(html) { + Ok(md) => md, + Err(_) => { + // Return empty string if conversion fails + return String::new(); + } + }; + + // Process the markdown content + let processed_markdown = process_multiline_links(&markdown); + let final_markdown = remove_skip_to_content_links(&processed_markdown); + + final_markdown +} + +/// Processes multi-line links by escaping newlines inside link content +/// +/// This function replicates the JavaScript processMultiLineLinks function: +/// - Tracks when we're inside link content (between [ and ]) +/// - Escapes newlines with backslash when inside links +fn process_multiline_links(markdown_content: &str) -> String { + let mut new_markdown_content = String::new(); + let mut link_open_count: usize = 0; + + for ch in markdown_content.chars() { + match ch { + '[' => { + link_open_count += 1; + } + ']' => { + link_open_count = link_open_count.saturating_sub(1); + } + _ => {} + } + + let inside_link_content = link_open_count > 0; + + if inside_link_content && ch == '\n' { + new_markdown_content.push('\\'); + new_markdown_content.push('\n'); + } else { + new_markdown_content.push(ch); + } + } + + new_markdown_content +} + +/// Removes "Skip to Content" links from the markdown content +/// +/// This function replicates the JavaScript removeSkipToContentLinks function: +/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns +/// - Case-insensitive matching +fn remove_skip_to_content_links(markdown_content: &str) -> String { + let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap(); + re.replace_all(markdown_content, "").to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_markdown_simple() { + let html = "

<html><body><p>Hello, world!</p></body></html>";
+        let result = parse_markdown(html);
+        assert_eq!(result.trim(), "Hello, world!");
+    }
+
+    #[test]
+    fn test_parse_markdown_complex() {
+        let html =
+            "

<html><body><p>Hello <b>bold</b> world!</p><ul><li>List item</li></ul></body></html>
"; + let result = parse_markdown(html); + assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item"); + } + + #[test] + fn test_parse_markdown_empty() { + let html = ""; + let result = parse_markdown(html); + assert_eq!(result, ""); + } + + #[test] + fn test_process_multiline_links() { + let markdown = "[Link\nwith newline](http://example.com)"; + let result = process_multiline_links(markdown); + assert_eq!(result, "[Link\\\nwith newline](http://example.com)"); + } + + #[test] + fn test_remove_skip_to_content_links() { + let markdown = "Some content [Skip to Content](#page) more content"; + let result = remove_skip_to_content_links(markdown); + assert_eq!(result, "Some content more content"); + } + + #[test] + fn test_remove_skip_to_content_links_case_insensitive() { + let markdown = "Some content [Skip to content](#skip) more content"; + let result = remove_skip_to_content_links(markdown); + assert_eq!(result, "Some content more content"); + } +} From dd01f92b1db6c3111a49a6129fa03ab355be472a Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:48:49 +1200 Subject: [PATCH 04/12] html transformation impl for include and exclude tags --- src/bless_crawl/html_transform.rs | 349 ++++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 src/bless_crawl/html_transform.rs diff --git a/src/bless_crawl/html_transform.rs b/src/bless_crawl/html_transform.rs new file mode 100644 index 0000000..0a0a596 --- /dev/null +++ b/src/bless_crawl/html_transform.rs @@ -0,0 +1,349 @@ +use kuchikiki::{parse_html, traits::TendrilSink}; +use serde::{Deserialize, Serialize}; +use url::Url; + +const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [ + "header", + "footer", + "nav", + "aside", + ".header", + ".top", + ".navbar", + "#header", + ".footer", + ".bottom", + "#footer", + ".sidebar", + ".side", + ".aside", + "#sidebar", + ".modal", + ".popup", + "#modal", + ".overlay", + ".ad", + ".ads", + ".advert", + "#ad", + ".lang-selector", + ".language", + "#language-selector", + ".social", + ".social-media", + ".social-links", + "#social", + ".menu", + ".navigation", + "#nav", + ".breadcrumbs", + "#breadcrumbs", + ".share", + "#share", + ".widget", + "#widget", + ".cookie", + "#cookie", +]; + +const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [ + "#main", + // swoogo event software as .widget in all of their content + ".swoogo-cols", + ".swoogo-text", + ".swoogo-table-div", + ".swoogo-space", + ".swoogo-alert", + ".swoogo-sponsors", + ".swoogo-title", + ".swoogo-tabs", + ".swoogo-logo", + ".swoogo-image", + ".swoogo-button", + ".swoogo-agenda", +]; + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TransformHtmlOptions { + pub html: String, + pub url: String, + pub include_tags: Vec, + pub exclude_tags: Vec, + pub only_main_content: bool, +} + +#[derive(Debug)] +struct ImageSource { + url: String, + size: i32, + is_x: bool, +} + +#[derive(Debug)] +pub enum HtmlTransformError { + ParseError, + UrlParseError, + SelectError, +} + +impl std::fmt::Display for HtmlTransformError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + HtmlTransformError::ParseError => write!(f, "Failed to parse HTML"), + HtmlTransformError::UrlParseError => write!(f, "Failed to parse URL"), + HtmlTransformError::SelectError => write!(f, "Failed to select HTML elements"), + } + } +} + +impl std::error::Error for HtmlTransformError {} + +/// Transforms HTML by removing unwanted elements, filtering tags, and processing URLs +pub fn transform_html(opts: TransformHtmlOptions) -> 
Result { + let mut document = parse_html().one(opts.html); + + // If include_tags is specified, only include those tags + if !opts.include_tags.is_empty() { + let new_document = parse_html().one("
"); + let root = new_document.select_first("div").map_err(|_| HtmlTransformError::SelectError)?; + + for tag_selector in opts.include_tags.iter() { + let matching_nodes: Vec<_> = document.select(tag_selector) + .map_err(|_| HtmlTransformError::SelectError)? + .collect(); + for tag in matching_nodes { + root.as_node().append(tag.as_node().clone()); + } + } + + document = new_document; + } + + // Remove unwanted elements + let unwanted_selectors = ["head", "meta", "noscript", "style", "script"]; + for selector in &unwanted_selectors { + while let Ok(element) = document.select_first(selector) { + element.as_node().detach(); + } + } + + // Remove excluded tags + for tag_selector in opts.exclude_tags.iter() { + while let Ok(element) = document.select_first(tag_selector) { + element.as_node().detach(); + } + } + + // Remove non-main content if requested + if opts.only_main_content { + for selector in EXCLUDE_NON_MAIN_TAGS.iter() { + let elements: Vec<_> = document.select(selector) + .map_err(|_| HtmlTransformError::SelectError)? + .collect(); + for element in elements { + // Check if this element contains any force-include tags + let should_keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|force_selector| { + element.as_node().select(force_selector) + .map(|mut iter| iter.next().is_some()) + .unwrap_or(false) + }); + + if !should_keep { + element.as_node().detach(); + } + } + } + } + + // Process images with srcset attributes + let srcset_images: Vec<_> = document.select("img[srcset]") + .map_err(|_| HtmlTransformError::SelectError)? + .collect(); + + for img in srcset_images { + let srcset = img.attributes.borrow().get("srcset").map(|s| s.to_string()); + if let Some(srcset) = srcset { + let mut sizes: Vec = srcset.split(',').filter_map(|entry| { + let tokens: Vec<&str> = entry.trim().split(' ').collect(); + if tokens.is_empty() { + return None; + } + + let size_token = if tokens.len() > 1 && !tokens[1].is_empty() { + tokens[1] + } else { + "1x" + }; + + if let Ok(parsed_size) = size_token[..size_token.len()-1].parse() { + Some(ImageSource { + url: tokens[0].to_string(), + size: parsed_size, + is_x: size_token.ends_with('x') + }) + } else { + None + } + }).collect(); + + // Add src attribute as 1x if all sizes are x-based + if sizes.iter().all(|s| s.is_x) { + let src = img.attributes.borrow().get("src").map(|s| s.to_string()); + if let Some(src) = src { + sizes.push(ImageSource { + url: src, + size: 1, + is_x: true, + }); + } + } + + // Sort by size (largest first) and use the biggest image + sizes.sort_by(|a, b| b.size.cmp(&a.size)); + if let Some(biggest) = sizes.first() { + img.attributes.borrow_mut().insert("src", biggest.url.clone()); + } + } + } + + // Convert relative URLs to absolute URLs + let base_url = Url::parse(&opts.url).map_err(|_| HtmlTransformError::UrlParseError)?; + + // Process image src attributes + let src_images: Vec<_> = document.select("img[src]") + .map_err(|_| HtmlTransformError::SelectError)? + .collect(); + for img in src_images { + let old_src = img.attributes.borrow().get("src").map(|s| s.to_string()); + if let Some(old_src) = old_src { + if let Ok(new_url) = base_url.join(&old_src) { + img.attributes.borrow_mut().insert("src", new_url.to_string()); + } + } + } + + // Process anchor href attributes + let href_anchors: Vec<_> = document.select("a[href]") + .map_err(|_| HtmlTransformError::SelectError)? 
+ .collect(); + for anchor in href_anchors { + let old_href = anchor.attributes.borrow().get("href").map(|s| s.to_string()); + if let Some(old_href) = old_href { + if let Ok(new_url) = base_url.join(&old_href) { + anchor.attributes.borrow_mut().insert("href", new_url.to_string()); + } + } + } + + Ok(document.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_transform_html_removes_unwanted_elements() { + let opts = TransformHtmlOptions { + html: "Test

Content

".to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = "

Content

"; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_include_tags() { + let opts = TransformHtmlOptions { + html: "
Keep this
Remove this
".to_string(), + url: "https://example.com".to_string(), + include_tags: vec![".content".to_string()], + exclude_tags: vec![], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = "
Keep this
"; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_exclude_tags() { + let opts = TransformHtmlOptions { + html: "
Keep this
Remove this
".to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![".ad".to_string()], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = "
Keep this
"; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_relative_urls() { + let opts = TransformHtmlOptions { + html: r#"Link"#.to_string(), + url: "https://example.com/subdir/".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = r#"Link"#; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_only_main_content() { + let opts = TransformHtmlOptions { + html: "
Header

Main content

Footer
".to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: true, + }; + + let result = transform_html(opts).unwrap(); + let expected = "

Main content

"; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_srcset_processing() { + let opts = TransformHtmlOptions { + html: r#""#.to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: false, + }; + + let result = transform_html(opts).unwrap(); + let expected = r#""#; + assert_eq!(result, expected); + } + + #[test] + fn test_transform_html_force_include_tags() { + let opts = TransformHtmlOptions { + html: r#"

Important content

"#.to_string(), + url: "https://example.com".to_string(), + include_tags: vec![], + exclude_tags: vec![], + only_main_content: true, + }; + + let result = transform_html(opts).unwrap(); + let expected = r#"

Important content

"#; + assert_eq!(result, expected); + } +} \ No newline at end of file From 3f82c4093dd9d5cae1adc9e420d06bd06a649fb0 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:48:59 +1200 Subject: [PATCH 05/12] bless-crawl plugin impl - lib --- src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index d67e81e..b60c611 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +mod bless_crawl; mod cgi; mod error; mod http; @@ -5,6 +6,7 @@ mod llm; mod memory; mod socket; +pub use bless_crawl::*; pub use cgi::*; pub use error::*; pub use http::*; From 057835fe9bf2f9e8afd336564fe86884884d83e7 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:49:32 +1200 Subject: [PATCH 06/12] webscrape example --- examples/web-scrape.rs | 72 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 examples/web-scrape.rs diff --git a/examples/web-scrape.rs b/examples/web-scrape.rs new file mode 100644 index 0000000..63f25a7 --- /dev/null +++ b/examples/web-scrape.rs @@ -0,0 +1,72 @@ +use blockless_sdk::*; + +/// This example demonstrates how to use the Blockless SDK to perform web scraping +/// using the BlessCrawl functionality. +/// +/// It shows how to: +/// - Create a BlessCrawl instance with default configuration +/// - Scrape content from a single URL with custom configuration overrides +/// - Map links from a webpage to discover available URLs +/// - Handle errors and responses appropriately +fn main() { + println!("=== Blockless Web Scraping SDK Example ===\n"); + + example_scraping(); + example_mapping(); + example_crawling(); +} + +fn example_scraping() { + println!("--- Example 1: Basic Web Scraping ---"); + + let url = "https://example.com"; + println!("scraping: {}...", url); + + // First scrape with default config + let response = BlessCrawl::default().scrape(url, None).expect("Failed to scrape"); + println!("response with default config: {:?}", response); + println!(); + println!("---------- markdown ----------\n{}\n------------------------------", response.data.content); +} + +fn example_mapping() { + println!("--- Example 2: Link Mapping/Discovery ---"); + + let url = "https://example.com"; + println!("Mapping links from: {}", url); + + let options = MapOptions::new() + .with_link_types(vec!["internal".to_string(), "external".to_string()]) + .with_base_url(url.to_string()) + .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]); + + let response = BlessCrawl::default().map(url, Some(options)).expect("Failed to map"); + println!("response: {:?}", response); + println!(); + println!("------------ links ------------\n{:?}\n------------------------------", response.data.links); + println!(); + println!("------------ total links ------------\n{}\n------------------------------", response.data.total_links); +} + +fn example_crawling() { + println!("--- Example 3: Recursive Website Crawling ---"); + + let url = "https://example.com"; + println!("Crawling website: {}", url); + + let options = CrawlOptions::new() + .with_max_depth(2) + .with_limit(10) + .with_include_paths(vec!["/".to_string()]) + .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()]) + .with_follow_external(false) + .with_delay_between_requests(1000) + .with_parallel_requests(3); + + let response = BlessCrawl::default().crawl(url, Some(options)).expect("Failed to crawl"); + println!("response: {:?}", response); + println!(); + println!("------------ pages ------------\n{:?}\n------------------------------", 
response.data.pages); + println!(); + println!("------------ total pages ------------\n{}\n------------------------------", response.data.total_pages); +} From 02316d3c948fa5994a5385b14742905f3be39645 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:49:40 +1200 Subject: [PATCH 07/12] readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e7d9dc..46a799e 100644 --- a/README.md +++ b/README.md @@ -87,7 +87,7 @@ cargo build --release --target wasm32-wasip1 --example llm-mcp | [httpbin](./examples/httpbin.rs) | HTTP to query anything from httpbin | ✅ | ✅ | | [llm](./examples/llm.rs) | LLM to chat with `Llama-3.1-8B-Instruct-q4f32_1-MLC` and `SmolLM2-1.7B-Instruct-q4f16_1-MLC` models | ✅ | ✅ | | [llm-mcp](./examples/llm-mcp.rs) | LLM with MCP (Model Control Protocol) demonstrating tool integration using SSE endpoints | ✅ | ✅ | - +| [web-scrape](./examples/web-scrape.rs) | Web Scraping to scrape content from a single URL with custom configuration overrides | ✅ | ❌ | ## Testing From f161018353a45ee8183792a312d295d28b769331 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:53:35 +1200 Subject: [PATCH 08/12] cargo fmt --all --- examples/web-scrape.rs | 37 ++++++++--- src/bless_crawl/html_transform.rs | 105 ++++++++++++++++++------------ src/bless_crawl/mod.rs | 54 ++++++++------- 3 files changed, 126 insertions(+), 70 deletions(-) diff --git a/examples/web-scrape.rs b/examples/web-scrape.rs index 63f25a7..d4165a3 100644 --- a/examples/web-scrape.rs +++ b/examples/web-scrape.rs @@ -23,10 +23,15 @@ fn example_scraping() { println!("scraping: {}...", url); // First scrape with default config - let response = BlessCrawl::default().scrape(url, None).expect("Failed to scrape"); + let response = BlessCrawl::default() + .scrape(url, None) + .expect("Failed to scrape"); println!("response with default config: {:?}", response); println!(); - println!("---------- markdown ----------\n{}\n------------------------------", response.data.content); + println!( + "---------- markdown ----------\n{}\n------------------------------", + response.data.content + ); } fn example_mapping() { @@ -40,12 +45,20 @@ fn example_mapping() { .with_base_url(url.to_string()) .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]); - let response = BlessCrawl::default().map(url, Some(options)).expect("Failed to map"); + let response = BlessCrawl::default() + .map(url, Some(options)) + .expect("Failed to map"); println!("response: {:?}", response); println!(); - println!("------------ links ------------\n{:?}\n------------------------------", response.data.links); + println!( + "------------ links ------------\n{:?}\n------------------------------", + response.data.links + ); println!(); - println!("------------ total links ------------\n{}\n------------------------------", response.data.total_links); + println!( + "------------ total links ------------\n{}\n------------------------------", + response.data.total_links + ); } fn example_crawling() { @@ -63,10 +76,18 @@ fn example_crawling() { .with_delay_between_requests(1000) .with_parallel_requests(3); - let response = BlessCrawl::default().crawl(url, Some(options)).expect("Failed to crawl"); + let response = BlessCrawl::default() + .crawl(url, Some(options)) + .expect("Failed to crawl"); println!("response: {:?}", response); println!(); - println!("------------ pages ------------\n{:?}\n------------------------------", response.data.pages); + println!( + "------------ pages 
------------\n{:?}\n------------------------------", + response.data.pages + ); println!(); - println!("------------ total pages ------------\n{}\n------------------------------", response.data.total_pages); + println!( + "------------ total pages ------------\n{}\n------------------------------", + response.data.total_pages + ); } diff --git a/src/bless_crawl/html_transform.rs b/src/bless_crawl/html_transform.rs index 0a0a596..8c56ebe 100644 --- a/src/bless_crawl/html_transform.rs +++ b/src/bless_crawl/html_transform.rs @@ -101,14 +101,17 @@ impl std::error::Error for HtmlTransformError {} /// Transforms HTML by removing unwanted elements, filtering tags, and processing URLs pub fn transform_html(opts: TransformHtmlOptions) -> Result { let mut document = parse_html().one(opts.html); - + // If include_tags is specified, only include those tags if !opts.include_tags.is_empty() { let new_document = parse_html().one("
"); - let root = new_document.select_first("div").map_err(|_| HtmlTransformError::SelectError)?; + let root = new_document + .select_first("div") + .map_err(|_| HtmlTransformError::SelectError)?; for tag_selector in opts.include_tags.iter() { - let matching_nodes: Vec<_> = document.select(tag_selector) + let matching_nodes: Vec<_> = document + .select(tag_selector) .map_err(|_| HtmlTransformError::SelectError)? .collect(); for tag in matching_nodes { @@ -137,17 +140,20 @@ pub fn transform_html(opts: TransformHtmlOptions) -> Result = document.select(selector) + let elements: Vec<_> = document + .select(selector) .map_err(|_| HtmlTransformError::SelectError)? .collect(); for element in elements { // Check if this element contains any force-include tags let should_keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|force_selector| { - element.as_node().select(force_selector) + element + .as_node() + .select(force_selector) .map(|mut iter| iter.next().is_some()) .unwrap_or(false) }); - + if !should_keep { element.as_node().detach(); } @@ -156,35 +162,39 @@ pub fn transform_html(opts: TransformHtmlOptions) -> Result = document.select("img[srcset]") + let srcset_images: Vec<_> = document + .select("img[srcset]") .map_err(|_| HtmlTransformError::SelectError)? .collect(); - + for img in srcset_images { let srcset = img.attributes.borrow().get("srcset").map(|s| s.to_string()); if let Some(srcset) = srcset { - let mut sizes: Vec = srcset.split(',').filter_map(|entry| { - let tokens: Vec<&str> = entry.trim().split(' ').collect(); - if tokens.is_empty() { - return None; - } - - let size_token = if tokens.len() > 1 && !tokens[1].is_empty() { - tokens[1] - } else { - "1x" - }; - - if let Ok(parsed_size) = size_token[..size_token.len()-1].parse() { - Some(ImageSource { - url: tokens[0].to_string(), - size: parsed_size, - is_x: size_token.ends_with('x') - }) - } else { - None - } - }).collect(); + let mut sizes: Vec = srcset + .split(',') + .filter_map(|entry| { + let tokens: Vec<&str> = entry.trim().split(' ').collect(); + if tokens.is_empty() { + return None; + } + + let size_token = if tokens.len() > 1 && !tokens[1].is_empty() { + tokens[1] + } else { + "1x" + }; + + if let Ok(parsed_size) = size_token[..size_token.len() - 1].parse() { + Some(ImageSource { + url: tokens[0].to_string(), + size: parsed_size, + is_x: size_token.ends_with('x'), + }) + } else { + None + } + }) + .collect(); // Add src attribute as 1x if all sizes are x-based if sizes.iter().all(|s| s.is_x) { @@ -201,36 +211,49 @@ pub fn transform_html(opts: TransformHtmlOptions) -> Result = document.select("img[src]") + let src_images: Vec<_> = document + .select("img[src]") .map_err(|_| HtmlTransformError::SelectError)? .collect(); for img in src_images { let old_src = img.attributes.borrow().get("src").map(|s| s.to_string()); if let Some(old_src) = old_src { if let Ok(new_url) = base_url.join(&old_src) { - img.attributes.borrow_mut().insert("src", new_url.to_string()); + img.attributes + .borrow_mut() + .insert("src", new_url.to_string()); } } } // Process anchor href attributes - let href_anchors: Vec<_> = document.select("a[href]") + let href_anchors: Vec<_> = document + .select("a[href]") .map_err(|_| HtmlTransformError::SelectError)? 
.collect(); for anchor in href_anchors { - let old_href = anchor.attributes.borrow().get("href").map(|s| s.to_string()); + let old_href = anchor + .attributes + .borrow() + .get("href") + .map(|s| s.to_string()); if let Some(old_href) = old_href { if let Ok(new_url) = base_url.join(&old_href) { - anchor.attributes.borrow_mut().insert("href", new_url.to_string()); + anchor + .attributes + .borrow_mut() + .insert("href", new_url.to_string()); } } } @@ -268,7 +291,8 @@ mod tests { }; let result = transform_html(opts).unwrap(); - let expected = "
Keep this
"; + let expected = + "
Keep this
"; assert_eq!(result, expected); } @@ -290,7 +314,8 @@ mod tests { #[test] fn test_transform_html_relative_urls() { let opts = TransformHtmlOptions { - html: r#"Link"#.to_string(), + html: r#"Link"# + .to_string(), url: "https://example.com/subdir/".to_string(), include_tags: vec![], exclude_tags: vec![], @@ -346,4 +371,4 @@ mod tests { let expected = r#"

Important content

"#; assert_eq!(result, expected); } -} \ No newline at end of file +} diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index 9c35b9b..1d1e509 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -22,7 +22,7 @@ //! // Create with default config //! let crawler = BlessCrawl::default(); //! let result = crawler.scrape("https://example.com", None).unwrap(); -//! +//! //! // Or override config per request //! let custom_config = ScrapeOptions { timeout: 30000, wait_time: 5000, ..Default::default() }; //! let result = crawler.scrape("https://example.com", Some(custom_config)).unwrap(); @@ -33,7 +33,7 @@ mod html_to_markdown; mod html_transform; use html_to_markdown::parse_markdown; -pub use html_transform::{transform_html, TransformHtmlOptions, HtmlTransformError}; +pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions}; use std::collections::HashMap; type Handle = u32; @@ -318,7 +318,10 @@ impl ScrapeOptions { } pub fn with_viewport(mut self, width: u32, height: u32) -> Self { - self.viewport = Some(Viewport { width: Some(width), height: Some(height) }); + self.viewport = Some(Viewport { + width: Some(width), + height: Some(height), + }); self } @@ -449,7 +452,11 @@ impl BlessCrawl { } /// Scrapes webpage content and returns it as markdown with metadata. - pub fn scrape(&self, url: &str, options: Option) -> Result, WebScrapeErrorKind> { + pub fn scrape( + &self, + url: &str, + options: Option, + ) -> Result, WebScrapeErrorKind> { // Use provided options or fall back to instance config let config = if let Some(opts) = options { self.validate_config(&opts)?; @@ -491,13 +498,11 @@ impl BlessCrawl { unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; // deserialize the result to host ScrapeResponse - let mut scrape_response = serde_json::from_slice::>( - result_bytes, - ) - .map_err(|e| { - eprintln!("error: {:?}", e); - WebScrapeErrorKind::ParseError - })?; + let mut scrape_response = serde_json::from_slice::>(result_bytes) + .map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; if let Some(error) = scrape_response.error { return Err(WebScrapeErrorKind::RuntimeError(error)); @@ -510,7 +515,8 @@ impl BlessCrawl { include_tags: config.include_tags.unwrap_or_default(), exclude_tags: config.exclude_tags.unwrap_or_default(), only_main_content: config.only_main_content, - }).map_err(|e| { + }) + .map_err(|e| { eprintln!("error: {:?}", e); WebScrapeErrorKind::TransformError })?; @@ -573,10 +579,11 @@ impl BlessCrawl { unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; // deserialize the result to MapResponse - let map_response = serde_json::from_slice::>(result_bytes).map_err(|e| { - eprintln!("error: {:?}", e); - WebScrapeErrorKind::ParseError - })?; + let map_response = + serde_json::from_slice::>(result_bytes).map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; if let Some(error) = map_response.error { return Err(WebScrapeErrorKind::RuntimeError(error)); @@ -630,11 +637,13 @@ impl BlessCrawl { unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) }; // deserialize the result to CrawlResponse - let mut host_crawl_response = - serde_json::from_slice::>>(result_bytes).map_err(|e| { - eprintln!("error: {:?}", e); - WebScrapeErrorKind::ParseError - })?; + let mut host_crawl_response = serde_json::from_slice::>>( + result_bytes, + ) + .map_err(|e| { + eprintln!("error: {:?}", e); + WebScrapeErrorKind::ParseError + })?; if let 
Some(error) = host_crawl_response.error { return Err(WebScrapeErrorKind::RuntimeError(error)); @@ -648,7 +657,8 @@ impl BlessCrawl { include_tags: self.config.include_tags.clone().unwrap_or_default(), exclude_tags: self.config.exclude_tags.clone().unwrap_or_default(), only_main_content: self.config.only_main_content, - }).map_err(|e| { + }) + .map_err(|e| { eprintln!("error: {:?}", e); WebScrapeErrorKind::TransformError })?; From 260340a9f92b4f380d60507b7ab61d95977a1c9d Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:56:05 +1200 Subject: [PATCH 09/12] fixed clippy errors --- src/bless_crawl/html_to_markdown.rs | 4 +--- src/bless_crawl/mod.rs | 8 +------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/bless_crawl/html_to_markdown.rs b/src/bless_crawl/html_to_markdown.rs index f8eb03f..9137634 100644 --- a/src/bless_crawl/html_to_markdown.rs +++ b/src/bless_crawl/html_to_markdown.rs @@ -24,9 +24,7 @@ pub fn parse_markdown(html: &str) -> String { // Process the markdown content let processed_markdown = process_multiline_links(&markdown); - let final_markdown = remove_skip_to_content_links(&processed_markdown); - - final_markdown + remove_skip_to_content_links(&processed_markdown) } /// Processes multi-line links by escaping newlines inside link content diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index 1d1e509..bbc0858 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -174,12 +174,6 @@ pub enum Format { Json, } -impl std::fmt::Display for Format { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.to_string().to_lowercase()) - } -} - impl std::str::FromStr for Format { type Err = (); fn from_str(s: &str) -> Result { @@ -420,7 +414,7 @@ impl BlessCrawl { pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024; /// Maximum result buffer size in bytes (1MB) - pub const MAX_MAP_BUFFER_SIZE: usize = 1 * 1024 * 1024; + pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024; /// Maximum result buffer size in bytes (8MB) pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024; From 7460e59f72e0807b0500db76286951142b8576ca Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 16:58:42 +1200 Subject: [PATCH 10/12] fixed clippy warnings --- src/bless_crawl/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index bbc0858..78d0ed1 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -43,6 +43,7 @@ type ExitCode = u8; #[link(wasm_import_module = "bless_crawl")] extern "C" { /// Scrape webpage content and return as markdown + #[allow(clippy::too_many_arguments)] fn scrape( h: *mut Handle, url_ptr: *const u8, @@ -55,6 +56,7 @@ extern "C" { ) -> ExitCode; /// Extract and return all discoverable links from webpage + #[allow(clippy::too_many_arguments)] fn map( h: *mut Handle, url_ptr: *const u8, @@ -67,6 +69,7 @@ extern "C" { ) -> ExitCode; /// Recursively crawl website starting from given URL + #[allow(clippy::too_many_arguments)] fn crawl( h: *mut Handle, url_ptr: *const u8, @@ -87,6 +90,7 @@ extern "C" { mod mock_ffi { use super::{ExitCode, Handle}; + #[allow(clippy::too_many_arguments)] pub unsafe fn scrape( h: *mut Handle, _url_ptr: *const u8, @@ -100,6 +104,7 @@ mod mock_ffi { unimplemented!() } + #[allow(clippy::too_many_arguments)] pub unsafe fn map( h: *mut Handle, _url_ptr: *const u8, @@ -113,6 +118,7 @@ mod mock_ffi { unimplemented!() } + #[allow(clippy::too_many_arguments)] pub unsafe fn crawl( h: *mut Handle, 
_url_ptr: *const u8, From eb02b5f54850d9a05767a83c1fc5080c0e434645 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 17:01:10 +1200 Subject: [PATCH 11/12] return 1 as exitcode for mock-ffi impl --- src/bless_crawl/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index 78d0ed1..159f67d 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -101,7 +101,7 @@ mod mock_ffi { result_len: usize, bytes_written: *mut usize, ) -> ExitCode { - unimplemented!() + 1 } #[allow(clippy::too_many_arguments)] @@ -115,7 +115,7 @@ mod mock_ffi { result_len: usize, bytes_written: *mut usize, ) -> ExitCode { - unimplemented!() + 1 } #[allow(clippy::too_many_arguments)] @@ -129,11 +129,11 @@ mod mock_ffi { result_len: usize, bytes_written: *mut usize, ) -> ExitCode { - unimplemented!() + 1 } pub unsafe fn close(_h: Handle) -> ExitCode { - unimplemented!() + 1 } } From 1e9295fb36baf8a4b151cdf507831fd1756dde86 Mon Sep 17 00:00:00 2001 From: z Date: Wed, 25 Jun 2025 17:09:03 +1200 Subject: [PATCH 12/12] fixed doc tests --- src/bless_crawl/mod.rs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs index 159f67d..8a3ffc2 100644 --- a/src/bless_crawl/mod.rs +++ b/src/bless_crawl/mod.rs @@ -13,21 +13,6 @@ //! - Timeout: 15s default, 120s max //! - Wait time: 3s default, 20s max //! - Buffer sizes: 2MB (scrape), 128KB (map), 8MB (crawl) -//! -//! ## Example -//! -//! ```rust -//! use blockless_sdk::*; -//! -//! // Create with default config -//! let crawler = BlessCrawl::default(); -//! let result = crawler.scrape("https://example.com", None).unwrap(); -//! -//! // Or override config per request -//! let custom_config = ScrapeOptions { timeout: 30000, wait_time: 5000, ..Default::default() }; -//! let result = crawler.scrape("https://example.com", Some(custom_config)).unwrap(); -//! println!("Content: {}", result.data.content); -//! ``` mod html_to_markdown; mod html_transform;
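
For reference, a minimal usage sketch of the `BlessCrawl` API introduced in this series, mirroring `examples/web-scrape.rs`. It assumes the module is built for `wasm32-wasip1` and runs on a Blockless runtime that exposes the `bless_crawl` host module (under the `mock-ffi` feature the host calls simply return an error code). The exclude-tag selectors below are illustrative and not part of the patches.

```rust
use blockless_sdk::*;

fn main() {
    // Default client: 15s timeout, 3s wait time, markdown output.
    let crawler = BlessCrawl::default();

    // Per-request override via the builder (selectors here are only examples).
    let options = ScrapeOptions::new()
        .with_format(Format::Markdown)
        .with_exclude_tags(vec!["nav".to_string(), ".ad".to_string()]);

    match crawler.scrape("https://example.com", Some(options)) {
        Ok(response) => println!("scraped markdown:\n{}", response.data.content),
        Err(err) => eprintln!("scrape failed: {}", err),
    }
}
```

The builder keeps the 15s timeout and 3s wait-time defaults unless overridden, and `validate_config` rejects values above the 120s / 20s maximums before the host call is made.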