8 changes: 5 additions & 3 deletions Cargo.toml
@@ -10,11 +10,13 @@ license = "MIT/Apache-2.0"
repository = "https://github.com/blocklessnetwork/sdk-rust"

[dependencies]
htmd = { version = "0.2.2", default-features = false }
json = { version = "0.12", default-features = false }
kuchikiki = { version = "0.8", default-features = false }
regex = { version = "1.11.1", default-features = false, features = ["unicode-case"] }
serde = { version = "1.0", features = ["derive"], optional = true }

[dev-dependencies]
serde_json = "1.0"
serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
url = { version = "2.5", default-features = false }

[features]
default = ["serde"]
2 changes: 1 addition & 1 deletion README.md
@@ -87,7 +87,7 @@ cargo build --release --target wasm32-wasip1 --example llm-mcp
| [httpbin](./examples/httpbin.rs) | HTTP to query anything from httpbin | ✅ | ✅ |
| [llm](./examples/llm.rs) | LLM to chat with `Llama-3.1-8B-Instruct-q4f32_1-MLC` and `SmolLM2-1.7B-Instruct-q4f16_1-MLC` models | ✅ | ✅ |
| [llm-mcp](./examples/llm-mcp.rs) | LLM with MCP (Model Context Protocol) demonstrating tool integration using SSE endpoints | ✅ | ✅ |
| [web-scrape](./examples/web-scrape.rs) | Web scraping to extract content from a single URL with custom configuration overrides | ✅ | ❌ |

## Testing

93 changes: 93 additions & 0 deletions examples/web-scrape.rs
@@ -0,0 +1,93 @@
use blockless_sdk::*;

/// This example demonstrates how to use the Blockless SDK to perform web scraping
/// using the BlessCrawl functionality.
///
/// It shows how to:
/// - Create a BlessCrawl instance with default configuration
/// - Scrape content from a single URL with custom configuration overrides
/// - Map links from a webpage to discover available URLs
/// - Crawl a website recursively with depth, path, and concurrency controls
/// - Handle errors and responses appropriately
fn main() {
    println!("=== Blockless Web Scraping SDK Example ===\n");

    example_scraping();
    example_mapping();
    example_crawling();
}

fn example_scraping() {
    println!("--- Example 1: Basic Web Scraping ---");

    let url = "https://example.com";
    println!("scraping: {}...", url);

    // First scrape with default config
    let response = BlessCrawl::default()
        .scrape(url, None)
        .expect("Failed to scrape");
    println!("response with default config: {:?}", response);
    println!();
    println!(
        "---------- markdown ----------\n{}\n------------------------------",
        response.data.content
    );
}
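// The doc comment above (and the README row) mention scraping with custom
// configuration overrides, but this example only passes `None`. A minimal
// sketch of what an override call might look like, assuming a `ScrapeOptions`
// builder analogous to the `MapOptions`/`CrawlOptions` builders used below;
// the type name and the `Some(options)` plumbing are assumptions, not the
// SDK's confirmed API:
//
//     let options = ScrapeOptions::new();
//     let response = BlessCrawl::default()
//         .scrape(url, Some(options))
//         .expect("Failed to scrape with overrides");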

fn example_mapping() {
    println!("--- Example 2: Link Mapping/Discovery ---");

    let url = "https://example.com";
    println!("Mapping links from: {}", url);

    let options = MapOptions::new()
        .with_link_types(vec!["internal".to_string(), "external".to_string()])
        .with_base_url(url.to_string())
        .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]);

    let response = BlessCrawl::default()
        .map(url, Some(options))
        .expect("Failed to map");
    println!("response: {:?}", response);
    println!();
    println!(
        "------------ links ------------\n{:?}\n------------------------------",
        response.data.links
    );
    println!();
    println!(
        "------------ total links ------------\n{}\n------------------------------",
        response.data.total_links
    );
}

fn example_crawling() {
    println!("--- Example 3: Recursive Website Crawling ---");

    let url = "https://example.com";
    println!("Crawling website: {}", url);

    let options = CrawlOptions::new()
        .with_max_depth(2)
        .with_limit(10)
        .with_include_paths(vec!["/".to_string()])
        .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()])
        .with_follow_external(false)
        .with_delay_between_requests(1000)
        .with_parallel_requests(3);

    let response = BlessCrawl::default()
        .crawl(url, Some(options))
        .expect("Failed to crawl");
    println!("response: {:?}", response);
    println!();
    println!(
        "------------ pages ------------\n{:?}\n------------------------------",
        response.data.pages
    );
    println!();
    println!(
        "------------ total pages ------------\n{}\n------------------------------",
        response.data.total_pages
    );
}
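To build and run this example, the README's existing pattern for the other examples should apply (the invocation below mirrors the llm-mcp command shown above; treat the exact flags as an assumption):

cargo build --release --target wasm32-wasip1 --example web-scrape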
119 changes: 119 additions & 0 deletions src/bless_crawl/html_to_markdown.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
use htmd::HtmlToMarkdown;
use regex::Regex;

/// Parses HTML content and converts it to Markdown
///
/// This function replicates the behavior of the JavaScript parseMarkdown function:
/// - Converts HTML to Markdown using htmd
/// - Processes multi-line links by escaping newlines inside link content
/// - Removes "Skip to Content" links
/// - Returns an empty string for empty input or failed conversion (the JS version also handled null)
pub fn parse_markdown(html: &str) -> String {
    if html.is_empty() {
        return String::new();
    }

    // Convert HTML to Markdown using htmd
    let markdown = match HtmlToMarkdown::new().convert(html) {
        Ok(md) => md,
        Err(_) => {
            // Return empty string if conversion fails
            return String::new();
        }
    };

    // Process the markdown content
    let processed_markdown = process_multiline_links(&markdown);
    remove_skip_to_content_links(&processed_markdown)
}

/// Processes multi-line links by escaping newlines inside link content
///
/// This function replicates the JavaScript processMultiLineLinks function:
/// - Tracks when we're inside link content (between [ and ])
/// - Escapes newlines with backslash when inside links
fn process_multiline_links(markdown_content: &str) -> String {
    let mut new_markdown_content = String::new();
    let mut link_open_count: usize = 0;

    for ch in markdown_content.chars() {
        match ch {
            '[' => {
                link_open_count += 1;
            }
            ']' => {
                link_open_count = link_open_count.saturating_sub(1);
            }
            _ => {}
        }

        let inside_link_content = link_open_count > 0;

        if inside_link_content && ch == '\n' {
            new_markdown_content.push('\\');
            new_markdown_content.push('\n');
        } else {
            new_markdown_content.push(ch);
        }
    }

    new_markdown_content
}

/// Removes "Skip to Content" links from the markdown content
///
/// This function replicates the JavaScript removeSkipToContentLinks function:
/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns
/// - Case-insensitive matching
fn remove_skip_to_content_links(markdown_content: &str) -> String {
let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap();
re.replace_all(markdown_content, "").to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_markdown_simple() {
        let html = "<p>Hello, world!</p>";
        let result = parse_markdown(html);
        assert_eq!(result.trim(), "Hello, world!");
    }

    #[test]
    fn test_parse_markdown_complex() {
        let html =
            "<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>";
        let result = parse_markdown(html);
        assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item");
    }

    #[test]
    fn test_parse_markdown_empty() {
        let html = "";
        let result = parse_markdown(html);
        assert_eq!(result, "");
    }

    #[test]
    fn test_process_multiline_links() {
        let markdown = "[Link\nwith newline](http://example.com)";
        let result = process_multiline_links(markdown);
        assert_eq!(result, "[Link\\\nwith newline](http://example.com)");
    }

    #[test]
    fn test_remove_skip_to_content_links() {
        let markdown = "Some content [Skip to Content](#page) more content";
        let result = remove_skip_to_content_links(markdown);
        assert_eq!(result, "Some content more content");
    }

    #[test]
    fn test_remove_skip_to_content_links_case_insensitive() {
        let markdown = "Some content [Skip to content](#skip) more content";
        let result = remove_skip_to_content_links(markdown);
        assert_eq!(result, "Some content more content");
    }
}
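A note on running these tests: since the helpers above depend only on htmd and regex (no Blockless host imports), they should run natively with a plain `cargo test`, no wasm32 target required. That reading is an assumption based on the code shown in this diff.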