8 changes: 5 additions & 3 deletions Cargo.toml
@@ -10,11 +10,13 @@ license = "MIT/Apache-2.0"
repository = "https://github.com/blocklessnetwork/sdk-rust"

[dependencies]
htmd = { version = "0.2.2", default-features = false }
json = { version = "0.12", default-features = false }
kuchikiki = { version = "0.8", default-features = false }
regex = { version = "1.11.1", default-features = false, features = ["unicode-case"] }
serde = { version = "1.0", features = ["derive"], optional = true }

[dev-dependencies]
serde_json = "1.0"
serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
url = { version = "2.5", default-features = false }

[features]
default = ["serde"]
2 changes: 1 addition & 1 deletion README.md
@@ -87,7 +87,7 @@ cargo build --release --target wasm32-wasip1 --example llm-mcp
| [httpbin](./examples/httpbin.rs) | HTTP to query anything from httpbin | ✅ | ✅ |
| [llm](./examples/llm.rs) | LLM to chat with `Llama-3.1-8B-Instruct-q4f32_1-MLC` and `SmolLM2-1.7B-Instruct-q4f16_1-MLC` models | ✅ | ✅ |
| [llm-mcp](./examples/llm-mcp.rs) | LLM with MCP (Model Context Protocol) demonstrating tool integration using SSE endpoints | ✅ | ✅ |
| [web-scrape](./examples/web-scrape.rs) | Web scraping to extract content from a single URL with custom configuration overrides | ✅ | ❌ |

## Testing

93 changes: 93 additions & 0 deletions examples/web-scrape.rs
@@ -0,0 +1,93 @@
use blockless_sdk::*;

/// This example demonstrates how to use the Blockless SDK to perform web scraping
/// using the BlessCrawl functionality.
///
/// It shows how to:
/// - Create a BlessCrawl instance with default configuration
/// - Scrape content from a single URL with custom configuration overrides
/// - Map links from a webpage to discover available URLs
/// - Crawl a website recursively with depth, path, and concurrency controls
/// - Handle errors and responses appropriately
fn main() {
    println!("=== Blockless Web Scraping SDK Example ===\n");

    example_scraping();
    example_mapping();
    example_crawling();
}

fn example_scraping() {
    println!("--- Example 1: Basic Web Scraping ---");

    let url = "https://example.com";
    println!("scraping: {}...", url);

    // First scrape with default config
    let response = BlessCrawl::default()
        .scrape(url, None)
        .expect("Failed to scrape");
    println!("response with default config: {:?}", response);
    println!();
    println!(
        "---------- markdown ----------\n{}\n------------------------------",
        response.data.content
    );
}
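// The doc comment above (and the README row) mention scraping with custom
// configuration overrides, but this example only passes `None`. A minimal
// sketch of what an override call might look like, assuming a `ScrapeOptions`
// builder analogous to the `MapOptions`/`CrawlOptions` builders used below;
// the type name and the `Some(options)` plumbing are assumptions, not the
// SDK's confirmed API:
//
//     let options = ScrapeOptions::new();
//     let response = BlessCrawl::default()
//         .scrape(url, Some(options))
//         .expect("Failed to scrape with overrides");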

fn example_mapping() {
    println!("--- Example 2: Link Mapping/Discovery ---");

    let url = "https://example.com";
    println!("Mapping links from: {}", url);

    let options = MapOptions::new()
        .with_link_types(vec!["internal".to_string(), "external".to_string()])
        .with_base_url(url.to_string())
        .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]);

    let response = BlessCrawl::default()
        .map(url, Some(options))
        .expect("Failed to map");
    println!("response: {:?}", response);
    println!();
    println!(
        "------------ links ------------\n{:?}\n------------------------------",
        response.data.links
    );
    println!();
    println!(
        "------------ total links ------------\n{}\n------------------------------",
        response.data.total_links
    );
}

fn example_crawling() {
    println!("--- Example 3: Recursive Website Crawling ---");

    let url = "https://example.com";
    println!("Crawling website: {}", url);

    let options = CrawlOptions::new()
        .with_max_depth(2)
        .with_limit(10)
        .with_include_paths(vec!["/".to_string()])
        .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()])
        .with_follow_external(false)
        .with_delay_between_requests(1000)
        .with_parallel_requests(3);

    let response = BlessCrawl::default()
        .crawl(url, Some(options))
        .expect("Failed to crawl");
    println!("response: {:?}", response);
    println!();
    println!(
        "------------ pages ------------\n{:?}\n------------------------------",
        response.data.pages
    );
    println!();
    println!(
        "------------ total pages ------------\n{}\n------------------------------",
        response.data.total_pages
    );
}
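To build and run this example, the README's existing pattern for the other examples should apply (the invocation below mirrors the llm-mcp command shown above; treat the exact flags as an assumption):

cargo build --release --target wasm32-wasip1 --example web-scrape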
119 changes: 119 additions & 0 deletions src/bless_crawl/html_to_markdown.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
use htmd::HtmlToMarkdown;
use regex::Regex;

/// Parses HTML content and converts it to Markdown
///
/// This function replicates the behavior of the JavaScript parseMarkdown function:
/// - Converts HTML to Markdown using htmd
/// - Processes multi-line links by escaping newlines inside link content
/// - Removes "Skip to Content" links
/// - Returns an empty string for empty input or failed conversion (the JS version also handled null)
pub fn parse_markdown(html: &str) -> String {
    if html.is_empty() {
        return String::new();
    }

    // Convert HTML to Markdown using htmd
    let markdown = match HtmlToMarkdown::new().convert(html) {
        Ok(md) => md,
        Err(_) => {
            // Return empty string if conversion fails
            return String::new();
        }
    };

    // Process the markdown content
    let processed_markdown = process_multiline_links(&markdown);
    remove_skip_to_content_links(&processed_markdown)
}

/// Processes multi-line links by escaping newlines inside link content
///
/// This function replicates the JavaScript processMultiLineLinks function:
/// - Tracks when we're inside link content (between [ and ])
/// - Escapes newlines with backslash when inside links
fn process_multiline_links(markdown_content: &str) -> String {
    let mut new_markdown_content = String::new();
    let mut link_open_count: usize = 0;

    for ch in markdown_content.chars() {
        match ch {
            '[' => {
                link_open_count += 1;
            }
            ']' => {
                link_open_count = link_open_count.saturating_sub(1);
            }
            _ => {}
        }

        let inside_link_content = link_open_count > 0;

        if inside_link_content && ch == '\n' {
            new_markdown_content.push('\\');
            new_markdown_content.push('\n');
        } else {
            new_markdown_content.push(ch);
        }
    }

    new_markdown_content
}

/// Removes "Skip to Content" links from the markdown content
///
/// This function replicates the JavaScript removeSkipToContentLinks function:
/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns
/// - Case-insensitive matching
fn remove_skip_to_content_links(markdown_content: &str) -> String {
let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap();
re.replace_all(markdown_content, "").to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_markdown_simple() {
        let html = "<p>Hello, world!</p>";
        let result = parse_markdown(html);
        assert_eq!(result.trim(), "Hello, world!");
    }

    #[test]
    fn test_parse_markdown_complex() {
        let html =
            "<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>";
        let result = parse_markdown(html);
        assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item");
    }

    #[test]
    fn test_parse_markdown_empty() {
        let html = "";
        let result = parse_markdown(html);
        assert_eq!(result, "");
    }

    #[test]
    fn test_process_multiline_links() {
        let markdown = "[Link\nwith newline](http://example.com)";
        let result = process_multiline_links(markdown);
        assert_eq!(result, "[Link\\\nwith newline](http://example.com)");
    }

    #[test]
    fn test_remove_skip_to_content_links() {
        let markdown = "Some content [Skip to Content](#page) more content";
        let result = remove_skip_to_content_links(markdown);
        assert_eq!(result, "Some content more content");
    }

    #[test]
    fn test_remove_skip_to_content_links_case_insensitive() {
        let markdown = "Some content [Skip to content](#skip) more content";
        let result = remove_skip_to_content_links(markdown);
        assert_eq!(result, "Some content more content");
    }
}
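A note on running these tests: since the helpers above depend only on htmd and regex (no Blockless host imports), they should run natively with a plain `cargo test`, no wasm32 target required. That reading is an assumption based on the code shown in this diff.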