diff --git a/Cargo.toml b/Cargo.toml
index a464f6d..99e23b7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,10 +11,7 @@ repository = "https://github.com/blocklessnetwork/sdk-rust"
[dependencies]
base64 = { version = "0.13", default-features = false, features = ["alloc"], optional = true }
-htmd = { version = "0.2.2", default-features = false, optional = true }
json = { version = "0.12", default-features = false }
-kuchikiki = { version = "0.8", default-features = false, optional = true }
-regex = { version = "1.11.1", default-features = false, features = ["unicode-case"], optional = true }
serde = { version = "1.0", features = ["derive"], optional = true }
serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
url = { version = "2.5", default-features = false }
@@ -23,7 +20,6 @@ url = { version = "2.5", default-features = false }
default = [
"http",
"llm",
- "bless-crawl",
"cgi",
"socket",
"memory",
@@ -32,7 +28,6 @@ default = [
mock-ffi = []
http = ["rpc", "dep:base64", "dep:serde"]
llm = ["dep:serde"]
-bless-crawl = ["http", "dep:htmd", "dep:kuchikiki", "dep:regex", "dep:serde"]
cgi = []
socket = []
memory = []
diff --git a/examples/web-scrape.rs b/examples/web-scrape.rs
deleted file mode 100644
index 552aa39..0000000
--- a/examples/web-scrape.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-use blockless_sdk::bless_crawl::*;
-
-/// This example demonstrates how to use the Blockless SDK to perform web scraping
-/// using the BlessCrawl functionality.
-///
-/// It shows how to:
-/// - Create a BlessCrawl instance with default configuration
-/// - Scrape content from a single URL and print the returned markdown
-/// - Map links and recursively crawl a website to discover available URLs
-/// - Handle errors and responses appropriately
-fn main() {
- println!("=== Blockless Web Scraping SDK Example ===\n");
-
- example_scraping();
- example_mapping();
- example_crawling();
-}
-
-fn example_scraping() {
- println!("--- Example 1: Basic Web Scraping ---");
-
- let url = "https://example.com";
- println!("scraping: {}...", url);
-
- // First scrape with default config
- let response = BlessCrawl::default()
- .scrape(url, None)
- .expect("Failed to scrape");
- println!("response with default config: {:?}", response);
- println!();
- println!(
- "---------- markdown ----------\n{}\n------------------------------",
- response.data.content
- );
-}
-
-fn example_mapping() {
- println!("--- Example 2: Link Mapping/Discovery ---");
-
- let url = "https://example.com";
- println!("Mapping links from: {}", url);
-
- let options = MapOptions::new()
- .with_link_types(vec!["internal".to_string(), "external".to_string()])
- .with_base_url(url.to_string())
- .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]);
-
- let response = BlessCrawl::default()
- .map(url, Some(options))
- .expect("Failed to map");
- println!("response: {:?}", response);
- println!();
- println!(
- "------------ links ------------\n{:?}\n------------------------------",
- response.data.links
- );
- println!();
- println!(
- "------------ total links ------------\n{}\n------------------------------",
- response.data.total_links
- );
-}
-
-fn example_crawling() {
- println!("--- Example 3: Recursive Website Crawling ---");
-
- let url = "https://example.com";
- println!("Crawling website: {}", url);
-
- let options = CrawlOptions::new()
- .with_max_depth(2)
- .with_limit(10)
- .with_include_paths(vec!["/".to_string()])
- .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()])
- .with_follow_external(false)
- .with_delay_between_requests(1000)
- .with_parallel_requests(3);
-
- let response = BlessCrawl::default()
- .crawl(url, Some(options))
- .expect("Failed to crawl");
- println!("response: {:?}", response);
- println!();
- println!(
- "------------ pages ------------\n{:?}\n------------------------------",
- response.data.pages
- );
- println!();
- println!(
- "------------ total pages ------------\n{}\n------------------------------",
- response.data.total_pages
- );
-}
diff --git a/src/bless_crawl/html_to_markdown.rs b/src/bless_crawl/html_to_markdown.rs
deleted file mode 100644
index 9137634..0000000
--- a/src/bless_crawl/html_to_markdown.rs
+++ /dev/null
@@ -1,119 +0,0 @@
-use htmd::HtmlToMarkdown;
-use regex::Regex;
-
-/// Parses HTML content and converts it to Markdown
-///
-/// This function replicates the behavior of the JavaScript parseMarkdown function:
-/// - Converts HTML to Markdown using htmd
-/// - Processes multi-line links by escaping newlines inside link content
-/// - Removes "Skip to Content" links
-/// - Returns empty string for empty/null input
-pub fn parse_markdown(html: &str) -> String {
- if html.is_empty() {
- return String::new();
- }
-
- // Convert HTML to Markdown using htmd
- let markdown = match HtmlToMarkdown::new().convert(html) {
- Ok(md) => md,
- Err(_) => {
- // Return empty string if conversion fails
- return String::new();
- }
- };
-
- // Process the markdown content
- let processed_markdown = process_multiline_links(&markdown);
- remove_skip_to_content_links(&processed_markdown)
-}
-
-/// Processes multi-line links by escaping newlines inside link content
-///
-/// This function replicates the JavaScript processMultiLineLinks function:
-/// - Tracks when we're inside link content (between [ and ])
-/// - Escapes newlines with backslash when inside links
-fn process_multiline_links(markdown_content: &str) -> String {
- let mut new_markdown_content = String::new();
- let mut link_open_count: usize = 0;
-
- for ch in markdown_content.chars() {
- match ch {
- '[' => {
- link_open_count += 1;
- }
- ']' => {
- link_open_count = link_open_count.saturating_sub(1);
- }
- _ => {}
- }
-
- let inside_link_content = link_open_count > 0;
-
- if inside_link_content && ch == '\n' {
- new_markdown_content.push('\\');
- new_markdown_content.push('\n');
- } else {
- new_markdown_content.push(ch);
- }
- }
-
- new_markdown_content
-}
-
-/// Removes "Skip to Content" links from the markdown content
-///
-/// This function replicates the JavaScript removeSkipToContentLinks function:
-/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns
-/// - Case-insensitive matching
-fn remove_skip_to_content_links(markdown_content: &str) -> String {
- let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap();
- re.replace_all(markdown_content, "").to_string()
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_parse_markdown_simple() {
-        let html = "<p>Hello, world!</p>";
- let result = parse_markdown(html);
- assert_eq!(result.trim(), "Hello, world!");
- }
-
- #[test]
- fn test_parse_markdown_complex() {
-        let html =
-            "<p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul>";
- let result = parse_markdown(html);
- assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item");
- }
-
- #[test]
- fn test_parse_markdown_empty() {
- let html = "";
- let result = parse_markdown(html);
- assert_eq!(result, "");
- }
-
- #[test]
- fn test_process_multiline_links() {
- let markdown = "[Link\nwith newline](http://example.com)";
- let result = process_multiline_links(markdown);
- assert_eq!(result, "[Link\\\nwith newline](http://example.com)");
- }
-
- #[test]
- fn test_remove_skip_to_content_links() {
- let markdown = "Some content [Skip to Content](#page) more content";
- let result = remove_skip_to_content_links(markdown);
- assert_eq!(result, "Some content more content");
- }
-
- #[test]
- fn test_remove_skip_to_content_links_case_insensitive() {
- let markdown = "Some content [Skip to content](#skip) more content";
- let result = remove_skip_to_content_links(markdown);
- assert_eq!(result, "Some content more content");
- }
-}
diff --git a/src/bless_crawl/html_transform.rs b/src/bless_crawl/html_transform.rs
deleted file mode 100644
index 8c56ebe..0000000
--- a/src/bless_crawl/html_transform.rs
+++ /dev/null
@@ -1,374 +0,0 @@
-use kuchikiki::{parse_html, traits::TendrilSink};
-use serde::{Deserialize, Serialize};
-use url::Url;
-
-const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
- "header",
- "footer",
- "nav",
- "aside",
- ".header",
- ".top",
- ".navbar",
- "#header",
- ".footer",
- ".bottom",
- "#footer",
- ".sidebar",
- ".side",
- ".aside",
- "#sidebar",
- ".modal",
- ".popup",
- "#modal",
- ".overlay",
- ".ad",
- ".ads",
- ".advert",
- "#ad",
- ".lang-selector",
- ".language",
- "#language-selector",
- ".social",
- ".social-media",
- ".social-links",
- "#social",
- ".menu",
- ".navigation",
- "#nav",
- ".breadcrumbs",
- "#breadcrumbs",
- ".share",
- "#share",
- ".widget",
- "#widget",
- ".cookie",
- "#cookie",
-];
-
-const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [
- "#main",
-    // swoogo event software uses .widget on all of their content
- ".swoogo-cols",
- ".swoogo-text",
- ".swoogo-table-div",
- ".swoogo-space",
- ".swoogo-alert",
- ".swoogo-sponsors",
- ".swoogo-title",
- ".swoogo-tabs",
- ".swoogo-logo",
- ".swoogo-image",
- ".swoogo-button",
- ".swoogo-agenda",
-];
-
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct TransformHtmlOptions {
- pub html: String,
- pub url: String,
-    pub include_tags: Vec<String>,
-    pub exclude_tags: Vec<String>,
- pub only_main_content: bool,
-}
-
-#[derive(Debug)]
-struct ImageSource {
- url: String,
- size: i32,
- is_x: bool,
-}
-
-#[derive(Debug)]
-pub enum HtmlTransformError {
- ParseError,
- UrlParseError,
- SelectError,
-}
-
-impl std::fmt::Display for HtmlTransformError {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self {
- HtmlTransformError::ParseError => write!(f, "Failed to parse HTML"),
- HtmlTransformError::UrlParseError => write!(f, "Failed to parse URL"),
- HtmlTransformError::SelectError => write!(f, "Failed to select HTML elements"),
- }
- }
-}
-
-impl std::error::Error for HtmlTransformError {}
-
-/// Transforms HTML by removing unwanted elements, filtering tags, and processing URLs
-pub fn transform_html(opts: TransformHtmlOptions) -> Result<String, HtmlTransformError> {
- let mut document = parse_html().one(opts.html);
-
- // If include_tags is specified, only include those tags
- if !opts.include_tags.is_empty() {
-        let new_document = parse_html().one("<div></div>");
- let root = new_document
- .select_first("div")
- .map_err(|_| HtmlTransformError::SelectError)?;
-
- for tag_selector in opts.include_tags.iter() {
- let matching_nodes: Vec<_> = document
- .select(tag_selector)
- .map_err(|_| HtmlTransformError::SelectError)?
- .collect();
- for tag in matching_nodes {
- root.as_node().append(tag.as_node().clone());
- }
- }
-
- document = new_document;
- }
-
- // Remove unwanted elements
- let unwanted_selectors = ["head", "meta", "noscript", "style", "script"];
- for selector in &unwanted_selectors {
- while let Ok(element) = document.select_first(selector) {
- element.as_node().detach();
- }
- }
-
- // Remove excluded tags
- for tag_selector in opts.exclude_tags.iter() {
- while let Ok(element) = document.select_first(tag_selector) {
- element.as_node().detach();
- }
- }
-
- // Remove non-main content if requested
- if opts.only_main_content {
- for selector in EXCLUDE_NON_MAIN_TAGS.iter() {
- let elements: Vec<_> = document
- .select(selector)
- .map_err(|_| HtmlTransformError::SelectError)?
- .collect();
- for element in elements {
- // Check if this element contains any force-include tags
- let should_keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|force_selector| {
- element
- .as_node()
- .select(force_selector)
- .map(|mut iter| iter.next().is_some())
- .unwrap_or(false)
- });
-
- if !should_keep {
- element.as_node().detach();
- }
- }
- }
- }
-
- // Process images with srcset attributes
- let srcset_images: Vec<_> = document
- .select("img[srcset]")
- .map_err(|_| HtmlTransformError::SelectError)?
- .collect();
-
- for img in srcset_images {
- let srcset = img.attributes.borrow().get("srcset").map(|s| s.to_string());
- if let Some(srcset) = srcset {
-            let mut sizes: Vec<ImageSource> = srcset
- .split(',')
- .filter_map(|entry| {
- let tokens: Vec<&str> = entry.trim().split(' ').collect();
- if tokens.is_empty() {
- return None;
- }
-
- let size_token = if tokens.len() > 1 && !tokens[1].is_empty() {
- tokens[1]
- } else {
- "1x"
- };
-
- if let Ok(parsed_size) = size_token[..size_token.len() - 1].parse() {
- Some(ImageSource {
- url: tokens[0].to_string(),
- size: parsed_size,
- is_x: size_token.ends_with('x'),
- })
- } else {
- None
- }
- })
- .collect();
-
- // Add src attribute as 1x if all sizes are x-based
- if sizes.iter().all(|s| s.is_x) {
- let src = img.attributes.borrow().get("src").map(|s| s.to_string());
- if let Some(src) = src {
- sizes.push(ImageSource {
- url: src,
- size: 1,
- is_x: true,
- });
- }
- }
-
- // Sort by size (largest first) and use the biggest image
- sizes.sort_by(|a, b| b.size.cmp(&a.size));
- if let Some(biggest) = sizes.first() {
- img.attributes
- .borrow_mut()
- .insert("src", biggest.url.clone());
- }
- }
- }
-
- // Convert relative URLs to absolute URLs
- let base_url = Url::parse(&opts.url).map_err(|_| HtmlTransformError::UrlParseError)?;
-
- // Process image src attributes
- let src_images: Vec<_> = document
- .select("img[src]")
- .map_err(|_| HtmlTransformError::SelectError)?
- .collect();
- for img in src_images {
- let old_src = img.attributes.borrow().get("src").map(|s| s.to_string());
- if let Some(old_src) = old_src {
- if let Ok(new_url) = base_url.join(&old_src) {
- img.attributes
- .borrow_mut()
- .insert("src", new_url.to_string());
- }
- }
- }
-
- // Process anchor href attributes
- let href_anchors: Vec<_> = document
- .select("a[href]")
- .map_err(|_| HtmlTransformError::SelectError)?
- .collect();
- for anchor in href_anchors {
- let old_href = anchor
- .attributes
- .borrow()
- .get("href")
- .map(|s| s.to_string());
- if let Some(old_href) = old_href {
- if let Ok(new_url) = base_url.join(&old_href) {
- anchor
- .attributes
- .borrow_mut()
- .insert("href", new_url.to_string());
- }
- }
- }
-
- Ok(document.to_string())
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_transform_html_removes_unwanted_elements() {
- let opts = TransformHtmlOptions {
- html: "TestContent
".to_string(),
- url: "https://example.com".to_string(),
- include_tags: vec![],
- exclude_tags: vec![],
- only_main_content: false,
- };
-
- let result = transform_html(opts).unwrap();
- let expected = "Content
";
- assert_eq!(result, expected);
- }
-
- #[test]
- fn test_transform_html_include_tags() {
- let opts = TransformHtmlOptions {
- html: "Keep this
".to_string(),
- url: "https://example.com".to_string(),
- include_tags: vec![".content".to_string()],
- exclude_tags: vec![],
- only_main_content: false,
- };
-
- let result = transform_html(opts).unwrap();
- let expected =
- "";
- assert_eq!(result, expected);
- }
-
- #[test]
- fn test_transform_html_exclude_tags() {
- let opts = TransformHtmlOptions {
- html: "Keep this
Remove this
".to_string(),
- url: "https://example.com".to_string(),
- include_tags: vec![],
- exclude_tags: vec![".ad".to_string()],
- only_main_content: false,
- };
-
- let result = transform_html(opts).unwrap();
- let expected = "Keep this
";
- assert_eq!(result, expected);
- }
-
- #[test]
- fn test_transform_html_relative_urls() {
- let opts = TransformHtmlOptions {
- html: r#"
Link"#
- .to_string(),
- url: "https://example.com/subdir/".to_string(),
- include_tags: vec![],
- exclude_tags: vec![],
- only_main_content: false,
- };
-
- let result = transform_html(opts).unwrap();
- let expected = r#"
Link"#;
- assert_eq!(result, expected);
- }
-
- #[test]
- fn test_transform_html_only_main_content() {
- let opts = TransformHtmlOptions {
- html: "Main content
".to_string(),
- url: "https://example.com".to_string(),
- include_tags: vec![],
- exclude_tags: vec![],
- only_main_content: true,
- };
-
- let result = transform_html(opts).unwrap();
- let expected = "Main content
";
- assert_eq!(result, expected);
- }
-
- #[test]
- fn test_transform_html_srcset_processing() {
- let opts = TransformHtmlOptions {
- html: r#"
"#.to_string(),
- url: "https://example.com".to_string(),
- include_tags: vec![],
- exclude_tags: vec![],
- only_main_content: false,
- };
-
- let result = transform_html(opts).unwrap();
- let expected = r#"
"#;
- assert_eq!(result, expected);
- }
-
- #[test]
- fn test_transform_html_force_include_tags() {
- let opts = TransformHtmlOptions {
- html: r#""#.to_string(),
- url: "https://example.com".to_string(),
- include_tags: vec![],
- exclude_tags: vec![],
- only_main_content: true,
- };
-
- let result = transform_html(opts).unwrap();
- let expected = r#""#;
- assert_eq!(result, expected);
- }
-}
diff --git a/src/bless_crawl/mod.rs b/src/bless_crawl/mod.rs
deleted file mode 100644
index 2a6e542..0000000
--- a/src/bless_crawl/mod.rs
+++ /dev/null
@@ -1,642 +0,0 @@
-//! # BlessCrawl - Distributed Web Scraping SDK
-//!
-//! Provides distributed web scraping across the BLESS network's browser nodes.
-//!
-//! ## Features
-//!
-//! - **scrape()**: Extract content from a URL as markdown
-//! - **map()**: Discover and extract all links from a webpage
-//! - **crawl()**: Recursively crawl websites with depth controls
-//!
-//! ## Limits
-//!
-//! - Timeout: 15s default, 120s max
-//! - Wait time: 3s default, 20s max
-//! - Buffer sizes: 2MB (scrape), 1MB (map), 8MB (crawl)
-
-mod html_to_markdown;
-mod html_transform;
-
-use html_to_markdown::parse_markdown;
-pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions};
-use std::collections::HashMap;
-
-type Handle = u32;
-type ExitCode = u8;
-
-#[cfg(not(feature = "mock-ffi"))]
-#[link(wasm_import_module = "bless_crawl")]
-extern "C" {
- /// Scrape webpage content and return as markdown
- #[allow(clippy::too_many_arguments)]
- fn scrape(
- h: *mut Handle,
- url_ptr: *const u8,
- url_len: usize,
- options_ptr: *const u8,
- options_len: usize,
- result_ptr: *mut u8,
- result_len: usize,
- bytes_written: *mut usize,
- ) -> ExitCode;
-
- /// Close and cleanup a web scraper instance
- fn close(h: Handle) -> ExitCode;
-}
-
-#[cfg(feature = "mock-ffi")]
-#[allow(unused_variables)]
-mod mock_ffi {
- use super::{ExitCode, Handle};
-
- #[allow(clippy::too_many_arguments)]
- pub unsafe fn scrape(
- h: *mut Handle,
- _url_ptr: *const u8,
- _url_len: usize,
- _options_ptr: *const u8,
- _options_len: usize,
- result_ptr: *mut u8,
- result_len: usize,
- bytes_written: *mut usize,
- ) -> ExitCode {
- 1
- }
-
- pub unsafe fn close(_h: Handle) -> ExitCode {
- 1
- }
-}
-
-#[cfg(feature = "mock-ffi")]
-use mock_ffi::*;
-
-#[derive(Debug, Clone, PartialEq, serde::Serialize)]
-pub struct ScrapeOptions {
- pub timeout: u32,
- pub wait_time: u32,
- #[serde(skip_serializing_if = "Option::is_none")]
-    pub include_tags: Option<Vec<String>>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub exclude_tags: Option<Vec<String>>,
-    pub only_main_content: bool,
-    pub format: Format,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub viewport: Option<Viewport>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub user_agent: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub headers: Option<HashMap<String, String>>,
-}
-
-impl Default for ScrapeOptions {
- fn default() -> Self {
- Self {
- timeout: BlessCrawl::DEFAULT_TIMEOUT_MS,
- wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS,
- include_tags: None,
- exclude_tags: None,
- only_main_content: false,
- format: Format::Markdown,
- viewport: None,
- user_agent: None,
- headers: None,
- }
- }
-}
-
-#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
-pub enum Format {
- #[default]
- #[serde(rename = "markdown")]
- Markdown,
- #[serde(rename = "html")]
- Html,
- #[serde(rename = "json")]
- Json,
-}
-
-impl std::str::FromStr for Format {
- type Err = ();
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
- match s {
- "markdown" => Ok(Format::Markdown),
- "html" => Ok(Format::Html),
- "json" => Ok(Format::Json),
- _ => Err(()),
- }
- }
-}
-
-#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
-pub struct Viewport {
- #[serde(skip_serializing_if = "Option::is_none")]
-    pub width: Option<u32>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub height: Option<u32>,
-}
-
-#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
-pub struct MapOptions {
- #[serde(skip_serializing_if = "Option::is_none")]
-    pub link_types: Option<Vec<String>>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub base_url: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub filter_extensions: Option<Vec<String>>,
-}
-
-#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
-pub struct CrawlOptions {
- #[serde(skip_serializing_if = "Option::is_none")]
-    pub limit: Option<u32>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub max_depth: Option<u8>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub exclude_paths: Option<Vec<String>>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub include_paths: Option<Vec<String>>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub follow_external: Option<bool>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub delay_between_requests: Option<u32>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub parallel_requests: Option<u32>,
-}
-
-#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
-pub struct PageMetadata {
- #[serde(skip_serializing_if = "Option::is_none")]
-    pub title: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub description: Option<String>,
-    pub url: String,
-    pub status_code: u16,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub language: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub keywords: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub robots: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub author: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub creator: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub publisher: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_title: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_description: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_image: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_url: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_site_name: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub og_type: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_title: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_description: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_image: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_card: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_site: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub twitter_creator: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub favicon: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub viewport: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub referrer: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub content_type: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub scrape_id: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub source_url: Option<String>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub proxy_used: Option<String>,
-}
-
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct ScrapeData {
- pub success: bool,
- pub timestamp: u64,
- pub format: Format,
- pub content: String,
- pub metadata: PageMetadata,
-}
-
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct Response<T> {
-    pub success: bool,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub error: Option<String>,
- pub data: T,
-}
-
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct LinkInfo {
- pub url: String,
- // TODO: use enum instead of string
- pub link_type: String, // "internal", "external", "anchor"
-}
-
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct MapData {
- pub url: String,
-    pub links: Vec<LinkInfo>,
- pub total_links: usize,
- pub timestamp: u64,
-}
-
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct CrawlError {
- pub url: String,
- pub error: String,
- pub depth: u32,
-}
-
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct CrawlData {
- pub root_url: String,
-    pub pages: Vec<ScrapeData>,
- #[serde(skip_serializing_if = "Option::is_none")]
- pub link_map: Option,
- pub depth_reached: u8,
- pub total_pages: usize,
-    pub errors: Vec<CrawlError>,
-}
-
-impl ScrapeOptions {
- pub fn new() -> Self {
- Self::default()
- }
-
-    pub fn with_include_tags(mut self, tags: Vec<String>) -> Self {
- self.include_tags = Some(tags);
- self
- }
-
-    pub fn with_exclude_tags(mut self, tags: Vec<String>) -> Self {
- self.exclude_tags = Some(tags);
- self
- }
-
- pub fn with_format(mut self, format: Format) -> Self {
- self.format = format;
- self
- }
-
- pub fn with_viewport(mut self, width: u32, height: u32) -> Self {
- self.viewport = Some(Viewport {
- width: Some(width),
- height: Some(height),
- });
- self
- }
-
- pub fn with_user_agent(mut self, user_agent: String) -> Self {
- self.user_agent = Some(user_agent);
- self
- }
-
-    pub fn with_headers(mut self, headers: HashMap<String, String>) -> Self {
- self.headers = Some(headers);
- self
- }
-}
-
-impl MapOptions {
- pub fn new() -> Self {
- Self::default()
- }
-
-    pub fn with_link_types(mut self, link_types: Vec<String>) -> Self {
- self.link_types = Some(link_types);
- self
- }
-
- pub fn with_base_url(mut self, base_url: String) -> Self {
- self.base_url = Some(base_url);
- self
- }
-
-    pub fn with_filter_extensions(mut self, extensions: Vec<String>) -> Self {
- self.filter_extensions = Some(extensions);
- self
- }
-}
-
-impl CrawlOptions {
- pub fn new() -> Self {
- Self::default()
- }
-
- pub fn with_limit(mut self, limit: u32) -> Self {
- self.limit = Some(limit);
- self
- }
-
- pub fn with_max_depth(mut self, max_depth: u8) -> Self {
- self.max_depth = Some(max_depth);
- self
- }
-
-    pub fn with_exclude_paths(mut self, paths: Vec<String>) -> Self {
- self.exclude_paths = Some(paths);
- self
- }
-
-    pub fn with_include_paths(mut self, paths: Vec<String>) -> Self {
- self.include_paths = Some(paths);
- self
- }
-
- pub fn with_follow_external(mut self, follow: bool) -> Self {
- self.follow_external = Some(follow);
- self
- }
-
- pub fn with_delay_between_requests(mut self, delay: u32) -> Self {
- self.delay_between_requests = Some(delay);
- self
- }
-
- pub fn with_parallel_requests(mut self, parallel: u32) -> Self {
- self.parallel_requests = Some(parallel);
- self
- }
-}
-
-/// BlessCrawl client for distributed web scraping operations.
-#[derive(Debug, Clone, Default)]
-pub struct BlessCrawl {
- inner: Handle,
- config: ScrapeOptions,
-}
-
-impl BlessCrawl {
- /// Default timeout in milliseconds (15 seconds)
- pub const DEFAULT_TIMEOUT_MS: u32 = 15000;
- /// Default wait time in milliseconds (3 seconds)
- pub const DEFAULT_WAIT_TIME_MS: u32 = 3000;
-
- /// Maximum timeout in milliseconds (2 minutes)
- pub const MAX_TIMEOUT_MS: u32 = 120000;
- /// Maximum wait time in milliseconds (20 seconds)
- pub const MAX_WAIT_TIME_MS: u32 = 20000;
-
-    /// Maximum scrape result buffer size in bytes (2MB)
- pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024;
-
-    /// Maximum map result buffer size in bytes (1MB)
- pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024;
-
-    /// Maximum crawl result buffer size in bytes (8MB)
- pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024;
-
- /// Creates a new BlessCrawl instance with the given configuration.
-    pub fn with_config(config: ScrapeOptions) -> Result<Self, WebScrapeErrorKind> {
- let instance = Self { inner: 0, config };
- instance.validate_config(&instance.config)?;
- Ok(instance)
- }
-
- fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> {
- if config.timeout > Self::MAX_TIMEOUT_MS {
- return Err(WebScrapeErrorKind::InvalidTimeout);
- }
- if config.wait_time > Self::MAX_WAIT_TIME_MS {
- return Err(WebScrapeErrorKind::InvalidWaitTime);
- }
- Ok(())
- }
-
- /// Returns a reference to the current configuration.
- pub fn get_config(&self) -> &ScrapeOptions {
- &self.config
- }
-
- pub fn handle(&self) -> Handle {
- self.inner
- }
-
- /// Scrapes webpage content and returns it as markdown with metadata.
- pub fn scrape(
- &self,
- url: &str,
-        options: Option<ScrapeOptions>,
-    ) -> Result<Response<ScrapeData>, WebScrapeErrorKind> {
- // Use provided options or fall back to instance config
- let config = if let Some(opts) = options {
- self.validate_config(&opts)?;
- opts
- } else {
- self.config.clone()
- };
-
- let options_json = serde_json::to_vec(&config).unwrap();
-
- let mut handle = self.inner;
- let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE];
- let mut bytes_written: usize = 0;
-
- let code = unsafe {
- scrape(
- &mut handle,
- url.as_ptr(),
- url.len(),
- options_json.as_ptr(),
- options_json.len(),
- result_buf.as_mut_ptr(),
- result_buf.len(),
- &mut bytes_written,
- )
- };
-
- if code != 0 {
- return Err(code.into());
- }
- if bytes_written == 0 {
- return Err(WebScrapeErrorKind::EmptyResponse);
- }
- if bytes_written > result_buf.len() {
- return Err(WebScrapeErrorKind::MemoryError);
- }
-
- let result_bytes =
- unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) };
-
-        // deserialize the host result into a Response<ScrapeData>
-        let mut scrape_response = serde_json::from_slice::<Response<ScrapeData>>(result_bytes)
- .map_err(|e| {
- eprintln!("error: {:?}", e);
- WebScrapeErrorKind::ParseError
- })?;
-
- if let Some(error) = scrape_response.error {
- return Err(WebScrapeErrorKind::RuntimeError(error));
- }
-
- // post-process html
- scrape_response.data.content = transform_html(TransformHtmlOptions {
- html: scrape_response.data.content,
- url: scrape_response.data.metadata.url.clone(),
- include_tags: config.include_tags.unwrap_or_default(),
- exclude_tags: config.exclude_tags.unwrap_or_default(),
- only_main_content: config.only_main_content,
- })
- .map_err(|e| {
- eprintln!("error: {:?}", e);
- WebScrapeErrorKind::TransformError
- })?;
-
-        // if the requested format is markdown, convert the transformed HTML to markdown
- match config.format {
- Format::Markdown => {
- scrape_response.data.content = parse_markdown(&scrape_response.data.content);
- }
- Format::Html => (), // no need to do anything
- Format::Json => unimplemented!(),
- }
-
-        // return the fully processed response
- Ok(scrape_response)
- }
-
- /// Extracts all links from a webpage, categorized by type.
- pub fn map(
- &self,
- url: &str,
-        options: Option<MapOptions>,
-    ) -> Result<Response<MapData>, WebScrapeErrorKind> {
- let _map_options = options.unwrap_or_default();
-
- // let scrape_response = self.scrape(url, None)?;
- // TODO: implement map by post-processing the scrape response or using fetch
-
- Ok(Response {
- success: true,
- error: None,
- data: MapData {
- url: url.to_string(),
- links: vec![],
- total_links: 0,
- timestamp: 0,
- },
- })
- }
-
- /// Recursively crawls a website with configurable depth and filtering.
- pub fn crawl(
- &self,
- url: &str,
-        options: Option<CrawlOptions>,
-    ) -> Result<Response<CrawlData>, WebScrapeErrorKind> {
- let _crawl_options = options.unwrap_or_default();
-
- // TODO: implement crawl by post-processing the scrape response or using fetch
-
- Ok(Response {
- success: true,
- error: None,
- data: CrawlData {
- root_url: url.to_string(),
- pages: vec![],
- link_map: None,
- depth_reached: 0,
- total_pages: 0,
- errors: vec![],
- },
- })
- }
-}
-
-impl Drop for BlessCrawl {
- fn drop(&mut self) {
- // if the handle is 0, it means the instance was never initialized on the host
- if self.inner == 0 {
- return;
- }
- let code = unsafe { close(self.inner) };
- if code != 0 {
- eprintln!("Error closing web scraper: {}", code);
- }
- }
-}
-
-#[derive(Debug)]
-pub enum WebScrapeErrorKind {
- InvalidUrl,
- Timeout,
- NetworkError,
- RenderingError,
- MemoryError,
- DepthExceeded,
- RateLimited,
- TransformError,
- Utf8Error,
- ParseError,
- ScrapeFailed,
- MapFailed,
- CrawlFailed,
- EmptyResponse,
- InvalidTimeout,
- InvalidWaitTime,
- RuntimeError(String),
-}
-
-impl From<u8> for WebScrapeErrorKind {
- fn from(code: u8) -> Self {
- match code {
- 1 => WebScrapeErrorKind::InvalidUrl,
- 2 => WebScrapeErrorKind::Timeout,
- 3 => WebScrapeErrorKind::NetworkError,
- 4 => WebScrapeErrorKind::RenderingError,
- 5 => WebScrapeErrorKind::MemoryError,
- 6 => WebScrapeErrorKind::DepthExceeded,
- 7 => WebScrapeErrorKind::RateLimited,
- 8 => WebScrapeErrorKind::TransformError,
- 9 => WebScrapeErrorKind::RuntimeError(String::from("Invalid timeout")),
- 10 => WebScrapeErrorKind::RuntimeError(String::from("Invalid wait time")),
- _ => WebScrapeErrorKind::RuntimeError(String::from("Unknown error")),
- }
- }
-}
-
-impl std::fmt::Display for WebScrapeErrorKind {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self {
- WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"),
- WebScrapeErrorKind::Timeout => write!(f, "Request timeout"),
- WebScrapeErrorKind::NetworkError => write!(f, "Network error"),
- WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"),
- WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"),
- WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"),
- WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"),
- WebScrapeErrorKind::TransformError => write!(f, "Transform error"),
- WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"),
- WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"),
- WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"),
- WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"),
- WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation failed"),
- WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"),
- WebScrapeErrorKind::InvalidTimeout => {
- write!(f, "Timeout exceeds maximum allowed (120s)")
- }
- WebScrapeErrorKind::InvalidWaitTime => {
- write!(f, "Wait time exceeds maximum allowed (20s)")
- }
- WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error),
- }
- }
-}
-
-impl std::error::Error for WebScrapeErrorKind {}
diff --git a/src/lib.rs b/src/lib.rs
index 80bfe1d..f399128 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,6 +15,3 @@ pub mod socket;
#[cfg(feature = "http")]
pub mod http;
-
-#[cfg(feature = "bless-crawl")]
-pub mod bless_crawl;