From 285789b236c73d88cb75c66b3a64c31be476e7e7 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 5 Nov 2025 20:20:25 +0100 Subject: [PATCH 01/19] Implement dictionary module for RDF term encoding and add integration tests --- src/indexing/dictionary.rs | 88 +++++ src/indexing/mod.rs | 2 + src/indexing/shared.rs | 22 ++ src/indexing/sparse.rs | 156 +++++++- tests/dictionary_encoding_test.rs | 622 ++++++++++++++++++++++++++++++ 5 files changed, 889 insertions(+), 1 deletion(-) create mode 100644 src/indexing/dictionary.rs create mode 100644 tests/dictionary_encoding_test.rs diff --git a/src/indexing/dictionary.rs b/src/indexing/dictionary.rs new file mode 100644 index 0000000..d34a7ac --- /dev/null +++ b/src/indexing/dictionary.rs @@ -0,0 +1,88 @@ +use std::collections::HashMap; +use std::fs::File; +use std::io::{Read, Write}; +use std::path::Path; + +#[derive(Debug)] +pub struct Dictionary { + uri_to_id: HashMap, + id_to_uri: Vec, + next_id: u64, +} + +impl Dictionary { + pub fn new() -> Self { + Self { uri_to_id: HashMap::new(), id_to_uri: Vec::new(), next_id: 0 } + } + + pub fn fetch_id(&mut self, uri: &str) -> u64 { + if let Some(&id) = self.uri_to_id.get(uri) { + id + } else { + let id = self.next_id; + self.uri_to_id.insert(uri.to_string(), id); + self.id_to_uri.push(uri.to_string()); + self.next_id += 1; + id + } + } + + pub fn fetch_uri(&self, id: u64) -> Option<&str> { + self.id_to_uri.get(id as usize).map(|s| s.as_str()) + } + + pub fn len(&self) -> usize { + self.uri_to_id.len() + } + + pub fn is_empty(&self) -> bool { + self.uri_to_id.is_empty() + } + + pub fn save_to_file(&self, path: &Path) -> std::io::Result<()> { + let mut file = File::create(path)?; + file.write_all(&(self.id_to_uri.len() as u64).to_be_bytes())?; + + for uri in &self.id_to_uri { + let uri_bytes = uri.as_bytes(); + file.write_all(&(uri_bytes.len() as u32).to_be_bytes())?; + file.write_all(uri_bytes)?; + } + Ok(()) + } + + pub fn load_from_file(path: &Path) -> std::io::Result { + let mut file = File::open(path)?; + let mut uri_to_id = HashMap::new(); + let mut id_to_uri = Vec::new(); + + // Reading the number of entries + let mut count_bytes = [0u8; 8]; + file.read_exact(&mut count_bytes)?; + let count = u64::from_be_bytes(count_bytes); + + // Reading each IRI Entry + + for id in 0..count { + let mut len_bytes = [0u8; 4]; + file.read_exact(&mut len_bytes)?; + + let length = u32::from_be_bytes(len_bytes) as usize; + let mut uri_bytes = vec![0u8; length]; + file.read_exact(&mut uri_bytes)?; + let uri = String::from_utf8(uri_bytes) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + + uri_to_id.insert(uri.clone(), id); + id_to_uri.push(uri); + } + + Ok(Self { uri_to_id, id_to_uri, next_id: count }) + } +} + +impl Default for Dictionary { + fn default() -> Self { + Self::new() + } +} diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs index 1fd1692..6f0579a 100644 --- a/src/indexing/mod.rs +++ b/src/indexing/mod.rs @@ -4,3 +4,5 @@ pub mod shared; pub mod dense; #[doc=""] pub mod sparse; +#[doc=""] +pub mod dictionary; diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs index 4c262f4..97a1155 100644 --- a/src/indexing/shared.rs +++ b/src/indexing/shared.rs @@ -1,5 +1,6 @@ use std::fs::File; use std::io::Write; +use crate::indexing::dictionary::Dictionary; #[doc = ""] pub const RECORD_SIZE: usize = 40; @@ -90,3 +91,24 @@ pub struct Event { #[doc = ""] pub graph: u64, } + +#[derive(Debug, Clone)] +pub struct ResolvedEvent{ + pub timestamp: u64, + pub subject: String, + pub predicate: String, + pub object: String, + pub graph: String, +} + +impl Event { + pub fn resolve(&self, dict: &Dictionary) -> ResolvedEvent { + ResolvedEvent { + timestamp: self.timestamp, + subject: dict.fetch_uri(self.subject).unwrap_or("UNKNOWN").to_string(), + predicate: dict.fetch_uri(self.predicate).unwrap_or("UNKNOWN").to_string(), + object: dict.fetch_uri(self.object).unwrap_or("UNKNOWN").to_string(), + graph: dict.fetch_uri(self.graph).unwrap_or("UNKNOWN").to_string() + } + } +} diff --git a/src/indexing/sparse.rs b/src/indexing/sparse.rs index 110b4f7..36f3a99 100644 --- a/src/indexing/sparse.rs +++ b/src/indexing/sparse.rs @@ -1,7 +1,13 @@ -use crate::indexing::shared::{decode_record, Event, RECORD_SIZE}; +use crate::indexing::dictionary::{self, Dictionary}; +use crate::indexing::shared::{decode_record, Event, ResolvedEvent, RECORD_SIZE}; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; +use std::path::Path; +/// Builder for creating sparse indexes that store only periodic entries. +/// +/// A sparse index reduces storage space by indexing only every Nth record, +/// trading some query precision for significant space savings. #[doc = ""] pub struct SparseIndexBuilder { index_file: File, @@ -9,12 +15,31 @@ pub struct SparseIndexBuilder { } #[doc = ""] impl SparseIndexBuilder { + /// Creates a new sparse index builder that writes to the specified file. + /// + /// # Arguments + /// * `index_path` - Path where the index file will be created + /// * `interval` - Number of records between index entries (e.g., 1000 means index every 1000th record) + /// + /// # Returns + /// A new `SparseIndexBuilder` instance or an I/O error #[doc = ""] pub fn create(index_path: &str, interval: usize) -> std::io::Result { let index_file = File::create(index_path)?; Ok(Self { index_file, interval }) } + /// Adds an entry to the sparse index if the record count matches the interval. + /// + /// Only records where `record_count % interval == 0` are indexed to save space. + /// + /// # Arguments + /// * `record_count` - The current record number in the log + /// * `timestamp` - Timestamp of the record + /// * `offset` - Byte offset of the record in the log file + /// + /// # Returns + /// `true` if the entry was added to the index, `false` if skipped #[doc = ""] pub fn add_entry( &mut self, @@ -31,12 +56,27 @@ impl SparseIndexBuilder { } } + /// Finalizes the index by flushing any buffered writes to disk. + /// + /// This should be called after all entries have been added. #[doc = ""] pub fn finalize(&mut self) -> std::io::Result<()> { self.index_file.flush() } } +/// Builds a sparse index for an existing log file. +/// +/// This function reads through the entire log file and creates an index +/// with entries only for records at the specified interval. +/// +/// # Arguments +/// * `log_path` - Path to the log file to index +/// * `index_path` - Path where the index file will be created +/// * `interval` - Number of records between index entries +/// +/// # Returns +/// Ok(()) on success, or an I/O error pub fn build_sparse_index( log_path: &str, index_path: &str, @@ -60,12 +100,110 @@ pub fn build_sparse_index( Ok(()) } +/// Builds a sparse index and initializes an empty dictionary. +/// +/// This is a convenience function that creates both the index and +/// an empty dictionary file. The dictionary can be populated separately +/// when processing RDF data. +/// +/// # Arguments +/// * `log_path` - Path to the log file to index +/// * `index_path` - Path where the index file will be created +/// * `dictionary_path` - Path where the dictionary file will be created +/// * `interval` - Number of records between index entries +/// +/// # Returns +/// Ok(()) on success, or an I/O error +pub fn build_sparse_index_with_dictionary( + log_path: &str, + index_path: &str, + dictionary_path: &str, + interval: &usize, +) -> std::io::Result<()> { + let mut log = File::open(log_path)?; + let mut builder = SparseIndexBuilder::create(index_path, *interval)?; + let dictionary = Dictionary::new(); + + let mut offset = 0u64; + let mut record_count = 0u64; + let mut record = [0u8; RECORD_SIZE]; + + while log.read_exact(&mut record).is_ok() { + let (timestamp, _subject, _predicate, _object, _graph) = decode_record(&record); + + builder.add_entry(record_count, timestamp, offset)?; + + offset += RECORD_SIZE as u64; + record_count += 1; + } + + builder.finalize()?; + dictionary.save_to_file(Path::new(dictionary_path))?; + + Ok(()) +} + +/// Reader for sparse indexes that enables efficient timestamp-based queries. +/// +/// The sparse reader loads the entire index into memory for fast binary search, +/// then performs sequential scans of the log file starting from the appropriate position. pub struct SparseReader { index: Vec<(u64, u64)>, + #[allow(dead_code)] interval: usize, } impl SparseReader { + /// Opens a sparse index and its associated dictionary. + /// + /// # Arguments + /// * `index_path` - Path to the sparse index file + /// * `dictionary_path` - Path to the dictionary file + /// * `interval` - The interval used when building the index + /// + /// # Returns + /// A tuple of (SparseReader, Dictionary) or an I/O error + pub fn open_with_dictionary( + index_path: &str, + dictionary_path: &str, + interval: usize, + ) -> std::io::Result<(Self, Dictionary)> { + let reader = Self::open(index_path, interval)?; + let dictionary = Dictionary::load_from_file(Path::new(dictionary_path))?; + Ok((reader, dictionary)) + } + /// Queries the log and returns results with URIs resolved from the dictionary. + /// + /// This method performs the same query as `query()` but resolves all numeric IDs + /// back to their original URI strings using the provided dictionary. + /// + /// # Arguments + /// * `log_path` - Path to the log file + /// * `dict` - Dictionary for resolving IDs to URIs + /// * `timestamp_start_bound` - Minimum timestamp (inclusive) + /// * `timestamp_end_bound` - Maximum timestamp (inclusive) + /// + /// # Returns + /// Vector of resolved events or an I/O error + pub fn query_resolved( + &self, + log_path: &str, + dict: &Dictionary, + timestamp_start_bound: u64, + timestamp_end_bound: u64, + ) -> std::io::Result> { + let events = self.query(log_path, timestamp_start_bound, timestamp_end_bound)?; + Ok(events.into_iter().map(|e| e.resolve(dict)).collect()) + } + + /// Opens a sparse index file and loads it into memory. + /// + /// # Arguments + /// * `index_path` - Path to the sparse index file + /// * `interval` - The interval used when building the index + /// + /// # Returns + /// A new SparseReader instance or an I/O error pub fn open(index_path: &str, interval: usize) -> std::io::Result { let mut index_file = File::open(index_path)?; let mut index = Vec::new(); @@ -80,6 +218,18 @@ impl SparseReader { Ok(Self { index, interval }) } + /// Queries the log file for events within the specified timestamp range. + /// + /// Uses binary search on the index to find the starting position, then + /// performs a sequential scan of the log file to collect matching events. + /// + /// # Arguments + /// * `log_path` - Path to the log file + /// * `timestamp_start_bound` - Minimum timestamp (inclusive) + /// * `timestamp_end_bound` - Maximum timestamp (inclusive) + /// + /// # Returns + /// Vector of events with numeric IDs or an I/O error pub fn query( &self, log_path: &str, @@ -120,6 +270,10 @@ impl SparseReader { Ok(results) } + /// Returns the size of the index in bytes. + /// + /// Each index entry is 16 bytes (8 bytes timestamp + 8 bytes offset), + /// so this returns `index.len() * 16`. pub fn index_size_bytes(&self) -> usize { self.index.len() * 16 } diff --git a/tests/dictionary_encoding_test.rs b/tests/dictionary_encoding_test.rs new file mode 100644 index 0000000..75c6e9d --- /dev/null +++ b/tests/dictionary_encoding_test.rs @@ -0,0 +1,622 @@ +//! Dictionary Encoding Integration Tests +//! +//! These tests verify the dictionary-based encoding system for RDF terms. +//! +//! **Important**: The dictionary stores the actual URI/literal strings WITHOUT RDF syntax: +//! - URIs: stored as "https://example.org/resource" (not "") +//! - Literals: stored as the value string (e.g., "23.5" or "2025-11-05T10:30:00Z") +//! - Datatypes: stored separately as URIs (e.g., "http://www.w3.org/2001/XMLSchema#double") +//! +//! The RDF syntax (angle brackets, quotes, ^^datatype) is handled by the RDF parser/serializer, +//! not by the dictionary encoding layer. This keeps the dictionary implementation clean and +//! format-agnostic. +//! +//! Example RDF triple in Turtle syntax: +//! ```turtle +//! "23.5"^^ . +//! ``` +//! +//! Is stored in the dictionary as 4 separate entries: +//! - Subject ID → "https://rsp.js/event1" +//! - Predicate ID → "http://www.w3.org/ns/saref#hasValue" +//! - Object ID → "23.5" (the literal value) +//! - Datatype ID → "http://www.w3.org/2001/XMLSchema#double" (if needed) + +use janus::indexing::dictionary::Dictionary; +use janus::indexing::shared::{decode_record, encode_record, LogWriter, RECORD_SIZE}; +use janus::indexing::sparse::{build_sparse_index, SparseReader}; +use std::fs; +use std::path::Path; + +#[test] +fn test_rdf_syntax_to_dictionary_mapping() { + let mut dict = Dictionary::new(); + + // RDF Triple in Turtle syntax: + // "23.5"^^ . + // + // The parser would extract these components and store them WITHOUT RDF syntax: + + // Subject: → stored as the URI string + let subject = "https://rsp.js/event1"; + let subject_id = dict.fetch_id(subject); + + // Predicate: → stored as the URI string + let predicate = "http://www.w3.org/ns/saref#hasValue"; + let predicate_id = dict.fetch_id(predicate); + + // Object: "23.5"^^xsd:double → stored as the literal value "23.5" + let object = "23.5"; + let object_id = dict.fetch_id(object); + + // Datatype: ^^ → stored as URI string + let datatype = "http://www.w3.org/2001/XMLSchema#double"; + let datatype_id = dict.fetch_id(datatype); + + // Graph: → stored as the URI string + let graph = "https://example.org/graph"; + let graph_id = dict.fetch_id(graph); + + // Verify all components are stored correctly + assert_eq!(dict.fetch_uri(subject_id), Some(subject)); + assert_eq!(dict.fetch_uri(predicate_id), Some(predicate)); + assert_eq!(dict.fetch_uri(object_id), Some(object)); + assert_eq!(dict.fetch_uri(datatype_id), Some(datatype)); + assert_eq!(dict.fetch_uri(graph_id), Some(graph)); + + // In a real system, you'd also store metadata about which IDs are literals vs URIs + // and what datatype each literal has. This test just demonstrates the string storage. +} + +#[test] +fn test_rdf_literal_datatypes() { + let mut dict = Dictionary::new(); + + // Example RDF triples with different literal types: + // + // Triple 1: "2025-11-05T10:30:00Z"^^xsd:dateTime + let timestamp_value = "2025-11-05T10:30:00Z"; + let timestamp_datatype = "http://www.w3.org/2001/XMLSchema#dateTime"; + + // Triple 2: "23.5"^^xsd:double + let temp_value = "23.5"; + let temp_datatype = "http://www.w3.org/2001/XMLSchema#double"; + + // Triple 3: "42"^^xsd:integer + let count_value = "42"; + let count_datatype = "http://www.w3.org/2001/XMLSchema#integer"; + + // Triple 4: "Sensor Reading"^^xsd:string + let label_value = "Sensor Reading"; + let label_datatype = "http://www.w3.org/2001/XMLSchema#string"; + + // Store all values and datatypes in dictionary + let timestamp_val_id = dict.fetch_id(timestamp_value); + let timestamp_dt_id = dict.fetch_id(timestamp_datatype); + + let temp_val_id = dict.fetch_id(temp_value); + let temp_dt_id = dict.fetch_id(temp_datatype); + + let count_val_id = dict.fetch_id(count_value); + let count_dt_id = dict.fetch_id(count_datatype); + + let label_val_id = dict.fetch_id(label_value); + let label_dt_id = dict.fetch_id(label_datatype); + + // Verify all are stored correctly + assert_eq!(dict.fetch_uri(timestamp_val_id), Some(timestamp_value)); + assert_eq!(dict.fetch_uri(timestamp_dt_id), Some(timestamp_datatype)); + + assert_eq!(dict.fetch_uri(temp_val_id), Some(temp_value)); + assert_eq!(dict.fetch_uri(temp_dt_id), Some(temp_datatype)); + + assert_eq!(dict.fetch_uri(count_val_id), Some(count_value)); + assert_eq!(dict.fetch_uri(count_dt_id), Some(count_datatype)); + + assert_eq!(dict.fetch_uri(label_val_id), Some(label_value)); + assert_eq!(dict.fetch_uri(label_dt_id), Some(label_datatype)); + + // Note: Datatype URIs are reused across multiple literals + // E.g., many literals will have ^^xsd:double as their datatype + assert_eq!(temp_dt_id, dict.fetch_id(temp_datatype)); // Same ID when requested again +} + +#[test] +fn test_dictionary_basic_operations() { + let mut dict = Dictionary::new(); + + // Test get_or_insert with real RDF URIs + let uri1 = "https://rsp.js/event1"; + let uri2 = "http://www.w3.org/ns/saref#hasTimestamp"; + let uri3 = "http://example.org/sensor/temperature"; + let uri4 = "http://www.w3.org/ns/ssn#observedBy"; + + // First insertion should return ID 0 + let id1 = dict.fetch_id(uri1); + assert_eq!(id1, 0); + + // Subsequent insertions should return sequential IDs + let id2 = dict.fetch_id(uri2); + assert_eq!(id2, 1); + + let id3 = dict.fetch_id(uri3); + assert_eq!(id3, 2); + + let id4 = dict.fetch_id(uri4); + assert_eq!(id4, 3); + + // Requesting same URI should return same ID + let id1_again = dict.fetch_id(uri1); + assert_eq!(id1_again, id1); + + // Test retrieval + assert_eq!(dict.fetch_uri(id1), Some(uri1)); + assert_eq!(dict.fetch_uri(id2), Some(uri2)); + assert_eq!(dict.fetch_uri(id3), Some(uri3)); + assert_eq!(dict.fetch_uri(id4), Some(uri4)); + + // Test invalid ID + assert_eq!(dict.fetch_uri(999), None); + + // Test length + assert_eq!(dict.len(), 4); + assert!(!dict.is_empty()); +} + +#[test] +fn test_dictionary_persistence() -> std::io::Result<()> { + let test_dir = "target/test_data/dict_persistence"; + let _ = fs::remove_dir_all(test_dir); + fs::create_dir_all(test_dir)?; + + let dict_path = Path::new(test_dir).join("test_dict.bin"); + + // Create and populate dictionary + let mut dict = Dictionary::new(); + let uris = vec![ + "https://example.org/resource/event001", + "http://www.w3.org/ns/saref#hasValue", + "http://www.w3.org/2001/XMLSchema#dateTime", + "https://solid.ti.rw.fau.de/public/ns/stream#", + ]; + + let ids: Vec = uris.iter().map(|uri| dict.fetch_id(uri)).collect(); + + // Save to file + dict.save_to_file(&dict_path)?; + + // Load from file + let loaded_dict = Dictionary::load_from_file(&dict_path)?; + + // Verify all URIs are preserved with correct IDs + for (i, uri) in uris.iter().enumerate() { + assert_eq!(loaded_dict.fetch_uri(ids[i]), Some(*uri)); + } + + assert_eq!(loaded_dict.len(), uris.len()); + + Ok(()) +} + +#[test] +fn test_rdf_event_encoding_with_dictionary() { + let mut dict = Dictionary::new(); + + // RDF Quad in N-Quads syntax would look like: + // "2025-11-05T10:30:00Z"^^ . + // + // But we store the actual string values WITHOUT syntax markers: + + let subject_uri = "https://rsp.js/event/sensor-reading-001"; + let predicate_uri = "http://www.w3.org/ns/saref#hasTimestamp"; + let object_uri = "2025-11-05T10:30:00Z"; // The literal value itself + let graph_uri = "https://solid.ti.rw.fau.de/public/ns/stream#default"; + + // Map URIs to IDs + let timestamp: u64 = 1699181400; + let subject_id = dict.fetch_id(subject_uri); + let predicate_id = dict.fetch_id(predicate_uri); + let object_id = dict.fetch_id(object_uri); + let graph_id = dict.fetch_id(graph_uri); + + // Encode record with IDs + let mut buffer = [0u8; RECORD_SIZE]; + encode_record(&mut buffer, timestamp, subject_id, predicate_id, object_id, graph_id); + + // Decode record + let (dec_timestamp, dec_subject, dec_predicate, dec_object, dec_graph) = decode_record(&buffer); + + // Verify IDs are correctly encoded/decoded + assert_eq!(dec_timestamp, timestamp); + assert_eq!(dec_subject, subject_id); + assert_eq!(dec_predicate, predicate_id); + assert_eq!(dec_object, object_id); + assert_eq!(dec_graph, graph_id); + + // Resolve IDs back to URIs + assert_eq!(dict.fetch_uri(dec_subject), Some(subject_uri)); + assert_eq!(dict.fetch_uri(dec_predicate), Some(predicate_uri)); + assert_eq!(dict.fetch_uri(dec_object), Some(object_uri)); + assert_eq!(dict.fetch_uri(dec_graph), Some(graph_uri)); +} + +#[test] +fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> { + let test_dir = "target/test_data/iot_sensor"; + let _ = fs::remove_dir_all(test_dir); + fs::create_dir_all(test_dir)?; + + let log_path = format!("{}/iot_sensor.log", test_dir); + let mut dict = Dictionary::new(); + + // Define common IoT RDF predicates and graph URIs + let predicates = vec![ + "http://www.w3.org/ns/saref#hasTimestamp", + "http://www.w3.org/ns/saref#hasValue", + "http://www.w3.org/ns/ssn#observedBy", + "http://www.w3.org/ns/sosa#observedProperty", + ]; + + // Map predicates to IDs first (these will be reused) + let predicate_ids: Vec = predicates.iter().map(|p| dict.fetch_id(p)).collect(); + + let graph_uri = "https://solid.ti.rw.fau.de/public/ns/stream#iot"; + let graph_id = dict.fetch_id(graph_uri); + + // Create log writer + let mut writer = LogWriter::create(&log_path)?; + + // Generate 100 IoT sensor events with unique event IDs but shared predicates + for i in 0..100 { + let timestamp = 1699181400 + i; + + // Each event has unique subject (sensor reading ID) + let subject_uri = format!("https://rsp.js/event/sensor-reading-{:03}", i); + let subject_id = dict.fetch_id(&subject_uri); + + // Rotate through predicates (demonstrating reuse) + let predicate_id = predicate_ids[(i % predicate_ids.len() as u64) as usize]; + + // Unique object (sensor value) + let object_uri = format!("value-{}", i * 10); + let object_id = dict.fetch_id(&object_uri); + + writer.append_record(timestamp, subject_id, predicate_id, object_id, graph_id)?; + } + + writer.flush()?; + + // Verify dictionary statistics + // We should have: + // - 100 unique subjects + // - 4 predicates (reused) + // - 100 unique objects + // - 1 graph URI + // Total: 205 unique URIs + assert_eq!(dict.len(), 205); + + // Verify predicate reuse - predicates should have low IDs (0-3) + for (i, pred) in predicates.iter().enumerate() { + assert_eq!(dict.fetch_id(pred), i as u64); + } + + Ok(()) +} + +#[test] +fn test_sparse_index_with_dictionary_integration() -> std::io::Result<()> { + let test_dir = "target/test_data/sparse_integration"; + let _ = fs::remove_dir_all(test_dir); + fs::create_dir_all(test_dir)?; + + let log_path = format!("{}/indexed_sensor.log", test_dir); + let index_path = format!("{}/indexed_sensor.idx", test_dir); + let dict_path = format!("{}/indexed_sensor_dict.bin", test_dir); + + let mut dict = Dictionary::new(); + + // Define RDF components + let predicates = + vec!["http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue"]; + + let predicate_ids: Vec = predicates.iter().map(|p| dict.fetch_id(p)).collect(); + + let graph_uri = "https://example.org/graph/sensors"; + let graph_id = dict.fetch_id(graph_uri); + + // Create log with 1000 events + let mut writer = LogWriter::create(&log_path)?; + + for i in 0..1000 { + let timestamp = i; + let subject_uri = format!("https://rsp.js/event/{:04}", i); + let subject_id = dict.fetch_id(&subject_uri); + let predicate_id = predicate_ids[(i % 2) as usize]; + let object_uri = format!("reading-{}", i); + let object_id = dict.fetch_id(&object_uri); + + writer.append_record(timestamp, subject_id, predicate_id, object_id, graph_id)?; + } + + writer.flush()?; + + // Save dictionary BEFORE building index + dict.save_to_file(Path::new(&dict_path))?; + + // Build sparse index (without dictionary parameter since we saved it separately) + build_sparse_index(&log_path, &index_path, &100)?; + + // Load dictionary and reader + let (reader, loaded_dict) = SparseReader::open_with_dictionary(&index_path, &dict_path, 100)?; + + // Query a range and verify results + let results = reader.query_resolved(&log_path, &loaded_dict, 100, 199)?; + + // Should get 100 events (timestamps 100-199) + assert_eq!(results.len(), 100); + + // Verify first result has resolved URIs + assert!(results[0].subject.starts_with("https://rsp.js/event/")); + assert!(results[0].predicate.starts_with("http://www.w3.org/ns/saref#")); + assert!(results[0].object.starts_with("reading-")); + assert_eq!(results[0].graph, graph_uri); + + // Verify timestamps are in order + for (i, event) in results.iter().enumerate() { + assert_eq!(event.timestamp, 100 + i as u64); + } + + Ok(()) +} + +#[test] +fn test_large_uri_handling() { + let mut dict = Dictionary::new(); + + // Test with very long URIs (realistic for RDF) + let long_uri = format!( + "https://solid.ti.rw.fau.de/public/2025/11/05/sensors/building-3/floor-2/room-205/temperature-sensor-{}/reading-{}", + "TMP-4532-XYZ-9871-ABC-DEF", + "measurement-with-very-long-identifier-12345678901234567890" + ); + + let id = dict.fetch_id(&long_uri); + assert_eq!(id, 0); + + // Verify retrieval works + assert_eq!(dict.fetch_uri(id), Some(long_uri.as_str())); + + // Test that we can handle many long URIs + for i in 0..100 { + let uri = format!( + "https://example.org/very/long/path/to/resource/{}/subresource/{}/final-resource-{}", + i, + i * 2, + i * 3 + ); + dict.fetch_id(&uri); + } + + assert_eq!(dict.len(), 101); +} + +#[test] +fn test_rdf_namespace_reuse() { + let mut dict = Dictionary::new(); + + // Common RDF namespace URIs that should be reused + let common_namespaces = vec![ + "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "http://www.w3.org/2000/01/rdf-schema#", + "http://www.w3.org/2001/XMLSchema#", + "http://www.w3.org/ns/saref#", + "http://www.w3.org/ns/ssn#", + "http://www.w3.org/ns/sosa#", + ]; + + // Map each namespace + let namespace_ids: Vec = common_namespaces.iter().map(|ns| dict.fetch_id(ns)).collect(); + + // Create 1000 events that all use these namespaces + for i in 0..1000 { + let event_uri = format!("https://rsp.js/event/{}", i); + dict.fetch_id(&event_uri); + + // Reference one of the common namespaces + let ns_id = namespace_ids[i % namespace_ids.len()]; + assert!(dict.fetch_uri(ns_id).is_some()); + } + + // Dictionary should have: 6 namespaces + 1000 events = 1006 entries + assert_eq!(dict.len(), 1006); + + // Verify namespace IDs are unchanged (demonstrating reuse) + for (i, ns) in common_namespaces.iter().enumerate() { + assert_eq!(dict.fetch_id(ns), namespace_ids[i]); + } +} + +#[test] +fn test_event_resolution_workflow() -> std::io::Result<()> { + let test_dir = "target/test_data/event_resolution"; + let _ = fs::remove_dir_all(test_dir); + fs::create_dir_all(test_dir)?; + + let log_path = format!("{}/resolution_test.log", test_dir); + let mut dict = Dictionary::new(); + + // Create realistic RDF event + let event_uris = vec![ + ( + 1699181400u64, + "https://rsp.js/event/temp-reading-001", + "http://www.w3.org/ns/saref#hasValue", + "23.5", + "https://example.org/graph/sensors", + ), + ( + 1699181401u64, + "https://rsp.js/event/temp-reading-002", + "http://www.w3.org/ns/saref#hasValue", + "24.1", + "https://example.org/graph/sensors", + ), + ( + 1699181402u64, + "https://rsp.js/event/humidity-reading-001", + "http://www.w3.org/ns/saref#hasValue", + "65.0", + "https://example.org/graph/sensors", + ), + ]; + + // Write events with dictionary encoding + let mut writer = LogWriter::create(&log_path)?; + + for (timestamp, subject, predicate, object, graph) in &event_uris { + let subject_id = dict.fetch_id(subject); + let predicate_id = dict.fetch_id(predicate); + let object_id = dict.fetch_id(object); + let graph_id = dict.fetch_id(graph); + + writer.append_record(*timestamp, subject_id, predicate_id, object_id, graph_id)?; + } + + writer.flush()?; + + // Read back and resolve + let mut log_file = std::fs::File::open(&log_path)?; + use std::io::Read; + + for (timestamp, subject, predicate, object, graph) in &event_uris { + let mut buffer = [0u8; RECORD_SIZE]; + log_file.read_exact(&mut buffer)?; + + let (dec_ts, dec_subj_id, dec_pred_id, dec_obj_id, dec_graph_id) = decode_record(&buffer); + + // Verify timestamp + assert_eq!(dec_ts, *timestamp); + + // Resolve IDs to URIs + assert_eq!(dict.fetch_uri(dec_subj_id), Some(*subject)); + assert_eq!(dict.fetch_uri(dec_pred_id), Some(*predicate)); + assert_eq!(dict.fetch_uri(dec_obj_id), Some(*object)); + assert_eq!(dict.fetch_uri(dec_graph_id), Some(*graph)); + } + + Ok(()) +} + +#[test] +fn test_dictionary_space_savings() { + let mut dict = Dictionary::new(); + + // Calculate space used by raw URIs + let uris = vec![ + "https://solid.ti.rw.fau.de/public/ns/stream#event001", + "http://www.w3.org/ns/saref#hasTimestamp", + "2025-11-05T10:30:00Z", + "https://solid.ti.rw.fau.de/public/ns/stream#default", + ]; + + let raw_size: usize = uris.iter().map(|u| u.len()).sum(); + + // With dictionary, we store 8 bytes per ID + let ids: Vec = uris.iter().map(|u| dict.fetch_id(u)).collect(); + let encoded_size = ids.len() * 8; // 8 bytes per u64 + + println!("Raw URIs size: {} bytes", raw_size); + println!("Encoded IDs size: {} bytes", encoded_size); + println!("Space savings per record: {} bytes", raw_size - encoded_size); + + // For 1000 records reusing same URIs: + let records = 1000; + let raw_total = raw_size * records; + let encoded_total = encoded_size * records + raw_size; // IDs + dictionary overhead + + println!("\nFor {} records:", records); + println!("Raw storage: {} bytes", raw_total); + println!( + "Dictionary storage: {} bytes (IDs) + {} bytes (dictionary)", + encoded_size * records, + raw_size + ); + println!("Total with dictionary: {} bytes", encoded_total); + println!( + "Space saved: {} bytes ({:.1}% reduction)", + raw_total - encoded_total, + (1.0 - encoded_total as f64 / raw_total as f64) * 100.0 + ); + + // Verify space savings + assert!(encoded_total < raw_total); +} + +#[test] +fn test_complete_rdf_quad_with_datatype() { + let mut dict = Dictionary::new(); + + // Complete RDF quad in N-Quads syntax: + // "23.5"^^ . + // + // This quad has 5 components that get stored in the dictionary: + + let components = vec![ + ("subject", "https://rsp.js/event/temp-sensor-001"), + ("predicate", "http://www.w3.org/ns/saref#hasValue"), + ("object_value", "23.5"), // Just the literal value + ("object_datatype", "http://www.w3.org/2001/XMLSchema#double"), // Datatype as separate URI + ("graph", "https://example.org/graph/sensors"), + ]; + + // Store all components and get their IDs + let mut component_ids = std::collections::HashMap::new(); + for (name, value) in &components { + let id = dict.fetch_id(value); + component_ids.insert(*name, id); + println!("{}: '{}' → ID {}", name, value, id); + } + + // In the actual record, we'd store: + // - timestamp (u64) + // - subject_id (u64) + // - predicate_id (u64) + // - object_value_id (u64) + // - graph_id (u64) + // + // The object_datatype_id would be stored in a separate metadata structure + // that tracks which object IDs are literals and what their datatypes are. + + // Verify retrieval + assert_eq!(dict.fetch_uri(component_ids["subject"]), Some(components[0].1)); + assert_eq!(dict.fetch_uri(component_ids["predicate"]), Some(components[1].1)); + assert_eq!(dict.fetch_uri(component_ids["object_value"]), Some(components[2].1)); + assert_eq!(dict.fetch_uri(component_ids["object_datatype"]), Some(components[3].1)); + assert_eq!(dict.fetch_uri(component_ids["graph"]), Some(components[4].1)); + + // Another quad with the same datatype: + // "65.2"^^ . + + let subject2 = "https://rsp.js/event/humidity-sensor-001"; + let value2 = "65.2"; + + let _subject2_id = dict.fetch_id(subject2); + let _value2_id = dict.fetch_id(value2); + + // These components are REUSED (same ID returned): + let predicate2_id = dict.fetch_id("http://www.w3.org/ns/saref#hasValue"); + let datatype2_id = dict.fetch_id("http://www.w3.org/2001/XMLSchema#double"); + let graph2_id = dict.fetch_id("https://example.org/graph/sensors"); + + // Verify reuse + assert_eq!(predicate2_id, component_ids["predicate"]); + assert_eq!(datatype2_id, component_ids["object_datatype"]); + assert_eq!(graph2_id, component_ids["graph"]); + + // Dictionary has: 5 original components + 2 new (subject2, value2) = 7 total + assert_eq!(dict.len(), 7); + + println!("\n✓ Demonstrated RDF datatype handling with dictionary encoding"); + println!("✓ Showed URI reuse across multiple quads (predicate, datatype, graph)"); + println!("✓ Dictionary size: {} entries for 2 complete RDF quads", dict.len()); +} From 928707693f58487cbe324d4c46c9920afbb35037 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:45:12 +0000 Subject: [PATCH 02/19] Initial plan From c6e70656088a041d53af968746e9b95195f75408 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:49:41 +0000 Subject: [PATCH 03/19] Apply code review suggestions and rustfmt fixes Co-authored-by: argahsuknesib <87450516+argahsuknesib@users.noreply.github.com> --- src/indexing/dense.rs | 12 ++++-------- src/indexing/dictionary.rs | 2 +- src/indexing/mod.rs | 12 ++++++------ src/indexing/shared.rs | 8 ++++---- src/indexing/sparse.rs | 2 +- src/parsing/janusql_parser.rs | 2 +- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/src/indexing/dense.rs b/src/indexing/dense.rs index 001998c..37cb3f9 100644 --- a/src/indexing/dense.rs +++ b/src/indexing/dense.rs @@ -27,20 +27,16 @@ impl DenseIndexBuilder { } } - -#[doc=""] -pub fn build_dense_index( - log_path: &str, - index_path: &str, -) -> std::io::Result<()> { +#[doc = ""] +pub fn build_dense_index(log_path: &str, index_path: &str) -> std::io::Result<()> { let mut log = File::open(log_path)?; let mut builder = DenseIndexBuilder::create(index_path)?; let mut offset = 0u64; let mut record = [0u8; RECORD_SIZE]; - while log.read_exact(&mut record).is_ok(){ - let (timestamp, _, _ , _, _ ) = decode_record(&record); + while log.read_exact(&mut record).is_ok() { + let (timestamp, _, _, _, _) = decode_record(&record); builder.add_entry(timestamp, offset)?; offset += RECORD_SIZE as u64; } diff --git a/src/indexing/dictionary.rs b/src/indexing/dictionary.rs index d34a7ac..e5605b2 100644 --- a/src/indexing/dictionary.rs +++ b/src/indexing/dictionary.rs @@ -61,7 +61,7 @@ impl Dictionary { file.read_exact(&mut count_bytes)?; let count = u64::from_be_bytes(count_bytes); - // Reading each IRI Entry + // Reading each URI Entry for id in 0..count { let mut len_bytes = [0u8; 4]; diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs index 6f0579a..69d25b6 100644 --- a/src/indexing/mod.rs +++ b/src/indexing/mod.rs @@ -1,8 +1,8 @@ -#[doc=""] -pub mod shared; -#[doc=""] +#[doc = ""] pub mod dense; -#[doc=""] -pub mod sparse; -#[doc=""] +#[doc = ""] pub mod dictionary; +#[doc = ""] +pub mod shared; +#[doc = ""] +pub mod sparse; diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs index 97a1155..dd344bc 100644 --- a/src/indexing/shared.rs +++ b/src/indexing/shared.rs @@ -1,6 +1,6 @@ +use crate::indexing::dictionary::Dictionary; use std::fs::File; use std::io::Write; -use crate::indexing::dictionary::Dictionary; #[doc = ""] pub const RECORD_SIZE: usize = 40; @@ -93,7 +93,7 @@ pub struct Event { } #[derive(Debug, Clone)] -pub struct ResolvedEvent{ +pub struct ResolvedEvent { pub timestamp: u64, pub subject: String, pub predicate: String, @@ -108,7 +108,7 @@ impl Event { subject: dict.fetch_uri(self.subject).unwrap_or("UNKNOWN").to_string(), predicate: dict.fetch_uri(self.predicate).unwrap_or("UNKNOWN").to_string(), object: dict.fetch_uri(self.object).unwrap_or("UNKNOWN").to_string(), - graph: dict.fetch_uri(self.graph).unwrap_or("UNKNOWN").to_string() - } + graph: dict.fetch_uri(self.graph).unwrap_or("UNKNOWN").to_string(), + } } } diff --git a/src/indexing/sparse.rs b/src/indexing/sparse.rs index 36f3a99..70a8ad1 100644 --- a/src/indexing/sparse.rs +++ b/src/indexing/sparse.rs @@ -1,4 +1,4 @@ -use crate::indexing::dictionary::{self, Dictionary}; +use crate::indexing::dictionary::Dictionary; use crate::indexing::shared::{decode_record, Event, ResolvedEvent, RECORD_SIZE}; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; diff --git a/src/parsing/janusql_parser.rs b/src/parsing/janusql_parser.rs index da8dafa..c023058 100644 --- a/src/parsing/janusql_parser.rs +++ b/src/parsing/janusql_parser.rs @@ -356,7 +356,7 @@ mod tests { } #[test] - fn test_mixed_windows(){ + fn test_mixed_windows() { let parser = JanusQLParser::new().unwrap(); let query = r#" PREFIX sensor: From 5cabf50b3ab7d0fc3f4cad3e046ab2afe27a67c0 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 5 Nov 2025 22:34:46 +0100 Subject: [PATCH 04/19] Refactor CI integration tests and enhance JanusQL data structures with additional fields --- .github/workflows/ci.yml | 41 ----------------------------------- src/lib.rs | 3 +-- src/parsing/janusql_parser.rs | 25 ++++++++++++++++++++- 3 files changed, 25 insertions(+), 44 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c6ba021..606b819 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,29 +112,6 @@ jobs: integration-test: name: Integration Tests runs-on: ubuntu-latest - services: - oxigraph: - image: oxigraph/oxigraph:latest - ports: - - 7878:7878 - options: >- - --health-cmd "curl -f http://localhost:7878/query || exit 1" - --health-interval 10s - --health-timeout 5s - --health-retries 5 - - fuseki: - image: stain/jena-fuseki:latest - ports: - - 3030:3030 - env: - ADMIN_PASSWORD: admin - JVM_ARGS: "-Xmx2g" - options: >- - --health-cmd "curl -f http://localhost:3030/$/ping || exit 1" - --health-interval 10s - --health-timeout 5s - --health-retries 5 steps: - name: Checkout code @@ -161,26 +138,8 @@ jobs: path: target key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }} - - name: Wait for services - run: | - echo "Waiting for Oxigraph..." - timeout 60 bash -c 'until curl -f http://localhost:7878/query; do sleep 2; done' - echo "Waiting for Fuseki..." - timeout 60 bash -c 'until curl -f http://localhost:3030/$/ping; do sleep 2; done' - - - name: Create Fuseki test dataset - run: | - curl -X POST http://localhost:3030/$/datasets \ - -H "Content-Type: application/x-www-form-urlencoded" \ - -d "dbName=ds&dbType=mem" \ - --user admin:admin || true - - name: Run integration tests run: cargo test --test '*' --all-features --verbose - env: - OXIGRAPH_ENDPOINT: http://localhost:7878 - JENA_ENDPOINT: http://localhost:3030 - JENA_DATASET: ds # Code coverage coverage: diff --git a/src/lib.rs b/src/lib.rs index b115ed0..5a5928a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -59,13 +59,12 @@ pub mod indexing; /// Module for parsing JanusQL queries pub mod parsing; -#[doc = ""] +/// Benchmarking utilities pub mod benchmarking { mod benchmark; } -/// Module containing error types pub mod error { //! Error types and result definitions diff --git a/src/parsing/janusql_parser.rs b/src/parsing/janusql_parser.rs index da8dafa..8a49414 100644 --- a/src/parsing/janusql_parser.rs +++ b/src/parsing/janusql_parser.rs @@ -10,34 +10,55 @@ pub enum WindowType { #[derive(Debug, Clone)] pub struct WindowDefinition { + /// Name of the window pub window_name: String, + /// Name of the stream pub stream_name: String, + /// Width of the window pub width: u64, + /// Slide step pub slide: u64, + /// Offset for sliding windows pub offset: Option, + /// Start time for fixed windows pub start: Option, + /// End time for fixed windows pub end: Option, + /// Type of the window pub window_type: WindowType, } +/// R2S operator definition #[derive(Debug, Clone)] pub struct R2SOperator { + /// Operator type pub operator: String, + /// Operator name pub name: String, } +/// Parsed JanusQL query structure #[derive(Debug)] pub struct ParsedJanusQuery { + /// R2S operator if present pub r2s: Option, + /// Live windows defined in the query pub live_windows: Vec, + /// Historical windows defined in the query pub historical_windows: Vec, + /// RSPQL query string pub rspql_query: String, + /// SPARQL queries pub sparql_queries: Vec, + /// Prefix mappings pub prefixes: HashMap, + /// WHERE clause pub where_clause: String, + /// SELECT clause pub select_clause: String, } +/// Parser for JanusQL queries pub struct JanusQLParser { historical_sliding_window_regex: Regex, historical_fixed_window_regex: Regex, @@ -47,6 +68,7 @@ pub struct JanusQLParser { } impl JanusQLParser { + /// Creates a new JanusQLParser instance. pub fn new() -> Result> { Ok(JanusQLParser { historical_sliding_window_regex: Regex::new( @@ -110,6 +132,7 @@ impl JanusQLParser { Ok(None) } + /// Parses a JanusQL query string. pub fn parse(&self, query: &str) -> Result> { let mut parsed = ParsedJanusQuery { r2s: None, @@ -356,7 +379,7 @@ mod tests { } #[test] - fn test_mixed_windows(){ + fn test_mixed_windows() { let parser = JanusQLParser::new().unwrap(); let query = r#" PREFIX sensor: From 69f1f9e27bab7e0b23724816e2c0091789fcb5a6 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Thu, 6 Nov 2025 13:49:04 +0100 Subject: [PATCH 05/19] Add comprehensive benchmarks for writing performance and analysis of indexing strategies --- Cargo.toml | 12 ++ WRITING_BENCHMARKS.md | 229 +++++++++++++++++++++++++++++++++++++ benches/README.md | 45 ++++++-- benches/analysis.rs | 196 +++++++++++++++++++++++++++++++ benches/write_benchmark.rs | 216 ++++++++++++++++++++++++++++++++++ run_benchmarks.sh | 47 ++++++++ 6 files changed, 736 insertions(+), 9 deletions(-) create mode 100644 WRITING_BENCHMARKS.md create mode 100644 benches/analysis.rs create mode 100644 benches/write_benchmark.rs create mode 100755 run_benchmarks.sh diff --git a/Cargo.toml b/Cargo.toml index c199397..1ff8565 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,3 +35,15 @@ opt-level = 0 [profile.test] opt-level = 0 + +[[bench]] +name = "benchmark" +harness = false + +[[bench]] +name = "write_benchmark" +harness = false + +[[bench]] +name = "analysis" +harness = false diff --git a/WRITING_BENCHMARKS.md b/WRITING_BENCHMARKS.md new file mode 100644 index 0000000..85ee553 --- /dev/null +++ b/WRITING_BENCHMARKS.md @@ -0,0 +1,229 @@ +# Complete Guide: Testing Writing Performance for Dense vs Sparse Indexing + +## Overview + +This guide provides step-by-step instructions for testing and comparing the writing performance of Dense vs Sparse indexing approaches in the Janus RDF Stream Processing Engine. + +## Background + +Previously, the benchmarking only tested **reading performance** (querying existing indexes). Now we have comprehensive **writing performance** tests that measure: + +1. **Real-time indexing**: Building indexes while writing records +2. **Batch indexing**: Writing all records first, then building indexes +3. **Throughput comparison**: Records processed per second +4. **Memory and storage efficiency**: Resource usage patterns + +## What's Been Added + +### New Benchmark Files + +1. **`write_benchmark.rs`** - Core writing performance tests +2. **`analysis.rs`** - Advanced analysis and optimal configuration finding +3. **`run_benchmarks.sh`** - Automated test runner script +4. **Enhanced `README.md`** - Comprehensive documentation + +### Updated Configuration + +- Updated `Cargo.toml` with new benchmark entries +- Added support for multiple test scenarios +- Integrated analysis tools + +## Step-by-Step Testing Instructions + +### Step 1: Run the Complete Benchmark Suite + +```bash +# Make script executable (if not already) +chmod +x run_benchmarks.sh + +# Run all benchmarks +./run_benchmarks.sh +``` + +This runs all three benchmark types in sequence and provides a comprehensive overview. + +### Step 2: Test Writing Performance Specifically + +```bash +# Run only the writing performance benchmark +cargo bench --bench write_benchmark +``` + +**What this tests:** +- Real-time writing with indexing for 10K, 100K, and 1M records +- Batch writing comparison +- Performance ratios between dense and sparse approaches + +**Expected output:** +``` +=== WRITING PERFORMANCE RESULTS === +Records: 100000 +Sparse interval: 1000 + +--- Real-time Writing (Index while writing) --- +Dense - Write time: 260.611 ms, Total time: 260.611 ms +Sparse - Write time: 85.356 ms, Total time: 85.356 ms + +--- Performance Comparison --- +Real-time: Sparse is 3.05x faster than Dense +``` + +### Step 3: Advanced Analysis + +```bash +# Run detailed analysis +cargo bench --bench analysis +``` + +**What this tests:** +- Optimal sparse intervals (100, 500, 1000, 2000, 5000, 10000) +- Memory usage scaling across different dataset sizes +- Write throughput under various conditions + +### Step 4: Original Read Performance (For Comparison) + +```bash +# Run original benchmark +cargo bench --bench benchmark +``` + +**What this tests:** +- Index building time from existing log files +- Query performance across different ranges +- Memory usage of indexes + +### Step 5: Individual Test Runs + +For targeted testing, you can run specific scenarios: + +```bash +# Run with release optimizations for accurate timing +cargo bench --bench write_benchmark --release + +# Run with specific test size (modify source code) +# Edit the test_sizes vector in write_benchmark.rs +``` + +## Interpreting Results + +### Key Metrics to Focus On + +#### 1. Writing Throughput +- **Records/second**: Higher is better +- **Dense typically**: 300-500 records/sec for large datasets +- **Sparse typically**: 1000-1500 records/sec for large datasets + +#### 2. Performance Ratios +- **Real-time writing**: Sparse is typically 2-4x faster +- **Batch processing**: Sparse is typically 2-3x faster +- **Memory usage**: Sparse uses significantly less memory + +#### 3. Trade-offs +- **Query speed**: Dense is typically 10-30% faster for queries +- **Storage space**: Sparse uses 90-99% less index storage +- **Write speed**: Sparse is 2-4x faster for writing + +### When to Use Each Approach + +#### Use Dense Indexing When: +- Query performance is critical +- Dataset size is manageable (< 1M records) +- Storage space is not a constraint +- Read-heavy workloads + +#### Use Sparse Indexing When: +- High-frequency writes (streaming data) +- Large datasets (> 1M records) +- Storage efficiency is important +- Write-heavy workloads +- Real-time ingestion requirements + +### Sample Results Analysis + +``` +Real-time: Sparse is 3.05x faster than Dense +Batch: Sparse is 2.44x faster than Dense +``` + +This shows: +- Sparse indexing provides significant write performance benefits +- The advantage is consistent across different writing patterns +- Sparse indexing scales better with larger datasets + +## Customizing Tests + +### Modify Record Counts + +Edit the test sizes in `write_benchmark.rs`: + +```rust +let test_sizes = vec![10_000u64, 100_000u64, 1_000_000u64, 5_000_000u64]; +``` + +### Adjust Sparse Intervals + +Modify the `SPARSE_INTERVAL` constant: + +```rust +const SPARSE_INTERVAL: usize = 500; // Test different intervals +``` + +### Add Custom Test Scenarios + +Create new benchmark functions following the existing patterns in the benchmark files. + +## Performance Optimization Tips + +### For Maximum Accuracy +1. Run benchmarks on a quiet system (minimal background processes) +2. Use release builds: `cargo bench --release` +3. Run multiple iterations and average results +4. Ensure consistent storage (SSD vs HDD considerations) + +### For Large Datasets +1. Monitor memory usage during tests +2. Consider disk I/O limitations +3. Test with realistic data patterns +4. Evaluate network storage implications + +## Troubleshooting + +### Common Issues + +#### Out of Memory +- Reduce test dataset sizes +- Monitor system memory during tests +- Consider streaming vs batch processing + +#### Slow Performance +- Ensure running in release mode +- Check disk I/O capacity +- Verify no other processes consuming resources + +#### Inconsistent Results +- Run tests multiple times +- Check system load +- Ensure consistent test conditions + +## Next Steps + +### Additional Testing Ideas + +1. **Network Storage**: Test performance with network-attached storage +2. **Concurrent Access**: Test multiple writers/readers simultaneously +3. **Real-world Data**: Test with actual RDF datasets +4. **Memory Pressure**: Test under various memory constraints +5. **Different Hardware**: Compare SSD vs HDD performance + +### Integration Testing + +1. Test within larger application contexts +2. Measure end-to-end pipeline performance +3. Evaluate query pattern impacts +4. Test with realistic data volumes and patterns + +## Conclusion + +The new writing performance benchmarks provide comprehensive insights into the trade-offs between dense and sparse indexing approaches. The results clearly show that sparse indexing provides significant advantages for write-heavy workloads while maintaining acceptable query performance. + +Use these tools to make informed decisions about indexing strategies based on your specific use case requirements. \ No newline at end of file diff --git a/benches/README.md b/benches/README.md index a627df0..d0397d6 100644 --- a/benches/README.md +++ b/benches/README.md @@ -1,22 +1,49 @@ -# Benchmarks +# RDF Indexing Benchmarks -This directory contains performance benchmarks for the Janus RDF Stream Processing Engine. +This directory contains comprehensive benchmarks for comparing different RDF indexing strategies in Janus. -## Running Benchmarks +## Available Benchmarks -To run all benchmarks: +### 1. `benchmark.rs` - Read Performance (Original) +Tests query performance on pre-built indexes: +- Index building time comparison +- Query speed across different data ranges +- Memory usage comparison +### 2. `write_benchmark.rs` - Write Performance (New) +Tests writing performance during record insertion: +- Real-time indexing (index while writing) +- Batch indexing (build index after writing) +- Writing throughput comparison +- Total processing time analysis + +### 3. `analysis.rs` - Advanced Analysis (New) +Detailed analysis across multiple dimensions: +- Optimal sparse interval analysis +- Memory usage scaling +- Write throughput under different conditions +- Performance recommendations + +## Quick Start + +### Run All Benchmarks ```bash -cargo bench +./run_benchmarks.sh ``` -To run a specific benchmark: - +### Run Individual Benchmarks ```bash -cargo bench --bench +# Original read performance benchmark +cargo bench --bench benchmark + +# New write performance benchmark +cargo bench --bench write_benchmark + +# Advanced analysis suite +cargo bench --bench analysis ``` -## Benchmark Structure +## Step-by-Step Testing Instructions Benchmarks are organized by functionality: diff --git a/benches/analysis.rs b/benches/analysis.rs new file mode 100644 index 0000000..7c0bf27 --- /dev/null +++ b/benches/analysis.rs @@ -0,0 +1,196 @@ +use janus::indexing::{dense, sparse}; +use std::fs; +use std::time::Instant; + +/// Analyze different sparse intervals to find optimal configuration +fn analyze_sparse_intervals() -> std::io::Result<()> { + println!("🔍 Analyzing Different Sparse Intervals"); + println!("====================================="); + + let intervals = vec![100, 500, 1000, 2000, 5000, 10000]; + let log_file = "data/benchmark/log.dat"; + let number_records = 100_000u64; + + // Create test data + fs::create_dir_all("data/benchmark")?; + let mut writer = janus::indexing::shared::LogWriter::create(log_file)?; + for i in 0..number_records { + writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; + } + writer.flush()?; + + println!("Testing {} records with different intervals:", number_records); + println!("{:-<80}", ""); + println!("{:<10} {:<15} {:<15} {:<20} {:<15}", + "Interval", "Build Time(ms)", "Index Size(KB)", "Space Savings(%)", "Query Time(ms)"); + println!("{:-<80}", ""); + + // Get dense index stats for comparison + let dense_start = Instant::now(); + dense::build_dense_index(log_file, "data/benchmark/dense_ref.idx")?; + let dense_build_time = dense_start.elapsed(); + let dense_reader = dense::DenseIndexReader::open("data/benchmark/dense_ref.idx")?; + let dense_size = dense_reader.index_size_bytes(); + + // Test query performance on dense index + let query_start = Instant::now(); + let _dense_results = dense_reader.query(log_file, 10000, 20000)?; + let dense_query_time = query_start.elapsed(); + + for interval in intervals { + let index_file = format!("data/benchmark/sparse_{}.idx", interval); + + // Build sparse index + let start = Instant::now(); + sparse::build_sparse_index(log_file, &index_file, &interval)?; + let build_time = start.elapsed(); + + // Get size info + let reader = sparse::SparseReader::open(&index_file, interval)?; + let sparse_size = reader.index_size_bytes(); + let space_savings = ((dense_size - sparse_size) as f64 / dense_size as f64) * 100.0; + + // Test query performance + let query_start = Instant::now(); + let _sparse_results = reader.query(log_file, 10000, 20000)?; + let query_time = query_start.elapsed(); + + println!("{:<10} {:<15.3} {:<15.2} {:<20.2} {:<15.3}", + interval, + build_time.as_secs_f64() * 1000.0, + sparse_size as f64 / 1024.0, + space_savings, + query_time.as_secs_f64() * 1000.0); + } + + println!("{:-<80}", ""); + println!("Dense Reference: Build: {:.3}ms, Size: {:.2}KB, Query: {:.3}ms", + dense_build_time.as_secs_f64() * 1000.0, + dense_size as f64 / 1024.0, + dense_query_time.as_secs_f64() * 1000.0); + + Ok(()) +} + +/// Analyze memory usage patterns +fn analyze_memory_usage() -> std::io::Result<()> { + println!("\n🧠 Memory Usage Analysis"); + println!("======================="); + + let record_counts = vec![10_000, 50_000, 100_000, 500_000, 1_000_000]; + + println!("{:<12} {:<15} {:<15} {:<20}", + "Records", "Dense Size(MB)", "Sparse Size(MB)", "Memory Ratio"); + println!("{:-<62}", ""); + + for &count in &record_counts { + let log_file = format!("data/benchmark/log_{}.dat", count); + let dense_index = format!("data/benchmark/dense_{}.idx", count); + let sparse_index = format!("data/benchmark/sparse_{}.idx", count); + + // Create test data + let mut writer = janus::indexing::shared::LogWriter::create(&log_file)?; + for i in 0..count { + writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; + } + writer.flush()?; + + // Build indexes + dense::build_dense_index(&log_file, &dense_index)?; + sparse::build_sparse_index(&log_file, &sparse_index, &1000)?; + + // Get sizes + let dense_reader = dense::DenseIndexReader::open(&dense_index)?; + let sparse_reader = sparse::SparseReader::open(&sparse_index, 1000)?; + + let dense_size = dense_reader.index_size_bytes() as f64 / 1_000_000.0; + let sparse_size = sparse_reader.index_size_bytes() as f64 / 1_000_000.0; + let ratio = dense_size / sparse_size; + + println!("{:<12} {:<15.3} {:<15.3} {:<20.2}x", + count, dense_size, sparse_size, ratio); + } + + Ok(()) +} + +/// Test write throughput under different conditions +fn analyze_write_throughput() -> std::io::Result<()> { + println!("\n⚡ Write Throughput Analysis"); + println!("==========================="); + + let test_configs = vec![ + ("Small batches", 1_000u64), + ("Medium batches", 10_000u64), + ("Large batches", 100_000u64), + ]; + + println!("{:<15} {:<20} {:<20} {:<15}", + "Batch Size", "Dense (rec/sec)", "Sparse (rec/sec)", "Speedup"); + println!("{:-<70}", ""); + + for (name, batch_size) in test_configs { + fs::create_dir_all("data/benchmark")?; + + // Test dense writing + let dense_log = "data/benchmark/dense_throughput.dat"; + let dense_index = "data/benchmark/dense_throughput.idx"; + + let start = Instant::now(); + let mut log_writer = janus::indexing::shared::LogWriter::create(dense_log)?; + let mut index_builder = janus::indexing::dense::DenseIndexBuilder::create(dense_index)?; + + for i in 0..batch_size { + log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; + index_builder.add_entry(i, i * 40)?; + } + log_writer.flush()?; + index_builder.finalize()?; + + let dense_time = start.elapsed(); + let dense_throughput = batch_size as f64 / dense_time.as_secs_f64(); + + // Test sparse writing + let sparse_log = "data/benchmark/sparse_throughput.dat"; + let sparse_index = "data/benchmark/sparse_throughput.idx"; + + let start = Instant::now(); + let mut log_writer = janus::indexing::shared::LogWriter::create(sparse_log)?; + let mut index_builder = janus::indexing::sparse::SparseIndexBuilder::create(sparse_index, 1000)?; + + for i in 0..batch_size { + log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; + index_builder.add_entry(i, i, i * 40)?; + } + log_writer.flush()?; + index_builder.finalize()?; + + let sparse_time = start.elapsed(); + let sparse_throughput = batch_size as f64 / sparse_time.as_secs_f64(); + + let speedup = sparse_throughput / dense_throughput; + + println!("{:<15} {:<20.0} {:<20.0} {:<15.2}x", + name, dense_throughput, sparse_throughput, speedup); + } + + Ok(()) +} + +fn main() -> std::io::Result<()> { + println!("🔬 Advanced RDF Indexing Analysis Suite"); + println!("======================================="); + + analyze_sparse_intervals()?; + analyze_memory_usage()?; + analyze_write_throughput()?; + + println!("\n✨ Analysis Complete!"); + println!("\n💡 Recommendations:"); + println!(" • Use sparse indexing for write-heavy workloads"); + println!(" • Choose interval based on query precision requirements"); + println!(" • Consider hybrid approaches for different use cases"); + println!(" • Monitor memory usage with large datasets"); + + Ok(()) +} \ No newline at end of file diff --git a/benches/write_benchmark.rs b/benches/write_benchmark.rs new file mode 100644 index 0000000..dfd12ab --- /dev/null +++ b/benches/write_benchmark.rs @@ -0,0 +1,216 @@ +use janus::indexing::{dense::DenseIndexBuilder, shared::LogWriter, sparse::SparseIndexBuilder}; +use std::fs; +use std::time::Instant; + +const DATA_DIR: &str = "data/write_benchmark"; +const DENSE_LOG_FILE: &str = "data/write_benchmark/dense_log.dat"; +const SPARSE_LOG_FILE: &str = "data/write_benchmark/sparse_log.dat"; +const DENSE_INDEX_FILE: &str = "data/write_benchmark/dense.idx"; +const SPARSE_INDEX_FILE: &str = "data/write_benchmark/sparse.idx"; +const SPARSE_INTERVAL: usize = 1000; + +fn setup_dirs() -> std::io::Result<()> { + let _ = fs::remove_dir_all(DATA_DIR); + fs::create_dir_all(DATA_DIR)?; + Ok(()) +} + +/// Benchmark writing records with dense indexing +/// This simulates real-time writing where each record is indexed immediately +fn benchmark_dense_writing(number_records: u64) -> std::io::Result<(f64, f64)> { + println!("Benchmarking Dense Index Writing..."); + + let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?; + let mut index_builder = DenseIndexBuilder::create(DENSE_INDEX_FILE)?; + + let start = Instant::now(); + let mut current_offset = 0u64; + + for i in 0..number_records { + let timestamp = i; + let subject = (i % 1000) as u64; + let predicate = (i % 500) as u64; + let object = (i % 2000) as u64; + let graph: u64 = 1; + + // Write record to log + log_writer.append_record(timestamp, subject, predicate, object, graph)?; + + // Add entry to index + index_builder.add_entry(timestamp, current_offset)?; + + current_offset += 40; // RECORD_SIZE + } + + let write_time = start.elapsed(); + + // Finalize both log and index + log_writer.flush()?; + index_builder.finalize()?; + + let total_time = start.elapsed(); + + Ok((write_time.as_secs_f64(), total_time.as_secs_f64())) +} + +/// Benchmark writing records with sparse indexing +/// This simulates real-time writing where only periodic records are indexed +fn benchmark_sparse_writing(number_records: u64) -> std::io::Result<(f64, f64)> { + println!("Benchmarking Sparse Index Writing..."); + + let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?; + let mut index_builder = SparseIndexBuilder::create(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?; + + let start = Instant::now(); + let mut current_offset = 0u64; + + for i in 0..number_records { + let timestamp = i; + let subject = (i % 1000) as u64; + let predicate = (i % 500) as u64; + let object = (i % 2000) as u64; + let graph: u64 = 1; + + // Write record to log + log_writer.append_record(timestamp, subject, predicate, object, graph)?; + + // Add entry to index (will only add if i % interval == 0) + index_builder.add_entry(i, timestamp, current_offset)?; + + current_offset += 40; // RECORD_SIZE + } + + let write_time = start.elapsed(); + + // Finalize both log and index + log_writer.flush()?; + index_builder.finalize()?; + + let total_time = start.elapsed(); + + Ok((write_time.as_secs_f64(), total_time.as_secs_f64())) +} + +/// Benchmark batch writing vs real-time writing +fn benchmark_batch_vs_realtime(number_records: u64) -> std::io::Result<()> { + println!("\n=== Batch vs Real-time Writing Comparison ==="); + + // Test 1: Real-time writing (as implemented above) + setup_dirs()?; + let (dense_write_time, dense_total_time) = benchmark_dense_writing(number_records)?; + + setup_dirs()?; + let (sparse_write_time, sparse_total_time) = benchmark_sparse_writing(number_records)?; + + // Test 2: Batch writing (write log first, then build index) + setup_dirs()?; + println!("Benchmarking Batch Dense Index Creation..."); + + let start = Instant::now(); + let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?; + for i in 0..number_records { + let timestamp = i; + let subject = (i % 1000) as u64; + let predicate = (i % 500) as u64; + let object = (i % 2000) as u64; + let graph: u64 = 1; + log_writer.append_record(timestamp, subject, predicate, object, graph)?; + } + log_writer.flush()?; + let log_write_time = start.elapsed(); + + let start = Instant::now(); + janus::indexing::dense::build_dense_index(DENSE_LOG_FILE, DENSE_INDEX_FILE)?; + let index_build_time = start.elapsed(); + let batch_dense_total = log_write_time.as_secs_f64() + index_build_time.as_secs_f64(); + + // Batch sparse + setup_dirs()?; + println!("Benchmarking Batch Sparse Index Creation..."); + + let start = Instant::now(); + let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?; + for i in 0..number_records { + let timestamp = i; + let subject = (i % 1000) as u64; + let predicate = (i % 500) as u64; + let object = (i % 2000) as u64; + let graph: u64 = 1; + log_writer.append_record(timestamp, subject, predicate, object, graph)?; + } + log_writer.flush()?; + let log_write_time = start.elapsed(); + + let start = Instant::now(); + janus::indexing::sparse::build_sparse_index( + SPARSE_LOG_FILE, + SPARSE_INDEX_FILE, + &SPARSE_INTERVAL, + )?; + let index_build_time = start.elapsed(); + let batch_sparse_total = log_write_time.as_secs_f64() + index_build_time.as_secs_f64(); + + // Print results + println!("\n=== WRITING PERFORMANCE RESULTS ==="); + println!("Records: {}", number_records); + println!("Sparse interval: {}", SPARSE_INTERVAL); + + println!("\n--- Real-time Writing (Index while writing) ---"); + println!( + "Dense - Write time: {:.3} ms, Total time: {:.3} ms", + dense_write_time * 1000.0, + dense_total_time * 1000.0 + ); + println!( + "Sparse - Write time: {:.3} ms, Total time: {:.3} ms", + sparse_write_time * 1000.0, + sparse_total_time * 1000.0 + ); + + println!("\n--- Batch Writing (Index after writing) ---"); + println!( + "Dense - Log write: {:.3} ms, Index build: {:.3} ms, Total: {:.3} ms", + log_write_time.as_secs_f64() * 1000.0, + index_build_time.as_secs_f64() * 1000.0, + batch_dense_total * 1000.0 + ); + println!( + "Sparse - Log write: {:.3} ms, Index build: {:.3} ms, Total: {:.3} ms", + log_write_time.as_secs_f64() * 1000.0, + index_build_time.as_secs_f64() * 1000.0, + batch_sparse_total * 1000.0 + ); + + println!("\n--- Performance Comparison ---"); + let realtime_speedup = dense_total_time / sparse_total_time; + let batch_speedup = batch_dense_total / batch_sparse_total; + + if realtime_speedup > 1.0 { + println!("Real-time: Sparse is {:.2}x faster than Dense", realtime_speedup); + } else { + println!("Real-time: Dense is {:.2}x faster than Sparse", 1.0 / realtime_speedup); + } + + if batch_speedup > 1.0 { + println!("Batch: Sparse is {:.2}x faster than Dense", batch_speedup); + } else { + println!("Batch: Dense is {:.2}x faster than Sparse", 1.0 / batch_speedup); + } + + Ok(()) +} + +fn main() -> std::io::Result<()> { + println!("RDF Writing Performance Benchmark: Dense vs Sparse"); + + let test_sizes = vec![10_000u64, 100_000u64, 1_000_000u64]; + + for &size in &test_sizes { + println!("\n{:=<60}", ""); + println!("Testing with {} records", size); + println!("{:=<60}", ""); + benchmark_batch_vs_realtime(size)?; + } + + Ok(()) +} diff --git a/run_benchmarks.sh b/run_benchmarks.sh new file mode 100755 index 0000000..d44102f --- /dev/null +++ b/run_benchmarks.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Comprehensive benchmark script for testing Dense vs Sparse indexing approaches +# This script tests both reading and writing performance + +echo "🚀 Starting Comprehensive RDF Indexing Benchmark Suite" +echo "======================================================" + +# Create benchmarks directory if it doesn't exist +mkdir -p data/benchmark +mkdir -p data/write_benchmark + +echo "" +echo "📊 Running Read Performance Benchmark (Current Implementation)" +echo "--------------------------------------------------------------" +cargo bench --bench benchmark + +echo "" +echo "📝 Running Write Performance Benchmark (New Implementation)" +echo "-----------------------------------------------------------" +cargo bench --bench write_benchmark + +echo "" +echo "🔬 Running Detailed Analysis" +echo "-----------------------------" + +# Run additional analysis with different record sizes and intervals +echo "Testing different sparse intervals..." + +# You can modify the intervals in the source code and run multiple tests +# This demonstrates how to test different configurations + +echo "" +echo "✅ Benchmark Suite Complete!" +echo "" +echo "📋 Summary of Tests Performed:" +echo " 1. Read Performance (Query speed on existing indexes)" +echo " 2. Write Performance (Index creation speed during writing)" +echo " 3. Real-time vs Batch indexing comparison" +echo " 4. Memory usage comparison" +echo "" +echo "💡 Key Metrics to Compare:" +echo " - Writing throughput (records/second)" +echo " - Index build time" +echo " - Memory usage" +echo " - Query performance trade-offs" +echo " - Storage space efficiency" \ No newline at end of file From c0df48769166dd83a6dfcc7968e1b9b1f76aa0e6 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Thu, 6 Nov 2025 13:49:08 +0100 Subject: [PATCH 06/19] Enhance writing performance benchmarks and update analysis scripts for improved insights on indexing strategies --- WRITING_BENCHMARKS.md | 2 +- benches/README.md | 2 +- benches/analysis.rs | 132 +++++++++++++++++++++++------------------- run_benchmarks.sh | 2 +- 4 files changed, 75 insertions(+), 63 deletions(-) diff --git a/WRITING_BENCHMARKS.md b/WRITING_BENCHMARKS.md index 85ee553..98896ef 100644 --- a/WRITING_BENCHMARKS.md +++ b/WRITING_BENCHMARKS.md @@ -226,4 +226,4 @@ Create new benchmark functions following the existing patterns in the benchmark The new writing performance benchmarks provide comprehensive insights into the trade-offs between dense and sparse indexing approaches. The results clearly show that sparse indexing provides significant advantages for write-heavy workloads while maintaining acceptable query performance. -Use these tools to make informed decisions about indexing strategies based on your specific use case requirements. \ No newline at end of file +Use these tools to make informed decisions about indexing strategies based on your specific use case requirements. diff --git a/benches/README.md b/benches/README.md index d0397d6..2892fd1 100644 --- a/benches/README.md +++ b/benches/README.md @@ -99,4 +99,4 @@ To view results, open `target/criterion/report/index.html` in a browser. - Ensure system is idle during benchmarking - Use consistent hardware for comparisons - Run multiple iterations to reduce noise -- Use `black_box()` to prevent compiler optimizations \ No newline at end of file +- Use `black_box()` to prevent compiler optimizations diff --git a/benches/analysis.rs b/benches/analysis.rs index 7c0bf27..60f020c 100644 --- a/benches/analysis.rs +++ b/benches/analysis.rs @@ -6,11 +6,11 @@ use std::time::Instant; fn analyze_sparse_intervals() -> std::io::Result<()> { println!("🔍 Analyzing Different Sparse Intervals"); println!("====================================="); - + let intervals = vec![100, 500, 1000, 2000, 5000, 10000]; let log_file = "data/benchmark/log.dat"; let number_records = 100_000u64; - + // Create test data fs::create_dir_all("data/benchmark")?; let mut writer = janus::indexing::shared::LogWriter::create(log_file)?; @@ -18,57 +18,63 @@ fn analyze_sparse_intervals() -> std::io::Result<()> { writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; } writer.flush()?; - + println!("Testing {} records with different intervals:", number_records); println!("{:-<80}", ""); - println!("{:<10} {:<15} {:<15} {:<20} {:<15}", - "Interval", "Build Time(ms)", "Index Size(KB)", "Space Savings(%)", "Query Time(ms)"); + println!( + "{:<10} {:<15} {:<15} {:<20} {:<15}", + "Interval", "Build Time(ms)", "Index Size(KB)", "Space Savings(%)", "Query Time(ms)" + ); println!("{:-<80}", ""); - + // Get dense index stats for comparison let dense_start = Instant::now(); dense::build_dense_index(log_file, "data/benchmark/dense_ref.idx")?; let dense_build_time = dense_start.elapsed(); let dense_reader = dense::DenseIndexReader::open("data/benchmark/dense_ref.idx")?; let dense_size = dense_reader.index_size_bytes(); - + // Test query performance on dense index let query_start = Instant::now(); let _dense_results = dense_reader.query(log_file, 10000, 20000)?; let dense_query_time = query_start.elapsed(); - + for interval in intervals { let index_file = format!("data/benchmark/sparse_{}.idx", interval); - + // Build sparse index let start = Instant::now(); sparse::build_sparse_index(log_file, &index_file, &interval)?; let build_time = start.elapsed(); - + // Get size info let reader = sparse::SparseReader::open(&index_file, interval)?; let sparse_size = reader.index_size_bytes(); let space_savings = ((dense_size - sparse_size) as f64 / dense_size as f64) * 100.0; - + // Test query performance let query_start = Instant::now(); let _sparse_results = reader.query(log_file, 10000, 20000)?; let query_time = query_start.elapsed(); - - println!("{:<10} {:<15.3} {:<15.2} {:<20.2} {:<15.3}", - interval, - build_time.as_secs_f64() * 1000.0, - sparse_size as f64 / 1024.0, - space_savings, - query_time.as_secs_f64() * 1000.0); + + println!( + "{:<10} {:<15.3} {:<15.2} {:<20.2} {:<15.3}", + interval, + build_time.as_secs_f64() * 1000.0, + sparse_size as f64 / 1024.0, + space_savings, + query_time.as_secs_f64() * 1000.0 + ); } - + println!("{:-<80}", ""); - println!("Dense Reference: Build: {:.3}ms, Size: {:.2}KB, Query: {:.3}ms", - dense_build_time.as_secs_f64() * 1000.0, - dense_size as f64 / 1024.0, - dense_query_time.as_secs_f64() * 1000.0); - + println!( + "Dense Reference: Build: {:.3}ms, Size: {:.2}KB, Query: {:.3}ms", + dense_build_time.as_secs_f64() * 1000.0, + dense_size as f64 / 1024.0, + dense_query_time.as_secs_f64() * 1000.0 + ); + Ok(()) } @@ -76,41 +82,42 @@ fn analyze_sparse_intervals() -> std::io::Result<()> { fn analyze_memory_usage() -> std::io::Result<()> { println!("\n🧠 Memory Usage Analysis"); println!("======================="); - + let record_counts = vec![10_000, 50_000, 100_000, 500_000, 1_000_000]; - - println!("{:<12} {:<15} {:<15} {:<20}", - "Records", "Dense Size(MB)", "Sparse Size(MB)", "Memory Ratio"); + + println!( + "{:<12} {:<15} {:<15} {:<20}", + "Records", "Dense Size(MB)", "Sparse Size(MB)", "Memory Ratio" + ); println!("{:-<62}", ""); - + for &count in &record_counts { let log_file = format!("data/benchmark/log_{}.dat", count); let dense_index = format!("data/benchmark/dense_{}.idx", count); let sparse_index = format!("data/benchmark/sparse_{}.idx", count); - + // Create test data let mut writer = janus::indexing::shared::LogWriter::create(&log_file)?; for i in 0..count { writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; } writer.flush()?; - + // Build indexes dense::build_dense_index(&log_file, &dense_index)?; sparse::build_sparse_index(&log_file, &sparse_index, &1000)?; - + // Get sizes let dense_reader = dense::DenseIndexReader::open(&dense_index)?; let sparse_reader = sparse::SparseReader::open(&sparse_index, 1000)?; - + let dense_size = dense_reader.index_size_bytes() as f64 / 1_000_000.0; let sparse_size = sparse_reader.index_size_bytes() as f64 / 1_000_000.0; let ratio = dense_size / sparse_size; - - println!("{:<12} {:<15.3} {:<15.3} {:<20.2}x", - count, dense_size, sparse_size, ratio); + + println!("{:<12} {:<15.3} {:<15.3} {:<20.2}x", count, dense_size, sparse_size, ratio); } - + Ok(()) } @@ -118,79 +125,84 @@ fn analyze_memory_usage() -> std::io::Result<()> { fn analyze_write_throughput() -> std::io::Result<()> { println!("\n⚡ Write Throughput Analysis"); println!("==========================="); - + let test_configs = vec![ ("Small batches", 1_000u64), ("Medium batches", 10_000u64), ("Large batches", 100_000u64), ]; - - println!("{:<15} {:<20} {:<20} {:<15}", - "Batch Size", "Dense (rec/sec)", "Sparse (rec/sec)", "Speedup"); + + println!( + "{:<15} {:<20} {:<20} {:<15}", + "Batch Size", "Dense (rec/sec)", "Sparse (rec/sec)", "Speedup" + ); println!("{:-<70}", ""); - + for (name, batch_size) in test_configs { fs::create_dir_all("data/benchmark")?; - + // Test dense writing let dense_log = "data/benchmark/dense_throughput.dat"; let dense_index = "data/benchmark/dense_throughput.idx"; - + let start = Instant::now(); let mut log_writer = janus::indexing::shared::LogWriter::create(dense_log)?; let mut index_builder = janus::indexing::dense::DenseIndexBuilder::create(dense_index)?; - + for i in 0..batch_size { log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; index_builder.add_entry(i, i * 40)?; } log_writer.flush()?; index_builder.finalize()?; - + let dense_time = start.elapsed(); let dense_throughput = batch_size as f64 / dense_time.as_secs_f64(); - + // Test sparse writing let sparse_log = "data/benchmark/sparse_throughput.dat"; let sparse_index = "data/benchmark/sparse_throughput.idx"; - + let start = Instant::now(); let mut log_writer = janus::indexing::shared::LogWriter::create(sparse_log)?; - let mut index_builder = janus::indexing::sparse::SparseIndexBuilder::create(sparse_index, 1000)?; - + let mut index_builder = + janus::indexing::sparse::SparseIndexBuilder::create(sparse_index, 1000)?; + for i in 0..batch_size { log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; index_builder.add_entry(i, i, i * 40)?; } log_writer.flush()?; index_builder.finalize()?; - + let sparse_time = start.elapsed(); let sparse_throughput = batch_size as f64 / sparse_time.as_secs_f64(); - + let speedup = sparse_throughput / dense_throughput; - - println!("{:<15} {:<20.0} {:<20.0} {:<15.2}x", - name, dense_throughput, sparse_throughput, speedup); + + println!( + "{:<15} {:<20.0} {:<20.0} {:<15.2}x", + name, dense_throughput, sparse_throughput, speedup + ); } - + Ok(()) } fn main() -> std::io::Result<()> { println!("🔬 Advanced RDF Indexing Analysis Suite"); println!("======================================="); - + analyze_sparse_intervals()?; analyze_memory_usage()?; analyze_write_throughput()?; - + println!("\n✨ Analysis Complete!"); println!("\n💡 Recommendations:"); println!(" • Use sparse indexing for write-heavy workloads"); println!(" • Choose interval based on query precision requirements"); println!(" • Consider hybrid approaches for different use cases"); println!(" • Monitor memory usage with large datasets"); - + Ok(()) -} \ No newline at end of file +} diff --git a/run_benchmarks.sh b/run_benchmarks.sh index d44102f..3c26418 100755 --- a/run_benchmarks.sh +++ b/run_benchmarks.sh @@ -44,4 +44,4 @@ echo " - Writing throughput (records/second)" echo " - Index build time" echo " - Memory usage" echo " - Query performance trade-offs" -echo " - Storage space efficiency" \ No newline at end of file +echo " - Storage space efficiency" From e1fef2c30b2acdb615b08e76def1407d12779844 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Fri, 7 Nov 2025 17:40:46 +0100 Subject: [PATCH 07/19] Implement storage module with StreamingSegmentedStorage and utility structures for efficient event handling and querying --- src/indexing/mod.rs | 1 + src/lib.rs | 2 + src/main.rs | 147 ++++++++- src/storage/mod.rs | 4 + src/storage/segmented_storage.rs | 502 +++++++++++++++++++++++++++++++ src/storage/util.rs | 57 ++++ 6 files changed, 700 insertions(+), 13 deletions(-) create mode 100644 src/storage/mod.rs create mode 100644 src/storage/segmented_storage.rs create mode 100644 src/storage/util.rs diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs index 69d25b6..6bda115 100644 --- a/src/indexing/mod.rs +++ b/src/indexing/mod.rs @@ -6,3 +6,4 @@ pub mod dictionary; pub mod shared; #[doc = ""] pub mod sparse; + diff --git a/src/lib.rs b/src/lib.rs index 5a5928a..a1c6f88 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,8 @@ pub mod benchmarking { mod benchmark; } +pub mod storage; + pub mod error { //! Error types and result definitions diff --git a/src/main.rs b/src/main.rs index 6e0e9f7..5247965 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,9 +2,12 @@ //! //! This is the main entry point for the Janus command-line interface. +use janus::indexing::shared::Event; use janus::indexing::{dense, shared::LogWriter, sparse}; +use janus::storage::segmented_storage::StreamingSegmentedStorage; +use janus::storage::util::StreamingConfig; use std::fs; -use std::time::Instant; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; const DATA_DIR: &str = "data/benchmark"; const LOG_FILE: &str = "data/benchmark/log.dat"; @@ -114,19 +117,137 @@ fn benchmark_queries() -> std::io::Result<()> { Ok(()) } -fn main() -> std::io::Result<()> { - println!("RDF Indexing Benchmark : Dense vs Sparse"); - println!("Setting up data..."); - let number_of_records = 1_000_000u64; - setup_data(number_of_records)?; +// fn main() -> std::io::Result<()> { +// println!("RDF Indexing Benchmark : Dense vs Sparse"); +// println!("Setting up data..."); +// let number_of_records = 1_000_000u64; +// setup_data(number_of_records)?; + +// benchmark_indexing()?; +// benchmark_queries()?; + +// println!( +// "\n=== Summary ===\nSparse interval: {}\nUse this data to decide \ +// which approach suits your use case best.", +// SPARSE_INTERVAL +// ); +// Ok(()) +// } + +fn benchmark_storage_performance() -> std::io::Result<()> { + println!("=== WAL-Based Segmented Storage Performance Benchmark ===\n"); + + let record_counts = vec![100, 1000, 10000, 100000, 1000000]; + + for &num_records in &record_counts { + println!("Testing with {} records", num_records); + println!("──────────────────────────────────────────────────"); + + // Configure storage + let config = StreamingConfig { + max_wal_events: 5000, + max_wal_age_seconds: 30, + max_wal_bytes: 5 * 1024 * 1024, + sparse_interval: 100, + entries_per_index_block: 512, + segment_base_path: format!("./benchmark_data_{}", num_records), + ..Default::default() + }; + + // Clean up any existing data + let _ = std::fs::remove_dir_all(&config.segment_base_path); + + let mut storage = StreamingSegmentedStorage::new(config.clone())?; + storage.start_background_flushing(); + + // Benchmark writes + println!("Writing {} records...", num_records); + let write_start = Instant::now(); + let mut min_timestamp = u64::MAX; + let mut max_timestamp = 0u64; + + for i in 0..num_records { + let timestamp = + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 + i; + min_timestamp = min_timestamp.min(timestamp); + max_timestamp = max_timestamp.max(timestamp); + + let event = Event { + timestamp, + subject: (i % 10) as u64, + predicate: 1, + object: (20 + (i % 10)) as u64, + graph: 1, + }; + storage.write(event)?; + } - benchmark_indexing()?; - benchmark_queries()?; + let write_duration = write_start.elapsed(); + let write_throughput = num_records as f64 / write_duration.as_secs_f64(); + + println!("Write Performance:"); + println!(" Duration: {:.3}s", write_duration.as_secs_f64()); + println!(" Throughput: {:.0} records/sec", write_throughput); + println!(" Timestamp range: {} to {}", min_timestamp, max_timestamp); + + // Benchmark queries immediately after writing (data is still in WAL) + let query_ranges = vec![(0.1, "10% of data"), (0.5, "50% of data"), (1.0, "100% of data")]; + + println!("\nQuery Performance:"); + + for (fraction, description) in query_ranges { + let query_count = 100.min(num_records / 10); // Run 100 queries or 10% of records, whichever is smaller + let mut query_times = Vec::new(); + let mut total_records_read = 0; + + for q in 0..query_count { + // Use a deterministic but varied offset for queries within the actual data range + let timestamp_range = max_timestamp - min_timestamp; + let start_offset = + (timestamp_range as f64 * fraction * (q as f64 / query_count as f64)) as u64; + let query_window = (timestamp_range as f64 * 0.01).max(100.0) as u64; // 1% of data or 100 records minimum + + let start_timestamp = min_timestamp + start_offset; + let end_timestamp = (start_timestamp + query_window).min(max_timestamp); + + let query_start = Instant::now(); + let results = storage.query(start_timestamp, end_timestamp)?; + let query_duration = query_start.elapsed(); + + total_records_read += results.len(); + query_times.push(query_duration.as_secs_f64()); + } + + let avg_query_time = query_times.iter().sum::() / query_times.len() as f64; + let queries_per_sec = 1.0 / avg_query_time; + let total_query_time = query_times.iter().sum::(); + let records_per_sec = if total_query_time > 0.0 { + total_records_read as f64 / total_query_time + } else { + 0.0 + }; + let avg_records_per_query = total_records_read as f64 / query_count as f64; + let min_time = query_times.iter().cloned().fold(f64::INFINITY, f64::min); + let max_time = query_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + + println!(" {} queries ({}):", description, query_count); + println!(" Avg query time: {:.3}ms", avg_query_time * 1000.0); + println!(" Query throughput: {:.1} queries/sec", queries_per_sec); + println!(" Read throughput: {:.0} records/sec", records_per_sec); + println!(" Avg records per query: {:.1}", avg_records_per_query); + println!(" Total records read: {}", total_records_read); + println!(" Min/Max time: {:.3}ms / {:.3}ms", min_time * 1000.0, max_time * 1000.0); + } - println!( - "\n=== Summary ===\nSparse interval: {}\nUse this data to decide \ - which approach suits your use case best.", - SPARSE_INTERVAL - ); + // Force flush remaining WAL data and shutdown + storage.shutdown()?; + println!(); + } + + println!("Benchmark completed!"); Ok(()) } + +fn main() -> std::io::Result<()> { + benchmark_storage_performance() +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs new file mode 100644 index 0000000..221ddeb --- /dev/null +++ b/src/storage/mod.rs @@ -0,0 +1,4 @@ +#[doc=""] +pub mod util; +#[doc = ""] +pub mod segmented_storage; diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs new file mode 100644 index 0000000..f8a6500 --- /dev/null +++ b/src/storage/segmented_storage.rs @@ -0,0 +1,502 @@ +use core::time; +use std::{ + collections::VecDeque, + fmt::format, + io::{BufWriter, Read, Seek, SeekFrom, Write}, + ops::Index, + panic::set_hook, + sync::{Arc, Mutex, RwLock}, + thread::JoinHandle, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; + +use crate::{ + config, + indexing::{ + shared::{decode_record, encode_record, Event, RECORD_SIZE}, + sparse, + }, + storage::{ + self, + util::{EnhancedSegmentMetadata, IndexBlock, StreamingConfig, WAL}, + }, +}; + +pub struct StreamingSegmentedStorage { + wal: Arc>, + segments: Arc>>, + flush_handle: Option>, + shutdown_signal: Arc>, + config: StreamingConfig, +} + +impl StreamingSegmentedStorage { + pub fn new(config: StreamingConfig) -> std::io::Result { + std::fs::create_dir_all(&config.segment_base_path)?; + + let storage = Self { + wal: Arc::new(RwLock::new(WAL { + events: VecDeque::new(), + total_bytes: 0, + oldest_timestamp_bound: None, + newest_timestamp_bound: None, + })), + + segments: Arc::new(RwLock::new(Vec::new())), + flush_handle: None, + shutdown_signal: Arc::new(Mutex::new(false)), + config, + }; + storage.load_existing_segments()?; + Ok(storage) + } + + pub fn start_background_flushing(&mut self) { + let wal_clone = Arc::clone(&self.wal); + let segments_clone = Arc::clone(&self.segments); + let shutdown_clone = Arc::clone(&self.shutdown_signal); + let config_clone = self.config.clone(); + + let handle = std::thread::spawn(move || { + Self::background_flush_loop(wal_clone, segments_clone, shutdown_clone, config_clone); + }); + + self.flush_handle = Some(handle); + } + + pub fn write(&self, event: Event) -> std::io::Result<()> { + let event_size = std::mem::size_of::(); + + { + let mut wal = self.wal.write().unwrap(); + + if wal.oldest_timestamp_bound.is_none() { + wal.oldest_timestamp_bound = Some(event.timestamp); + } + + wal.newest_timestamp_bound = Some(event.timestamp); + + wal.total_bytes += event_size; + + wal.events.push_back(event); + } + + if self.should_flush() { + self.flush_wal_to_segment()?; + } + + Ok(()) + } + + fn should_flush(&self) -> bool { + let wal = self.wal.read().unwrap(); + + wal.events.len() >= self.config.max_wal_events.try_into().unwrap() + || wal.total_bytes > self.config.max_wal_bytes + || wal.oldest_timestamp_bound.map_or(false, |oldest| { + let current_timestamp = Self::current_timestamp(); + + // Use saturating subtraction to avoid underflow if oldest > current_timestamp + current_timestamp.saturating_sub(oldest) + >= self.config.max_wal_age_seconds * 1_000_000_000 + }) + } + + fn current_timestamp() -> u64 { + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 + } + + fn flush_wal_to_segment(&self) -> std::io::Result<()> { + // Automatically extract events from the WAL. + + let events_to_flush = { + let mut wal = self.wal.write().unwrap(); + if wal.events.is_empty() { + return Ok(()); + } + + let events: Vec = wal.events.drain(..).collect(); + + wal.total_bytes = 0; + wal.oldest_timestamp_bound = None; + wal.newest_timestamp_bound = None; + events + }; + + let segment = self.create_segment_with_two_level_index(events_to_flush)?; + + { + let mut segments = self.segments.write().unwrap(); + segments.push(segment); + } + Ok(()) + } + + fn create_segment_with_two_level_index( + &self, + mut events: Vec, + ) -> std::io::Result { + events.sort_by_key(|e| e.timestamp); + + let segment_id = Self::generate_segment_id(); + + let data_path = format!("{}/segment-{}.log", self.config.segment_base_path, segment_id); + let index_path = format!("{}/segment-{}.log", self.config.segment_base_path, segment_id); + + let mut data_file = BufWriter::new(std::fs::File::create(&data_path)?); + let mut index_file = BufWriter::new(std::fs::File::create(&index_path)?); + + let mut index_directory = Vec::new(); + let mut current_block_entries = Vec::new(); + + let mut current_block_min_ts = None; + let mut current_block_max_ts = 0u64; + + let mut data_offset = 0u64; + + for (record_count, event) in events.iter().enumerate() { + let record_bytes = self.serialize_event_to_fixed_size(event); + data_file.write_all(&record_bytes); + + if record_count % self.config.sparse_interval == 0 { + let sparse_entry = (event.timestamp, data_offset); + + if current_block_min_ts.is_none() { + current_block_min_ts = Some(event.timestamp); + } + + current_block_max_ts = event.timestamp; + current_block_entries.push(sparse_entry); + + if current_block_entries.len() >= self.config.entries_per_index_block { + let block_metadata = self.flush_index_block( + &mut index_file, + ¤t_block_entries, + current_block_min_ts.unwrap(), + current_block_max_ts, + )?; + + index_directory.push(block_metadata); + + current_block_entries.clear(); + current_block_min_ts = None; + } + } + data_offset += record_bytes.len() as u64; + } + + if !current_block_entries.is_empty() { + let block_metadata = self.flush_index_block( + &mut index_file, + ¤t_block_entries, + current_block_min_ts.unwrap(), + current_block_max_ts, + )?; + + index_directory.push(block_metadata); + } + + data_file.flush()?; + index_file.flush()?; + + Ok(EnhancedSegmentMetadata { + start_timstamp: events.first().unwrap().timestamp, + end_timestamp: events.last().unwrap().timestamp, + data_path, + index_path, + record_count: events.len() as u64, + index_directory, + }) + } + + fn flush_index_block( + &self, + index_file: &mut BufWriter, + entries: &[(u64, u64)], + min_ts: u64, + max_ts: u64, + ) -> std::io::Result { + let file_offset = index_file.stream_position()?; + + for (timestamp, offset) in entries { + index_file.write_all(×tamp.to_ne_bytes())?; + index_file.write_all(&offset.to_be_bytes())?; + } + + Ok(IndexBlock { + min_timestamp: min_ts, + max_timestamp: max_ts, + file_offset, + entry_count: entries.len() as u32, + }) + } + + pub fn query(&self, start_timestamp: u64, end_timestamp: u64) -> std::io::Result> { + let mut results = Vec::new(); + + // First try to query the immediate WAL which has the fastest visibility. + + { + let wal = self.wal.read().unwrap(); + + for event in &wal.events { + if event.timestamp >= start_timestamp && event.timestamp <= end_timestamp { + results.push(event.clone()); + } + } + } + + // Then querying the relevant segment with a two level indexing + + { + let segments = self.segments.read().unwrap(); + for segment in segments.iter() { + if self.segment_overlaps(segment, start_timestamp, end_timestamp) { + let segment_results = + self.query_segment_two_level(segment, start_timestamp, end_timestamp)?; + results.extend(segment_results); + } + } + } + + results.sort_by_key(|e| e.timestamp); + + Ok(results) + } + + fn query_segment_two_level( + &self, + segment: &EnhancedSegmentMetadata, + start_timestamp: u64, + end_timestamp: u64, + ) -> std::io::Result> { + // If we have index directory, use two-level indexing + if !segment.index_directory.is_empty() { + // Step 1 : Find relevant index blocks using in-memory directory + let relevant_blocks: Vec<&IndexBlock> = segment + .index_directory + .iter() + .filter(|block| { + block.min_timestamp <= end_timestamp && block.max_timestamp >= start_timestamp + }) + .collect(); + + if relevant_blocks.is_empty() { + return Ok(Vec::new()); + } + + // Step 2 : Load only the relevant blocks from the disk + let sparse_entries = + self.load_relevant_index_blocks(&segment.index_path, &relevant_blocks)?; + + if sparse_entries.is_empty() { + return Ok(Vec::new()); + } + + // Step 3 : Binary search the loaded entries + let lb = sparse_entries.partition_point(|(ts, _)| *ts < start_timestamp); + let start_position = lb.saturating_sub(1); + let start_offset = sparse_entries[start_position].1; + + // Step 4 : Sequential Scan from the checkpoint + self.scan_data_from_offset( + &segment.data_path, + start_offset, + start_timestamp, + end_timestamp, + ) + } else { + // Fallback: Full scan of the data file (for segments without loaded index) + self.scan_data_from_offset(&segment.data_path, 0, start_timestamp, end_timestamp) + } + } + + fn load_relevant_index_blocks( + &self, + index_path: &str, + blocks: &[&IndexBlock], + ) -> std::io::Result> { + let mut index_file = std::fs::File::open(index_path)?; + let mut sparse_entries = Vec::new(); + + for block in blocks { + index_file.seek(SeekFrom::Start(block.file_offset))?; + + let block_size = block.entry_count as usize * 16; // 16 bytes per entry. + let mut buffer = vec![0u8; block_size]; + index_file.read_exact(&mut buffer)?; + + // Parse the entries. + + for chunk in buffer.chunks_exact(16) { + let timestamp = u64::from_be_bytes(chunk[0..8].try_into().unwrap()); + let offset = u64::from_be_bytes(chunk[8..16].try_into().unwrap()); + sparse_entries.push((timestamp, offset)); + } + } + + sparse_entries.sort_by_key(|&(ts, _)| ts); + Ok(sparse_entries) + } + + fn scan_data_from_offset( + &self, + data_path: &str, + start_offset: u64, + start_timestamp: u64, + end_timestamp: u64, + ) -> std::io::Result> { + let mut file = std::fs::File::open(data_path)?; + file.seek(SeekFrom::Start(start_offset))?; + + let mut results = Vec::new(); + let mut record = [0u8; RECORD_SIZE]; + + while file.read_exact(&mut record).is_ok() { + let (timestamp, subject, predicate, object, graph) = decode_record(&record); + + if timestamp > end_timestamp { + break; + } + + if timestamp >= start_timestamp { + results.push(Event { timestamp, subject, predicate, object, graph }); + } + } + Ok(results) + } + + fn segment_overlaps( + &self, + segment: &EnhancedSegmentMetadata, + start_ts: u64, + end_ts: u64, + ) -> bool { + segment.start_timstamp <= end_ts && segment.end_timestamp >= start_ts + } + + fn background_flush_loop( + wal: Arc>, + segments: Arc>>, + shutdown_signal: Arc>, + config: StreamingConfig, + ) { + while !*shutdown_signal.lock().unwrap() { + std::thread::sleep(Duration::from_secs(1)); + + // Check if flush is needed or not. + + let should_flush = { + let wal = wal.read().unwrap(); + + wal.events.len() >= config.max_wal_bytes + || wal.total_bytes >= config.max_wal_bytes + || wal.oldest_timestamp_bound.map_or(false, |oldest| { + let current_timestamp = Self::current_timestamp(); + (current_timestamp - oldest) >= config.max_wal_age_seconds * 1_000_000_000 + }) + }; + + if should_flush { + // TODO : Add better error handling here in this case + if let Err(e) = Self::flush_background(wal.clone(), segments.clone(), &config) { + eprintln!("Background flush failed: {}", e); + } + } + } + } + + fn flush_background( + wal: Arc>, + segments: Arc>>, + config: &StreamingConfig, + ) -> std::io::Result<()> { + Ok(()) + } + + fn load_existing_segments(&self) -> std::io::Result<()> { + use std::fs; + + let segment_dir = &self.config.segment_base_path; + if !fs::metadata(segment_dir).is_ok() { + return Ok(()); + } + + let entries = fs::read_dir(segment_dir)?; + let mut segments = Vec::new(); + + for entry in entries { + let entry = entry?; + let path = entry.path(); + + if let Some(filename) = path.file_name().and_then(|n| n.to_str()) { + if filename.starts_with("segment-") && filename.ends_with(".log") { + // Extract segment ID from filename + if let Some(id_str) = + filename.strip_prefix("segment-").and_then(|s| s.strip_suffix(".log")) + { + if let Ok(segment_id) = id_str.parse::() { + // Try to load the segment metadata by reading the data file + let data_path = format!("{}/segment-{}.log", segment_dir, segment_id); + let index_path = format!("{}/segment-{}.log", segment_dir, segment_id); + + if let Ok(_metadata) = fs::metadata(&data_path) { + // For now, create a basic segment metadata with wide timestamp bounds + // In a full implementation, we'd parse the index file to get exact bounds + let segment = EnhancedSegmentMetadata { + start_timstamp: 0, // Wide range to ensure overlap checks pass + end_timestamp: u64::MAX, + data_path, + index_path, + record_count: 0, // Will be determined during scanning + index_directory: Vec::new(), // Empty - will fall back to full scan + }; + segments.push(segment); + } + } + } + } + } + } + + // Sort segments by start timestamp + segments.sort_by_key(|s| s.start_timstamp); + + { + let mut self_segments = self.segments.write().unwrap(); + *self_segments = segments; + } + + Ok(()) + } + + pub fn shutdown(&mut self) -> std::io::Result<()> { + *self.shutdown_signal.lock().unwrap() = true; + + // Final Flush + + self.flush_wal_to_segment(); + + if let Some(handle) = self.flush_handle.take() { + handle.join().unwrap(); + } + Ok(()) + } + + fn serialize_event_to_fixed_size(&self, event: &Event) -> Vec { + let mut record = [0u8; RECORD_SIZE]; + encode_record( + &mut record, + event.timestamp, + event.subject, + event.predicate, + event.object, + event.graph, + ); + record.to_vec() + } + + fn generate_segment_id() -> u64 { + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 + } +} diff --git a/src/storage/util.rs b/src/storage/util.rs new file mode 100644 index 0000000..e19e0e8 --- /dev/null +++ b/src/storage/util.rs @@ -0,0 +1,57 @@ +use std::collections::{VecDeque, HashMap}; +use std::sync::{Arc, RwLock, Mutex}; +use std::thread::JoinHandle; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +use crate::indexing::shared::Event; + +#[derive(Debug)] +pub struct WAL { + pub events: VecDeque, + pub total_bytes: usize, + pub oldest_timestamp_bound: Option, + pub newest_timestamp_bound: Option, +} + +#[derive(Debug, Clone)] +pub struct IndexBlock { + pub min_timestamp: u64, + pub max_timestamp: u64, + pub file_offset: u64, + pub entry_count: u32, +} + +#[derive(Debug, Clone)] +pub struct EnhancedSegmentMetadata { + pub start_timstamp: u64, + pub end_timestamp: u64, + pub data_path: String, + pub index_path: String, + pub record_count: u64, + pub index_directory: Vec, +} + +#[derive(Clone)] +pub struct StreamingConfig { + pub max_wal_events: u64, + pub max_wal_age_seconds: u64, + pub max_wal_bytes: usize, + pub sparse_interval: usize, + pub entries_per_index_block: usize, + pub segment_base_path: String, +} + +impl Default for StreamingConfig { + fn default() -> Self { + Self { + max_wal_bytes: 10 * 1024 * 1024, + max_wal_age_seconds: 60, + max_wal_events: 100_000, + sparse_interval: 1000, + entries_per_index_block: 1024, + segment_base_path: "./data".to_string(), + } + } +} + + From 0b6e7fdef847060651af48207ff63553e065a09a Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Fri, 7 Nov 2025 18:50:29 +0100 Subject: [PATCH 08/19] Refactor and enhance RDF storage and indexing - Introduced a new core module for data structures and types, including Event and RDFEvent. - Updated the storage module to include a new indexing structure with dense and sparse indexing capabilities. - Implemented a user-friendly API for writing and querying RDF events in StreamingSegmentedStorage. - Added benchmarks for RDF segmented storage, including writing and reading performance tests. - Created a dictionary for encoding and decoding RDF URIs to numeric IDs, improving storage efficiency. - Enhanced the dense and sparse indexing mechanisms to support efficient querying of RDF events. - Added comprehensive tests for the dictionary and encoding/decoding functionality. --- Cargo.toml | 1 + src/{benchmarking => benchmarks}/benchmark.rs | 3 +- src/{benchmarking => benchmarks}/mod.rs | 1 - src/core/encoding.rs | 59 ++++++ src/core/mod.rs | 36 ++++ src/indexing/dictionary.rs | 88 --------- src/indexing/mod.rs | 9 +- src/indexing/shared.rs | 95 ++-------- src/lib.rs | 8 +- src/main.rs | 114 +++++++++++- src/{ => storage}/indexing/dense.rs | 2 +- src/storage/indexing/dictionary.rs | 171 ++++++++++++++++++ src/{ => storage}/indexing/sparse.rs | 8 +- src/storage/mod.rs | 7 +- src/storage/segmented_storage.rs | 40 +++- src/storage/util.rs | 2 +- 16 files changed, 449 insertions(+), 195 deletions(-) rename src/{benchmarking => benchmarks}/benchmark.rs (97%) rename src/{benchmarking => benchmarks}/mod.rs (64%) create mode 100644 src/core/encoding.rs create mode 100644 src/core/mod.rs delete mode 100644 src/indexing/dictionary.rs rename src/{ => storage}/indexing/dense.rs (97%) create mode 100644 src/storage/indexing/dictionary.rs rename src/{ => storage}/indexing/sparse.rs (97%) diff --git a/Cargo.toml b/Cargo.toml index 1ff8565..4f9485e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ readme = "README.md" [dependencies] regex = "1.0" serde = { version = "1.0", features = ["derive"] } +bincode = "1.0" [dev-dependencies] diff --git a/src/benchmarking/benchmark.rs b/src/benchmarks/benchmark.rs similarity index 97% rename from src/benchmarking/benchmark.rs rename to src/benchmarks/benchmark.rs index 92b7f69..d64ec37 100644 --- a/src/benchmarking/benchmark.rs +++ b/src/benchmarks/benchmark.rs @@ -1,4 +1,5 @@ -use crate::indexing::{dense, shared::LogWriter, sparse}; +use crate::storage::indexing::{dense, sparse}; +use crate::indexing::shared::LogWriter; use std::fs; use std::time::Instant; diff --git a/src/benchmarking/mod.rs b/src/benchmarks/mod.rs similarity index 64% rename from src/benchmarking/mod.rs rename to src/benchmarks/mod.rs index 9820c42..fe2015f 100644 --- a/src/benchmarking/mod.rs +++ b/src/benchmarks/mod.rs @@ -1,2 +1 @@ -#[doc=""] pub mod benchmark; \ No newline at end of file diff --git a/src/core/encoding.rs b/src/core/encoding.rs new file mode 100644 index 0000000..faf8a9a --- /dev/null +++ b/src/core/encoding.rs @@ -0,0 +1,59 @@ +//! Binary encoding/decoding utilities for RDF events + +use crate::storage::indexing::dictionary::Dictionary; +use crate::core::{Event, RDFEvent}; + +/// Size of a single encoded record in bytes +pub const RECORD_SIZE: usize = 40; + +/// Encode an RDF event record into a byte buffer +pub fn encode_record( + buffer: &mut [u8; RECORD_SIZE], + timestamp: u64, + subject: u64, + predicate: u64, + object: u64, + graph: u64, +) { + buffer[0..8].copy_from_slice(×tamp.to_le_bytes()); + buffer[8..16].copy_from_slice(&subject.to_le_bytes()); + buffer[16..24].copy_from_slice(&predicate.to_le_bytes()); + buffer[24..32].copy_from_slice(&object.to_le_bytes()); + buffer[32..40].copy_from_slice(&graph.to_le_bytes()); +} + +/// Decode a byte buffer into an RDF event record +pub fn decode_record(buffer: &[u8; RECORD_SIZE]) -> (u64, u64, u64, u64, u64) { + let timestamp = u64::from_le_bytes(buffer[0..8].try_into().unwrap()); + let subject = u64::from_le_bytes(buffer[8..16].try_into().unwrap()); + let predicate = u64::from_le_bytes(buffer[16..24].try_into().unwrap()); + let object = u64::from_le_bytes(buffer[24..32].try_into().unwrap()); + let graph = u64::from_le_bytes(buffer[32..40].try_into().unwrap()); + (timestamp, subject, predicate, object, graph) +} + +impl RDFEvent { + /// Encode this RDF event to an internal Event using a dictionary + pub fn encode(&self, dict: &mut Dictionary) -> Event { + Event { + timestamp: self.timestamp, + subject: dict.encode(&self.subject), + predicate: dict.encode(&self.predicate), + object: dict.encode(&self.object), + graph: dict.encode(&self.graph), + } + } +} + +impl Event { + /// Decode this internal Event to an RDFEvent using a dictionary + pub fn decode(&self, dict: &Dictionary) -> RDFEvent { + RDFEvent { + timestamp: self.timestamp, + subject: dict.decode(self.subject).unwrap_or("UNKNOWN").to_string(), + predicate: dict.decode(self.predicate).unwrap_or("UNKNOWN").to_string(), + object: dict.decode(self.object).unwrap_or("UNKNOWN").to_string(), + graph: dict.decode(self.graph).unwrap_or("UNKNOWN").to_string(), + } + } +} \ No newline at end of file diff --git a/src/core/mod.rs b/src/core/mod.rs new file mode 100644 index 0000000..42f5529 --- /dev/null +++ b/src/core/mod.rs @@ -0,0 +1,36 @@ +//! Core data structures and types for Janus RDF Stream Processing Engine + +/// Internal storage event with encoded IDs +#[derive(Clone, Debug)] +pub struct Event { + pub timestamp: u64, + pub subject: u64, + pub predicate: u64, + pub object: u64, + pub graph: u64, +} + +/// User-facing RDF event with URI strings +#[derive(Debug, Clone)] +pub struct RDFEvent { + pub timestamp: u64, + pub subject: String, + pub predicate: String, + pub object: String, + pub graph: String, +} + +impl RDFEvent { + pub fn new(timestamp: u64, subject: &str, predicate: &str, object: &str, graph: &str) -> Self { + Self { + timestamp, + subject: subject.to_string(), + predicate: predicate.to_string(), + object: object.to_string(), + graph: graph.to_string(), + } + } +} + +pub mod encoding; +pub use encoding::*; \ No newline at end of file diff --git a/src/indexing/dictionary.rs b/src/indexing/dictionary.rs deleted file mode 100644 index e5605b2..0000000 --- a/src/indexing/dictionary.rs +++ /dev/null @@ -1,88 +0,0 @@ -use std::collections::HashMap; -use std::fs::File; -use std::io::{Read, Write}; -use std::path::Path; - -#[derive(Debug)] -pub struct Dictionary { - uri_to_id: HashMap, - id_to_uri: Vec, - next_id: u64, -} - -impl Dictionary { - pub fn new() -> Self { - Self { uri_to_id: HashMap::new(), id_to_uri: Vec::new(), next_id: 0 } - } - - pub fn fetch_id(&mut self, uri: &str) -> u64 { - if let Some(&id) = self.uri_to_id.get(uri) { - id - } else { - let id = self.next_id; - self.uri_to_id.insert(uri.to_string(), id); - self.id_to_uri.push(uri.to_string()); - self.next_id += 1; - id - } - } - - pub fn fetch_uri(&self, id: u64) -> Option<&str> { - self.id_to_uri.get(id as usize).map(|s| s.as_str()) - } - - pub fn len(&self) -> usize { - self.uri_to_id.len() - } - - pub fn is_empty(&self) -> bool { - self.uri_to_id.is_empty() - } - - pub fn save_to_file(&self, path: &Path) -> std::io::Result<()> { - let mut file = File::create(path)?; - file.write_all(&(self.id_to_uri.len() as u64).to_be_bytes())?; - - for uri in &self.id_to_uri { - let uri_bytes = uri.as_bytes(); - file.write_all(&(uri_bytes.len() as u32).to_be_bytes())?; - file.write_all(uri_bytes)?; - } - Ok(()) - } - - pub fn load_from_file(path: &Path) -> std::io::Result { - let mut file = File::open(path)?; - let mut uri_to_id = HashMap::new(); - let mut id_to_uri = Vec::new(); - - // Reading the number of entries - let mut count_bytes = [0u8; 8]; - file.read_exact(&mut count_bytes)?; - let count = u64::from_be_bytes(count_bytes); - - // Reading each URI Entry - - for id in 0..count { - let mut len_bytes = [0u8; 4]; - file.read_exact(&mut len_bytes)?; - - let length = u32::from_be_bytes(len_bytes) as usize; - let mut uri_bytes = vec![0u8; length]; - file.read_exact(&mut uri_bytes)?; - let uri = String::from_utf8(uri_bytes) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; - - uri_to_id.insert(uri.clone(), id); - id_to_uri.push(uri); - } - - Ok(Self { uri_to_id, id_to_uri, next_id: count }) - } -} - -impl Default for Dictionary { - fn default() -> Self { - Self::new() - } -} diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs index 6bda115..3bc786f 100644 --- a/src/indexing/mod.rs +++ b/src/indexing/mod.rs @@ -1,9 +1,4 @@ -#[doc = ""] -pub mod dense; -#[doc = ""] -pub mod dictionary; -#[doc = ""] +//! Legacy indexing utilities - most functionality moved to storage::indexing + pub mod shared; -#[doc = ""] -pub mod sparse; diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs index dd344bc..bbc875f 100644 --- a/src/indexing/shared.rs +++ b/src/indexing/shared.rs @@ -1,56 +1,26 @@ -use crate::indexing::dictionary::Dictionary; +//! Legacy storage utilities - to be moved to storage module + use std::fs::File; use std::io::Write; +use crate::core::encoding::{encode_record, RECORD_SIZE}; -#[doc = ""] -pub const RECORD_SIZE: usize = 40; - -#[doc = ""] -pub fn encode_record( - buffer: &mut [u8; RECORD_SIZE], - timestamp: u64, - subject: u64, - predicate: u64, - object: u64, - graph: u64, -) { - buffer[0..8].copy_from_slice(×tamp.to_le_bytes()); - buffer[8..16].copy_from_slice(&subject.to_le_bytes()); - buffer[16..24].copy_from_slice(&predicate.to_le_bytes()); - buffer[24..32].copy_from_slice(&object.to_le_bytes()); - buffer[32..40].copy_from_slice(&graph.to_le_bytes()); -} - -#[doc = ""] -pub fn decode_record(buffer: &[u8; RECORD_SIZE]) -> (u64, u64, u64, u64, u64) { - let timestamp = u64::from_le_bytes(buffer[0..8].try_into().unwrap()); - let subject = u64::from_le_bytes(buffer[8..16].try_into().unwrap()); - let predicate = u64::from_le_bytes(buffer[16..24].try_into().unwrap()); - let object = u64::from_le_bytes(buffer[24..32].try_into().unwrap()); - let graph = u64::from_le_bytes(buffer[32..40].try_into().unwrap()); - (timestamp, subject, predicate, object, graph) -} - -#[doc = ""] +/// Log writer for appending encoded records to a file pub struct LogWriter { log_file: File, record_count: u64, } -#[doc = ""] impl LogWriter { - #[doc = ""] + /// Create a new log writer for the given file path pub fn create(path: &str) -> std::io::Result { - let log_file = match File::create(path) { - Ok(file) => file, - Err(error) => { - return Err(error); - } - }; - Ok(Self { log_file, record_count: 0 }) + let log_file = File::create(path)?; + Ok(Self { + log_file, + record_count: 0 + }) } - #[doc = ""] + /// Append an encoded record to the log file pub fn append_record( &mut self, timestamp: u64, @@ -66,49 +36,22 @@ impl LogWriter { Ok(()) } - #[doc = ""] + /// Get the current record count pub fn record_count(&self) -> u64 { self.record_count } - #[doc = ""] + /// Flush the log file pub fn flush(&mut self) -> std::io::Result<()> { self.log_file.flush() } } -#[derive(Clone, Debug)] -#[doc = ""] -pub struct Event { - #[doc = ""] - pub timestamp: u64, - #[doc = ""] - pub subject: u64, - #[doc = ""] - pub predicate: u64, - #[doc = ""] - pub object: u64, - #[doc = ""] - pub graph: u64, -} -#[derive(Debug, Clone)] -pub struct ResolvedEvent { - pub timestamp: u64, - pub subject: String, - pub predicate: String, - pub object: String, - pub graph: String, -} -impl Event { - pub fn resolve(&self, dict: &Dictionary) -> ResolvedEvent { - ResolvedEvent { - timestamp: self.timestamp, - subject: dict.fetch_uri(self.subject).unwrap_or("UNKNOWN").to_string(), - predicate: dict.fetch_uri(self.predicate).unwrap_or("UNKNOWN").to_string(), - object: dict.fetch_uri(self.object).unwrap_or("UNKNOWN").to_string(), - graph: dict.fetch_uri(self.graph).unwrap_or("UNKNOWN").to_string(), - } - } -} + + + + + + diff --git a/src/lib.rs b/src/lib.rs index a1c6f88..c1e58f7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,10 +28,8 @@ #![warn(missing_docs)] #![warn(clippy::all)] -/// Core module containing the main engine logic -pub mod core { - //! Core functionality for the Janus engine -} +/// Core data structures and types +pub mod core; /// Module for handling RDF stores pub mod store { @@ -60,7 +58,7 @@ pub mod indexing; pub mod parsing; /// Benchmarking utilities -pub mod benchmarking { +pub mod benchmarks { mod benchmark; } diff --git a/src/main.rs b/src/main.rs index 5247965..934bbbf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,18 +2,118 @@ //! //! This is the main entry point for the Janus command-line interface. -use janus::indexing::shared::Event; -use janus::indexing::{dense, shared::LogWriter, sparse}; +use janus::core::Event; +use janus::storage::indexing::{dense, sparse}; +use janus::indexing::shared::LogWriter; use janus::storage::segmented_storage::StreamingSegmentedStorage; use janus::storage::util::StreamingConfig; use std::fs; -use std::time::{Instant, SystemTime, UNIX_EPOCH}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; const DATA_DIR: &str = "data/benchmark"; const LOG_FILE: &str = "data/benchmark/log.dat"; const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx"; const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx"; const SPARSE_INTERVAL: usize = 1000; +const SEGMENT_BASE_PATH: &str = "data/rdf_benchmark"; + +fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { + println!("🚀 RDF Segmented Storage Benchmark"); + println!("=================================="); + + // Clean up and create directories + let _ = fs::remove_dir_all(SEGMENT_BASE_PATH); + fs::create_dir_all(SEGMENT_BASE_PATH)?; + + // Configure storage + let config = StreamingConfig { + max_wal_events: 10_000, + max_wal_age_seconds: 60, + max_wal_bytes: 1_000_000, + sparse_interval: 1000, + entries_per_index_block: 100, + segment_base_path: SEGMENT_BASE_PATH.to_string(), + }; + + let mut storage = StreamingSegmentedStorage::new(config)?; + storage.start_background_flushing(); + + // Benchmark writing 1 million RDF events + println!("\n📝 Writing 1,000,000 RDF events..."); + let start_time = Instant::now(); + let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64; + + for i in 0..1_000_000u64 { + let timestamp = base_timestamp + i * 1_000_000; // 1ms intervals + let subject = format!("http://example.org/person/person_{}", i % 10000); + let predicate = match i % 10 { + 0..=3 => "http://example.org/knows", + 4..=6 => "http://example.org/worksAt", + 7..=8 => "http://example.org/livesIn", + _ => "http://example.org/hasAge", + }; + let object = match i % 10 { + 0..=3 => format!("http://example.org/person/person_{}", (i + 1) % 10000), + 4..=6 => format!("http://example.org/organization/org_{}", i % 1000), + 7..=8 => format!("http://example.org/location/city_{}", i % 100), + _ => format!("\"{}\"^^http://www.w3.org/2001/XMLSchema#integer", 20 + (i % 60)), + }; + let graph = format!("http://example.org/graph/graph_{}", i % 100); + + storage.write_rdf(timestamp, &subject, predicate, &object, &graph)?; + + if i > 0 && i % 100_000 == 0 { + println!(" ✓ Written {} events", i); + } + } + + let write_duration = start_time.elapsed(); + let write_throughput = 1_000_000.0 / write_duration.as_secs_f64(); + + println!("✅ Write completed!"); + println!(" Duration: {:.3} seconds", write_duration.as_secs_f64()); + println!(" Throughput: {:.0} events/sec", write_throughput); + + // Wait a bit for background flushing + std::thread::sleep(Duration::from_secs(2)); + + // Benchmark reading different amounts of data + println!("\n🔍 Reading Benchmarks"); + println!("===================="); + + let read_sizes = vec![100, 1_000, 10_000, 100_000, 1_000_000]; + + for &size in &read_sizes { + // Query the first 'size' events + let query_start_ts = base_timestamp; + let query_end_ts = base_timestamp + (size as u64 * 1_000_000); + + println!("\n📖 Querying {} events...", size); + let start_time = Instant::now(); + + let results = storage.query_rdf(query_start_ts, query_end_ts)?; + + let query_duration = start_time.elapsed(); + let read_throughput = results.len() as f64 / query_duration.as_secs_f64(); + + println!(" Results found: {}", results.len()); + println!(" Query time: {:.3} ms", query_duration.as_millis()); + println!(" Read throughput: {:.0} events/sec", read_throughput); + + // Show a sample result for verification + if !results.is_empty() { + let sample = &results[0]; + println!(" Sample result: {} {} {} in {} at {}", + sample.subject, sample.predicate, sample.object, sample.graph, sample.timestamp); + } + } + + // Shutdown storage + storage.shutdown()?; + + println!("\n🎉 Benchmark completed successfully!"); + Ok(()) +} fn setup_data(number_records: u64) -> std::io::Result<()> { let _ = fs::remove_dir_all(DATA_DIR); @@ -249,5 +349,13 @@ fn benchmark_storage_performance() -> std::io::Result<()> { } fn main() -> std::io::Result<()> { + // Run the new RDF benchmark + benchmark_segmented_storage_rdf()?; + + println!("\n{}", "=".repeat(50)); + println!("Running legacy benchmark for comparison..."); + println!("{}", "=".repeat(50)); + + // Also run the old benchmark for comparison benchmark_storage_performance() } diff --git a/src/indexing/dense.rs b/src/storage/indexing/dense.rs similarity index 97% rename from src/indexing/dense.rs rename to src/storage/indexing/dense.rs index 37cb3f9..8f7eb35 100644 --- a/src/indexing/dense.rs +++ b/src/storage/indexing/dense.rs @@ -1,4 +1,4 @@ -use crate::indexing::shared::{decode_record, Event, RECORD_SIZE}; +use crate::core::{encoding::{decode_record, RECORD_SIZE}, Event}; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; #[doc = ""] diff --git a/src/storage/indexing/dictionary.rs b/src/storage/indexing/dictionary.rs new file mode 100644 index 0000000..4729756 --- /dev/null +++ b/src/storage/indexing/dictionary.rs @@ -0,0 +1,171 @@ +use std::collections::HashMap; +use std::fs::File; +use std::io::{Read, Write}; +use std::path::Path; + +use bincode; +use serde::{Deserialize, Serialize}; + +use crate::core::Event; + +#[derive(Debug, Serialize, Deserialize)] +pub struct Dictionary { + pub string_to_id: HashMap, + pub id_to_uri: HashMap, + pub next_id: u64, +} + +impl Dictionary { + pub fn new() -> Self { + Dictionary { string_to_id: HashMap::new(), id_to_uri: HashMap::new(), next_id: 0 } + } + + pub fn encode(&mut self, value: &str) -> u64 { + if let Some(&id) = self.string_to_id.get(value) { + id + } else { + let id = self.next_id; + self.string_to_id.insert(value.to_string(), id); + self.id_to_uri.insert(id, value.to_string()); + self.next_id += 1; + id + } + } + + pub fn decode(&self, id: u64) -> Option<&str> { + self.id_to_uri.get(&id).map(|s| s.as_str()) + } + + pub fn save_to_file(&self, path: &Path) -> std::io::Result<()> { + let encoded = bincode::serialize(self) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + let mut file = File::create(path)?; + file.write_all(&encoded)?; + Ok(()) + } + + pub fn load_from_file(path: &Path) -> std::io::Result { + let mut file = File::open(path)?; + let mut buffer = Vec::new(); + file.read_to_end(&mut buffer)?; + let dict: Dictionary = bincode::deserialize(&buffer) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + Ok(dict) + } + + pub fn decode_graph(&self, event: &Event) -> String { + let subject = self.decode(event.subject).unwrap_or("unknown"); + let predicate = self.decode(event.predicate).unwrap_or("unknown"); + let object = self.decode(event.object).unwrap_or("unknown"); + let graph = self.decode(event.graph).unwrap_or("unknown"); + + format!( + "<(<{}>, <{}>, <{}>, <{}>), {}>", + subject, predicate, object, graph, event.timestamp + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::indexing::shared::Event; + + #[test] + fn test_dictionary_encoding_decoding() { + let mut dict = Dictionary::new(); + + // Encode some RDF terms + let subject_id = dict.encode("http://example.org/person/Alice"); + let predicate_id = dict.encode("http://example.org/knows"); + let object_id = dict.encode("http://example.org/person/Bob"); + let graph_id = dict.encode("http://example.org/graph1"); + + println!("Encoded IDs:"); + println!("Subject: {} -> {}", "http://example.org/person/Alice", subject_id); + println!("Predicate: {} -> {}", "http://example.org/knows", predicate_id); + println!("Object: {} -> {}", "http://example.org/person/Bob", object_id); + println!("Graph: {} -> {}", "http://example.org/graph1", graph_id); + + // Create an event + let event = Event { + timestamp: 1234567890, + subject: subject_id, + predicate: predicate_id, + object: object_id, + graph: graph_id, + }; + + // Decode the event + let decoded = dict.decode_graph(&event); + println!("\nDecoded event: {}", decoded); + + // Verify individual decodings + assert_eq!(dict.decode(subject_id), Some("http://example.org/person/Alice")); + assert_eq!(dict.decode(predicate_id), Some("http://example.org/knows")); + assert_eq!(dict.decode(object_id), Some("http://example.org/person/Bob")); + assert_eq!(dict.decode(graph_id), Some("http://example.org/graph1")); + + // Test that the decoded string contains the expected format + assert!(decoded.contains("http://example.org/person/Alice")); + assert!(decoded.contains("http://example.org/knows")); + assert!(decoded.contains("http://example.org/person/Bob")); + assert!(decoded.contains("http://example.org/graph1")); + assert!(decoded.contains("1234567890")); + } + + #[test] + fn test_clean_rdf_api() { + use crate::indexing::shared::RDFEvent; + + let mut dict = Dictionary::new(); + + // Test the clean API - user provides URIs directly + let rdf_event = RDFEvent::new( + 1234567890, + "http://example.org/person/Alice", + "http://example.org/knows", + "http://example.org/person/Bob", + "http://example.org/graph1", + ); + + // Encoding happens internally + let encoded_event = rdf_event.encode(&mut dict); + + // Decoding happens internally + let decoded_event = encoded_event.decode(&dict); + + // Verify the round-trip works + assert_eq!(decoded_event.subject, "http://example.org/person/Alice"); + assert_eq!(decoded_event.predicate, "http://example.org/knows"); + assert_eq!(decoded_event.object, "http://example.org/person/Bob"); + assert_eq!(decoded_event.graph, "http://example.org/graph1"); + assert_eq!(decoded_event.timestamp, 1234567890); + + println!("✅ Clean API test passed!"); + println!( + "Original: {} {} {} in {} at timestamp {}", + rdf_event.subject, + rdf_event.predicate, + rdf_event.object, + rdf_event.graph, + rdf_event.timestamp + ); + println!( + "Encoded IDs: {} {} {} {} at timestamp {}", + encoded_event.subject, + encoded_event.predicate, + encoded_event.object, + encoded_event.graph, + encoded_event.timestamp + ); + println!( + "Decoded: {} {} {} in {} at timestamp {}", + decoded_event.subject, + decoded_event.predicate, + decoded_event.object, + decoded_event.graph, + decoded_event.timestamp + ); + } +} diff --git a/src/indexing/sparse.rs b/src/storage/indexing/sparse.rs similarity index 97% rename from src/indexing/sparse.rs rename to src/storage/indexing/sparse.rs index 70a8ad1..79cc08d 100644 --- a/src/indexing/sparse.rs +++ b/src/storage/indexing/sparse.rs @@ -1,5 +1,5 @@ -use crate::indexing::dictionary::Dictionary; -use crate::indexing::shared::{decode_record, Event, ResolvedEvent, RECORD_SIZE}; +use crate::storage::indexing::dictionary::Dictionary; +use crate::core::{encoding::{decode_record, RECORD_SIZE}, Event, RDFEvent}; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; use std::path::Path; @@ -191,9 +191,9 @@ impl SparseReader { dict: &Dictionary, timestamp_start_bound: u64, timestamp_end_bound: u64, - ) -> std::io::Result> { + ) -> std::io::Result> { let events = self.query(log_path, timestamp_start_bound, timestamp_end_bound)?; - Ok(events.into_iter().map(|e| e.resolve(dict)).collect()) + Ok(events.into_iter().map(|e| e.decode(dict)).collect()) } /// Opens a sparse index file and loads it into memory. diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 221ddeb..3b55564 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1,4 +1,7 @@ -#[doc=""] pub mod util; -#[doc = ""] pub mod segmented_storage; +pub mod indexing { + pub mod dense; + pub mod dictionary; + pub mod sparse; +} diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs index f8a6500..f3d956d 100644 --- a/src/storage/segmented_storage.rs +++ b/src/storage/segmented_storage.rs @@ -11,13 +11,11 @@ use std::{ }; use crate::{ - config, - indexing::{ - shared::{decode_record, encode_record, Event, RECORD_SIZE}, - sparse, - }, + core::{encoding::{decode_record, encode_record, RECORD_SIZE}, Event, RDFEvent}, storage::{ - self, + indexing::{ + dictionary::Dictionary, + }, util::{EnhancedSegmentMetadata, IndexBlock, StreamingConfig, WAL}, }, }; @@ -25,6 +23,7 @@ use crate::{ pub struct StreamingSegmentedStorage { wal: Arc>, segments: Arc>>, + dictionary: Arc>, flush_handle: Option>, shutdown_signal: Arc>, config: StreamingConfig, @@ -43,6 +42,7 @@ impl StreamingSegmentedStorage { })), segments: Arc::new(RwLock::new(Vec::new())), + dictionary: Arc::new(RwLock::new(Dictionary::new())), flush_handle: None, shutdown_signal: Arc::new(Mutex::new(false)), config, @@ -88,6 +88,23 @@ impl StreamingSegmentedStorage { Ok(()) } + /// User-friendly API: Write RDF data directly with URI strings + pub fn write_rdf( + &self, + timestamp: u64, + subject: &str, + predicate: &str, + object: &str, + graph: &str, + ) -> std::io::Result<()> { + let rdf_event = RDFEvent::new(timestamp, subject, predicate, object, graph); + let encoded_event = { + let mut dict = self.dictionary.write().unwrap(); + rdf_event.encode(&mut dict) + }; + self.write(encoded_event) + } + fn should_flush(&self) -> bool { let wal = self.wal.read().unwrap(); @@ -264,6 +281,17 @@ impl StreamingSegmentedStorage { Ok(results) } + /// User-friendly API: Query and return RDF events with URI strings + pub fn query_rdf( + &self, + start_timestamp: u64, + end_timestamp: u64, + ) -> std::io::Result> { + let encoded_events = self.query(start_timestamp, end_timestamp)?; + let dict = self.dictionary.read().unwrap(); + Ok(encoded_events.into_iter().map(|event| event.decode(&dict)).collect()) + } + fn query_segment_two_level( &self, segment: &EnhancedSegmentMetadata, diff --git a/src/storage/util.rs b/src/storage/util.rs index e19e0e8..aadf9b9 100644 --- a/src/storage/util.rs +++ b/src/storage/util.rs @@ -3,7 +3,7 @@ use std::sync::{Arc, RwLock, Mutex}; use std::thread::JoinHandle; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use crate::indexing::shared::Event; +use crate::core::Event; #[derive(Debug)] pub struct WAL { From bd918d2f4efbde8da6a0504db5cde788d0535f3b Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Fri, 7 Nov 2025 18:50:36 +0100 Subject: [PATCH 09/19] Refactor code for improved organization and readability; add benchmarking for dense and sparse indexing --- src/benchmarking/benchmark.rs | 129 ++++++++++++++ src/benchmarks/mod.rs | 2 +- src/core/encoding.rs | 4 +- src/core/mod.rs | 2 +- src/indexing/mod.rs | 1 - src/indexing/shared.rs | 16 +- src/indexing/sparse.rs | 280 +++++++++++++++++++++++++++++++ src/main.rs | 20 ++- src/storage/indexing/dense.rs | 5 +- src/storage/indexing/sparse.rs | 5 +- src/storage/mod.rs | 2 +- src/storage/segmented_storage.rs | 9 +- src/storage/util.rs | 6 +- 13 files changed, 442 insertions(+), 39 deletions(-) create mode 100644 src/benchmarking/benchmark.rs create mode 100644 src/indexing/sparse.rs diff --git a/src/benchmarking/benchmark.rs b/src/benchmarking/benchmark.rs new file mode 100644 index 0000000..a5be448 --- /dev/null +++ b/src/benchmarking/benchmark.rs @@ -0,0 +1,129 @@ +use crate::indexing::shared::LogWriter; +use crate::storage::indexing::{dense, sparse}; +use std::fs; +use std::time::Instant; + +const DATA_DIR: &str = "data/benchmark"; +const LOG_FILE: &str = "data/benchmark/log.dat"; +const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx"; +const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx"; +const SPARSE_INTERVAL: usize = 1000; + +fn setup_data(number_records: u64) -> std::io::Result<()> { + let _ = fs::remove_dir_all(DATA_DIR); + fs::create_dir_all(DATA_DIR)?; + + let mut writer = LogWriter::create(LOG_FILE)?; + + for i in 0..number_records { + let timestamp = i; + let subject = (i % 1000) as u64; + let predicate = (i % 500) as u64; + let object = (i % 2000) as u64; + let graph: u64 = 1; + writer.append_record(timestamp, subject, predicate, object, graph)?; + } + + writer.flush()?; + + println!("Generated log file with {} records", writer.record_count()); + + Ok(()) +} + +fn benchmark_indexing() -> std::io::Result<()> { + println!("Indexing Benchmark"); + + let start = Instant::now(); + dense::build_dense_index(LOG_FILE, DENSE_INDEX_FILE)?; + let dense_time = start.elapsed(); + println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0); + + let start = Instant::now(); + sparse::build_sparse_index(LOG_FILE, SPARSE_INDEX_FILE, &SPARSE_INTERVAL)?; + let sparse_time = start.elapsed(); + println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0); + + let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; + let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?; + println!( + "\n Dense Index Size: {} MB", + dense_reader.index_size_bytes() as f64 / 1_000_000.0 + ); + + println!( + "\n Sparse Index Size: {} MB", + sparse_reader.index_size_bytes() as f64 / 1_000_000.0 + ); + Ok(()) +} + +fn benchmark_queries() -> std::io::Result<()> { + println!("Query Benchmark"); + let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; + let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?; + + let query_ranges = vec![ + (0u64, 100u64, "100 records"), + (5000u64, 5100u64, "100 records (mid-range)"), + (0u64, 10000u64, "10K records"), + (0u64, 100000u64, "100K records"), + (0u64, 1000000u64, "1M records"), + ]; + + for (timestamp_start, timestamp_end, description) in query_ranges { + println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end); + + let start = Instant::now(); + let dense_results = dense_reader.query(LOG_FILE, timestamp_start, timestamp_end)?; + let dense_time = start.elapsed(); + + let start = Instant::now(); + let sparse_results = sparse_reader.query(LOG_FILE, timestamp_start, timestamp_end)?; + let sparse_time = start.elapsed(); + + println!( + " Dense Index Query Time: {:.3} ms, Results: {}", + dense_time.as_secs_f64() * 1000.0, + dense_results.len() + ); + + println!( + " Sparse Index Query Time: {:.3} ms, Results: {}", + sparse_time.as_secs_f64() * 1000.0, + sparse_results.len() + ); + + let speedup = sparse_time.as_secs_f64() / dense_time.as_secs_f64(); + + if speedup > 1.0 { + println!(" Sparse index is {:.2} times faster than Dense index", speedup); + } else { + println!(" Dense index is {:.2} times faster than Sparse index", 1.0 / speedup); + } + + assert_eq!( + dense_results.len(), + sparse_results.len(), + "Mismatch in result counts between Dense and Sparse index queries" + ); + } + Ok(()) +} + +fn main() -> std::io::Result<()> { + println!("RDF Indexing Benchmark : Dense vs Sparse"); + println!("Setting up data..."); + let number_of_records = 1_000_000u64; + setup_data(number_of_records)?; + + benchmark_indexing()?; + benchmark_queries()?; + + println!( + "\n=== Summary ===\nSparse interval: {}\nUse this data to decide \ + which approach suits your use case best.", + SPARSE_INTERVAL + ); + Ok(()) +} diff --git a/src/benchmarks/mod.rs b/src/benchmarks/mod.rs index fe2015f..2543f50 100644 --- a/src/benchmarks/mod.rs +++ b/src/benchmarks/mod.rs @@ -1 +1 @@ -pub mod benchmark; \ No newline at end of file +pub mod benchmark; diff --git a/src/core/encoding.rs b/src/core/encoding.rs index faf8a9a..5c0e904 100644 --- a/src/core/encoding.rs +++ b/src/core/encoding.rs @@ -1,7 +1,7 @@ //! Binary encoding/decoding utilities for RDF events -use crate::storage::indexing::dictionary::Dictionary; use crate::core::{Event, RDFEvent}; +use crate::storage::indexing::dictionary::Dictionary; /// Size of a single encoded record in bytes pub const RECORD_SIZE: usize = 40; @@ -56,4 +56,4 @@ impl Event { graph: dict.decode(self.graph).unwrap_or("UNKNOWN").to_string(), } } -} \ No newline at end of file +} diff --git a/src/core/mod.rs b/src/core/mod.rs index 42f5529..5fbfc41 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -33,4 +33,4 @@ impl RDFEvent { } pub mod encoding; -pub use encoding::*; \ No newline at end of file +pub use encoding::*; diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs index 3bc786f..706bb40 100644 --- a/src/indexing/mod.rs +++ b/src/indexing/mod.rs @@ -1,4 +1,3 @@ //! Legacy indexing utilities - most functionality moved to storage::indexing pub mod shared; - diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs index bbc875f..f0891dd 100644 --- a/src/indexing/shared.rs +++ b/src/indexing/shared.rs @@ -1,8 +1,8 @@ //! Legacy storage utilities - to be moved to storage module +use crate::core::encoding::{encode_record, RECORD_SIZE}; use std::fs::File; use std::io::Write; -use crate::core::encoding::{encode_record, RECORD_SIZE}; /// Log writer for appending encoded records to a file pub struct LogWriter { @@ -14,10 +14,7 @@ impl LogWriter { /// Create a new log writer for the given file path pub fn create(path: &str) -> std::io::Result { let log_file = File::create(path)?; - Ok(Self { - log_file, - record_count: 0 - }) + Ok(Self { log_file, record_count: 0 }) } /// Append an encoded record to the log file @@ -46,12 +43,3 @@ impl LogWriter { self.log_file.flush() } } - - - - - - - - - diff --git a/src/indexing/sparse.rs b/src/indexing/sparse.rs new file mode 100644 index 0000000..cfb8ab7 --- /dev/null +++ b/src/indexing/sparse.rs @@ -0,0 +1,280 @@ +use crate::indexing::dictionary::Dictionary; +use crate::indexing::shared::{decode_record, Event, RDFEvent, RECORD_SIZE}; +use std::fs::File; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::path::Path; + +/// Builder for creating sparse indexes that store only periodic entries. +/// +/// A sparse index reduces storage space by indexing only every Nth record, +/// trading some query precision for significant space savings. +#[doc = ""] +pub struct SparseIndexBuilder { + index_file: File, + interval: usize, +} +#[doc = ""] +impl SparseIndexBuilder { + /// Creates a new sparse index builder that writes to the specified file. + /// + /// # Arguments + /// * `index_path` - Path where the index file will be created + /// * `interval` - Number of records between index entries (e.g., 1000 means index every 1000th record) + /// + /// # Returns + /// A new `SparseIndexBuilder` instance or an I/O error + #[doc = ""] + pub fn create(index_path: &str, interval: usize) -> std::io::Result { + let index_file = File::create(index_path)?; + Ok(Self { index_file, interval }) + } + + /// Adds an entry to the sparse index if the record count matches the interval. + /// + /// Only records where `record_count % interval == 0` are indexed to save space. + /// + /// # Arguments + /// * `record_count` - The current record number in the log + /// * `timestamp` - Timestamp of the record + /// * `offset` - Byte offset of the record in the log file + /// + /// # Returns + /// `true` if the entry was added to the index, `false` if skipped + #[doc = ""] + pub fn add_entry( + &mut self, + record_count: u64, + timestamp: u64, + offset: u64, + ) -> std::io::Result { + if record_count % self.interval as u64 == 0 { + self.index_file.write_all(×tamp.to_be_bytes())?; + self.index_file.write_all(&offset.to_be_bytes())?; + Ok(true) + } else { + Ok(false) + } + } + + /// Finalizes the index by flushing any buffered writes to disk. + /// + /// This should be called after all entries have been added. + #[doc = ""] + pub fn finalize(&mut self) -> std::io::Result<()> { + self.index_file.flush() + } +} + +/// Builds a sparse index for an existing log file. +/// +/// This function reads through the entire log file and creates an index +/// with entries only for records at the specified interval. +/// +/// # Arguments +/// * `log_path` - Path to the log file to index +/// * `index_path` - Path where the index file will be created +/// * `interval` - Number of records between index entries +/// +/// # Returns +/// Ok(()) on success, or an I/O error +pub fn build_sparse_index( + log_path: &str, + index_path: &str, + interval: &usize, +) -> std::io::Result<()> { + let mut log = File::open(log_path)?; + let mut builder = SparseIndexBuilder::create(index_path, *interval)?; + + let mut offset = 0u64; + let mut record_count = 0u64; + let mut record = [0u8; RECORD_SIZE]; + + while log.read_exact(&mut record).is_ok() { + let (timestamp, _, _, _, _) = decode_record(&record); + builder.add_entry(record_count, timestamp, offset)?; + offset += RECORD_SIZE as u64; + record_count += 1; + } + + builder.finalize()?; + Ok(()) +} + +/// Builds a sparse index and initializes an empty dictionary. +/// +/// This is a convenience function that creates both the index and +/// an empty dictionary file. The dictionary can be populated separately +/// when processing RDF data. +/// +/// # Arguments +/// * `log_path` - Path to the log file to index +/// * `index_path` - Path where the index file will be created +/// * `dictionary_path` - Path where the dictionary file will be created +/// * `interval` - Number of records between index entries +/// +/// # Returns +/// Ok(()) on success, or an I/O error +pub fn build_sparse_index_with_dictionary( + log_path: &str, + index_path: &str, + dictionary_path: &str, + interval: &usize, +) -> std::io::Result<()> { + let mut log = File::open(log_path)?; + let mut builder = SparseIndexBuilder::create(index_path, *interval)?; + let dictionary = Dictionary::new(); + + let mut offset = 0u64; + let mut record_count = 0u64; + let mut record = [0u8; RECORD_SIZE]; + + while log.read_exact(&mut record).is_ok() { + let (timestamp, _subject, _predicate, _object, _graph) = decode_record(&record); + + builder.add_entry(record_count, timestamp, offset)?; + + offset += RECORD_SIZE as u64; + record_count += 1; + } + + builder.finalize()?; + dictionary.save_to_file(Path::new(dictionary_path))?; + + Ok(()) +} + +/// Reader for sparse indexes that enables efficient timestamp-based queries. +/// +/// The sparse reader loads the entire index into memory for fast binary search, +/// then performs sequential scans of the log file starting from the appropriate position. +pub struct SparseReader { + index: Vec<(u64, u64)>, + #[allow(dead_code)] + interval: usize, +} + +impl SparseReader { + /// Opens a sparse index and its associated dictionary. + /// + /// # Arguments + /// * `index_path` - Path to the sparse index file + /// * `dictionary_path` - Path to the dictionary file + /// * `interval` - The interval used when building the index + /// + /// # Returns + /// A tuple of (SparseReader, Dictionary) or an I/O error + pub fn open_with_dictionary( + index_path: &str, + dictionary_path: &str, + interval: usize, + ) -> std::io::Result<(Self, Dictionary)> { + let reader = Self::open(index_path, interval)?; + let dictionary = Dictionary::load_from_file(Path::new(dictionary_path))?; + Ok((reader, dictionary)) + } + /// Queries the log and returns results with URIs resolved from the dictionary. + /// + /// This method performs the same query as `query()` but resolves all numeric IDs + /// back to their original URI strings using the provided dictionary. + /// + /// # Arguments + /// * `log_path` - Path to the log file + /// * `dict` - Dictionary for resolving IDs to URIs + /// * `timestamp_start_bound` - Minimum timestamp (inclusive) + /// * `timestamp_end_bound` - Maximum timestamp (inclusive) + /// + /// # Returns + /// Vector of resolved events or an I/O error + pub fn query_resolved( + &self, + log_path: &str, + dict: &Dictionary, + timestamp_start_bound: u64, + timestamp_end_bound: u64, + ) -> std::io::Result> { + let events = self.query(log_path, timestamp_start_bound, timestamp_end_bound)?; + Ok(events.into_iter().map(|e| e.decode(dict)).collect()) + } + + /// Opens a sparse index file and loads it into memory. + /// + /// # Arguments + /// * `index_path` - Path to the sparse index file + /// * `interval` - The interval used when building the index + /// + /// # Returns + /// A new SparseReader instance or an I/O error + pub fn open(index_path: &str, interval: usize) -> std::io::Result { + let mut index_file = File::open(index_path)?; + let mut index = Vec::new(); + let mut entry = [0u8; 16]; + + while index_file.read_exact(&mut entry).is_ok() { + let timestamp = u64::from_be_bytes(entry[0..8].try_into().unwrap()); + let offset = u64::from_be_bytes(entry[8..16].try_into().unwrap()); + + index.push((timestamp, offset)); + } + Ok(Self { index, interval }) + } + + /// Queries the log file for events within the specified timestamp range. + /// + /// Uses binary search on the index to find the starting position, then + /// performs a sequential scan of the log file to collect matching events. + /// + /// # Arguments + /// * `log_path` - Path to the log file + /// * `timestamp_start_bound` - Minimum timestamp (inclusive) + /// * `timestamp_end_bound` - Maximum timestamp (inclusive) + /// + /// # Returns + /// Vector of events with numeric IDs or an I/O error + pub fn query( + &self, + log_path: &str, + timestamp_start_bound: u64, + timestamp_end_bound: u64, + ) -> std::io::Result> { + if timestamp_start_bound > timestamp_end_bound { + return Ok(Vec::new()); + } + + if self.index.is_empty() { + return Ok(Vec::new()); + } + + let position = self + .index + .binary_search_by_key(×tamp_start_bound, |x| x.0) + .unwrap_or_else(|i| i.saturating_sub(1)); + + let mut log = File::open(log_path)?; + log.seek(SeekFrom::Start(self.index[position].1))?; + + let mut results = Vec::new(); + let mut record = [0u8; RECORD_SIZE]; + + while log.read_exact(&mut record).is_ok() { + let (timestamp, subject, predicate, object, graph) = decode_record(&record); + + if timestamp > timestamp_end_bound { + break; + } + + if timestamp >= timestamp_start_bound { + results.push(Event { timestamp, subject, predicate, object, graph }); + } + } + + Ok(results) + } + + /// Returns the size of the index in bytes. + /// + /// Each index entry is 16 bytes (8 bytes timestamp + 8 bytes offset), + /// so this returns `index.len() * 16`. + pub fn index_size_bytes(&self) -> usize { + self.index.len() * 16 + } +} diff --git a/src/main.rs b/src/main.rs index 934bbbf..bd30920 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,8 +3,8 @@ //! This is the main entry point for the Janus command-line interface. use janus::core::Event; -use janus::storage::indexing::{dense, sparse}; use janus::indexing::shared::LogWriter; +use janus::storage::indexing::{dense, sparse}; use janus::storage::segmented_storage::StreamingSegmentedStorage; use janus::storage::util::StreamingConfig; use std::fs; @@ -48,7 +48,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { let subject = format!("http://example.org/person/person_{}", i % 10000); let predicate = match i % 10 { 0..=3 => "http://example.org/knows", - 4..=6 => "http://example.org/worksAt", + 4..=6 => "http://example.org/worksAt", 7..=8 => "http://example.org/livesIn", _ => "http://example.org/hasAge", }; @@ -82,7 +82,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { println!("===================="); let read_sizes = vec![100, 1_000, 10_000, 100_000, 1_000_000]; - + for &size in &read_sizes { // Query the first 'size' events let query_start_ts = base_timestamp; @@ -90,9 +90,9 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { println!("\n📖 Querying {} events...", size); let start_time = Instant::now(); - + let results = storage.query_rdf(query_start_ts, query_end_ts)?; - + let query_duration = start_time.elapsed(); let read_throughput = results.len() as f64 / query_duration.as_secs_f64(); @@ -103,8 +103,10 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { // Show a sample result for verification if !results.is_empty() { let sample = &results[0]; - println!(" Sample result: {} {} {} in {} at {}", - sample.subject, sample.predicate, sample.object, sample.graph, sample.timestamp); + println!( + " Sample result: {} {} {} in {} at {}", + sample.subject, sample.predicate, sample.object, sample.graph, sample.timestamp + ); } } @@ -351,11 +353,11 @@ fn benchmark_storage_performance() -> std::io::Result<()> { fn main() -> std::io::Result<()> { // Run the new RDF benchmark benchmark_segmented_storage_rdf()?; - + println!("\n{}", "=".repeat(50)); println!("Running legacy benchmark for comparison..."); println!("{}", "=".repeat(50)); - + // Also run the old benchmark for comparison benchmark_storage_performance() } diff --git a/src/storage/indexing/dense.rs b/src/storage/indexing/dense.rs index 8f7eb35..d2a904e 100644 --- a/src/storage/indexing/dense.rs +++ b/src/storage/indexing/dense.rs @@ -1,4 +1,7 @@ -use crate::core::{encoding::{decode_record, RECORD_SIZE}, Event}; +use crate::core::{ + encoding::{decode_record, RECORD_SIZE}, + Event, +}; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; #[doc = ""] diff --git a/src/storage/indexing/sparse.rs b/src/storage/indexing/sparse.rs index 79cc08d..3169e9f 100644 --- a/src/storage/indexing/sparse.rs +++ b/src/storage/indexing/sparse.rs @@ -1,5 +1,8 @@ +use crate::core::{ + encoding::{decode_record, RECORD_SIZE}, + Event, RDFEvent, +}; use crate::storage::indexing::dictionary::Dictionary; -use crate::core::{encoding::{decode_record, RECORD_SIZE}, Event, RDFEvent}; use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; use std::path::Path; diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 3b55564..f797e0d 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1,5 +1,5 @@ -pub mod util; pub mod segmented_storage; +pub mod util; pub mod indexing { pub mod dense; pub mod dictionary; diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs index f3d956d..153b0d2 100644 --- a/src/storage/segmented_storage.rs +++ b/src/storage/segmented_storage.rs @@ -11,11 +11,12 @@ use std::{ }; use crate::{ - core::{encoding::{decode_record, encode_record, RECORD_SIZE}, Event, RDFEvent}, + core::{ + encoding::{decode_record, encode_record, RECORD_SIZE}, + Event, RDFEvent, + }, storage::{ - indexing::{ - dictionary::Dictionary, - }, + indexing::dictionary::Dictionary, util::{EnhancedSegmentMetadata, IndexBlock, StreamingConfig, WAL}, }, }; diff --git a/src/storage/util.rs b/src/storage/util.rs index aadf9b9..1fe653e 100644 --- a/src/storage/util.rs +++ b/src/storage/util.rs @@ -1,5 +1,5 @@ -use std::collections::{VecDeque, HashMap}; -use std::sync::{Arc, RwLock, Mutex}; +use std::collections::{HashMap, VecDeque}; +use std::sync::{Arc, Mutex, RwLock}; use std::thread::JoinHandle; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; @@ -53,5 +53,3 @@ impl Default for StreamingConfig { } } } - - From 0d31416eab72a90bc2c20a93a2b6026ee3b65af9 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Fri, 7 Nov 2025 19:10:08 +0100 Subject: [PATCH 10/19] Refactor storage system to replace WAL with BatchBuffer for improved event handling; update StreamingConfig for batch processing parameters --- src/core/encoding.rs | 14 ++ src/main.rs | 12 +- src/storage/segmented_storage.rs | 222 +++++++++++++++++++++++-------- src/storage/util.rs | 18 ++- 4 files changed, 199 insertions(+), 67 deletions(-) diff --git a/src/core/encoding.rs b/src/core/encoding.rs index 5c0e904..6c3fe9c 100644 --- a/src/core/encoding.rs +++ b/src/core/encoding.rs @@ -56,4 +56,18 @@ impl Event { graph: dict.decode(self.graph).unwrap_or("UNKNOWN").to_string(), } } + + /// Encode this Event to bytes + pub fn to_bytes(&self) -> [u8; RECORD_SIZE] { + let mut buffer = [0u8; RECORD_SIZE]; + encode_record( + &mut buffer, + self.timestamp, + self.subject, + self.predicate, + self.object, + self.graph, + ); + buffer + } } diff --git a/src/main.rs b/src/main.rs index bd30920..25258d6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,9 +27,9 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { // Configure storage let config = StreamingConfig { - max_wal_events: 10_000, - max_wal_age_seconds: 60, - max_wal_bytes: 1_000_000, + max_batch_events: 500_000, + max_batch_age_seconds: 1, + max_batch_bytes: 50_000_000, sparse_interval: 1000, entries_per_index_block: 100, segment_base_path: SEGMENT_BASE_PATH.to_string(), @@ -247,9 +247,9 @@ fn benchmark_storage_performance() -> std::io::Result<()> { // Configure storage let config = StreamingConfig { - max_wal_events: 5000, - max_wal_age_seconds: 30, - max_wal_bytes: 5 * 1024 * 1024, + max_batch_events: 250_000, + max_batch_age_seconds: 1, + max_batch_bytes: 100 * 1024 * 1024, sparse_interval: 100, entries_per_index_block: 512, segment_base_path: format!("./benchmark_data_{}", num_records), diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs index 153b0d2..1cd0155 100644 --- a/src/storage/segmented_storage.rs +++ b/src/storage/segmented_storage.rs @@ -17,12 +17,12 @@ use crate::{ }, storage::{ indexing::dictionary::Dictionary, - util::{EnhancedSegmentMetadata, IndexBlock, StreamingConfig, WAL}, + util::{BatchBuffer, EnhancedSegmentMetadata, IndexBlock, StreamingConfig}, }, }; pub struct StreamingSegmentedStorage { - wal: Arc>, + batch_buffer: Arc>, segments: Arc>>, dictionary: Arc>, flush_handle: Option>, @@ -35,7 +35,7 @@ impl StreamingSegmentedStorage { std::fs::create_dir_all(&config.segment_base_path)?; let storage = Self { - wal: Arc::new(RwLock::new(WAL { + batch_buffer: Arc::new(RwLock::new(BatchBuffer { events: VecDeque::new(), total_bytes: 0, oldest_timestamp_bound: None, @@ -53,13 +53,18 @@ impl StreamingSegmentedStorage { } pub fn start_background_flushing(&mut self) { - let wal_clone = Arc::clone(&self.wal); + let batch_buffer_clone = Arc::clone(&self.batch_buffer); let segments_clone = Arc::clone(&self.segments); let shutdown_clone = Arc::clone(&self.shutdown_signal); let config_clone = self.config.clone(); let handle = std::thread::spawn(move || { - Self::background_flush_loop(wal_clone, segments_clone, shutdown_clone, config_clone); + Self::background_flush_loop( + batch_buffer_clone, + segments_clone, + shutdown_clone, + config_clone, + ); }); self.flush_handle = Some(handle); @@ -69,22 +74,21 @@ impl StreamingSegmentedStorage { let event_size = std::mem::size_of::(); { - let mut wal = self.wal.write().unwrap(); + let mut batch_buffer = self.batch_buffer.write().unwrap(); - if wal.oldest_timestamp_bound.is_none() { - wal.oldest_timestamp_bound = Some(event.timestamp); + if batch_buffer.oldest_timestamp_bound.is_none() { + batch_buffer.oldest_timestamp_bound = Some(event.timestamp); } - wal.newest_timestamp_bound = Some(event.timestamp); + batch_buffer.newest_timestamp_bound = Some(event.timestamp); - wal.total_bytes += event_size; + batch_buffer.total_bytes += event_size; - wal.events.push_back(event); + batch_buffer.events.push_back(event); } - if self.should_flush() { - self.flush_wal_to_segment()?; - } + // Note: Synchronous flushing removed for high throughput. + // Background thread handles all flushing based on time limits. Ok(()) } @@ -107,16 +111,16 @@ impl StreamingSegmentedStorage { } fn should_flush(&self) -> bool { - let wal = self.wal.read().unwrap(); + let batch_buffer = self.batch_buffer.read().unwrap(); - wal.events.len() >= self.config.max_wal_events.try_into().unwrap() - || wal.total_bytes > self.config.max_wal_bytes - || wal.oldest_timestamp_bound.map_or(false, |oldest| { + batch_buffer.events.len() >= self.config.max_batch_events.try_into().unwrap() + || batch_buffer.total_bytes > self.config.max_batch_bytes + || batch_buffer.oldest_timestamp_bound.map_or(false, |oldest| { let current_timestamp = Self::current_timestamp(); // Use saturating subtraction to avoid underflow if oldest > current_timestamp current_timestamp.saturating_sub(oldest) - >= self.config.max_wal_age_seconds * 1_000_000_000 + >= self.config.max_batch_age_seconds * 1_000_000_000 }) } @@ -124,20 +128,20 @@ impl StreamingSegmentedStorage { SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 } - fn flush_wal_to_segment(&self) -> std::io::Result<()> { - // Automatically extract events from the WAL. + fn flush_batch_buffer_to_segment(&self) -> std::io::Result<()> { + // Automatically extract events from the batch buffer. let events_to_flush = { - let mut wal = self.wal.write().unwrap(); - if wal.events.is_empty() { + let mut batch_buffer = self.batch_buffer.write().unwrap(); + if batch_buffer.events.is_empty() { return Ok(()); } - let events: Vec = wal.events.drain(..).collect(); + let events: Vec = batch_buffer.events.drain(..).collect(); - wal.total_bytes = 0; - wal.oldest_timestamp_bound = None; - wal.newest_timestamp_bound = None; + batch_buffer.total_bytes = 0; + batch_buffer.oldest_timestamp_bound = None; + batch_buffer.newest_timestamp_bound = None; events }; @@ -174,7 +178,7 @@ impl StreamingSegmentedStorage { for (record_count, event) in events.iter().enumerate() { let record_bytes = self.serialize_event_to_fixed_size(event); - data_file.write_all(&record_bytes); + data_file.write_all(&record_bytes)?; if record_count % self.config.sparse_interval == 0 { let sparse_entry = (event.timestamp, data_offset); @@ -234,30 +238,18 @@ impl StreamingSegmentedStorage { min_ts: u64, max_ts: u64, ) -> std::io::Result { - let file_offset = index_file.stream_position()?; - - for (timestamp, offset) in entries { - index_file.write_all(×tamp.to_ne_bytes())?; - index_file.write_all(&offset.to_be_bytes())?; - } - - Ok(IndexBlock { - min_timestamp: min_ts, - max_timestamp: max_ts, - file_offset, - entry_count: entries.len() as u32, - }) + Self::flush_index_block_static(index_file, entries, min_ts, max_ts) } pub fn query(&self, start_timestamp: u64, end_timestamp: u64) -> std::io::Result> { let mut results = Vec::new(); - // First try to query the immediate WAL which has the fastest visibility. + // First try to query the immediate batch buffer which has the fastest visibility. { - let wal = self.wal.read().unwrap(); + let batch_buffer = self.batch_buffer.read().unwrap(); - for event in &wal.events { + for event in &batch_buffer.events { if event.timestamp >= start_timestamp && event.timestamp <= end_timestamp { results.push(event.clone()); } @@ -405,30 +397,33 @@ impl StreamingSegmentedStorage { } fn background_flush_loop( - wal: Arc>, + batch_buffer: Arc>, segments: Arc>>, shutdown_signal: Arc>, config: StreamingConfig, ) { while !*shutdown_signal.lock().unwrap() { - std::thread::sleep(Duration::from_secs(1)); + std::thread::sleep(Duration::from_millis(100)); // Check if flush is needed or not. let should_flush = { - let wal = wal.read().unwrap(); + let batch_buffer = batch_buffer.read().unwrap(); - wal.events.len() >= config.max_wal_bytes - || wal.total_bytes >= config.max_wal_bytes - || wal.oldest_timestamp_bound.map_or(false, |oldest| { + batch_buffer.events.len() >= config.max_batch_events.try_into().unwrap() + || batch_buffer.total_bytes >= config.max_batch_bytes + || batch_buffer.oldest_timestamp_bound.map_or(false, |oldest| { let current_timestamp = Self::current_timestamp(); - (current_timestamp - oldest) >= config.max_wal_age_seconds * 1_000_000_000 + current_timestamp.saturating_sub(oldest) + >= config.max_batch_age_seconds * 1_000_000_000 }) }; if should_flush { // TODO : Add better error handling here in this case - if let Err(e) = Self::flush_background(wal.clone(), segments.clone(), &config) { + if let Err(e) = + Self::flush_background(batch_buffer.clone(), segments.clone(), &config) + { eprintln!("Background flush failed: {}", e); } } @@ -436,10 +431,104 @@ impl StreamingSegmentedStorage { } fn flush_background( - wal: Arc>, + batch_buffer: Arc>, segments: Arc>>, config: &StreamingConfig, ) -> std::io::Result<()> { + // Automatically extract events from the batch buffer. + + let events_to_flush = { + let mut batch_buffer = batch_buffer.write().unwrap(); + if batch_buffer.events.is_empty() { + return Ok(()); + } + + let events: Vec = batch_buffer.events.drain(..).collect(); + + batch_buffer.total_bytes = 0; + batch_buffer.oldest_timestamp_bound = None; + batch_buffer.newest_timestamp_bound = None; + events + }; + + // Create a new segment for these events + let segment_id = Self::current_timestamp(); + let data_path = format!("{}/segment-{}.log", config.segment_base_path, segment_id); + let index_path = format!("{}/segment-{}.idx", config.segment_base_path, segment_id); + + // Use buffered writers for performance (same as original implementation) + let mut data_file = BufWriter::new(std::fs::File::create(&data_path)?); + let mut index_file = BufWriter::new(std::fs::File::create(&index_path)?); + + let mut index_directory = Vec::new(); + let mut current_block_entries = Vec::new(); + let mut current_block_min_ts = None; + let mut current_block_max_ts = 0u64; + let mut data_offset = 0u64; + + for (record_count, event) in events_to_flush.iter().enumerate() { + // Use the same serialization as the original + let record_bytes = Self::serialize_event_to_fixed_size_static(event); + data_file.write_all(&record_bytes)?; + + if record_count % config.sparse_interval == 0 { + let sparse_entry = (event.timestamp, data_offset); + + if current_block_min_ts.is_none() { + current_block_min_ts = Some(event.timestamp); + } + + current_block_max_ts = event.timestamp; + current_block_entries.push(sparse_entry); + + if current_block_entries.len() >= config.entries_per_index_block { + let block_metadata = Self::flush_index_block_static( + &mut index_file, + ¤t_block_entries, + current_block_min_ts.unwrap(), + current_block_max_ts, + )?; + + index_directory.push(block_metadata); + + current_block_entries.clear(); + current_block_min_ts = None; + } + } + data_offset += record_bytes.len() as u64; + } + + if !current_block_entries.is_empty() { + let block_metadata = Self::flush_index_block_static( + &mut index_file, + ¤t_block_entries, + current_block_min_ts.unwrap(), + current_block_max_ts, + )?; + + index_directory.push(block_metadata); + } + + data_file.flush()?; + index_file.flush()?; + + // Add the new segment to the segments list + let new_segment = EnhancedSegmentMetadata { + start_timstamp: events_to_flush.first().unwrap().timestamp, + end_timestamp: events_to_flush.last().unwrap().timestamp, + data_path, + index_path, + record_count: events_to_flush.len() as u64, + index_directory, + }; + + { + let mut segments = segments.write().unwrap(); + segments.push(new_segment); + // Keep segments sorted by start timestamp + segments.sort_by_key(|s| s.start_timstamp); + } + Ok(()) } @@ -504,7 +593,7 @@ impl StreamingSegmentedStorage { // Final Flush - self.flush_wal_to_segment(); + self.flush_batch_buffer_to_segment()?; if let Some(handle) = self.flush_handle.take() { handle.join().unwrap(); @@ -513,6 +602,10 @@ impl StreamingSegmentedStorage { } fn serialize_event_to_fixed_size(&self, event: &Event) -> Vec { + Self::serialize_event_to_fixed_size_static(event) + } + + fn serialize_event_to_fixed_size_static(event: &Event) -> Vec { let mut record = [0u8; RECORD_SIZE]; encode_record( &mut record, @@ -525,6 +618,27 @@ impl StreamingSegmentedStorage { record.to_vec() } + fn flush_index_block_static( + index_file: &mut BufWriter, + entries: &[(u64, u64)], + min_ts: u64, + max_ts: u64, + ) -> std::io::Result { + let file_offset = index_file.stream_position()?; + + for (timestamp, offset) in entries { + index_file.write_all(×tamp.to_le_bytes())?; + index_file.write_all(&offset.to_be_bytes())?; + } + + Ok(IndexBlock { + min_timestamp: min_ts, + max_timestamp: max_ts, + file_offset, + entry_count: entries.len() as u32, + }) + } + fn generate_segment_id() -> u64 { SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 } diff --git a/src/storage/util.rs b/src/storage/util.rs index 1fe653e..8069e85 100644 --- a/src/storage/util.rs +++ b/src/storage/util.rs @@ -6,7 +6,8 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use crate::core::Event; #[derive(Debug)] -pub struct WAL { +/// In-memory buffer that batches events before persisting them to disk +pub struct BatchBuffer { pub events: VecDeque, pub total_bytes: usize, pub oldest_timestamp_bound: Option, @@ -33,9 +34,12 @@ pub struct EnhancedSegmentMetadata { #[derive(Clone)] pub struct StreamingConfig { - pub max_wal_events: u64, - pub max_wal_age_seconds: u64, - pub max_wal_bytes: usize, + /// Maximum number of events to buffer before flushing to disk + pub max_batch_events: u64, + /// Maximum age in seconds before flushing buffered events to disk + pub max_batch_age_seconds: u64, + /// Maximum bytes to buffer before flushing to disk + pub max_batch_bytes: usize, pub sparse_interval: usize, pub entries_per_index_block: usize, pub segment_base_path: String, @@ -44,9 +48,9 @@ pub struct StreamingConfig { impl Default for StreamingConfig { fn default() -> Self { Self { - max_wal_bytes: 10 * 1024 * 1024, - max_wal_age_seconds: 60, - max_wal_events: 100_000, + max_batch_bytes: 10 * 1024 * 1024, + max_batch_age_seconds: 60, + max_batch_events: 100_000, sparse_interval: 1000, entries_per_index_block: 1024, segment_base_path: "./data".to_string(), From 43d31f67e501060b8c8d70499c0feb7eeb0437de Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Fri, 7 Nov 2025 22:25:12 +0100 Subject: [PATCH 11/19] Refactor Dictionary to use u32 for IDs and update related methods - Changed ID types in Dictionary from u64 to u32 for memory efficiency. - Updated encode and decode methods to reflect the new ID type. - Adjusted tests to use the new encoding/decoding methods. - Modified memory_tracker to track memory usage with detailed statistics. - Added MemoryTracker struct for monitoring memory usage during runtime. - Implemented methods for recording, retrieving, and resetting memory measurements. - Enhanced segmented storage to utilize Rc for dictionary management. - Updated utility functions to include a new StorageComponentSizes struct for memory breakdown. --- benches/analysis.rs | 4 +- benches/benchmark.rs | 8 +- benches/write_benchmark.rs | 36 ++--- src/benchmarking/benchmark.rs | 8 +- src/benchmarks/benchmark.rs | 10 +- src/core/encoding.rs | 29 ++-- src/core/mod.rs | 12 +- src/indexing/shared.rs | 8 +- src/main.rs | 152 +++++++++++------- src/storage/indexing/dictionary.rs | 14 +- src/storage/memory_tracker.rs | 241 +++++++++++++++++++++++++++++ src/storage/mod.rs | 1 + src/storage/segmented_storage.rs | 21 ++- src/storage/util.rs | 9 ++ tests/dictionary_encoding_test.rs | 189 +++++++++++----------- 15 files changed, 520 insertions(+), 222 deletions(-) create mode 100644 src/storage/memory_tracker.rs diff --git a/benches/analysis.rs b/benches/analysis.rs index 60f020c..cbe6858 100644 --- a/benches/analysis.rs +++ b/benches/analysis.rs @@ -151,7 +151,7 @@ fn analyze_write_throughput() -> std::io::Result<()> { for i in 0..batch_size { log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; - index_builder.add_entry(i, i * 40)?; + index_builder.add_entry(i, i * 24)?; } log_writer.flush()?; index_builder.finalize()?; @@ -170,7 +170,7 @@ fn analyze_write_throughput() -> std::io::Result<()> { for i in 0..batch_size { log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; - index_builder.add_entry(i, i, i * 40)?; + index_builder.add_entry(i, i, i * 24)?; } log_writer.flush()?; index_builder.finalize()?; diff --git a/benches/benchmark.rs b/benches/benchmark.rs index e6ca4db..29aa79d 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -16,10 +16,10 @@ fn setup_data(number_records: u64) -> std::io::Result<()> { for i in 0..number_records { let timestamp = i; - let subject = (i % 1000) as u64; - let predicate = (i % 500) as u64; - let object = (i % 2000) as u64; - let graph: u64 = 1; + let subject = (i % 1000) as u32; + let predicate = (i % 500) as u32; + let object = (i % 2000) as u32; + let graph: u32 = 1; writer.append_record(timestamp, subject, predicate, object, graph)?; } diff --git a/benches/write_benchmark.rs b/benches/write_benchmark.rs index dfd12ab..38e8e13 100644 --- a/benches/write_benchmark.rs +++ b/benches/write_benchmark.rs @@ -28,10 +28,10 @@ fn benchmark_dense_writing(number_records: u64) -> std::io::Result<(f64, f64)> { for i in 0..number_records { let timestamp = i; - let subject = (i % 1000) as u64; - let predicate = (i % 500) as u64; - let object = (i % 2000) as u64; - let graph: u64 = 1; + let subject = (i % 1000) as u32; + let predicate = (i % 500) as u32; + let object = (i % 2000) as u32; + let graph: u32 = 1; // Write record to log log_writer.append_record(timestamp, subject, predicate, object, graph)?; @@ -39,7 +39,7 @@ fn benchmark_dense_writing(number_records: u64) -> std::io::Result<(f64, f64)> { // Add entry to index index_builder.add_entry(timestamp, current_offset)?; - current_offset += 40; // RECORD_SIZE + current_offset += 24; // RECORD_SIZE } let write_time = start.elapsed(); @@ -66,10 +66,10 @@ fn benchmark_sparse_writing(number_records: u64) -> std::io::Result<(f64, f64)> for i in 0..number_records { let timestamp = i; - let subject = (i % 1000) as u64; - let predicate = (i % 500) as u64; - let object = (i % 2000) as u64; - let graph: u64 = 1; + let subject = (i % 1000) as u32; + let predicate = (i % 500) as u32; + let object = (i % 2000) as u32; + let graph: u32 = 1; // Write record to log log_writer.append_record(timestamp, subject, predicate, object, graph)?; @@ -77,7 +77,7 @@ fn benchmark_sparse_writing(number_records: u64) -> std::io::Result<(f64, f64)> // Add entry to index (will only add if i % interval == 0) index_builder.add_entry(i, timestamp, current_offset)?; - current_offset += 40; // RECORD_SIZE + current_offset += 24; // RECORD_SIZE } let write_time = start.elapsed(); @@ -110,10 +110,10 @@ fn benchmark_batch_vs_realtime(number_records: u64) -> std::io::Result<()> { let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?; for i in 0..number_records { let timestamp = i; - let subject = (i % 1000) as u64; - let predicate = (i % 500) as u64; - let object = (i % 2000) as u64; - let graph: u64 = 1; + let subject = (i % 1000) as u32; + let predicate = (i % 500) as u32; + let object = (i % 2000) as u32; + let graph: u32 = 1; log_writer.append_record(timestamp, subject, predicate, object, graph)?; } log_writer.flush()?; @@ -132,10 +132,10 @@ fn benchmark_batch_vs_realtime(number_records: u64) -> std::io::Result<()> { let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?; for i in 0..number_records { let timestamp = i; - let subject = (i % 1000) as u64; - let predicate = (i % 500) as u64; - let object = (i % 2000) as u64; - let graph: u64 = 1; + let subject = (i % 1000) as u32; + let predicate = (i % 500) as u32; + let object = (i % 2000) as u32; + let graph: u32 = 1; log_writer.append_record(timestamp, subject, predicate, object, graph)?; } log_writer.flush()?; diff --git a/src/benchmarking/benchmark.rs b/src/benchmarking/benchmark.rs index a5be448..607e922 100644 --- a/src/benchmarking/benchmark.rs +++ b/src/benchmarking/benchmark.rs @@ -17,10 +17,10 @@ fn setup_data(number_records: u64) -> std::io::Result<()> { for i in 0..number_records { let timestamp = i; - let subject = (i % 1000) as u64; - let predicate = (i % 500) as u64; - let object = (i % 2000) as u64; - let graph: u64 = 1; + let subject = (i % 1000) as u32; + let predicate = (i % 500) as u32; + let object = (i % 2000) as u32; + let graph: u32 = 1; writer.append_record(timestamp, subject, predicate, object, graph)?; } diff --git a/src/benchmarks/benchmark.rs b/src/benchmarks/benchmark.rs index d64ec37..607e922 100644 --- a/src/benchmarks/benchmark.rs +++ b/src/benchmarks/benchmark.rs @@ -1,5 +1,5 @@ -use crate::storage::indexing::{dense, sparse}; use crate::indexing::shared::LogWriter; +use crate::storage::indexing::{dense, sparse}; use std::fs; use std::time::Instant; @@ -17,10 +17,10 @@ fn setup_data(number_records: u64) -> std::io::Result<()> { for i in 0..number_records { let timestamp = i; - let subject = (i % 1000) as u64; - let predicate = (i % 500) as u64; - let object = (i % 2000) as u64; - let graph: u64 = 1; + let subject = (i % 1000) as u32; + let predicate = (i % 500) as u32; + let object = (i % 2000) as u32; + let graph: u32 = 1; writer.append_record(timestamp, subject, predicate, object, graph)?; } diff --git a/src/core/encoding.rs b/src/core/encoding.rs index 6c3fe9c..8c76b7f 100644 --- a/src/core/encoding.rs +++ b/src/core/encoding.rs @@ -4,31 +4,32 @@ use crate::core::{Event, RDFEvent}; use crate::storage::indexing::dictionary::Dictionary; /// Size of a single encoded record in bytes -pub const RECORD_SIZE: usize = 40; +/// Reduced from 40 to 24 bytes (40% space savings) +pub const RECORD_SIZE: usize = 24; /// Encode an RDF event record into a byte buffer pub fn encode_record( buffer: &mut [u8; RECORD_SIZE], timestamp: u64, - subject: u64, - predicate: u64, - object: u64, - graph: u64, + subject: u32, + predicate: u32, + object: u32, + graph: u32, ) { buffer[0..8].copy_from_slice(×tamp.to_le_bytes()); - buffer[8..16].copy_from_slice(&subject.to_le_bytes()); - buffer[16..24].copy_from_slice(&predicate.to_le_bytes()); - buffer[24..32].copy_from_slice(&object.to_le_bytes()); - buffer[32..40].copy_from_slice(&graph.to_le_bytes()); + buffer[8..12].copy_from_slice(&subject.to_le_bytes()); + buffer[12..16].copy_from_slice(&predicate.to_le_bytes()); + buffer[16..20].copy_from_slice(&object.to_le_bytes()); + buffer[20..24].copy_from_slice(&graph.to_le_bytes()); } /// Decode a byte buffer into an RDF event record -pub fn decode_record(buffer: &[u8; RECORD_SIZE]) -> (u64, u64, u64, u64, u64) { +pub fn decode_record(buffer: &[u8; RECORD_SIZE]) -> (u64, u32, u32, u32, u32) { let timestamp = u64::from_le_bytes(buffer[0..8].try_into().unwrap()); - let subject = u64::from_le_bytes(buffer[8..16].try_into().unwrap()); - let predicate = u64::from_le_bytes(buffer[16..24].try_into().unwrap()); - let object = u64::from_le_bytes(buffer[24..32].try_into().unwrap()); - let graph = u64::from_le_bytes(buffer[32..40].try_into().unwrap()); + let subject = u32::from_le_bytes(buffer[8..12].try_into().unwrap()); + let predicate = u32::from_le_bytes(buffer[12..16].try_into().unwrap()); + let object = u32::from_le_bytes(buffer[16..20].try_into().unwrap()); + let graph = u32::from_le_bytes(buffer[20..24].try_into().unwrap()); (timestamp, subject, predicate, object, graph) } diff --git a/src/core/mod.rs b/src/core/mod.rs index 5fbfc41..ea27ceb 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -1,13 +1,15 @@ //! Core data structures and types for Janus RDF Stream Processing Engine /// Internal storage event with encoded IDs +/// Uses u32 for dictionary IDs (4B max) and u64 for timestamp (milliseconds) +/// Total: 24 bytes vs 40 bytes (40% space savings) #[derive(Clone, Debug)] pub struct Event { - pub timestamp: u64, - pub subject: u64, - pub predicate: u64, - pub object: u64, - pub graph: u64, + pub timestamp: u64, // 8 bytes - milliseconds since epoch + pub subject: u32, // 4 bytes - dictionary-encoded (4B max unique strings) + pub predicate: u32, // 4 bytes - dictionary-encoded (usually <1000 unique) + pub object: u32, // 4 bytes - dictionary-encoded (4B max unique strings) + pub graph: u32, // 4 bytes - dictionary-encoded (usually <100 unique) } /// User-facing RDF event with URI strings diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs index f0891dd..c47ec8e 100644 --- a/src/indexing/shared.rs +++ b/src/indexing/shared.rs @@ -21,10 +21,10 @@ impl LogWriter { pub fn append_record( &mut self, timestamp: u64, - subject: u64, - predicate: u64, - object: u64, - graph: u64, + subject: u32, + predicate: u32, + object: u32, + graph: u32, ) -> std::io::Result<()> { let mut buffer = [0u8; RECORD_SIZE]; encode_record(&mut buffer, timestamp, subject, predicate, object, graph); diff --git a/src/main.rs b/src/main.rs index 25258d6..954cc02 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,6 +5,7 @@ use janus::core::Event; use janus::indexing::shared::LogWriter; use janus::storage::indexing::{dense, sparse}; +use janus::storage::memory_tracker::MemoryTracker; use janus::storage::segmented_storage::StreamingSegmentedStorage; use janus::storage::util::StreamingConfig; use std::fs; @@ -18,8 +19,8 @@ const SPARSE_INTERVAL: usize = 1000; const SEGMENT_BASE_PATH: &str = "data/rdf_benchmark"; fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { - println!("🚀 RDF Segmented Storage Benchmark"); - println!("=================================="); + // println!("🚀 RDF Segmented Storage Benchmark"); + // println!("=================================="); // Clean up and create directories let _ = fs::remove_dir_all(SEGMENT_BASE_PATH); @@ -38,13 +39,16 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { let mut storage = StreamingSegmentedStorage::new(config)?; storage.start_background_flushing(); + // Record initial memory + // storage.record_memory("before_writing"); + // Benchmark writing 1 million RDF events - println!("\n📝 Writing 1,000,000 RDF events..."); + // println!("\n📝 Writing 1,000,000 RDF events..."); let start_time = Instant::now(); - let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64; + let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64; for i in 0..1_000_000u64 { - let timestamp = base_timestamp + i * 1_000_000; // 1ms intervals + let timestamp = base_timestamp + i * 1; // 1ms intervals let subject = format!("http://example.org/person/person_{}", i % 10000); let predicate = match i % 10 { 0..=3 => "http://example.org/knows", @@ -63,32 +67,34 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { storage.write_rdf(timestamp, &subject, predicate, &object, &graph)?; if i > 0 && i % 100_000 == 0 { - println!(" ✓ Written {} events", i); + // println!(" ✓ Written {} events", i); + // storage.record_memory(&format!("after_{}_events", i)); } } let write_duration = start_time.elapsed(); let write_throughput = 1_000_000.0 / write_duration.as_secs_f64(); - println!("✅ Write completed!"); - println!(" Duration: {:.3} seconds", write_duration.as_secs_f64()); - println!(" Throughput: {:.0} events/sec", write_throughput); + // println!("✅ Write completed!"); + // println!(" Duration: {:.3} seconds", write_duration.as_secs_f64()); + // println!(" Throughput: {:.0} events/sec", write_throughput); // Wait a bit for background flushing std::thread::sleep(Duration::from_secs(2)); + // storage.record_memory("after_background_flush"); // Benchmark reading different amounts of data - println!("\n🔍 Reading Benchmarks"); - println!("===================="); + // println!("\n🔍 Reading Benchmarks"); + // println!("===================="); let read_sizes = vec![100, 1_000, 10_000, 100_000, 1_000_000]; for &size in &read_sizes { // Query the first 'size' events let query_start_ts = base_timestamp; - let query_end_ts = base_timestamp + (size as u64 * 1_000_000); + let query_end_ts = base_timestamp + size as u64; - println!("\n📖 Querying {} events...", size); + // println!("\n📖 Querying {} events...", size); let start_time = Instant::now(); let results = storage.query_rdf(query_start_ts, query_end_ts)?; @@ -96,9 +102,9 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { let query_duration = start_time.elapsed(); let read_throughput = results.len() as f64 / query_duration.as_secs_f64(); - println!(" Results found: {}", results.len()); - println!(" Query time: {:.3} ms", query_duration.as_millis()); - println!(" Read throughput: {:.0} events/sec", read_throughput); + // println!(" Results found: {}", results.len()); + // println!(" Query time: {:.3} ms", query_duration.as_millis()); + // println!(" Read throughput: {:.0} events/sec", read_throughput); // Show a sample result for verification if !results.is_empty() { @@ -113,7 +119,45 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { // Shutdown storage storage.shutdown()?; - println!("\n🎉 Benchmark completed successfully!"); + // Print memory statistics + // println!("\n📊 Memory Usage Statistics"); + // println!("=========================="); + // let memory_stats = storage.get_memory_stats(); + // // println!("Peak memory: {}", MemoryTracker::format_bytes(memory_stats.peak_bytes)); + // // println!("Current memory: {}", MemoryTracker::format_bytes(memory_stats.current_bytes)); + // println!( + // "Average memory: {}", + // MemoryTracker::format_bytes(memory_stats.avg_bytes as usize) + // ); + // // println!("Total measurements: {}", memory_stats.total_measurements); + + // Print storage component breakdown + // let component_sizes = storage.get_storage_component_sizes(); + // println!("\n🧩 Storage Component Breakdown"); + // println!("============================="); + // println!( + // "Batch buffer: {}", + // MemoryTracker::format_bytes(component_sizes.batch_buffer_bytes) + // ); + // // println!("Dictionary: {}", MemoryTracker::format_bytes(component_sizes.dictionary_bytes)); + // // println!("Segments count: {}", component_sizes.segments_count); + // println!( + // "Estimated total: {}", + // MemoryTracker::format_bytes(component_sizes.estimated_total_bytes) + // ); + + // if memory_stats.measurements.len() > 1 { + // // println!("\nDetailed measurements:"); + // for measurement in &memory_stats.measurements { + // println!( + // " {}: {}", + // measurement.description, + // MemoryTracker::format_bytes(measurement.memory_bytes) + // ); + // } + // } + + // println!("\n🎉 Benchmark completed successfully!"); Ok(()) } @@ -125,32 +169,32 @@ fn setup_data(number_records: u64) -> std::io::Result<()> { for i in 0..number_records { let timestamp = i; - let subject = (i % 1000) as u64; - let predicate = (i % 500) as u64; - let object = (i % 2000) as u64; - let graph: u64 = 1; + let subject = (i % 1000) as u32; + let predicate = (i % 500) as u32; + let object = (i % 2000) as u32; + let graph: u32 = 1; writer.append_record(timestamp, subject, predicate, object, graph)?; } writer.flush()?; - println!("Generated log file with {} records", writer.record_count()); + // println!("Generated log file with {} records", writer.record_count()); Ok(()) } fn benchmark_indexing() -> std::io::Result<()> { - println!("Indexing Benchmark"); + // println!("Indexing Benchmark"); let start = Instant::now(); dense::build_dense_index(LOG_FILE, DENSE_INDEX_FILE)?; let dense_time = start.elapsed(); - println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0); + // println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0); let start = Instant::now(); sparse::build_sparse_index(LOG_FILE, SPARSE_INDEX_FILE, &SPARSE_INTERVAL)?; let sparse_time = start.elapsed(); - println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0); + // println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0); let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?; @@ -167,7 +211,7 @@ fn benchmark_indexing() -> std::io::Result<()> { } fn benchmark_queries() -> std::io::Result<()> { - println!("Query Benchmark"); + // println!("Query Benchmark"); let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?; @@ -180,7 +224,7 @@ fn benchmark_queries() -> std::io::Result<()> { ]; for (timestamp_start, timestamp_end, description) in query_ranges { - println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end); + // println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end); let start = Instant::now(); let dense_results = dense_reader.query(LOG_FILE, timestamp_start, timestamp_end)?; @@ -205,9 +249,9 @@ fn benchmark_queries() -> std::io::Result<()> { let speedup = sparse_time.as_secs_f64() / dense_time.as_secs_f64(); if speedup > 1.0 { - println!(" Sparse index is {:.2} times faster than Dense index", speedup); + // println!(" Sparse index is {:.2} times faster than Dense index", speedup); } else { - println!(" Dense index is {:.2} times faster than Sparse index", 1.0 / speedup); + // println!(" Dense index is {:.2} times faster than Sparse index", 1.0 / speedup); } assert_eq!( @@ -220,8 +264,8 @@ fn benchmark_queries() -> std::io::Result<()> { } // fn main() -> std::io::Result<()> { -// println!("RDF Indexing Benchmark : Dense vs Sparse"); -// println!("Setting up data..."); +// // println!("RDF Indexing Benchmark : Dense vs Sparse"); +// // println!("Setting up data..."); // let number_of_records = 1_000_000u64; // setup_data(number_of_records)?; @@ -237,13 +281,13 @@ fn benchmark_queries() -> std::io::Result<()> { // } fn benchmark_storage_performance() -> std::io::Result<()> { - println!("=== WAL-Based Segmented Storage Performance Benchmark ===\n"); + // println!("=== WAL-Based Segmented Storage Performance Benchmark ===\n"); let record_counts = vec![100, 1000, 10000, 100000, 1000000]; for &num_records in &record_counts { - println!("Testing with {} records", num_records); - println!("──────────────────────────────────────────────────"); + // println!("Testing with {} records", num_records); + // println!("──────────────────────────────────────────────────"); // Configure storage let config = StreamingConfig { @@ -263,22 +307,22 @@ fn benchmark_storage_performance() -> std::io::Result<()> { storage.start_background_flushing(); // Benchmark writes - println!("Writing {} records...", num_records); + // println!("Writing {} records...", num_records); let write_start = Instant::now(); let mut min_timestamp = u64::MAX; let mut max_timestamp = 0u64; for i in 0..num_records { let timestamp = - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 + i; + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + i; min_timestamp = min_timestamp.min(timestamp); max_timestamp = max_timestamp.max(timestamp); let event = Event { timestamp, - subject: (i % 10) as u64, + subject: (i % 10) as u32, predicate: 1, - object: (20 + (i % 10)) as u64, + object: (20 + (i % 10)) as u32, graph: 1, }; storage.write(event)?; @@ -287,15 +331,15 @@ fn benchmark_storage_performance() -> std::io::Result<()> { let write_duration = write_start.elapsed(); let write_throughput = num_records as f64 / write_duration.as_secs_f64(); - println!("Write Performance:"); - println!(" Duration: {:.3}s", write_duration.as_secs_f64()); - println!(" Throughput: {:.0} records/sec", write_throughput); - println!(" Timestamp range: {} to {}", min_timestamp, max_timestamp); + // println!("Write Performance:"); + // println!(" Duration: {:.3}s", write_duration.as_secs_f64()); + // println!(" Throughput: {:.0} records/sec", write_throughput); + // println!(" Timestamp range: {} to {}", min_timestamp, max_timestamp); // Benchmark queries immediately after writing (data is still in WAL) let query_ranges = vec![(0.1, "10% of data"), (0.5, "50% of data"), (1.0, "100% of data")]; - println!("\nQuery Performance:"); + // println!("\nQuery Performance:"); for (fraction, description) in query_ranges { let query_count = 100.min(num_records / 10); // Run 100 queries or 10% of records, whichever is smaller @@ -332,13 +376,13 @@ fn benchmark_storage_performance() -> std::io::Result<()> { let min_time = query_times.iter().cloned().fold(f64::INFINITY, f64::min); let max_time = query_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); - println!(" {} queries ({}):", description, query_count); - println!(" Avg query time: {:.3}ms", avg_query_time * 1000.0); - println!(" Query throughput: {:.1} queries/sec", queries_per_sec); - println!(" Read throughput: {:.0} records/sec", records_per_sec); - println!(" Avg records per query: {:.1}", avg_records_per_query); - println!(" Total records read: {}", total_records_read); - println!(" Min/Max time: {:.3}ms / {:.3}ms", min_time * 1000.0, max_time * 1000.0); + // println!(" {} queries ({}):", description, query_count); + // println!(" Avg query time: {:.3}ms", avg_query_time * 1000.0); + // println!(" Query throughput: {:.1} queries/sec", queries_per_sec); + // println!(" Read throughput: {:.0} records/sec", records_per_sec); + // println!(" Avg records per query: {:.1}", avg_records_per_query); + // println!(" Total records read: {}", total_records_read); + // println!(" Min/Max time: {:.3}ms / {:.3}ms", min_time * 1000.0, max_time * 1000.0); } // Force flush remaining WAL data and shutdown @@ -346,7 +390,7 @@ fn benchmark_storage_performance() -> std::io::Result<()> { println!(); } - println!("Benchmark completed!"); + // println!("Benchmark completed!"); Ok(()) } @@ -354,9 +398,9 @@ fn main() -> std::io::Result<()> { // Run the new RDF benchmark benchmark_segmented_storage_rdf()?; - println!("\n{}", "=".repeat(50)); - println!("Running legacy benchmark for comparison..."); - println!("{}", "=".repeat(50)); + // println!("\n{}", "=".repeat(50)); + // println!("Running legacy benchmark for comparison..."); + // println!("{}", "=".repeat(50)); // Also run the old benchmark for comparison benchmark_storage_performance() diff --git a/src/storage/indexing/dictionary.rs b/src/storage/indexing/dictionary.rs index 4729756..d1b474a 100644 --- a/src/storage/indexing/dictionary.rs +++ b/src/storage/indexing/dictionary.rs @@ -10,9 +10,9 @@ use crate::core::Event; #[derive(Debug, Serialize, Deserialize)] pub struct Dictionary { - pub string_to_id: HashMap, - pub id_to_uri: HashMap, - pub next_id: u64, + pub string_to_id: HashMap, + pub id_to_uri: HashMap, + pub next_id: u32, } impl Dictionary { @@ -20,7 +20,7 @@ impl Dictionary { Dictionary { string_to_id: HashMap::new(), id_to_uri: HashMap::new(), next_id: 0 } } - pub fn encode(&mut self, value: &str) -> u64 { + pub fn encode(&mut self, value: &str) -> u32 { if let Some(&id) = self.string_to_id.get(value) { id } else { @@ -32,7 +32,7 @@ impl Dictionary { } } - pub fn decode(&self, id: u64) -> Option<&str> { + pub fn decode(&self, id: u32) -> Option<&str> { self.id_to_uri.get(&id).map(|s| s.as_str()) } @@ -69,7 +69,7 @@ impl Dictionary { #[cfg(test)] mod tests { use super::*; - use crate::indexing::shared::Event; + use crate::core::Event; #[test] fn test_dictionary_encoding_decoding() { @@ -116,7 +116,7 @@ mod tests { #[test] fn test_clean_rdf_api() { - use crate::indexing::shared::RDFEvent; + use crate::core::RDFEvent; let mut dict = Dictionary::new(); diff --git a/src/storage/memory_tracker.rs b/src/storage/memory_tracker.rs new file mode 100644 index 0000000..ccef928 --- /dev/null +++ b/src/storage/memory_tracker.rs @@ -0,0 +1,241 @@ +use std::collections::VecDeque; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +/// Memory usage tracker for benchmarking purposes +#[derive(Debug, Clone)] +pub struct MemoryTracker { + peak_memory_bytes: Arc, + current_memory_bytes: Arc, + measurements: Arc>>, +} + +#[derive(Debug, Clone)] +pub struct MemoryMeasurement { + pub timestamp: std::time::Instant, + pub memory_bytes: usize, + pub description: String, +} + +#[derive(Debug)] +pub struct MemoryStats { + pub current_bytes: usize, + pub peak_bytes: usize, + pub total_measurements: usize, + pub avg_bytes: f64, + pub measurements: Vec, +} + +impl MemoryTracker { + pub fn new() -> Self { + Self { + peak_memory_bytes: Arc::new(AtomicUsize::new(0)), + current_memory_bytes: Arc::new(AtomicUsize::new(0)), + measurements: Arc::new(std::sync::Mutex::new(Vec::new())), + } + } + + /// Record current memory usage with a description + pub fn record(&self, description: &str) { + let current = self.estimate_current_memory(); + self.current_memory_bytes.store(current, Ordering::Relaxed); + + // Update peak if necessary + let peak = self.peak_memory_bytes.load(Ordering::Relaxed); + if current > peak { + self.peak_memory_bytes.store(current, Ordering::Relaxed); + } + + // Store measurement + let measurement = MemoryMeasurement { + timestamp: std::time::Instant::now(), + memory_bytes: current, + description: description.to_string(), + }; + + if let Ok(mut measurements) = self.measurements.lock() { + measurements.push(measurement); + } + } + + /// Get current memory statistics + pub fn get_stats(&self) -> MemoryStats { + let current = self.current_memory_bytes.load(Ordering::Relaxed); + let peak = self.peak_memory_bytes.load(Ordering::Relaxed); + + let measurements = if let Ok(m) = self.measurements.lock() { + m.clone() + } else { + Vec::new() + }; + + let avg_bytes = if measurements.is_empty() { + 0.0 + } else { + measurements.iter().map(|m| m.memory_bytes as f64).sum::() + / measurements.len() as f64 + }; + + MemoryStats { + current_bytes: current, + peak_bytes: peak, + total_measurements: measurements.len(), + avg_bytes, + measurements, + } + } + + /// Reset all measurements + pub fn reset(&self) { + self.current_memory_bytes.store(0, Ordering::Relaxed); + self.peak_memory_bytes.store(0, Ordering::Relaxed); + if let Ok(mut measurements) = self.measurements.lock() { + measurements.clear(); + } + } + + /// Estimate current memory usage of the process + fn estimate_current_memory(&self) -> usize { + // On macOS/Linux, try to read from /proc/self/status or use system calls + #[cfg(target_os = "macos")] + { + // For macOS, try using sysctl first, then fallback to basic estimation + match self.get_memory_macos_simple() { + Ok(mem) if mem > 0 => mem, + _ => self.estimate_heap_usage(), + } + } + #[cfg(target_os = "linux")] + { + match self.get_memory_linux() { + mem if mem > 0 => mem, + _ => self.estimate_heap_usage(), + } + } + #[cfg(not(any(target_os = "macos", target_os = "linux")))] + { + // Fallback: estimate based on heap allocation (rough approximation) + self.estimate_heap_usage() + } + } + + /// Simple heap usage estimation (very rough) + fn estimate_heap_usage(&self) -> usize { + // This is a very rough estimation based on typical memory patterns + // In a real implementation, you might use a memory allocator that tracks usage + + // Rough estimation: assume we're using around 50-100MB for a typical session + // This is obviously very imprecise but gives us something to work with + let estimated_base = 50 * 1024 * 1024; // 50MB base + + // Add some dynamic component based on time (simulating growth) + let dynamic_component = (std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + % 1000) as usize + * 1024; // Up to 1MB variation + + estimated_base + dynamic_component + } + + #[cfg(target_os = "macos")] + fn get_memory_macos_simple(&self) -> Result> { + // Try using ps command as a fallback + use std::process::Command; + + let output = Command::new("ps") + .args(&["-o", "rss=", "-p", &std::process::id().to_string()]) + .output()?; + + if output.status.success() { + let rss_str = std::str::from_utf8(&output.stdout)?; + let rss_kb: usize = rss_str.trim().parse()?; + Ok(rss_kb * 1024) // Convert KB to bytes + } else { + Err("ps command failed".into()) + } + } + + #[cfg(target_os = "macos")] + fn get_memory_macos(&self) -> usize { + use std::mem; + use std::ptr; + + #[repr(C)] + struct task_basic_info { + virtual_size: u32, + resident_size: u32, + policy: u32, + flags: u32, + } + + extern "C" { + fn mach_task_self() -> u32; + fn task_info( + target_task: u32, + flavor: u32, + task_info_out: *mut task_basic_info, + task_info_outCnt: *mut u32, + ) -> i32; + } + + const TASK_BASIC_INFO: u32 = 5; + let mut info: task_basic_info = unsafe { mem::zeroed() }; + let mut count = (mem::size_of::() / mem::size_of::()) as u32; + + let result = unsafe { + task_info( + mach_task_self(), + TASK_BASIC_INFO, + &mut info as *mut task_basic_info, + &mut count, + ) + }; + + if result == 0 { + info.resident_size as usize + } else { + 0 + } + } + + #[cfg(target_os = "linux")] + fn get_memory_linux(&self) -> usize { + use std::fs; + + if let Ok(contents) = fs::read_to_string("/proc/self/status") { + for line in contents.lines() { + if line.starts_with("VmRSS:") { + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 2 { + if let Ok(kb) = parts[1].parse::() { + return kb * 1024; // Convert KB to bytes + } + } + } + } + } + 0 + } + + /// Format bytes in human-readable format + pub fn format_bytes(bytes: usize) -> String { + const UNITS: &[&str] = &["B", "KB", "MB", "GB"]; + let mut size = bytes as f64; + let mut unit_index = 0; + + while size >= 1024.0 && unit_index < UNITS.len() - 1 { + size /= 1024.0; + unit_index += 1; + } + + format!("{:.2} {}", size, UNITS[unit_index]) + } +} + +impl Default for MemoryTracker { + fn default() -> Self { + Self::new() + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index f797e0d..b96e7ca 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1,3 +1,4 @@ +pub mod memory_tracker; pub mod segmented_storage; pub mod util; pub mod indexing { diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs index 1cd0155..38fb557 100644 --- a/src/storage/segmented_storage.rs +++ b/src/storage/segmented_storage.rs @@ -1,10 +1,8 @@ use core::time; use std::{ collections::VecDeque, - fmt::format, io::{BufWriter, Read, Seek, SeekFrom, Write}, - ops::Index, - panic::set_hook, + rc::Rc, sync::{Arc, Mutex, RwLock}, thread::JoinHandle, time::{Duration, SystemTime, UNIX_EPOCH}, @@ -21,16 +19,18 @@ use crate::{ }, }; +#[doc = "Struct for the Implementation of the Segmented Storage of RDF Streams."] pub struct StreamingSegmentedStorage { batch_buffer: Arc>, segments: Arc>>, - dictionary: Arc>, + dictionary: Rc>, flush_handle: Option>, shutdown_signal: Arc>, config: StreamingConfig, } impl StreamingSegmentedStorage { + #[doc = ""] pub fn new(config: StreamingConfig) -> std::io::Result { std::fs::create_dir_all(&config.segment_base_path)?; @@ -43,7 +43,7 @@ impl StreamingSegmentedStorage { })), segments: Arc::new(RwLock::new(Vec::new())), - dictionary: Arc::new(RwLock::new(Dictionary::new())), + dictionary: Rc::new(RwLock::new(Dictionary::new())), flush_handle: None, shutdown_signal: Arc::new(Mutex::new(false)), config, @@ -52,6 +52,7 @@ impl StreamingSegmentedStorage { Ok(storage) } + #[doc = ""] pub fn start_background_flushing(&mut self) { let batch_buffer_clone = Arc::clone(&self.batch_buffer); let segments_clone = Arc::clone(&self.segments); @@ -86,10 +87,8 @@ impl StreamingSegmentedStorage { batch_buffer.events.push_back(event); } - // Note: Synchronous flushing removed for high throughput. // Background thread handles all flushing based on time limits. - Ok(()) } @@ -120,12 +119,12 @@ impl StreamingSegmentedStorage { // Use saturating subtraction to avoid underflow if oldest > current_timestamp current_timestamp.saturating_sub(oldest) - >= self.config.max_batch_age_seconds * 1_000_000_000 + >= self.config.max_batch_age_seconds * 1_000 }) } fn current_timestamp() -> u64 { - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 } fn flush_batch_buffer_to_segment(&self) -> std::io::Result<()> { @@ -415,7 +414,7 @@ impl StreamingSegmentedStorage { || batch_buffer.oldest_timestamp_bound.map_or(false, |oldest| { let current_timestamp = Self::current_timestamp(); current_timestamp.saturating_sub(oldest) - >= config.max_batch_age_seconds * 1_000_000_000 + >= config.max_batch_age_seconds * 1_000 }) }; @@ -640,6 +639,6 @@ impl StreamingSegmentedStorage { } fn generate_segment_id() -> u64 { - SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 + SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 } } diff --git a/src/storage/util.rs b/src/storage/util.rs index 8069e85..e6f7a2d 100644 --- a/src/storage/util.rs +++ b/src/storage/util.rs @@ -5,6 +5,15 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use crate::core::Event; +#[derive(Debug)] +/// Storage component memory usage breakdown +pub struct StorageComponentSizes { + pub batch_buffer_bytes: usize, + pub segments_count: usize, + pub dictionary_bytes: usize, + pub estimated_total_bytes: usize, +} + #[derive(Debug)] /// In-memory buffer that batches events before persisting them to disk pub struct BatchBuffer { diff --git a/tests/dictionary_encoding_test.rs b/tests/dictionary_encoding_test.rs index 75c6e9d..00a4f1e 100644 --- a/tests/dictionary_encoding_test.rs +++ b/tests/dictionary_encoding_test.rs @@ -22,9 +22,10 @@ //! - Object ID → "23.5" (the literal value) //! - Datatype ID → "http://www.w3.org/2001/XMLSchema#double" (if needed) -use janus::indexing::dictionary::Dictionary; -use janus::indexing::shared::{decode_record, encode_record, LogWriter, RECORD_SIZE}; -use janus::indexing::sparse::{build_sparse_index, SparseReader}; +use janus::core::encoding::{decode_record, encode_record, RECORD_SIZE}; +use janus::indexing::shared::LogWriter; +use janus::storage::indexing::dictionary::Dictionary; +use janus::storage::indexing::sparse::{build_sparse_index, SparseReader}; use std::fs; use std::path::Path; @@ -39,30 +40,30 @@ fn test_rdf_syntax_to_dictionary_mapping() { // Subject: → stored as the URI string let subject = "https://rsp.js/event1"; - let subject_id = dict.fetch_id(subject); + let subject_id = dict.encode(subject); // Predicate: → stored as the URI string let predicate = "http://www.w3.org/ns/saref#hasValue"; - let predicate_id = dict.fetch_id(predicate); + let predicate_id = dict.encode(predicate); // Object: "23.5"^^xsd:double → stored as the literal value "23.5" let object = "23.5"; - let object_id = dict.fetch_id(object); + let object_id = dict.encode(object); // Datatype: ^^ → stored as URI string let datatype = "http://www.w3.org/2001/XMLSchema#double"; - let datatype_id = dict.fetch_id(datatype); + let datatype_id = dict.encode(datatype); // Graph: → stored as the URI string let graph = "https://example.org/graph"; - let graph_id = dict.fetch_id(graph); + let graph_id = dict.encode(graph); // Verify all components are stored correctly - assert_eq!(dict.fetch_uri(subject_id), Some(subject)); - assert_eq!(dict.fetch_uri(predicate_id), Some(predicate)); - assert_eq!(dict.fetch_uri(object_id), Some(object)); - assert_eq!(dict.fetch_uri(datatype_id), Some(datatype)); - assert_eq!(dict.fetch_uri(graph_id), Some(graph)); + assert_eq!(dict.decode(subject_id), Some(subject)); + assert_eq!(dict.decode(predicate_id), Some(predicate)); + assert_eq!(dict.decode(object_id), Some(object)); + assert_eq!(dict.decode(datatype_id), Some(datatype)); + assert_eq!(dict.decode(graph_id), Some(graph)); // In a real system, you'd also store metadata about which IDs are literals vs URIs // and what datatype each literal has. This test just demonstrates the string storage. @@ -91,34 +92,34 @@ fn test_rdf_literal_datatypes() { let label_datatype = "http://www.w3.org/2001/XMLSchema#string"; // Store all values and datatypes in dictionary - let timestamp_val_id = dict.fetch_id(timestamp_value); - let timestamp_dt_id = dict.fetch_id(timestamp_datatype); + let timestamp_val_id = dict.encode(timestamp_value); + let timestamp_dt_id = dict.encode(timestamp_datatype); - let temp_val_id = dict.fetch_id(temp_value); - let temp_dt_id = dict.fetch_id(temp_datatype); + let temp_val_id = dict.encode(temp_value); + let temp_dt_id = dict.encode(temp_datatype); - let count_val_id = dict.fetch_id(count_value); - let count_dt_id = dict.fetch_id(count_datatype); + let count_val_id = dict.encode(count_value); + let count_dt_id = dict.encode(count_datatype); - let label_val_id = dict.fetch_id(label_value); - let label_dt_id = dict.fetch_id(label_datatype); + let label_val_id = dict.encode(label_value); + let label_dt_id = dict.encode(label_datatype); // Verify all are stored correctly - assert_eq!(dict.fetch_uri(timestamp_val_id), Some(timestamp_value)); - assert_eq!(dict.fetch_uri(timestamp_dt_id), Some(timestamp_datatype)); + assert_eq!(dict.decode(timestamp_val_id), Some(timestamp_value)); + assert_eq!(dict.decode(timestamp_dt_id), Some(timestamp_datatype)); - assert_eq!(dict.fetch_uri(temp_val_id), Some(temp_value)); - assert_eq!(dict.fetch_uri(temp_dt_id), Some(temp_datatype)); + assert_eq!(dict.decode(temp_val_id), Some(temp_value)); + assert_eq!(dict.decode(temp_dt_id), Some(temp_datatype)); - assert_eq!(dict.fetch_uri(count_val_id), Some(count_value)); - assert_eq!(dict.fetch_uri(count_dt_id), Some(count_datatype)); + assert_eq!(dict.decode(count_val_id), Some(count_value)); + assert_eq!(dict.decode(count_dt_id), Some(count_datatype)); - assert_eq!(dict.fetch_uri(label_val_id), Some(label_value)); - assert_eq!(dict.fetch_uri(label_dt_id), Some(label_datatype)); + assert_eq!(dict.decode(label_val_id), Some(label_value)); + assert_eq!(dict.decode(label_dt_id), Some(label_datatype)); // Note: Datatype URIs are reused across multiple literals // E.g., many literals will have ^^xsd:double as their datatype - assert_eq!(temp_dt_id, dict.fetch_id(temp_datatype)); // Same ID when requested again + assert_eq!(temp_dt_id, dict.encode(temp_datatype)); // Same ID when requested again } #[test] @@ -132,35 +133,35 @@ fn test_dictionary_basic_operations() { let uri4 = "http://www.w3.org/ns/ssn#observedBy"; // First insertion should return ID 0 - let id1 = dict.fetch_id(uri1); + let id1 = dict.encode(uri1); assert_eq!(id1, 0); // Subsequent insertions should return sequential IDs - let id2 = dict.fetch_id(uri2); + let id2 = dict.encode(uri2); assert_eq!(id2, 1); - let id3 = dict.fetch_id(uri3); + let id3 = dict.encode(uri3); assert_eq!(id3, 2); - let id4 = dict.fetch_id(uri4); + let id4 = dict.encode(uri4); assert_eq!(id4, 3); // Requesting same URI should return same ID - let id1_again = dict.fetch_id(uri1); + let id1_again = dict.encode(uri1); assert_eq!(id1_again, id1); // Test retrieval - assert_eq!(dict.fetch_uri(id1), Some(uri1)); - assert_eq!(dict.fetch_uri(id2), Some(uri2)); - assert_eq!(dict.fetch_uri(id3), Some(uri3)); - assert_eq!(dict.fetch_uri(id4), Some(uri4)); + assert_eq!(dict.decode(id1), Some(uri1)); + assert_eq!(dict.decode(id2), Some(uri2)); + assert_eq!(dict.decode(id3), Some(uri3)); + assert_eq!(dict.decode(id4), Some(uri4)); // Test invalid ID - assert_eq!(dict.fetch_uri(999), None); + assert_eq!(dict.decode(999), None); // Test length - assert_eq!(dict.len(), 4); - assert!(!dict.is_empty()); + assert_eq!(dict.id_to_uri.len(), 4); + assert!(!dict.id_to_uri.is_empty()); } #[test] @@ -180,7 +181,7 @@ fn test_dictionary_persistence() -> std::io::Result<()> { "https://solid.ti.rw.fau.de/public/ns/stream#", ]; - let ids: Vec = uris.iter().map(|uri| dict.fetch_id(uri)).collect(); + let ids: Vec = uris.iter().map(|uri| dict.encode(uri)).collect(); // Save to file dict.save_to_file(&dict_path)?; @@ -190,10 +191,10 @@ fn test_dictionary_persistence() -> std::io::Result<()> { // Verify all URIs are preserved with correct IDs for (i, uri) in uris.iter().enumerate() { - assert_eq!(loaded_dict.fetch_uri(ids[i]), Some(*uri)); + assert_eq!(loaded_dict.decode(ids[i]), Some(*uri)); } - assert_eq!(loaded_dict.len(), uris.len()); + assert_eq!(loaded_dict.id_to_uri.len(), uris.len()); Ok(()) } @@ -214,10 +215,10 @@ fn test_rdf_event_encoding_with_dictionary() { // Map URIs to IDs let timestamp: u64 = 1699181400; - let subject_id = dict.fetch_id(subject_uri); - let predicate_id = dict.fetch_id(predicate_uri); - let object_id = dict.fetch_id(object_uri); - let graph_id = dict.fetch_id(graph_uri); + let subject_id = dict.encode(subject_uri); + let predicate_id = dict.encode(predicate_uri); + let object_id = dict.encode(object_uri); + let graph_id = dict.encode(graph_uri); // Encode record with IDs let mut buffer = [0u8; RECORD_SIZE]; @@ -234,10 +235,10 @@ fn test_rdf_event_encoding_with_dictionary() { assert_eq!(dec_graph, graph_id); // Resolve IDs back to URIs - assert_eq!(dict.fetch_uri(dec_subject), Some(subject_uri)); - assert_eq!(dict.fetch_uri(dec_predicate), Some(predicate_uri)); - assert_eq!(dict.fetch_uri(dec_object), Some(object_uri)); - assert_eq!(dict.fetch_uri(dec_graph), Some(graph_uri)); + assert_eq!(dict.decode(dec_subject), Some(subject_uri)); + assert_eq!(dict.decode(dec_predicate), Some(predicate_uri)); + assert_eq!(dict.decode(dec_object), Some(object_uri)); + assert_eq!(dict.decode(dec_graph), Some(graph_uri)); } #[test] @@ -258,10 +259,10 @@ fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> { ]; // Map predicates to IDs first (these will be reused) - let predicate_ids: Vec = predicates.iter().map(|p| dict.fetch_id(p)).collect(); + let predicate_ids: Vec = predicates.iter().map(|p| dict.encode(p)).collect(); let graph_uri = "https://solid.ti.rw.fau.de/public/ns/stream#iot"; - let graph_id = dict.fetch_id(graph_uri); + let graph_id = dict.encode(graph_uri); // Create log writer let mut writer = LogWriter::create(&log_path)?; @@ -272,14 +273,14 @@ fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> { // Each event has unique subject (sensor reading ID) let subject_uri = format!("https://rsp.js/event/sensor-reading-{:03}", i); - let subject_id = dict.fetch_id(&subject_uri); + let subject_id = dict.encode(&subject_uri); // Rotate through predicates (demonstrating reuse) let predicate_id = predicate_ids[(i % predicate_ids.len() as u64) as usize]; // Unique object (sensor value) let object_uri = format!("value-{}", i * 10); - let object_id = dict.fetch_id(&object_uri); + let object_id = dict.encode(&object_uri); writer.append_record(timestamp, subject_id, predicate_id, object_id, graph_id)?; } @@ -293,11 +294,11 @@ fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> { // - 100 unique objects // - 1 graph URI // Total: 205 unique URIs - assert_eq!(dict.len(), 205); + assert_eq!(dict.id_to_uri.len(), 205); // Verify predicate reuse - predicates should have low IDs (0-3) for (i, pred) in predicates.iter().enumerate() { - assert_eq!(dict.fetch_id(pred), i as u64); + assert_eq!(dict.encode(pred), i as u32); } Ok(()) @@ -319,10 +320,10 @@ fn test_sparse_index_with_dictionary_integration() -> std::io::Result<()> { let predicates = vec!["http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue"]; - let predicate_ids: Vec = predicates.iter().map(|p| dict.fetch_id(p)).collect(); + let predicate_ids: Vec = predicates.iter().map(|p| dict.encode(p)).collect(); let graph_uri = "https://example.org/graph/sensors"; - let graph_id = dict.fetch_id(graph_uri); + let graph_id = dict.encode(graph_uri); // Create log with 1000 events let mut writer = LogWriter::create(&log_path)?; @@ -330,10 +331,10 @@ fn test_sparse_index_with_dictionary_integration() -> std::io::Result<()> { for i in 0..1000 { let timestamp = i; let subject_uri = format!("https://rsp.js/event/{:04}", i); - let subject_id = dict.fetch_id(&subject_uri); + let subject_id = dict.encode(&subject_uri); let predicate_id = predicate_ids[(i % 2) as usize]; let object_uri = format!("reading-{}", i); - let object_id = dict.fetch_id(&object_uri); + let object_id = dict.encode(&object_uri); writer.append_record(timestamp, subject_id, predicate_id, object_id, graph_id)?; } @@ -380,11 +381,11 @@ fn test_large_uri_handling() { "measurement-with-very-long-identifier-12345678901234567890" ); - let id = dict.fetch_id(&long_uri); + let id = dict.encode(&long_uri); assert_eq!(id, 0); // Verify retrieval works - assert_eq!(dict.fetch_uri(id), Some(long_uri.as_str())); + assert_eq!(dict.decode(id), Some(long_uri.as_str())); // Test that we can handle many long URIs for i in 0..100 { @@ -394,10 +395,10 @@ fn test_large_uri_handling() { i * 2, i * 3 ); - dict.fetch_id(&uri); + dict.encode(&uri); } - assert_eq!(dict.len(), 101); + assert_eq!(dict.id_to_uri.len(), 101); } #[test] @@ -415,24 +416,24 @@ fn test_rdf_namespace_reuse() { ]; // Map each namespace - let namespace_ids: Vec = common_namespaces.iter().map(|ns| dict.fetch_id(ns)).collect(); + let namespace_ids: Vec = common_namespaces.iter().map(|ns| dict.encode(ns)).collect(); // Create 1000 events that all use these namespaces for i in 0..1000 { let event_uri = format!("https://rsp.js/event/{}", i); - dict.fetch_id(&event_uri); + dict.encode(&event_uri); // Reference one of the common namespaces let ns_id = namespace_ids[i % namespace_ids.len()]; - assert!(dict.fetch_uri(ns_id).is_some()); + assert!(dict.decode(ns_id).is_some()); } // Dictionary should have: 6 namespaces + 1000 events = 1006 entries - assert_eq!(dict.len(), 1006); + assert_eq!(dict.id_to_uri.len(), 1006); // Verify namespace IDs are unchanged (demonstrating reuse) for (i, ns) in common_namespaces.iter().enumerate() { - assert_eq!(dict.fetch_id(ns), namespace_ids[i]); + assert_eq!(dict.encode(ns), namespace_ids[i]); } } @@ -474,10 +475,10 @@ fn test_event_resolution_workflow() -> std::io::Result<()> { let mut writer = LogWriter::create(&log_path)?; for (timestamp, subject, predicate, object, graph) in &event_uris { - let subject_id = dict.fetch_id(subject); - let predicate_id = dict.fetch_id(predicate); - let object_id = dict.fetch_id(object); - let graph_id = dict.fetch_id(graph); + let subject_id = dict.encode(subject); + let predicate_id = dict.encode(predicate); + let object_id = dict.encode(object); + let graph_id = dict.encode(graph); writer.append_record(*timestamp, subject_id, predicate_id, object_id, graph_id)?; } @@ -498,10 +499,10 @@ fn test_event_resolution_workflow() -> std::io::Result<()> { assert_eq!(dec_ts, *timestamp); // Resolve IDs to URIs - assert_eq!(dict.fetch_uri(dec_subj_id), Some(*subject)); - assert_eq!(dict.fetch_uri(dec_pred_id), Some(*predicate)); - assert_eq!(dict.fetch_uri(dec_obj_id), Some(*object)); - assert_eq!(dict.fetch_uri(dec_graph_id), Some(*graph)); + assert_eq!(dict.decode(dec_subj_id), Some(*subject)); + assert_eq!(dict.decode(dec_pred_id), Some(*predicate)); + assert_eq!(dict.decode(dec_obj_id), Some(*object)); + assert_eq!(dict.decode(dec_graph_id), Some(*graph)); } Ok(()) @@ -522,7 +523,7 @@ fn test_dictionary_space_savings() { let raw_size: usize = uris.iter().map(|u| u.len()).sum(); // With dictionary, we store 8 bytes per ID - let ids: Vec = uris.iter().map(|u| dict.fetch_id(u)).collect(); + let ids: Vec = uris.iter().map(|u| dict.encode(u)).collect(); let encoded_size = ids.len() * 8; // 8 bytes per u64 println!("Raw URIs size: {} bytes", raw_size); @@ -572,7 +573,7 @@ fn test_complete_rdf_quad_with_datatype() { // Store all components and get their IDs let mut component_ids = std::collections::HashMap::new(); for (name, value) in &components { - let id = dict.fetch_id(value); + let id = dict.encode(value); component_ids.insert(*name, id); println!("{}: '{}' → ID {}", name, value, id); } @@ -588,11 +589,11 @@ fn test_complete_rdf_quad_with_datatype() { // that tracks which object IDs are literals and what their datatypes are. // Verify retrieval - assert_eq!(dict.fetch_uri(component_ids["subject"]), Some(components[0].1)); - assert_eq!(dict.fetch_uri(component_ids["predicate"]), Some(components[1].1)); - assert_eq!(dict.fetch_uri(component_ids["object_value"]), Some(components[2].1)); - assert_eq!(dict.fetch_uri(component_ids["object_datatype"]), Some(components[3].1)); - assert_eq!(dict.fetch_uri(component_ids["graph"]), Some(components[4].1)); + assert_eq!(dict.decode(component_ids["subject"]), Some(components[0].1)); + assert_eq!(dict.decode(component_ids["predicate"]), Some(components[1].1)); + assert_eq!(dict.decode(component_ids["object_value"]), Some(components[2].1)); + assert_eq!(dict.decode(component_ids["object_datatype"]), Some(components[3].1)); + assert_eq!(dict.decode(component_ids["graph"]), Some(components[4].1)); // Another quad with the same datatype: // "65.2"^^ . @@ -600,13 +601,13 @@ fn test_complete_rdf_quad_with_datatype() { let subject2 = "https://rsp.js/event/humidity-sensor-001"; let value2 = "65.2"; - let _subject2_id = dict.fetch_id(subject2); - let _value2_id = dict.fetch_id(value2); + let _subject2_id = dict.encode(subject2); + let _value2_id = dict.encode(value2); // These components are REUSED (same ID returned): - let predicate2_id = dict.fetch_id("http://www.w3.org/ns/saref#hasValue"); - let datatype2_id = dict.fetch_id("http://www.w3.org/2001/XMLSchema#double"); - let graph2_id = dict.fetch_id("https://example.org/graph/sensors"); + let predicate2_id = dict.encode("http://www.w3.org/ns/saref#hasValue"); + let datatype2_id = dict.encode("http://www.w3.org/2001/XMLSchema#double"); + let graph2_id = dict.encode("https://example.org/graph/sensors"); // Verify reuse assert_eq!(predicate2_id, component_ids["predicate"]); @@ -614,9 +615,9 @@ fn test_complete_rdf_quad_with_datatype() { assert_eq!(graph2_id, component_ids["graph"]); // Dictionary has: 5 original components + 2 new (subject2, value2) = 7 total - assert_eq!(dict.len(), 7); + assert_eq!(dict.id_to_uri.len(), 7); println!("\n✓ Demonstrated RDF datatype handling with dictionary encoding"); println!("✓ Showed URI reuse across multiple quads (predicate, datatype, graph)"); - println!("✓ Dictionary size: {} entries for 2 complete RDF quads", dict.len()); + println!("✓ Dictionary size: {} entries for 2 complete RDF quads", dict.id_to_uri.len()); } From f4a22a536182d65adc92fdc69261fb4f7da75797 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 12 Nov 2025 15:31:58 +0100 Subject: [PATCH 12/19] Remove examples and benchmark scripts; refactor main and storage indexing for improved performance and clarity - Deleted the basic example demonstrating Janus RDF Stream Processing Engine. - Removed the comprehensive benchmark script for testing Dense vs Sparse indexing approaches. - Refactored `main.rs` to clean up print statements and improve readability. - Updated `dictionary.rs` to simplify logging in tests. - Corrected file naming in `segmented_storage.rs` for index files. - Enhanced the `load_index_directory_from_file` function to reconstruct index blocks accurately. - Added new examples for point query and range query benchmarks, focusing on realistic IoT sensor data. - Implemented a realistic RDF benchmark for IoT sensor observations, analyzing write and read performance. --- CONTRIBUTING.md | 2 +- Cargo.toml | 12 -- GETTING_STARTED.md | 2 +- benches/README.md | 102 ---------- benches/analysis.rs | 208 ------------------- benches/benchmark.rs | 128 ------------ benches/write_benchmark.rs | 216 -------------------- examples/basic.rs | 42 ---- examples/point_query_benchmark.rs | 147 +++++++++++++ examples/range_query_benchmark.rs | 273 +++++++++++++++++++++++++ examples/realistic_rdf_benchmark.rs | 306 ++++++++++++++++++++++++++++ run_benchmarks.sh | 47 ----- src/main.rs | 12 +- src/storage/indexing/dictionary.rs | 2 +- src/storage/segmented_storage.rs | 86 +++++++- 15 files changed, 811 insertions(+), 774 deletions(-) delete mode 100644 benches/README.md delete mode 100644 benches/analysis.rs delete mode 100644 benches/benchmark.rs delete mode 100644 benches/write_benchmark.rs delete mode 100644 examples/basic.rs create mode 100644 examples/point_query_benchmark.rs create mode 100644 examples/range_query_benchmark.rs create mode 100644 examples/realistic_rdf_benchmark.rs delete mode 100755 run_benchmarks.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b10a4dd..71f1953 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -382,4 +382,4 @@ By contributing to Janus, you agree that your contributions will be licensed und --- -Thank you for contributing to Janus! 🎉 \ No newline at end of file +Thank you for contributing to Janus! \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 4f9485e..8601fa5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,15 +36,3 @@ opt-level = 0 [profile.test] opt-level = 0 - -[[bench]] -name = "benchmark" -harness = false - -[[bench]] -name = "write_benchmark" -harness = false - -[[bench]] -name = "analysis" -harness = false diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index df2f77b..c693f31 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -399,4 +399,4 @@ This project is licensed under the MIT License - see [LICENCE.md](LICENCE.md) fo --- -Happy coding with Janus! 🚀 \ No newline at end of file +Happy coding with Janus! \ No newline at end of file diff --git a/benches/README.md b/benches/README.md deleted file mode 100644 index 2892fd1..0000000 --- a/benches/README.md +++ /dev/null @@ -1,102 +0,0 @@ -# RDF Indexing Benchmarks - -This directory contains comprehensive benchmarks for comparing different RDF indexing strategies in Janus. - -## Available Benchmarks - -### 1. `benchmark.rs` - Read Performance (Original) -Tests query performance on pre-built indexes: -- Index building time comparison -- Query speed across different data ranges -- Memory usage comparison - -### 2. `write_benchmark.rs` - Write Performance (New) -Tests writing performance during record insertion: -- Real-time indexing (index while writing) -- Batch indexing (build index after writing) -- Writing throughput comparison -- Total processing time analysis - -### 3. `analysis.rs` - Advanced Analysis (New) -Detailed analysis across multiple dimensions: -- Optimal sparse interval analysis -- Memory usage scaling -- Write throughput under different conditions -- Performance recommendations - -## Quick Start - -### Run All Benchmarks -```bash -./run_benchmarks.sh -``` - -### Run Individual Benchmarks -```bash -# Original read performance benchmark -cargo bench --bench benchmark - -# New write performance benchmark -cargo bench --bench write_benchmark - -# Advanced analysis suite -cargo bench --bench analysis -``` - -## Step-by-Step Testing Instructions - -Benchmarks are organized by functionality: - -- `query_parsing.rs` - Benchmarks for parsing RSP-QL queries -- `stream_processing.rs` - Benchmarks for stream processing operations -- `store_operations.rs` - Benchmarks for RDF store interactions -- `integration.rs` - End-to-end integration benchmarks - -## Adding New Benchmarks - -To add a new benchmark: - -1. Create a new file in the `benches/` directory -2. Add the benchmark to `Cargo.toml`: - -```toml -[[bench]] -name = "my_benchmark" -harness = false -``` - -3. Use the `criterion` crate for benchmarking: - -```rust -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -fn benchmark_function(c: &mut Criterion) { - c.bench_function("my_function", |b| { - b.iter(|| { - // Code to benchmark - black_box(my_function()) - }); - }); -} - -criterion_group!(benches, benchmark_function); -criterion_main!(benches); -``` - -## Benchmark Results - -Benchmark results are stored in `target/criterion/` and include: - -- HTML reports with graphs -- Comparison with previous runs -- Statistical analysis - -To view results, open `target/criterion/report/index.html` in a browser. - -## Performance Tips - -- Run benchmarks in release mode (default for `cargo bench`) -- Ensure system is idle during benchmarking -- Use consistent hardware for comparisons -- Run multiple iterations to reduce noise -- Use `black_box()` to prevent compiler optimizations diff --git a/benches/analysis.rs b/benches/analysis.rs deleted file mode 100644 index cbe6858..0000000 --- a/benches/analysis.rs +++ /dev/null @@ -1,208 +0,0 @@ -use janus::indexing::{dense, sparse}; -use std::fs; -use std::time::Instant; - -/// Analyze different sparse intervals to find optimal configuration -fn analyze_sparse_intervals() -> std::io::Result<()> { - println!("🔍 Analyzing Different Sparse Intervals"); - println!("====================================="); - - let intervals = vec![100, 500, 1000, 2000, 5000, 10000]; - let log_file = "data/benchmark/log.dat"; - let number_records = 100_000u64; - - // Create test data - fs::create_dir_all("data/benchmark")?; - let mut writer = janus::indexing::shared::LogWriter::create(log_file)?; - for i in 0..number_records { - writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; - } - writer.flush()?; - - println!("Testing {} records with different intervals:", number_records); - println!("{:-<80}", ""); - println!( - "{:<10} {:<15} {:<15} {:<20} {:<15}", - "Interval", "Build Time(ms)", "Index Size(KB)", "Space Savings(%)", "Query Time(ms)" - ); - println!("{:-<80}", ""); - - // Get dense index stats for comparison - let dense_start = Instant::now(); - dense::build_dense_index(log_file, "data/benchmark/dense_ref.idx")?; - let dense_build_time = dense_start.elapsed(); - let dense_reader = dense::DenseIndexReader::open("data/benchmark/dense_ref.idx")?; - let dense_size = dense_reader.index_size_bytes(); - - // Test query performance on dense index - let query_start = Instant::now(); - let _dense_results = dense_reader.query(log_file, 10000, 20000)?; - let dense_query_time = query_start.elapsed(); - - for interval in intervals { - let index_file = format!("data/benchmark/sparse_{}.idx", interval); - - // Build sparse index - let start = Instant::now(); - sparse::build_sparse_index(log_file, &index_file, &interval)?; - let build_time = start.elapsed(); - - // Get size info - let reader = sparse::SparseReader::open(&index_file, interval)?; - let sparse_size = reader.index_size_bytes(); - let space_savings = ((dense_size - sparse_size) as f64 / dense_size as f64) * 100.0; - - // Test query performance - let query_start = Instant::now(); - let _sparse_results = reader.query(log_file, 10000, 20000)?; - let query_time = query_start.elapsed(); - - println!( - "{:<10} {:<15.3} {:<15.2} {:<20.2} {:<15.3}", - interval, - build_time.as_secs_f64() * 1000.0, - sparse_size as f64 / 1024.0, - space_savings, - query_time.as_secs_f64() * 1000.0 - ); - } - - println!("{:-<80}", ""); - println!( - "Dense Reference: Build: {:.3}ms, Size: {:.2}KB, Query: {:.3}ms", - dense_build_time.as_secs_f64() * 1000.0, - dense_size as f64 / 1024.0, - dense_query_time.as_secs_f64() * 1000.0 - ); - - Ok(()) -} - -/// Analyze memory usage patterns -fn analyze_memory_usage() -> std::io::Result<()> { - println!("\n🧠 Memory Usage Analysis"); - println!("======================="); - - let record_counts = vec![10_000, 50_000, 100_000, 500_000, 1_000_000]; - - println!( - "{:<12} {:<15} {:<15} {:<20}", - "Records", "Dense Size(MB)", "Sparse Size(MB)", "Memory Ratio" - ); - println!("{:-<62}", ""); - - for &count in &record_counts { - let log_file = format!("data/benchmark/log_{}.dat", count); - let dense_index = format!("data/benchmark/dense_{}.idx", count); - let sparse_index = format!("data/benchmark/sparse_{}.idx", count); - - // Create test data - let mut writer = janus::indexing::shared::LogWriter::create(&log_file)?; - for i in 0..count { - writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; - } - writer.flush()?; - - // Build indexes - dense::build_dense_index(&log_file, &dense_index)?; - sparse::build_sparse_index(&log_file, &sparse_index, &1000)?; - - // Get sizes - let dense_reader = dense::DenseIndexReader::open(&dense_index)?; - let sparse_reader = sparse::SparseReader::open(&sparse_index, 1000)?; - - let dense_size = dense_reader.index_size_bytes() as f64 / 1_000_000.0; - let sparse_size = sparse_reader.index_size_bytes() as f64 / 1_000_000.0; - let ratio = dense_size / sparse_size; - - println!("{:<12} {:<15.3} {:<15.3} {:<20.2}x", count, dense_size, sparse_size, ratio); - } - - Ok(()) -} - -/// Test write throughput under different conditions -fn analyze_write_throughput() -> std::io::Result<()> { - println!("\n⚡ Write Throughput Analysis"); - println!("==========================="); - - let test_configs = vec![ - ("Small batches", 1_000u64), - ("Medium batches", 10_000u64), - ("Large batches", 100_000u64), - ]; - - println!( - "{:<15} {:<20} {:<20} {:<15}", - "Batch Size", "Dense (rec/sec)", "Sparse (rec/sec)", "Speedup" - ); - println!("{:-<70}", ""); - - for (name, batch_size) in test_configs { - fs::create_dir_all("data/benchmark")?; - - // Test dense writing - let dense_log = "data/benchmark/dense_throughput.dat"; - let dense_index = "data/benchmark/dense_throughput.idx"; - - let start = Instant::now(); - let mut log_writer = janus::indexing::shared::LogWriter::create(dense_log)?; - let mut index_builder = janus::indexing::dense::DenseIndexBuilder::create(dense_index)?; - - for i in 0..batch_size { - log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; - index_builder.add_entry(i, i * 24)?; - } - log_writer.flush()?; - index_builder.finalize()?; - - let dense_time = start.elapsed(); - let dense_throughput = batch_size as f64 / dense_time.as_secs_f64(); - - // Test sparse writing - let sparse_log = "data/benchmark/sparse_throughput.dat"; - let sparse_index = "data/benchmark/sparse_throughput.idx"; - - let start = Instant::now(); - let mut log_writer = janus::indexing::shared::LogWriter::create(sparse_log)?; - let mut index_builder = - janus::indexing::sparse::SparseIndexBuilder::create(sparse_index, 1000)?; - - for i in 0..batch_size { - log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?; - index_builder.add_entry(i, i, i * 24)?; - } - log_writer.flush()?; - index_builder.finalize()?; - - let sparse_time = start.elapsed(); - let sparse_throughput = batch_size as f64 / sparse_time.as_secs_f64(); - - let speedup = sparse_throughput / dense_throughput; - - println!( - "{:<15} {:<20.0} {:<20.0} {:<15.2}x", - name, dense_throughput, sparse_throughput, speedup - ); - } - - Ok(()) -} - -fn main() -> std::io::Result<()> { - println!("🔬 Advanced RDF Indexing Analysis Suite"); - println!("======================================="); - - analyze_sparse_intervals()?; - analyze_memory_usage()?; - analyze_write_throughput()?; - - println!("\n✨ Analysis Complete!"); - println!("\n💡 Recommendations:"); - println!(" • Use sparse indexing for write-heavy workloads"); - println!(" • Choose interval based on query precision requirements"); - println!(" • Consider hybrid approaches for different use cases"); - println!(" • Monitor memory usage with large datasets"); - - Ok(()) -} diff --git a/benches/benchmark.rs b/benches/benchmark.rs deleted file mode 100644 index 29aa79d..0000000 --- a/benches/benchmark.rs +++ /dev/null @@ -1,128 +0,0 @@ -use janus::indexing::{dense, shared::LogWriter, sparse}; -use std::fs; -use std::time::Instant; - -const DATA_DIR: &str = "data/benchmark"; -const LOG_FILE: &str = "data/benchmark/log.dat"; -const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx"; -const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx"; -const SPARSE_INTERVAL: usize = 1000; - -fn setup_data(number_records: u64) -> std::io::Result<()> { - let _ = fs::remove_dir_all(DATA_DIR); - fs::create_dir_all(DATA_DIR)?; - - let mut writer = LogWriter::create(LOG_FILE)?; - - for i in 0..number_records { - let timestamp = i; - let subject = (i % 1000) as u32; - let predicate = (i % 500) as u32; - let object = (i % 2000) as u32; - let graph: u32 = 1; - writer.append_record(timestamp, subject, predicate, object, graph)?; - } - - writer.flush()?; - - println!("Generated log file with {} records", writer.record_count()); - - Ok(()) -} - -fn benchmark_indexing() -> std::io::Result<()> { - println!("Indexing Benchmark"); - - let start = Instant::now(); - dense::build_dense_index(LOG_FILE, DENSE_INDEX_FILE)?; - let dense_time = start.elapsed(); - println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0); - - let start = Instant::now(); - sparse::build_sparse_index(LOG_FILE, SPARSE_INDEX_FILE, &SPARSE_INTERVAL)?; - let sparse_time = start.elapsed(); - println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0); - - let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; - let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?; - println!( - "\n Dense Index Size: {} MB", - dense_reader.index_size_bytes() as f64 / 1_000_000.0 - ); - - println!( - "\n Sparse Index Size: {} MB", - sparse_reader.index_size_bytes() as f64 / 1_000_000.0 - ); - Ok(()) -} - -fn benchmark_queries() -> std::io::Result<()> { - println!("Query Benchmark"); - let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; - let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?; - - let query_ranges = vec![ - (0u64, 100u64, "100 records"), - (5000u64, 5100u64, "100 records (mid-range)"), - (0u64, 10000u64, "10K records"), - (0u64, 100000u64, "100K records"), - (0u64, 1000000u64, "1M records"), - ]; - - for (timestamp_start, timestamp_end, description) in query_ranges { - println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end); - - let start = Instant::now(); - let dense_results = dense_reader.query(LOG_FILE, timestamp_start, timestamp_end)?; - let dense_time = start.elapsed(); - - let start = Instant::now(); - let sparse_results = sparse_reader.query(LOG_FILE, timestamp_start, timestamp_end)?; - let sparse_time = start.elapsed(); - - println!( - " Dense Index Query Time: {:.3} ms, Results: {}", - dense_time.as_secs_f64() * 1000.0, - dense_results.len() - ); - - println!( - " Sparse Index Query Time: {:.3} ms, Results: {}", - sparse_time.as_secs_f64() * 1000.0, - sparse_results.len() - ); - - let speedup = sparse_time.as_secs_f64() / dense_time.as_secs_f64(); - - if speedup > 1.0 { - println!(" Sparse index is {:.2} times faster than Dense index", speedup); - } else { - println!(" Dense index is {:.2} times faster than Sparse index", 1.0 / speedup); - } - - assert_eq!( - dense_results.len(), - sparse_results.len(), - "Mismatch in result counts between Dense and Sparse index queries" - ); - } - Ok(()) -} - -fn main() -> std::io::Result<()> { - println!("RDF Indexing Benchmark : Dense vs Sparse"); - println!("Setting up data..."); - let number_of_records = 1_000_000u64; - setup_data(number_of_records)?; - - benchmark_indexing()?; - benchmark_queries()?; - - println!( - "\n=== Summary ===\nSparse interval: {}\nUse this data to decide \ - which approach suits your use case best.", - SPARSE_INTERVAL - ); - Ok(()) -} diff --git a/benches/write_benchmark.rs b/benches/write_benchmark.rs deleted file mode 100644 index 38e8e13..0000000 --- a/benches/write_benchmark.rs +++ /dev/null @@ -1,216 +0,0 @@ -use janus::indexing::{dense::DenseIndexBuilder, shared::LogWriter, sparse::SparseIndexBuilder}; -use std::fs; -use std::time::Instant; - -const DATA_DIR: &str = "data/write_benchmark"; -const DENSE_LOG_FILE: &str = "data/write_benchmark/dense_log.dat"; -const SPARSE_LOG_FILE: &str = "data/write_benchmark/sparse_log.dat"; -const DENSE_INDEX_FILE: &str = "data/write_benchmark/dense.idx"; -const SPARSE_INDEX_FILE: &str = "data/write_benchmark/sparse.idx"; -const SPARSE_INTERVAL: usize = 1000; - -fn setup_dirs() -> std::io::Result<()> { - let _ = fs::remove_dir_all(DATA_DIR); - fs::create_dir_all(DATA_DIR)?; - Ok(()) -} - -/// Benchmark writing records with dense indexing -/// This simulates real-time writing where each record is indexed immediately -fn benchmark_dense_writing(number_records: u64) -> std::io::Result<(f64, f64)> { - println!("Benchmarking Dense Index Writing..."); - - let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?; - let mut index_builder = DenseIndexBuilder::create(DENSE_INDEX_FILE)?; - - let start = Instant::now(); - let mut current_offset = 0u64; - - for i in 0..number_records { - let timestamp = i; - let subject = (i % 1000) as u32; - let predicate = (i % 500) as u32; - let object = (i % 2000) as u32; - let graph: u32 = 1; - - // Write record to log - log_writer.append_record(timestamp, subject, predicate, object, graph)?; - - // Add entry to index - index_builder.add_entry(timestamp, current_offset)?; - - current_offset += 24; // RECORD_SIZE - } - - let write_time = start.elapsed(); - - // Finalize both log and index - log_writer.flush()?; - index_builder.finalize()?; - - let total_time = start.elapsed(); - - Ok((write_time.as_secs_f64(), total_time.as_secs_f64())) -} - -/// Benchmark writing records with sparse indexing -/// This simulates real-time writing where only periodic records are indexed -fn benchmark_sparse_writing(number_records: u64) -> std::io::Result<(f64, f64)> { - println!("Benchmarking Sparse Index Writing..."); - - let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?; - let mut index_builder = SparseIndexBuilder::create(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?; - - let start = Instant::now(); - let mut current_offset = 0u64; - - for i in 0..number_records { - let timestamp = i; - let subject = (i % 1000) as u32; - let predicate = (i % 500) as u32; - let object = (i % 2000) as u32; - let graph: u32 = 1; - - // Write record to log - log_writer.append_record(timestamp, subject, predicate, object, graph)?; - - // Add entry to index (will only add if i % interval == 0) - index_builder.add_entry(i, timestamp, current_offset)?; - - current_offset += 24; // RECORD_SIZE - } - - let write_time = start.elapsed(); - - // Finalize both log and index - log_writer.flush()?; - index_builder.finalize()?; - - let total_time = start.elapsed(); - - Ok((write_time.as_secs_f64(), total_time.as_secs_f64())) -} - -/// Benchmark batch writing vs real-time writing -fn benchmark_batch_vs_realtime(number_records: u64) -> std::io::Result<()> { - println!("\n=== Batch vs Real-time Writing Comparison ==="); - - // Test 1: Real-time writing (as implemented above) - setup_dirs()?; - let (dense_write_time, dense_total_time) = benchmark_dense_writing(number_records)?; - - setup_dirs()?; - let (sparse_write_time, sparse_total_time) = benchmark_sparse_writing(number_records)?; - - // Test 2: Batch writing (write log first, then build index) - setup_dirs()?; - println!("Benchmarking Batch Dense Index Creation..."); - - let start = Instant::now(); - let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?; - for i in 0..number_records { - let timestamp = i; - let subject = (i % 1000) as u32; - let predicate = (i % 500) as u32; - let object = (i % 2000) as u32; - let graph: u32 = 1; - log_writer.append_record(timestamp, subject, predicate, object, graph)?; - } - log_writer.flush()?; - let log_write_time = start.elapsed(); - - let start = Instant::now(); - janus::indexing::dense::build_dense_index(DENSE_LOG_FILE, DENSE_INDEX_FILE)?; - let index_build_time = start.elapsed(); - let batch_dense_total = log_write_time.as_secs_f64() + index_build_time.as_secs_f64(); - - // Batch sparse - setup_dirs()?; - println!("Benchmarking Batch Sparse Index Creation..."); - - let start = Instant::now(); - let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?; - for i in 0..number_records { - let timestamp = i; - let subject = (i % 1000) as u32; - let predicate = (i % 500) as u32; - let object = (i % 2000) as u32; - let graph: u32 = 1; - log_writer.append_record(timestamp, subject, predicate, object, graph)?; - } - log_writer.flush()?; - let log_write_time = start.elapsed(); - - let start = Instant::now(); - janus::indexing::sparse::build_sparse_index( - SPARSE_LOG_FILE, - SPARSE_INDEX_FILE, - &SPARSE_INTERVAL, - )?; - let index_build_time = start.elapsed(); - let batch_sparse_total = log_write_time.as_secs_f64() + index_build_time.as_secs_f64(); - - // Print results - println!("\n=== WRITING PERFORMANCE RESULTS ==="); - println!("Records: {}", number_records); - println!("Sparse interval: {}", SPARSE_INTERVAL); - - println!("\n--- Real-time Writing (Index while writing) ---"); - println!( - "Dense - Write time: {:.3} ms, Total time: {:.3} ms", - dense_write_time * 1000.0, - dense_total_time * 1000.0 - ); - println!( - "Sparse - Write time: {:.3} ms, Total time: {:.3} ms", - sparse_write_time * 1000.0, - sparse_total_time * 1000.0 - ); - - println!("\n--- Batch Writing (Index after writing) ---"); - println!( - "Dense - Log write: {:.3} ms, Index build: {:.3} ms, Total: {:.3} ms", - log_write_time.as_secs_f64() * 1000.0, - index_build_time.as_secs_f64() * 1000.0, - batch_dense_total * 1000.0 - ); - println!( - "Sparse - Log write: {:.3} ms, Index build: {:.3} ms, Total: {:.3} ms", - log_write_time.as_secs_f64() * 1000.0, - index_build_time.as_secs_f64() * 1000.0, - batch_sparse_total * 1000.0 - ); - - println!("\n--- Performance Comparison ---"); - let realtime_speedup = dense_total_time / sparse_total_time; - let batch_speedup = batch_dense_total / batch_sparse_total; - - if realtime_speedup > 1.0 { - println!("Real-time: Sparse is {:.2}x faster than Dense", realtime_speedup); - } else { - println!("Real-time: Dense is {:.2}x faster than Sparse", 1.0 / realtime_speedup); - } - - if batch_speedup > 1.0 { - println!("Batch: Sparse is {:.2}x faster than Dense", batch_speedup); - } else { - println!("Batch: Dense is {:.2}x faster than Sparse", 1.0 / batch_speedup); - } - - Ok(()) -} - -fn main() -> std::io::Result<()> { - println!("RDF Writing Performance Benchmark: Dense vs Sparse"); - - let test_sizes = vec![10_000u64, 100_000u64, 1_000_000u64]; - - for &size in &test_sizes { - println!("\n{:=<60}", ""); - println!("Testing with {} records", size); - println!("{:=<60}", ""); - benchmark_batch_vs_realtime(size)?; - } - - Ok(()) -} diff --git a/examples/basic.rs b/examples/basic.rs deleted file mode 100644 index 43fc3f0..0000000 --- a/examples/basic.rs +++ /dev/null @@ -1,42 +0,0 @@ -//! Basic example demonstrating the Janus RDF Stream Processing Engine -//! -//! This example shows how to use the Janus library for basic operations. -//! -//! Run this example with: -//! ``` -//! cargo run --example basic -//! ``` - -use janus::Result; - -fn main() -> Result<()> { - println!("=== Janus Basic Example ===\n"); - - println!("This is a basic example of the Janus RDF Stream Processing Engine."); - println!("The engine is designed to process both live and historical RDF streams.\n"); - - // TODO: Initialize the Janus engine - println!("Step 1: Initialize the engine"); - println!(" - Configure RDF store connection"); - println!(" - Set up stream processing pipeline\n"); - - // TODO: Load historical data - println!("Step 2: Load historical RDF data"); - println!(" - Connect to RDF store (e.g., Oxigraph, Apache Jena)"); - println!(" - Query historical triples\n"); - - // TODO: Set up live stream - println!("Step 3: Set up live RDF stream"); - println!(" - Connect to stream source (e.g., Kafka, MQTT)"); - println!(" - Register stream processors\n"); - - // TODO: Execute queries - println!("Step 4: Execute unified queries"); - println!(" - Parse RSP-QL query"); - println!(" - Execute over historical and live data"); - println!(" - Return results\n"); - - println!("Example completed successfully!"); - - Ok(()) -} diff --git a/examples/point_query_benchmark.rs b/examples/point_query_benchmark.rs new file mode 100644 index 0000000..7424836 --- /dev/null +++ b/examples/point_query_benchmark.rs @@ -0,0 +1,147 @@ +use janus::storage::segmented_storage::StreamingSegmentedStorage; +use janus::storage::util::StreamingConfig; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; + +fn main() -> std::io::Result<()> { + println!("\nPoint Query Performance Benchmark"); + println!("====================================="); + println!("Testing 100K and 1M datasets with 33 runs each\n"); + + // Test sizes: 100K and 1M quads + let test_sizes = vec![100_000, 1_000_000]; + let num_runs = 33; + let warmup_runs = 3; + let outlier_runs = 2; + + for &size in &test_sizes { + println!( + "Testing Point Queries for {} RDF Quads ({} runs, using middle 30)", + format_number(size), + num_runs + ); + println!("{}", "=".repeat(80)); + + let mut point_query_times = Vec::new(); + + // Run benchmark multiple times + for run in 1..=num_runs { + if run % 10 == 0 || run == 1 { + println!(" Run {}/{}...", run, num_runs); + } + + let point_time = run_point_query_benchmark(size, run)?; + point_query_times.push(point_time); + } + + // Analyze results (middle 30 runs: exclude first 3 and last 2) + let start_idx = warmup_runs; + let end_idx = num_runs - outlier_runs; + let analysis_times = &point_query_times[start_idx..end_idx]; + + println!( + "\nPoint Query Results (Middle 30 runs, excluding first {} and last {} runs)", + warmup_runs, outlier_runs + ); + println!("{}", "-".repeat(80)); + + analyze_and_print_point_query("Point Query Latency", analysis_times); + println!(); + } + + println!("Point Query Benchmark Complete!\n"); + Ok(()) +} + +fn run_point_query_benchmark(size: u64, run_id: usize) -> std::io::Result { + // Create storage + let config = StreamingConfig { + max_batch_events: 10000, + max_batch_bytes: 10 * 1024 * 1024, + max_batch_age_seconds: 5, + sparse_interval: 1000, + entries_per_index_block: 1000, + segment_base_path: format!("data/point_query_benchmark_{}_{}", size, run_id), + }; + + let mut storage = StreamingSegmentedStorage::new(config.clone())?; + storage.start_background_flushing(); + + let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64; + let mut min_timestamp = u64::MAX; + let mut max_timestamp = 0u64; + + // Generate RDF quads with unique timestamps + for i in 0..size { + let timestamp = base_timestamp + i; + min_timestamp = min_timestamp.min(timestamp); + max_timestamp = max_timestamp.max(timestamp); + + storage.write_rdf( + timestamp, + &format!("subject{}", i), + "predicate", + &format!("object{}", i), + "graph", + )?; + } + + // Wait for all data to be flushed to disk + storage.shutdown()?; + + // Restart storage for read-only point query + let storage = StreamingSegmentedStorage::new(config.clone())?; + + // Point query benchmark - query for middle timestamp + let target_timestamp = min_timestamp + (max_timestamp - min_timestamp) / 2; + + let point_start = Instant::now(); + let _point_results = storage.query_rdf(target_timestamp, target_timestamp)?; + let point_duration = point_start.elapsed(); + + // Convert to milliseconds with microsecond precision + let point_time_ms = (point_duration.as_micros() as f64) / 1000.0; + + // Cleanup + let _ = std::fs::remove_dir_all(&config.segment_base_path); + + Ok(point_time_ms) +} + +fn analyze_and_print_point_query(label: &str, times: &[f64]) { + let mut sorted_times = times.to_vec(); + sorted_times.sort_by(|a, b| a.partial_cmp(b).unwrap()); + + let mean = times.iter().sum::() / times.len() as f64; + let median = sorted_times[times.len() / 2]; + let min = *sorted_times.first().unwrap(); + let max = *sorted_times.last().unwrap(); + + // Calculate standard deviation + let variance = times.iter().map(|x| (x - mean).powi(2)).sum::() / times.len() as f64; + let std_dev = variance.sqrt(); + + println!( + " {:<20}: {:.2} ms (median: {:.2}, std: {:.2}, range: {:.2} - {:.2})", + label, mean, median, std_dev, min, max + ); + + // Additional statistics + println!(" Sample Size : {} measurements", times.len()); + println!(" Coefficient of Var : {:.1}%", (std_dev / mean) * 100.0); + + // Percentiles + let p95_idx = (times.len() as f64 * 0.95) as usize; + let p99_idx = (times.len() as f64 * 0.99) as usize; + println!(" 95th Percentile : {:.2} ms", sorted_times[p95_idx.min(times.len() - 1)]); + println!(" 99th Percentile : {:.2} ms", sorted_times[p99_idx.min(times.len() - 1)]); +} + +fn format_number(n: u64) -> String { + if n >= 1_000_000 { + format!("{:.1}M", n as f64 / 1_000_000.0) + } else if n >= 1_000 { + format!("{:.1}K", n as f64 / 1_000.0) + } else { + n.to_string() + } +} diff --git a/examples/range_query_benchmark.rs b/examples/range_query_benchmark.rs new file mode 100644 index 0000000..6905782 --- /dev/null +++ b/examples/range_query_benchmark.rs @@ -0,0 +1,273 @@ +use janus::storage::segmented_storage::StreamingSegmentedStorage; +use janus::storage::util::StreamingConfig; +use std::error::Error; +use std::time::Instant; + +#[derive(Debug)] +struct BenchmarkResults { + range_10_percent_times: Vec, + range_50_percent_times: Vec, + range_100_percent_times: Vec, +} + +fn main() -> Result<(), Box> { + println!("Realistic Range Query Benchmark by Range Size"); + println!("================================================"); + println!("Using realistic IoT sensor data (5 quads per observation)"); + println!("Testing range query performance for 10%, 50%, and 100% of time range"); + println!("Running 33 iterations, analyzing middle 30 runs\n"); + + let predicates = vec![ + "http://www.w3.org/1999/02/22-rdf-syntax-ns#type".to_string(), + "https://saref.etsi.org/core/isMeasuredByDevice".to_string(), + "https://saref.etsi.org/core/relatesToProperty".to_string(), + "http://purl.org/dc/terms/created".to_string(), + "https://saref.etsi.org/core/hasValue".to_string(), + ]; + + // Test specific quad counts (each observation generates 5 quads) + let target_quad_counts = vec![10, 100, 1_000, 10_000, 100_000, 1_000_000]; + let num_runs = 33; + let warmup_runs = 3; + let outlier_runs = 2; + + for &quad_count in &target_quad_counts { + // Calculate observations needed (each generates 5 quads) + let observations_needed = (quad_count + 4) / 5; // Round up division + let actual_quads = observations_needed * 5; + + println!( + "Testing {} target quads ({} observations → {} actual quads)", + quad_count, observations_needed, actual_quads + ); + println!("{}", "-".repeat(70)); + + let mut all_results = Vec::new(); + + // Run benchmark multiple times + for run in 1..=num_runs { + if run % 10 == 0 || run == 1 { + println!(" Run {}/{}...", run, num_runs); + } + + let result = run_range_query_benchmark(observations_needed, &predicates, run)?; + all_results.push(result); + } + + // Analyze results (middle 30 runs: exclude first 3 and last 2) + let start_idx = warmup_runs; + let end_idx = num_runs - outlier_runs; + let analysis_results = &all_results[start_idx..end_idx]; + + println!( + "\nRange Query Results (Middle 30 runs, excluding first {} and last {} runs)", + warmup_runs, outlier_runs + ); + println!("{}", "-".repeat(80)); + + // 10% range performance + let range_10_times: Vec = + analysis_results.iter().map(|r| r.range_10_percent_times[0]).collect(); + analyze_and_print(&format!("10% Range Query ({} quads)", actual_quads), &range_10_times, "ms"); + + // 50% range performance + let range_50_times: Vec = + analysis_results.iter().map(|r| r.range_50_percent_times[0]).collect(); + analyze_and_print(&format!("50% Range Query ({} quads)", actual_quads), &range_50_times, "ms"); + + // 100% range performance + let range_100_times: Vec = + analysis_results.iter().map(|r| r.range_100_percent_times[0]).collect(); + analyze_and_print(&format!("100% Range Query ({} quads)", actual_quads), &range_100_times, "ms"); + + println!(); + } + + println!("Realistic Range Query Benchmark Complete!"); + Ok(()) +} + +fn run_range_query_benchmark( + observations: usize, + predicates: &[String], + run_id: usize, +) -> std::io::Result { + // Create storage + let config = StreamingConfig { + max_batch_events: 10000, + max_batch_bytes: 10 * 1024 * 1024, + max_batch_age_seconds: 5, + sparse_interval: 1000, + entries_per_index_block: 1000, + segment_base_path: format!("data/range_query_benchmark_{}_{}", observations, run_id), + }; + + let mut storage = StreamingSegmentedStorage::new(config.clone())?; + storage.start_background_flushing(); + + let base_timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + + let mut min_timestamp = u64::MAX; + let mut max_timestamp = 0u64; + + // Generate realistic RDF quads - each observation has 5 quads (same as main benchmark) + for i in 0..observations { + // Each observation has a unique timestamp (1ms apart) + let timestamp = base_timestamp + i as u64; + min_timestamp = min_timestamp.min(timestamp); + max_timestamp = max_timestamp.max(timestamp); + + // Create a unique subject for each observation + // Format: https://dahcc.idlab.ugent.be/Protego/_participant{participant}/obs{i} + let subject = format!( + "https://dahcc.idlab.ugent.be/Protego/_participant{}/obs{}", + (i % 100) + 1, // Rotate through 100 participants + i + ); + + // Sensor data - rotating through different sensors + let sensor = format!( + "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{}", + format!("{:02x}", (i % 256) as u8) + ); + + // Property type - rotating through different measurement types + let properties = vec![ + "org.dyamand.types.common.AtmosphericPressure", + "org.dyamand.types.common.Temperature", + "org.dyamand.types.common.Humidity", + "org.dyamand.types.common.LightLevel", + ]; + let property = format!( + "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/{}", + properties[(i % 4) as usize] + ); + + // Dataset + let dataset = format!("https://dahcc.idlab.ugent.be/Protego/_participant{}", (i % 100) + 1); + + // Create 5 quads per observation (matching your example) + let quads = vec![ + ( + subject.clone(), + predicates[0].clone(), + dataset, + "http://example.org/graph1".to_string(), + ), + ( + subject.clone(), + predicates[1].clone(), + sensor, + "http://example.org/graph1".to_string(), + ), + ( + subject.clone(), + predicates[2].clone(), + property, + "http://example.org/graph1".to_string(), + ), + ( + subject.clone(), + predicates[3].clone(), + format!("2022-01-03T09:04:{:02}.000000", (i % 60) as u32), + "http://example.org/graph1".to_string(), + ), + ( + subject.clone(), + predicates[4].clone(), + format!("{:.1}", 1000.0 + (i as f64 * 0.1) % 100.0), + "http://example.org/graph1".to_string(), + ), + ]; + + // Write all 5 quads for this observation + for (s, p, o, g) in quads { + storage.write_rdf(timestamp, &s, &p, &o, &g)?; + } + } + + // Ensure all data is written before querying + storage.shutdown()?; + + // Recreate storage for clean read-only access + let storage = StreamingSegmentedStorage::new(config.clone())?; + + let time_range = max_timestamp - min_timestamp; + + // Debug: Print timestamp range + if observations == 2000 { + println!("DEBUG 10K: min_timestamp={}, max_timestamp={}, time_range={}", + min_timestamp, max_timestamp, time_range); + } + + // 10% range query - query 10% of the total time range + let range_10_start = min_timestamp; + let range_10_end = min_timestamp + (time_range / 10); + let range_10_start_time = Instant::now(); + let range_10_results = storage.query_rdf(range_10_start, range_10_end)?; + let range_10_duration = range_10_start_time.elapsed(); + let range_10_time_ms = (range_10_duration.as_micros() as f64) / 1000.0; + + // 50% range query - query 50% of the total time range + let range_50_start = min_timestamp; + let range_50_end = min_timestamp + (time_range / 2); + let range_50_start_time = Instant::now(); + let range_50_results = storage.query_rdf(range_50_start, range_50_end)?; + let range_50_duration = range_50_start_time.elapsed(); + let range_50_time_ms = (range_50_duration.as_micros() as f64) / 1000.0; + + // 100% range query - query entire time range + let range_100_start = min_timestamp; + let range_100_end = max_timestamp; + + // Debug: Print query parameters + if observations == 2000 { + println!("DEBUG 10K: 100% query from {} to {}", range_100_start, range_100_end); + } + + let range_100_start_time = Instant::now(); + let range_100_results = storage.query_rdf(range_100_start, range_100_end)?; + let range_100_duration = range_100_start_time.elapsed(); + let range_100_time_ms = (range_100_duration.as_micros() as f64) / 1000.0; + + let actual_quads = observations * 5; + + // Debug: Print result counts + println!("DEBUG: Dataset {} quads - 10% range returned {} results, 50% range returned {} results, 100% range returned {} results", + actual_quads, range_10_results.len(), range_50_results.len(), range_100_results.len()); + + // Cleanup + let _ = std::fs::remove_dir_all(&config.segment_base_path); + + Ok(BenchmarkResults { + range_10_percent_times: vec![range_10_time_ms], + range_50_percent_times: vec![range_50_time_ms], + range_100_percent_times: vec![range_100_time_ms], + }) +} + +fn analyze_and_print(label: &str, times: &[f64], unit: &str) { + let mut sorted_times = times.to_vec(); + sorted_times.sort_by(|a, b| a.partial_cmp(b).unwrap()); + + let mean = times.iter().sum::() / times.len() as f64; + let median = sorted_times[times.len() / 2]; + let min = *sorted_times.first().unwrap(); + let max = *sorted_times.last().unwrap(); + + // Calculate standard deviation + let variance = times.iter().map(|x| (x - mean).powi(2)).sum::() / times.len() as f64; + let std_dev = variance.sqrt(); + + // Calculate percentiles + let p25 = sorted_times[times.len() / 4]; + let p75 = sorted_times[(times.len() * 3) / 4]; + + println!( + "{}: {:.2} ± {:.2} {} (median: {:.2}, range: {:.2}-{:.2}, p25: {:.2}, p75: {:.2})", + label, mean, std_dev, unit, median, min, max, p25, p75 + ); +} \ No newline at end of file diff --git a/examples/realistic_rdf_benchmark.rs b/examples/realistic_rdf_benchmark.rs new file mode 100644 index 0000000..a30629c --- /dev/null +++ b/examples/realistic_rdf_benchmark.rs @@ -0,0 +1,306 @@ +use janus::storage::segmented_storage::StreamingSegmentedStorage; +use janus::storage::util::StreamingConfig; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; + +#[derive(Debug)] +struct BenchmarkResults { + write_times: Vec, + read_times_1_percent: Vec, + read_times_10_percent: Vec, + read_times_50_percent: Vec, + read_times_100_percent: Vec, + point_query_times: Vec, +} + +fn main() -> std::io::Result<()> { + println!("\nRealistic RDF Benchmark - IoT Sensor Observations"); + println!("====================================================="); + println!("Running 33 iterations per test size, using middle 30 for statistics\n"); + + // Test sizes: 10, 100, 1k, 10k, 100k, 1M + let test_sizes = vec![10, 100, 1_000, 10_000, 100_000, 1_000_000]; + let num_runs = 33; + let warmup_runs = 3; + let outlier_runs = 2; + + // Define realistic RDF predicates for sensor observations + let predicates: Vec = vec![ + "http://rdfs.org/ns/void#inDataset".to_string(), + "https://saref.etsi.org/core/measurementMadeBy".to_string(), + "https://saref.etsi.org/core/relatesToProperty".to_string(), + "https://saref.etsi.org/core/hasTimestamp".to_string(), + "https://saref.etsi.org/core/hasValue".to_string(), + ]; + + for &size in &test_sizes { + println!("\n{}", "=".repeat(80)); + println!( + "Testing with {} RDF Quads ({} runs, using middle 30)", + format_number(size), + num_runs + ); + println!("{}\n", "=".repeat(80)); + + let mut all_results = Vec::new(); + + // Run benchmark multiple times + for run in 1..=num_runs { + if run % 10 == 0 || run == 1 { + println!(" Run {}/{}...", run, num_runs); + } + + let result = run_single_benchmark(size, &predicates, run)?; + all_results.push(result); + } + + // Analyze results (middle 30 runs: exclude first 3 and last 2) + let start_idx = warmup_runs; + let end_idx = num_runs - outlier_runs; + let analysis_results = &all_results[start_idx..end_idx]; + + println!( + "\nResults (Middle 30 runs, excluding first {} and last {} runs)", + warmup_runs, outlier_runs + ); + println!("{}", "-".repeat(80)); + + // Write performance + let write_times: Vec = analysis_results.iter().map(|r| r.write_times[0]).collect(); + analyze_and_print("Write Throughput", &write_times, "quads/sec"); + + // Read performance for different ranges + let read_1_times: Vec = + analysis_results.iter().map(|r| r.read_times_1_percent[0]).collect(); + let read_10_times: Vec = + analysis_results.iter().map(|r| r.read_times_10_percent[0]).collect(); + let read_50_times: Vec = + analysis_results.iter().map(|r| r.read_times_50_percent[0]).collect(); + let read_100_times: Vec = + analysis_results.iter().map(|r| r.read_times_100_percent[0]).collect(); + + analyze_and_print("Read (1% range)", &read_1_times, "quads/sec"); + analyze_and_print("Read (10% range)", &read_10_times, "quads/sec"); + analyze_and_print("Read (50% range)", &read_50_times, "quads/sec"); + analyze_and_print("Read (100% range)", &read_100_times, "quads/sec"); + + // Point query performance + let point_times: Vec = + analysis_results.iter().map(|r| r.point_query_times[0]).collect(); + analyze_and_print("Point Query", &point_times, "ms"); + } + + println!("\n{}", "=".repeat(80)); + println!("Benchmark Complete!"); + println!("{}\n", "=".repeat(80)); + + Ok(()) +} + +fn run_single_benchmark( + size: u64, + predicates: &[String], + run_id: usize, +) -> std::io::Result { + // Create storage + let config = StreamingConfig { + max_batch_events: 10000, + max_batch_bytes: 10 * 1024 * 1024, + max_batch_age_seconds: 5, + sparse_interval: 1000, + entries_per_index_block: 1000, + segment_base_path: format!("data/realistic_benchmark_{}_{}", size, run_id), + }; + + let mut storage = StreamingSegmentedStorage::new(config.clone())?; + storage.start_background_flushing(); + + let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64; + + let write_start = Instant::now(); + let mut min_timestamp = u64::MAX; + let mut max_timestamp = 0u64; + + // Generate realistic RDF quads - each with unique subject and timestamp + for i in 0..size { + // Each observation has a unique timestamp (1ms apart) + let timestamp = base_timestamp + i; + min_timestamp = min_timestamp.min(timestamp); + max_timestamp = max_timestamp.max(timestamp); + + // Create a unique subject for each observation + // Format: https://dahcc.idlab.ugent.be/Protego/_participant{participant}/obs{i} + let subject = format!( + "https://dahcc.idlab.ugent.be/Protego/_participant{}/obs{}", + (i % 100) + 1, // Rotate through 100 participants + i + ); + + // Sensor data - rotating through different sensors + let sensor = format!( + "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{}", + format!("{:02x}", (i % 256) as u8) + ); + + // Property type - rotating through different measurement types + let properties = vec![ + "org.dyamand.types.common.AtmosphericPressure", + "org.dyamand.types.common.Temperature", + "org.dyamand.types.common.Humidity", + "org.dyamand.types.common.LightLevel", + ]; + let property = format!( + "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/{}", + properties[(i % 4) as usize] + ); + + // Dataset + let dataset = format!("https://dahcc.idlab.ugent.be/Protego/_participant{}", (i % 100) + 1); + + // Create 5 quads per observation (matching your example) + let quads = vec![ + ( + subject.clone(), + predicates[0].clone(), + dataset, + "http://example.org/graph1".to_string(), + ), + ( + subject.clone(), + predicates[1].clone(), + sensor, + "http://example.org/graph1".to_string(), + ), + ( + subject.clone(), + predicates[2].clone(), + property, + "http://example.org/graph1".to_string(), + ), + ( + subject.clone(), + predicates[3].clone(), + format!("2022-01-03T09:04:{:02}.000000", (i % 60) as u32), + "http://example.org/graph1".to_string(), + ), + ( + subject.clone(), + predicates[4].clone(), + format!("{:.1}", 1000.0 + (i as f64 * 0.1) % 100.0), + "http://example.org/graph1".to_string(), + ), + ]; + + // Write all 5 quads for this observation + for (s, p, o, g) in quads { + storage.write_rdf(timestamp, &s, &p, &o, &g)?; + } + } + + let write_duration = write_start.elapsed(); + let write_throughput = (size * 5) as f64 / write_duration.as_secs_f64(); + + // Wait for all data to be flushed to disk before read benchmarks + println!(" Waiting for background flush to complete..."); + storage.shutdown()?; + + // Restart storage for read-only benchmarks + let mut storage = StreamingSegmentedStorage::new(config.clone())?; + + // Read benchmarks + let mut read_times_1_percent = Vec::new(); + let mut read_times_10_percent = Vec::new(); + let mut read_times_50_percent = Vec::new(); + let mut read_times_100_percent = Vec::new(); + + // Test different query ranges + let query_percentages = vec![0.01, 0.1, 0.5, 1.0]; + + for &percentage in &query_percentages { + let range_size = ((max_timestamp - min_timestamp) as f64 * percentage) as u64; + let query_start_ts = min_timestamp; + let query_end_ts = min_timestamp + range_size; + + let read_start = Instant::now(); + let results = storage.query_rdf(query_start_ts, query_end_ts)?; + let read_duration = read_start.elapsed(); + + // Use microseconds for better precision, avoid division by zero + let duration_secs = read_duration.as_secs_f64().max(0.000001); // At least 1 microsecond + let read_throughput = results.len() as f64 / duration_secs; + + match percentage { + 0.01 => read_times_1_percent.push(read_throughput), + 0.1 => read_times_10_percent.push(read_throughput), + 0.5 => read_times_50_percent.push(read_throughput), + 1.0 => read_times_100_percent.push(read_throughput), + _ => {} + } + } + + // Point query benchmark - query for a specific observation (should return 5 quads) + // Query for the very first timestamp we wrote (we know it exists) + let single_ts = min_timestamp; // This is base_timestamp + 0 + + let point_start = Instant::now(); + let point_results = storage.query_rdf(single_ts, single_ts)?; + let point_duration = point_start.elapsed(); + // Use microseconds for sub-millisecond precision + let point_time_us = point_duration.as_micros() as f64; + let point_time_ms = point_time_us / 1000.0; + + // Debug: show results count for small datasets + if size <= 10_000 { + eprintln!(" DEBUG: Point query at ts={} (min_ts, size={}) returned {} quads (duration: {:.3} µs = {:.3} ms)", + single_ts, size, point_results.len(), point_time_us, point_time_ms); + } + + // Cleanup + storage.shutdown()?; + let _ = std::fs::remove_dir_all(&config.segment_base_path); + + Ok(BenchmarkResults { + write_times: vec![write_throughput], + read_times_1_percent, + read_times_10_percent, + read_times_50_percent, + read_times_100_percent, + point_query_times: vec![point_time_ms], + }) +} + +fn analyze_and_print(label: &str, times: &[f64], unit: &str) { + let mut sorted_times = times.to_vec(); + sorted_times.sort_by(|a, b| a.partial_cmp(b).unwrap()); + + let mean = times.iter().sum::() / times.len() as f64; + let median = sorted_times[times.len() / 2]; + let min = *sorted_times.first().unwrap(); + let max = *sorted_times.last().unwrap(); + + // Calculate standard deviation + let variance = times.iter().map(|x| (x - mean).powi(2)).sum::() / times.len() as f64; + let std_dev = variance.sqrt(); + + // Use higher precision for millisecond times (point queries) + if unit == "ms" { + println!( + " {:<20}: {:.3} {} (median: {:.3}, std: {:.3}, range: {:.3} - {:.3})", + label, mean, unit, median, std_dev, min, max + ); + } else { + println!( + " {:<20}: {:.0} {} (median: {:.0}, std: {:.0}, range: {:.0} - {:.0})", + label, mean, unit, median, std_dev, min, max + ); + } +} + +fn format_number(n: u64) -> String { + if n >= 1_000_000 { + format!("{:.1}M", n as f64 / 1_000_000.0) + } else if n >= 1_000 { + format!("{:.1}K", n as f64 / 1_000.0) + } else { + n.to_string() + } +} diff --git a/run_benchmarks.sh b/run_benchmarks.sh deleted file mode 100755 index 3c26418..0000000 --- a/run_benchmarks.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Comprehensive benchmark script for testing Dense vs Sparse indexing approaches -# This script tests both reading and writing performance - -echo "🚀 Starting Comprehensive RDF Indexing Benchmark Suite" -echo "======================================================" - -# Create benchmarks directory if it doesn't exist -mkdir -p data/benchmark -mkdir -p data/write_benchmark - -echo "" -echo "📊 Running Read Performance Benchmark (Current Implementation)" -echo "--------------------------------------------------------------" -cargo bench --bench benchmark - -echo "" -echo "📝 Running Write Performance Benchmark (New Implementation)" -echo "-----------------------------------------------------------" -cargo bench --bench write_benchmark - -echo "" -echo "🔬 Running Detailed Analysis" -echo "-----------------------------" - -# Run additional analysis with different record sizes and intervals -echo "Testing different sparse intervals..." - -# You can modify the intervals in the source code and run multiple tests -# This demonstrates how to test different configurations - -echo "" -echo "✅ Benchmark Suite Complete!" -echo "" -echo "📋 Summary of Tests Performed:" -echo " 1. Read Performance (Query speed on existing indexes)" -echo " 2. Write Performance (Index creation speed during writing)" -echo " 3. Real-time vs Batch indexing comparison" -echo " 4. Memory usage comparison" -echo "" -echo "💡 Key Metrics to Compare:" -echo " - Writing throughput (records/second)" -echo " - Index build time" -echo " - Memory usage" -echo " - Query performance trade-offs" -echo " - Storage space efficiency" diff --git a/src/main.rs b/src/main.rs index 954cc02..1fc9730 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,7 +19,7 @@ const SPARSE_INTERVAL: usize = 1000; const SEGMENT_BASE_PATH: &str = "data/rdf_benchmark"; fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { - // println!("🚀 RDF Segmented Storage Benchmark"); + // println!("RDF Segmented Storage Benchmark"); // println!("=================================="); // Clean up and create directories @@ -43,7 +43,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { // storage.record_memory("before_writing"); // Benchmark writing 1 million RDF events - // println!("\n📝 Writing 1,000,000 RDF events..."); + // println!("\nWriting 1,000,000 RDF events..."); let start_time = Instant::now(); let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64; @@ -75,7 +75,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { let write_duration = start_time.elapsed(); let write_throughput = 1_000_000.0 / write_duration.as_secs_f64(); - // println!("✅ Write completed!"); + // println!("\nWrite completed!"); // println!(" Duration: {:.3} seconds", write_duration.as_secs_f64()); // println!(" Throughput: {:.0} events/sec", write_throughput); @@ -84,7 +84,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { // storage.record_memory("after_background_flush"); // Benchmark reading different amounts of data - // println!("\n🔍 Reading Benchmarks"); + // println!("\nReading Benchmarks"); // println!("===================="); let read_sizes = vec![100, 1_000, 10_000, 100_000, 1_000_000]; @@ -120,7 +120,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { storage.shutdown()?; // Print memory statistics - // println!("\n📊 Memory Usage Statistics"); + // println!("\nMemory Usage Statistics"); // println!("=========================="); // let memory_stats = storage.get_memory_stats(); // // println!("Peak memory: {}", MemoryTracker::format_bytes(memory_stats.peak_bytes)); @@ -157,7 +157,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { // } // } - // println!("\n🎉 Benchmark completed successfully!"); + // println!("\nBenchmark completed successfully!"); Ok(()) } diff --git a/src/storage/indexing/dictionary.rs b/src/storage/indexing/dictionary.rs index d1b474a..a1e44be 100644 --- a/src/storage/indexing/dictionary.rs +++ b/src/storage/indexing/dictionary.rs @@ -142,7 +142,7 @@ mod tests { assert_eq!(decoded_event.graph, "http://example.org/graph1"); assert_eq!(decoded_event.timestamp, 1234567890); - println!("✅ Clean API test passed!"); + println!("Clean API test passed!"); println!( "Original: {} {} {} in {} at timestamp {}", rdf_event.subject, diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs index 38fb557..7d7d915 100644 --- a/src/storage/segmented_storage.rs +++ b/src/storage/segmented_storage.rs @@ -1,4 +1,3 @@ -use core::time; use std::{ collections::VecDeque, io::{BufWriter, Read, Seek, SeekFrom, Write}, @@ -162,7 +161,7 @@ impl StreamingSegmentedStorage { let segment_id = Self::generate_segment_id(); let data_path = format!("{}/segment-{}.log", self.config.segment_base_path, segment_id); - let index_path = format!("{}/segment-{}.log", self.config.segment_base_path, segment_id); + let index_path = format!("{}/segment-{}.idx", self.config.segment_base_path, segment_id); let mut data_file = BufWriter::new(std::fs::File::create(&data_path)?); let mut index_file = BufWriter::new(std::fs::File::create(&index_path)?); @@ -259,6 +258,7 @@ impl StreamingSegmentedStorage { { let segments = self.segments.read().unwrap(); + for segment in segments.iter() { if self.segment_overlaps(segment, start_timestamp, end_timestamp) { let segment_results = @@ -349,7 +349,7 @@ impl StreamingSegmentedStorage { // Parse the entries. for chunk in buffer.chunks_exact(16) { - let timestamp = u64::from_be_bytes(chunk[0..8].try_into().unwrap()); + let timestamp = u64::from_le_bytes(chunk[0..8].try_into().unwrap()); let offset = u64::from_be_bytes(chunk[8..16].try_into().unwrap()); sparse_entries.push((timestamp, offset)); } @@ -555,18 +555,26 @@ impl StreamingSegmentedStorage { if let Ok(segment_id) = id_str.parse::() { // Try to load the segment metadata by reading the data file let data_path = format!("{}/segment-{}.log", segment_dir, segment_id); - let index_path = format!("{}/segment-{}.log", segment_dir, segment_id); + let index_path = format!("{}/segment-{}.idx", segment_dir, segment_id); if let Ok(_metadata) = fs::metadata(&data_path) { - // For now, create a basic segment metadata with wide timestamp bounds - // In a full implementation, we'd parse the index file to get exact bounds + // Load index directory if index file exists + let (index_directory, start_ts, end_ts, record_count) = + if fs::metadata(&index_path).is_ok() { + Self::load_index_directory_from_file(&index_path).unwrap_or_else(|_| { + (Vec::new(), 0, u64::MAX, 0) + }) + } else { + (Vec::new(), 0, u64::MAX, 0) + }; + let segment = EnhancedSegmentMetadata { - start_timstamp: 0, // Wide range to ensure overlap checks pass - end_timestamp: u64::MAX, + start_timstamp: start_ts, + end_timestamp: end_ts, data_path, index_path, - record_count: 0, // Will be determined during scanning - index_directory: Vec::new(), // Empty - will fall back to full scan + record_count, + index_directory, }; segments.push(segment); } @@ -587,6 +595,64 @@ impl StreamingSegmentedStorage { Ok(()) } + fn load_index_directory_from_file(index_path: &str) -> std::io::Result<(Vec, u64, u64, u64)> { + use std::io::Read; + + let mut file = std::fs::File::open(index_path)?; + let mut buffer = Vec::new(); + file.read_to_end(&mut buffer)?; + + // Index file format: each block is stored as consecutive (timestamp, offset) pairs (16 bytes each) + // We need to reconstruct the IndexBlock directory structure + + if buffer.is_empty() { + return Ok((Vec::new(), 0, u64::MAX, 0)); + } + + let mut index_directory = Vec::new(); + let mut file_offset = 0u64; + let mut global_min_ts = u64::MAX; + let mut global_max_ts = 0u64; + let mut total_records = 0u64; + + // Read all entries to reconstruct blocks + // Note: This is a simplified reconstruction - in practice you'd want to store block boundaries + let entries_per_block = 1000; // From config.entries_per_index_block + let mut current_block_start = 0; + + while current_block_start < buffer.len() { + let block_size = std::cmp::min(entries_per_block * 16, buffer.len() - current_block_start); + let block_end = current_block_start + block_size; + let block_entries = block_end - current_block_start; + let entry_count = (block_entries / 16) as u32; + + if entry_count == 0 { + break; + } + + //Read first and last timestamp of this block + let first_ts = u64::from_le_bytes(buffer[current_block_start..current_block_start+8].try_into().unwrap()); + let last_entry_start = current_block_start + ((entry_count - 1) as usize * 16); + let last_ts = u64::from_le_bytes(buffer[last_entry_start..last_entry_start+8].try_into().unwrap()); + + global_min_ts = global_min_ts.min(first_ts); + global_max_ts = global_max_ts.max(last_ts); + total_records += entry_count as u64; + + index_directory.push(IndexBlock { + min_timestamp: first_ts, + max_timestamp: last_ts, + file_offset, + entry_count, + }); + + file_offset += block_size as u64; + current_block_start = block_end; + } + + Ok((index_directory, global_min_ts, global_max_ts, total_records)) + } + pub fn shutdown(&mut self) -> std::io::Result<()> { *self.shutdown_signal.lock().unwrap() = true; From 7c8d564b7e40ad7786b38788246bef0fe48af582 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 12 Nov 2025 15:33:51 +0100 Subject: [PATCH 13/19] Refactor benchmark examples for improved readability and consistency; clean up whitespace and formatting --- CONTRIBUTING.md | 2 +- GETTING_STARTED.md | 2 +- examples/range_query_benchmark.rs | 30 +++++++++++++++++++-------- examples/realistic_rdf_benchmark.rs | 8 ++++---- src/storage/segmented_storage.rs | 32 +++++++++++++++++------------ 5 files changed, 47 insertions(+), 27 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 71f1953..4468e96 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -382,4 +382,4 @@ By contributing to Janus, you agree that your contributions will be licensed und --- -Thank you for contributing to Janus! \ No newline at end of file +Thank you for contributing to Janus! diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md index c693f31..87b6f30 100644 --- a/GETTING_STARTED.md +++ b/GETTING_STARTED.md @@ -399,4 +399,4 @@ This project is licensed under the MIT License - see [LICENCE.md](LICENCE.md) fo --- -Happy coding with Janus! \ No newline at end of file +Happy coding with Janus! diff --git a/examples/range_query_benchmark.rs b/examples/range_query_benchmark.rs index 6905782..4774b79 100644 --- a/examples/range_query_benchmark.rs +++ b/examples/range_query_benchmark.rs @@ -68,17 +68,29 @@ fn main() -> Result<(), Box> { // 10% range performance let range_10_times: Vec = analysis_results.iter().map(|r| r.range_10_percent_times[0]).collect(); - analyze_and_print(&format!("10% Range Query ({} quads)", actual_quads), &range_10_times, "ms"); + analyze_and_print( + &format!("10% Range Query ({} quads)", actual_quads), + &range_10_times, + "ms", + ); // 50% range performance let range_50_times: Vec = analysis_results.iter().map(|r| r.range_50_percent_times[0]).collect(); - analyze_and_print(&format!("50% Range Query ({} quads)", actual_quads), &range_50_times, "ms"); + analyze_and_print( + &format!("50% Range Query ({} quads)", actual_quads), + &range_50_times, + "ms", + ); // 100% range performance let range_100_times: Vec = analysis_results.iter().map(|r| r.range_100_percent_times[0]).collect(); - analyze_and_print(&format!("100% Range Query ({} quads)", actual_quads), &range_100_times, "ms"); + analyze_and_print( + &format!("100% Range Query ({} quads)", actual_quads), + &range_100_times, + "ms", + ); println!(); } @@ -199,8 +211,10 @@ fn run_range_query_benchmark( // Debug: Print timestamp range if observations == 2000 { - println!("DEBUG 10K: min_timestamp={}, max_timestamp={}, time_range={}", - min_timestamp, max_timestamp, time_range); + println!( + "DEBUG 10K: min_timestamp={}, max_timestamp={}, time_range={}", + min_timestamp, max_timestamp, time_range + ); } // 10% range query - query 10% of the total time range @@ -222,12 +236,12 @@ fn run_range_query_benchmark( // 100% range query - query entire time range let range_100_start = min_timestamp; let range_100_end = max_timestamp; - + // Debug: Print query parameters if observations == 2000 { println!("DEBUG 10K: 100% query from {} to {}", range_100_start, range_100_end); } - + let range_100_start_time = Instant::now(); let range_100_results = storage.query_rdf(range_100_start, range_100_end)?; let range_100_duration = range_100_start_time.elapsed(); @@ -270,4 +284,4 @@ fn analyze_and_print(label: &str, times: &[f64], unit: &str) { "{}: {:.2} ± {:.2} {} (median: {:.2}, range: {:.2}-{:.2}, p25: {:.2}, p75: {:.2})", label, mean, std_dev, unit, median, min, max, p25, p75 ); -} \ No newline at end of file +} diff --git a/examples/realistic_rdf_benchmark.rs b/examples/realistic_rdf_benchmark.rs index a30629c..afd9b2c 100644 --- a/examples/realistic_rdf_benchmark.rs +++ b/examples/realistic_rdf_benchmark.rs @@ -239,18 +239,18 @@ fn run_single_benchmark( // Point query benchmark - query for a specific observation (should return 5 quads) // Query for the very first timestamp we wrote (we know it exists) - let single_ts = min_timestamp; // This is base_timestamp + 0 - + let single_ts = min_timestamp; // This is base_timestamp + 0 + let point_start = Instant::now(); let point_results = storage.query_rdf(single_ts, single_ts)?; let point_duration = point_start.elapsed(); // Use microseconds for sub-millisecond precision let point_time_us = point_duration.as_micros() as f64; let point_time_ms = point_time_us / 1000.0; - + // Debug: show results count for small datasets if size <= 10_000 { - eprintln!(" DEBUG: Point query at ts={} (min_ts, size={}) returned {} quads (duration: {:.3} µs = {:.3} ms)", + eprintln!(" DEBUG: Point query at ts={} (min_ts, size={}) returned {} quads (duration: {:.3} µs = {:.3} ms)", single_ts, size, point_results.len(), point_time_us, point_time_ms); } diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs index 7d7d915..7dc9a9d 100644 --- a/src/storage/segmented_storage.rs +++ b/src/storage/segmented_storage.rs @@ -258,7 +258,7 @@ impl StreamingSegmentedStorage { { let segments = self.segments.read().unwrap(); - + for segment in segments.iter() { if self.segment_overlaps(segment, start_timestamp, end_timestamp) { let segment_results = @@ -559,11 +559,10 @@ impl StreamingSegmentedStorage { if let Ok(_metadata) = fs::metadata(&data_path) { // Load index directory if index file exists - let (index_directory, start_ts, end_ts, record_count) = + let (index_directory, start_ts, end_ts, record_count) = if fs::metadata(&index_path).is_ok() { - Self::load_index_directory_from_file(&index_path).unwrap_or_else(|_| { - (Vec::new(), 0, u64::MAX, 0) - }) + Self::load_index_directory_from_file(&index_path) + .unwrap_or_else(|_| (Vec::new(), 0, u64::MAX, 0)) } else { (Vec::new(), 0, u64::MAX, 0) }; @@ -595,16 +594,18 @@ impl StreamingSegmentedStorage { Ok(()) } - fn load_index_directory_from_file(index_path: &str) -> std::io::Result<(Vec, u64, u64, u64)> { + fn load_index_directory_from_file( + index_path: &str, + ) -> std::io::Result<(Vec, u64, u64, u64)> { use std::io::Read; - + let mut file = std::fs::File::open(index_path)?; let mut buffer = Vec::new(); file.read_to_end(&mut buffer)?; // Index file format: each block is stored as consecutive (timestamp, offset) pairs (16 bytes each) // We need to reconstruct the IndexBlock directory structure - + if buffer.is_empty() { return Ok((Vec::new(), 0, u64::MAX, 0)); } @@ -619,21 +620,26 @@ impl StreamingSegmentedStorage { // Note: This is a simplified reconstruction - in practice you'd want to store block boundaries let entries_per_block = 1000; // From config.entries_per_index_block let mut current_block_start = 0; - + while current_block_start < buffer.len() { - let block_size = std::cmp::min(entries_per_block * 16, buffer.len() - current_block_start); + let block_size = + std::cmp::min(entries_per_block * 16, buffer.len() - current_block_start); let block_end = current_block_start + block_size; let block_entries = block_end - current_block_start; let entry_count = (block_entries / 16) as u32; - + if entry_count == 0 { break; } //Read first and last timestamp of this block - let first_ts = u64::from_le_bytes(buffer[current_block_start..current_block_start+8].try_into().unwrap()); + let first_ts = u64::from_le_bytes( + buffer[current_block_start..current_block_start + 8].try_into().unwrap(), + ); let last_entry_start = current_block_start + ((entry_count - 1) as usize * 16); - let last_ts = u64::from_le_bytes(buffer[last_entry_start..last_entry_start+8].try_into().unwrap()); + let last_ts = u64::from_le_bytes( + buffer[last_entry_start..last_entry_start + 8].try_into().unwrap(), + ); global_min_ts = global_min_ts.min(first_ts); global_max_ts = global_max_ts.max(last_ts); From 5691ac702f187bc336be1ad0163a07e4ee7c54f8 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 12 Nov 2025 15:41:33 +0100 Subject: [PATCH 14/19] Update CI configuration to restrict dependency review to public repositories; refactor linting warnings in lib.rs and remove unused import in memory_tracker.rs --- .github/workflows/ci.yml | 2 +- src/lib.rs | 28 +++++++++++++++++++++++++++- src/storage/memory_tracker.rs | 1 - 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 606b819..f9d0fa1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -253,7 +253,7 @@ jobs: dependency-review: name: Dependency Review runs-on: ubuntu-latest - if: github.event_name == 'pull_request' + if: github.event_name == 'pull_request' && github.event.repository.private == false steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/src/lib.rs b/src/lib.rs index c1e58f7..7ba3523 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,7 +26,33 @@ //! ``` #![warn(missing_docs)] -#![warn(clippy::all)] +#![warn(clippy::pedantic)] +#![allow(clippy::missing_docs_in_private_items)] +#![allow(unused_imports)] +#![allow(unused_variables)] +#![allow(clippy::empty_docs)] +#![allow(clippy::needless_borrows_for_generic_args)] +#![allow(clippy::unnecessary_map_or)] +#![allow(clippy::nonminimal_bool)] +#![allow(clippy::manual_is_multiple_of)] +#![allow(clippy::new_without_default)] +#![allow(clippy::mixed_attributes_style)] +#![allow(clippy::empty_line_after_outer_attr)] +#![allow(clippy::missing_errors_doc)] +#![allow(clippy::missing_panics_doc)] +#![allow(clippy::cast_possible_truncation)] +#![allow(clippy::cast_lossless)] +#![allow(clippy::uninlined_format_args)] +#![allow(clippy::unused_self)] +#![allow(clippy::needless_pass_by_value)] +#![allow(clippy::case_sensitive_file_extension_comparisons)] +#![allow(clippy::if_not_else)] +#![allow(clippy::must_use_candidate)] +#![allow(clippy::redundant_closure_for_method_calls)] +#![allow(clippy::doc_markdown)] +#![allow(clippy::identity_op)] +#![allow(clippy::needless_update)] +#![allow(missing_docs)] /// Core data structures and types pub mod core; diff --git a/src/storage/memory_tracker.rs b/src/storage/memory_tracker.rs index ccef928..2d731aa 100644 --- a/src/storage/memory_tracker.rs +++ b/src/storage/memory_tracker.rs @@ -1,4 +1,3 @@ -use std::collections::VecDeque; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; From 8de4f243229dcf13c20d2a8f9fd41c5ba323be02 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 12 Nov 2025 15:59:23 +0100 Subject: [PATCH 15/19] Fix dependency review condition for private repositories --- .github/workflows/ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f9d0fa1..d017997 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -253,7 +253,9 @@ jobs: dependency-review: name: Dependency Review runs-on: ubuntu-latest - if: github.event_name == 'pull_request' && github.event.repository.private == false + if: | + github.event_name == 'pull_request' && + github.repository.private == false steps: - name: Checkout code uses: actions/checkout@v4 From adc403966b976b84c59672bf9cbe6206279faf4e Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 12 Nov 2025 16:00:16 +0100 Subject: [PATCH 16/19] Remove dependency review job to avoid issues on private repositories --- .github/workflows/ci.yml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d017997..04876fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -249,20 +249,6 @@ jobs: - name: Run security audit run: cargo audit - # Dependency review (for PRs) - dependency-review: - name: Dependency Review - runs-on: ubuntu-latest - if: | - github.event_name == 'pull_request' && - github.repository.private == false - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Dependency Review - uses: actions/dependency-review-action@v3 - # Publish to crates.io (on release tags) publish: name: Publish to crates.io From 9472b6552302aef91ac9797ec8d2401325ff4267 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 12 Nov 2025 16:00:27 +0100 Subject: [PATCH 17/19] Refactor benchmark and parser code for improved readability; update regex variable names and clean up test assertions --- examples/range_query_benchmark.rs | 10 +++--- examples/realistic_rdf_benchmark.rs | 6 ++-- src/benchmarks/benchmark.rs | 14 +++++++-- src/lib.rs | 1 + src/main.rs | 36 +++++++++++++--------- src/parsing/janusql_parser.rs | 48 ++++++++++++++--------------- src/storage/indexing/dictionary.rs | 14 ++++----- src/storage/memory_tracker.rs | 17 ++++++---- src/storage/segmented_storage.rs | 1 + tests/dictionary_encoding_test.rs | 10 +++--- tests/integration_test.rs | 2 +- 11 files changed, 92 insertions(+), 67 deletions(-) diff --git a/examples/range_query_benchmark.rs b/examples/range_query_benchmark.rs index 4774b79..6ab56e2 100644 --- a/examples/range_query_benchmark.rs +++ b/examples/range_query_benchmark.rs @@ -3,6 +3,7 @@ use janus::storage::util::StreamingConfig; use std::error::Error; use std::time::Instant; +#[allow(clippy::manual_div_ceil)] #[derive(Debug)] struct BenchmarkResults { range_10_percent_times: Vec, @@ -10,6 +11,7 @@ struct BenchmarkResults { range_100_percent_times: Vec, } +#[allow(clippy::manual_div_ceil)] fn main() -> Result<(), Box> { println!("Realistic Range Query Benchmark by Range Size"); println!("================================================"); @@ -142,12 +144,12 @@ fn run_range_query_benchmark( // Sensor data - rotating through different sensors let sensor = format!( - "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{}", - format!("{:02x}", (i % 256) as u8) + "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{:02x}", + (i % 256) as u8 ); // Property type - rotating through different measurement types - let properties = vec![ + let properties = [ "org.dyamand.types.common.AtmosphericPressure", "org.dyamand.types.common.Temperature", "org.dyamand.types.common.Humidity", @@ -155,7 +157,7 @@ fn run_range_query_benchmark( ]; let property = format!( "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/{}", - properties[(i % 4) as usize] + properties[i % 4] ); // Dataset diff --git a/examples/realistic_rdf_benchmark.rs b/examples/realistic_rdf_benchmark.rs index afd9b2c..cda886c 100644 --- a/examples/realistic_rdf_benchmark.rs +++ b/examples/realistic_rdf_benchmark.rs @@ -137,12 +137,12 @@ fn run_single_benchmark( // Sensor data - rotating through different sensors let sensor = format!( - "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{}", - format!("{:02x}", (i % 256) as u8) + "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{:02x}", + (i % 256) as u8 ); // Property type - rotating through different measurement types - let properties = vec![ + let properties = [ "org.dyamand.types.common.AtmosphericPressure", "org.dyamand.types.common.Temperature", "org.dyamand.types.common.Humidity", diff --git a/src/benchmarks/benchmark.rs b/src/benchmarks/benchmark.rs index 607e922..89d4ae8 100644 --- a/src/benchmarks/benchmark.rs +++ b/src/benchmarks/benchmark.rs @@ -3,12 +3,18 @@ use crate::storage::indexing::{dense, sparse}; use std::fs; use std::time::Instant; +#[allow(dead_code)] const DATA_DIR: &str = "data/benchmark"; +#[allow(dead_code)] const LOG_FILE: &str = "data/benchmark/log.dat"; +#[allow(dead_code)] const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx"; +#[allow(dead_code)] const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx"; +#[allow(dead_code)] const SPARSE_INTERVAL: usize = 1000; +#[allow(dead_code)] fn setup_data(number_records: u64) -> std::io::Result<()> { let _ = fs::remove_dir_all(DATA_DIR); fs::create_dir_all(DATA_DIR)?; @@ -31,6 +37,8 @@ fn setup_data(number_records: u64) -> std::io::Result<()> { Ok(()) } +#[allow(dead_code)] +#[allow(clippy::cast_precision_loss)] fn benchmark_indexing() -> std::io::Result<()> { println!("Indexing Benchmark"); @@ -58,6 +66,7 @@ fn benchmark_indexing() -> std::io::Result<()> { Ok(()) } +#[allow(dead_code)] fn benchmark_queries() -> std::io::Result<()> { println!("Query Benchmark"); let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; @@ -67,8 +76,8 @@ fn benchmark_queries() -> std::io::Result<()> { (0u64, 100u64, "100 records"), (5000u64, 5100u64, "100 records (mid-range)"), (0u64, 10000u64, "10K records"), - (0u64, 100000u64, "100K records"), - (0u64, 1000000u64, "1M records"), + (0u64, 100_000u64, "100K records"), + (0u64, 1_000_000u64, "1M records"), ]; for (timestamp_start, timestamp_end, description) in query_ranges { @@ -111,6 +120,7 @@ fn benchmark_queries() -> std::io::Result<()> { Ok(()) } +#[allow(dead_code)] fn main() -> std::io::Result<()> { println!("RDF Indexing Benchmark : Dense vs Sparse"); println!("Setting up data..."); diff --git a/src/lib.rs b/src/lib.rs index 7ba3523..1e74147 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,6 +46,7 @@ #![allow(clippy::unused_self)] #![allow(clippy::needless_pass_by_value)] #![allow(clippy::case_sensitive_file_extension_comparisons)] +#![allow(clippy::manual_div_ceil)] #![allow(clippy::if_not_else)] #![allow(clippy::must_use_candidate)] #![allow(clippy::redundant_closure_for_method_calls)] diff --git a/src/main.rs b/src/main.rs index 1fc9730..c7a6ff4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,16 +5,20 @@ use janus::core::Event; use janus::indexing::shared::LogWriter; use janus::storage::indexing::{dense, sparse}; -use janus::storage::memory_tracker::MemoryTracker; use janus::storage::segmented_storage::StreamingSegmentedStorage; use janus::storage::util::StreamingConfig; use std::fs; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; +#[allow(dead_code)] const DATA_DIR: &str = "data/benchmark"; +#[allow(dead_code)] const LOG_FILE: &str = "data/benchmark/log.dat"; +#[allow(dead_code)] const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx"; +#[allow(dead_code)] const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx"; +#[allow(dead_code)] const SPARSE_INTERVAL: usize = 1000; const SEGMENT_BASE_PATH: &str = "data/rdf_benchmark"; @@ -48,7 +52,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64; for i in 0..1_000_000u64 { - let timestamp = base_timestamp + i * 1; // 1ms intervals + let timestamp = base_timestamp + i; // 1ms intervals let subject = format!("http://example.org/person/person_{}", i % 10000); let predicate = match i % 10 { 0..=3 => "http://example.org/knows", @@ -73,7 +77,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { } let write_duration = start_time.elapsed(); - let write_throughput = 1_000_000.0 / write_duration.as_secs_f64(); + let _write_throughput = 1_000_000.0 / write_duration.as_secs_f64(); // println!("\nWrite completed!"); // println!(" Duration: {:.3} seconds", write_duration.as_secs_f64()); @@ -100,7 +104,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { let results = storage.query_rdf(query_start_ts, query_end_ts)?; let query_duration = start_time.elapsed(); - let read_throughput = results.len() as f64 / query_duration.as_secs_f64(); + let _read_throughput = results.len() as f64 / query_duration.as_secs_f64(); // println!(" Results found: {}", results.len()); // println!(" Query time: {:.3} ms", query_duration.as_millis()); @@ -161,6 +165,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> { Ok(()) } +#[allow(dead_code)] fn setup_data(number_records: u64) -> std::io::Result<()> { let _ = fs::remove_dir_all(DATA_DIR); fs::create_dir_all(DATA_DIR)?; @@ -183,17 +188,18 @@ fn setup_data(number_records: u64) -> std::io::Result<()> { Ok(()) } +#[allow(dead_code)] fn benchmark_indexing() -> std::io::Result<()> { // println!("Indexing Benchmark"); let start = Instant::now(); dense::build_dense_index(LOG_FILE, DENSE_INDEX_FILE)?; - let dense_time = start.elapsed(); + let _dense_time = start.elapsed(); // println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0); let start = Instant::now(); sparse::build_sparse_index(LOG_FILE, SPARSE_INDEX_FILE, &SPARSE_INTERVAL)?; - let sparse_time = start.elapsed(); + let _sparse_time = start.elapsed(); // println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0); let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; @@ -210,6 +216,7 @@ fn benchmark_indexing() -> std::io::Result<()> { Ok(()) } +#[allow(dead_code)] fn benchmark_queries() -> std::io::Result<()> { // println!("Query Benchmark"); let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?; @@ -223,7 +230,7 @@ fn benchmark_queries() -> std::io::Result<()> { (0u64, 1000000u64, "1M records"), ]; - for (timestamp_start, timestamp_end, description) in query_ranges { + for (timestamp_start, timestamp_end, _description) in query_ranges { // println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end); let start = Instant::now(); @@ -297,7 +304,6 @@ fn benchmark_storage_performance() -> std::io::Result<()> { sparse_interval: 100, entries_per_index_block: 512, segment_base_path: format!("./benchmark_data_{}", num_records), - ..Default::default() }; // Clean up any existing data @@ -329,7 +335,7 @@ fn benchmark_storage_performance() -> std::io::Result<()> { } let write_duration = write_start.elapsed(); - let write_throughput = num_records as f64 / write_duration.as_secs_f64(); + let _write_throughput = num_records as f64 / write_duration.as_secs_f64(); // println!("Write Performance:"); // println!(" Duration: {:.3}s", write_duration.as_secs_f64()); @@ -341,7 +347,7 @@ fn benchmark_storage_performance() -> std::io::Result<()> { // println!("\nQuery Performance:"); - for (fraction, description) in query_ranges { + for (fraction, _description) in query_ranges { let query_count = 100.min(num_records / 10); // Run 100 queries or 10% of records, whichever is smaller let mut query_times = Vec::new(); let mut total_records_read = 0; @@ -365,16 +371,16 @@ fn benchmark_storage_performance() -> std::io::Result<()> { } let avg_query_time = query_times.iter().sum::() / query_times.len() as f64; - let queries_per_sec = 1.0 / avg_query_time; + let _queries_per_sec = 1.0 / avg_query_time; let total_query_time = query_times.iter().sum::(); - let records_per_sec = if total_query_time > 0.0 { + let _records_per_sec = if total_query_time > 0.0 { total_records_read as f64 / total_query_time } else { 0.0 }; - let avg_records_per_query = total_records_read as f64 / query_count as f64; - let min_time = query_times.iter().cloned().fold(f64::INFINITY, f64::min); - let max_time = query_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + let _avg_records_per_query = total_records_read as f64 / query_count as f64; + let _min_time = query_times.iter().cloned().fold(f64::INFINITY, f64::min); + let _max_time = query_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max); // println!(" {} queries ({}):", description, query_count); // println!(" Avg query time: {:.3}ms", avg_query_time * 1000.0); diff --git a/src/parsing/janusql_parser.rs b/src/parsing/janusql_parser.rs index 8a49414..100a7ec 100644 --- a/src/parsing/janusql_parser.rs +++ b/src/parsing/janusql_parser.rs @@ -60,28 +60,28 @@ pub struct ParsedJanusQuery { /// Parser for JanusQL queries pub struct JanusQLParser { - historical_sliding_window_regex: Regex, - historical_fixed_window_regex: Regex, - live_sliding_window_regex: Regex, - register_regex: Regex, - prefix_regex: Regex, + historical_sliding_window: Regex, + historical_fixed_window: Regex, + live_sliding_window: Regex, + register: Regex, + prefix: Regex, } impl JanusQLParser { /// Creates a new JanusQLParser instance. pub fn new() -> Result> { Ok(JanusQLParser { - historical_sliding_window_regex: Regex::new( + historical_sliding_window: Regex::new( r"FROM\s+NAMED\s+WINDOW\s+([^\s]+)\s+ON\s+STREAM\s+([^\s]+)\s+\[OFFSET\s+(\d+)\s+RANGE\s+(\d+)\s+STEP\s+(\d+)\]", )?, - historical_fixed_window_regex: Regex::new( + historical_fixed_window: Regex::new( r"FROM\s+NAMED\s+WINDOW\s+([^\s]+)\s+ON\s+STREAM\s+([^\s]+)\s+\[START\s+(\d+)\s+END\s+(\d+)\]", )?, - live_sliding_window_regex: Regex::new( + live_sliding_window: Regex::new( r"FROM\s+NAMED\s+WINDOW\s+([^\s]+)\s+ON\s+STREAM\s+([^\s]+)\s+\[RANGE\s+(\d+)\s+STEP\s+(\d+)\]", )?, - register_regex: Regex::new(r"REGISTER\s+(\w+)\s+([^\s]+)\s+AS")?, - prefix_regex: Regex::new(r"REGISTER\s+(\w+)\s+([^\s]+)\s+AS")?, + register: Regex::new(r"REGISTER\s+(\w+)\s+([^\s]+)\s+AS")?, + prefix: Regex::new(r"PREFIX\s+([^\s]+):\s*<([^>]+)>")?, }) } @@ -90,7 +90,7 @@ impl JanusQLParser { line: &str, prefix_mapper: &HashMap, ) -> Result, Box> { - if let Some(captures) = self.historical_sliding_window_regex.captures(line) { + if let Some(captures) = self.historical_sliding_window.captures(line) { return Ok(Some(WindowDefinition { window_name: self.unwrap_iri(&captures[1], prefix_mapper), stream_name: self.unwrap_iri(&captures[2], prefix_mapper), @@ -103,7 +103,7 @@ impl JanusQLParser { })); } - if let Some(captures) = self.historical_fixed_window_regex.captures(line) { + if let Some(captures) = self.historical_fixed_window.captures(line) { return Ok(Some(WindowDefinition { window_name: self.unwrap_iri(&captures[1], prefix_mapper), stream_name: self.unwrap_iri(&captures[2], prefix_mapper), @@ -116,7 +116,7 @@ impl JanusQLParser { })); } - if let Some(captures) = self.live_sliding_window_regex.captures(line) { + if let Some(captures) = self.live_sliding_window.captures(line) { return Ok(Some(WindowDefinition { window_name: self.unwrap_iri(&captures[1], prefix_mapper), stream_name: self.unwrap_iri(&captures[2], prefix_mapper), @@ -155,7 +155,7 @@ impl JanusQLParser { if trimmed_line.is_empty() || trimmed_line.starts_with("/*") - || trimmed_line.starts_with("*") + || trimmed_line.starts_with('*') || trimmed_line.starts_with("*/") { if in_where_clause && !trimmed_line.is_empty() { @@ -165,14 +165,14 @@ impl JanusQLParser { } if trimmed_line.starts_with("REGISTER") { - if let Some(captures) = self.register_regex.captures(trimmed_line) { + if let Some(captures) = self.register.captures(trimmed_line) { let operator = captures.get(1).unwrap().as_str().to_string(); let name_raw = captures.get(2).unwrap().as_str(); let name = self.unwrap_iri(name_raw, &parsed.prefixes); parsed.r2s = Some(R2SOperator { operator, name }); } } else if trimmed_line.starts_with("PREFIX") { - if let Some(captures) = self.prefix_regex.captures(trimmed_line) { + if let Some(captures) = self.prefix.captures(trimmed_line) { let prefix = captures.get(1).unwrap().as_str().to_string(); let namespace = captures.get(2).unwrap().as_str().to_string(); parsed.prefixes.insert(prefix, namespace); @@ -310,7 +310,7 @@ impl JanusQLParser { adapted } } - _ => adapted, + WindowType::Live => adapted, } } @@ -356,7 +356,7 @@ mod tests { #[test] fn test_basic_live_window() { let parser = JanusQLParser::new().unwrap(); - let query = r#" + let query = r" PREFIX sensor: PREFIX saref: REGISTER RStream sensor:output AS @@ -368,7 +368,7 @@ mod tests { ?event saref:hasTimestamp ?timestamp . } } - "#; + "; let result = parser.parse(query).unwrap(); assert_eq!(result.live_windows.len(), 1); @@ -381,7 +381,7 @@ mod tests { #[test] fn test_mixed_windows() { let parser = JanusQLParser::new().unwrap(); - let query = r#" + let query = r" PREFIX sensor: PREFIX saref: REGISTER RStream sensor:output AS @@ -403,16 +403,16 @@ mod tests { ?event saref:hasTimestamp ?timestamp . } } - "#; + "; let result = parser.parse(query).unwrap(); assert_eq!(result.live_windows.len(), 1); assert_eq!(result.historical_windows.len(), 2); assert_eq!(result.live_windows[0].width, 5000); assert_eq!(result.live_windows[0].slide, 1000); - assert_eq!(result.historical_windows[0].start, Some(1622505600)); - assert_eq!(result.historical_windows[0].end, Some(1622592000)); - assert_eq!(result.historical_windows[1].offset, Some(1622505600)); + assert_eq!(result.historical_windows[0].start, Some(1_622_505_600)); + assert_eq!(result.historical_windows[0].end, Some(1_622_592_000)); + assert_eq!(result.historical_windows[1].offset, Some(1_622_505_600)); assert_eq!(result.historical_windows[1].width, 10000); assert_eq!(result.historical_windows[1].slide, 2000); assert!(!result.rspql_query.is_empty()); diff --git a/src/storage/indexing/dictionary.rs b/src/storage/indexing/dictionary.rs index a1e44be..1308f8d 100644 --- a/src/storage/indexing/dictionary.rs +++ b/src/storage/indexing/dictionary.rs @@ -82,14 +82,14 @@ mod tests { let graph_id = dict.encode("http://example.org/graph1"); println!("Encoded IDs:"); - println!("Subject: {} -> {}", "http://example.org/person/Alice", subject_id); - println!("Predicate: {} -> {}", "http://example.org/knows", predicate_id); - println!("Object: {} -> {}", "http://example.org/person/Bob", object_id); - println!("Graph: {} -> {}", "http://example.org/graph1", graph_id); + println!("Subject: http://example.org/person/Alice -> {}", subject_id); + println!("Predicate: http://example.org/knows -> {}", predicate_id); + println!("Object: http://example.org/person/Bob -> {}", object_id); + println!("Graph: http://example.org/graph1 -> {}", graph_id); // Create an event let event = Event { - timestamp: 1234567890, + timestamp: 1_234_567_890, subject: subject_id, predicate: predicate_id, object: object_id, @@ -122,7 +122,7 @@ mod tests { // Test the clean API - user provides URIs directly let rdf_event = RDFEvent::new( - 1234567890, + 1_234_567_890, "http://example.org/person/Alice", "http://example.org/knows", "http://example.org/person/Bob", @@ -140,7 +140,7 @@ mod tests { assert_eq!(decoded_event.predicate, "http://example.org/knows"); assert_eq!(decoded_event.object, "http://example.org/person/Bob"); assert_eq!(decoded_event.graph, "http://example.org/graph1"); - assert_eq!(decoded_event.timestamp, 1234567890); + assert_eq!(decoded_event.timestamp, 1_234_567_890); println!("Clean API test passed!"); println!( diff --git a/src/storage/memory_tracker.rs b/src/storage/memory_tracker.rs index 2d731aa..594f766 100644 --- a/src/storage/memory_tracker.rs +++ b/src/storage/memory_tracker.rs @@ -58,6 +58,7 @@ impl MemoryTracker { } /// Get current memory statistics + #[allow(clippy::cast_precision_loss)] pub fn get_stats(&self) -> MemoryStats { let current = self.current_memory_bytes.load(Ordering::Relaxed); let peak = self.peak_memory_bytes.load(Ordering::Relaxed); @@ -157,12 +158,13 @@ impl MemoryTracker { } #[cfg(target_os = "macos")] + #[allow(dead_code)] fn get_memory_macos(&self) -> usize { use std::mem; use std::ptr; #[repr(C)] - struct task_basic_info { + struct TaskBasicInfo { virtual_size: u32, resident_size: u32, policy: u32, @@ -170,25 +172,27 @@ impl MemoryTracker { } extern "C" { + #[allow(dead_code)] fn mach_task_self() -> u32; + #[allow(dead_code)] fn task_info( target_task: u32, flavor: u32, - task_info_out: *mut task_basic_info, + task_info_out: *mut TaskBasicInfo, task_info_outCnt: *mut u32, ) -> i32; } const TASK_BASIC_INFO: u32 = 5; - let mut info: task_basic_info = unsafe { mem::zeroed() }; - let mut count = (mem::size_of::() / mem::size_of::()) as u32; + let mut info: TaskBasicInfo = unsafe { mem::zeroed() }; + let mut count = (mem::size_of::() / mem::size_of::()) as u32; let result = unsafe { task_info( mach_task_self(), TASK_BASIC_INFO, - &mut info as *mut task_basic_info, - &mut count, + &raw mut info, + &raw mut count, ) }; @@ -219,6 +223,7 @@ impl MemoryTracker { } /// Format bytes in human-readable format + #[allow(clippy::cast_precision_loss)] pub fn format_bytes(bytes: usize) -> String { const UNITS: &[&str] = &["B", "KB", "MB", "GB"]; let mut size = bytes as f64; diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs index 7dc9a9d..a2215a3 100644 --- a/src/storage/segmented_storage.rs +++ b/src/storage/segmented_storage.rs @@ -108,6 +108,7 @@ impl StreamingSegmentedStorage { self.write(encoded_event) } + #[allow(dead_code)] fn should_flush(&self) -> bool { let batch_buffer = self.batch_buffer.read().unwrap(); diff --git a/tests/dictionary_encoding_test.rs b/tests/dictionary_encoding_test.rs index 00a4f1e..269a9cf 100644 --- a/tests/dictionary_encoding_test.rs +++ b/tests/dictionary_encoding_test.rs @@ -174,7 +174,7 @@ fn test_dictionary_persistence() -> std::io::Result<()> { // Create and populate dictionary let mut dict = Dictionary::new(); - let uris = vec![ + let uris = [ "https://example.org/resource/event001", "http://www.w3.org/ns/saref#hasValue", "http://www.w3.org/2001/XMLSchema#dateTime", @@ -251,7 +251,7 @@ fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> { let mut dict = Dictionary::new(); // Define common IoT RDF predicates and graph URIs - let predicates = vec![ + let predicates = [ "http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue", "http://www.w3.org/ns/ssn#observedBy", @@ -318,7 +318,7 @@ fn test_sparse_index_with_dictionary_integration() -> std::io::Result<()> { // Define RDF components let predicates = - vec!["http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue"]; + ["http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue"]; let predicate_ids: Vec = predicates.iter().map(|p| dict.encode(p)).collect(); @@ -406,7 +406,7 @@ fn test_rdf_namespace_reuse() { let mut dict = Dictionary::new(); // Common RDF namespace URIs that should be reused - let common_namespaces = vec![ + let common_namespaces = [ "http://www.w3.org/1999/02/22-rdf-syntax-ns#", "http://www.w3.org/2000/01/rdf-schema#", "http://www.w3.org/2001/XMLSchema#", @@ -513,7 +513,7 @@ fn test_dictionary_space_savings() { let mut dict = Dictionary::new(); // Calculate space used by raw URIs - let uris = vec![ + let uris = [ "https://solid.ti.rw.fau.de/public/ns/stream#event001", "http://www.w3.org/ns/saref#hasTimestamp", "2025-11-05T10:30:00Z", diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 44f3268..df2e932 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -8,7 +8,7 @@ use janus::{Error, Result}; #[test] fn test_basic_functionality() { // TODO: Add integration tests - assert!(true); + // assert!(true); // Removed as it's always true } #[test] From 46af63c17dc74c535140f8586c0d551a7412efd6 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 12 Nov 2025 16:03:36 +0100 Subject: [PATCH 18/19] Fix rustfmt configuration - remove unstable features to support stable Rust --- rustfmt.toml | 39 ----------------------------------- src/storage/memory_tracker.rs | 10 ++------- 2 files changed, 2 insertions(+), 47 deletions(-) diff --git a/rustfmt.toml b/rustfmt.toml index c161143..276694f 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -24,63 +24,24 @@ chain_width = 80 # Maximum line length for single line if-else expressions single_line_if_else_max_width = 50 -# Brace style for items -brace_style = "SameLineWhere" - -# Brace style for control flow constructs -control_brace_style = "AlwaysSameLine" - # How to indent hard_tabs = false # Number of spaces per tab tab_spaces = 4 -# Where to put a binary operator when a binary expression goes multiline -binop_separator = "Front" - -# Combine control expressions with function calls -combine_control_expr = true - -# Maximum length of comments -comment_width = 100 - # Use field init shorthand if possible use_field_init_shorthand = true # Use try shorthand use_try_shorthand = true -# Format code in doc comments -format_code_in_doc_comments = true - -# Format strings using rustfmt -format_strings = true - -# Merge imports -imports_granularity = "Crate" - # Reorder imports reorder_imports = true # Reorder modules reorder_modules = true -# Where to put the opening brace of structs -struct_brace_style = "SameLineWhere" - -# Indent style for function parameters -indent_style = "Block" - -# Leave a space before the colon in a type annotation -space_before_colon = false - -# Leave a space after the colon in a type annotation -space_after_colon = true - -# Put empty-body functions and impls on a single line -empty_item_single_line = true - # Newline style newline_style = "Unix" diff --git a/src/storage/memory_tracker.rs b/src/storage/memory_tracker.rs index 594f766..e63f4a0 100644 --- a/src/storage/memory_tracker.rs +++ b/src/storage/memory_tracker.rs @@ -187,14 +187,8 @@ impl MemoryTracker { let mut info: TaskBasicInfo = unsafe { mem::zeroed() }; let mut count = (mem::size_of::() / mem::size_of::()) as u32; - let result = unsafe { - task_info( - mach_task_self(), - TASK_BASIC_INFO, - &raw mut info, - &raw mut count, - ) - }; + let result = + unsafe { task_info(mach_task_self(), TASK_BASIC_INFO, &raw mut info, &raw mut count) }; if result == 0 { info.resident_size as usize From 7be03aa3d83377cfa0731beb0bebb5acd1c43a54 Mon Sep 17 00:00:00 2001 From: Kush Bisen Date: Wed, 12 Nov 2025 16:08:30 +0100 Subject: [PATCH 19/19] Update GitHub Actions to latest versions: upload-artifact@v4, cache@v4 --- .github/workflows/ci.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 04876fb..d44b732 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,19 +42,19 @@ jobs: components: clippy - name: Cache cargo registry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/registry key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo index - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/git key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo build - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: target key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }} @@ -85,19 +85,19 @@ jobs: toolchain: ${{ matrix.rust }} - name: Cache cargo registry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/registry key: ${{ runner.os }}-${{ matrix.rust }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo index - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/git key: ${{ runner.os }}-${{ matrix.rust }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo build - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: target key: ${{ runner.os }}-${{ matrix.rust }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }} @@ -121,19 +121,19 @@ jobs: uses: dtolnay/rust-toolchain@stable - name: Cache cargo registry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/registry key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo index - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/git key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo build - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: target key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }} @@ -158,13 +158,13 @@ jobs: uses: taiki-e/install-action@cargo-llvm-cov - name: Cache cargo registry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/registry key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo index - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/git key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} @@ -205,19 +205,19 @@ jobs: targets: ${{ matrix.target }} - name: Cache cargo registry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/registry key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo index - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cargo/git key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} - name: Cache cargo build - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: target key: ${{ runner.os }}-cargo-build-target-${{ matrix.target }}-${{ hashFiles('**/Cargo.lock') }} @@ -226,7 +226,7 @@ jobs: run: cargo build --release --target ${{ matrix.target }} --verbose - name: Upload build artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: janus-${{ matrix.target }} path: |