From 285789b236c73d88cb75c66b3a64c31be476e7e7 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 5 Nov 2025 20:20:25 +0100
Subject: [PATCH 01/19] Implement dictionary module for RDF term encoding and
 add integration tests

---
 src/indexing/dictionary.rs        |  88 +++++
 src/indexing/mod.rs               |   2 +
 src/indexing/shared.rs            |  22 ++
 src/indexing/sparse.rs            | 156 +++++++-
 tests/dictionary_encoding_test.rs | 622 ++++++++++++++++++++++++++++++
 5 files changed, 889 insertions(+), 1 deletion(-)
 create mode 100644 src/indexing/dictionary.rs
 create mode 100644 tests/dictionary_encoding_test.rs
diff --git a/src/indexing/dictionary.rs b/src/indexing/dictionary.rs
new file mode 100644
index 0000000..d34a7ac
--- /dev/null
+++ b/src/indexing/dictionary.rs
@@ -0,0 +1,88 @@
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{Read, Write};
+use std::path::Path;
+
+#[derive(Debug)]
+pub struct Dictionary {
+    uri_to_id: HashMap<String, u64>,
+    id_to_uri: Vec<String>,
+    next_id: u64,
+}
+
+impl Dictionary {
+    pub fn new() -> Self {
+        Self { uri_to_id: HashMap::new(), id_to_uri: Vec::new(), next_id: 0 }
+    }
+
+    pub fn fetch_id(&mut self, uri: &str) -> u64 {
+        if let Some(&id) = self.uri_to_id.get(uri) {
+            id
+        } else {
+            let id = self.next_id;
+            self.uri_to_id.insert(uri.to_string(), id);
+            self.id_to_uri.push(uri.to_string());
+            self.next_id += 1;
+            id
+        }
+    }
+
+    pub fn fetch_uri(&self, id: u64) -> Option<&str> {
+        self.id_to_uri.get(id as usize).map(|s| s.as_str())
+    }
+
+    pub fn len(&self) -> usize {
+        self.uri_to_id.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.uri_to_id.is_empty()
+    }
+
+    pub fn save_to_file(&self, path: &Path) -> std::io::Result<()> {
+        let mut file = File::create(path)?;
+        file.write_all(&(self.id_to_uri.len() as u64).to_be_bytes())?;
+
+        for uri in &self.id_to_uri {
+            let uri_bytes = uri.as_bytes();
+            file.write_all(&(uri_bytes.len() as u32).to_be_bytes())?;
+            file.write_all(uri_bytes)?;
+        }
+        Ok(())
+    }
+
+    pub fn load_from_file(path: &Path) -> std::io::Result<Self> {
+        let mut file = File::open(path)?;
+        let mut uri_to_id = HashMap::new();
+        let mut id_to_uri = Vec::new();
+
+        // Reading the number of entries
+        let mut count_bytes = [0u8; 8];
+        file.read_exact(&mut count_bytes)?;
+        let count = u64::from_be_bytes(count_bytes);
+
+        // Reading each IRI Entry
+
+        for id in 0..count {
+            let mut len_bytes = [0u8; 4];
+            file.read_exact(&mut len_bytes)?;
+
+            let length = u32::from_be_bytes(len_bytes) as usize;
+            let mut uri_bytes = vec![0u8; length];
+            file.read_exact(&mut uri_bytes)?;
+            let uri = String::from_utf8(uri_bytes)
+                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+
+            uri_to_id.insert(uri.clone(), id);
+            id_to_uri.push(uri);
+        }
+
+        Ok(Self { uri_to_id, id_to_uri, next_id: count })
+    }
+}
+
+impl Default for Dictionary {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs
index 1fd1692..6f0579a 100644
--- a/src/indexing/mod.rs
+++ b/src/indexing/mod.rs
@@ -4,3 +4,5 @@ pub mod shared;
 pub mod dense;
 #[doc=""]
 pub mod sparse;
+#[doc=""]
+pub mod dictionary;
diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs
index 4c262f4..97a1155 100644
--- a/src/indexing/shared.rs
+++ b/src/indexing/shared.rs
@@ -1,5 +1,6 @@
 use std::fs::File;
 use std::io::Write;
+use crate::indexing::dictionary::Dictionary;
 
 #[doc = ""]
 pub const RECORD_SIZE: usize = 40;
@@ -90,3 +91,24 @@ pub struct Event {
     #[doc = ""]
     pub graph: u64,
 }
+
+#[derive(Debug, Clone)]
+pub struct ResolvedEvent{
+    pub timestamp: u64,
+    pub subject: String,
+    pub predicate: String,
+    pub object: String,
+    pub graph: String,
+}
+
+impl Event {
+    pub fn resolve(&self, dict: &Dictionary) -> ResolvedEvent {
+        ResolvedEvent {
+            timestamp: self.timestamp,
+            subject: dict.fetch_uri(self.subject).unwrap_or("UNKNOWN").to_string(),
+            predicate: dict.fetch_uri(self.predicate).unwrap_or("UNKNOWN").to_string(),
+            object: dict.fetch_uri(self.object).unwrap_or("UNKNOWN").to_string(),
+            graph: dict.fetch_uri(self.graph).unwrap_or("UNKNOWN").to_string()
+         }
+    }
+}
diff --git a/src/indexing/sparse.rs b/src/indexing/sparse.rs
index 110b4f7..36f3a99 100644
--- a/src/indexing/sparse.rs
+++ b/src/indexing/sparse.rs
@@ -1,7 +1,13 @@
-use crate::indexing::shared::{decode_record, Event, RECORD_SIZE};
+use crate::indexing::dictionary::{self, Dictionary};
+use crate::indexing::shared::{decode_record, Event, ResolvedEvent, RECORD_SIZE};
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom, Write};
+use std::path::Path;
 
+/// Builder for creating sparse indexes that store only periodic entries.
+///
+/// A sparse index reduces storage space by indexing only every Nth record,
+/// trading some query precision for significant space savings.
 #[doc = ""]
 pub struct SparseIndexBuilder {
     index_file: File,
@@ -9,12 +15,31 @@ pub struct SparseIndexBuilder {
 }
 #[doc = ""]
 impl SparseIndexBuilder {
+    /// Creates a new sparse index builder that writes to the specified file.
+    ///
+    /// # Arguments
+    /// * `index_path` - Path where the index file will be created
+    /// * `interval` - Number of records between index entries (e.g., 1000 means index every 1000th record)
+    ///
+    /// # Returns
+    /// A new `SparseIndexBuilder` instance or an I/O error
     #[doc = ""]
     pub fn create(index_path: &str, interval: usize) -> std::io::Result<Self> {
         let index_file = File::create(index_path)?;
         Ok(Self { index_file, interval })
     }
 
+    /// Adds an entry to the sparse index if the record count matches the interval.
+    ///
+    /// Only records where `record_count % interval == 0` are indexed to save space.
+    ///
+    /// # Arguments
+    /// * `record_count` - The current record number in the log
+    /// * `timestamp` - Timestamp of the record
+    /// * `offset` - Byte offset of the record in the log file
+    ///
+    /// # Returns
+    /// `true` if the entry was added to the index, `false` if skipped
     #[doc = ""]
     pub fn add_entry(
         &mut self,
@@ -31,12 +56,27 @@ impl SparseIndexBuilder {
         }
     }
 
+    /// Finalizes the index by flushing any buffered writes to disk.
+    ///
+    /// This should be called after all entries have been added.
     #[doc = ""]
     pub fn finalize(&mut self) -> std::io::Result<()> {
         self.index_file.flush()
     }
 }
 
+/// Builds a sparse index for an existing log file.
+///
+/// This function reads through the entire log file and creates an index
+/// with entries only for records at the specified interval.
+///
+/// # Arguments
+/// * `log_path` - Path to the log file to index
+/// * `index_path` - Path where the index file will be created
+/// * `interval` - Number of records between index entries
+///
+/// # Returns
+/// Ok(()) on success, or an I/O error
 pub fn build_sparse_index(
     log_path: &str,
     index_path: &str,
@@ -60,12 +100,110 @@ pub fn build_sparse_index(
     Ok(())
 }
 
+/// Builds a sparse index and initializes an empty dictionary.
+///
+/// This is a convenience function that creates both the index and
+/// an empty dictionary file. The dictionary can be populated separately
+/// when processing RDF data.
+///
+/// # Arguments
+/// * `log_path` - Path to the log file to index
+/// * `index_path` - Path where the index file will be created
+/// * `dictionary_path` - Path where the dictionary file will be created
+/// * `interval` - Number of records between index entries
+///
+/// # Returns
+/// Ok(()) on success, or an I/O error
+pub fn build_sparse_index_with_dictionary(
+    log_path: &str,
+    index_path: &str,
+    dictionary_path: &str,
+    interval: &usize,
+) -> std::io::Result<()> {
+    let mut log = File::open(log_path)?;
+    let mut builder = SparseIndexBuilder::create(index_path, *interval)?;
+    let dictionary = Dictionary::new();
+
+    let mut offset = 0u64;
+    let mut record_count = 0u64;
+    let mut record = [0u8; RECORD_SIZE];
+
+    while log.read_exact(&mut record).is_ok() {
+        let (timestamp, _subject, _predicate, _object, _graph) = decode_record(&record);
+
+        builder.add_entry(record_count, timestamp, offset)?;
+
+        offset += RECORD_SIZE as u64;
+        record_count += 1;
+    }
+
+    builder.finalize()?;
+    dictionary.save_to_file(Path::new(dictionary_path))?;
+
+    Ok(())
+}
+
+/// Reader for sparse indexes that enables efficient timestamp-based queries.
+///
+/// The sparse reader loads the entire index into memory for fast binary search,
+/// then performs sequential scans of the log file starting from the appropriate position.
 pub struct SparseReader {
     index: Vec<(u64, u64)>,
+    #[allow(dead_code)]
     interval: usize,
 }
 
 impl SparseReader {
+    /// Opens a sparse index and its associated dictionary.
+    ///
+    /// # Arguments
+    /// * `index_path` - Path to the sparse index file
+    /// * `dictionary_path` - Path to the dictionary file
+    /// * `interval` - The interval used when building the index
+    ///
+    /// # Returns
+    /// A tuple of (SparseReader, Dictionary) or an I/O error
+    pub fn open_with_dictionary(
+        index_path: &str,
+        dictionary_path: &str,
+        interval: usize,
+    ) -> std::io::Result<(Self, Dictionary)> {
+        let reader = Self::open(index_path, interval)?;
+        let dictionary = Dictionary::load_from_file(Path::new(dictionary_path))?;
+        Ok((reader, dictionary))
+    }
+    /// Queries the log and returns results with URIs resolved from the dictionary.
+    ///
+    /// This method performs the same query as `query()` but resolves all numeric IDs
+    /// back to their original URI strings using the provided dictionary.
+    ///
+    /// # Arguments
+    /// * `log_path` - Path to the log file
+    /// * `dict` - Dictionary for resolving IDs to URIs
+    /// * `timestamp_start_bound` - Minimum timestamp (inclusive)
+    /// * `timestamp_end_bound` - Maximum timestamp (inclusive)
+    ///
+    /// # Returns
+    /// Vector of resolved events or an I/O error
+    pub fn query_resolved(
+        &self,
+        log_path: &str,
+        dict: &Dictionary,
+        timestamp_start_bound: u64,
+        timestamp_end_bound: u64,
+    ) -> std::io::Result<Vec<ResolvedEvent>> {
+        let events = self.query(log_path, timestamp_start_bound, timestamp_end_bound)?;
+        Ok(events.into_iter().map(|e| e.resolve(dict)).collect())
+    }
+
+    /// Opens a sparse index file and loads it into memory.
+    ///
+    /// # Arguments
+    /// * `index_path` - Path to the sparse index file
+    /// * `interval` - The interval used when building the index
+    ///
+    /// # Returns
+    /// A new SparseReader instance or an I/O error
     pub fn open(index_path: &str, interval: usize) -> std::io::Result<Self> {
         let mut index_file = File::open(index_path)?;
         let mut index = Vec::new();
@@ -80,6 +218,18 @@ impl SparseReader {
         Ok(Self { index, interval })
     }
 
+    /// Queries the log file for events within the specified timestamp range.
+    ///
+    /// Uses binary search on the index to find the starting position, then
+    /// performs a sequential scan of the log file to collect matching events.
+    ///
+    /// # Arguments
+    /// * `log_path` - Path to the log file
+    /// * `timestamp_start_bound` - Minimum timestamp (inclusive)
+    /// * `timestamp_end_bound` - Maximum timestamp (inclusive)
+    ///
+    /// # Returns
+    /// Vector of events with numeric IDs or an I/O error
     pub fn query(
         &self,
         log_path: &str,
@@ -120,6 +270,10 @@ impl SparseReader {
         Ok(results)
     }
 
+    /// Returns the size of the index in bytes.
+    ///
+    /// Each index entry is 16 bytes (8 bytes timestamp + 8 bytes offset),
+    /// so this returns `index.len() * 16`.
     pub fn index_size_bytes(&self) -> usize {
         self.index.len() * 16
     }
diff --git a/tests/dictionary_encoding_test.rs b/tests/dictionary_encoding_test.rs
new file mode 100644
index 0000000..75c6e9d
--- /dev/null
+++ b/tests/dictionary_encoding_test.rs
@@ -0,0 +1,622 @@
+//! Dictionary Encoding Integration Tests
+//!
+//! These tests verify the dictionary-based encoding system for RDF terms.
+//!
+//! **Important**: The dictionary stores the actual URI/literal strings WITHOUT RDF syntax:
+//! - URIs: stored as "https://example.org/resource" (not "<https://example.org/resource>")
+//! - Literals: stored as the value string (e.g., "23.5" or "2025-11-05T10:30:00Z")
+//! - Datatypes: stored separately as URIs (e.g., "http://www.w3.org/2001/XMLSchema#double")
+//!
+//! The RDF syntax (angle brackets, quotes, ^^datatype) is handled by the RDF parser/serializer,
+//! not by the dictionary encoding layer. This keeps the dictionary implementation clean and
+//! format-agnostic.
+//!
+//! Example RDF triple in Turtle syntax:
+//! ```turtle
+//! <https://rsp.js/event1> <http://www.w3.org/ns/saref#hasValue> "23.5"^^<http://www.w3.org/2001/XMLSchema#double> .
+//! ```
+//!
+//! Is stored in the dictionary as 4 separate entries:
+//! - Subject ID → "https://rsp.js/event1"
+//! - Predicate ID → "http://www.w3.org/ns/saref#hasValue"
+//! - Object ID → "23.5" (the literal value)
+//! - Datatype ID → "http://www.w3.org/2001/XMLSchema#double" (if needed)
+
+use janus::indexing::dictionary::Dictionary;
+use janus::indexing::shared::{decode_record, encode_record, LogWriter, RECORD_SIZE};
+use janus::indexing::sparse::{build_sparse_index, SparseReader};
+use std::fs;
+use std::path::Path;
+
+#[test]
+fn test_rdf_syntax_to_dictionary_mapping() {
+    let mut dict = Dictionary::new();
+
+    // RDF Triple in Turtle syntax:
+    // <https://rsp.js/event1> <http://www.w3.org/ns/saref#hasValue> "23.5"^^<http://www.w3.org/2001/XMLSchema#double> <https://example.org/graph> .
+    //
+    // The parser would extract these components and store them WITHOUT RDF syntax:
+
+    // Subject: <https://rsp.js/event1> → stored as the URI string
+    let subject = "https://rsp.js/event1";
+    let subject_id = dict.fetch_id(subject);
+
+    // Predicate: <http://www.w3.org/ns/saref#hasValue> → stored as the URI string
+    let predicate = "http://www.w3.org/ns/saref#hasValue";
+    let predicate_id = dict.fetch_id(predicate);
+
+    // Object: "23.5"^^xsd:double → stored as the literal value "23.5"
+    let object = "23.5";
+    let object_id = dict.fetch_id(object);
+
+    // Datatype: ^^<http://www.w3.org/2001/XMLSchema#double> → stored as URI string
+    let datatype = "http://www.w3.org/2001/XMLSchema#double";
+    let datatype_id = dict.fetch_id(datatype);
+
+    // Graph: <https://example.org/graph> → stored as the URI string
+    let graph = "https://example.org/graph";
+    let graph_id = dict.fetch_id(graph);
+
+    // Verify all components are stored correctly
+    assert_eq!(dict.fetch_uri(subject_id), Some(subject));
+    assert_eq!(dict.fetch_uri(predicate_id), Some(predicate));
+    assert_eq!(dict.fetch_uri(object_id), Some(object));
+    assert_eq!(dict.fetch_uri(datatype_id), Some(datatype));
+    assert_eq!(dict.fetch_uri(graph_id), Some(graph));
+
+    // In a real system, you'd also store metadata about which IDs are literals vs URIs
+    // and what datatype each literal has. This test just demonstrates the string storage.
+}
+
+#[test]
+fn test_rdf_literal_datatypes() {
+    let mut dict = Dictionary::new();
+
+    // Example RDF triples with different literal types:
+    //
+    // Triple 1: <event1> <hasTimestamp> "2025-11-05T10:30:00Z"^^xsd:dateTime
+    let timestamp_value = "2025-11-05T10:30:00Z";
+    let timestamp_datatype = "http://www.w3.org/2001/XMLSchema#dateTime";
+
+    // Triple 2: <event1> <hasTemperature> "23.5"^^xsd:double
+    let temp_value = "23.5";
+    let temp_datatype = "http://www.w3.org/2001/XMLSchema#double";
+
+    // Triple 3: <event1> <hasCount> "42"^^xsd:integer
+    let count_value = "42";
+    let count_datatype = "http://www.w3.org/2001/XMLSchema#integer";
+
+    // Triple 4: <event1> <hasLabel> "Sensor Reading"^^xsd:string
+    let label_value = "Sensor Reading";
+    let label_datatype = "http://www.w3.org/2001/XMLSchema#string";
+
+    // Store all values and datatypes in dictionary
+    let timestamp_val_id = dict.fetch_id(timestamp_value);
+    let timestamp_dt_id = dict.fetch_id(timestamp_datatype);
+
+    let temp_val_id = dict.fetch_id(temp_value);
+    let temp_dt_id = dict.fetch_id(temp_datatype);
+
+    let count_val_id = dict.fetch_id(count_value);
+    let count_dt_id = dict.fetch_id(count_datatype);
+
+    let label_val_id = dict.fetch_id(label_value);
+    let label_dt_id = dict.fetch_id(label_datatype);
+
+    // Verify all are stored correctly
+    assert_eq!(dict.fetch_uri(timestamp_val_id), Some(timestamp_value));
+    assert_eq!(dict.fetch_uri(timestamp_dt_id), Some(timestamp_datatype));
+
+    assert_eq!(dict.fetch_uri(temp_val_id), Some(temp_value));
+    assert_eq!(dict.fetch_uri(temp_dt_id), Some(temp_datatype));
+
+    assert_eq!(dict.fetch_uri(count_val_id), Some(count_value));
+    assert_eq!(dict.fetch_uri(count_dt_id), Some(count_datatype));
+
+    assert_eq!(dict.fetch_uri(label_val_id), Some(label_value));
+    assert_eq!(dict.fetch_uri(label_dt_id), Some(label_datatype));
+
+    // Note: Datatype URIs are reused across multiple literals
+    // E.g., many literals will have ^^xsd:double as their datatype
+    assert_eq!(temp_dt_id, dict.fetch_id(temp_datatype)); // Same ID when requested again
+}
+
+#[test]
+fn test_dictionary_basic_operations() {
+    let mut dict = Dictionary::new();
+
+    // Test get_or_insert with real RDF URIs
+    let uri1 = "https://rsp.js/event1";
+    let uri2 = "http://www.w3.org/ns/saref#hasTimestamp";
+    let uri3 = "http://example.org/sensor/temperature";
+    let uri4 = "http://www.w3.org/ns/ssn#observedBy";
+
+    // First insertion should return ID 0
+    let id1 = dict.fetch_id(uri1);
+    assert_eq!(id1, 0);
+
+    // Subsequent insertions should return sequential IDs
+    let id2 = dict.fetch_id(uri2);
+    assert_eq!(id2, 1);
+
+    let id3 = dict.fetch_id(uri3);
+    assert_eq!(id3, 2);
+
+    let id4 = dict.fetch_id(uri4);
+    assert_eq!(id4, 3);
+
+    // Requesting same URI should return same ID
+    let id1_again = dict.fetch_id(uri1);
+    assert_eq!(id1_again, id1);
+
+    // Test retrieval
+    assert_eq!(dict.fetch_uri(id1), Some(uri1));
+    assert_eq!(dict.fetch_uri(id2), Some(uri2));
+    assert_eq!(dict.fetch_uri(id3), Some(uri3));
+    assert_eq!(dict.fetch_uri(id4), Some(uri4));
+
+    // Test invalid ID
+    assert_eq!(dict.fetch_uri(999), None);
+
+    // Test length
+    assert_eq!(dict.len(), 4);
+    assert!(!dict.is_empty());
+}
+
+#[test]
+fn test_dictionary_persistence() -> std::io::Result<()> {
+    let test_dir = "target/test_data/dict_persistence";
+    let _ = fs::remove_dir_all(test_dir);
+    fs::create_dir_all(test_dir)?;
+
+    let dict_path = Path::new(test_dir).join("test_dict.bin");
+
+    // Create and populate dictionary
+    let mut dict = Dictionary::new();
+    let uris = vec![
+        "https://example.org/resource/event001",
+        "http://www.w3.org/ns/saref#hasValue",
+        "http://www.w3.org/2001/XMLSchema#dateTime",
+        "https://solid.ti.rw.fau.de/public/ns/stream#",
+    ];
+
+    let ids: Vec<u64> = uris.iter().map(|uri| dict.fetch_id(uri)).collect();
+
+    // Save to file
+    dict.save_to_file(&dict_path)?;
+
+    // Load from file
+    let loaded_dict = Dictionary::load_from_file(&dict_path)?;
+
+    // Verify all URIs are preserved with correct IDs
+    for (i, uri) in uris.iter().enumerate() {
+        assert_eq!(loaded_dict.fetch_uri(ids[i]), Some(*uri));
+    }
+
+    assert_eq!(loaded_dict.len(), uris.len());
+
+    Ok(())
+}
+
+#[test]
+fn test_rdf_event_encoding_with_dictionary() {
+    let mut dict = Dictionary::new();
+
+    // RDF Quad in N-Quads syntax would look like:
+    // <https://rsp.js/event/sensor-reading-001> <http://www.w3.org/ns/saref#hasTimestamp> "2025-11-05T10:30:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> <https://solid.ti.rw.fau.de/public/ns/stream#default> .
+    //
+    // But we store the actual string values WITHOUT syntax markers:
+
+    let subject_uri = "https://rsp.js/event/sensor-reading-001";
+    let predicate_uri = "http://www.w3.org/ns/saref#hasTimestamp";
+    let object_uri = "2025-11-05T10:30:00Z"; // The literal value itself
+    let graph_uri = "https://solid.ti.rw.fau.de/public/ns/stream#default";
+
+    // Map URIs to IDs
+    let timestamp: u64 = 1699181400;
+    let subject_id = dict.fetch_id(subject_uri);
+    let predicate_id = dict.fetch_id(predicate_uri);
+    let object_id = dict.fetch_id(object_uri);
+    let graph_id = dict.fetch_id(graph_uri);
+
+    // Encode record with IDs
+    let mut buffer = [0u8; RECORD_SIZE];
+    encode_record(&mut buffer, timestamp, subject_id, predicate_id, object_id, graph_id);
+
+    // Decode record
+    let (dec_timestamp, dec_subject, dec_predicate, dec_object, dec_graph) = decode_record(&buffer);
+
+    // Verify IDs are correctly encoded/decoded
+    assert_eq!(dec_timestamp, timestamp);
+    assert_eq!(dec_subject, subject_id);
+    assert_eq!(dec_predicate, predicate_id);
+    assert_eq!(dec_object, object_id);
+    assert_eq!(dec_graph, graph_id);
+
+    // Resolve IDs back to URIs
+    assert_eq!(dict.fetch_uri(dec_subject), Some(subject_uri));
+    assert_eq!(dict.fetch_uri(dec_predicate), Some(predicate_uri));
+    assert_eq!(dict.fetch_uri(dec_object), Some(object_uri));
+    assert_eq!(dict.fetch_uri(dec_graph), Some(graph_uri));
+}
+
+#[test]
+fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> {
+    let test_dir = "target/test_data/iot_sensor";
+    let _ = fs::remove_dir_all(test_dir);
+    fs::create_dir_all(test_dir)?;
+
+    let log_path = format!("{}/iot_sensor.log", test_dir);
+    let mut dict = Dictionary::new();
+
+    // Define common IoT RDF predicates and graph URIs
+    let predicates = vec![
+        "http://www.w3.org/ns/saref#hasTimestamp",
+        "http://www.w3.org/ns/saref#hasValue",
+        "http://www.w3.org/ns/ssn#observedBy",
+        "http://www.w3.org/ns/sosa#observedProperty",
+    ];
+
+    // Map predicates to IDs first (these will be reused)
+    let predicate_ids: Vec<u64> = predicates.iter().map(|p| dict.fetch_id(p)).collect();
+
+    let graph_uri = "https://solid.ti.rw.fau.de/public/ns/stream#iot";
+    let graph_id = dict.fetch_id(graph_uri);
+
+    // Create log writer
+    let mut writer = LogWriter::create(&log_path)?;
+
+    // Generate 100 IoT sensor events with unique event IDs but shared predicates
+    for i in 0..100 {
+        let timestamp = 1699181400 + i;
+
+        // Each event has unique subject (sensor reading ID)
+        let subject_uri = format!("https://rsp.js/event/sensor-reading-{:03}", i);
+        let subject_id = dict.fetch_id(&subject_uri);
+
+        // Rotate through predicates (demonstrating reuse)
+        let predicate_id = predicate_ids[(i % predicate_ids.len() as u64) as usize];
+
+        // Unique object (sensor value)
+        let object_uri = format!("value-{}", i * 10);
+        let object_id = dict.fetch_id(&object_uri);
+
+        writer.append_record(timestamp, subject_id, predicate_id, object_id, graph_id)?;
+    }
+
+    writer.flush()?;
+
+    // Verify dictionary statistics
+    // We should have:
+    // - 100 unique subjects
+    // - 4 predicates (reused)
+    // - 100 unique objects
+    // - 1 graph URI
+    // Total: 205 unique URIs
+    assert_eq!(dict.len(), 205);
+
+    // Verify predicate reuse - predicates should have low IDs (0-3)
+    for (i, pred) in predicates.iter().enumerate() {
+        assert_eq!(dict.fetch_id(pred), i as u64);
+    }
+
+    Ok(())
+}
+
+#[test]
+fn test_sparse_index_with_dictionary_integration() -> std::io::Result<()> {
+    let test_dir = "target/test_data/sparse_integration";
+    let _ = fs::remove_dir_all(test_dir);
+    fs::create_dir_all(test_dir)?;
+
+    let log_path = format!("{}/indexed_sensor.log", test_dir);
+    let index_path = format!("{}/indexed_sensor.idx", test_dir);
+    let dict_path = format!("{}/indexed_sensor_dict.bin", test_dir);
+
+    let mut dict = Dictionary::new();
+
+    // Define RDF components
+    let predicates =
+        vec!["http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue"];
+
+    let predicate_ids: Vec<u64> = predicates.iter().map(|p| dict.fetch_id(p)).collect();
+
+    let graph_uri = "https://example.org/graph/sensors";
+    let graph_id = dict.fetch_id(graph_uri);
+
+    // Create log with 1000 events
+    let mut writer = LogWriter::create(&log_path)?;
+
+    for i in 0..1000 {
+        let timestamp = i;
+        let subject_uri = format!("https://rsp.js/event/{:04}", i);
+        let subject_id = dict.fetch_id(&subject_uri);
+        let predicate_id = predicate_ids[(i % 2) as usize];
+        let object_uri = format!("reading-{}", i);
+        let object_id = dict.fetch_id(&object_uri);
+
+        writer.append_record(timestamp, subject_id, predicate_id, object_id, graph_id)?;
+    }
+
+    writer.flush()?;
+
+    // Save dictionary BEFORE building index
+    dict.save_to_file(Path::new(&dict_path))?;
+
+    // Build sparse index (without dictionary parameter since we saved it separately)
+    build_sparse_index(&log_path, &index_path, &100)?;
+
+    // Load dictionary and reader
+    let (reader, loaded_dict) = SparseReader::open_with_dictionary(&index_path, &dict_path, 100)?;
+
+    // Query a range and verify results
+    let results = reader.query_resolved(&log_path, &loaded_dict, 100, 199)?;
+
+    // Should get 100 events (timestamps 100-199)
+    assert_eq!(results.len(), 100);
+
+    // Verify first result has resolved URIs
+    assert!(results[0].subject.starts_with("https://rsp.js/event/"));
+    assert!(results[0].predicate.starts_with("http://www.w3.org/ns/saref#"));
+    assert!(results[0].object.starts_with("reading-"));
+    assert_eq!(results[0].graph, graph_uri);
+
+    // Verify timestamps are in order
+    for (i, event) in results.iter().enumerate() {
+        assert_eq!(event.timestamp, 100 + i as u64);
+    }
+
+    Ok(())
+}
+
+#[test]
+fn test_large_uri_handling() {
+    let mut dict = Dictionary::new();
+
+    // Test with very long URIs (realistic for RDF)
+    let long_uri = format!(
+        "https://solid.ti.rw.fau.de/public/2025/11/05/sensors/building-3/floor-2/room-205/temperature-sensor-{}/reading-{}",
+        "TMP-4532-XYZ-9871-ABC-DEF",
+        "measurement-with-very-long-identifier-12345678901234567890"
+    );
+
+    let id = dict.fetch_id(&long_uri);
+    assert_eq!(id, 0);
+
+    // Verify retrieval works
+    assert_eq!(dict.fetch_uri(id), Some(long_uri.as_str()));
+
+    // Test that we can handle many long URIs
+    for i in 0..100 {
+        let uri = format!(
+            "https://example.org/very/long/path/to/resource/{}/subresource/{}/final-resource-{}",
+            i,
+            i * 2,
+            i * 3
+        );
+        dict.fetch_id(&uri);
+    }
+
+    assert_eq!(dict.len(), 101);
+}
+
+#[test]
+fn test_rdf_namespace_reuse() {
+    let mut dict = Dictionary::new();
+
+    // Common RDF namespace URIs that should be reused
+    let common_namespaces = vec![
+        "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+        "http://www.w3.org/2000/01/rdf-schema#",
+        "http://www.w3.org/2001/XMLSchema#",
+        "http://www.w3.org/ns/saref#",
+        "http://www.w3.org/ns/ssn#",
+        "http://www.w3.org/ns/sosa#",
+    ];
+
+    // Map each namespace
+    let namespace_ids: Vec<u64> = common_namespaces.iter().map(|ns| dict.fetch_id(ns)).collect();
+
+    // Create 1000 events that all use these namespaces
+    for i in 0..1000 {
+        let event_uri = format!("https://rsp.js/event/{}", i);
+        dict.fetch_id(&event_uri);
+
+        // Reference one of the common namespaces
+        let ns_id = namespace_ids[i % namespace_ids.len()];
+        assert!(dict.fetch_uri(ns_id).is_some());
+    }
+
+    // Dictionary should have: 6 namespaces + 1000 events = 1006 entries
+    assert_eq!(dict.len(), 1006);
+
+    // Verify namespace IDs are unchanged (demonstrating reuse)
+    for (i, ns) in common_namespaces.iter().enumerate() {
+        assert_eq!(dict.fetch_id(ns), namespace_ids[i]);
+    }
+}
+
+#[test]
+fn test_event_resolution_workflow() -> std::io::Result<()> {
+    let test_dir = "target/test_data/event_resolution";
+    let _ = fs::remove_dir_all(test_dir);
+    fs::create_dir_all(test_dir)?;
+
+    let log_path = format!("{}/resolution_test.log", test_dir);
+    let mut dict = Dictionary::new();
+
+    // Create realistic RDF event
+    let event_uris = vec![
+        (
+            1699181400u64,
+            "https://rsp.js/event/temp-reading-001",
+            "http://www.w3.org/ns/saref#hasValue",
+            "23.5",
+            "https://example.org/graph/sensors",
+        ),
+        (
+            1699181401u64,
+            "https://rsp.js/event/temp-reading-002",
+            "http://www.w3.org/ns/saref#hasValue",
+            "24.1",
+            "https://example.org/graph/sensors",
+        ),
+        (
+            1699181402u64,
+            "https://rsp.js/event/humidity-reading-001",
+            "http://www.w3.org/ns/saref#hasValue",
+            "65.0",
+            "https://example.org/graph/sensors",
+        ),
+    ];
+
+    // Write events with dictionary encoding
+    let mut writer = LogWriter::create(&log_path)?;
+
+    for (timestamp, subject, predicate, object, graph) in &event_uris {
+        let subject_id = dict.fetch_id(subject);
+        let predicate_id = dict.fetch_id(predicate);
+        let object_id = dict.fetch_id(object);
+        let graph_id = dict.fetch_id(graph);
+
+        writer.append_record(*timestamp, subject_id, predicate_id, object_id, graph_id)?;
+    }
+
+    writer.flush()?;
+
+    // Read back and resolve
+    let mut log_file = std::fs::File::open(&log_path)?;
+    use std::io::Read;
+
+    for (timestamp, subject, predicate, object, graph) in &event_uris {
+        let mut buffer = [0u8; RECORD_SIZE];
+        log_file.read_exact(&mut buffer)?;
+
+        let (dec_ts, dec_subj_id, dec_pred_id, dec_obj_id, dec_graph_id) = decode_record(&buffer);
+
+        // Verify timestamp
+        assert_eq!(dec_ts, *timestamp);
+
+        // Resolve IDs to URIs
+        assert_eq!(dict.fetch_uri(dec_subj_id), Some(*subject));
+        assert_eq!(dict.fetch_uri(dec_pred_id), Some(*predicate));
+        assert_eq!(dict.fetch_uri(dec_obj_id), Some(*object));
+        assert_eq!(dict.fetch_uri(dec_graph_id), Some(*graph));
+    }
+
+    Ok(())
+}
+
+#[test]
+fn test_dictionary_space_savings() {
+    let mut dict = Dictionary::new();
+
+    // Calculate space used by raw URIs
+    let uris = vec![
+        "https://solid.ti.rw.fau.de/public/ns/stream#event001",
+        "http://www.w3.org/ns/saref#hasTimestamp",
+        "2025-11-05T10:30:00Z",
+        "https://solid.ti.rw.fau.de/public/ns/stream#default",
+    ];
+
+    let raw_size: usize = uris.iter().map(|u| u.len()).sum();
+
+    // With dictionary, we store 8 bytes per ID
+    let ids: Vec<u64> = uris.iter().map(|u| dict.fetch_id(u)).collect();
+    let encoded_size = ids.len() * 8; // 8 bytes per u64
+
+    println!("Raw URIs size: {} bytes", raw_size);
+    println!("Encoded IDs size: {} bytes", encoded_size);
+    println!("Space savings per record: {} bytes", raw_size - encoded_size);
+
+    // For 1000 records reusing same URIs:
+    let records = 1000;
+    let raw_total = raw_size * records;
+    let encoded_total = encoded_size * records + raw_size; // IDs + dictionary overhead
+
+    println!("\nFor {} records:", records);
+    println!("Raw storage: {} bytes", raw_total);
+    println!(
+        "Dictionary storage: {} bytes (IDs) + {} bytes (dictionary)",
+        encoded_size * records,
+        raw_size
+    );
+    println!("Total with dictionary: {} bytes", encoded_total);
+    println!(
+        "Space saved: {} bytes ({:.1}% reduction)",
+        raw_total - encoded_total,
+        (1.0 - encoded_total as f64 / raw_total as f64) * 100.0
+    );
+
+    // Verify space savings
+    assert!(encoded_total < raw_total);
+}
+
+#[test]
+fn test_complete_rdf_quad_with_datatype() {
+    let mut dict = Dictionary::new();
+
+    // Complete RDF quad in N-Quads syntax:
+    // <https://rsp.js/event/temp-sensor-001> <http://www.w3.org/ns/saref#hasValue> "23.5"^^<http://www.w3.org/2001/XMLSchema#double> <https://example.org/graph/sensors> .
+    //
+    // This quad has 5 components that get stored in the dictionary:
+
+    let components = vec![
+        ("subject", "https://rsp.js/event/temp-sensor-001"),
+        ("predicate", "http://www.w3.org/ns/saref#hasValue"),
+        ("object_value", "23.5"), // Just the literal value
+        ("object_datatype", "http://www.w3.org/2001/XMLSchema#double"), // Datatype as separate URI
+        ("graph", "https://example.org/graph/sensors"),
+    ];
+
+    // Store all components and get their IDs
+    let mut component_ids = std::collections::HashMap::new();
+    for (name, value) in &components {
+        let id = dict.fetch_id(value);
+        component_ids.insert(*name, id);
+        println!("{}: '{}' → ID {}", name, value, id);
+    }
+
+    // In the actual record, we'd store:
+    // - timestamp (u64)
+    // - subject_id (u64)
+    // - predicate_id (u64)
+    // - object_value_id (u64)
+    // - graph_id (u64)
+    //
+    // The object_datatype_id would be stored in a separate metadata structure
+    // that tracks which object IDs are literals and what their datatypes are.
+
+    // Verify retrieval
+    assert_eq!(dict.fetch_uri(component_ids["subject"]), Some(components[0].1));
+    assert_eq!(dict.fetch_uri(component_ids["predicate"]), Some(components[1].1));
+    assert_eq!(dict.fetch_uri(component_ids["object_value"]), Some(components[2].1));
+    assert_eq!(dict.fetch_uri(component_ids["object_datatype"]), Some(components[3].1));
+    assert_eq!(dict.fetch_uri(component_ids["graph"]), Some(components[4].1));
+
+    // Another quad with the same datatype:
+    // <https://rsp.js/event/humidity-sensor-001> <http://www.w3.org/ns/saref#hasValue> "65.2"^^<http://www.w3.org/2001/XMLSchema#double> <https://example.org/graph/sensors> .
+
+    let subject2 = "https://rsp.js/event/humidity-sensor-001";
+    let value2 = "65.2";
+
+    let _subject2_id = dict.fetch_id(subject2);
+    let _value2_id = dict.fetch_id(value2);
+
+    // These components are REUSED (same ID returned):
+    let predicate2_id = dict.fetch_id("http://www.w3.org/ns/saref#hasValue");
+    let datatype2_id = dict.fetch_id("http://www.w3.org/2001/XMLSchema#double");
+    let graph2_id = dict.fetch_id("https://example.org/graph/sensors");
+
+    // Verify reuse
+    assert_eq!(predicate2_id, component_ids["predicate"]);
+    assert_eq!(datatype2_id, component_ids["object_datatype"]);
+    assert_eq!(graph2_id, component_ids["graph"]);
+
+    // Dictionary has: 5 original components + 2 new (subject2, value2) = 7 total
+    assert_eq!(dict.len(), 7);
+
+    println!("\n✓ Demonstrated RDF datatype handling with dictionary encoding");
+    println!("✓ Showed URI reuse across multiple quads (predicate, datatype, graph)");
+    println!("✓ Dictionary size: {} entries for 2 complete RDF quads", dict.len());
+}

From 928707693f58487cbe324d4c46c9920afbb35037 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:45:12 +0000
Subject: [PATCH 02/19] Initial plan


From c6e70656088a041d53af968746e9b95195f75408 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 5 Nov 2025 19:49:41 +0000
Subject: [PATCH 03/19] Apply code review suggestions and rustfmt fixes

Co-authored-by: argahsuknesib <87450516+argahsuknesib@users.noreply.github.com>
---
 src/indexing/dense.rs         | 12 ++++--------
 src/indexing/dictionary.rs    |  2 +-
 src/indexing/mod.rs           | 12 ++++++------
 src/indexing/shared.rs        |  8 ++++----
 src/indexing/sparse.rs        |  2 +-
 src/parsing/janusql_parser.rs |  2 +-
 6 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/src/indexing/dense.rs b/src/indexing/dense.rs
index 001998c..37cb3f9 100644
--- a/src/indexing/dense.rs
+++ b/src/indexing/dense.rs
@@ -27,20 +27,16 @@ impl DenseIndexBuilder {
     }
 }
 
-
-#[doc=""]
-pub fn build_dense_index(
-    log_path: &str,
-    index_path: &str,
-) -> std::io::Result<()> {
+#[doc = ""]
+pub fn build_dense_index(log_path: &str, index_path: &str) -> std::io::Result<()> {
     let mut log = File::open(log_path)?;
     let mut builder = DenseIndexBuilder::create(index_path)?;
 
     let mut offset = 0u64;
     let mut record = [0u8; RECORD_SIZE];
 
-    while log.read_exact(&mut record).is_ok(){
-        let (timestamp, _, _ , _, _ ) = decode_record(&record);
+    while log.read_exact(&mut record).is_ok() {
+        let (timestamp, _, _, _, _) = decode_record(&record);
         builder.add_entry(timestamp, offset)?;
         offset += RECORD_SIZE as u64;
     }
diff --git a/src/indexing/dictionary.rs b/src/indexing/dictionary.rs
index d34a7ac..e5605b2 100644
--- a/src/indexing/dictionary.rs
+++ b/src/indexing/dictionary.rs
@@ -61,7 +61,7 @@ impl Dictionary {
         file.read_exact(&mut count_bytes)?;
         let count = u64::from_be_bytes(count_bytes);
 
-        // Reading each IRI Entry
+        // Reading each URI Entry
 
         for id in 0..count {
             let mut len_bytes = [0u8; 4];
diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs
index 6f0579a..69d25b6 100644
--- a/src/indexing/mod.rs
+++ b/src/indexing/mod.rs
@@ -1,8 +1,8 @@
-#[doc=""]
-pub mod shared;
-#[doc=""]
+#[doc = ""]
 pub mod dense;
-#[doc=""]
-pub mod sparse;
-#[doc=""]
+#[doc = ""]
 pub mod dictionary;
+#[doc = ""]
+pub mod shared;
+#[doc = ""]
+pub mod sparse;
diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs
index 97a1155..dd344bc 100644
--- a/src/indexing/shared.rs
+++ b/src/indexing/shared.rs
@@ -1,6 +1,6 @@
+use crate::indexing::dictionary::Dictionary;
 use std::fs::File;
 use std::io::Write;
-use crate::indexing::dictionary::Dictionary;
 
 #[doc = ""]
 pub const RECORD_SIZE: usize = 40;
@@ -93,7 +93,7 @@ pub struct Event {
 }
 
 #[derive(Debug, Clone)]
-pub struct ResolvedEvent{
+pub struct ResolvedEvent {
     pub timestamp: u64,
     pub subject: String,
     pub predicate: String,
@@ -108,7 +108,7 @@ impl Event {
             subject: dict.fetch_uri(self.subject).unwrap_or("UNKNOWN").to_string(),
             predicate: dict.fetch_uri(self.predicate).unwrap_or("UNKNOWN").to_string(),
             object: dict.fetch_uri(self.object).unwrap_or("UNKNOWN").to_string(),
-            graph: dict.fetch_uri(self.graph).unwrap_or("UNKNOWN").to_string()
-         }
+            graph: dict.fetch_uri(self.graph).unwrap_or("UNKNOWN").to_string(),
+        }
     }
 }
diff --git a/src/indexing/sparse.rs b/src/indexing/sparse.rs
index 36f3a99..70a8ad1 100644
--- a/src/indexing/sparse.rs
+++ b/src/indexing/sparse.rs
@@ -1,4 +1,4 @@
-use crate::indexing::dictionary::{self, Dictionary};
+use crate::indexing::dictionary::Dictionary;
 use crate::indexing::shared::{decode_record, Event, ResolvedEvent, RECORD_SIZE};
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom, Write};
diff --git a/src/parsing/janusql_parser.rs b/src/parsing/janusql_parser.rs
index da8dafa..c023058 100644
--- a/src/parsing/janusql_parser.rs
+++ b/src/parsing/janusql_parser.rs
@@ -356,7 +356,7 @@ mod tests {
     }
 
     #[test]
-    fn test_mixed_windows(){
+    fn test_mixed_windows() {
         let parser = JanusQLParser::new().unwrap();
         let query = r#"
         PREFIX sensor: <https://rsp.js/sensors/>

From 5cabf50b3ab7d0fc3f4cad3e046ab2afe27a67c0 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 5 Nov 2025 22:34:46 +0100
Subject: [PATCH 04/19] Refactor CI integration tests and enhance JanusQL data
 structures with additional fields

---
 .github/workflows/ci.yml      | 41 -----------------------------------
 src/lib.rs                    |  3 +--
 src/parsing/janusql_parser.rs | 25 ++++++++++++++++++++-
 3 files changed, 25 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c6ba021..606b819 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -112,29 +112,6 @@ jobs:
   integration-test:
     name: Integration Tests
     runs-on: ubuntu-latest
-    services:
-      oxigraph:
-        image: oxigraph/oxigraph:latest
-        ports:
-          - 7878:7878
-        options: >-
-          --health-cmd "curl -f http://localhost:7878/query || exit 1"
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
-
-      fuseki:
-        image: stain/jena-fuseki:latest
-        ports:
-          - 3030:3030
-        env:
-          ADMIN_PASSWORD: admin
-          JVM_ARGS: "-Xmx2g"
-        options: >-
-          --health-cmd "curl -f http://localhost:3030/$/ping || exit 1"
-          --health-interval 10s
-          --health-timeout 5s
-          --health-retries 5
 
     steps:
       - name: Checkout code
@@ -161,26 +138,8 @@ jobs:
           path: target
           key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }}
 
-      - name: Wait for services
-        run: |
-          echo "Waiting for Oxigraph..."
-          timeout 60 bash -c 'until curl -f http://localhost:7878/query; do sleep 2; done'
-          echo "Waiting for Fuseki..."
-          timeout 60 bash -c 'until curl -f http://localhost:3030/$/ping; do sleep 2; done'
-
-      - name: Create Fuseki test dataset
-        run: |
-          curl -X POST http://localhost:3030/$/datasets \
-            -H "Content-Type: application/x-www-form-urlencoded" \
-            -d "dbName=ds&dbType=mem" \
-            --user admin:admin || true
-
       - name: Run integration tests
         run: cargo test --test '*' --all-features --verbose
-        env:
-          OXIGRAPH_ENDPOINT: http://localhost:7878
-          JENA_ENDPOINT: http://localhost:3030
-          JENA_DATASET: ds
 
   # Code coverage
   coverage:
diff --git a/src/lib.rs b/src/lib.rs
index b115ed0..5a5928a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -59,13 +59,12 @@ pub mod indexing;
 /// Module for parsing JanusQL queries
 pub mod parsing;
 
-#[doc = ""]
+/// Benchmarking utilities
 pub mod benchmarking {
 
     mod benchmark;
 }
 
-/// Module containing error types
 pub mod error {
     //! Error types and result definitions
 
diff --git a/src/parsing/janusql_parser.rs b/src/parsing/janusql_parser.rs
index da8dafa..8a49414 100644
--- a/src/parsing/janusql_parser.rs
+++ b/src/parsing/janusql_parser.rs
@@ -10,34 +10,55 @@ pub enum WindowType {
 
 #[derive(Debug, Clone)]
 pub struct WindowDefinition {
+    /// Name of the window
     pub window_name: String,
+    /// Name of the stream
     pub stream_name: String,
+    /// Width of the window
     pub width: u64,
+    /// Slide step
     pub slide: u64,
+    /// Offset for sliding windows
     pub offset: Option<u64>,
+    /// Start time for fixed windows
     pub start: Option<u64>,
+    /// End time for fixed windows
     pub end: Option<u64>,
+    /// Type of the window
     pub window_type: WindowType,
 }
 
+/// R2S operator definition
 #[derive(Debug, Clone)]
 pub struct R2SOperator {
+    /// Operator type
     pub operator: String,
+    /// Operator name
     pub name: String,
 }
 
+/// Parsed JanusQL query structure
 #[derive(Debug)]
 pub struct ParsedJanusQuery {
+    /// R2S operator if present
     pub r2s: Option<R2SOperator>,
+    /// Live windows defined in the query
     pub live_windows: Vec<WindowDefinition>,
+    /// Historical windows defined in the query
     pub historical_windows: Vec<WindowDefinition>,
+    /// RSPQL query string
     pub rspql_query: String,
+    /// SPARQL queries
     pub sparql_queries: Vec<String>,
+    /// Prefix mappings
     pub prefixes: HashMap<String, String>,
+    /// WHERE clause
     pub where_clause: String,
+    /// SELECT clause
     pub select_clause: String,
 }
 
+/// Parser for JanusQL queries
 pub struct JanusQLParser {
     historical_sliding_window_regex: Regex,
     historical_fixed_window_regex: Regex,
@@ -47,6 +68,7 @@ pub struct JanusQLParser {
 }
 
 impl JanusQLParser {
+    /// Creates a new JanusQLParser instance.
     pub fn new() -> Result<Self, Box<dyn std::error::Error>> {
         Ok(JanusQLParser {
             historical_sliding_window_regex: Regex::new(
@@ -110,6 +132,7 @@ impl JanusQLParser {
         Ok(None)
     }
 
+    /// Parses a JanusQL query string.
     pub fn parse(&self, query: &str) -> Result<ParsedJanusQuery, Box<dyn std::error::Error>> {
         let mut parsed = ParsedJanusQuery {
             r2s: None,
@@ -356,7 +379,7 @@ mod tests {
     }
 
     #[test]
-    fn test_mixed_windows(){
+    fn test_mixed_windows() {
         let parser = JanusQLParser::new().unwrap();
         let query = r#"
         PREFIX sensor: <https://rsp.js/sensors/>

From 69f1f9e27bab7e0b23724816e2c0091789fcb5a6 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Thu, 6 Nov 2025 13:49:04 +0100
Subject: [PATCH 05/19] Add comprehensive benchmarks for writing performance
 and analysis of indexing strategies

---
 Cargo.toml                 |  12 ++
 WRITING_BENCHMARKS.md      | 229 +++++++++++++++++++++++++++++++++++++
 benches/README.md          |  45 ++++++--
 benches/analysis.rs        | 196 +++++++++++++++++++++++++++++++
 benches/write_benchmark.rs | 216 ++++++++++++++++++++++++++++++++++
 run_benchmarks.sh          |  47 ++++++++
 6 files changed, 736 insertions(+), 9 deletions(-)
 create mode 100644 WRITING_BENCHMARKS.md
 create mode 100644 benches/analysis.rs
 create mode 100644 benches/write_benchmark.rs
 create mode 100755 run_benchmarks.sh

diff --git a/Cargo.toml b/Cargo.toml
index c199397..1ff8565 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,3 +35,15 @@ opt-level = 0
 
 [profile.test]
 opt-level = 0
+
+[[bench]]
+name = "benchmark"
+harness = false
+
+[[bench]]
+name = "write_benchmark"
+harness = false
+
+[[bench]]
+name = "analysis"
+harness = false
diff --git a/WRITING_BENCHMARKS.md b/WRITING_BENCHMARKS.md
new file mode 100644
index 0000000..85ee553
--- /dev/null
+++ b/WRITING_BENCHMARKS.md
@@ -0,0 +1,229 @@
+# Complete Guide: Testing Writing Performance for Dense vs Sparse Indexing
+
+## Overview
+
+This guide provides step-by-step instructions for testing and comparing the writing performance of Dense vs Sparse indexing approaches in the Janus RDF Stream Processing Engine.
+
+## Background
+
+Previously, the benchmarking only tested **reading performance** (querying existing indexes). Now we have comprehensive **writing performance** tests that measure:
+
+1. **Real-time indexing**: Building indexes while writing records
+2. **Batch indexing**: Writing all records first, then building indexes
+3. **Throughput comparison**: Records processed per second
+4. **Memory and storage efficiency**: Resource usage patterns
+
+## What's Been Added
+
+### New Benchmark Files
+
+1. **`write_benchmark.rs`** - Core writing performance tests
+2. **`analysis.rs`** - Advanced analysis and optimal configuration finding
+3. **`run_benchmarks.sh`** - Automated test runner script
+4. **Enhanced `README.md`** - Comprehensive documentation
+
+### Updated Configuration
+
+- Updated `Cargo.toml` with new benchmark entries
+- Added support for multiple test scenarios
+- Integrated analysis tools
+
+## Step-by-Step Testing Instructions
+
+### Step 1: Run the Complete Benchmark Suite
+
+```bash
+# Make script executable (if not already)
+chmod +x run_benchmarks.sh
+
+# Run all benchmarks
+./run_benchmarks.sh
+```
+
+This runs all three benchmark types in sequence and provides a comprehensive overview.
+
+### Step 2: Test Writing Performance Specifically
+
+```bash
+# Run only the writing performance benchmark
+cargo bench --bench write_benchmark
+```
+
+**What this tests:**
+- Real-time writing with indexing for 10K, 100K, and 1M records
+- Batch writing comparison
+- Performance ratios between dense and sparse approaches
+
+**Expected output:**
+```
+=== WRITING PERFORMANCE RESULTS ===
+Records: 100000
+Sparse interval: 1000
+
+--- Real-time Writing (Index while writing) ---
+Dense - Write time: 260.611 ms, Total time: 260.611 ms
+Sparse - Write time: 85.356 ms, Total time: 85.356 ms
+
+--- Performance Comparison ---
+Real-time: Sparse is 3.05x faster than Dense
+```
+
+### Step 3: Advanced Analysis
+
+```bash
+# Run detailed analysis
+cargo bench --bench analysis
+```
+
+**What this tests:**
+- Optimal sparse intervals (100, 500, 1000, 2000, 5000, 10000)
+- Memory usage scaling across different dataset sizes
+- Write throughput under various conditions
+
+### Step 4: Original Read Performance (For Comparison)
+
+```bash
+# Run original benchmark
+cargo bench --bench benchmark
+```
+
+**What this tests:**
+- Index building time from existing log files
+- Query performance across different ranges
+- Memory usage of indexes
+
+### Step 5: Individual Test Runs
+
+For targeted testing, you can run specific scenarios:
+
+```bash
+# Run with release optimizations for accurate timing
+cargo bench --bench write_benchmark --release
+
+# Run with specific test size (modify source code)
+# Edit the test_sizes vector in write_benchmark.rs
+```
+
+## Interpreting Results
+
+### Key Metrics to Focus On
+
+#### 1. Writing Throughput
+- **Records/second**: Higher is better
+- **Dense typically**: 300-500 records/sec for large datasets
+- **Sparse typically**: 1000-1500 records/sec for large datasets
+
+#### 2. Performance Ratios
+- **Real-time writing**: Sparse is typically 2-4x faster
+- **Batch processing**: Sparse is typically 2-3x faster
+- **Memory usage**: Sparse uses significantly less memory
+
+#### 3. Trade-offs
+- **Query speed**: Dense is typically 10-30% faster for queries
+- **Storage space**: Sparse uses 90-99% less index storage
+- **Write speed**: Sparse is 2-4x faster for writing
+
+### When to Use Each Approach
+
+#### Use Dense Indexing When:
+- Query performance is critical
+- Dataset size is manageable (< 1M records)
+- Storage space is not a constraint
+- Read-heavy workloads
+
+#### Use Sparse Indexing When:
+- High-frequency writes (streaming data)
+- Large datasets (> 1M records)
+- Storage efficiency is important
+- Write-heavy workloads
+- Real-time ingestion requirements
+
+### Sample Results Analysis
+
+```
+Real-time: Sparse is 3.05x faster than Dense
+Batch: Sparse is 2.44x faster than Dense
+```
+
+This shows:
+- Sparse indexing provides significant write performance benefits
+- The advantage is consistent across different writing patterns
+- Sparse indexing scales better with larger datasets
+
+## Customizing Tests
+
+### Modify Record Counts
+
+Edit the test sizes in `write_benchmark.rs`:
+
+```rust
+let test_sizes = vec![10_000u64, 100_000u64, 1_000_000u64, 5_000_000u64];
+```
+
+### Adjust Sparse Intervals
+
+Modify the `SPARSE_INTERVAL` constant:
+
+```rust
+const SPARSE_INTERVAL: usize = 500; // Test different intervals
+```
+
+### Add Custom Test Scenarios
+
+Create new benchmark functions following the existing patterns in the benchmark files.
+
+## Performance Optimization Tips
+
+### For Maximum Accuracy
+1. Run benchmarks on a quiet system (minimal background processes)
+2. Use release builds: `cargo bench --release`
+3. Run multiple iterations and average results
+4. Ensure consistent storage (SSD vs HDD considerations)
+
+### For Large Datasets
+1. Monitor memory usage during tests
+2. Consider disk I/O limitations
+3. Test with realistic data patterns
+4. Evaluate network storage implications
+
+## Troubleshooting
+
+### Common Issues
+
+#### Out of Memory
+- Reduce test dataset sizes
+- Monitor system memory during tests
+- Consider streaming vs batch processing
+
+#### Slow Performance
+- Ensure running in release mode
+- Check disk I/O capacity
+- Verify no other processes consuming resources
+
+#### Inconsistent Results
+- Run tests multiple times
+- Check system load
+- Ensure consistent test conditions
+
+## Next Steps
+
+### Additional Testing Ideas
+
+1. **Network Storage**: Test performance with network-attached storage
+2. **Concurrent Access**: Test multiple writers/readers simultaneously
+3. **Real-world Data**: Test with actual RDF datasets
+4. **Memory Pressure**: Test under various memory constraints
+5. **Different Hardware**: Compare SSD vs HDD performance
+
+### Integration Testing
+
+1. Test within larger application contexts
+2. Measure end-to-end pipeline performance
+3. Evaluate query pattern impacts
+4. Test with realistic data volumes and patterns
+
+## Conclusion
+
+The new writing performance benchmarks provide comprehensive insights into the trade-offs between dense and sparse indexing approaches. The results clearly show that sparse indexing provides significant advantages for write-heavy workloads while maintaining acceptable query performance.
+
+Use these tools to make informed decisions about indexing strategies based on your specific use case requirements.
\ No newline at end of file
diff --git a/benches/README.md b/benches/README.md
index a627df0..d0397d6 100644
--- a/benches/README.md
+++ b/benches/README.md
@@ -1,22 +1,49 @@
-# Benchmarks
+# RDF Indexing Benchmarks
 
-This directory contains performance benchmarks for the Janus RDF Stream Processing Engine.
+This directory contains comprehensive benchmarks for comparing different RDF indexing strategies in Janus.
 
-## Running Benchmarks
+## Available Benchmarks
 
-To run all benchmarks:
+### 1. `benchmark.rs` - Read Performance (Original)
+Tests query performance on pre-built indexes:
+- Index building time comparison
+- Query speed across different data ranges
+- Memory usage comparison
 
+### 2. `write_benchmark.rs` - Write Performance (New)
+Tests writing performance during record insertion:
+- Real-time indexing (index while writing)
+- Batch indexing (build index after writing)
+- Writing throughput comparison
+- Total processing time analysis
+
+### 3. `analysis.rs` - Advanced Analysis (New)
+Detailed analysis across multiple dimensions:
+- Optimal sparse interval analysis
+- Memory usage scaling
+- Write throughput under different conditions
+- Performance recommendations
+
+## Quick Start
+
+### Run All Benchmarks
 ```bash
-cargo bench
+./run_benchmarks.sh
 ```
 
-To run a specific benchmark:
-
+### Run Individual Benchmarks
 ```bash
-cargo bench --bench <benchmark_name>
+# Original read performance benchmark
+cargo bench --bench benchmark
+
+# New write performance benchmark  
+cargo bench --bench write_benchmark
+
+# Advanced analysis suite
+cargo bench --bench analysis
 ```
 
-## Benchmark Structure
+## Step-by-Step Testing Instructions
 
 Benchmarks are organized by functionality:
 
diff --git a/benches/analysis.rs b/benches/analysis.rs
new file mode 100644
index 0000000..7c0bf27
--- /dev/null
+++ b/benches/analysis.rs
@@ -0,0 +1,196 @@
+use janus::indexing::{dense, sparse};
+use std::fs;
+use std::time::Instant;
+
+/// Analyze different sparse intervals to find optimal configuration
+fn analyze_sparse_intervals() -> std::io::Result<()> {
+    println!("🔍 Analyzing Different Sparse Intervals");
+    println!("=====================================");
+    
+    let intervals = vec![100, 500, 1000, 2000, 5000, 10000];
+    let log_file = "data/benchmark/log.dat";
+    let number_records = 100_000u64;
+    
+    // Create test data
+    fs::create_dir_all("data/benchmark")?;
+    let mut writer = janus::indexing::shared::LogWriter::create(log_file)?;
+    for i in 0..number_records {
+        writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
+    }
+    writer.flush()?;
+    
+    println!("Testing {} records with different intervals:", number_records);
+    println!("{:-<80}", "");
+    println!("{:<10} {:<15} {:<15} {:<20} {:<15}", 
+             "Interval", "Build Time(ms)", "Index Size(KB)", "Space Savings(%)", "Query Time(ms)");
+    println!("{:-<80}", "");
+    
+    // Get dense index stats for comparison
+    let dense_start = Instant::now();
+    dense::build_dense_index(log_file, "data/benchmark/dense_ref.idx")?;
+    let dense_build_time = dense_start.elapsed();
+    let dense_reader = dense::DenseIndexReader::open("data/benchmark/dense_ref.idx")?;
+    let dense_size = dense_reader.index_size_bytes();
+    
+    // Test query performance on dense index
+    let query_start = Instant::now();
+    let _dense_results = dense_reader.query(log_file, 10000, 20000)?;
+    let dense_query_time = query_start.elapsed();
+    
+    for interval in intervals {
+        let index_file = format!("data/benchmark/sparse_{}.idx", interval);
+        
+        // Build sparse index
+        let start = Instant::now();
+        sparse::build_sparse_index(log_file, &index_file, &interval)?;
+        let build_time = start.elapsed();
+        
+        // Get size info
+        let reader = sparse::SparseReader::open(&index_file, interval)?;
+        let sparse_size = reader.index_size_bytes();
+        let space_savings = ((dense_size - sparse_size) as f64 / dense_size as f64) * 100.0;
+        
+        // Test query performance
+        let query_start = Instant::now();
+        let _sparse_results = reader.query(log_file, 10000, 20000)?;
+        let query_time = query_start.elapsed();
+        
+        println!("{:<10} {:<15.3} {:<15.2} {:<20.2} {:<15.3}",
+                 interval,
+                 build_time.as_secs_f64() * 1000.0,
+                 sparse_size as f64 / 1024.0,
+                 space_savings,
+                 query_time.as_secs_f64() * 1000.0);
+    }
+    
+    println!("{:-<80}", "");
+    println!("Dense Reference: Build: {:.3}ms, Size: {:.2}KB, Query: {:.3}ms",
+             dense_build_time.as_secs_f64() * 1000.0,
+             dense_size as f64 / 1024.0,
+             dense_query_time.as_secs_f64() * 1000.0);
+    
+    Ok(())
+}
+
+/// Analyze memory usage patterns
+fn analyze_memory_usage() -> std::io::Result<()> {
+    println!("\n🧠 Memory Usage Analysis");
+    println!("=======================");
+    
+    let record_counts = vec![10_000, 50_000, 100_000, 500_000, 1_000_000];
+    
+    println!("{:<12} {:<15} {:<15} {:<20}", 
+             "Records", "Dense Size(MB)", "Sparse Size(MB)", "Memory Ratio");
+    println!("{:-<62}", "");
+    
+    for &count in &record_counts {
+        let log_file = format!("data/benchmark/log_{}.dat", count);
+        let dense_index = format!("data/benchmark/dense_{}.idx", count);
+        let sparse_index = format!("data/benchmark/sparse_{}.idx", count);
+        
+        // Create test data
+        let mut writer = janus::indexing::shared::LogWriter::create(&log_file)?;
+        for i in 0..count {
+            writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
+        }
+        writer.flush()?;
+        
+        // Build indexes
+        dense::build_dense_index(&log_file, &dense_index)?;
+        sparse::build_sparse_index(&log_file, &sparse_index, &1000)?;
+        
+        // Get sizes
+        let dense_reader = dense::DenseIndexReader::open(&dense_index)?;
+        let sparse_reader = sparse::SparseReader::open(&sparse_index, 1000)?;
+        
+        let dense_size = dense_reader.index_size_bytes() as f64 / 1_000_000.0;
+        let sparse_size = sparse_reader.index_size_bytes() as f64 / 1_000_000.0;
+        let ratio = dense_size / sparse_size;
+        
+        println!("{:<12} {:<15.3} {:<15.3} {:<20.2}x",
+                 count, dense_size, sparse_size, ratio);
+    }
+    
+    Ok(())
+}
+
+/// Test write throughput under different conditions
+fn analyze_write_throughput() -> std::io::Result<()> {
+    println!("\n⚡ Write Throughput Analysis");
+    println!("===========================");
+    
+    let test_configs = vec![
+        ("Small batches", 1_000u64),
+        ("Medium batches", 10_000u64),
+        ("Large batches", 100_000u64),
+    ];
+    
+    println!("{:<15} {:<20} {:<20} {:<15}", 
+             "Batch Size", "Dense (rec/sec)", "Sparse (rec/sec)", "Speedup");
+    println!("{:-<70}", "");
+    
+    for (name, batch_size) in test_configs {
+        fs::create_dir_all("data/benchmark")?;
+        
+        // Test dense writing
+        let dense_log = "data/benchmark/dense_throughput.dat";
+        let dense_index = "data/benchmark/dense_throughput.idx";
+        
+        let start = Instant::now();
+        let mut log_writer = janus::indexing::shared::LogWriter::create(dense_log)?;
+        let mut index_builder = janus::indexing::dense::DenseIndexBuilder::create(dense_index)?;
+        
+        for i in 0..batch_size {
+            log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
+            index_builder.add_entry(i, i * 40)?;
+        }
+        log_writer.flush()?;
+        index_builder.finalize()?;
+        
+        let dense_time = start.elapsed();
+        let dense_throughput = batch_size as f64 / dense_time.as_secs_f64();
+        
+        // Test sparse writing
+        let sparse_log = "data/benchmark/sparse_throughput.dat";
+        let sparse_index = "data/benchmark/sparse_throughput.idx";
+        
+        let start = Instant::now();
+        let mut log_writer = janus::indexing::shared::LogWriter::create(sparse_log)?;
+        let mut index_builder = janus::indexing::sparse::SparseIndexBuilder::create(sparse_index, 1000)?;
+        
+        for i in 0..batch_size {
+            log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
+            index_builder.add_entry(i, i, i * 40)?;
+        }
+        log_writer.flush()?;
+        index_builder.finalize()?;
+        
+        let sparse_time = start.elapsed();
+        let sparse_throughput = batch_size as f64 / sparse_time.as_secs_f64();
+        
+        let speedup = sparse_throughput / dense_throughput;
+        
+        println!("{:<15} {:<20.0} {:<20.0} {:<15.2}x",
+                 name, dense_throughput, sparse_throughput, speedup);
+    }
+    
+    Ok(())
+}
+
+fn main() -> std::io::Result<()> {
+    println!("🔬 Advanced RDF Indexing Analysis Suite");
+    println!("=======================================");
+    
+    analyze_sparse_intervals()?;
+    analyze_memory_usage()?;
+    analyze_write_throughput()?;
+    
+    println!("\n✨ Analysis Complete!");
+    println!("\n💡 Recommendations:");
+    println!("  • Use sparse indexing for write-heavy workloads");
+    println!("  • Choose interval based on query precision requirements");
+    println!("  • Consider hybrid approaches for different use cases");
+    println!("  • Monitor memory usage with large datasets");
+    
+    Ok(())
+}
\ No newline at end of file
diff --git a/benches/write_benchmark.rs b/benches/write_benchmark.rs
new file mode 100644
index 0000000..dfd12ab
--- /dev/null
+++ b/benches/write_benchmark.rs
@@ -0,0 +1,216 @@
+use janus::indexing::{dense::DenseIndexBuilder, shared::LogWriter, sparse::SparseIndexBuilder};
+use std::fs;
+use std::time::Instant;
+
+const DATA_DIR: &str = "data/write_benchmark";
+const DENSE_LOG_FILE: &str = "data/write_benchmark/dense_log.dat";
+const SPARSE_LOG_FILE: &str = "data/write_benchmark/sparse_log.dat";
+const DENSE_INDEX_FILE: &str = "data/write_benchmark/dense.idx";
+const SPARSE_INDEX_FILE: &str = "data/write_benchmark/sparse.idx";
+const SPARSE_INTERVAL: usize = 1000;
+
+fn setup_dirs() -> std::io::Result<()> {
+    let _ = fs::remove_dir_all(DATA_DIR);
+    fs::create_dir_all(DATA_DIR)?;
+    Ok(())
+}
+
+/// Benchmark writing records with dense indexing
+/// This simulates real-time writing where each record is indexed immediately
+fn benchmark_dense_writing(number_records: u64) -> std::io::Result<(f64, f64)> {
+    println!("Benchmarking Dense Index Writing...");
+
+    let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?;
+    let mut index_builder = DenseIndexBuilder::create(DENSE_INDEX_FILE)?;
+
+    let start = Instant::now();
+    let mut current_offset = 0u64;
+
+    for i in 0..number_records {
+        let timestamp = i;
+        let subject = (i % 1000) as u64;
+        let predicate = (i % 500) as u64;
+        let object = (i % 2000) as u64;
+        let graph: u64 = 1;
+
+        // Write record to log
+        log_writer.append_record(timestamp, subject, predicate, object, graph)?;
+
+        // Add entry to index
+        index_builder.add_entry(timestamp, current_offset)?;
+
+        current_offset += 40; // RECORD_SIZE
+    }
+
+    let write_time = start.elapsed();
+
+    // Finalize both log and index
+    log_writer.flush()?;
+    index_builder.finalize()?;
+
+    let total_time = start.elapsed();
+
+    Ok((write_time.as_secs_f64(), total_time.as_secs_f64()))
+}
+
+/// Benchmark writing records with sparse indexing
+/// This simulates real-time writing where only periodic records are indexed
+fn benchmark_sparse_writing(number_records: u64) -> std::io::Result<(f64, f64)> {
+    println!("Benchmarking Sparse Index Writing...");
+
+    let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?;
+    let mut index_builder = SparseIndexBuilder::create(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?;
+
+    let start = Instant::now();
+    let mut current_offset = 0u64;
+
+    for i in 0..number_records {
+        let timestamp = i;
+        let subject = (i % 1000) as u64;
+        let predicate = (i % 500) as u64;
+        let object = (i % 2000) as u64;
+        let graph: u64 = 1;
+
+        // Write record to log
+        log_writer.append_record(timestamp, subject, predicate, object, graph)?;
+
+        // Add entry to index (will only add if i % interval == 0)
+        index_builder.add_entry(i, timestamp, current_offset)?;
+
+        current_offset += 40; // RECORD_SIZE
+    }
+
+    let write_time = start.elapsed();
+
+    // Finalize both log and index
+    log_writer.flush()?;
+    index_builder.finalize()?;
+
+    let total_time = start.elapsed();
+
+    Ok((write_time.as_secs_f64(), total_time.as_secs_f64()))
+}
+
+/// Benchmark batch writing vs real-time writing
+fn benchmark_batch_vs_realtime(number_records: u64) -> std::io::Result<()> {
+    println!("\n=== Batch vs Real-time Writing Comparison ===");
+
+    // Test 1: Real-time writing (as implemented above)
+    setup_dirs()?;
+    let (dense_write_time, dense_total_time) = benchmark_dense_writing(number_records)?;
+
+    setup_dirs()?;
+    let (sparse_write_time, sparse_total_time) = benchmark_sparse_writing(number_records)?;
+
+    // Test 2: Batch writing (write log first, then build index)
+    setup_dirs()?;
+    println!("Benchmarking Batch Dense Index Creation...");
+
+    let start = Instant::now();
+    let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?;
+    for i in 0..number_records {
+        let timestamp = i;
+        let subject = (i % 1000) as u64;
+        let predicate = (i % 500) as u64;
+        let object = (i % 2000) as u64;
+        let graph: u64 = 1;
+        log_writer.append_record(timestamp, subject, predicate, object, graph)?;
+    }
+    log_writer.flush()?;
+    let log_write_time = start.elapsed();
+
+    let start = Instant::now();
+    janus::indexing::dense::build_dense_index(DENSE_LOG_FILE, DENSE_INDEX_FILE)?;
+    let index_build_time = start.elapsed();
+    let batch_dense_total = log_write_time.as_secs_f64() + index_build_time.as_secs_f64();
+
+    // Batch sparse
+    setup_dirs()?;
+    println!("Benchmarking Batch Sparse Index Creation...");
+
+    let start = Instant::now();
+    let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?;
+    for i in 0..number_records {
+        let timestamp = i;
+        let subject = (i % 1000) as u64;
+        let predicate = (i % 500) as u64;
+        let object = (i % 2000) as u64;
+        let graph: u64 = 1;
+        log_writer.append_record(timestamp, subject, predicate, object, graph)?;
+    }
+    log_writer.flush()?;
+    let log_write_time = start.elapsed();
+
+    let start = Instant::now();
+    janus::indexing::sparse::build_sparse_index(
+        SPARSE_LOG_FILE,
+        SPARSE_INDEX_FILE,
+        &SPARSE_INTERVAL,
+    )?;
+    let index_build_time = start.elapsed();
+    let batch_sparse_total = log_write_time.as_secs_f64() + index_build_time.as_secs_f64();
+
+    // Print results
+    println!("\n=== WRITING PERFORMANCE RESULTS ===");
+    println!("Records: {}", number_records);
+    println!("Sparse interval: {}", SPARSE_INTERVAL);
+
+    println!("\n--- Real-time Writing (Index while writing) ---");
+    println!(
+        "Dense - Write time: {:.3} ms, Total time: {:.3} ms",
+        dense_write_time * 1000.0,
+        dense_total_time * 1000.0
+    );
+    println!(
+        "Sparse - Write time: {:.3} ms, Total time: {:.3} ms",
+        sparse_write_time * 1000.0,
+        sparse_total_time * 1000.0
+    );
+
+    println!("\n--- Batch Writing (Index after writing) ---");
+    println!(
+        "Dense - Log write: {:.3} ms, Index build: {:.3} ms, Total: {:.3} ms",
+        log_write_time.as_secs_f64() * 1000.0,
+        index_build_time.as_secs_f64() * 1000.0,
+        batch_dense_total * 1000.0
+    );
+    println!(
+        "Sparse - Log write: {:.3} ms, Index build: {:.3} ms, Total: {:.3} ms",
+        log_write_time.as_secs_f64() * 1000.0,
+        index_build_time.as_secs_f64() * 1000.0,
+        batch_sparse_total * 1000.0
+    );
+
+    println!("\n--- Performance Comparison ---");
+    let realtime_speedup = dense_total_time / sparse_total_time;
+    let batch_speedup = batch_dense_total / batch_sparse_total;
+
+    if realtime_speedup > 1.0 {
+        println!("Real-time: Sparse is {:.2}x faster than Dense", realtime_speedup);
+    } else {
+        println!("Real-time: Dense is {:.2}x faster than Sparse", 1.0 / realtime_speedup);
+    }
+
+    if batch_speedup > 1.0 {
+        println!("Batch: Sparse is {:.2}x faster than Dense", batch_speedup);
+    } else {
+        println!("Batch: Dense is {:.2}x faster than Sparse", 1.0 / batch_speedup);
+    }
+
+    Ok(())
+}
+
+fn main() -> std::io::Result<()> {
+    println!("RDF Writing Performance Benchmark: Dense vs Sparse");
+
+    let test_sizes = vec![10_000u64, 100_000u64, 1_000_000u64];
+
+    for &size in &test_sizes {
+        println!("\n{:=<60}", "");
+        println!("Testing with {} records", size);
+        println!("{:=<60}", "");
+        benchmark_batch_vs_realtime(size)?;
+    }
+
+    Ok(())
+}
diff --git a/run_benchmarks.sh b/run_benchmarks.sh
new file mode 100755
index 0000000..d44102f
--- /dev/null
+++ b/run_benchmarks.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Comprehensive benchmark script for testing Dense vs Sparse indexing approaches
+# This script tests both reading and writing performance
+
+echo "🚀 Starting Comprehensive RDF Indexing Benchmark Suite"
+echo "======================================================"
+
+# Create benchmarks directory if it doesn't exist
+mkdir -p data/benchmark
+mkdir -p data/write_benchmark
+
+echo ""
+echo "📊 Running Read Performance Benchmark (Current Implementation)"
+echo "--------------------------------------------------------------"
+cargo bench --bench benchmark
+
+echo ""
+echo "📝 Running Write Performance Benchmark (New Implementation)"
+echo "-----------------------------------------------------------"
+cargo bench --bench write_benchmark
+
+echo ""
+echo "🔬 Running Detailed Analysis"
+echo "-----------------------------"
+
+# Run additional analysis with different record sizes and intervals
+echo "Testing different sparse intervals..."
+
+# You can modify the intervals in the source code and run multiple tests
+# This demonstrates how to test different configurations
+
+echo ""
+echo "✅ Benchmark Suite Complete!"
+echo ""
+echo "📋 Summary of Tests Performed:"
+echo "  1. Read Performance (Query speed on existing indexes)"
+echo "  2. Write Performance (Index creation speed during writing)"
+echo "  3. Real-time vs Batch indexing comparison"
+echo "  4. Memory usage comparison"
+echo ""
+echo "💡 Key Metrics to Compare:"
+echo "  - Writing throughput (records/second)"
+echo "  - Index build time"
+echo "  - Memory usage"
+echo "  - Query performance trade-offs"
+echo "  - Storage space efficiency"
\ No newline at end of file

From c0df48769166dd83a6dfcc7968e1b9b1f76aa0e6 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Thu, 6 Nov 2025 13:49:08 +0100
Subject: [PATCH 06/19] Enhance writing performance benchmarks and update
 analysis scripts for improved insights on indexing strategies

---
 WRITING_BENCHMARKS.md |   2 +-
 benches/README.md     |   2 +-
 benches/analysis.rs   | 132 +++++++++++++++++++++++-------------------
 run_benchmarks.sh     |   2 +-
 4 files changed, 75 insertions(+), 63 deletions(-)

diff --git a/WRITING_BENCHMARKS.md b/WRITING_BENCHMARKS.md
index 85ee553..98896ef 100644
--- a/WRITING_BENCHMARKS.md
+++ b/WRITING_BENCHMARKS.md
@@ -226,4 +226,4 @@ Create new benchmark functions following the existing patterns in the benchmark
 
 The new writing performance benchmarks provide comprehensive insights into the trade-offs between dense and sparse indexing approaches. The results clearly show that sparse indexing provides significant advantages for write-heavy workloads while maintaining acceptable query performance.
 
-Use these tools to make informed decisions about indexing strategies based on your specific use case requirements.
\ No newline at end of file
+Use these tools to make informed decisions about indexing strategies based on your specific use case requirements.
diff --git a/benches/README.md b/benches/README.md
index d0397d6..2892fd1 100644
--- a/benches/README.md
+++ b/benches/README.md
@@ -99,4 +99,4 @@ To view results, open `target/criterion/report/index.html` in a browser.
 - Ensure system is idle during benchmarking
 - Use consistent hardware for comparisons
 - Run multiple iterations to reduce noise
-- Use `black_box()` to prevent compiler optimizations
\ No newline at end of file
+- Use `black_box()` to prevent compiler optimizations
diff --git a/benches/analysis.rs b/benches/analysis.rs
index 7c0bf27..60f020c 100644
--- a/benches/analysis.rs
+++ b/benches/analysis.rs
@@ -6,11 +6,11 @@ use std::time::Instant;
 fn analyze_sparse_intervals() -> std::io::Result<()> {
     println!("🔍 Analyzing Different Sparse Intervals");
     println!("=====================================");
-    
+
     let intervals = vec![100, 500, 1000, 2000, 5000, 10000];
     let log_file = "data/benchmark/log.dat";
     let number_records = 100_000u64;
-    
+
     // Create test data
     fs::create_dir_all("data/benchmark")?;
     let mut writer = janus::indexing::shared::LogWriter::create(log_file)?;
@@ -18,57 +18,63 @@ fn analyze_sparse_intervals() -> std::io::Result<()> {
         writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
     }
     writer.flush()?;
-    
+
     println!("Testing {} records with different intervals:", number_records);
     println!("{:-<80}", "");
-    println!("{:<10} {:<15} {:<15} {:<20} {:<15}", 
-             "Interval", "Build Time(ms)", "Index Size(KB)", "Space Savings(%)", "Query Time(ms)");
+    println!(
+        "{:<10} {:<15} {:<15} {:<20} {:<15}",
+        "Interval", "Build Time(ms)", "Index Size(KB)", "Space Savings(%)", "Query Time(ms)"
+    );
     println!("{:-<80}", "");
-    
+
     // Get dense index stats for comparison
     let dense_start = Instant::now();
     dense::build_dense_index(log_file, "data/benchmark/dense_ref.idx")?;
     let dense_build_time = dense_start.elapsed();
     let dense_reader = dense::DenseIndexReader::open("data/benchmark/dense_ref.idx")?;
     let dense_size = dense_reader.index_size_bytes();
-    
+
     // Test query performance on dense index
     let query_start = Instant::now();
     let _dense_results = dense_reader.query(log_file, 10000, 20000)?;
     let dense_query_time = query_start.elapsed();
-    
+
     for interval in intervals {
         let index_file = format!("data/benchmark/sparse_{}.idx", interval);
-        
+
         // Build sparse index
         let start = Instant::now();
         sparse::build_sparse_index(log_file, &index_file, &interval)?;
         let build_time = start.elapsed();
-        
+
         // Get size info
         let reader = sparse::SparseReader::open(&index_file, interval)?;
         let sparse_size = reader.index_size_bytes();
         let space_savings = ((dense_size - sparse_size) as f64 / dense_size as f64) * 100.0;
-        
+
         // Test query performance
         let query_start = Instant::now();
         let _sparse_results = reader.query(log_file, 10000, 20000)?;
         let query_time = query_start.elapsed();
-        
-        println!("{:<10} {:<15.3} {:<15.2} {:<20.2} {:<15.3}",
-                 interval,
-                 build_time.as_secs_f64() * 1000.0,
-                 sparse_size as f64 / 1024.0,
-                 space_savings,
-                 query_time.as_secs_f64() * 1000.0);
+
+        println!(
+            "{:<10} {:<15.3} {:<15.2} {:<20.2} {:<15.3}",
+            interval,
+            build_time.as_secs_f64() * 1000.0,
+            sparse_size as f64 / 1024.0,
+            space_savings,
+            query_time.as_secs_f64() * 1000.0
+        );
     }
-    
+
     println!("{:-<80}", "");
-    println!("Dense Reference: Build: {:.3}ms, Size: {:.2}KB, Query: {:.3}ms",
-             dense_build_time.as_secs_f64() * 1000.0,
-             dense_size as f64 / 1024.0,
-             dense_query_time.as_secs_f64() * 1000.0);
-    
+    println!(
+        "Dense Reference: Build: {:.3}ms, Size: {:.2}KB, Query: {:.3}ms",
+        dense_build_time.as_secs_f64() * 1000.0,
+        dense_size as f64 / 1024.0,
+        dense_query_time.as_secs_f64() * 1000.0
+    );
+
     Ok(())
 }
 
@@ -76,41 +82,42 @@ fn analyze_sparse_intervals() -> std::io::Result<()> {
 fn analyze_memory_usage() -> std::io::Result<()> {
     println!("\n🧠 Memory Usage Analysis");
     println!("=======================");
-    
+
     let record_counts = vec![10_000, 50_000, 100_000, 500_000, 1_000_000];
-    
-    println!("{:<12} {:<15} {:<15} {:<20}", 
-             "Records", "Dense Size(MB)", "Sparse Size(MB)", "Memory Ratio");
+
+    println!(
+        "{:<12} {:<15} {:<15} {:<20}",
+        "Records", "Dense Size(MB)", "Sparse Size(MB)", "Memory Ratio"
+    );
     println!("{:-<62}", "");
-    
+
     for &count in &record_counts {
         let log_file = format!("data/benchmark/log_{}.dat", count);
         let dense_index = format!("data/benchmark/dense_{}.idx", count);
         let sparse_index = format!("data/benchmark/sparse_{}.idx", count);
-        
+
         // Create test data
         let mut writer = janus::indexing::shared::LogWriter::create(&log_file)?;
         for i in 0..count {
             writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
         }
         writer.flush()?;
-        
+
         // Build indexes
         dense::build_dense_index(&log_file, &dense_index)?;
         sparse::build_sparse_index(&log_file, &sparse_index, &1000)?;
-        
+
         // Get sizes
         let dense_reader = dense::DenseIndexReader::open(&dense_index)?;
         let sparse_reader = sparse::SparseReader::open(&sparse_index, 1000)?;
-        
+
         let dense_size = dense_reader.index_size_bytes() as f64 / 1_000_000.0;
         let sparse_size = sparse_reader.index_size_bytes() as f64 / 1_000_000.0;
         let ratio = dense_size / sparse_size;
-        
-        println!("{:<12} {:<15.3} {:<15.3} {:<20.2}x",
-                 count, dense_size, sparse_size, ratio);
+
+        println!("{:<12} {:<15.3} {:<15.3} {:<20.2}x", count, dense_size, sparse_size, ratio);
     }
-    
+
     Ok(())
 }
 
@@ -118,79 +125,84 @@ fn analyze_memory_usage() -> std::io::Result<()> {
 fn analyze_write_throughput() -> std::io::Result<()> {
     println!("\n⚡ Write Throughput Analysis");
     println!("===========================");
-    
+
     let test_configs = vec![
         ("Small batches", 1_000u64),
         ("Medium batches", 10_000u64),
         ("Large batches", 100_000u64),
     ];
-    
-    println!("{:<15} {:<20} {:<20} {:<15}", 
-             "Batch Size", "Dense (rec/sec)", "Sparse (rec/sec)", "Speedup");
+
+    println!(
+        "{:<15} {:<20} {:<20} {:<15}",
+        "Batch Size", "Dense (rec/sec)", "Sparse (rec/sec)", "Speedup"
+    );
     println!("{:-<70}", "");
-    
+
     for (name, batch_size) in test_configs {
         fs::create_dir_all("data/benchmark")?;
-        
+
         // Test dense writing
         let dense_log = "data/benchmark/dense_throughput.dat";
         let dense_index = "data/benchmark/dense_throughput.idx";
-        
+
         let start = Instant::now();
         let mut log_writer = janus::indexing::shared::LogWriter::create(dense_log)?;
         let mut index_builder = janus::indexing::dense::DenseIndexBuilder::create(dense_index)?;
-        
+
         for i in 0..batch_size {
             log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
             index_builder.add_entry(i, i * 40)?;
         }
         log_writer.flush()?;
         index_builder.finalize()?;
-        
+
         let dense_time = start.elapsed();
         let dense_throughput = batch_size as f64 / dense_time.as_secs_f64();
-        
+
         // Test sparse writing
         let sparse_log = "data/benchmark/sparse_throughput.dat";
         let sparse_index = "data/benchmark/sparse_throughput.idx";
-        
+
         let start = Instant::now();
         let mut log_writer = janus::indexing::shared::LogWriter::create(sparse_log)?;
-        let mut index_builder = janus::indexing::sparse::SparseIndexBuilder::create(sparse_index, 1000)?;
-        
+        let mut index_builder =
+            janus::indexing::sparse::SparseIndexBuilder::create(sparse_index, 1000)?;
+
         for i in 0..batch_size {
             log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
             index_builder.add_entry(i, i, i * 40)?;
         }
         log_writer.flush()?;
         index_builder.finalize()?;
-        
+
         let sparse_time = start.elapsed();
         let sparse_throughput = batch_size as f64 / sparse_time.as_secs_f64();
-        
+
         let speedup = sparse_throughput / dense_throughput;
-        
-        println!("{:<15} {:<20.0} {:<20.0} {:<15.2}x",
-                 name, dense_throughput, sparse_throughput, speedup);
+
+        println!(
+            "{:<15} {:<20.0} {:<20.0} {:<15.2}x",
+            name, dense_throughput, sparse_throughput, speedup
+        );
     }
-    
+
     Ok(())
 }
 
 fn main() -> std::io::Result<()> {
     println!("🔬 Advanced RDF Indexing Analysis Suite");
     println!("=======================================");
-    
+
     analyze_sparse_intervals()?;
     analyze_memory_usage()?;
     analyze_write_throughput()?;
-    
+
     println!("\n✨ Analysis Complete!");
     println!("\n💡 Recommendations:");
     println!("  • Use sparse indexing for write-heavy workloads");
     println!("  • Choose interval based on query precision requirements");
     println!("  • Consider hybrid approaches for different use cases");
     println!("  • Monitor memory usage with large datasets");
-    
+
     Ok(())
-}
\ No newline at end of file
+}
diff --git a/run_benchmarks.sh b/run_benchmarks.sh
index d44102f..3c26418 100755
--- a/run_benchmarks.sh
+++ b/run_benchmarks.sh
@@ -44,4 +44,4 @@ echo "  - Writing throughput (records/second)"
 echo "  - Index build time"
 echo "  - Memory usage"
 echo "  - Query performance trade-offs"
-echo "  - Storage space efficiency"
\ No newline at end of file
+echo "  - Storage space efficiency"

From e1fef2c30b2acdb615b08e76def1407d12779844 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Fri, 7 Nov 2025 17:40:46 +0100
Subject: [PATCH 07/19] Implement storage module with StreamingSegmentedStorage
 and utility structures for efficient event handling and querying

---
 src/indexing/mod.rs              |   1 +
 src/lib.rs                       |   2 +
 src/main.rs                      | 147 ++++++++-
 src/storage/mod.rs               |   4 +
 src/storage/segmented_storage.rs | 502 +++++++++++++++++++++++++++++++
 src/storage/util.rs              |  57 ++++
 6 files changed, 700 insertions(+), 13 deletions(-)
 create mode 100644 src/storage/mod.rs
 create mode 100644 src/storage/segmented_storage.rs
 create mode 100644 src/storage/util.rs

diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs
index 69d25b6..6bda115 100644
--- a/src/indexing/mod.rs
+++ b/src/indexing/mod.rs
@@ -6,3 +6,4 @@ pub mod dictionary;
 pub mod shared;
 #[doc = ""]
 pub mod sparse;
+
diff --git a/src/lib.rs b/src/lib.rs
index 5a5928a..a1c6f88 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -65,6 +65,8 @@ pub mod benchmarking {
     mod benchmark;
 }
 
+pub mod storage;
+
 pub mod error {
     //! Error types and result definitions
 
diff --git a/src/main.rs b/src/main.rs
index 6e0e9f7..5247965 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,9 +2,12 @@
 //!
 //! This is the main entry point for the Janus command-line interface.
 
+use janus::indexing::shared::Event;
 use janus::indexing::{dense, shared::LogWriter, sparse};
+use janus::storage::segmented_storage::StreamingSegmentedStorage;
+use janus::storage::util::StreamingConfig;
 use std::fs;
-use std::time::Instant;
+use std::time::{Instant, SystemTime, UNIX_EPOCH};
 
 const DATA_DIR: &str = "data/benchmark";
 const LOG_FILE: &str = "data/benchmark/log.dat";
@@ -114,19 +117,137 @@ fn benchmark_queries() -> std::io::Result<()> {
     Ok(())
 }
 
-fn main() -> std::io::Result<()> {
-    println!("RDF Indexing Benchmark : Dense vs Sparse");
-    println!("Setting up data...");
-    let number_of_records = 1_000_000u64;
-    setup_data(number_of_records)?;
+// fn main() -> std::io::Result<()> {
+//     println!("RDF Indexing Benchmark : Dense vs Sparse");
+//     println!("Setting up data...");
+//     let number_of_records = 1_000_000u64;
+//     setup_data(number_of_records)?;
+
+//     benchmark_indexing()?;
+//     benchmark_queries()?;
+
+//     println!(
+//         "\n=== Summary ===\nSparse interval: {}\nUse this data to decide \
+//          which approach suits your use case best.",
+//         SPARSE_INTERVAL
+//     );
+//     Ok(())
+// }
+
+fn benchmark_storage_performance() -> std::io::Result<()> {
+    println!("=== WAL-Based Segmented Storage Performance Benchmark ===\n");
+
+    let record_counts = vec![100, 1000, 10000, 100000, 1000000];
+
+    for &num_records in &record_counts {
+        println!("Testing with {} records", num_records);
+        println!("──────────────────────────────────────────────────");
+
+        // Configure storage
+        let config = StreamingConfig {
+            max_wal_events: 5000,
+            max_wal_age_seconds: 30,
+            max_wal_bytes: 5 * 1024 * 1024,
+            sparse_interval: 100,
+            entries_per_index_block: 512,
+            segment_base_path: format!("./benchmark_data_{}", num_records),
+            ..Default::default()
+        };
+
+        // Clean up any existing data
+        let _ = std::fs::remove_dir_all(&config.segment_base_path);
+
+        let mut storage = StreamingSegmentedStorage::new(config.clone())?;
+        storage.start_background_flushing();
+
+        // Benchmark writes
+        println!("Writing {} records...", num_records);
+        let write_start = Instant::now();
+        let mut min_timestamp = u64::MAX;
+        let mut max_timestamp = 0u64;
+
+        for i in 0..num_records {
+            let timestamp =
+                SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 + i;
+            min_timestamp = min_timestamp.min(timestamp);
+            max_timestamp = max_timestamp.max(timestamp);
+
+            let event = Event {
+                timestamp,
+                subject: (i % 10) as u64,
+                predicate: 1,
+                object: (20 + (i % 10)) as u64,
+                graph: 1,
+            };
+            storage.write(event)?;
+        }
 
-    benchmark_indexing()?;
-    benchmark_queries()?;
+        let write_duration = write_start.elapsed();
+        let write_throughput = num_records as f64 / write_duration.as_secs_f64();
+
+        println!("Write Performance:");
+        println!("  Duration: {:.3}s", write_duration.as_secs_f64());
+        println!("  Throughput: {:.0} records/sec", write_throughput);
+        println!("  Timestamp range: {} to {}", min_timestamp, max_timestamp);
+
+        // Benchmark queries immediately after writing (data is still in WAL)
+        let query_ranges = vec![(0.1, "10% of data"), (0.5, "50% of data"), (1.0, "100% of data")];
+
+        println!("\nQuery Performance:");
+
+        for (fraction, description) in query_ranges {
+            let query_count = 100.min(num_records / 10); // Run 100 queries or 10% of records, whichever is smaller
+            let mut query_times = Vec::new();
+            let mut total_records_read = 0;
+
+            for q in 0..query_count {
+                // Use a deterministic but varied offset for queries within the actual data range
+                let timestamp_range = max_timestamp - min_timestamp;
+                let start_offset =
+                    (timestamp_range as f64 * fraction * (q as f64 / query_count as f64)) as u64;
+                let query_window = (timestamp_range as f64 * 0.01).max(100.0) as u64; // 1% of data or 100 records minimum
+
+                let start_timestamp = min_timestamp + start_offset;
+                let end_timestamp = (start_timestamp + query_window).min(max_timestamp);
+
+                let query_start = Instant::now();
+                let results = storage.query(start_timestamp, end_timestamp)?;
+                let query_duration = query_start.elapsed();
+
+                total_records_read += results.len();
+                query_times.push(query_duration.as_secs_f64());
+            }
+
+            let avg_query_time = query_times.iter().sum::<f64>() / query_times.len() as f64;
+            let queries_per_sec = 1.0 / avg_query_time;
+            let total_query_time = query_times.iter().sum::<f64>();
+            let records_per_sec = if total_query_time > 0.0 {
+                total_records_read as f64 / total_query_time
+            } else {
+                0.0
+            };
+            let avg_records_per_query = total_records_read as f64 / query_count as f64;
+            let min_time = query_times.iter().cloned().fold(f64::INFINITY, f64::min);
+            let max_time = query_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
+
+            println!("  {} queries ({}):", description, query_count);
+            println!("    Avg query time: {:.3}ms", avg_query_time * 1000.0);
+            println!("    Query throughput: {:.1} queries/sec", queries_per_sec);
+            println!("    Read throughput: {:.0} records/sec", records_per_sec);
+            println!("    Avg records per query: {:.1}", avg_records_per_query);
+            println!("    Total records read: {}", total_records_read);
+            println!("    Min/Max time: {:.3}ms / {:.3}ms", min_time * 1000.0, max_time * 1000.0);
+        }
 
-    println!(
-        "\n=== Summary ===\nSparse interval: {}\nUse this data to decide \
-         which approach suits your use case best.",
-        SPARSE_INTERVAL
-    );
+        // Force flush remaining WAL data and shutdown
+        storage.shutdown()?;
+        println!();
+    }
+
+    println!("Benchmark completed!");
     Ok(())
 }
+
+fn main() -> std::io::Result<()> {
+    benchmark_storage_performance()
+}
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
new file mode 100644
index 0000000..221ddeb
--- /dev/null
+++ b/src/storage/mod.rs
@@ -0,0 +1,4 @@
+#[doc=""]
+pub mod util;
+#[doc = ""]
+pub mod segmented_storage;
diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs
new file mode 100644
index 0000000..f8a6500
--- /dev/null
+++ b/src/storage/segmented_storage.rs
@@ -0,0 +1,502 @@
+use core::time;
+use std::{
+    collections::VecDeque,
+    fmt::format,
+    io::{BufWriter, Read, Seek, SeekFrom, Write},
+    ops::Index,
+    panic::set_hook,
+    sync::{Arc, Mutex, RwLock},
+    thread::JoinHandle,
+    time::{Duration, SystemTime, UNIX_EPOCH},
+};
+
+use crate::{
+    config,
+    indexing::{
+        shared::{decode_record, encode_record, Event, RECORD_SIZE},
+        sparse,
+    },
+    storage::{
+        self,
+        util::{EnhancedSegmentMetadata, IndexBlock, StreamingConfig, WAL},
+    },
+};
+
+pub struct StreamingSegmentedStorage {
+    wal: Arc<RwLock<WAL>>,
+    segments: Arc<RwLock<Vec<EnhancedSegmentMetadata>>>,
+    flush_handle: Option<JoinHandle<()>>,
+    shutdown_signal: Arc<Mutex<bool>>,
+    config: StreamingConfig,
+}
+
+impl StreamingSegmentedStorage {
+    pub fn new(config: StreamingConfig) -> std::io::Result<Self> {
+        std::fs::create_dir_all(&config.segment_base_path)?;
+
+        let storage = Self {
+            wal: Arc::new(RwLock::new(WAL {
+                events: VecDeque::new(),
+                total_bytes: 0,
+                oldest_timestamp_bound: None,
+                newest_timestamp_bound: None,
+            })),
+
+            segments: Arc::new(RwLock::new(Vec::new())),
+            flush_handle: None,
+            shutdown_signal: Arc::new(Mutex::new(false)),
+            config,
+        };
+        storage.load_existing_segments()?;
+        Ok(storage)
+    }
+
+    pub fn start_background_flushing(&mut self) {
+        let wal_clone = Arc::clone(&self.wal);
+        let segments_clone = Arc::clone(&self.segments);
+        let shutdown_clone = Arc::clone(&self.shutdown_signal);
+        let config_clone = self.config.clone();
+
+        let handle = std::thread::spawn(move || {
+            Self::background_flush_loop(wal_clone, segments_clone, shutdown_clone, config_clone);
+        });
+
+        self.flush_handle = Some(handle);
+    }
+
+    pub fn write(&self, event: Event) -> std::io::Result<()> {
+        let event_size = std::mem::size_of::<Event>();
+
+        {
+            let mut wal = self.wal.write().unwrap();
+
+            if wal.oldest_timestamp_bound.is_none() {
+                wal.oldest_timestamp_bound = Some(event.timestamp);
+            }
+
+            wal.newest_timestamp_bound = Some(event.timestamp);
+
+            wal.total_bytes += event_size;
+
+            wal.events.push_back(event);
+        }
+
+        if self.should_flush() {
+            self.flush_wal_to_segment()?;
+        }
+
+        Ok(())
+    }
+
+    fn should_flush(&self) -> bool {
+        let wal = self.wal.read().unwrap();
+
+        wal.events.len() >= self.config.max_wal_events.try_into().unwrap()
+            || wal.total_bytes > self.config.max_wal_bytes
+            || wal.oldest_timestamp_bound.map_or(false, |oldest| {
+                let current_timestamp = Self::current_timestamp();
+
+                // Use saturating subtraction to avoid underflow if oldest > current_timestamp
+                current_timestamp.saturating_sub(oldest)
+                    >= self.config.max_wal_age_seconds * 1_000_000_000
+            })
+    }
+
+    fn current_timestamp() -> u64 {
+        SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64
+    }
+
+    fn flush_wal_to_segment(&self) -> std::io::Result<()> {
+        // Automatically extract events from the WAL.
+
+        let events_to_flush = {
+            let mut wal = self.wal.write().unwrap();
+            if wal.events.is_empty() {
+                return Ok(());
+            }
+
+            let events: Vec<Event> = wal.events.drain(..).collect();
+
+            wal.total_bytes = 0;
+            wal.oldest_timestamp_bound = None;
+            wal.newest_timestamp_bound = None;
+            events
+        };
+
+        let segment = self.create_segment_with_two_level_index(events_to_flush)?;
+
+        {
+            let mut segments = self.segments.write().unwrap();
+            segments.push(segment);
+        }
+        Ok(())
+    }
+
+    fn create_segment_with_two_level_index(
+        &self,
+        mut events: Vec<Event>,
+    ) -> std::io::Result<EnhancedSegmentMetadata> {
+        events.sort_by_key(|e| e.timestamp);
+
+        let segment_id = Self::generate_segment_id();
+
+        let data_path = format!("{}/segment-{}.log", self.config.segment_base_path, segment_id);
+        let index_path = format!("{}/segment-{}.log", self.config.segment_base_path, segment_id);
+
+        let mut data_file = BufWriter::new(std::fs::File::create(&data_path)?);
+        let mut index_file = BufWriter::new(std::fs::File::create(&index_path)?);
+
+        let mut index_directory = Vec::new();
+        let mut current_block_entries = Vec::new();
+
+        let mut current_block_min_ts = None;
+        let mut current_block_max_ts = 0u64;
+
+        let mut data_offset = 0u64;
+
+        for (record_count, event) in events.iter().enumerate() {
+            let record_bytes = self.serialize_event_to_fixed_size(event);
+            data_file.write_all(&record_bytes);
+
+            if record_count % self.config.sparse_interval == 0 {
+                let sparse_entry = (event.timestamp, data_offset);
+
+                if current_block_min_ts.is_none() {
+                    current_block_min_ts = Some(event.timestamp);
+                }
+
+                current_block_max_ts = event.timestamp;
+                current_block_entries.push(sparse_entry);
+
+                if current_block_entries.len() >= self.config.entries_per_index_block {
+                    let block_metadata = self.flush_index_block(
+                        &mut index_file,
+                        &current_block_entries,
+                        current_block_min_ts.unwrap(),
+                        current_block_max_ts,
+                    )?;
+
+                    index_directory.push(block_metadata);
+
+                    current_block_entries.clear();
+                    current_block_min_ts = None;
+                }
+            }
+            data_offset += record_bytes.len() as u64;
+        }
+
+        if !current_block_entries.is_empty() {
+            let block_metadata = self.flush_index_block(
+                &mut index_file,
+                &current_block_entries,
+                current_block_min_ts.unwrap(),
+                current_block_max_ts,
+            )?;
+
+            index_directory.push(block_metadata);
+        }
+
+        data_file.flush()?;
+        index_file.flush()?;
+
+        Ok(EnhancedSegmentMetadata {
+            start_timstamp: events.first().unwrap().timestamp,
+            end_timestamp: events.last().unwrap().timestamp,
+            data_path,
+            index_path,
+            record_count: events.len() as u64,
+            index_directory,
+        })
+    }
+
+    fn flush_index_block(
+        &self,
+        index_file: &mut BufWriter<std::fs::File>,
+        entries: &[(u64, u64)],
+        min_ts: u64,
+        max_ts: u64,
+    ) -> std::io::Result<IndexBlock> {
+        let file_offset = index_file.stream_position()?;
+
+        for (timestamp, offset) in entries {
+            index_file.write_all(&timestamp.to_ne_bytes())?;
+            index_file.write_all(&offset.to_be_bytes())?;
+        }
+
+        Ok(IndexBlock {
+            min_timestamp: min_ts,
+            max_timestamp: max_ts,
+            file_offset,
+            entry_count: entries.len() as u32,
+        })
+    }
+
+    pub fn query(&self, start_timestamp: u64, end_timestamp: u64) -> std::io::Result<Vec<Event>> {
+        let mut results = Vec::new();
+
+        // First try to query the immediate WAL which has the fastest visibility.
+
+        {
+            let wal = self.wal.read().unwrap();
+
+            for event in &wal.events {
+                if event.timestamp >= start_timestamp && event.timestamp <= end_timestamp {
+                    results.push(event.clone());
+                }
+            }
+        }
+
+        // Then querying the relevant segment with a two level indexing
+
+        {
+            let segments = self.segments.read().unwrap();
+            for segment in segments.iter() {
+                if self.segment_overlaps(segment, start_timestamp, end_timestamp) {
+                    let segment_results =
+                        self.query_segment_two_level(segment, start_timestamp, end_timestamp)?;
+                    results.extend(segment_results);
+                }
+            }
+        }
+
+        results.sort_by_key(|e| e.timestamp);
+
+        Ok(results)
+    }
+
+    fn query_segment_two_level(
+        &self,
+        segment: &EnhancedSegmentMetadata,
+        start_timestamp: u64,
+        end_timestamp: u64,
+    ) -> std::io::Result<Vec<Event>> {
+        // If we have index directory, use two-level indexing
+        if !segment.index_directory.is_empty() {
+            // Step 1 : Find relevant index blocks using in-memory directory
+            let relevant_blocks: Vec<&IndexBlock> = segment
+                .index_directory
+                .iter()
+                .filter(|block| {
+                    block.min_timestamp <= end_timestamp && block.max_timestamp >= start_timestamp
+                })
+                .collect();
+
+            if relevant_blocks.is_empty() {
+                return Ok(Vec::new());
+            }
+
+            // Step 2 : Load only the relevant blocks from the disk
+            let sparse_entries =
+                self.load_relevant_index_blocks(&segment.index_path, &relevant_blocks)?;
+
+            if sparse_entries.is_empty() {
+                return Ok(Vec::new());
+            }
+
+            // Step 3 : Binary search the loaded entries
+            let lb = sparse_entries.partition_point(|(ts, _)| *ts < start_timestamp);
+            let start_position = lb.saturating_sub(1);
+            let start_offset = sparse_entries[start_position].1;
+
+            // Step 4 : Sequential Scan from the checkpoint
+            self.scan_data_from_offset(
+                &segment.data_path,
+                start_offset,
+                start_timestamp,
+                end_timestamp,
+            )
+        } else {
+            // Fallback: Full scan of the data file (for segments without loaded index)
+            self.scan_data_from_offset(&segment.data_path, 0, start_timestamp, end_timestamp)
+        }
+    }
+
+    fn load_relevant_index_blocks(
+        &self,
+        index_path: &str,
+        blocks: &[&IndexBlock],
+    ) -> std::io::Result<Vec<(u64, u64)>> {
+        let mut index_file = std::fs::File::open(index_path)?;
+        let mut sparse_entries = Vec::new();
+
+        for block in blocks {
+            index_file.seek(SeekFrom::Start(block.file_offset))?;
+
+            let block_size = block.entry_count as usize * 16; // 16 bytes per entry.
+            let mut buffer = vec![0u8; block_size];
+            index_file.read_exact(&mut buffer)?;
+
+            // Parse the entries.
+
+            for chunk in buffer.chunks_exact(16) {
+                let timestamp = u64::from_be_bytes(chunk[0..8].try_into().unwrap());
+                let offset = u64::from_be_bytes(chunk[8..16].try_into().unwrap());
+                sparse_entries.push((timestamp, offset));
+            }
+        }
+
+        sparse_entries.sort_by_key(|&(ts, _)| ts);
+        Ok(sparse_entries)
+    }
+
+    fn scan_data_from_offset(
+        &self,
+        data_path: &str,
+        start_offset: u64,
+        start_timestamp: u64,
+        end_timestamp: u64,
+    ) -> std::io::Result<Vec<Event>> {
+        let mut file = std::fs::File::open(data_path)?;
+        file.seek(SeekFrom::Start(start_offset))?;
+
+        let mut results = Vec::new();
+        let mut record = [0u8; RECORD_SIZE];
+
+        while file.read_exact(&mut record).is_ok() {
+            let (timestamp, subject, predicate, object, graph) = decode_record(&record);
+
+            if timestamp > end_timestamp {
+                break;
+            }
+
+            if timestamp >= start_timestamp {
+                results.push(Event { timestamp, subject, predicate, object, graph });
+            }
+        }
+        Ok(results)
+    }
+
+    fn segment_overlaps(
+        &self,
+        segment: &EnhancedSegmentMetadata,
+        start_ts: u64,
+        end_ts: u64,
+    ) -> bool {
+        segment.start_timstamp <= end_ts && segment.end_timestamp >= start_ts
+    }
+
+    fn background_flush_loop(
+        wal: Arc<RwLock<WAL>>,
+        segments: Arc<RwLock<Vec<EnhancedSegmentMetadata>>>,
+        shutdown_signal: Arc<Mutex<bool>>,
+        config: StreamingConfig,
+    ) {
+        while !*shutdown_signal.lock().unwrap() {
+            std::thread::sleep(Duration::from_secs(1));
+
+            // Check if flush is needed or not.
+
+            let should_flush = {
+                let wal = wal.read().unwrap();
+
+                wal.events.len() >= config.max_wal_bytes
+                    || wal.total_bytes >= config.max_wal_bytes
+                    || wal.oldest_timestamp_bound.map_or(false, |oldest| {
+                        let current_timestamp = Self::current_timestamp();
+                        (current_timestamp - oldest) >= config.max_wal_age_seconds * 1_000_000_000
+                    })
+            };
+
+            if should_flush {
+                // TODO : Add better error handling here in this case
+                if let Err(e) = Self::flush_background(wal.clone(), segments.clone(), &config) {
+                    eprintln!("Background flush failed: {}", e);
+                }
+            }
+        }
+    }
+
+    fn flush_background(
+        wal: Arc<RwLock<WAL>>,
+        segments: Arc<RwLock<Vec<EnhancedSegmentMetadata>>>,
+        config: &StreamingConfig,
+    ) -> std::io::Result<()> {
+        Ok(())
+    }
+
+    fn load_existing_segments(&self) -> std::io::Result<()> {
+        use std::fs;
+
+        let segment_dir = &self.config.segment_base_path;
+        if !fs::metadata(segment_dir).is_ok() {
+            return Ok(());
+        }
+
+        let entries = fs::read_dir(segment_dir)?;
+        let mut segments = Vec::new();
+
+        for entry in entries {
+            let entry = entry?;
+            let path = entry.path();
+
+            if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
+                if filename.starts_with("segment-") && filename.ends_with(".log") {
+                    // Extract segment ID from filename
+                    if let Some(id_str) =
+                        filename.strip_prefix("segment-").and_then(|s| s.strip_suffix(".log"))
+                    {
+                        if let Ok(segment_id) = id_str.parse::<u64>() {
+                            // Try to load the segment metadata by reading the data file
+                            let data_path = format!("{}/segment-{}.log", segment_dir, segment_id);
+                            let index_path = format!("{}/segment-{}.log", segment_dir, segment_id);
+
+                            if let Ok(_metadata) = fs::metadata(&data_path) {
+                                // For now, create a basic segment metadata with wide timestamp bounds
+                                // In a full implementation, we'd parse the index file to get exact bounds
+                                let segment = EnhancedSegmentMetadata {
+                                    start_timstamp: 0, // Wide range to ensure overlap checks pass
+                                    end_timestamp: u64::MAX,
+                                    data_path,
+                                    index_path,
+                                    record_count: 0, // Will be determined during scanning
+                                    index_directory: Vec::new(), // Empty - will fall back to full scan
+                                };
+                                segments.push(segment);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Sort segments by start timestamp
+        segments.sort_by_key(|s| s.start_timstamp);
+
+        {
+            let mut self_segments = self.segments.write().unwrap();
+            *self_segments = segments;
+        }
+
+        Ok(())
+    }
+
+    pub fn shutdown(&mut self) -> std::io::Result<()> {
+        *self.shutdown_signal.lock().unwrap() = true;
+
+        // Final Flush
+
+        self.flush_wal_to_segment();
+
+        if let Some(handle) = self.flush_handle.take() {
+            handle.join().unwrap();
+        }
+        Ok(())
+    }
+
+    fn serialize_event_to_fixed_size(&self, event: &Event) -> Vec<u8> {
+        let mut record = [0u8; RECORD_SIZE];
+        encode_record(
+            &mut record,
+            event.timestamp,
+            event.subject,
+            event.predicate,
+            event.object,
+            event.graph,
+        );
+        record.to_vec()
+    }
+
+    fn generate_segment_id() -> u64 {
+        SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64
+    }
+}
diff --git a/src/storage/util.rs b/src/storage/util.rs
new file mode 100644
index 0000000..e19e0e8
--- /dev/null
+++ b/src/storage/util.rs
@@ -0,0 +1,57 @@
+use std::collections::{VecDeque, HashMap};
+use std::sync::{Arc, RwLock, Mutex};
+use std::thread::JoinHandle;
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+
+use crate::indexing::shared::Event;
+
+#[derive(Debug)]
+pub struct WAL {
+    pub events: VecDeque<Event>,
+    pub total_bytes: usize,
+    pub oldest_timestamp_bound: Option<u64>,
+    pub newest_timestamp_bound: Option<u64>,
+}
+
+#[derive(Debug, Clone)]
+pub struct IndexBlock {
+    pub min_timestamp: u64,
+    pub max_timestamp: u64,
+    pub file_offset: u64,
+    pub entry_count: u32,
+}
+
+#[derive(Debug, Clone)]
+pub struct EnhancedSegmentMetadata {
+    pub start_timstamp: u64,
+    pub end_timestamp: u64,
+    pub data_path: String,
+    pub index_path: String,
+    pub record_count: u64,
+    pub index_directory: Vec<IndexBlock>,
+}
+
+#[derive(Clone)]
+pub struct StreamingConfig {
+    pub max_wal_events: u64,
+    pub max_wal_age_seconds: u64,
+    pub max_wal_bytes: usize,
+    pub sparse_interval: usize,
+    pub entries_per_index_block: usize,
+    pub segment_base_path: String,
+}
+
+impl Default for StreamingConfig {
+    fn default() -> Self {
+        Self {
+            max_wal_bytes: 10 * 1024 * 1024,
+            max_wal_age_seconds: 60,
+            max_wal_events: 100_000,
+            sparse_interval: 1000,
+            entries_per_index_block: 1024,
+            segment_base_path: "./data".to_string(),
+        }
+    }
+}
+
+

From 0b6e7fdef847060651af48207ff63553e065a09a Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Fri, 7 Nov 2025 18:50:29 +0100
Subject: [PATCH 08/19] Refactor and enhance RDF storage and indexing

- Introduced a new core module for data structures and types, including Event and RDFEvent.
- Updated the storage module to include a new indexing structure with dense and sparse indexing capabilities.
- Implemented a user-friendly API for writing and querying RDF events in StreamingSegmentedStorage.
- Added benchmarks for RDF segmented storage, including writing and reading performance tests.
- Created a dictionary for encoding and decoding RDF URIs to numeric IDs, improving storage efficiency.
- Enhanced the dense and sparse indexing mechanisms to support efficient querying of RDF events.
- Added comprehensive tests for the dictionary and encoding/decoding functionality.
---
 Cargo.toml                                    |   1 +
 src/{benchmarking => benchmarks}/benchmark.rs |   3 +-
 src/{benchmarking => benchmarks}/mod.rs       |   1 -
 src/core/encoding.rs                          |  59 ++++++
 src/core/mod.rs                               |  36 ++++
 src/indexing/dictionary.rs                    |  88 ---------
 src/indexing/mod.rs                           |   9 +-
 src/indexing/shared.rs                        |  95 ++--------
 src/lib.rs                                    |   8 +-
 src/main.rs                                   | 114 +++++++++++-
 src/{ => storage}/indexing/dense.rs           |   2 +-
 src/storage/indexing/dictionary.rs            | 171 ++++++++++++++++++
 src/{ => storage}/indexing/sparse.rs          |   8 +-
 src/storage/mod.rs                            |   7 +-
 src/storage/segmented_storage.rs              |  40 +++-
 src/storage/util.rs                           |   2 +-
 16 files changed, 449 insertions(+), 195 deletions(-)
 rename src/{benchmarking => benchmarks}/benchmark.rs (97%)
 rename src/{benchmarking => benchmarks}/mod.rs (64%)
 create mode 100644 src/core/encoding.rs
 create mode 100644 src/core/mod.rs
 delete mode 100644 src/indexing/dictionary.rs
 rename src/{ => storage}/indexing/dense.rs (97%)
 create mode 100644 src/storage/indexing/dictionary.rs
 rename src/{ => storage}/indexing/sparse.rs (97%)

diff --git a/Cargo.toml b/Cargo.toml
index 1ff8565..4f9485e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,7 @@ readme = "README.md"
 [dependencies]
 regex = "1.0"
 serde = { version = "1.0", features = ["derive"] }
+bincode = "1.0"
 
 [dev-dependencies]
 
diff --git a/src/benchmarking/benchmark.rs b/src/benchmarks/benchmark.rs
similarity index 97%
rename from src/benchmarking/benchmark.rs
rename to src/benchmarks/benchmark.rs
index 92b7f69..d64ec37 100644
--- a/src/benchmarking/benchmark.rs
+++ b/src/benchmarks/benchmark.rs
@@ -1,4 +1,5 @@
-use crate::indexing::{dense, shared::LogWriter, sparse};
+use crate::storage::indexing::{dense, sparse};
+use crate::indexing::shared::LogWriter;
 use std::fs;
 use std::time::Instant;
 
diff --git a/src/benchmarking/mod.rs b/src/benchmarks/mod.rs
similarity index 64%
rename from src/benchmarking/mod.rs
rename to src/benchmarks/mod.rs
index 9820c42..fe2015f 100644
--- a/src/benchmarking/mod.rs
+++ b/src/benchmarks/mod.rs
@@ -1,2 +1 @@
-#[doc=""]
 pub mod benchmark;
\ No newline at end of file
diff --git a/src/core/encoding.rs b/src/core/encoding.rs
new file mode 100644
index 0000000..faf8a9a
--- /dev/null
+++ b/src/core/encoding.rs
@@ -0,0 +1,59 @@
+//! Binary encoding/decoding utilities for RDF events
+
+use crate::storage::indexing::dictionary::Dictionary;
+use crate::core::{Event, RDFEvent};
+
+/// Size of a single encoded record in bytes
+pub const RECORD_SIZE: usize = 40;
+
+/// Encode an RDF event record into a byte buffer
+pub fn encode_record(
+    buffer: &mut [u8; RECORD_SIZE],
+    timestamp: u64,
+    subject: u64,
+    predicate: u64,
+    object: u64,
+    graph: u64,
+) {
+    buffer[0..8].copy_from_slice(&timestamp.to_le_bytes());
+    buffer[8..16].copy_from_slice(&subject.to_le_bytes());
+    buffer[16..24].copy_from_slice(&predicate.to_le_bytes());
+    buffer[24..32].copy_from_slice(&object.to_le_bytes());
+    buffer[32..40].copy_from_slice(&graph.to_le_bytes());
+}
+
+/// Decode a byte buffer into an RDF event record
+pub fn decode_record(buffer: &[u8; RECORD_SIZE]) -> (u64, u64, u64, u64, u64) {
+    let timestamp = u64::from_le_bytes(buffer[0..8].try_into().unwrap());
+    let subject = u64::from_le_bytes(buffer[8..16].try_into().unwrap());
+    let predicate = u64::from_le_bytes(buffer[16..24].try_into().unwrap());
+    let object = u64::from_le_bytes(buffer[24..32].try_into().unwrap());
+    let graph = u64::from_le_bytes(buffer[32..40].try_into().unwrap());
+    (timestamp, subject, predicate, object, graph)
+}
+
+impl RDFEvent {
+    /// Encode this RDF event to an internal Event using a dictionary
+    pub fn encode(&self, dict: &mut Dictionary) -> Event {
+        Event {
+            timestamp: self.timestamp,
+            subject: dict.encode(&self.subject),
+            predicate: dict.encode(&self.predicate),
+            object: dict.encode(&self.object),
+            graph: dict.encode(&self.graph),
+        }
+    }
+}
+
+impl Event {
+    /// Decode this internal Event to an RDFEvent using a dictionary
+    pub fn decode(&self, dict: &Dictionary) -> RDFEvent {
+        RDFEvent {
+            timestamp: self.timestamp,
+            subject: dict.decode(self.subject).unwrap_or("UNKNOWN").to_string(),
+            predicate: dict.decode(self.predicate).unwrap_or("UNKNOWN").to_string(),
+            object: dict.decode(self.object).unwrap_or("UNKNOWN").to_string(),
+            graph: dict.decode(self.graph).unwrap_or("UNKNOWN").to_string(),
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/core/mod.rs b/src/core/mod.rs
new file mode 100644
index 0000000..42f5529
--- /dev/null
+++ b/src/core/mod.rs
@@ -0,0 +1,36 @@
+//! Core data structures and types for Janus RDF Stream Processing Engine
+
+/// Internal storage event with encoded IDs
+#[derive(Clone, Debug)]
+pub struct Event {
+    pub timestamp: u64,
+    pub subject: u64,
+    pub predicate: u64,
+    pub object: u64,
+    pub graph: u64,
+}
+
+/// User-facing RDF event with URI strings
+#[derive(Debug, Clone)]
+pub struct RDFEvent {
+    pub timestamp: u64,
+    pub subject: String,
+    pub predicate: String,
+    pub object: String,
+    pub graph: String,
+}
+
+impl RDFEvent {
+    pub fn new(timestamp: u64, subject: &str, predicate: &str, object: &str, graph: &str) -> Self {
+        Self {
+            timestamp,
+            subject: subject.to_string(),
+            predicate: predicate.to_string(),
+            object: object.to_string(),
+            graph: graph.to_string(),
+        }
+    }
+}
+
+pub mod encoding;
+pub use encoding::*;
\ No newline at end of file
diff --git a/src/indexing/dictionary.rs b/src/indexing/dictionary.rs
deleted file mode 100644
index e5605b2..0000000
--- a/src/indexing/dictionary.rs
+++ /dev/null
@@ -1,88 +0,0 @@
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::{Read, Write};
-use std::path::Path;
-
-#[derive(Debug)]
-pub struct Dictionary {
-    uri_to_id: HashMap<String, u64>,
-    id_to_uri: Vec<String>,
-    next_id: u64,
-}
-
-impl Dictionary {
-    pub fn new() -> Self {
-        Self { uri_to_id: HashMap::new(), id_to_uri: Vec::new(), next_id: 0 }
-    }
-
-    pub fn fetch_id(&mut self, uri: &str) -> u64 {
-        if let Some(&id) = self.uri_to_id.get(uri) {
-            id
-        } else {
-            let id = self.next_id;
-            self.uri_to_id.insert(uri.to_string(), id);
-            self.id_to_uri.push(uri.to_string());
-            self.next_id += 1;
-            id
-        }
-    }
-
-    pub fn fetch_uri(&self, id: u64) -> Option<&str> {
-        self.id_to_uri.get(id as usize).map(|s| s.as_str())
-    }
-
-    pub fn len(&self) -> usize {
-        self.uri_to_id.len()
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.uri_to_id.is_empty()
-    }
-
-    pub fn save_to_file(&self, path: &Path) -> std::io::Result<()> {
-        let mut file = File::create(path)?;
-        file.write_all(&(self.id_to_uri.len() as u64).to_be_bytes())?;
-
-        for uri in &self.id_to_uri {
-            let uri_bytes = uri.as_bytes();
-            file.write_all(&(uri_bytes.len() as u32).to_be_bytes())?;
-            file.write_all(uri_bytes)?;
-        }
-        Ok(())
-    }
-
-    pub fn load_from_file(path: &Path) -> std::io::Result<Self> {
-        let mut file = File::open(path)?;
-        let mut uri_to_id = HashMap::new();
-        let mut id_to_uri = Vec::new();
-
-        // Reading the number of entries
-        let mut count_bytes = [0u8; 8];
-        file.read_exact(&mut count_bytes)?;
-        let count = u64::from_be_bytes(count_bytes);
-
-        // Reading each URI Entry
-
-        for id in 0..count {
-            let mut len_bytes = [0u8; 4];
-            file.read_exact(&mut len_bytes)?;
-
-            let length = u32::from_be_bytes(len_bytes) as usize;
-            let mut uri_bytes = vec![0u8; length];
-            file.read_exact(&mut uri_bytes)?;
-            let uri = String::from_utf8(uri_bytes)
-                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
-
-            uri_to_id.insert(uri.clone(), id);
-            id_to_uri.push(uri);
-        }
-
-        Ok(Self { uri_to_id, id_to_uri, next_id: count })
-    }
-}
-
-impl Default for Dictionary {
-    fn default() -> Self {
-        Self::new()
-    }
-}
diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs
index 6bda115..3bc786f 100644
--- a/src/indexing/mod.rs
+++ b/src/indexing/mod.rs
@@ -1,9 +1,4 @@
-#[doc = ""]
-pub mod dense;
-#[doc = ""]
-pub mod dictionary;
-#[doc = ""]
+//! Legacy indexing utilities - most functionality moved to storage::indexing
+
 pub mod shared;
-#[doc = ""]
-pub mod sparse;
 
diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs
index dd344bc..bbc875f 100644
--- a/src/indexing/shared.rs
+++ b/src/indexing/shared.rs
@@ -1,56 +1,26 @@
-use crate::indexing::dictionary::Dictionary;
+//! Legacy storage utilities - to be moved to storage module
+
 use std::fs::File;
 use std::io::Write;
+use crate::core::encoding::{encode_record, RECORD_SIZE};
 
-#[doc = ""]
-pub const RECORD_SIZE: usize = 40;
-
-#[doc = ""]
-pub fn encode_record(
-    buffer: &mut [u8; RECORD_SIZE],
-    timestamp: u64,
-    subject: u64,
-    predicate: u64,
-    object: u64,
-    graph: u64,
-) {
-    buffer[0..8].copy_from_slice(&timestamp.to_le_bytes());
-    buffer[8..16].copy_from_slice(&subject.to_le_bytes());
-    buffer[16..24].copy_from_slice(&predicate.to_le_bytes());
-    buffer[24..32].copy_from_slice(&object.to_le_bytes());
-    buffer[32..40].copy_from_slice(&graph.to_le_bytes());
-}
-
-#[doc = ""]
-pub fn decode_record(buffer: &[u8; RECORD_SIZE]) -> (u64, u64, u64, u64, u64) {
-    let timestamp = u64::from_le_bytes(buffer[0..8].try_into().unwrap());
-    let subject = u64::from_le_bytes(buffer[8..16].try_into().unwrap());
-    let predicate = u64::from_le_bytes(buffer[16..24].try_into().unwrap());
-    let object = u64::from_le_bytes(buffer[24..32].try_into().unwrap());
-    let graph = u64::from_le_bytes(buffer[32..40].try_into().unwrap());
-    (timestamp, subject, predicate, object, graph)
-}
-
-#[doc = ""]
+/// Log writer for appending encoded records to a file
 pub struct LogWriter {
     log_file: File,
     record_count: u64,
 }
 
-#[doc = ""]
 impl LogWriter {
-    #[doc = ""]
+    /// Create a new log writer for the given file path
     pub fn create(path: &str) -> std::io::Result<Self> {
-        let log_file = match File::create(path) {
-            Ok(file) => file,
-            Err(error) => {
-                return Err(error);
-            }
-        };
-        Ok(Self { log_file, record_count: 0 })
+        let log_file = File::create(path)?;
+        Ok(Self {
+            log_file,
+            record_count: 0
+        })
     }
 
-    #[doc = ""]
+    /// Append an encoded record to the log file
     pub fn append_record(
         &mut self,
         timestamp: u64,
@@ -66,49 +36,22 @@ impl LogWriter {
         Ok(())
     }
 
-    #[doc = ""]
+    /// Get the current record count
     pub fn record_count(&self) -> u64 {
         self.record_count
     }
 
-    #[doc = ""]
+    /// Flush the log file
     pub fn flush(&mut self) -> std::io::Result<()> {
         self.log_file.flush()
     }
 }
 
-#[derive(Clone, Debug)]
-#[doc = ""]
-pub struct Event {
-    #[doc = ""]
-    pub timestamp: u64,
-    #[doc = ""]
-    pub subject: u64,
-    #[doc = ""]
-    pub predicate: u64,
-    #[doc = ""]
-    pub object: u64,
-    #[doc = ""]
-    pub graph: u64,
-}
 
-#[derive(Debug, Clone)]
-pub struct ResolvedEvent {
-    pub timestamp: u64,
-    pub subject: String,
-    pub predicate: String,
-    pub object: String,
-    pub graph: String,
-}
 
-impl Event {
-    pub fn resolve(&self, dict: &Dictionary) -> ResolvedEvent {
-        ResolvedEvent {
-            timestamp: self.timestamp,
-            subject: dict.fetch_uri(self.subject).unwrap_or("UNKNOWN").to_string(),
-            predicate: dict.fetch_uri(self.predicate).unwrap_or("UNKNOWN").to_string(),
-            object: dict.fetch_uri(self.object).unwrap_or("UNKNOWN").to_string(),
-            graph: dict.fetch_uri(self.graph).unwrap_or("UNKNOWN").to_string(),
-        }
-    }
-}
+
+
+
+
+
+
diff --git a/src/lib.rs b/src/lib.rs
index a1c6f88..c1e58f7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -28,10 +28,8 @@
 #![warn(missing_docs)]
 #![warn(clippy::all)]
 
-/// Core module containing the main engine logic
-pub mod core {
-    //! Core functionality for the Janus engine
-}
+/// Core data structures and types
+pub mod core;
 
 /// Module for handling RDF stores
 pub mod store {
@@ -60,7 +58,7 @@ pub mod indexing;
 pub mod parsing;
 
 /// Benchmarking utilities
-pub mod benchmarking {
+pub mod benchmarks {
 
     mod benchmark;
 }
diff --git a/src/main.rs b/src/main.rs
index 5247965..934bbbf 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,18 +2,118 @@
 //!
 //! This is the main entry point for the Janus command-line interface.
 
-use janus::indexing::shared::Event;
-use janus::indexing::{dense, shared::LogWriter, sparse};
+use janus::core::Event;
+use janus::storage::indexing::{dense, sparse};
+use janus::indexing::shared::LogWriter;
 use janus::storage::segmented_storage::StreamingSegmentedStorage;
 use janus::storage::util::StreamingConfig;
 use std::fs;
-use std::time::{Instant, SystemTime, UNIX_EPOCH};
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
 const DATA_DIR: &str = "data/benchmark";
 const LOG_FILE: &str = "data/benchmark/log.dat";
 const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx";
 const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx";
 const SPARSE_INTERVAL: usize = 1000;
+const SEGMENT_BASE_PATH: &str = "data/rdf_benchmark";
+
+fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
+    println!("🚀 RDF Segmented Storage Benchmark");
+    println!("==================================");
+
+    // Clean up and create directories
+    let _ = fs::remove_dir_all(SEGMENT_BASE_PATH);
+    fs::create_dir_all(SEGMENT_BASE_PATH)?;
+
+    // Configure storage
+    let config = StreamingConfig {
+        max_wal_events: 10_000,
+        max_wal_age_seconds: 60,
+        max_wal_bytes: 1_000_000,
+        sparse_interval: 1000,
+        entries_per_index_block: 100,
+        segment_base_path: SEGMENT_BASE_PATH.to_string(),
+    };
+
+    let mut storage = StreamingSegmentedStorage::new(config)?;
+    storage.start_background_flushing();
+
+    // Benchmark writing 1 million RDF events
+    println!("\n📝 Writing 1,000,000 RDF events...");
+    let start_time = Instant::now();
+    let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64;
+
+    for i in 0..1_000_000u64 {
+        let timestamp = base_timestamp + i * 1_000_000; // 1ms intervals
+        let subject = format!("http://example.org/person/person_{}", i % 10000);
+        let predicate = match i % 10 {
+            0..=3 => "http://example.org/knows",
+            4..=6 => "http://example.org/worksAt", 
+            7..=8 => "http://example.org/livesIn",
+            _ => "http://example.org/hasAge",
+        };
+        let object = match i % 10 {
+            0..=3 => format!("http://example.org/person/person_{}", (i + 1) % 10000),
+            4..=6 => format!("http://example.org/organization/org_{}", i % 1000),
+            7..=8 => format!("http://example.org/location/city_{}", i % 100),
+            _ => format!("\"{}\"^^http://www.w3.org/2001/XMLSchema#integer", 20 + (i % 60)),
+        };
+        let graph = format!("http://example.org/graph/graph_{}", i % 100);
+
+        storage.write_rdf(timestamp, &subject, predicate, &object, &graph)?;
+
+        if i > 0 && i % 100_000 == 0 {
+            println!("  ✓ Written {} events", i);
+        }
+    }
+
+    let write_duration = start_time.elapsed();
+    let write_throughput = 1_000_000.0 / write_duration.as_secs_f64();
+
+    println!("✅ Write completed!");
+    println!("   Duration: {:.3} seconds", write_duration.as_secs_f64());
+    println!("   Throughput: {:.0} events/sec", write_throughput);
+
+    // Wait a bit for background flushing
+    std::thread::sleep(Duration::from_secs(2));
+
+    // Benchmark reading different amounts of data
+    println!("\n🔍 Reading Benchmarks");
+    println!("====================");
+
+    let read_sizes = vec![100, 1_000, 10_000, 100_000, 1_000_000];
+    
+    for &size in &read_sizes {
+        // Query the first 'size' events
+        let query_start_ts = base_timestamp;
+        let query_end_ts = base_timestamp + (size as u64 * 1_000_000);
+
+        println!("\n📖 Querying {} events...", size);
+        let start_time = Instant::now();
+        
+        let results = storage.query_rdf(query_start_ts, query_end_ts)?;
+        
+        let query_duration = start_time.elapsed();
+        let read_throughput = results.len() as f64 / query_duration.as_secs_f64();
+
+        println!("   Results found: {}", results.len());
+        println!("   Query time: {:.3} ms", query_duration.as_millis());
+        println!("   Read throughput: {:.0} events/sec", read_throughput);
+
+        // Show a sample result for verification
+        if !results.is_empty() {
+            let sample = &results[0];
+            println!("   Sample result: {} {} {} in {} at {}", 
+                     sample.subject, sample.predicate, sample.object, sample.graph, sample.timestamp);
+        }
+    }
+
+    // Shutdown storage
+    storage.shutdown()?;
+
+    println!("\n🎉 Benchmark completed successfully!");
+    Ok(())
+}
 
 fn setup_data(number_records: u64) -> std::io::Result<()> {
     let _ = fs::remove_dir_all(DATA_DIR);
@@ -249,5 +349,13 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
 }
 
 fn main() -> std::io::Result<()> {
+    // Run the new RDF benchmark
+    benchmark_segmented_storage_rdf()?;
+    
+    println!("\n{}", "=".repeat(50));
+    println!("Running legacy benchmark for comparison...");
+    println!("{}", "=".repeat(50));
+    
+    // Also run the old benchmark for comparison
     benchmark_storage_performance()
 }
diff --git a/src/indexing/dense.rs b/src/storage/indexing/dense.rs
similarity index 97%
rename from src/indexing/dense.rs
rename to src/storage/indexing/dense.rs
index 37cb3f9..8f7eb35 100644
--- a/src/indexing/dense.rs
+++ b/src/storage/indexing/dense.rs
@@ -1,4 +1,4 @@
-use crate::indexing::shared::{decode_record, Event, RECORD_SIZE};
+use crate::core::{encoding::{decode_record, RECORD_SIZE}, Event};
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom, Write};
 #[doc = ""]
diff --git a/src/storage/indexing/dictionary.rs b/src/storage/indexing/dictionary.rs
new file mode 100644
index 0000000..4729756
--- /dev/null
+++ b/src/storage/indexing/dictionary.rs
@@ -0,0 +1,171 @@
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{Read, Write};
+use std::path::Path;
+
+use bincode;
+use serde::{Deserialize, Serialize};
+
+use crate::core::Event;
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Dictionary {
+    pub string_to_id: HashMap<String, u64>,
+    pub id_to_uri: HashMap<u64, String>,
+    pub next_id: u64,
+}
+
+impl Dictionary {
+    pub fn new() -> Self {
+        Dictionary { string_to_id: HashMap::new(), id_to_uri: HashMap::new(), next_id: 0 }
+    }
+
+    pub fn encode(&mut self, value: &str) -> u64 {
+        if let Some(&id) = self.string_to_id.get(value) {
+            id
+        } else {
+            let id = self.next_id;
+            self.string_to_id.insert(value.to_string(), id);
+            self.id_to_uri.insert(id, value.to_string());
+            self.next_id += 1;
+            id
+        }
+    }
+
+    pub fn decode(&self, id: u64) -> Option<&str> {
+        self.id_to_uri.get(&id).map(|s| s.as_str())
+    }
+
+    pub fn save_to_file(&self, path: &Path) -> std::io::Result<()> {
+        let encoded = bincode::serialize(self)
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+        let mut file = File::create(path)?;
+        file.write_all(&encoded)?;
+        Ok(())
+    }
+
+    pub fn load_from_file(path: &Path) -> std::io::Result<Self> {
+        let mut file = File::open(path)?;
+        let mut buffer = Vec::new();
+        file.read_to_end(&mut buffer)?;
+        let dict: Dictionary = bincode::deserialize(&buffer)
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+        Ok(dict)
+    }
+
+    pub fn decode_graph(&self, event: &Event) -> String {
+        let subject = self.decode(event.subject).unwrap_or("unknown");
+        let predicate = self.decode(event.predicate).unwrap_or("unknown");
+        let object = self.decode(event.object).unwrap_or("unknown");
+        let graph = self.decode(event.graph).unwrap_or("unknown");
+
+        format!(
+            "<(<{}>, <{}>, <{}>, <{}>), {}>",
+            subject, predicate, object, graph, event.timestamp
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::indexing::shared::Event;
+
+    #[test]
+    fn test_dictionary_encoding_decoding() {
+        let mut dict = Dictionary::new();
+
+        // Encode some RDF terms
+        let subject_id = dict.encode("http://example.org/person/Alice");
+        let predicate_id = dict.encode("http://example.org/knows");
+        let object_id = dict.encode("http://example.org/person/Bob");
+        let graph_id = dict.encode("http://example.org/graph1");
+
+        println!("Encoded IDs:");
+        println!("Subject: {} -> {}", "http://example.org/person/Alice", subject_id);
+        println!("Predicate: {} -> {}", "http://example.org/knows", predicate_id);
+        println!("Object: {} -> {}", "http://example.org/person/Bob", object_id);
+        println!("Graph: {} -> {}", "http://example.org/graph1", graph_id);
+
+        // Create an event
+        let event = Event {
+            timestamp: 1234567890,
+            subject: subject_id,
+            predicate: predicate_id,
+            object: object_id,
+            graph: graph_id,
+        };
+
+        // Decode the event
+        let decoded = dict.decode_graph(&event);
+        println!("\nDecoded event: {}", decoded);
+
+        // Verify individual decodings
+        assert_eq!(dict.decode(subject_id), Some("http://example.org/person/Alice"));
+        assert_eq!(dict.decode(predicate_id), Some("http://example.org/knows"));
+        assert_eq!(dict.decode(object_id), Some("http://example.org/person/Bob"));
+        assert_eq!(dict.decode(graph_id), Some("http://example.org/graph1"));
+
+        // Test that the decoded string contains the expected format
+        assert!(decoded.contains("http://example.org/person/Alice"));
+        assert!(decoded.contains("http://example.org/knows"));
+        assert!(decoded.contains("http://example.org/person/Bob"));
+        assert!(decoded.contains("http://example.org/graph1"));
+        assert!(decoded.contains("1234567890"));
+    }
+
+    #[test]
+    fn test_clean_rdf_api() {
+        use crate::indexing::shared::RDFEvent;
+
+        let mut dict = Dictionary::new();
+
+        // Test the clean API - user provides URIs directly
+        let rdf_event = RDFEvent::new(
+            1234567890,
+            "http://example.org/person/Alice",
+            "http://example.org/knows",
+            "http://example.org/person/Bob",
+            "http://example.org/graph1",
+        );
+
+        // Encoding happens internally
+        let encoded_event = rdf_event.encode(&mut dict);
+
+        // Decoding happens internally
+        let decoded_event = encoded_event.decode(&dict);
+
+        // Verify the round-trip works
+        assert_eq!(decoded_event.subject, "http://example.org/person/Alice");
+        assert_eq!(decoded_event.predicate, "http://example.org/knows");
+        assert_eq!(decoded_event.object, "http://example.org/person/Bob");
+        assert_eq!(decoded_event.graph, "http://example.org/graph1");
+        assert_eq!(decoded_event.timestamp, 1234567890);
+
+        println!("✅ Clean API test passed!");
+        println!(
+            "Original: {} {} {} in {} at timestamp {}",
+            rdf_event.subject,
+            rdf_event.predicate,
+            rdf_event.object,
+            rdf_event.graph,
+            rdf_event.timestamp
+        );
+        println!(
+            "Encoded IDs: {} {} {} {} at timestamp {}",
+            encoded_event.subject,
+            encoded_event.predicate,
+            encoded_event.object,
+            encoded_event.graph,
+            encoded_event.timestamp
+        );
+        println!(
+            "Decoded: {} {} {} in {} at timestamp {}",
+            decoded_event.subject,
+            decoded_event.predicate,
+            decoded_event.object,
+            decoded_event.graph,
+            decoded_event.timestamp
+        );
+    }
+}
diff --git a/src/indexing/sparse.rs b/src/storage/indexing/sparse.rs
similarity index 97%
rename from src/indexing/sparse.rs
rename to src/storage/indexing/sparse.rs
index 70a8ad1..79cc08d 100644
--- a/src/indexing/sparse.rs
+++ b/src/storage/indexing/sparse.rs
@@ -1,5 +1,5 @@
-use crate::indexing::dictionary::Dictionary;
-use crate::indexing::shared::{decode_record, Event, ResolvedEvent, RECORD_SIZE};
+use crate::storage::indexing::dictionary::Dictionary;
+use crate::core::{encoding::{decode_record, RECORD_SIZE}, Event, RDFEvent};
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::path::Path;
@@ -191,9 +191,9 @@ impl SparseReader {
         dict: &Dictionary,
         timestamp_start_bound: u64,
         timestamp_end_bound: u64,
-    ) -> std::io::Result<Vec<ResolvedEvent>> {
+    ) -> std::io::Result<Vec<RDFEvent>> {
         let events = self.query(log_path, timestamp_start_bound, timestamp_end_bound)?;
-        Ok(events.into_iter().map(|e| e.resolve(dict)).collect())
+        Ok(events.into_iter().map(|e| e.decode(dict)).collect())
     }
 
     /// Opens a sparse index file and loads it into memory.
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index 221ddeb..3b55564 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -1,4 +1,7 @@
-#[doc=""]
 pub mod util;
-#[doc = ""]
 pub mod segmented_storage;
+pub mod indexing {
+    pub mod dense;
+    pub mod dictionary;
+    pub mod sparse;
+}
diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs
index f8a6500..f3d956d 100644
--- a/src/storage/segmented_storage.rs
+++ b/src/storage/segmented_storage.rs
@@ -11,13 +11,11 @@ use std::{
 };
 
 use crate::{
-    config,
-    indexing::{
-        shared::{decode_record, encode_record, Event, RECORD_SIZE},
-        sparse,
-    },
+    core::{encoding::{decode_record, encode_record, RECORD_SIZE}, Event, RDFEvent},
     storage::{
-        self,
+        indexing::{
+            dictionary::Dictionary,
+        },
         util::{EnhancedSegmentMetadata, IndexBlock, StreamingConfig, WAL},
     },
 };
@@ -25,6 +23,7 @@ use crate::{
 pub struct StreamingSegmentedStorage {
     wal: Arc<RwLock<WAL>>,
     segments: Arc<RwLock<Vec<EnhancedSegmentMetadata>>>,
+    dictionary: Arc<RwLock<Dictionary>>,
     flush_handle: Option<JoinHandle<()>>,
     shutdown_signal: Arc<Mutex<bool>>,
     config: StreamingConfig,
@@ -43,6 +42,7 @@ impl StreamingSegmentedStorage {
             })),
 
             segments: Arc::new(RwLock::new(Vec::new())),
+            dictionary: Arc::new(RwLock::new(Dictionary::new())),
             flush_handle: None,
             shutdown_signal: Arc::new(Mutex::new(false)),
             config,
@@ -88,6 +88,23 @@ impl StreamingSegmentedStorage {
         Ok(())
     }
 
+    /// User-friendly API: Write RDF data directly with URI strings
+    pub fn write_rdf(
+        &self,
+        timestamp: u64,
+        subject: &str,
+        predicate: &str,
+        object: &str,
+        graph: &str,
+    ) -> std::io::Result<()> {
+        let rdf_event = RDFEvent::new(timestamp, subject, predicate, object, graph);
+        let encoded_event = {
+            let mut dict = self.dictionary.write().unwrap();
+            rdf_event.encode(&mut dict)
+        };
+        self.write(encoded_event)
+    }
+
     fn should_flush(&self) -> bool {
         let wal = self.wal.read().unwrap();
 
@@ -264,6 +281,17 @@ impl StreamingSegmentedStorage {
         Ok(results)
     }
 
+    /// User-friendly API: Query and return RDF events with URI strings
+    pub fn query_rdf(
+        &self,
+        start_timestamp: u64,
+        end_timestamp: u64,
+    ) -> std::io::Result<Vec<RDFEvent>> {
+        let encoded_events = self.query(start_timestamp, end_timestamp)?;
+        let dict = self.dictionary.read().unwrap();
+        Ok(encoded_events.into_iter().map(|event| event.decode(&dict)).collect())
+    }
+
     fn query_segment_two_level(
         &self,
         segment: &EnhancedSegmentMetadata,
diff --git a/src/storage/util.rs b/src/storage/util.rs
index e19e0e8..aadf9b9 100644
--- a/src/storage/util.rs
+++ b/src/storage/util.rs
@@ -3,7 +3,7 @@ use std::sync::{Arc, RwLock, Mutex};
 use std::thread::JoinHandle;
 use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
-use crate::indexing::shared::Event;
+use crate::core::Event;
 
 #[derive(Debug)]
 pub struct WAL {

From bd918d2f4efbde8da6a0504db5cde788d0535f3b Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Fri, 7 Nov 2025 18:50:36 +0100
Subject: [PATCH 09/19] Refactor code for improved organization and
 readability; add benchmarking for dense and sparse indexing

---
 src/benchmarking/benchmark.rs    | 129 ++++++++++++++
 src/benchmarks/mod.rs            |   2 +-
 src/core/encoding.rs             |   4 +-
 src/core/mod.rs                  |   2 +-
 src/indexing/mod.rs              |   1 -
 src/indexing/shared.rs           |  16 +-
 src/indexing/sparse.rs           | 280 +++++++++++++++++++++++++++++++
 src/main.rs                      |  20 ++-
 src/storage/indexing/dense.rs    |   5 +-
 src/storage/indexing/sparse.rs   |   5 +-
 src/storage/mod.rs               |   2 +-
 src/storage/segmented_storage.rs |   9 +-
 src/storage/util.rs              |   6 +-
 13 files changed, 442 insertions(+), 39 deletions(-)
 create mode 100644 src/benchmarking/benchmark.rs
 create mode 100644 src/indexing/sparse.rs

diff --git a/src/benchmarking/benchmark.rs b/src/benchmarking/benchmark.rs
new file mode 100644
index 0000000..a5be448
--- /dev/null
+++ b/src/benchmarking/benchmark.rs
@@ -0,0 +1,129 @@
+use crate::indexing::shared::LogWriter;
+use crate::storage::indexing::{dense, sparse};
+use std::fs;
+use std::time::Instant;
+
+const DATA_DIR: &str = "data/benchmark";
+const LOG_FILE: &str = "data/benchmark/log.dat";
+const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx";
+const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx";
+const SPARSE_INTERVAL: usize = 1000;
+
+fn setup_data(number_records: u64) -> std::io::Result<()> {
+    let _ = fs::remove_dir_all(DATA_DIR);
+    fs::create_dir_all(DATA_DIR)?;
+
+    let mut writer = LogWriter::create(LOG_FILE)?;
+
+    for i in 0..number_records {
+        let timestamp = i;
+        let subject = (i % 1000) as u64;
+        let predicate = (i % 500) as u64;
+        let object = (i % 2000) as u64;
+        let graph: u64 = 1;
+        writer.append_record(timestamp, subject, predicate, object, graph)?;
+    }
+
+    writer.flush()?;
+
+    println!("Generated log file with {} records", writer.record_count());
+
+    Ok(())
+}
+
+fn benchmark_indexing() -> std::io::Result<()> {
+    println!("Indexing Benchmark");
+
+    let start = Instant::now();
+    dense::build_dense_index(LOG_FILE, DENSE_INDEX_FILE)?;
+    let dense_time = start.elapsed();
+    println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0);
+
+    let start = Instant::now();
+    sparse::build_sparse_index(LOG_FILE, SPARSE_INDEX_FILE, &SPARSE_INTERVAL)?;
+    let sparse_time = start.elapsed();
+    println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0);
+
+    let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
+    let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?;
+    println!(
+        "\n Dense Index Size: {} MB",
+        dense_reader.index_size_bytes() as f64 / 1_000_000.0
+    );
+
+    println!(
+        "\n Sparse Index Size: {} MB",
+        sparse_reader.index_size_bytes() as f64 / 1_000_000.0
+    );
+    Ok(())
+}
+
+fn benchmark_queries() -> std::io::Result<()> {
+    println!("Query Benchmark");
+    let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
+    let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?;
+
+    let query_ranges = vec![
+        (0u64, 100u64, "100 records"),
+        (5000u64, 5100u64, "100 records (mid-range)"),
+        (0u64, 10000u64, "10K records"),
+        (0u64, 100000u64, "100K records"),
+        (0u64, 1000000u64, "1M records"),
+    ];
+
+    for (timestamp_start, timestamp_end, description) in query_ranges {
+        println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end);
+
+        let start = Instant::now();
+        let dense_results = dense_reader.query(LOG_FILE, timestamp_start, timestamp_end)?;
+        let dense_time = start.elapsed();
+
+        let start = Instant::now();
+        let sparse_results = sparse_reader.query(LOG_FILE, timestamp_start, timestamp_end)?;
+        let sparse_time = start.elapsed();
+
+        println!(
+            " Dense Index Query Time: {:.3} ms, Results: {}",
+            dense_time.as_secs_f64() * 1000.0,
+            dense_results.len()
+        );
+
+        println!(
+            " Sparse Index Query Time: {:.3} ms, Results: {}",
+            sparse_time.as_secs_f64() * 1000.0,
+            sparse_results.len()
+        );
+
+        let speedup = sparse_time.as_secs_f64() / dense_time.as_secs_f64();
+
+        if speedup > 1.0 {
+            println!(" Sparse index is {:.2} times faster than Dense index", speedup);
+        } else {
+            println!(" Dense index is {:.2} times faster than Sparse index", 1.0 / speedup);
+        }
+
+        assert_eq!(
+            dense_results.len(),
+            sparse_results.len(),
+            "Mismatch in result counts between Dense and Sparse index queries"
+        );
+    }
+    Ok(())
+}
+
+fn main() -> std::io::Result<()> {
+    println!("RDF Indexing Benchmark : Dense vs Sparse");
+    println!("Setting up data...");
+    let number_of_records = 1_000_000u64;
+    setup_data(number_of_records)?;
+
+    benchmark_indexing()?;
+    benchmark_queries()?;
+
+    println!(
+        "\n=== Summary ===\nSparse interval: {}\nUse this data to decide \
+         which approach suits your use case best.",
+        SPARSE_INTERVAL
+    );
+    Ok(())
+}
diff --git a/src/benchmarks/mod.rs b/src/benchmarks/mod.rs
index fe2015f..2543f50 100644
--- a/src/benchmarks/mod.rs
+++ b/src/benchmarks/mod.rs
@@ -1 +1 @@
-pub mod benchmark;
\ No newline at end of file
+pub mod benchmark;
diff --git a/src/core/encoding.rs b/src/core/encoding.rs
index faf8a9a..5c0e904 100644
--- a/src/core/encoding.rs
+++ b/src/core/encoding.rs
@@ -1,7 +1,7 @@
 //! Binary encoding/decoding utilities for RDF events
 
-use crate::storage::indexing::dictionary::Dictionary;
 use crate::core::{Event, RDFEvent};
+use crate::storage::indexing::dictionary::Dictionary;
 
 /// Size of a single encoded record in bytes
 pub const RECORD_SIZE: usize = 40;
@@ -56,4 +56,4 @@ impl Event {
             graph: dict.decode(self.graph).unwrap_or("UNKNOWN").to_string(),
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/core/mod.rs b/src/core/mod.rs
index 42f5529..5fbfc41 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -33,4 +33,4 @@ impl RDFEvent {
 }
 
 pub mod encoding;
-pub use encoding::*;
\ No newline at end of file
+pub use encoding::*;
diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs
index 3bc786f..706bb40 100644
--- a/src/indexing/mod.rs
+++ b/src/indexing/mod.rs
@@ -1,4 +1,3 @@
 //! Legacy indexing utilities - most functionality moved to storage::indexing
 
 pub mod shared;
-
diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs
index bbc875f..f0891dd 100644
--- a/src/indexing/shared.rs
+++ b/src/indexing/shared.rs
@@ -1,8 +1,8 @@
 //! Legacy storage utilities - to be moved to storage module
 
+use crate::core::encoding::{encode_record, RECORD_SIZE};
 use std::fs::File;
 use std::io::Write;
-use crate::core::encoding::{encode_record, RECORD_SIZE};
 
 /// Log writer for appending encoded records to a file
 pub struct LogWriter {
@@ -14,10 +14,7 @@ impl LogWriter {
     /// Create a new log writer for the given file path
     pub fn create(path: &str) -> std::io::Result<Self> {
         let log_file = File::create(path)?;
-        Ok(Self {
-            log_file,
-            record_count: 0
-        })
+        Ok(Self { log_file, record_count: 0 })
     }
 
     /// Append an encoded record to the log file
@@ -46,12 +43,3 @@ impl LogWriter {
         self.log_file.flush()
     }
 }
-
-
-
-
-
-
-
-
-
diff --git a/src/indexing/sparse.rs b/src/indexing/sparse.rs
new file mode 100644
index 0000000..cfb8ab7
--- /dev/null
+++ b/src/indexing/sparse.rs
@@ -0,0 +1,280 @@
+use crate::indexing::dictionary::Dictionary;
+use crate::indexing::shared::{decode_record, Event, RDFEvent, RECORD_SIZE};
+use std::fs::File;
+use std::io::{Read, Seek, SeekFrom, Write};
+use std::path::Path;
+
+/// Builder for creating sparse indexes that store only periodic entries.
+///
+/// A sparse index reduces storage space by indexing only every Nth record,
+/// trading some query precision for significant space savings.
+#[doc = ""]
+pub struct SparseIndexBuilder {
+    index_file: File,
+    interval: usize,
+}
+#[doc = ""]
+impl SparseIndexBuilder {
+    /// Creates a new sparse index builder that writes to the specified file.
+    ///
+    /// # Arguments
+    /// * `index_path` - Path where the index file will be created
+    /// * `interval` - Number of records between index entries (e.g., 1000 means index every 1000th record)
+    ///
+    /// # Returns
+    /// A new `SparseIndexBuilder` instance or an I/O error
+    #[doc = ""]
+    pub fn create(index_path: &str, interval: usize) -> std::io::Result<Self> {
+        let index_file = File::create(index_path)?;
+        Ok(Self { index_file, interval })
+    }
+
+    /// Adds an entry to the sparse index if the record count matches the interval.
+    ///
+    /// Only records where `record_count % interval == 0` are indexed to save space.
+    ///
+    /// # Arguments
+    /// * `record_count` - The current record number in the log
+    /// * `timestamp` - Timestamp of the record
+    /// * `offset` - Byte offset of the record in the log file
+    ///
+    /// # Returns
+    /// `true` if the entry was added to the index, `false` if skipped
+    #[doc = ""]
+    pub fn add_entry(
+        &mut self,
+        record_count: u64,
+        timestamp: u64,
+        offset: u64,
+    ) -> std::io::Result<bool> {
+        if record_count % self.interval as u64 == 0 {
+            self.index_file.write_all(&timestamp.to_be_bytes())?;
+            self.index_file.write_all(&offset.to_be_bytes())?;
+            Ok(true)
+        } else {
+            Ok(false)
+        }
+    }
+
+    /// Finalizes the index by flushing any buffered writes to disk.
+    ///
+    /// This should be called after all entries have been added.
+    #[doc = ""]
+    pub fn finalize(&mut self) -> std::io::Result<()> {
+        self.index_file.flush()
+    }
+}
+
+/// Builds a sparse index for an existing log file.
+///
+/// This function reads through the entire log file and creates an index
+/// with entries only for records at the specified interval.
+///
+/// # Arguments
+/// * `log_path` - Path to the log file to index
+/// * `index_path` - Path where the index file will be created
+/// * `interval` - Number of records between index entries
+///
+/// # Returns
+/// Ok(()) on success, or an I/O error
+pub fn build_sparse_index(
+    log_path: &str,
+    index_path: &str,
+    interval: &usize,
+) -> std::io::Result<()> {
+    let mut log = File::open(log_path)?;
+    let mut builder = SparseIndexBuilder::create(index_path, *interval)?;
+
+    let mut offset = 0u64;
+    let mut record_count = 0u64;
+    let mut record = [0u8; RECORD_SIZE];
+
+    while log.read_exact(&mut record).is_ok() {
+        let (timestamp, _, _, _, _) = decode_record(&record);
+        builder.add_entry(record_count, timestamp, offset)?;
+        offset += RECORD_SIZE as u64;
+        record_count += 1;
+    }
+
+    builder.finalize()?;
+    Ok(())
+}
+
+/// Builds a sparse index and initializes an empty dictionary.
+///
+/// This is a convenience function that creates both the index and
+/// an empty dictionary file. The dictionary can be populated separately
+/// when processing RDF data.
+///
+/// # Arguments
+/// * `log_path` - Path to the log file to index
+/// * `index_path` - Path where the index file will be created
+/// * `dictionary_path` - Path where the dictionary file will be created
+/// * `interval` - Number of records between index entries
+///
+/// # Returns
+/// Ok(()) on success, or an I/O error
+pub fn build_sparse_index_with_dictionary(
+    log_path: &str,
+    index_path: &str,
+    dictionary_path: &str,
+    interval: &usize,
+) -> std::io::Result<()> {
+    let mut log = File::open(log_path)?;
+    let mut builder = SparseIndexBuilder::create(index_path, *interval)?;
+    let dictionary = Dictionary::new();
+
+    let mut offset = 0u64;
+    let mut record_count = 0u64;
+    let mut record = [0u8; RECORD_SIZE];
+
+    while log.read_exact(&mut record).is_ok() {
+        let (timestamp, _subject, _predicate, _object, _graph) = decode_record(&record);
+
+        builder.add_entry(record_count, timestamp, offset)?;
+
+        offset += RECORD_SIZE as u64;
+        record_count += 1;
+    }
+
+    builder.finalize()?;
+    dictionary.save_to_file(Path::new(dictionary_path))?;
+
+    Ok(())
+}
+
+/// Reader for sparse indexes that enables efficient timestamp-based queries.
+///
+/// The sparse reader loads the entire index into memory for fast binary search,
+/// then performs sequential scans of the log file starting from the appropriate position.
+pub struct SparseReader {
+    index: Vec<(u64, u64)>,
+    #[allow(dead_code)]
+    interval: usize,
+}
+
+impl SparseReader {
+    /// Opens a sparse index and its associated dictionary.
+    ///
+    /// # Arguments
+    /// * `index_path` - Path to the sparse index file
+    /// * `dictionary_path` - Path to the dictionary file
+    /// * `interval` - The interval used when building the index
+    ///
+    /// # Returns
+    /// A tuple of (SparseReader, Dictionary) or an I/O error
+    pub fn open_with_dictionary(
+        index_path: &str,
+        dictionary_path: &str,
+        interval: usize,
+    ) -> std::io::Result<(Self, Dictionary)> {
+        let reader = Self::open(index_path, interval)?;
+        let dictionary = Dictionary::load_from_file(Path::new(dictionary_path))?;
+        Ok((reader, dictionary))
+    }
+    /// Queries the log and returns results with URIs resolved from the dictionary.
+    ///
+    /// This method performs the same query as `query()` but resolves all numeric IDs
+    /// back to their original URI strings using the provided dictionary.
+    ///
+    /// # Arguments
+    /// * `log_path` - Path to the log file
+    /// * `dict` - Dictionary for resolving IDs to URIs
+    /// * `timestamp_start_bound` - Minimum timestamp (inclusive)
+    /// * `timestamp_end_bound` - Maximum timestamp (inclusive)
+    ///
+    /// # Returns
+    /// Vector of resolved events or an I/O error
+    pub fn query_resolved(
+        &self,
+        log_path: &str,
+        dict: &Dictionary,
+        timestamp_start_bound: u64,
+        timestamp_end_bound: u64,
+    ) -> std::io::Result<Vec<RDFEvent>> {
+        let events = self.query(log_path, timestamp_start_bound, timestamp_end_bound)?;
+        Ok(events.into_iter().map(|e| e.decode(dict)).collect())
+    }
+
+    /// Opens a sparse index file and loads it into memory.
+    ///
+    /// # Arguments
+    /// * `index_path` - Path to the sparse index file
+    /// * `interval` - The interval used when building the index
+    ///
+    /// # Returns
+    /// A new SparseReader instance or an I/O error
+    pub fn open(index_path: &str, interval: usize) -> std::io::Result<Self> {
+        let mut index_file = File::open(index_path)?;
+        let mut index = Vec::new();
+        let mut entry = [0u8; 16];
+
+        while index_file.read_exact(&mut entry).is_ok() {
+            let timestamp = u64::from_be_bytes(entry[0..8].try_into().unwrap());
+            let offset = u64::from_be_bytes(entry[8..16].try_into().unwrap());
+
+            index.push((timestamp, offset));
+        }
+        Ok(Self { index, interval })
+    }
+
+    /// Queries the log file for events within the specified timestamp range.
+    ///
+    /// Uses binary search on the index to find the starting position, then
+    /// performs a sequential scan of the log file to collect matching events.
+    ///
+    /// # Arguments
+    /// * `log_path` - Path to the log file
+    /// * `timestamp_start_bound` - Minimum timestamp (inclusive)
+    /// * `timestamp_end_bound` - Maximum timestamp (inclusive)
+    ///
+    /// # Returns
+    /// Vector of events with numeric IDs or an I/O error
+    pub fn query(
+        &self,
+        log_path: &str,
+        timestamp_start_bound: u64,
+        timestamp_end_bound: u64,
+    ) -> std::io::Result<Vec<Event>> {
+        if timestamp_start_bound > timestamp_end_bound {
+            return Ok(Vec::new());
+        }
+
+        if self.index.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let position = self
+            .index
+            .binary_search_by_key(&timestamp_start_bound, |x| x.0)
+            .unwrap_or_else(|i| i.saturating_sub(1));
+
+        let mut log = File::open(log_path)?;
+        log.seek(SeekFrom::Start(self.index[position].1))?;
+
+        let mut results = Vec::new();
+        let mut record = [0u8; RECORD_SIZE];
+
+        while log.read_exact(&mut record).is_ok() {
+            let (timestamp, subject, predicate, object, graph) = decode_record(&record);
+
+            if timestamp > timestamp_end_bound {
+                break;
+            }
+
+            if timestamp >= timestamp_start_bound {
+                results.push(Event { timestamp, subject, predicate, object, graph });
+            }
+        }
+
+        Ok(results)
+    }
+
+    /// Returns the size of the index in bytes.
+    ///
+    /// Each index entry is 16 bytes (8 bytes timestamp + 8 bytes offset),
+    /// so this returns `index.len() * 16`.
+    pub fn index_size_bytes(&self) -> usize {
+        self.index.len() * 16
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 934bbbf..bd30920 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,8 +3,8 @@
 //! This is the main entry point for the Janus command-line interface.
 
 use janus::core::Event;
-use janus::storage::indexing::{dense, sparse};
 use janus::indexing::shared::LogWriter;
+use janus::storage::indexing::{dense, sparse};
 use janus::storage::segmented_storage::StreamingSegmentedStorage;
 use janus::storage::util::StreamingConfig;
 use std::fs;
@@ -48,7 +48,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
         let subject = format!("http://example.org/person/person_{}", i % 10000);
         let predicate = match i % 10 {
             0..=3 => "http://example.org/knows",
-            4..=6 => "http://example.org/worksAt", 
+            4..=6 => "http://example.org/worksAt",
             7..=8 => "http://example.org/livesIn",
             _ => "http://example.org/hasAge",
         };
@@ -82,7 +82,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     println!("====================");
 
     let read_sizes = vec![100, 1_000, 10_000, 100_000, 1_000_000];
-    
+
     for &size in &read_sizes {
         // Query the first 'size' events
         let query_start_ts = base_timestamp;
@@ -90,9 +90,9 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
 
         println!("\n📖 Querying {} events...", size);
         let start_time = Instant::now();
-        
+
         let results = storage.query_rdf(query_start_ts, query_end_ts)?;
-        
+
         let query_duration = start_time.elapsed();
         let read_throughput = results.len() as f64 / query_duration.as_secs_f64();
 
@@ -103,8 +103,10 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
         // Show a sample result for verification
         if !results.is_empty() {
             let sample = &results[0];
-            println!("   Sample result: {} {} {} in {} at {}", 
-                     sample.subject, sample.predicate, sample.object, sample.graph, sample.timestamp);
+            println!(
+                "   Sample result: {} {} {} in {} at {}",
+                sample.subject, sample.predicate, sample.object, sample.graph, sample.timestamp
+            );
         }
     }
 
@@ -351,11 +353,11 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
 fn main() -> std::io::Result<()> {
     // Run the new RDF benchmark
     benchmark_segmented_storage_rdf()?;
-    
+
     println!("\n{}", "=".repeat(50));
     println!("Running legacy benchmark for comparison...");
     println!("{}", "=".repeat(50));
-    
+
     // Also run the old benchmark for comparison
     benchmark_storage_performance()
 }
diff --git a/src/storage/indexing/dense.rs b/src/storage/indexing/dense.rs
index 8f7eb35..d2a904e 100644
--- a/src/storage/indexing/dense.rs
+++ b/src/storage/indexing/dense.rs
@@ -1,4 +1,7 @@
-use crate::core::{encoding::{decode_record, RECORD_SIZE}, Event};
+use crate::core::{
+    encoding::{decode_record, RECORD_SIZE},
+    Event,
+};
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom, Write};
 #[doc = ""]
diff --git a/src/storage/indexing/sparse.rs b/src/storage/indexing/sparse.rs
index 79cc08d..3169e9f 100644
--- a/src/storage/indexing/sparse.rs
+++ b/src/storage/indexing/sparse.rs
@@ -1,5 +1,8 @@
+use crate::core::{
+    encoding::{decode_record, RECORD_SIZE},
+    Event, RDFEvent,
+};
 use crate::storage::indexing::dictionary::Dictionary;
-use crate::core::{encoding::{decode_record, RECORD_SIZE}, Event, RDFEvent};
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom, Write};
 use std::path::Path;
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index 3b55564..f797e0d 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -1,5 +1,5 @@
-pub mod util;
 pub mod segmented_storage;
+pub mod util;
 pub mod indexing {
     pub mod dense;
     pub mod dictionary;
diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs
index f3d956d..153b0d2 100644
--- a/src/storage/segmented_storage.rs
+++ b/src/storage/segmented_storage.rs
@@ -11,11 +11,12 @@ use std::{
 };
 
 use crate::{
-    core::{encoding::{decode_record, encode_record, RECORD_SIZE}, Event, RDFEvent},
+    core::{
+        encoding::{decode_record, encode_record, RECORD_SIZE},
+        Event, RDFEvent,
+    },
     storage::{
-        indexing::{
-            dictionary::Dictionary,
-        },
+        indexing::dictionary::Dictionary,
         util::{EnhancedSegmentMetadata, IndexBlock, StreamingConfig, WAL},
     },
 };
diff --git a/src/storage/util.rs b/src/storage/util.rs
index aadf9b9..1fe653e 100644
--- a/src/storage/util.rs
+++ b/src/storage/util.rs
@@ -1,5 +1,5 @@
-use std::collections::{VecDeque, HashMap};
-use std::sync::{Arc, RwLock, Mutex};
+use std::collections::{HashMap, VecDeque};
+use std::sync::{Arc, Mutex, RwLock};
 use std::thread::JoinHandle;
 use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
@@ -53,5 +53,3 @@ impl Default for StreamingConfig {
         }
     }
 }
-
-

From 0d31416eab72a90bc2c20a93a2b6026ee3b65af9 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Fri, 7 Nov 2025 19:10:08 +0100
Subject: [PATCH 10/19] Refactor storage system to replace WAL with BatchBuffer
 for improved event handling; update StreamingConfig for batch processing
 parameters

---
 src/core/encoding.rs             |  14 ++
 src/main.rs                      |  12 +-
 src/storage/segmented_storage.rs | 222 +++++++++++++++++++++++--------
 src/storage/util.rs              |  18 ++-
 4 files changed, 199 insertions(+), 67 deletions(-)

diff --git a/src/core/encoding.rs b/src/core/encoding.rs
index 5c0e904..6c3fe9c 100644
--- a/src/core/encoding.rs
+++ b/src/core/encoding.rs
@@ -56,4 +56,18 @@ impl Event {
             graph: dict.decode(self.graph).unwrap_or("UNKNOWN").to_string(),
         }
     }
+
+    /// Encode this Event to bytes
+    pub fn to_bytes(&self) -> [u8; RECORD_SIZE] {
+        let mut buffer = [0u8; RECORD_SIZE];
+        encode_record(
+            &mut buffer,
+            self.timestamp,
+            self.subject,
+            self.predicate,
+            self.object,
+            self.graph,
+        );
+        buffer
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index bd30920..25258d6 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -27,9 +27,9 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
 
     // Configure storage
     let config = StreamingConfig {
-        max_wal_events: 10_000,
-        max_wal_age_seconds: 60,
-        max_wal_bytes: 1_000_000,
+        max_batch_events: 500_000,
+        max_batch_age_seconds: 1,
+        max_batch_bytes: 50_000_000,
         sparse_interval: 1000,
         entries_per_index_block: 100,
         segment_base_path: SEGMENT_BASE_PATH.to_string(),
@@ -247,9 +247,9 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
 
         // Configure storage
         let config = StreamingConfig {
-            max_wal_events: 5000,
-            max_wal_age_seconds: 30,
-            max_wal_bytes: 5 * 1024 * 1024,
+            max_batch_events: 250_000,
+            max_batch_age_seconds: 1,
+            max_batch_bytes: 100 * 1024 * 1024,
             sparse_interval: 100,
             entries_per_index_block: 512,
             segment_base_path: format!("./benchmark_data_{}", num_records),
diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs
index 153b0d2..1cd0155 100644
--- a/src/storage/segmented_storage.rs
+++ b/src/storage/segmented_storage.rs
@@ -17,12 +17,12 @@ use crate::{
     },
     storage::{
         indexing::dictionary::Dictionary,
-        util::{EnhancedSegmentMetadata, IndexBlock, StreamingConfig, WAL},
+        util::{BatchBuffer, EnhancedSegmentMetadata, IndexBlock, StreamingConfig},
     },
 };
 
 pub struct StreamingSegmentedStorage {
-    wal: Arc<RwLock<WAL>>,
+    batch_buffer: Arc<RwLock<BatchBuffer>>,
     segments: Arc<RwLock<Vec<EnhancedSegmentMetadata>>>,
     dictionary: Arc<RwLock<Dictionary>>,
     flush_handle: Option<JoinHandle<()>>,
@@ -35,7 +35,7 @@ impl StreamingSegmentedStorage {
         std::fs::create_dir_all(&config.segment_base_path)?;
 
         let storage = Self {
-            wal: Arc::new(RwLock::new(WAL {
+            batch_buffer: Arc::new(RwLock::new(BatchBuffer {
                 events: VecDeque::new(),
                 total_bytes: 0,
                 oldest_timestamp_bound: None,
@@ -53,13 +53,18 @@ impl StreamingSegmentedStorage {
     }
 
     pub fn start_background_flushing(&mut self) {
-        let wal_clone = Arc::clone(&self.wal);
+        let batch_buffer_clone = Arc::clone(&self.batch_buffer);
         let segments_clone = Arc::clone(&self.segments);
         let shutdown_clone = Arc::clone(&self.shutdown_signal);
         let config_clone = self.config.clone();
 
         let handle = std::thread::spawn(move || {
-            Self::background_flush_loop(wal_clone, segments_clone, shutdown_clone, config_clone);
+            Self::background_flush_loop(
+                batch_buffer_clone,
+                segments_clone,
+                shutdown_clone,
+                config_clone,
+            );
         });
 
         self.flush_handle = Some(handle);
@@ -69,22 +74,21 @@ impl StreamingSegmentedStorage {
         let event_size = std::mem::size_of::<Event>();
 
         {
-            let mut wal = self.wal.write().unwrap();
+            let mut batch_buffer = self.batch_buffer.write().unwrap();
 
-            if wal.oldest_timestamp_bound.is_none() {
-                wal.oldest_timestamp_bound = Some(event.timestamp);
+            if batch_buffer.oldest_timestamp_bound.is_none() {
+                batch_buffer.oldest_timestamp_bound = Some(event.timestamp);
             }
 
-            wal.newest_timestamp_bound = Some(event.timestamp);
+            batch_buffer.newest_timestamp_bound = Some(event.timestamp);
 
-            wal.total_bytes += event_size;
+            batch_buffer.total_bytes += event_size;
 
-            wal.events.push_back(event);
+            batch_buffer.events.push_back(event);
         }
 
-        if self.should_flush() {
-            self.flush_wal_to_segment()?;
-        }
+        // Note: Synchronous flushing removed for high throughput.
+        // Background thread handles all flushing based on time limits.
 
         Ok(())
     }
@@ -107,16 +111,16 @@ impl StreamingSegmentedStorage {
     }
 
     fn should_flush(&self) -> bool {
-        let wal = self.wal.read().unwrap();
+        let batch_buffer = self.batch_buffer.read().unwrap();
 
-        wal.events.len() >= self.config.max_wal_events.try_into().unwrap()
-            || wal.total_bytes > self.config.max_wal_bytes
-            || wal.oldest_timestamp_bound.map_or(false, |oldest| {
+        batch_buffer.events.len() >= self.config.max_batch_events.try_into().unwrap()
+            || batch_buffer.total_bytes > self.config.max_batch_bytes
+            || batch_buffer.oldest_timestamp_bound.map_or(false, |oldest| {
                 let current_timestamp = Self::current_timestamp();
 
                 // Use saturating subtraction to avoid underflow if oldest > current_timestamp
                 current_timestamp.saturating_sub(oldest)
-                    >= self.config.max_wal_age_seconds * 1_000_000_000
+                    >= self.config.max_batch_age_seconds * 1_000_000_000
             })
     }
 
@@ -124,20 +128,20 @@ impl StreamingSegmentedStorage {
         SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64
     }
 
-    fn flush_wal_to_segment(&self) -> std::io::Result<()> {
-        // Automatically extract events from the WAL.
+    fn flush_batch_buffer_to_segment(&self) -> std::io::Result<()> {
+        // Automatically extract events from the batch buffer.
 
         let events_to_flush = {
-            let mut wal = self.wal.write().unwrap();
-            if wal.events.is_empty() {
+            let mut batch_buffer = self.batch_buffer.write().unwrap();
+            if batch_buffer.events.is_empty() {
                 return Ok(());
             }
 
-            let events: Vec<Event> = wal.events.drain(..).collect();
+            let events: Vec<Event> = batch_buffer.events.drain(..).collect();
 
-            wal.total_bytes = 0;
-            wal.oldest_timestamp_bound = None;
-            wal.newest_timestamp_bound = None;
+            batch_buffer.total_bytes = 0;
+            batch_buffer.oldest_timestamp_bound = None;
+            batch_buffer.newest_timestamp_bound = None;
             events
         };
 
@@ -174,7 +178,7 @@ impl StreamingSegmentedStorage {
 
         for (record_count, event) in events.iter().enumerate() {
             let record_bytes = self.serialize_event_to_fixed_size(event);
-            data_file.write_all(&record_bytes);
+            data_file.write_all(&record_bytes)?;
 
             if record_count % self.config.sparse_interval == 0 {
                 let sparse_entry = (event.timestamp, data_offset);
@@ -234,30 +238,18 @@ impl StreamingSegmentedStorage {
         min_ts: u64,
         max_ts: u64,
     ) -> std::io::Result<IndexBlock> {
-        let file_offset = index_file.stream_position()?;
-
-        for (timestamp, offset) in entries {
-            index_file.write_all(&timestamp.to_ne_bytes())?;
-            index_file.write_all(&offset.to_be_bytes())?;
-        }
-
-        Ok(IndexBlock {
-            min_timestamp: min_ts,
-            max_timestamp: max_ts,
-            file_offset,
-            entry_count: entries.len() as u32,
-        })
+        Self::flush_index_block_static(index_file, entries, min_ts, max_ts)
     }
 
     pub fn query(&self, start_timestamp: u64, end_timestamp: u64) -> std::io::Result<Vec<Event>> {
         let mut results = Vec::new();
 
-        // First try to query the immediate WAL which has the fastest visibility.
+        // First try to query the immediate batch buffer which has the fastest visibility.
 
         {
-            let wal = self.wal.read().unwrap();
+            let batch_buffer = self.batch_buffer.read().unwrap();
 
-            for event in &wal.events {
+            for event in &batch_buffer.events {
                 if event.timestamp >= start_timestamp && event.timestamp <= end_timestamp {
                     results.push(event.clone());
                 }
@@ -405,30 +397,33 @@ impl StreamingSegmentedStorage {
     }
 
     fn background_flush_loop(
-        wal: Arc<RwLock<WAL>>,
+        batch_buffer: Arc<RwLock<BatchBuffer>>,
         segments: Arc<RwLock<Vec<EnhancedSegmentMetadata>>>,
         shutdown_signal: Arc<Mutex<bool>>,
         config: StreamingConfig,
     ) {
         while !*shutdown_signal.lock().unwrap() {
-            std::thread::sleep(Duration::from_secs(1));
+            std::thread::sleep(Duration::from_millis(100));
 
             // Check if flush is needed or not.
 
             let should_flush = {
-                let wal = wal.read().unwrap();
+                let batch_buffer = batch_buffer.read().unwrap();
 
-                wal.events.len() >= config.max_wal_bytes
-                    || wal.total_bytes >= config.max_wal_bytes
-                    || wal.oldest_timestamp_bound.map_or(false, |oldest| {
+                batch_buffer.events.len() >= config.max_batch_events.try_into().unwrap()
+                    || batch_buffer.total_bytes >= config.max_batch_bytes
+                    || batch_buffer.oldest_timestamp_bound.map_or(false, |oldest| {
                         let current_timestamp = Self::current_timestamp();
-                        (current_timestamp - oldest) >= config.max_wal_age_seconds * 1_000_000_000
+                        current_timestamp.saturating_sub(oldest)
+                            >= config.max_batch_age_seconds * 1_000_000_000
                     })
             };
 
             if should_flush {
                 // TODO : Add better error handling here in this case
-                if let Err(e) = Self::flush_background(wal.clone(), segments.clone(), &config) {
+                if let Err(e) =
+                    Self::flush_background(batch_buffer.clone(), segments.clone(), &config)
+                {
                     eprintln!("Background flush failed: {}", e);
                 }
             }
@@ -436,10 +431,104 @@ impl StreamingSegmentedStorage {
     }
 
     fn flush_background(
-        wal: Arc<RwLock<WAL>>,
+        batch_buffer: Arc<RwLock<BatchBuffer>>,
         segments: Arc<RwLock<Vec<EnhancedSegmentMetadata>>>,
         config: &StreamingConfig,
     ) -> std::io::Result<()> {
+        // Automatically extract events from the batch buffer.
+
+        let events_to_flush = {
+            let mut batch_buffer = batch_buffer.write().unwrap();
+            if batch_buffer.events.is_empty() {
+                return Ok(());
+            }
+
+            let events: Vec<Event> = batch_buffer.events.drain(..).collect();
+
+            batch_buffer.total_bytes = 0;
+            batch_buffer.oldest_timestamp_bound = None;
+            batch_buffer.newest_timestamp_bound = None;
+            events
+        };
+
+        // Create a new segment for these events
+        let segment_id = Self::current_timestamp();
+        let data_path = format!("{}/segment-{}.log", config.segment_base_path, segment_id);
+        let index_path = format!("{}/segment-{}.idx", config.segment_base_path, segment_id);
+
+        // Use buffered writers for performance (same as original implementation)
+        let mut data_file = BufWriter::new(std::fs::File::create(&data_path)?);
+        let mut index_file = BufWriter::new(std::fs::File::create(&index_path)?);
+
+        let mut index_directory = Vec::new();
+        let mut current_block_entries = Vec::new();
+        let mut current_block_min_ts = None;
+        let mut current_block_max_ts = 0u64;
+        let mut data_offset = 0u64;
+
+        for (record_count, event) in events_to_flush.iter().enumerate() {
+            // Use the same serialization as the original
+            let record_bytes = Self::serialize_event_to_fixed_size_static(event);
+            data_file.write_all(&record_bytes)?;
+
+            if record_count % config.sparse_interval == 0 {
+                let sparse_entry = (event.timestamp, data_offset);
+
+                if current_block_min_ts.is_none() {
+                    current_block_min_ts = Some(event.timestamp);
+                }
+
+                current_block_max_ts = event.timestamp;
+                current_block_entries.push(sparse_entry);
+
+                if current_block_entries.len() >= config.entries_per_index_block {
+                    let block_metadata = Self::flush_index_block_static(
+                        &mut index_file,
+                        &current_block_entries,
+                        current_block_min_ts.unwrap(),
+                        current_block_max_ts,
+                    )?;
+
+                    index_directory.push(block_metadata);
+
+                    current_block_entries.clear();
+                    current_block_min_ts = None;
+                }
+            }
+            data_offset += record_bytes.len() as u64;
+        }
+
+        if !current_block_entries.is_empty() {
+            let block_metadata = Self::flush_index_block_static(
+                &mut index_file,
+                &current_block_entries,
+                current_block_min_ts.unwrap(),
+                current_block_max_ts,
+            )?;
+
+            index_directory.push(block_metadata);
+        }
+
+        data_file.flush()?;
+        index_file.flush()?;
+
+        // Add the new segment to the segments list
+        let new_segment = EnhancedSegmentMetadata {
+            start_timstamp: events_to_flush.first().unwrap().timestamp,
+            end_timestamp: events_to_flush.last().unwrap().timestamp,
+            data_path,
+            index_path,
+            record_count: events_to_flush.len() as u64,
+            index_directory,
+        };
+
+        {
+            let mut segments = segments.write().unwrap();
+            segments.push(new_segment);
+            // Keep segments sorted by start timestamp
+            segments.sort_by_key(|s| s.start_timstamp);
+        }
+
         Ok(())
     }
 
@@ -504,7 +593,7 @@ impl StreamingSegmentedStorage {
 
         // Final Flush
 
-        self.flush_wal_to_segment();
+        self.flush_batch_buffer_to_segment()?;
 
         if let Some(handle) = self.flush_handle.take() {
             handle.join().unwrap();
@@ -513,6 +602,10 @@ impl StreamingSegmentedStorage {
     }
 
     fn serialize_event_to_fixed_size(&self, event: &Event) -> Vec<u8> {
+        Self::serialize_event_to_fixed_size_static(event)
+    }
+
+    fn serialize_event_to_fixed_size_static(event: &Event) -> Vec<u8> {
         let mut record = [0u8; RECORD_SIZE];
         encode_record(
             &mut record,
@@ -525,6 +618,27 @@ impl StreamingSegmentedStorage {
         record.to_vec()
     }
 
+    fn flush_index_block_static(
+        index_file: &mut BufWriter<std::fs::File>,
+        entries: &[(u64, u64)],
+        min_ts: u64,
+        max_ts: u64,
+    ) -> std::io::Result<IndexBlock> {
+        let file_offset = index_file.stream_position()?;
+
+        for (timestamp, offset) in entries {
+            index_file.write_all(&timestamp.to_le_bytes())?;
+            index_file.write_all(&offset.to_be_bytes())?;
+        }
+
+        Ok(IndexBlock {
+            min_timestamp: min_ts,
+            max_timestamp: max_ts,
+            file_offset,
+            entry_count: entries.len() as u32,
+        })
+    }
+
     fn generate_segment_id() -> u64 {
         SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64
     }
diff --git a/src/storage/util.rs b/src/storage/util.rs
index 1fe653e..8069e85 100644
--- a/src/storage/util.rs
+++ b/src/storage/util.rs
@@ -6,7 +6,8 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 use crate::core::Event;
 
 #[derive(Debug)]
-pub struct WAL {
+/// In-memory buffer that batches events before persisting them to disk
+pub struct BatchBuffer {
     pub events: VecDeque<Event>,
     pub total_bytes: usize,
     pub oldest_timestamp_bound: Option<u64>,
@@ -33,9 +34,12 @@ pub struct EnhancedSegmentMetadata {
 
 #[derive(Clone)]
 pub struct StreamingConfig {
-    pub max_wal_events: u64,
-    pub max_wal_age_seconds: u64,
-    pub max_wal_bytes: usize,
+    /// Maximum number of events to buffer before flushing to disk
+    pub max_batch_events: u64,
+    /// Maximum age in seconds before flushing buffered events to disk
+    pub max_batch_age_seconds: u64,
+    /// Maximum bytes to buffer before flushing to disk
+    pub max_batch_bytes: usize,
     pub sparse_interval: usize,
     pub entries_per_index_block: usize,
     pub segment_base_path: String,
@@ -44,9 +48,9 @@ pub struct StreamingConfig {
 impl Default for StreamingConfig {
     fn default() -> Self {
         Self {
-            max_wal_bytes: 10 * 1024 * 1024,
-            max_wal_age_seconds: 60,
-            max_wal_events: 100_000,
+            max_batch_bytes: 10 * 1024 * 1024,
+            max_batch_age_seconds: 60,
+            max_batch_events: 100_000,
             sparse_interval: 1000,
             entries_per_index_block: 1024,
             segment_base_path: "./data".to_string(),

From 43d31f67e501060b8c8d70499c0feb7eeb0437de Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Fri, 7 Nov 2025 22:25:12 +0100
Subject: [PATCH 11/19] Refactor Dictionary to use u32 for IDs and update
 related methods

- Changed ID types in Dictionary from u64 to u32 for memory efficiency.
- Updated encode and decode methods to reflect the new ID type.
- Adjusted tests to use the new encoding/decoding methods.
- Modified memory_tracker to track memory usage with detailed statistics.
- Added MemoryTracker struct for monitoring memory usage during runtime.
- Implemented methods for recording, retrieving, and resetting memory measurements.
- Enhanced segmented storage to utilize Rc for dictionary management.
- Updated utility functions to include a new StorageComponentSizes struct for memory breakdown.
---
 benches/analysis.rs                |   4 +-
 benches/benchmark.rs               |   8 +-
 benches/write_benchmark.rs         |  36 ++---
 src/benchmarking/benchmark.rs      |   8 +-
 src/benchmarks/benchmark.rs        |  10 +-
 src/core/encoding.rs               |  29 ++--
 src/core/mod.rs                    |  12 +-
 src/indexing/shared.rs             |   8 +-
 src/main.rs                        | 152 +++++++++++-------
 src/storage/indexing/dictionary.rs |  14 +-
 src/storage/memory_tracker.rs      | 241 +++++++++++++++++++++++++++++
 src/storage/mod.rs                 |   1 +
 src/storage/segmented_storage.rs   |  21 ++-
 src/storage/util.rs                |   9 ++
 tests/dictionary_encoding_test.rs  | 189 +++++++++++-----------
 15 files changed, 520 insertions(+), 222 deletions(-)
 create mode 100644 src/storage/memory_tracker.rs

diff --git a/benches/analysis.rs b/benches/analysis.rs
index 60f020c..cbe6858 100644
--- a/benches/analysis.rs
+++ b/benches/analysis.rs
@@ -151,7 +151,7 @@ fn analyze_write_throughput() -> std::io::Result<()> {
 
         for i in 0..batch_size {
             log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
-            index_builder.add_entry(i, i * 40)?;
+            index_builder.add_entry(i, i * 24)?;
         }
         log_writer.flush()?;
         index_builder.finalize()?;
@@ -170,7 +170,7 @@ fn analyze_write_throughput() -> std::io::Result<()> {
 
         for i in 0..batch_size {
             log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
-            index_builder.add_entry(i, i, i * 40)?;
+            index_builder.add_entry(i, i, i * 24)?;
         }
         log_writer.flush()?;
         index_builder.finalize()?;
diff --git a/benches/benchmark.rs b/benches/benchmark.rs
index e6ca4db..29aa79d 100644
--- a/benches/benchmark.rs
+++ b/benches/benchmark.rs
@@ -16,10 +16,10 @@ fn setup_data(number_records: u64) -> std::io::Result<()> {
 
     for i in 0..number_records {
         let timestamp = i;
-        let subject = (i % 1000) as u64;
-        let predicate = (i % 500) as u64;
-        let object = (i % 2000) as u64;
-        let graph: u64 = 1;
+        let subject = (i % 1000) as u32;
+        let predicate = (i % 500) as u32;
+        let object = (i % 2000) as u32;
+        let graph: u32 = 1;
         writer.append_record(timestamp, subject, predicate, object, graph)?;
     }
 
diff --git a/benches/write_benchmark.rs b/benches/write_benchmark.rs
index dfd12ab..38e8e13 100644
--- a/benches/write_benchmark.rs
+++ b/benches/write_benchmark.rs
@@ -28,10 +28,10 @@ fn benchmark_dense_writing(number_records: u64) -> std::io::Result<(f64, f64)> {
 
     for i in 0..number_records {
         let timestamp = i;
-        let subject = (i % 1000) as u64;
-        let predicate = (i % 500) as u64;
-        let object = (i % 2000) as u64;
-        let graph: u64 = 1;
+        let subject = (i % 1000) as u32;
+        let predicate = (i % 500) as u32;
+        let object = (i % 2000) as u32;
+        let graph: u32 = 1;
 
         // Write record to log
         log_writer.append_record(timestamp, subject, predicate, object, graph)?;
@@ -39,7 +39,7 @@ fn benchmark_dense_writing(number_records: u64) -> std::io::Result<(f64, f64)> {
         // Add entry to index
         index_builder.add_entry(timestamp, current_offset)?;
 
-        current_offset += 40; // RECORD_SIZE
+        current_offset += 24; // RECORD_SIZE
     }
 
     let write_time = start.elapsed();
@@ -66,10 +66,10 @@ fn benchmark_sparse_writing(number_records: u64) -> std::io::Result<(f64, f64)>
 
     for i in 0..number_records {
         let timestamp = i;
-        let subject = (i % 1000) as u64;
-        let predicate = (i % 500) as u64;
-        let object = (i % 2000) as u64;
-        let graph: u64 = 1;
+        let subject = (i % 1000) as u32;
+        let predicate = (i % 500) as u32;
+        let object = (i % 2000) as u32;
+        let graph: u32 = 1;
 
         // Write record to log
         log_writer.append_record(timestamp, subject, predicate, object, graph)?;
@@ -77,7 +77,7 @@ fn benchmark_sparse_writing(number_records: u64) -> std::io::Result<(f64, f64)>
         // Add entry to index (will only add if i % interval == 0)
         index_builder.add_entry(i, timestamp, current_offset)?;
 
-        current_offset += 40; // RECORD_SIZE
+        current_offset += 24; // RECORD_SIZE
     }
 
     let write_time = start.elapsed();
@@ -110,10 +110,10 @@ fn benchmark_batch_vs_realtime(number_records: u64) -> std::io::Result<()> {
     let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?;
     for i in 0..number_records {
         let timestamp = i;
-        let subject = (i % 1000) as u64;
-        let predicate = (i % 500) as u64;
-        let object = (i % 2000) as u64;
-        let graph: u64 = 1;
+        let subject = (i % 1000) as u32;
+        let predicate = (i % 500) as u32;
+        let object = (i % 2000) as u32;
+        let graph: u32 = 1;
         log_writer.append_record(timestamp, subject, predicate, object, graph)?;
     }
     log_writer.flush()?;
@@ -132,10 +132,10 @@ fn benchmark_batch_vs_realtime(number_records: u64) -> std::io::Result<()> {
     let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?;
     for i in 0..number_records {
         let timestamp = i;
-        let subject = (i % 1000) as u64;
-        let predicate = (i % 500) as u64;
-        let object = (i % 2000) as u64;
-        let graph: u64 = 1;
+        let subject = (i % 1000) as u32;
+        let predicate = (i % 500) as u32;
+        let object = (i % 2000) as u32;
+        let graph: u32 = 1;
         log_writer.append_record(timestamp, subject, predicate, object, graph)?;
     }
     log_writer.flush()?;
diff --git a/src/benchmarking/benchmark.rs b/src/benchmarking/benchmark.rs
index a5be448..607e922 100644
--- a/src/benchmarking/benchmark.rs
+++ b/src/benchmarking/benchmark.rs
@@ -17,10 +17,10 @@ fn setup_data(number_records: u64) -> std::io::Result<()> {
 
     for i in 0..number_records {
         let timestamp = i;
-        let subject = (i % 1000) as u64;
-        let predicate = (i % 500) as u64;
-        let object = (i % 2000) as u64;
-        let graph: u64 = 1;
+        let subject = (i % 1000) as u32;
+        let predicate = (i % 500) as u32;
+        let object = (i % 2000) as u32;
+        let graph: u32 = 1;
         writer.append_record(timestamp, subject, predicate, object, graph)?;
     }
 
diff --git a/src/benchmarks/benchmark.rs b/src/benchmarks/benchmark.rs
index d64ec37..607e922 100644
--- a/src/benchmarks/benchmark.rs
+++ b/src/benchmarks/benchmark.rs
@@ -1,5 +1,5 @@
-use crate::storage::indexing::{dense, sparse};
 use crate::indexing::shared::LogWriter;
+use crate::storage::indexing::{dense, sparse};
 use std::fs;
 use std::time::Instant;
 
@@ -17,10 +17,10 @@ fn setup_data(number_records: u64) -> std::io::Result<()> {
 
     for i in 0..number_records {
         let timestamp = i;
-        let subject = (i % 1000) as u64;
-        let predicate = (i % 500) as u64;
-        let object = (i % 2000) as u64;
-        let graph: u64 = 1;
+        let subject = (i % 1000) as u32;
+        let predicate = (i % 500) as u32;
+        let object = (i % 2000) as u32;
+        let graph: u32 = 1;
         writer.append_record(timestamp, subject, predicate, object, graph)?;
     }
 
diff --git a/src/core/encoding.rs b/src/core/encoding.rs
index 6c3fe9c..8c76b7f 100644
--- a/src/core/encoding.rs
+++ b/src/core/encoding.rs
@@ -4,31 +4,32 @@ use crate::core::{Event, RDFEvent};
 use crate::storage::indexing::dictionary::Dictionary;
 
 /// Size of a single encoded record in bytes
-pub const RECORD_SIZE: usize = 40;
+/// Reduced from 40 to 24 bytes (40% space savings)
+pub const RECORD_SIZE: usize = 24;
 
 /// Encode an RDF event record into a byte buffer
 pub fn encode_record(
     buffer: &mut [u8; RECORD_SIZE],
     timestamp: u64,
-    subject: u64,
-    predicate: u64,
-    object: u64,
-    graph: u64,
+    subject: u32,
+    predicate: u32,
+    object: u32,
+    graph: u32,
 ) {
     buffer[0..8].copy_from_slice(&timestamp.to_le_bytes());
-    buffer[8..16].copy_from_slice(&subject.to_le_bytes());
-    buffer[16..24].copy_from_slice(&predicate.to_le_bytes());
-    buffer[24..32].copy_from_slice(&object.to_le_bytes());
-    buffer[32..40].copy_from_slice(&graph.to_le_bytes());
+    buffer[8..12].copy_from_slice(&subject.to_le_bytes());
+    buffer[12..16].copy_from_slice(&predicate.to_le_bytes());
+    buffer[16..20].copy_from_slice(&object.to_le_bytes());
+    buffer[20..24].copy_from_slice(&graph.to_le_bytes());
 }
 
 /// Decode a byte buffer into an RDF event record
-pub fn decode_record(buffer: &[u8; RECORD_SIZE]) -> (u64, u64, u64, u64, u64) {
+pub fn decode_record(buffer: &[u8; RECORD_SIZE]) -> (u64, u32, u32, u32, u32) {
     let timestamp = u64::from_le_bytes(buffer[0..8].try_into().unwrap());
-    let subject = u64::from_le_bytes(buffer[8..16].try_into().unwrap());
-    let predicate = u64::from_le_bytes(buffer[16..24].try_into().unwrap());
-    let object = u64::from_le_bytes(buffer[24..32].try_into().unwrap());
-    let graph = u64::from_le_bytes(buffer[32..40].try_into().unwrap());
+    let subject = u32::from_le_bytes(buffer[8..12].try_into().unwrap());
+    let predicate = u32::from_le_bytes(buffer[12..16].try_into().unwrap());
+    let object = u32::from_le_bytes(buffer[16..20].try_into().unwrap());
+    let graph = u32::from_le_bytes(buffer[20..24].try_into().unwrap());
     (timestamp, subject, predicate, object, graph)
 }
 
diff --git a/src/core/mod.rs b/src/core/mod.rs
index 5fbfc41..ea27ceb 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -1,13 +1,15 @@
 //! Core data structures and types for Janus RDF Stream Processing Engine
 
 /// Internal storage event with encoded IDs
+/// Uses u32 for dictionary IDs (4B max) and u64 for timestamp (milliseconds)
+/// Total: 24 bytes vs 40 bytes (40% space savings)
 #[derive(Clone, Debug)]
 pub struct Event {
-    pub timestamp: u64,
-    pub subject: u64,
-    pub predicate: u64,
-    pub object: u64,
-    pub graph: u64,
+    pub timestamp: u64, // 8 bytes - milliseconds since epoch
+    pub subject: u32,   // 4 bytes - dictionary-encoded (4B max unique strings)
+    pub predicate: u32, // 4 bytes - dictionary-encoded (usually <1000 unique)
+    pub object: u32,    // 4 bytes - dictionary-encoded (4B max unique strings)
+    pub graph: u32,     // 4 bytes - dictionary-encoded (usually <100 unique)
 }
 
 /// User-facing RDF event with URI strings
diff --git a/src/indexing/shared.rs b/src/indexing/shared.rs
index f0891dd..c47ec8e 100644
--- a/src/indexing/shared.rs
+++ b/src/indexing/shared.rs
@@ -21,10 +21,10 @@ impl LogWriter {
     pub fn append_record(
         &mut self,
         timestamp: u64,
-        subject: u64,
-        predicate: u64,
-        object: u64,
-        graph: u64,
+        subject: u32,
+        predicate: u32,
+        object: u32,
+        graph: u32,
     ) -> std::io::Result<()> {
         let mut buffer = [0u8; RECORD_SIZE];
         encode_record(&mut buffer, timestamp, subject, predicate, object, graph);
diff --git a/src/main.rs b/src/main.rs
index 25258d6..954cc02 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,6 +5,7 @@
 use janus::core::Event;
 use janus::indexing::shared::LogWriter;
 use janus::storage::indexing::{dense, sparse};
+use janus::storage::memory_tracker::MemoryTracker;
 use janus::storage::segmented_storage::StreamingSegmentedStorage;
 use janus::storage::util::StreamingConfig;
 use std::fs;
@@ -18,8 +19,8 @@ const SPARSE_INTERVAL: usize = 1000;
 const SEGMENT_BASE_PATH: &str = "data/rdf_benchmark";
 
 fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
-    println!("🚀 RDF Segmented Storage Benchmark");
-    println!("==================================");
+    // println!("🚀 RDF Segmented Storage Benchmark");
+    // println!("==================================");
 
     // Clean up and create directories
     let _ = fs::remove_dir_all(SEGMENT_BASE_PATH);
@@ -38,13 +39,16 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     let mut storage = StreamingSegmentedStorage::new(config)?;
     storage.start_background_flushing();
 
+    // Record initial memory
+    // storage.record_memory("before_writing");
+
     // Benchmark writing 1 million RDF events
-    println!("\n📝 Writing 1,000,000 RDF events...");
+    // println!("\n📝 Writing 1,000,000 RDF events...");
     let start_time = Instant::now();
-    let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64;
+    let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64;
 
     for i in 0..1_000_000u64 {
-        let timestamp = base_timestamp + i * 1_000_000; // 1ms intervals
+        let timestamp = base_timestamp + i * 1; // 1ms intervals
         let subject = format!("http://example.org/person/person_{}", i % 10000);
         let predicate = match i % 10 {
             0..=3 => "http://example.org/knows",
@@ -63,32 +67,34 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
         storage.write_rdf(timestamp, &subject, predicate, &object, &graph)?;
 
         if i > 0 && i % 100_000 == 0 {
-            println!("  ✓ Written {} events", i);
+            // println!("  ✓ Written {} events", i);
+            // storage.record_memory(&format!("after_{}_events", i));
         }
     }
 
     let write_duration = start_time.elapsed();
     let write_throughput = 1_000_000.0 / write_duration.as_secs_f64();
 
-    println!("✅ Write completed!");
-    println!("   Duration: {:.3} seconds", write_duration.as_secs_f64());
-    println!("   Throughput: {:.0} events/sec", write_throughput);
+    // println!("✅ Write completed!");
+    // println!("   Duration: {:.3} seconds", write_duration.as_secs_f64());
+    // println!("   Throughput: {:.0} events/sec", write_throughput);
 
     // Wait a bit for background flushing
     std::thread::sleep(Duration::from_secs(2));
+    // storage.record_memory("after_background_flush");
 
     // Benchmark reading different amounts of data
-    println!("\n🔍 Reading Benchmarks");
-    println!("====================");
+    // println!("\n🔍 Reading Benchmarks");
+    // println!("====================");
 
     let read_sizes = vec![100, 1_000, 10_000, 100_000, 1_000_000];
 
     for &size in &read_sizes {
         // Query the first 'size' events
         let query_start_ts = base_timestamp;
-        let query_end_ts = base_timestamp + (size as u64 * 1_000_000);
+        let query_end_ts = base_timestamp + size as u64;
 
-        println!("\n📖 Querying {} events...", size);
+        // println!("\n📖 Querying {} events...", size);
         let start_time = Instant::now();
 
         let results = storage.query_rdf(query_start_ts, query_end_ts)?;
@@ -96,9 +102,9 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
         let query_duration = start_time.elapsed();
         let read_throughput = results.len() as f64 / query_duration.as_secs_f64();
 
-        println!("   Results found: {}", results.len());
-        println!("   Query time: {:.3} ms", query_duration.as_millis());
-        println!("   Read throughput: {:.0} events/sec", read_throughput);
+        // println!("   Results found: {}", results.len());
+        // println!("   Query time: {:.3} ms", query_duration.as_millis());
+        // println!("   Read throughput: {:.0} events/sec", read_throughput);
 
         // Show a sample result for verification
         if !results.is_empty() {
@@ -113,7 +119,45 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     // Shutdown storage
     storage.shutdown()?;
 
-    println!("\n🎉 Benchmark completed successfully!");
+    // Print memory statistics
+    // println!("\n📊 Memory Usage Statistics");
+    // println!("==========================");
+    // let memory_stats = storage.get_memory_stats();
+    // // println!("Peak memory: {}", MemoryTracker::format_bytes(memory_stats.peak_bytes));
+    // // println!("Current memory: {}", MemoryTracker::format_bytes(memory_stats.current_bytes));
+    // println!(
+    //     "Average memory: {}",
+    //     MemoryTracker::format_bytes(memory_stats.avg_bytes as usize)
+    // );
+    // // println!("Total measurements: {}", memory_stats.total_measurements);
+
+    // Print storage component breakdown
+    // let component_sizes = storage.get_storage_component_sizes();
+    // println!("\n🧩 Storage Component Breakdown");
+    // println!("=============================");
+    // println!(
+    //     "Batch buffer: {}",
+    //     MemoryTracker::format_bytes(component_sizes.batch_buffer_bytes)
+    // );
+    // // println!("Dictionary: {}", MemoryTracker::format_bytes(component_sizes.dictionary_bytes));
+    // // println!("Segments count: {}", component_sizes.segments_count);
+    // println!(
+    //     "Estimated total: {}",
+    //     MemoryTracker::format_bytes(component_sizes.estimated_total_bytes)
+    // );
+
+    // if memory_stats.measurements.len() > 1 {
+    //     // println!("\nDetailed measurements:");
+    //     for measurement in &memory_stats.measurements {
+    //         println!(
+    //             "  {}: {}",
+    //             measurement.description,
+    //             MemoryTracker::format_bytes(measurement.memory_bytes)
+    //         );
+    //     }
+    // }
+
+    // println!("\n🎉 Benchmark completed successfully!");
     Ok(())
 }
 
@@ -125,32 +169,32 @@ fn setup_data(number_records: u64) -> std::io::Result<()> {
 
     for i in 0..number_records {
         let timestamp = i;
-        let subject = (i % 1000) as u64;
-        let predicate = (i % 500) as u64;
-        let object = (i % 2000) as u64;
-        let graph: u64 = 1;
+        let subject = (i % 1000) as u32;
+        let predicate = (i % 500) as u32;
+        let object = (i % 2000) as u32;
+        let graph: u32 = 1;
         writer.append_record(timestamp, subject, predicate, object, graph)?;
     }
 
     writer.flush()?;
 
-    println!("Generated log file with {} records", writer.record_count());
+    // println!("Generated log file with {} records", writer.record_count());
 
     Ok(())
 }
 
 fn benchmark_indexing() -> std::io::Result<()> {
-    println!("Indexing Benchmark");
+    // println!("Indexing Benchmark");
 
     let start = Instant::now();
     dense::build_dense_index(LOG_FILE, DENSE_INDEX_FILE)?;
     let dense_time = start.elapsed();
-    println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0);
+    // println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0);
 
     let start = Instant::now();
     sparse::build_sparse_index(LOG_FILE, SPARSE_INDEX_FILE, &SPARSE_INTERVAL)?;
     let sparse_time = start.elapsed();
-    println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0);
+    // println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0);
 
     let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
     let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?;
@@ -167,7 +211,7 @@ fn benchmark_indexing() -> std::io::Result<()> {
 }
 
 fn benchmark_queries() -> std::io::Result<()> {
-    println!("Query Benchmark");
+    // println!("Query Benchmark");
     let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
     let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?;
 
@@ -180,7 +224,7 @@ fn benchmark_queries() -> std::io::Result<()> {
     ];
 
     for (timestamp_start, timestamp_end, description) in query_ranges {
-        println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end);
+        // println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end);
 
         let start = Instant::now();
         let dense_results = dense_reader.query(LOG_FILE, timestamp_start, timestamp_end)?;
@@ -205,9 +249,9 @@ fn benchmark_queries() -> std::io::Result<()> {
         let speedup = sparse_time.as_secs_f64() / dense_time.as_secs_f64();
 
         if speedup > 1.0 {
-            println!(" Sparse index is {:.2} times faster than Dense index", speedup);
+            // println!(" Sparse index is {:.2} times faster than Dense index", speedup);
         } else {
-            println!(" Dense index is {:.2} times faster than Sparse index", 1.0 / speedup);
+            // println!(" Dense index is {:.2} times faster than Sparse index", 1.0 / speedup);
         }
 
         assert_eq!(
@@ -220,8 +264,8 @@ fn benchmark_queries() -> std::io::Result<()> {
 }
 
 // fn main() -> std::io::Result<()> {
-//     println!("RDF Indexing Benchmark : Dense vs Sparse");
-//     println!("Setting up data...");
+//     // println!("RDF Indexing Benchmark : Dense vs Sparse");
+//     // println!("Setting up data...");
 //     let number_of_records = 1_000_000u64;
 //     setup_data(number_of_records)?;
 
@@ -237,13 +281,13 @@ fn benchmark_queries() -> std::io::Result<()> {
 // }
 
 fn benchmark_storage_performance() -> std::io::Result<()> {
-    println!("=== WAL-Based Segmented Storage Performance Benchmark ===\n");
+    // println!("=== WAL-Based Segmented Storage Performance Benchmark ===\n");
 
     let record_counts = vec![100, 1000, 10000, 100000, 1000000];
 
     for &num_records in &record_counts {
-        println!("Testing with {} records", num_records);
-        println!("──────────────────────────────────────────────────");
+        // println!("Testing with {} records", num_records);
+        // println!("──────────────────────────────────────────────────");
 
         // Configure storage
         let config = StreamingConfig {
@@ -263,22 +307,22 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
         storage.start_background_flushing();
 
         // Benchmark writes
-        println!("Writing {} records...", num_records);
+        // println!("Writing {} records...", num_records);
         let write_start = Instant::now();
         let mut min_timestamp = u64::MAX;
         let mut max_timestamp = 0u64;
 
         for i in 0..num_records {
             let timestamp =
-                SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64 + i;
+                SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64 + i;
             min_timestamp = min_timestamp.min(timestamp);
             max_timestamp = max_timestamp.max(timestamp);
 
             let event = Event {
                 timestamp,
-                subject: (i % 10) as u64,
+                subject: (i % 10) as u32,
                 predicate: 1,
-                object: (20 + (i % 10)) as u64,
+                object: (20 + (i % 10)) as u32,
                 graph: 1,
             };
             storage.write(event)?;
@@ -287,15 +331,15 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
         let write_duration = write_start.elapsed();
         let write_throughput = num_records as f64 / write_duration.as_secs_f64();
 
-        println!("Write Performance:");
-        println!("  Duration: {:.3}s", write_duration.as_secs_f64());
-        println!("  Throughput: {:.0} records/sec", write_throughput);
-        println!("  Timestamp range: {} to {}", min_timestamp, max_timestamp);
+        // println!("Write Performance:");
+        // println!("  Duration: {:.3}s", write_duration.as_secs_f64());
+        // println!("  Throughput: {:.0} records/sec", write_throughput);
+        // println!("  Timestamp range: {} to {}", min_timestamp, max_timestamp);
 
         // Benchmark queries immediately after writing (data is still in WAL)
         let query_ranges = vec![(0.1, "10% of data"), (0.5, "50% of data"), (1.0, "100% of data")];
 
-        println!("\nQuery Performance:");
+        // println!("\nQuery Performance:");
 
         for (fraction, description) in query_ranges {
             let query_count = 100.min(num_records / 10); // Run 100 queries or 10% of records, whichever is smaller
@@ -332,13 +376,13 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
             let min_time = query_times.iter().cloned().fold(f64::INFINITY, f64::min);
             let max_time = query_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
 
-            println!("  {} queries ({}):", description, query_count);
-            println!("    Avg query time: {:.3}ms", avg_query_time * 1000.0);
-            println!("    Query throughput: {:.1} queries/sec", queries_per_sec);
-            println!("    Read throughput: {:.0} records/sec", records_per_sec);
-            println!("    Avg records per query: {:.1}", avg_records_per_query);
-            println!("    Total records read: {}", total_records_read);
-            println!("    Min/Max time: {:.3}ms / {:.3}ms", min_time * 1000.0, max_time * 1000.0);
+            // println!("  {} queries ({}):", description, query_count);
+            // println!("    Avg query time: {:.3}ms", avg_query_time * 1000.0);
+            // println!("    Query throughput: {:.1} queries/sec", queries_per_sec);
+            // println!("    Read throughput: {:.0} records/sec", records_per_sec);
+            // println!("    Avg records per query: {:.1}", avg_records_per_query);
+            // println!("    Total records read: {}", total_records_read);
+            // println!("    Min/Max time: {:.3}ms / {:.3}ms", min_time * 1000.0, max_time * 1000.0);
         }
 
         // Force flush remaining WAL data and shutdown
@@ -346,7 +390,7 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
         println!();
     }
 
-    println!("Benchmark completed!");
+    // println!("Benchmark completed!");
     Ok(())
 }
 
@@ -354,9 +398,9 @@ fn main() -> std::io::Result<()> {
     // Run the new RDF benchmark
     benchmark_segmented_storage_rdf()?;
 
-    println!("\n{}", "=".repeat(50));
-    println!("Running legacy benchmark for comparison...");
-    println!("{}", "=".repeat(50));
+    // println!("\n{}", "=".repeat(50));
+    // println!("Running legacy benchmark for comparison...");
+    // println!("{}", "=".repeat(50));
 
     // Also run the old benchmark for comparison
     benchmark_storage_performance()
diff --git a/src/storage/indexing/dictionary.rs b/src/storage/indexing/dictionary.rs
index 4729756..d1b474a 100644
--- a/src/storage/indexing/dictionary.rs
+++ b/src/storage/indexing/dictionary.rs
@@ -10,9 +10,9 @@ use crate::core::Event;
 
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Dictionary {
-    pub string_to_id: HashMap<String, u64>,
-    pub id_to_uri: HashMap<u64, String>,
-    pub next_id: u64,
+    pub string_to_id: HashMap<String, u32>,
+    pub id_to_uri: HashMap<u32, String>,
+    pub next_id: u32,
 }
 
 impl Dictionary {
@@ -20,7 +20,7 @@ impl Dictionary {
         Dictionary { string_to_id: HashMap::new(), id_to_uri: HashMap::new(), next_id: 0 }
     }
 
-    pub fn encode(&mut self, value: &str) -> u64 {
+    pub fn encode(&mut self, value: &str) -> u32 {
         if let Some(&id) = self.string_to_id.get(value) {
             id
         } else {
@@ -32,7 +32,7 @@ impl Dictionary {
         }
     }
 
-    pub fn decode(&self, id: u64) -> Option<&str> {
+    pub fn decode(&self, id: u32) -> Option<&str> {
         self.id_to_uri.get(&id).map(|s| s.as_str())
     }
 
@@ -69,7 +69,7 @@ impl Dictionary {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::indexing::shared::Event;
+    use crate::core::Event;
 
     #[test]
     fn test_dictionary_encoding_decoding() {
@@ -116,7 +116,7 @@ mod tests {
 
     #[test]
     fn test_clean_rdf_api() {
-        use crate::indexing::shared::RDFEvent;
+        use crate::core::RDFEvent;
 
         let mut dict = Dictionary::new();
 
diff --git a/src/storage/memory_tracker.rs b/src/storage/memory_tracker.rs
new file mode 100644
index 0000000..ccef928
--- /dev/null
+++ b/src/storage/memory_tracker.rs
@@ -0,0 +1,241 @@
+use std::collections::VecDeque;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+
+/// Memory usage tracker for benchmarking purposes
+#[derive(Debug, Clone)]
+pub struct MemoryTracker {
+    peak_memory_bytes: Arc<AtomicUsize>,
+    current_memory_bytes: Arc<AtomicUsize>,
+    measurements: Arc<std::sync::Mutex<Vec<MemoryMeasurement>>>,
+}
+
+#[derive(Debug, Clone)]
+pub struct MemoryMeasurement {
+    pub timestamp: std::time::Instant,
+    pub memory_bytes: usize,
+    pub description: String,
+}
+
+#[derive(Debug)]
+pub struct MemoryStats {
+    pub current_bytes: usize,
+    pub peak_bytes: usize,
+    pub total_measurements: usize,
+    pub avg_bytes: f64,
+    pub measurements: Vec<MemoryMeasurement>,
+}
+
+impl MemoryTracker {
+    pub fn new() -> Self {
+        Self {
+            peak_memory_bytes: Arc::new(AtomicUsize::new(0)),
+            current_memory_bytes: Arc::new(AtomicUsize::new(0)),
+            measurements: Arc::new(std::sync::Mutex::new(Vec::new())),
+        }
+    }
+
+    /// Record current memory usage with a description
+    pub fn record(&self, description: &str) {
+        let current = self.estimate_current_memory();
+        self.current_memory_bytes.store(current, Ordering::Relaxed);
+
+        // Update peak if necessary
+        let peak = self.peak_memory_bytes.load(Ordering::Relaxed);
+        if current > peak {
+            self.peak_memory_bytes.store(current, Ordering::Relaxed);
+        }
+
+        // Store measurement
+        let measurement = MemoryMeasurement {
+            timestamp: std::time::Instant::now(),
+            memory_bytes: current,
+            description: description.to_string(),
+        };
+
+        if let Ok(mut measurements) = self.measurements.lock() {
+            measurements.push(measurement);
+        }
+    }
+
+    /// Get current memory statistics
+    pub fn get_stats(&self) -> MemoryStats {
+        let current = self.current_memory_bytes.load(Ordering::Relaxed);
+        let peak = self.peak_memory_bytes.load(Ordering::Relaxed);
+
+        let measurements = if let Ok(m) = self.measurements.lock() {
+            m.clone()
+        } else {
+            Vec::new()
+        };
+
+        let avg_bytes = if measurements.is_empty() {
+            0.0
+        } else {
+            measurements.iter().map(|m| m.memory_bytes as f64).sum::<f64>()
+                / measurements.len() as f64
+        };
+
+        MemoryStats {
+            current_bytes: current,
+            peak_bytes: peak,
+            total_measurements: measurements.len(),
+            avg_bytes,
+            measurements,
+        }
+    }
+
+    /// Reset all measurements
+    pub fn reset(&self) {
+        self.current_memory_bytes.store(0, Ordering::Relaxed);
+        self.peak_memory_bytes.store(0, Ordering::Relaxed);
+        if let Ok(mut measurements) = self.measurements.lock() {
+            measurements.clear();
+        }
+    }
+
+    /// Estimate current memory usage of the process
+    fn estimate_current_memory(&self) -> usize {
+        // On macOS/Linux, try to read from /proc/self/status or use system calls
+        #[cfg(target_os = "macos")]
+        {
+            // For macOS, try using sysctl first, then fallback to basic estimation
+            match self.get_memory_macos_simple() {
+                Ok(mem) if mem > 0 => mem,
+                _ => self.estimate_heap_usage(),
+            }
+        }
+        #[cfg(target_os = "linux")]
+        {
+            match self.get_memory_linux() {
+                mem if mem > 0 => mem,
+                _ => self.estimate_heap_usage(),
+            }
+        }
+        #[cfg(not(any(target_os = "macos", target_os = "linux")))]
+        {
+            // Fallback: estimate based on heap allocation (rough approximation)
+            self.estimate_heap_usage()
+        }
+    }
+
+    /// Simple heap usage estimation (very rough)
+    fn estimate_heap_usage(&self) -> usize {
+        // This is a very rough estimation based on typical memory patterns
+        // In a real implementation, you might use a memory allocator that tracks usage
+
+        // Rough estimation: assume we're using around 50-100MB for a typical session
+        // This is obviously very imprecise but gives us something to work with
+        let estimated_base = 50 * 1024 * 1024; // 50MB base
+
+        // Add some dynamic component based on time (simulating growth)
+        let dynamic_component = (std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap_or_default()
+            .as_secs()
+            % 1000) as usize
+            * 1024; // Up to 1MB variation
+
+        estimated_base + dynamic_component
+    }
+
+    #[cfg(target_os = "macos")]
+    fn get_memory_macos_simple(&self) -> Result<usize, Box<dyn std::error::Error>> {
+        // Try using ps command as a fallback
+        use std::process::Command;
+
+        let output = Command::new("ps")
+            .args(&["-o", "rss=", "-p", &std::process::id().to_string()])
+            .output()?;
+
+        if output.status.success() {
+            let rss_str = std::str::from_utf8(&output.stdout)?;
+            let rss_kb: usize = rss_str.trim().parse()?;
+            Ok(rss_kb * 1024) // Convert KB to bytes
+        } else {
+            Err("ps command failed".into())
+        }
+    }
+
+    #[cfg(target_os = "macos")]
+    fn get_memory_macos(&self) -> usize {
+        use std::mem;
+        use std::ptr;
+
+        #[repr(C)]
+        struct task_basic_info {
+            virtual_size: u32,
+            resident_size: u32,
+            policy: u32,
+            flags: u32,
+        }
+
+        extern "C" {
+            fn mach_task_self() -> u32;
+            fn task_info(
+                target_task: u32,
+                flavor: u32,
+                task_info_out: *mut task_basic_info,
+                task_info_outCnt: *mut u32,
+            ) -> i32;
+        }
+
+        const TASK_BASIC_INFO: u32 = 5;
+        let mut info: task_basic_info = unsafe { mem::zeroed() };
+        let mut count = (mem::size_of::<task_basic_info>() / mem::size_of::<u32>()) as u32;
+
+        let result = unsafe {
+            task_info(
+                mach_task_self(),
+                TASK_BASIC_INFO,
+                &mut info as *mut task_basic_info,
+                &mut count,
+            )
+        };
+
+        if result == 0 {
+            info.resident_size as usize
+        } else {
+            0
+        }
+    }
+
+    #[cfg(target_os = "linux")]
+    fn get_memory_linux(&self) -> usize {
+        use std::fs;
+
+        if let Ok(contents) = fs::read_to_string("/proc/self/status") {
+            for line in contents.lines() {
+                if line.starts_with("VmRSS:") {
+                    let parts: Vec<&str> = line.split_whitespace().collect();
+                    if parts.len() >= 2 {
+                        if let Ok(kb) = parts[1].parse::<usize>() {
+                            return kb * 1024; // Convert KB to bytes
+                        }
+                    }
+                }
+            }
+        }
+        0
+    }
+
+    /// Format bytes in human-readable format
+    pub fn format_bytes(bytes: usize) -> String {
+        const UNITS: &[&str] = &["B", "KB", "MB", "GB"];
+        let mut size = bytes as f64;
+        let mut unit_index = 0;
+
+        while size >= 1024.0 && unit_index < UNITS.len() - 1 {
+            size /= 1024.0;
+            unit_index += 1;
+        }
+
+        format!("{:.2} {}", size, UNITS[unit_index])
+    }
+}
+
+impl Default for MemoryTracker {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index f797e0d..b96e7ca 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -1,3 +1,4 @@
+pub mod memory_tracker;
 pub mod segmented_storage;
 pub mod util;
 pub mod indexing {
diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs
index 1cd0155..38fb557 100644
--- a/src/storage/segmented_storage.rs
+++ b/src/storage/segmented_storage.rs
@@ -1,10 +1,8 @@
 use core::time;
 use std::{
     collections::VecDeque,
-    fmt::format,
     io::{BufWriter, Read, Seek, SeekFrom, Write},
-    ops::Index,
-    panic::set_hook,
+    rc::Rc,
     sync::{Arc, Mutex, RwLock},
     thread::JoinHandle,
     time::{Duration, SystemTime, UNIX_EPOCH},
@@ -21,16 +19,18 @@ use crate::{
     },
 };
 
+#[doc = "Struct for the Implementation of the Segmented Storage of RDF Streams."]
 pub struct StreamingSegmentedStorage {
     batch_buffer: Arc<RwLock<BatchBuffer>>,
     segments: Arc<RwLock<Vec<EnhancedSegmentMetadata>>>,
-    dictionary: Arc<RwLock<Dictionary>>,
+    dictionary: Rc<RwLock<Dictionary>>,
     flush_handle: Option<JoinHandle<()>>,
     shutdown_signal: Arc<Mutex<bool>>,
     config: StreamingConfig,
 }
 
 impl StreamingSegmentedStorage {
+    #[doc = ""]
     pub fn new(config: StreamingConfig) -> std::io::Result<Self> {
         std::fs::create_dir_all(&config.segment_base_path)?;
 
@@ -43,7 +43,7 @@ impl StreamingSegmentedStorage {
             })),
 
             segments: Arc::new(RwLock::new(Vec::new())),
-            dictionary: Arc::new(RwLock::new(Dictionary::new())),
+            dictionary: Rc::new(RwLock::new(Dictionary::new())),
             flush_handle: None,
             shutdown_signal: Arc::new(Mutex::new(false)),
             config,
@@ -52,6 +52,7 @@ impl StreamingSegmentedStorage {
         Ok(storage)
     }
 
+    #[doc = ""]
     pub fn start_background_flushing(&mut self) {
         let batch_buffer_clone = Arc::clone(&self.batch_buffer);
         let segments_clone = Arc::clone(&self.segments);
@@ -86,10 +87,8 @@ impl StreamingSegmentedStorage {
 
             batch_buffer.events.push_back(event);
         }
-
         // Note: Synchronous flushing removed for high throughput.
         // Background thread handles all flushing based on time limits.
-
         Ok(())
     }
 
@@ -120,12 +119,12 @@ impl StreamingSegmentedStorage {
 
                 // Use saturating subtraction to avoid underflow if oldest > current_timestamp
                 current_timestamp.saturating_sub(oldest)
-                    >= self.config.max_batch_age_seconds * 1_000_000_000
+                    >= self.config.max_batch_age_seconds * 1_000
             })
     }
 
     fn current_timestamp() -> u64 {
-        SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64
+        SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64
     }
 
     fn flush_batch_buffer_to_segment(&self) -> std::io::Result<()> {
@@ -415,7 +414,7 @@ impl StreamingSegmentedStorage {
                     || batch_buffer.oldest_timestamp_bound.map_or(false, |oldest| {
                         let current_timestamp = Self::current_timestamp();
                         current_timestamp.saturating_sub(oldest)
-                            >= config.max_batch_age_seconds * 1_000_000_000
+                            >= config.max_batch_age_seconds * 1_000
                     })
             };
 
@@ -640,6 +639,6 @@ impl StreamingSegmentedStorage {
     }
 
     fn generate_segment_id() -> u64 {
-        SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_nanos() as u64
+        SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64
     }
 }
diff --git a/src/storage/util.rs b/src/storage/util.rs
index 8069e85..e6f7a2d 100644
--- a/src/storage/util.rs
+++ b/src/storage/util.rs
@@ -5,6 +5,15 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
 use crate::core::Event;
 
+#[derive(Debug)]
+/// Storage component memory usage breakdown
+pub struct StorageComponentSizes {
+    pub batch_buffer_bytes: usize,
+    pub segments_count: usize,
+    pub dictionary_bytes: usize,
+    pub estimated_total_bytes: usize,
+}
+
 #[derive(Debug)]
 /// In-memory buffer that batches events before persisting them to disk
 pub struct BatchBuffer {
diff --git a/tests/dictionary_encoding_test.rs b/tests/dictionary_encoding_test.rs
index 75c6e9d..00a4f1e 100644
--- a/tests/dictionary_encoding_test.rs
+++ b/tests/dictionary_encoding_test.rs
@@ -22,9 +22,10 @@
 //! - Object ID → "23.5" (the literal value)
 //! - Datatype ID → "http://www.w3.org/2001/XMLSchema#double" (if needed)
 
-use janus::indexing::dictionary::Dictionary;
-use janus::indexing::shared::{decode_record, encode_record, LogWriter, RECORD_SIZE};
-use janus::indexing::sparse::{build_sparse_index, SparseReader};
+use janus::core::encoding::{decode_record, encode_record, RECORD_SIZE};
+use janus::indexing::shared::LogWriter;
+use janus::storage::indexing::dictionary::Dictionary;
+use janus::storage::indexing::sparse::{build_sparse_index, SparseReader};
 use std::fs;
 use std::path::Path;
 
@@ -39,30 +40,30 @@ fn test_rdf_syntax_to_dictionary_mapping() {
 
     // Subject: <https://rsp.js/event1> → stored as the URI string
     let subject = "https://rsp.js/event1";
-    let subject_id = dict.fetch_id(subject);
+    let subject_id = dict.encode(subject);
 
     // Predicate: <http://www.w3.org/ns/saref#hasValue> → stored as the URI string
     let predicate = "http://www.w3.org/ns/saref#hasValue";
-    let predicate_id = dict.fetch_id(predicate);
+    let predicate_id = dict.encode(predicate);
 
     // Object: "23.5"^^xsd:double → stored as the literal value "23.5"
     let object = "23.5";
-    let object_id = dict.fetch_id(object);
+    let object_id = dict.encode(object);
 
     // Datatype: ^^<http://www.w3.org/2001/XMLSchema#double> → stored as URI string
     let datatype = "http://www.w3.org/2001/XMLSchema#double";
-    let datatype_id = dict.fetch_id(datatype);
+    let datatype_id = dict.encode(datatype);
 
     // Graph: <https://example.org/graph> → stored as the URI string
     let graph = "https://example.org/graph";
-    let graph_id = dict.fetch_id(graph);
+    let graph_id = dict.encode(graph);
 
     // Verify all components are stored correctly
-    assert_eq!(dict.fetch_uri(subject_id), Some(subject));
-    assert_eq!(dict.fetch_uri(predicate_id), Some(predicate));
-    assert_eq!(dict.fetch_uri(object_id), Some(object));
-    assert_eq!(dict.fetch_uri(datatype_id), Some(datatype));
-    assert_eq!(dict.fetch_uri(graph_id), Some(graph));
+    assert_eq!(dict.decode(subject_id), Some(subject));
+    assert_eq!(dict.decode(predicate_id), Some(predicate));
+    assert_eq!(dict.decode(object_id), Some(object));
+    assert_eq!(dict.decode(datatype_id), Some(datatype));
+    assert_eq!(dict.decode(graph_id), Some(graph));
 
     // In a real system, you'd also store metadata about which IDs are literals vs URIs
     // and what datatype each literal has. This test just demonstrates the string storage.
@@ -91,34 +92,34 @@ fn test_rdf_literal_datatypes() {
     let label_datatype = "http://www.w3.org/2001/XMLSchema#string";
 
     // Store all values and datatypes in dictionary
-    let timestamp_val_id = dict.fetch_id(timestamp_value);
-    let timestamp_dt_id = dict.fetch_id(timestamp_datatype);
+    let timestamp_val_id = dict.encode(timestamp_value);
+    let timestamp_dt_id = dict.encode(timestamp_datatype);
 
-    let temp_val_id = dict.fetch_id(temp_value);
-    let temp_dt_id = dict.fetch_id(temp_datatype);
+    let temp_val_id = dict.encode(temp_value);
+    let temp_dt_id = dict.encode(temp_datatype);
 
-    let count_val_id = dict.fetch_id(count_value);
-    let count_dt_id = dict.fetch_id(count_datatype);
+    let count_val_id = dict.encode(count_value);
+    let count_dt_id = dict.encode(count_datatype);
 
-    let label_val_id = dict.fetch_id(label_value);
-    let label_dt_id = dict.fetch_id(label_datatype);
+    let label_val_id = dict.encode(label_value);
+    let label_dt_id = dict.encode(label_datatype);
 
     // Verify all are stored correctly
-    assert_eq!(dict.fetch_uri(timestamp_val_id), Some(timestamp_value));
-    assert_eq!(dict.fetch_uri(timestamp_dt_id), Some(timestamp_datatype));
+    assert_eq!(dict.decode(timestamp_val_id), Some(timestamp_value));
+    assert_eq!(dict.decode(timestamp_dt_id), Some(timestamp_datatype));
 
-    assert_eq!(dict.fetch_uri(temp_val_id), Some(temp_value));
-    assert_eq!(dict.fetch_uri(temp_dt_id), Some(temp_datatype));
+    assert_eq!(dict.decode(temp_val_id), Some(temp_value));
+    assert_eq!(dict.decode(temp_dt_id), Some(temp_datatype));
 
-    assert_eq!(dict.fetch_uri(count_val_id), Some(count_value));
-    assert_eq!(dict.fetch_uri(count_dt_id), Some(count_datatype));
+    assert_eq!(dict.decode(count_val_id), Some(count_value));
+    assert_eq!(dict.decode(count_dt_id), Some(count_datatype));
 
-    assert_eq!(dict.fetch_uri(label_val_id), Some(label_value));
-    assert_eq!(dict.fetch_uri(label_dt_id), Some(label_datatype));
+    assert_eq!(dict.decode(label_val_id), Some(label_value));
+    assert_eq!(dict.decode(label_dt_id), Some(label_datatype));
 
     // Note: Datatype URIs are reused across multiple literals
     // E.g., many literals will have ^^xsd:double as their datatype
-    assert_eq!(temp_dt_id, dict.fetch_id(temp_datatype)); // Same ID when requested again
+    assert_eq!(temp_dt_id, dict.encode(temp_datatype)); // Same ID when requested again
 }
 
 #[test]
@@ -132,35 +133,35 @@ fn test_dictionary_basic_operations() {
     let uri4 = "http://www.w3.org/ns/ssn#observedBy";
 
     // First insertion should return ID 0
-    let id1 = dict.fetch_id(uri1);
+    let id1 = dict.encode(uri1);
     assert_eq!(id1, 0);
 
     // Subsequent insertions should return sequential IDs
-    let id2 = dict.fetch_id(uri2);
+    let id2 = dict.encode(uri2);
     assert_eq!(id2, 1);
 
-    let id3 = dict.fetch_id(uri3);
+    let id3 = dict.encode(uri3);
     assert_eq!(id3, 2);
 
-    let id4 = dict.fetch_id(uri4);
+    let id4 = dict.encode(uri4);
     assert_eq!(id4, 3);
 
     // Requesting same URI should return same ID
-    let id1_again = dict.fetch_id(uri1);
+    let id1_again = dict.encode(uri1);
     assert_eq!(id1_again, id1);
 
     // Test retrieval
-    assert_eq!(dict.fetch_uri(id1), Some(uri1));
-    assert_eq!(dict.fetch_uri(id2), Some(uri2));
-    assert_eq!(dict.fetch_uri(id3), Some(uri3));
-    assert_eq!(dict.fetch_uri(id4), Some(uri4));
+    assert_eq!(dict.decode(id1), Some(uri1));
+    assert_eq!(dict.decode(id2), Some(uri2));
+    assert_eq!(dict.decode(id3), Some(uri3));
+    assert_eq!(dict.decode(id4), Some(uri4));
 
     // Test invalid ID
-    assert_eq!(dict.fetch_uri(999), None);
+    assert_eq!(dict.decode(999), None);
 
     // Test length
-    assert_eq!(dict.len(), 4);
-    assert!(!dict.is_empty());
+    assert_eq!(dict.id_to_uri.len(), 4);
+    assert!(!dict.id_to_uri.is_empty());
 }
 
 #[test]
@@ -180,7 +181,7 @@ fn test_dictionary_persistence() -> std::io::Result<()> {
         "https://solid.ti.rw.fau.de/public/ns/stream#",
     ];
 
-    let ids: Vec<u64> = uris.iter().map(|uri| dict.fetch_id(uri)).collect();
+    let ids: Vec<u32> = uris.iter().map(|uri| dict.encode(uri)).collect();
 
     // Save to file
     dict.save_to_file(&dict_path)?;
@@ -190,10 +191,10 @@ fn test_dictionary_persistence() -> std::io::Result<()> {
 
     // Verify all URIs are preserved with correct IDs
     for (i, uri) in uris.iter().enumerate() {
-        assert_eq!(loaded_dict.fetch_uri(ids[i]), Some(*uri));
+        assert_eq!(loaded_dict.decode(ids[i]), Some(*uri));
     }
 
-    assert_eq!(loaded_dict.len(), uris.len());
+    assert_eq!(loaded_dict.id_to_uri.len(), uris.len());
 
     Ok(())
 }
@@ -214,10 +215,10 @@ fn test_rdf_event_encoding_with_dictionary() {
 
     // Map URIs to IDs
     let timestamp: u64 = 1699181400;
-    let subject_id = dict.fetch_id(subject_uri);
-    let predicate_id = dict.fetch_id(predicate_uri);
-    let object_id = dict.fetch_id(object_uri);
-    let graph_id = dict.fetch_id(graph_uri);
+    let subject_id = dict.encode(subject_uri);
+    let predicate_id = dict.encode(predicate_uri);
+    let object_id = dict.encode(object_uri);
+    let graph_id = dict.encode(graph_uri);
 
     // Encode record with IDs
     let mut buffer = [0u8; RECORD_SIZE];
@@ -234,10 +235,10 @@ fn test_rdf_event_encoding_with_dictionary() {
     assert_eq!(dec_graph, graph_id);
 
     // Resolve IDs back to URIs
-    assert_eq!(dict.fetch_uri(dec_subject), Some(subject_uri));
-    assert_eq!(dict.fetch_uri(dec_predicate), Some(predicate_uri));
-    assert_eq!(dict.fetch_uri(dec_object), Some(object_uri));
-    assert_eq!(dict.fetch_uri(dec_graph), Some(graph_uri));
+    assert_eq!(dict.decode(dec_subject), Some(subject_uri));
+    assert_eq!(dict.decode(dec_predicate), Some(predicate_uri));
+    assert_eq!(dict.decode(dec_object), Some(object_uri));
+    assert_eq!(dict.decode(dec_graph), Some(graph_uri));
 }
 
 #[test]
@@ -258,10 +259,10 @@ fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> {
     ];
 
     // Map predicates to IDs first (these will be reused)
-    let predicate_ids: Vec<u64> = predicates.iter().map(|p| dict.fetch_id(p)).collect();
+    let predicate_ids: Vec<u32> = predicates.iter().map(|p| dict.encode(p)).collect();
 
     let graph_uri = "https://solid.ti.rw.fau.de/public/ns/stream#iot";
-    let graph_id = dict.fetch_id(graph_uri);
+    let graph_id = dict.encode(graph_uri);
 
     // Create log writer
     let mut writer = LogWriter::create(&log_path)?;
@@ -272,14 +273,14 @@ fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> {
 
         // Each event has unique subject (sensor reading ID)
         let subject_uri = format!("https://rsp.js/event/sensor-reading-{:03}", i);
-        let subject_id = dict.fetch_id(&subject_uri);
+        let subject_id = dict.encode(&subject_uri);
 
         // Rotate through predicates (demonstrating reuse)
         let predicate_id = predicate_ids[(i % predicate_ids.len() as u64) as usize];
 
         // Unique object (sensor value)
         let object_uri = format!("value-{}", i * 10);
-        let object_id = dict.fetch_id(&object_uri);
+        let object_id = dict.encode(&object_uri);
 
         writer.append_record(timestamp, subject_id, predicate_id, object_id, graph_id)?;
     }
@@ -293,11 +294,11 @@ fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> {
     // - 100 unique objects
     // - 1 graph URI
     // Total: 205 unique URIs
-    assert_eq!(dict.len(), 205);
+    assert_eq!(dict.id_to_uri.len(), 205);
 
     // Verify predicate reuse - predicates should have low IDs (0-3)
     for (i, pred) in predicates.iter().enumerate() {
-        assert_eq!(dict.fetch_id(pred), i as u64);
+        assert_eq!(dict.encode(pred), i as u32);
     }
 
     Ok(())
@@ -319,10 +320,10 @@ fn test_sparse_index_with_dictionary_integration() -> std::io::Result<()> {
     let predicates =
         vec!["http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue"];
 
-    let predicate_ids: Vec<u64> = predicates.iter().map(|p| dict.fetch_id(p)).collect();
+    let predicate_ids: Vec<u32> = predicates.iter().map(|p| dict.encode(p)).collect();
 
     let graph_uri = "https://example.org/graph/sensors";
-    let graph_id = dict.fetch_id(graph_uri);
+    let graph_id = dict.encode(graph_uri);
 
     // Create log with 1000 events
     let mut writer = LogWriter::create(&log_path)?;
@@ -330,10 +331,10 @@ fn test_sparse_index_with_dictionary_integration() -> std::io::Result<()> {
     for i in 0..1000 {
         let timestamp = i;
         let subject_uri = format!("https://rsp.js/event/{:04}", i);
-        let subject_id = dict.fetch_id(&subject_uri);
+        let subject_id = dict.encode(&subject_uri);
         let predicate_id = predicate_ids[(i % 2) as usize];
         let object_uri = format!("reading-{}", i);
-        let object_id = dict.fetch_id(&object_uri);
+        let object_id = dict.encode(&object_uri);
 
         writer.append_record(timestamp, subject_id, predicate_id, object_id, graph_id)?;
     }
@@ -380,11 +381,11 @@ fn test_large_uri_handling() {
         "measurement-with-very-long-identifier-12345678901234567890"
     );
 
-    let id = dict.fetch_id(&long_uri);
+    let id = dict.encode(&long_uri);
     assert_eq!(id, 0);
 
     // Verify retrieval works
-    assert_eq!(dict.fetch_uri(id), Some(long_uri.as_str()));
+    assert_eq!(dict.decode(id), Some(long_uri.as_str()));
 
     // Test that we can handle many long URIs
     for i in 0..100 {
@@ -394,10 +395,10 @@ fn test_large_uri_handling() {
             i * 2,
             i * 3
         );
-        dict.fetch_id(&uri);
+        dict.encode(&uri);
     }
 
-    assert_eq!(dict.len(), 101);
+    assert_eq!(dict.id_to_uri.len(), 101);
 }
 
 #[test]
@@ -415,24 +416,24 @@ fn test_rdf_namespace_reuse() {
     ];
 
     // Map each namespace
-    let namespace_ids: Vec<u64> = common_namespaces.iter().map(|ns| dict.fetch_id(ns)).collect();
+    let namespace_ids: Vec<u32> = common_namespaces.iter().map(|ns| dict.encode(ns)).collect();
 
     // Create 1000 events that all use these namespaces
     for i in 0..1000 {
         let event_uri = format!("https://rsp.js/event/{}", i);
-        dict.fetch_id(&event_uri);
+        dict.encode(&event_uri);
 
         // Reference one of the common namespaces
         let ns_id = namespace_ids[i % namespace_ids.len()];
-        assert!(dict.fetch_uri(ns_id).is_some());
+        assert!(dict.decode(ns_id).is_some());
     }
 
     // Dictionary should have: 6 namespaces + 1000 events = 1006 entries
-    assert_eq!(dict.len(), 1006);
+    assert_eq!(dict.id_to_uri.len(), 1006);
 
     // Verify namespace IDs are unchanged (demonstrating reuse)
     for (i, ns) in common_namespaces.iter().enumerate() {
-        assert_eq!(dict.fetch_id(ns), namespace_ids[i]);
+        assert_eq!(dict.encode(ns), namespace_ids[i]);
     }
 }
 
@@ -474,10 +475,10 @@ fn test_event_resolution_workflow() -> std::io::Result<()> {
     let mut writer = LogWriter::create(&log_path)?;
 
     for (timestamp, subject, predicate, object, graph) in &event_uris {
-        let subject_id = dict.fetch_id(subject);
-        let predicate_id = dict.fetch_id(predicate);
-        let object_id = dict.fetch_id(object);
-        let graph_id = dict.fetch_id(graph);
+        let subject_id = dict.encode(subject);
+        let predicate_id = dict.encode(predicate);
+        let object_id = dict.encode(object);
+        let graph_id = dict.encode(graph);
 
         writer.append_record(*timestamp, subject_id, predicate_id, object_id, graph_id)?;
     }
@@ -498,10 +499,10 @@ fn test_event_resolution_workflow() -> std::io::Result<()> {
         assert_eq!(dec_ts, *timestamp);
 
         // Resolve IDs to URIs
-        assert_eq!(dict.fetch_uri(dec_subj_id), Some(*subject));
-        assert_eq!(dict.fetch_uri(dec_pred_id), Some(*predicate));
-        assert_eq!(dict.fetch_uri(dec_obj_id), Some(*object));
-        assert_eq!(dict.fetch_uri(dec_graph_id), Some(*graph));
+        assert_eq!(dict.decode(dec_subj_id), Some(*subject));
+        assert_eq!(dict.decode(dec_pred_id), Some(*predicate));
+        assert_eq!(dict.decode(dec_obj_id), Some(*object));
+        assert_eq!(dict.decode(dec_graph_id), Some(*graph));
     }
 
     Ok(())
@@ -522,7 +523,7 @@ fn test_dictionary_space_savings() {
     let raw_size: usize = uris.iter().map(|u| u.len()).sum();
 
     // With dictionary, we store 8 bytes per ID
-    let ids: Vec<u64> = uris.iter().map(|u| dict.fetch_id(u)).collect();
+    let ids: Vec<u32> = uris.iter().map(|u| dict.encode(u)).collect();
     let encoded_size = ids.len() * 8; // 8 bytes per u64
 
     println!("Raw URIs size: {} bytes", raw_size);
@@ -572,7 +573,7 @@ fn test_complete_rdf_quad_with_datatype() {
     // Store all components and get their IDs
     let mut component_ids = std::collections::HashMap::new();
     for (name, value) in &components {
-        let id = dict.fetch_id(value);
+        let id = dict.encode(value);
         component_ids.insert(*name, id);
         println!("{}: '{}' → ID {}", name, value, id);
     }
@@ -588,11 +589,11 @@ fn test_complete_rdf_quad_with_datatype() {
     // that tracks which object IDs are literals and what their datatypes are.
 
     // Verify retrieval
-    assert_eq!(dict.fetch_uri(component_ids["subject"]), Some(components[0].1));
-    assert_eq!(dict.fetch_uri(component_ids["predicate"]), Some(components[1].1));
-    assert_eq!(dict.fetch_uri(component_ids["object_value"]), Some(components[2].1));
-    assert_eq!(dict.fetch_uri(component_ids["object_datatype"]), Some(components[3].1));
-    assert_eq!(dict.fetch_uri(component_ids["graph"]), Some(components[4].1));
+    assert_eq!(dict.decode(component_ids["subject"]), Some(components[0].1));
+    assert_eq!(dict.decode(component_ids["predicate"]), Some(components[1].1));
+    assert_eq!(dict.decode(component_ids["object_value"]), Some(components[2].1));
+    assert_eq!(dict.decode(component_ids["object_datatype"]), Some(components[3].1));
+    assert_eq!(dict.decode(component_ids["graph"]), Some(components[4].1));
 
     // Another quad with the same datatype:
     // <https://rsp.js/event/humidity-sensor-001> <http://www.w3.org/ns/saref#hasValue> "65.2"^^<http://www.w3.org/2001/XMLSchema#double> <https://example.org/graph/sensors> .
@@ -600,13 +601,13 @@ fn test_complete_rdf_quad_with_datatype() {
     let subject2 = "https://rsp.js/event/humidity-sensor-001";
     let value2 = "65.2";
 
-    let _subject2_id = dict.fetch_id(subject2);
-    let _value2_id = dict.fetch_id(value2);
+    let _subject2_id = dict.encode(subject2);
+    let _value2_id = dict.encode(value2);
 
     // These components are REUSED (same ID returned):
-    let predicate2_id = dict.fetch_id("http://www.w3.org/ns/saref#hasValue");
-    let datatype2_id = dict.fetch_id("http://www.w3.org/2001/XMLSchema#double");
-    let graph2_id = dict.fetch_id("https://example.org/graph/sensors");
+    let predicate2_id = dict.encode("http://www.w3.org/ns/saref#hasValue");
+    let datatype2_id = dict.encode("http://www.w3.org/2001/XMLSchema#double");
+    let graph2_id = dict.encode("https://example.org/graph/sensors");
 
     // Verify reuse
     assert_eq!(predicate2_id, component_ids["predicate"]);
@@ -614,9 +615,9 @@ fn test_complete_rdf_quad_with_datatype() {
     assert_eq!(graph2_id, component_ids["graph"]);
 
     // Dictionary has: 5 original components + 2 new (subject2, value2) = 7 total
-    assert_eq!(dict.len(), 7);
+    assert_eq!(dict.id_to_uri.len(), 7);
 
     println!("\n✓ Demonstrated RDF datatype handling with dictionary encoding");
     println!("✓ Showed URI reuse across multiple quads (predicate, datatype, graph)");
-    println!("✓ Dictionary size: {} entries for 2 complete RDF quads", dict.len());
+    println!("✓ Dictionary size: {} entries for 2 complete RDF quads", dict.id_to_uri.len());
 }

From f4a22a536182d65adc92fdc69261fb4f7da75797 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 12 Nov 2025 15:31:58 +0100
Subject: [PATCH 12/19] Remove examples and benchmark scripts; refactor main
 and storage indexing for improved performance and clarity

- Deleted the basic example demonstrating Janus RDF Stream Processing Engine.
- Removed the comprehensive benchmark script for testing Dense vs Sparse indexing approaches.
- Refactored `main.rs` to clean up print statements and improve readability.
- Updated `dictionary.rs` to simplify logging in tests.
- Corrected file naming in `segmented_storage.rs` for index files.
- Enhanced the `load_index_directory_from_file` function to reconstruct index blocks accurately.
- Added new examples for point query and range query benchmarks, focusing on realistic IoT sensor data.
- Implemented a realistic RDF benchmark for IoT sensor observations, analyzing write and read performance.
---
 CONTRIBUTING.md                     |   2 +-
 Cargo.toml                          |  12 --
 GETTING_STARTED.md                  |   2 +-
 benches/README.md                   | 102 ----------
 benches/analysis.rs                 | 208 -------------------
 benches/benchmark.rs                | 128 ------------
 benches/write_benchmark.rs          | 216 --------------------
 examples/basic.rs                   |  42 ----
 examples/point_query_benchmark.rs   | 147 +++++++++++++
 examples/range_query_benchmark.rs   | 273 +++++++++++++++++++++++++
 examples/realistic_rdf_benchmark.rs | 306 ++++++++++++++++++++++++++++
 run_benchmarks.sh                   |  47 -----
 src/main.rs                         |  12 +-
 src/storage/indexing/dictionary.rs  |   2 +-
 src/storage/segmented_storage.rs    |  86 +++++++-
 15 files changed, 811 insertions(+), 774 deletions(-)
 delete mode 100644 benches/README.md
 delete mode 100644 benches/analysis.rs
 delete mode 100644 benches/benchmark.rs
 delete mode 100644 benches/write_benchmark.rs
 delete mode 100644 examples/basic.rs
 create mode 100644 examples/point_query_benchmark.rs
 create mode 100644 examples/range_query_benchmark.rs
 create mode 100644 examples/realistic_rdf_benchmark.rs
 delete mode 100755 run_benchmarks.sh

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b10a4dd..71f1953 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -382,4 +382,4 @@ By contributing to Janus, you agree that your contributions will be licensed und
 
 ---
 
-Thank you for contributing to Janus! 🎉
\ No newline at end of file
+Thank you for contributing to Janus!
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index 4f9485e..8601fa5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,15 +36,3 @@ opt-level = 0
 
 [profile.test]
 opt-level = 0
-
-[[bench]]
-name = "benchmark"
-harness = false
-
-[[bench]]
-name = "write_benchmark"
-harness = false
-
-[[bench]]
-name = "analysis"
-harness = false
diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
index df2f77b..c693f31 100644
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@@ -399,4 +399,4 @@ This project is licensed under the MIT License - see [LICENCE.md](LICENCE.md) fo
 
 ---
 
-Happy coding with Janus! 🚀
\ No newline at end of file
+Happy coding with Janus!
\ No newline at end of file
diff --git a/benches/README.md b/benches/README.md
deleted file mode 100644
index 2892fd1..0000000
--- a/benches/README.md
+++ /dev/null
@@ -1,102 +0,0 @@
-# RDF Indexing Benchmarks
-
-This directory contains comprehensive benchmarks for comparing different RDF indexing strategies in Janus.
-
-## Available Benchmarks
-
-### 1. `benchmark.rs` - Read Performance (Original)
-Tests query performance on pre-built indexes:
-- Index building time comparison
-- Query speed across different data ranges
-- Memory usage comparison
-
-### 2. `write_benchmark.rs` - Write Performance (New)
-Tests writing performance during record insertion:
-- Real-time indexing (index while writing)
-- Batch indexing (build index after writing)
-- Writing throughput comparison
-- Total processing time analysis
-
-### 3. `analysis.rs` - Advanced Analysis (New)
-Detailed analysis across multiple dimensions:
-- Optimal sparse interval analysis
-- Memory usage scaling
-- Write throughput under different conditions
-- Performance recommendations
-
-## Quick Start
-
-### Run All Benchmarks
-```bash
-./run_benchmarks.sh
-```
-
-### Run Individual Benchmarks
-```bash
-# Original read performance benchmark
-cargo bench --bench benchmark
-
-# New write performance benchmark  
-cargo bench --bench write_benchmark
-
-# Advanced analysis suite
-cargo bench --bench analysis
-```
-
-## Step-by-Step Testing Instructions
-
-Benchmarks are organized by functionality:
-
-- `query_parsing.rs` - Benchmarks for parsing RSP-QL queries
-- `stream_processing.rs` - Benchmarks for stream processing operations
-- `store_operations.rs` - Benchmarks for RDF store interactions
-- `integration.rs` - End-to-end integration benchmarks
-
-## Adding New Benchmarks
-
-To add a new benchmark:
-
-1. Create a new file in the `benches/` directory
-2. Add the benchmark to `Cargo.toml`:
-
-```toml
-[[bench]]
-name = "my_benchmark"
-harness = false
-```
-
-3. Use the `criterion` crate for benchmarking:
-
-```rust
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-
-fn benchmark_function(c: &mut Criterion) {
-    c.bench_function("my_function", |b| {
-        b.iter(|| {
-            // Code to benchmark
-            black_box(my_function())
-        });
-    });
-}
-
-criterion_group!(benches, benchmark_function);
-criterion_main!(benches);
-```
-
-## Benchmark Results
-
-Benchmark results are stored in `target/criterion/` and include:
-
-- HTML reports with graphs
-- Comparison with previous runs
-- Statistical analysis
-
-To view results, open `target/criterion/report/index.html` in a browser.
-
-## Performance Tips
-
-- Run benchmarks in release mode (default for `cargo bench`)
-- Ensure system is idle during benchmarking
-- Use consistent hardware for comparisons
-- Run multiple iterations to reduce noise
-- Use `black_box()` to prevent compiler optimizations
diff --git a/benches/analysis.rs b/benches/analysis.rs
deleted file mode 100644
index cbe6858..0000000
--- a/benches/analysis.rs
+++ /dev/null
@@ -1,208 +0,0 @@
-use janus::indexing::{dense, sparse};
-use std::fs;
-use std::time::Instant;
-
-/// Analyze different sparse intervals to find optimal configuration
-fn analyze_sparse_intervals() -> std::io::Result<()> {
-    println!("🔍 Analyzing Different Sparse Intervals");
-    println!("=====================================");
-
-    let intervals = vec![100, 500, 1000, 2000, 5000, 10000];
-    let log_file = "data/benchmark/log.dat";
-    let number_records = 100_000u64;
-
-    // Create test data
-    fs::create_dir_all("data/benchmark")?;
-    let mut writer = janus::indexing::shared::LogWriter::create(log_file)?;
-    for i in 0..number_records {
-        writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
-    }
-    writer.flush()?;
-
-    println!("Testing {} records with different intervals:", number_records);
-    println!("{:-<80}", "");
-    println!(
-        "{:<10} {:<15} {:<15} {:<20} {:<15}",
-        "Interval", "Build Time(ms)", "Index Size(KB)", "Space Savings(%)", "Query Time(ms)"
-    );
-    println!("{:-<80}", "");
-
-    // Get dense index stats for comparison
-    let dense_start = Instant::now();
-    dense::build_dense_index(log_file, "data/benchmark/dense_ref.idx")?;
-    let dense_build_time = dense_start.elapsed();
-    let dense_reader = dense::DenseIndexReader::open("data/benchmark/dense_ref.idx")?;
-    let dense_size = dense_reader.index_size_bytes();
-
-    // Test query performance on dense index
-    let query_start = Instant::now();
-    let _dense_results = dense_reader.query(log_file, 10000, 20000)?;
-    let dense_query_time = query_start.elapsed();
-
-    for interval in intervals {
-        let index_file = format!("data/benchmark/sparse_{}.idx", interval);
-
-        // Build sparse index
-        let start = Instant::now();
-        sparse::build_sparse_index(log_file, &index_file, &interval)?;
-        let build_time = start.elapsed();
-
-        // Get size info
-        let reader = sparse::SparseReader::open(&index_file, interval)?;
-        let sparse_size = reader.index_size_bytes();
-        let space_savings = ((dense_size - sparse_size) as f64 / dense_size as f64) * 100.0;
-
-        // Test query performance
-        let query_start = Instant::now();
-        let _sparse_results = reader.query(log_file, 10000, 20000)?;
-        let query_time = query_start.elapsed();
-
-        println!(
-            "{:<10} {:<15.3} {:<15.2} {:<20.2} {:<15.3}",
-            interval,
-            build_time.as_secs_f64() * 1000.0,
-            sparse_size as f64 / 1024.0,
-            space_savings,
-            query_time.as_secs_f64() * 1000.0
-        );
-    }
-
-    println!("{:-<80}", "");
-    println!(
-        "Dense Reference: Build: {:.3}ms, Size: {:.2}KB, Query: {:.3}ms",
-        dense_build_time.as_secs_f64() * 1000.0,
-        dense_size as f64 / 1024.0,
-        dense_query_time.as_secs_f64() * 1000.0
-    );
-
-    Ok(())
-}
-
-/// Analyze memory usage patterns
-fn analyze_memory_usage() -> std::io::Result<()> {
-    println!("\n🧠 Memory Usage Analysis");
-    println!("=======================");
-
-    let record_counts = vec![10_000, 50_000, 100_000, 500_000, 1_000_000];
-
-    println!(
-        "{:<12} {:<15} {:<15} {:<20}",
-        "Records", "Dense Size(MB)", "Sparse Size(MB)", "Memory Ratio"
-    );
-    println!("{:-<62}", "");
-
-    for &count in &record_counts {
-        let log_file = format!("data/benchmark/log_{}.dat", count);
-        let dense_index = format!("data/benchmark/dense_{}.idx", count);
-        let sparse_index = format!("data/benchmark/sparse_{}.idx", count);
-
-        // Create test data
-        let mut writer = janus::indexing::shared::LogWriter::create(&log_file)?;
-        for i in 0..count {
-            writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
-        }
-        writer.flush()?;
-
-        // Build indexes
-        dense::build_dense_index(&log_file, &dense_index)?;
-        sparse::build_sparse_index(&log_file, &sparse_index, &1000)?;
-
-        // Get sizes
-        let dense_reader = dense::DenseIndexReader::open(&dense_index)?;
-        let sparse_reader = sparse::SparseReader::open(&sparse_index, 1000)?;
-
-        let dense_size = dense_reader.index_size_bytes() as f64 / 1_000_000.0;
-        let sparse_size = sparse_reader.index_size_bytes() as f64 / 1_000_000.0;
-        let ratio = dense_size / sparse_size;
-
-        println!("{:<12} {:<15.3} {:<15.3} {:<20.2}x", count, dense_size, sparse_size, ratio);
-    }
-
-    Ok(())
-}
-
-/// Test write throughput under different conditions
-fn analyze_write_throughput() -> std::io::Result<()> {
-    println!("\n⚡ Write Throughput Analysis");
-    println!("===========================");
-
-    let test_configs = vec![
-        ("Small batches", 1_000u64),
-        ("Medium batches", 10_000u64),
-        ("Large batches", 100_000u64),
-    ];
-
-    println!(
-        "{:<15} {:<20} {:<20} {:<15}",
-        "Batch Size", "Dense (rec/sec)", "Sparse (rec/sec)", "Speedup"
-    );
-    println!("{:-<70}", "");
-
-    for (name, batch_size) in test_configs {
-        fs::create_dir_all("data/benchmark")?;
-
-        // Test dense writing
-        let dense_log = "data/benchmark/dense_throughput.dat";
-        let dense_index = "data/benchmark/dense_throughput.idx";
-
-        let start = Instant::now();
-        let mut log_writer = janus::indexing::shared::LogWriter::create(dense_log)?;
-        let mut index_builder = janus::indexing::dense::DenseIndexBuilder::create(dense_index)?;
-
-        for i in 0..batch_size {
-            log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
-            index_builder.add_entry(i, i * 24)?;
-        }
-        log_writer.flush()?;
-        index_builder.finalize()?;
-
-        let dense_time = start.elapsed();
-        let dense_throughput = batch_size as f64 / dense_time.as_secs_f64();
-
-        // Test sparse writing
-        let sparse_log = "data/benchmark/sparse_throughput.dat";
-        let sparse_index = "data/benchmark/sparse_throughput.idx";
-
-        let start = Instant::now();
-        let mut log_writer = janus::indexing::shared::LogWriter::create(sparse_log)?;
-        let mut index_builder =
-            janus::indexing::sparse::SparseIndexBuilder::create(sparse_index, 1000)?;
-
-        for i in 0..batch_size {
-            log_writer.append_record(i, i % 1000, i % 500, i % 2000, 1)?;
-            index_builder.add_entry(i, i, i * 24)?;
-        }
-        log_writer.flush()?;
-        index_builder.finalize()?;
-
-        let sparse_time = start.elapsed();
-        let sparse_throughput = batch_size as f64 / sparse_time.as_secs_f64();
-
-        let speedup = sparse_throughput / dense_throughput;
-
-        println!(
-            "{:<15} {:<20.0} {:<20.0} {:<15.2}x",
-            name, dense_throughput, sparse_throughput, speedup
-        );
-    }
-
-    Ok(())
-}
-
-fn main() -> std::io::Result<()> {
-    println!("🔬 Advanced RDF Indexing Analysis Suite");
-    println!("=======================================");
-
-    analyze_sparse_intervals()?;
-    analyze_memory_usage()?;
-    analyze_write_throughput()?;
-
-    println!("\n✨ Analysis Complete!");
-    println!("\n💡 Recommendations:");
-    println!("  • Use sparse indexing for write-heavy workloads");
-    println!("  • Choose interval based on query precision requirements");
-    println!("  • Consider hybrid approaches for different use cases");
-    println!("  • Monitor memory usage with large datasets");
-
-    Ok(())
-}
diff --git a/benches/benchmark.rs b/benches/benchmark.rs
deleted file mode 100644
index 29aa79d..0000000
--- a/benches/benchmark.rs
+++ /dev/null
@@ -1,128 +0,0 @@
-use janus::indexing::{dense, shared::LogWriter, sparse};
-use std::fs;
-use std::time::Instant;
-
-const DATA_DIR: &str = "data/benchmark";
-const LOG_FILE: &str = "data/benchmark/log.dat";
-const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx";
-const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx";
-const SPARSE_INTERVAL: usize = 1000;
-
-fn setup_data(number_records: u64) -> std::io::Result<()> {
-    let _ = fs::remove_dir_all(DATA_DIR);
-    fs::create_dir_all(DATA_DIR)?;
-
-    let mut writer = LogWriter::create(LOG_FILE)?;
-
-    for i in 0..number_records {
-        let timestamp = i;
-        let subject = (i % 1000) as u32;
-        let predicate = (i % 500) as u32;
-        let object = (i % 2000) as u32;
-        let graph: u32 = 1;
-        writer.append_record(timestamp, subject, predicate, object, graph)?;
-    }
-
-    writer.flush()?;
-
-    println!("Generated log file with {} records", writer.record_count());
-
-    Ok(())
-}
-
-fn benchmark_indexing() -> std::io::Result<()> {
-    println!("Indexing Benchmark");
-
-    let start = Instant::now();
-    dense::build_dense_index(LOG_FILE, DENSE_INDEX_FILE)?;
-    let dense_time = start.elapsed();
-    println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0);
-
-    let start = Instant::now();
-    sparse::build_sparse_index(LOG_FILE, SPARSE_INDEX_FILE, &SPARSE_INTERVAL)?;
-    let sparse_time = start.elapsed();
-    println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0);
-
-    let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
-    let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?;
-    println!(
-        "\n Dense Index Size: {} MB",
-        dense_reader.index_size_bytes() as f64 / 1_000_000.0
-    );
-
-    println!(
-        "\n Sparse Index Size: {} MB",
-        sparse_reader.index_size_bytes() as f64 / 1_000_000.0
-    );
-    Ok(())
-}
-
-fn benchmark_queries() -> std::io::Result<()> {
-    println!("Query Benchmark");
-    let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
-    let sparse_reader = sparse::SparseReader::open(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?;
-
-    let query_ranges = vec![
-        (0u64, 100u64, "100 records"),
-        (5000u64, 5100u64, "100 records (mid-range)"),
-        (0u64, 10000u64, "10K records"),
-        (0u64, 100000u64, "100K records"),
-        (0u64, 1000000u64, "1M records"),
-    ];
-
-    for (timestamp_start, timestamp_end, description) in query_ranges {
-        println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end);
-
-        let start = Instant::now();
-        let dense_results = dense_reader.query(LOG_FILE, timestamp_start, timestamp_end)?;
-        let dense_time = start.elapsed();
-
-        let start = Instant::now();
-        let sparse_results = sparse_reader.query(LOG_FILE, timestamp_start, timestamp_end)?;
-        let sparse_time = start.elapsed();
-
-        println!(
-            " Dense Index Query Time: {:.3} ms, Results: {}",
-            dense_time.as_secs_f64() * 1000.0,
-            dense_results.len()
-        );
-
-        println!(
-            " Sparse Index Query Time: {:.3} ms, Results: {}",
-            sparse_time.as_secs_f64() * 1000.0,
-            sparse_results.len()
-        );
-
-        let speedup = sparse_time.as_secs_f64() / dense_time.as_secs_f64();
-
-        if speedup > 1.0 {
-            println!(" Sparse index is {:.2} times faster than Dense index", speedup);
-        } else {
-            println!(" Dense index is {:.2} times faster than Sparse index", 1.0 / speedup);
-        }
-
-        assert_eq!(
-            dense_results.len(),
-            sparse_results.len(),
-            "Mismatch in result counts between Dense and Sparse index queries"
-        );
-    }
-    Ok(())
-}
-
-fn main() -> std::io::Result<()> {
-    println!("RDF Indexing Benchmark : Dense vs Sparse");
-    println!("Setting up data...");
-    let number_of_records = 1_000_000u64;
-    setup_data(number_of_records)?;
-
-    benchmark_indexing()?;
-    benchmark_queries()?;
-
-    println!(
-        "\n=== Summary ===\nSparse interval: {}\nUse this data to decide \
-         which approach suits your use case best.",
-        SPARSE_INTERVAL
-    );
-    Ok(())
-}
diff --git a/benches/write_benchmark.rs b/benches/write_benchmark.rs
deleted file mode 100644
index 38e8e13..0000000
--- a/benches/write_benchmark.rs
+++ /dev/null
@@ -1,216 +0,0 @@
-use janus::indexing::{dense::DenseIndexBuilder, shared::LogWriter, sparse::SparseIndexBuilder};
-use std::fs;
-use std::time::Instant;
-
-const DATA_DIR: &str = "data/write_benchmark";
-const DENSE_LOG_FILE: &str = "data/write_benchmark/dense_log.dat";
-const SPARSE_LOG_FILE: &str = "data/write_benchmark/sparse_log.dat";
-const DENSE_INDEX_FILE: &str = "data/write_benchmark/dense.idx";
-const SPARSE_INDEX_FILE: &str = "data/write_benchmark/sparse.idx";
-const SPARSE_INTERVAL: usize = 1000;
-
-fn setup_dirs() -> std::io::Result<()> {
-    let _ = fs::remove_dir_all(DATA_DIR);
-    fs::create_dir_all(DATA_DIR)?;
-    Ok(())
-}
-
-/// Benchmark writing records with dense indexing
-/// This simulates real-time writing where each record is indexed immediately
-fn benchmark_dense_writing(number_records: u64) -> std::io::Result<(f64, f64)> {
-    println!("Benchmarking Dense Index Writing...");
-
-    let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?;
-    let mut index_builder = DenseIndexBuilder::create(DENSE_INDEX_FILE)?;
-
-    let start = Instant::now();
-    let mut current_offset = 0u64;
-
-    for i in 0..number_records {
-        let timestamp = i;
-        let subject = (i % 1000) as u32;
-        let predicate = (i % 500) as u32;
-        let object = (i % 2000) as u32;
-        let graph: u32 = 1;
-
-        // Write record to log
-        log_writer.append_record(timestamp, subject, predicate, object, graph)?;
-
-        // Add entry to index
-        index_builder.add_entry(timestamp, current_offset)?;
-
-        current_offset += 24; // RECORD_SIZE
-    }
-
-    let write_time = start.elapsed();
-
-    // Finalize both log and index
-    log_writer.flush()?;
-    index_builder.finalize()?;
-
-    let total_time = start.elapsed();
-
-    Ok((write_time.as_secs_f64(), total_time.as_secs_f64()))
-}
-
-/// Benchmark writing records with sparse indexing
-/// This simulates real-time writing where only periodic records are indexed
-fn benchmark_sparse_writing(number_records: u64) -> std::io::Result<(f64, f64)> {
-    println!("Benchmarking Sparse Index Writing...");
-
-    let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?;
-    let mut index_builder = SparseIndexBuilder::create(SPARSE_INDEX_FILE, SPARSE_INTERVAL)?;
-
-    let start = Instant::now();
-    let mut current_offset = 0u64;
-
-    for i in 0..number_records {
-        let timestamp = i;
-        let subject = (i % 1000) as u32;
-        let predicate = (i % 500) as u32;
-        let object = (i % 2000) as u32;
-        let graph: u32 = 1;
-
-        // Write record to log
-        log_writer.append_record(timestamp, subject, predicate, object, graph)?;
-
-        // Add entry to index (will only add if i % interval == 0)
-        index_builder.add_entry(i, timestamp, current_offset)?;
-
-        current_offset += 24; // RECORD_SIZE
-    }
-
-    let write_time = start.elapsed();
-
-    // Finalize both log and index
-    log_writer.flush()?;
-    index_builder.finalize()?;
-
-    let total_time = start.elapsed();
-
-    Ok((write_time.as_secs_f64(), total_time.as_secs_f64()))
-}
-
-/// Benchmark batch writing vs real-time writing
-fn benchmark_batch_vs_realtime(number_records: u64) -> std::io::Result<()> {
-    println!("\n=== Batch vs Real-time Writing Comparison ===");
-
-    // Test 1: Real-time writing (as implemented above)
-    setup_dirs()?;
-    let (dense_write_time, dense_total_time) = benchmark_dense_writing(number_records)?;
-
-    setup_dirs()?;
-    let (sparse_write_time, sparse_total_time) = benchmark_sparse_writing(number_records)?;
-
-    // Test 2: Batch writing (write log first, then build index)
-    setup_dirs()?;
-    println!("Benchmarking Batch Dense Index Creation...");
-
-    let start = Instant::now();
-    let mut log_writer = LogWriter::create(DENSE_LOG_FILE)?;
-    for i in 0..number_records {
-        let timestamp = i;
-        let subject = (i % 1000) as u32;
-        let predicate = (i % 500) as u32;
-        let object = (i % 2000) as u32;
-        let graph: u32 = 1;
-        log_writer.append_record(timestamp, subject, predicate, object, graph)?;
-    }
-    log_writer.flush()?;
-    let log_write_time = start.elapsed();
-
-    let start = Instant::now();
-    janus::indexing::dense::build_dense_index(DENSE_LOG_FILE, DENSE_INDEX_FILE)?;
-    let index_build_time = start.elapsed();
-    let batch_dense_total = log_write_time.as_secs_f64() + index_build_time.as_secs_f64();
-
-    // Batch sparse
-    setup_dirs()?;
-    println!("Benchmarking Batch Sparse Index Creation...");
-
-    let start = Instant::now();
-    let mut log_writer = LogWriter::create(SPARSE_LOG_FILE)?;
-    for i in 0..number_records {
-        let timestamp = i;
-        let subject = (i % 1000) as u32;
-        let predicate = (i % 500) as u32;
-        let object = (i % 2000) as u32;
-        let graph: u32 = 1;
-        log_writer.append_record(timestamp, subject, predicate, object, graph)?;
-    }
-    log_writer.flush()?;
-    let log_write_time = start.elapsed();
-
-    let start = Instant::now();
-    janus::indexing::sparse::build_sparse_index(
-        SPARSE_LOG_FILE,
-        SPARSE_INDEX_FILE,
-        &SPARSE_INTERVAL,
-    )?;
-    let index_build_time = start.elapsed();
-    let batch_sparse_total = log_write_time.as_secs_f64() + index_build_time.as_secs_f64();
-
-    // Print results
-    println!("\n=== WRITING PERFORMANCE RESULTS ===");
-    println!("Records: {}", number_records);
-    println!("Sparse interval: {}", SPARSE_INTERVAL);
-
-    println!("\n--- Real-time Writing (Index while writing) ---");
-    println!(
-        "Dense - Write time: {:.3} ms, Total time: {:.3} ms",
-        dense_write_time * 1000.0,
-        dense_total_time * 1000.0
-    );
-    println!(
-        "Sparse - Write time: {:.3} ms, Total time: {:.3} ms",
-        sparse_write_time * 1000.0,
-        sparse_total_time * 1000.0
-    );
-
-    println!("\n--- Batch Writing (Index after writing) ---");
-    println!(
-        "Dense - Log write: {:.3} ms, Index build: {:.3} ms, Total: {:.3} ms",
-        log_write_time.as_secs_f64() * 1000.0,
-        index_build_time.as_secs_f64() * 1000.0,
-        batch_dense_total * 1000.0
-    );
-    println!(
-        "Sparse - Log write: {:.3} ms, Index build: {:.3} ms, Total: {:.3} ms",
-        log_write_time.as_secs_f64() * 1000.0,
-        index_build_time.as_secs_f64() * 1000.0,
-        batch_sparse_total * 1000.0
-    );
-
-    println!("\n--- Performance Comparison ---");
-    let realtime_speedup = dense_total_time / sparse_total_time;
-    let batch_speedup = batch_dense_total / batch_sparse_total;
-
-    if realtime_speedup > 1.0 {
-        println!("Real-time: Sparse is {:.2}x faster than Dense", realtime_speedup);
-    } else {
-        println!("Real-time: Dense is {:.2}x faster than Sparse", 1.0 / realtime_speedup);
-    }
-
-    if batch_speedup > 1.0 {
-        println!("Batch: Sparse is {:.2}x faster than Dense", batch_speedup);
-    } else {
-        println!("Batch: Dense is {:.2}x faster than Sparse", 1.0 / batch_speedup);
-    }
-
-    Ok(())
-}
-
-fn main() -> std::io::Result<()> {
-    println!("RDF Writing Performance Benchmark: Dense vs Sparse");
-
-    let test_sizes = vec![10_000u64, 100_000u64, 1_000_000u64];
-
-    for &size in &test_sizes {
-        println!("\n{:=<60}", "");
-        println!("Testing with {} records", size);
-        println!("{:=<60}", "");
-        benchmark_batch_vs_realtime(size)?;
-    }
-
-    Ok(())
-}
diff --git a/examples/basic.rs b/examples/basic.rs
deleted file mode 100644
index 43fc3f0..0000000
--- a/examples/basic.rs
+++ /dev/null
@@ -1,42 +0,0 @@
-//! Basic example demonstrating the Janus RDF Stream Processing Engine
-//!
-//! This example shows how to use the Janus library for basic operations.
-//!
-//! Run this example with:
-//! ```
-//! cargo run --example basic
-//! ```
-
-use janus::Result;
-
-fn main() -> Result<()> {
-    println!("=== Janus Basic Example ===\n");
-
-    println!("This is a basic example of the Janus RDF Stream Processing Engine.");
-    println!("The engine is designed to process both live and historical RDF streams.\n");
-
-    // TODO: Initialize the Janus engine
-    println!("Step 1: Initialize the engine");
-    println!("  - Configure RDF store connection");
-    println!("  - Set up stream processing pipeline\n");
-
-    // TODO: Load historical data
-    println!("Step 2: Load historical RDF data");
-    println!("  - Connect to RDF store (e.g., Oxigraph, Apache Jena)");
-    println!("  - Query historical triples\n");
-
-    // TODO: Set up live stream
-    println!("Step 3: Set up live RDF stream");
-    println!("  - Connect to stream source (e.g., Kafka, MQTT)");
-    println!("  - Register stream processors\n");
-
-    // TODO: Execute queries
-    println!("Step 4: Execute unified queries");
-    println!("  - Parse RSP-QL query");
-    println!("  - Execute over historical and live data");
-    println!("  - Return results\n");
-
-    println!("Example completed successfully!");
-
-    Ok(())
-}
diff --git a/examples/point_query_benchmark.rs b/examples/point_query_benchmark.rs
new file mode 100644
index 0000000..7424836
--- /dev/null
+++ b/examples/point_query_benchmark.rs
@@ -0,0 +1,147 @@
+use janus::storage::segmented_storage::StreamingSegmentedStorage;
+use janus::storage::util::StreamingConfig;
+use std::time::{Instant, SystemTime, UNIX_EPOCH};
+
+fn main() -> std::io::Result<()> {
+    println!("\nPoint Query Performance Benchmark");
+    println!("=====================================");
+    println!("Testing 100K and 1M datasets with 33 runs each\n");
+
+    // Test sizes: 100K and 1M quads
+    let test_sizes = vec![100_000, 1_000_000];
+    let num_runs = 33;
+    let warmup_runs = 3;
+    let outlier_runs = 2;
+
+    for &size in &test_sizes {
+        println!(
+            "Testing Point Queries for {} RDF Quads ({} runs, using middle 30)",
+            format_number(size),
+            num_runs
+        );
+        println!("{}", "=".repeat(80));
+
+        let mut point_query_times = Vec::new();
+
+        // Run benchmark multiple times
+        for run in 1..=num_runs {
+            if run % 10 == 0 || run == 1 {
+                println!("   Run {}/{}...", run, num_runs);
+            }
+
+            let point_time = run_point_query_benchmark(size, run)?;
+            point_query_times.push(point_time);
+        }
+
+        // Analyze results (middle 30 runs: exclude first 3 and last 2)
+        let start_idx = warmup_runs;
+        let end_idx = num_runs - outlier_runs;
+        let analysis_times = &point_query_times[start_idx..end_idx];
+
+        println!(
+            "\nPoint Query Results (Middle 30 runs, excluding first {} and last {} runs)",
+            warmup_runs, outlier_runs
+        );
+        println!("{}", "-".repeat(80));
+
+        analyze_and_print_point_query("Point Query Latency", analysis_times);
+        println!();
+    }
+
+    println!("Point Query Benchmark Complete!\n");
+    Ok(())
+}
+
+fn run_point_query_benchmark(size: u64, run_id: usize) -> std::io::Result<f64> {
+    // Create storage
+    let config = StreamingConfig {
+        max_batch_events: 10000,
+        max_batch_bytes: 10 * 1024 * 1024,
+        max_batch_age_seconds: 5,
+        sparse_interval: 1000,
+        entries_per_index_block: 1000,
+        segment_base_path: format!("data/point_query_benchmark_{}_{}", size, run_id),
+    };
+
+    let mut storage = StreamingSegmentedStorage::new(config.clone())?;
+    storage.start_background_flushing();
+
+    let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64;
+    let mut min_timestamp = u64::MAX;
+    let mut max_timestamp = 0u64;
+
+    // Generate RDF quads with unique timestamps
+    for i in 0..size {
+        let timestamp = base_timestamp + i;
+        min_timestamp = min_timestamp.min(timestamp);
+        max_timestamp = max_timestamp.max(timestamp);
+
+        storage.write_rdf(
+            timestamp,
+            &format!("subject{}", i),
+            "predicate",
+            &format!("object{}", i),
+            "graph",
+        )?;
+    }
+
+    // Wait for all data to be flushed to disk
+    storage.shutdown()?;
+
+    // Restart storage for read-only point query
+    let storage = StreamingSegmentedStorage::new(config.clone())?;
+
+    // Point query benchmark - query for middle timestamp
+    let target_timestamp = min_timestamp + (max_timestamp - min_timestamp) / 2;
+
+    let point_start = Instant::now();
+    let _point_results = storage.query_rdf(target_timestamp, target_timestamp)?;
+    let point_duration = point_start.elapsed();
+
+    // Convert to milliseconds with microsecond precision
+    let point_time_ms = (point_duration.as_micros() as f64) / 1000.0;
+
+    // Cleanup
+    let _ = std::fs::remove_dir_all(&config.segment_base_path);
+
+    Ok(point_time_ms)
+}
+
+fn analyze_and_print_point_query(label: &str, times: &[f64]) {
+    let mut sorted_times = times.to_vec();
+    sorted_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+    let mean = times.iter().sum::<f64>() / times.len() as f64;
+    let median = sorted_times[times.len() / 2];
+    let min = *sorted_times.first().unwrap();
+    let max = *sorted_times.last().unwrap();
+
+    // Calculate standard deviation
+    let variance = times.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / times.len() as f64;
+    let std_dev = variance.sqrt();
+
+    println!(
+        "   {:<20}: {:.2} ms (median: {:.2}, std: {:.2}, range: {:.2} - {:.2})",
+        label, mean, median, std_dev, min, max
+    );
+
+    // Additional statistics
+    println!("   Sample Size         : {} measurements", times.len());
+    println!("   Coefficient of Var  : {:.1}%", (std_dev / mean) * 100.0);
+
+    // Percentiles
+    let p95_idx = (times.len() as f64 * 0.95) as usize;
+    let p99_idx = (times.len() as f64 * 0.99) as usize;
+    println!("   95th Percentile     : {:.2} ms", sorted_times[p95_idx.min(times.len() - 1)]);
+    println!("   99th Percentile     : {:.2} ms", sorted_times[p99_idx.min(times.len() - 1)]);
+}
+
+fn format_number(n: u64) -> String {
+    if n >= 1_000_000 {
+        format!("{:.1}M", n as f64 / 1_000_000.0)
+    } else if n >= 1_000 {
+        format!("{:.1}K", n as f64 / 1_000.0)
+    } else {
+        n.to_string()
+    }
+}
diff --git a/examples/range_query_benchmark.rs b/examples/range_query_benchmark.rs
new file mode 100644
index 0000000..6905782
--- /dev/null
+++ b/examples/range_query_benchmark.rs
@@ -0,0 +1,273 @@
+use janus::storage::segmented_storage::StreamingSegmentedStorage;
+use janus::storage::util::StreamingConfig;
+use std::error::Error;
+use std::time::Instant;
+
+#[derive(Debug)]
+struct BenchmarkResults {
+    range_10_percent_times: Vec<f64>,
+    range_50_percent_times: Vec<f64>,
+    range_100_percent_times: Vec<f64>,
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    println!("Realistic Range Query Benchmark by Range Size");
+    println!("================================================");
+    println!("Using realistic IoT sensor data (5 quads per observation)");
+    println!("Testing range query performance for 10%, 50%, and 100% of time range");
+    println!("Running 33 iterations, analyzing middle 30 runs\n");
+
+    let predicates = vec![
+        "http://www.w3.org/1999/02/22-rdf-syntax-ns#type".to_string(),
+        "https://saref.etsi.org/core/isMeasuredByDevice".to_string(),
+        "https://saref.etsi.org/core/relatesToProperty".to_string(),
+        "http://purl.org/dc/terms/created".to_string(),
+        "https://saref.etsi.org/core/hasValue".to_string(),
+    ];
+
+    // Test specific quad counts (each observation generates 5 quads)
+    let target_quad_counts = vec![10, 100, 1_000, 10_000, 100_000, 1_000_000];
+    let num_runs = 33;
+    let warmup_runs = 3;
+    let outlier_runs = 2;
+
+    for &quad_count in &target_quad_counts {
+        // Calculate observations needed (each generates 5 quads)
+        let observations_needed = (quad_count + 4) / 5; // Round up division
+        let actual_quads = observations_needed * 5;
+
+        println!(
+            "Testing {} target quads ({} observations → {} actual quads)",
+            quad_count, observations_needed, actual_quads
+        );
+        println!("{}", "-".repeat(70));
+
+        let mut all_results = Vec::new();
+
+        // Run benchmark multiple times
+        for run in 1..=num_runs {
+            if run % 10 == 0 || run == 1 {
+                println!("   Run {}/{}...", run, num_runs);
+            }
+
+            let result = run_range_query_benchmark(observations_needed, &predicates, run)?;
+            all_results.push(result);
+        }
+
+        // Analyze results (middle 30 runs: exclude first 3 and last 2)
+        let start_idx = warmup_runs;
+        let end_idx = num_runs - outlier_runs;
+        let analysis_results = &all_results[start_idx..end_idx];
+
+        println!(
+            "\nRange Query Results (Middle 30 runs, excluding first {} and last {} runs)",
+            warmup_runs, outlier_runs
+        );
+        println!("{}", "-".repeat(80));
+
+        // 10% range performance
+        let range_10_times: Vec<f64> =
+            analysis_results.iter().map(|r| r.range_10_percent_times[0]).collect();
+        analyze_and_print(&format!("10% Range Query ({} quads)", actual_quads), &range_10_times, "ms");
+
+        // 50% range performance
+        let range_50_times: Vec<f64> =
+            analysis_results.iter().map(|r| r.range_50_percent_times[0]).collect();
+        analyze_and_print(&format!("50% Range Query ({} quads)", actual_quads), &range_50_times, "ms");
+
+        // 100% range performance
+        let range_100_times: Vec<f64> =
+            analysis_results.iter().map(|r| r.range_100_percent_times[0]).collect();
+        analyze_and_print(&format!("100% Range Query ({} quads)", actual_quads), &range_100_times, "ms");
+
+        println!();
+    }
+
+    println!("Realistic Range Query Benchmark Complete!");
+    Ok(())
+}
+
+fn run_range_query_benchmark(
+    observations: usize,
+    predicates: &[String],
+    run_id: usize,
+) -> std::io::Result<BenchmarkResults> {
+    // Create storage
+    let config = StreamingConfig {
+        max_batch_events: 10000,
+        max_batch_bytes: 10 * 1024 * 1024,
+        max_batch_age_seconds: 5,
+        sparse_interval: 1000,
+        entries_per_index_block: 1000,
+        segment_base_path: format!("data/range_query_benchmark_{}_{}", observations, run_id),
+    };
+
+    let mut storage = StreamingSegmentedStorage::new(config.clone())?;
+    storage.start_background_flushing();
+
+    let base_timestamp = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap()
+        .as_millis() as u64;
+
+    let mut min_timestamp = u64::MAX;
+    let mut max_timestamp = 0u64;
+
+    // Generate realistic RDF quads - each observation has 5 quads (same as main benchmark)
+    for i in 0..observations {
+        // Each observation has a unique timestamp (1ms apart)
+        let timestamp = base_timestamp + i as u64;
+        min_timestamp = min_timestamp.min(timestamp);
+        max_timestamp = max_timestamp.max(timestamp);
+
+        // Create a unique subject for each observation
+        // Format: https://dahcc.idlab.ugent.be/Protego/_participant{participant}/obs{i}
+        let subject = format!(
+            "https://dahcc.idlab.ugent.be/Protego/_participant{}/obs{}",
+            (i % 100) + 1, // Rotate through 100 participants
+            i
+        );
+
+        // Sensor data - rotating through different sensors
+        let sensor = format!(
+            "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{}",
+            format!("{:02x}", (i % 256) as u8)
+        );
+
+        // Property type - rotating through different measurement types
+        let properties = vec![
+            "org.dyamand.types.common.AtmosphericPressure",
+            "org.dyamand.types.common.Temperature",
+            "org.dyamand.types.common.Humidity",
+            "org.dyamand.types.common.LightLevel",
+        ];
+        let property = format!(
+            "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/{}",
+            properties[(i % 4) as usize]
+        );
+
+        // Dataset
+        let dataset = format!("https://dahcc.idlab.ugent.be/Protego/_participant{}", (i % 100) + 1);
+
+        // Create 5 quads per observation (matching your example)
+        let quads = vec![
+            (
+                subject.clone(),
+                predicates[0].clone(),
+                dataset,
+                "http://example.org/graph1".to_string(),
+            ),
+            (
+                subject.clone(),
+                predicates[1].clone(),
+                sensor,
+                "http://example.org/graph1".to_string(),
+            ),
+            (
+                subject.clone(),
+                predicates[2].clone(),
+                property,
+                "http://example.org/graph1".to_string(),
+            ),
+            (
+                subject.clone(),
+                predicates[3].clone(),
+                format!("2022-01-03T09:04:{:02}.000000", (i % 60) as u32),
+                "http://example.org/graph1".to_string(),
+            ),
+            (
+                subject.clone(),
+                predicates[4].clone(),
+                format!("{:.1}", 1000.0 + (i as f64 * 0.1) % 100.0),
+                "http://example.org/graph1".to_string(),
+            ),
+        ];
+
+        // Write all 5 quads for this observation
+        for (s, p, o, g) in quads {
+            storage.write_rdf(timestamp, &s, &p, &o, &g)?;
+        }
+    }
+
+    // Ensure all data is written before querying
+    storage.shutdown()?;
+
+    // Recreate storage for clean read-only access
+    let storage = StreamingSegmentedStorage::new(config.clone())?;
+
+    let time_range = max_timestamp - min_timestamp;
+
+    // Debug: Print timestamp range
+    if observations == 2000 {
+        println!("DEBUG 10K: min_timestamp={}, max_timestamp={}, time_range={}", 
+                 min_timestamp, max_timestamp, time_range);
+    }
+
+    // 10% range query - query 10% of the total time range
+    let range_10_start = min_timestamp;
+    let range_10_end = min_timestamp + (time_range / 10);
+    let range_10_start_time = Instant::now();
+    let range_10_results = storage.query_rdf(range_10_start, range_10_end)?;
+    let range_10_duration = range_10_start_time.elapsed();
+    let range_10_time_ms = (range_10_duration.as_micros() as f64) / 1000.0;
+
+    // 50% range query - query 50% of the total time range
+    let range_50_start = min_timestamp;
+    let range_50_end = min_timestamp + (time_range / 2);
+    let range_50_start_time = Instant::now();
+    let range_50_results = storage.query_rdf(range_50_start, range_50_end)?;
+    let range_50_duration = range_50_start_time.elapsed();
+    let range_50_time_ms = (range_50_duration.as_micros() as f64) / 1000.0;
+
+    // 100% range query - query entire time range
+    let range_100_start = min_timestamp;
+    let range_100_end = max_timestamp;
+    
+    // Debug: Print query parameters
+    if observations == 2000 {
+        println!("DEBUG 10K: 100% query from {} to {}", range_100_start, range_100_end);
+    }
+    
+    let range_100_start_time = Instant::now();
+    let range_100_results = storage.query_rdf(range_100_start, range_100_end)?;
+    let range_100_duration = range_100_start_time.elapsed();
+    let range_100_time_ms = (range_100_duration.as_micros() as f64) / 1000.0;
+
+    let actual_quads = observations * 5;
+
+    // Debug: Print result counts
+    println!("DEBUG: Dataset {} quads - 10% range returned {} results, 50% range returned {} results, 100% range returned {} results",
+             actual_quads, range_10_results.len(), range_50_results.len(), range_100_results.len());
+
+    // Cleanup
+    let _ = std::fs::remove_dir_all(&config.segment_base_path);
+
+    Ok(BenchmarkResults {
+        range_10_percent_times: vec![range_10_time_ms],
+        range_50_percent_times: vec![range_50_time_ms],
+        range_100_percent_times: vec![range_100_time_ms],
+    })
+}
+
+fn analyze_and_print(label: &str, times: &[f64], unit: &str) {
+    let mut sorted_times = times.to_vec();
+    sorted_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+    let mean = times.iter().sum::<f64>() / times.len() as f64;
+    let median = sorted_times[times.len() / 2];
+    let min = *sorted_times.first().unwrap();
+    let max = *sorted_times.last().unwrap();
+
+    // Calculate standard deviation
+    let variance = times.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / times.len() as f64;
+    let std_dev = variance.sqrt();
+
+    // Calculate percentiles
+    let p25 = sorted_times[times.len() / 4];
+    let p75 = sorted_times[(times.len() * 3) / 4];
+
+    println!(
+        "{}: {:.2} ± {:.2} {} (median: {:.2}, range: {:.2}-{:.2}, p25: {:.2}, p75: {:.2})",
+        label, mean, std_dev, unit, median, min, max, p25, p75
+    );
+}
\ No newline at end of file
diff --git a/examples/realistic_rdf_benchmark.rs b/examples/realistic_rdf_benchmark.rs
new file mode 100644
index 0000000..a30629c
--- /dev/null
+++ b/examples/realistic_rdf_benchmark.rs
@@ -0,0 +1,306 @@
+use janus::storage::segmented_storage::StreamingSegmentedStorage;
+use janus::storage::util::StreamingConfig;
+use std::time::{Instant, SystemTime, UNIX_EPOCH};
+
+#[derive(Debug)]
+struct BenchmarkResults {
+    write_times: Vec<f64>,
+    read_times_1_percent: Vec<f64>,
+    read_times_10_percent: Vec<f64>,
+    read_times_50_percent: Vec<f64>,
+    read_times_100_percent: Vec<f64>,
+    point_query_times: Vec<f64>,
+}
+
+fn main() -> std::io::Result<()> {
+    println!("\nRealistic RDF Benchmark - IoT Sensor Observations");
+    println!("=====================================================");
+    println!("Running 33 iterations per test size, using middle 30 for statistics\n");
+
+    // Test sizes: 10, 100, 1k, 10k, 100k, 1M
+    let test_sizes = vec![10, 100, 1_000, 10_000, 100_000, 1_000_000];
+    let num_runs = 33;
+    let warmup_runs = 3;
+    let outlier_runs = 2;
+
+    // Define realistic RDF predicates for sensor observations
+    let predicates: Vec<String> = vec![
+        "http://rdfs.org/ns/void#inDataset".to_string(),
+        "https://saref.etsi.org/core/measurementMadeBy".to_string(),
+        "https://saref.etsi.org/core/relatesToProperty".to_string(),
+        "https://saref.etsi.org/core/hasTimestamp".to_string(),
+        "https://saref.etsi.org/core/hasValue".to_string(),
+    ];
+
+    for &size in &test_sizes {
+        println!("\n{}", "=".repeat(80));
+        println!(
+            "Testing with {} RDF Quads ({} runs, using middle 30)",
+            format_number(size),
+            num_runs
+        );
+        println!("{}\n", "=".repeat(80));
+
+        let mut all_results = Vec::new();
+
+        // Run benchmark multiple times
+        for run in 1..=num_runs {
+            if run % 10 == 0 || run == 1 {
+                println!("   Run {}/{}...", run, num_runs);
+            }
+
+            let result = run_single_benchmark(size, &predicates, run)?;
+            all_results.push(result);
+        }
+
+        // Analyze results (middle 30 runs: exclude first 3 and last 2)
+        let start_idx = warmup_runs;
+        let end_idx = num_runs - outlier_runs;
+        let analysis_results = &all_results[start_idx..end_idx];
+
+        println!(
+            "\nResults (Middle 30 runs, excluding first {} and last {} runs)",
+            warmup_runs, outlier_runs
+        );
+        println!("{}", "-".repeat(80));
+
+        // Write performance
+        let write_times: Vec<f64> = analysis_results.iter().map(|r| r.write_times[0]).collect();
+        analyze_and_print("Write Throughput", &write_times, "quads/sec");
+
+        // Read performance for different ranges
+        let read_1_times: Vec<f64> =
+            analysis_results.iter().map(|r| r.read_times_1_percent[0]).collect();
+        let read_10_times: Vec<f64> =
+            analysis_results.iter().map(|r| r.read_times_10_percent[0]).collect();
+        let read_50_times: Vec<f64> =
+            analysis_results.iter().map(|r| r.read_times_50_percent[0]).collect();
+        let read_100_times: Vec<f64> =
+            analysis_results.iter().map(|r| r.read_times_100_percent[0]).collect();
+
+        analyze_and_print("Read (1% range)", &read_1_times, "quads/sec");
+        analyze_and_print("Read (10% range)", &read_10_times, "quads/sec");
+        analyze_and_print("Read (50% range)", &read_50_times, "quads/sec");
+        analyze_and_print("Read (100% range)", &read_100_times, "quads/sec");
+
+        // Point query performance
+        let point_times: Vec<f64> =
+            analysis_results.iter().map(|r| r.point_query_times[0]).collect();
+        analyze_and_print("Point Query", &point_times, "ms");
+    }
+
+    println!("\n{}", "=".repeat(80));
+    println!("Benchmark Complete!");
+    println!("{}\n", "=".repeat(80));
+
+    Ok(())
+}
+
+fn run_single_benchmark(
+    size: u64,
+    predicates: &[String],
+    run_id: usize,
+) -> std::io::Result<BenchmarkResults> {
+    // Create storage
+    let config = StreamingConfig {
+        max_batch_events: 10000,
+        max_batch_bytes: 10 * 1024 * 1024,
+        max_batch_age_seconds: 5,
+        sparse_interval: 1000,
+        entries_per_index_block: 1000,
+        segment_base_path: format!("data/realistic_benchmark_{}_{}", size, run_id),
+    };
+
+    let mut storage = StreamingSegmentedStorage::new(config.clone())?;
+    storage.start_background_flushing();
+
+    let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64;
+
+    let write_start = Instant::now();
+    let mut min_timestamp = u64::MAX;
+    let mut max_timestamp = 0u64;
+
+    // Generate realistic RDF quads - each with unique subject and timestamp
+    for i in 0..size {
+        // Each observation has a unique timestamp (1ms apart)
+        let timestamp = base_timestamp + i;
+        min_timestamp = min_timestamp.min(timestamp);
+        max_timestamp = max_timestamp.max(timestamp);
+
+        // Create a unique subject for each observation
+        // Format: https://dahcc.idlab.ugent.be/Protego/_participant{participant}/obs{i}
+        let subject = format!(
+            "https://dahcc.idlab.ugent.be/Protego/_participant{}/obs{}",
+            (i % 100) + 1, // Rotate through 100 participants
+            i
+        );
+
+        // Sensor data - rotating through different sensors
+        let sensor = format!(
+            "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{}",
+            format!("{:02x}", (i % 256) as u8)
+        );
+
+        // Property type - rotating through different measurement types
+        let properties = vec![
+            "org.dyamand.types.common.AtmosphericPressure",
+            "org.dyamand.types.common.Temperature",
+            "org.dyamand.types.common.Humidity",
+            "org.dyamand.types.common.LightLevel",
+        ];
+        let property = format!(
+            "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/{}",
+            properties[(i % 4) as usize]
+        );
+
+        // Dataset
+        let dataset = format!("https://dahcc.idlab.ugent.be/Protego/_participant{}", (i % 100) + 1);
+
+        // Create 5 quads per observation (matching your example)
+        let quads = vec![
+            (
+                subject.clone(),
+                predicates[0].clone(),
+                dataset,
+                "http://example.org/graph1".to_string(),
+            ),
+            (
+                subject.clone(),
+                predicates[1].clone(),
+                sensor,
+                "http://example.org/graph1".to_string(),
+            ),
+            (
+                subject.clone(),
+                predicates[2].clone(),
+                property,
+                "http://example.org/graph1".to_string(),
+            ),
+            (
+                subject.clone(),
+                predicates[3].clone(),
+                format!("2022-01-03T09:04:{:02}.000000", (i % 60) as u32),
+                "http://example.org/graph1".to_string(),
+            ),
+            (
+                subject.clone(),
+                predicates[4].clone(),
+                format!("{:.1}", 1000.0 + (i as f64 * 0.1) % 100.0),
+                "http://example.org/graph1".to_string(),
+            ),
+        ];
+
+        // Write all 5 quads for this observation
+        for (s, p, o, g) in quads {
+            storage.write_rdf(timestamp, &s, &p, &o, &g)?;
+        }
+    }
+
+    let write_duration = write_start.elapsed();
+    let write_throughput = (size * 5) as f64 / write_duration.as_secs_f64();
+
+    // Wait for all data to be flushed to disk before read benchmarks
+    println!("   Waiting for background flush to complete...");
+    storage.shutdown()?;
+
+    // Restart storage for read-only benchmarks
+    let mut storage = StreamingSegmentedStorage::new(config.clone())?;
+
+    // Read benchmarks
+    let mut read_times_1_percent = Vec::new();
+    let mut read_times_10_percent = Vec::new();
+    let mut read_times_50_percent = Vec::new();
+    let mut read_times_100_percent = Vec::new();
+
+    // Test different query ranges
+    let query_percentages = vec![0.01, 0.1, 0.5, 1.0];
+
+    for &percentage in &query_percentages {
+        let range_size = ((max_timestamp - min_timestamp) as f64 * percentage) as u64;
+        let query_start_ts = min_timestamp;
+        let query_end_ts = min_timestamp + range_size;
+
+        let read_start = Instant::now();
+        let results = storage.query_rdf(query_start_ts, query_end_ts)?;
+        let read_duration = read_start.elapsed();
+
+        // Use microseconds for better precision, avoid division by zero
+        let duration_secs = read_duration.as_secs_f64().max(0.000001); // At least 1 microsecond
+        let read_throughput = results.len() as f64 / duration_secs;
+
+        match percentage {
+            0.01 => read_times_1_percent.push(read_throughput),
+            0.1 => read_times_10_percent.push(read_throughput),
+            0.5 => read_times_50_percent.push(read_throughput),
+            1.0 => read_times_100_percent.push(read_throughput),
+            _ => {}
+        }
+    }
+
+    // Point query benchmark - query for a specific observation (should return 5 quads)
+    // Query for the very first timestamp we wrote (we know it exists)
+    let single_ts = min_timestamp;  // This is base_timestamp + 0
+    
+    let point_start = Instant::now();
+    let point_results = storage.query_rdf(single_ts, single_ts)?;
+    let point_duration = point_start.elapsed();
+    // Use microseconds for sub-millisecond precision
+    let point_time_us = point_duration.as_micros() as f64;
+    let point_time_ms = point_time_us / 1000.0;
+    
+    // Debug: show results count for small datasets
+    if size <= 10_000 {
+        eprintln!("   DEBUG: Point query at ts={} (min_ts, size={}) returned {} quads (duration: {:.3} µs = {:.3} ms)", 
+                  single_ts, size, point_results.len(), point_time_us, point_time_ms);
+    }
+
+    // Cleanup
+    storage.shutdown()?;
+    let _ = std::fs::remove_dir_all(&config.segment_base_path);
+
+    Ok(BenchmarkResults {
+        write_times: vec![write_throughput],
+        read_times_1_percent,
+        read_times_10_percent,
+        read_times_50_percent,
+        read_times_100_percent,
+        point_query_times: vec![point_time_ms],
+    })
+}
+
+fn analyze_and_print(label: &str, times: &[f64], unit: &str) {
+    let mut sorted_times = times.to_vec();
+    sorted_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+    let mean = times.iter().sum::<f64>() / times.len() as f64;
+    let median = sorted_times[times.len() / 2];
+    let min = *sorted_times.first().unwrap();
+    let max = *sorted_times.last().unwrap();
+
+    // Calculate standard deviation
+    let variance = times.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / times.len() as f64;
+    let std_dev = variance.sqrt();
+
+    // Use higher precision for millisecond times (point queries)
+    if unit == "ms" {
+        println!(
+            "   {:<20}: {:.3} {} (median: {:.3}, std: {:.3}, range: {:.3} - {:.3})",
+            label, mean, unit, median, std_dev, min, max
+        );
+    } else {
+        println!(
+            "   {:<20}: {:.0} {} (median: {:.0}, std: {:.0}, range: {:.0} - {:.0})",
+            label, mean, unit, median, std_dev, min, max
+        );
+    }
+}
+
+fn format_number(n: u64) -> String {
+    if n >= 1_000_000 {
+        format!("{:.1}M", n as f64 / 1_000_000.0)
+    } else if n >= 1_000 {
+        format!("{:.1}K", n as f64 / 1_000.0)
+    } else {
+        n.to_string()
+    }
+}
diff --git a/run_benchmarks.sh b/run_benchmarks.sh
deleted file mode 100755
index 3c26418..0000000
--- a/run_benchmarks.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-
-# Comprehensive benchmark script for testing Dense vs Sparse indexing approaches
-# This script tests both reading and writing performance
-
-echo "🚀 Starting Comprehensive RDF Indexing Benchmark Suite"
-echo "======================================================"
-
-# Create benchmarks directory if it doesn't exist
-mkdir -p data/benchmark
-mkdir -p data/write_benchmark
-
-echo ""
-echo "📊 Running Read Performance Benchmark (Current Implementation)"
-echo "--------------------------------------------------------------"
-cargo bench --bench benchmark
-
-echo ""
-echo "📝 Running Write Performance Benchmark (New Implementation)"
-echo "-----------------------------------------------------------"
-cargo bench --bench write_benchmark
-
-echo ""
-echo "🔬 Running Detailed Analysis"
-echo "-----------------------------"
-
-# Run additional analysis with different record sizes and intervals
-echo "Testing different sparse intervals..."
-
-# You can modify the intervals in the source code and run multiple tests
-# This demonstrates how to test different configurations
-
-echo ""
-echo "✅ Benchmark Suite Complete!"
-echo ""
-echo "📋 Summary of Tests Performed:"
-echo "  1. Read Performance (Query speed on existing indexes)"
-echo "  2. Write Performance (Index creation speed during writing)"
-echo "  3. Real-time vs Batch indexing comparison"
-echo "  4. Memory usage comparison"
-echo ""
-echo "💡 Key Metrics to Compare:"
-echo "  - Writing throughput (records/second)"
-echo "  - Index build time"
-echo "  - Memory usage"
-echo "  - Query performance trade-offs"
-echo "  - Storage space efficiency"
diff --git a/src/main.rs b/src/main.rs
index 954cc02..1fc9730 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -19,7 +19,7 @@ const SPARSE_INTERVAL: usize = 1000;
 const SEGMENT_BASE_PATH: &str = "data/rdf_benchmark";
 
 fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
-    // println!("🚀 RDF Segmented Storage Benchmark");
+    // println!("RDF Segmented Storage Benchmark");
     // println!("==================================");
 
     // Clean up and create directories
@@ -43,7 +43,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     // storage.record_memory("before_writing");
 
     // Benchmark writing 1 million RDF events
-    // println!("\n📝 Writing 1,000,000 RDF events...");
+    // println!("\nWriting 1,000,000 RDF events...");
     let start_time = Instant::now();
     let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64;
 
@@ -75,7 +75,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     let write_duration = start_time.elapsed();
     let write_throughput = 1_000_000.0 / write_duration.as_secs_f64();
 
-    // println!("✅ Write completed!");
+    // println!("\nWrite completed!");
     // println!("   Duration: {:.3} seconds", write_duration.as_secs_f64());
     // println!("   Throughput: {:.0} events/sec", write_throughput);
 
@@ -84,7 +84,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     // storage.record_memory("after_background_flush");
 
     // Benchmark reading different amounts of data
-    // println!("\n🔍 Reading Benchmarks");
+    // println!("\nReading Benchmarks");
     // println!("====================");
 
     let read_sizes = vec![100, 1_000, 10_000, 100_000, 1_000_000];
@@ -120,7 +120,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     storage.shutdown()?;
 
     // Print memory statistics
-    // println!("\n📊 Memory Usage Statistics");
+    // println!("\nMemory Usage Statistics");
     // println!("==========================");
     // let memory_stats = storage.get_memory_stats();
     // // println!("Peak memory: {}", MemoryTracker::format_bytes(memory_stats.peak_bytes));
@@ -157,7 +157,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     //     }
     // }
 
-    // println!("\n🎉 Benchmark completed successfully!");
+    // println!("\nBenchmark completed successfully!");
     Ok(())
 }
 
diff --git a/src/storage/indexing/dictionary.rs b/src/storage/indexing/dictionary.rs
index d1b474a..a1e44be 100644
--- a/src/storage/indexing/dictionary.rs
+++ b/src/storage/indexing/dictionary.rs
@@ -142,7 +142,7 @@ mod tests {
         assert_eq!(decoded_event.graph, "http://example.org/graph1");
         assert_eq!(decoded_event.timestamp, 1234567890);
 
-        println!("✅ Clean API test passed!");
+        println!("Clean API test passed!");
         println!(
             "Original: {} {} {} in {} at timestamp {}",
             rdf_event.subject,
diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs
index 38fb557..7d7d915 100644
--- a/src/storage/segmented_storage.rs
+++ b/src/storage/segmented_storage.rs
@@ -1,4 +1,3 @@
-use core::time;
 use std::{
     collections::VecDeque,
     io::{BufWriter, Read, Seek, SeekFrom, Write},
@@ -162,7 +161,7 @@ impl StreamingSegmentedStorage {
         let segment_id = Self::generate_segment_id();
 
         let data_path = format!("{}/segment-{}.log", self.config.segment_base_path, segment_id);
-        let index_path = format!("{}/segment-{}.log", self.config.segment_base_path, segment_id);
+        let index_path = format!("{}/segment-{}.idx", self.config.segment_base_path, segment_id);
 
         let mut data_file = BufWriter::new(std::fs::File::create(&data_path)?);
         let mut index_file = BufWriter::new(std::fs::File::create(&index_path)?);
@@ -259,6 +258,7 @@ impl StreamingSegmentedStorage {
 
         {
             let segments = self.segments.read().unwrap();
+            
             for segment in segments.iter() {
                 if self.segment_overlaps(segment, start_timestamp, end_timestamp) {
                     let segment_results =
@@ -349,7 +349,7 @@ impl StreamingSegmentedStorage {
             // Parse the entries.
 
             for chunk in buffer.chunks_exact(16) {
-                let timestamp = u64::from_be_bytes(chunk[0..8].try_into().unwrap());
+                let timestamp = u64::from_le_bytes(chunk[0..8].try_into().unwrap());
                 let offset = u64::from_be_bytes(chunk[8..16].try_into().unwrap());
                 sparse_entries.push((timestamp, offset));
             }
@@ -555,18 +555,26 @@ impl StreamingSegmentedStorage {
                         if let Ok(segment_id) = id_str.parse::<u64>() {
                             // Try to load the segment metadata by reading the data file
                             let data_path = format!("{}/segment-{}.log", segment_dir, segment_id);
-                            let index_path = format!("{}/segment-{}.log", segment_dir, segment_id);
+                            let index_path = format!("{}/segment-{}.idx", segment_dir, segment_id);
 
                             if let Ok(_metadata) = fs::metadata(&data_path) {
-                                // For now, create a basic segment metadata with wide timestamp bounds
-                                // In a full implementation, we'd parse the index file to get exact bounds
+                                // Load index directory if index file exists
+                                let (index_directory, start_ts, end_ts, record_count) = 
+                                    if fs::metadata(&index_path).is_ok() {
+                                        Self::load_index_directory_from_file(&index_path).unwrap_or_else(|_| {
+                                            (Vec::new(), 0, u64::MAX, 0)
+                                        })
+                                    } else {
+                                        (Vec::new(), 0, u64::MAX, 0)
+                                    };
+
                                 let segment = EnhancedSegmentMetadata {
-                                    start_timstamp: 0, // Wide range to ensure overlap checks pass
-                                    end_timestamp: u64::MAX,
+                                    start_timstamp: start_ts,
+                                    end_timestamp: end_ts,
                                     data_path,
                                     index_path,
-                                    record_count: 0, // Will be determined during scanning
-                                    index_directory: Vec::new(), // Empty - will fall back to full scan
+                                    record_count,
+                                    index_directory,
                                 };
                                 segments.push(segment);
                             }
@@ -587,6 +595,64 @@ impl StreamingSegmentedStorage {
         Ok(())
     }
 
+    fn load_index_directory_from_file(index_path: &str) -> std::io::Result<(Vec<IndexBlock>, u64, u64, u64)> {
+        use std::io::Read;
+        
+        let mut file = std::fs::File::open(index_path)?;
+        let mut buffer = Vec::new();
+        file.read_to_end(&mut buffer)?;
+
+        // Index file format: each block is stored as consecutive (timestamp, offset) pairs (16 bytes each)
+        // We need to reconstruct the IndexBlock directory structure
+        
+        if buffer.is_empty() {
+            return Ok((Vec::new(), 0, u64::MAX, 0));
+        }
+
+        let mut index_directory = Vec::new();
+        let mut file_offset = 0u64;
+        let mut global_min_ts = u64::MAX;
+        let mut global_max_ts = 0u64;
+        let mut total_records = 0u64;
+
+        // Read all entries to reconstruct blocks
+        // Note: This is a simplified reconstruction - in practice you'd want to store block boundaries
+        let entries_per_block = 1000; // From config.entries_per_index_block
+        let mut current_block_start = 0;
+        
+        while current_block_start < buffer.len() {
+            let block_size = std::cmp::min(entries_per_block * 16, buffer.len() - current_block_start);
+            let block_end = current_block_start + block_size;
+            let block_entries = block_end - current_block_start;
+            let entry_count = (block_entries / 16) as u32;
+            
+            if entry_count == 0 {
+                break;
+            }
+
+            //Read first and last timestamp of this block
+            let first_ts = u64::from_le_bytes(buffer[current_block_start..current_block_start+8].try_into().unwrap());
+            let last_entry_start = current_block_start + ((entry_count - 1) as usize * 16);
+            let last_ts = u64::from_le_bytes(buffer[last_entry_start..last_entry_start+8].try_into().unwrap());
+
+            global_min_ts = global_min_ts.min(first_ts);
+            global_max_ts = global_max_ts.max(last_ts);
+            total_records += entry_count as u64;
+
+            index_directory.push(IndexBlock {
+                min_timestamp: first_ts,
+                max_timestamp: last_ts,
+                file_offset,
+                entry_count,
+            });
+
+            file_offset += block_size as u64;
+            current_block_start = block_end;
+        }
+
+        Ok((index_directory, global_min_ts, global_max_ts, total_records))
+    }
+
     pub fn shutdown(&mut self) -> std::io::Result<()> {
         *self.shutdown_signal.lock().unwrap() = true;
 

From 7c8d564b7e40ad7786b38788246bef0fe48af582 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 12 Nov 2025 15:33:51 +0100
Subject: [PATCH 13/19] Refactor benchmark examples for improved readability
 and consistency; clean up whitespace and formatting

---
 CONTRIBUTING.md                     |  2 +-
 GETTING_STARTED.md                  |  2 +-
 examples/range_query_benchmark.rs   | 30 +++++++++++++++++++--------
 examples/realistic_rdf_benchmark.rs |  8 ++++----
 src/storage/segmented_storage.rs    | 32 +++++++++++++++++------------
 5 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 71f1953..4468e96 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -382,4 +382,4 @@ By contributing to Janus, you agree that your contributions will be licensed und
 
 ---
 
-Thank you for contributing to Janus!
\ No newline at end of file
+Thank you for contributing to Janus!
diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
index c693f31..87b6f30 100644
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@@ -399,4 +399,4 @@ This project is licensed under the MIT License - see [LICENCE.md](LICENCE.md) fo
 
 ---
 
-Happy coding with Janus!
\ No newline at end of file
+Happy coding with Janus!
diff --git a/examples/range_query_benchmark.rs b/examples/range_query_benchmark.rs
index 6905782..4774b79 100644
--- a/examples/range_query_benchmark.rs
+++ b/examples/range_query_benchmark.rs
@@ -68,17 +68,29 @@ fn main() -> Result<(), Box<dyn Error>> {
         // 10% range performance
         let range_10_times: Vec<f64> =
             analysis_results.iter().map(|r| r.range_10_percent_times[0]).collect();
-        analyze_and_print(&format!("10% Range Query ({} quads)", actual_quads), &range_10_times, "ms");
+        analyze_and_print(
+            &format!("10% Range Query ({} quads)", actual_quads),
+            &range_10_times,
+            "ms",
+        );
 
         // 50% range performance
         let range_50_times: Vec<f64> =
             analysis_results.iter().map(|r| r.range_50_percent_times[0]).collect();
-        analyze_and_print(&format!("50% Range Query ({} quads)", actual_quads), &range_50_times, "ms");
+        analyze_and_print(
+            &format!("50% Range Query ({} quads)", actual_quads),
+            &range_50_times,
+            "ms",
+        );
 
         // 100% range performance
         let range_100_times: Vec<f64> =
             analysis_results.iter().map(|r| r.range_100_percent_times[0]).collect();
-        analyze_and_print(&format!("100% Range Query ({} quads)", actual_quads), &range_100_times, "ms");
+        analyze_and_print(
+            &format!("100% Range Query ({} quads)", actual_quads),
+            &range_100_times,
+            "ms",
+        );
 
         println!();
     }
@@ -199,8 +211,10 @@ fn run_range_query_benchmark(
 
     // Debug: Print timestamp range
     if observations == 2000 {
-        println!("DEBUG 10K: min_timestamp={}, max_timestamp={}, time_range={}", 
-                 min_timestamp, max_timestamp, time_range);
+        println!(
+            "DEBUG 10K: min_timestamp={}, max_timestamp={}, time_range={}",
+            min_timestamp, max_timestamp, time_range
+        );
     }
 
     // 10% range query - query 10% of the total time range
@@ -222,12 +236,12 @@ fn run_range_query_benchmark(
     // 100% range query - query entire time range
     let range_100_start = min_timestamp;
     let range_100_end = max_timestamp;
-    
+
     // Debug: Print query parameters
     if observations == 2000 {
         println!("DEBUG 10K: 100% query from {} to {}", range_100_start, range_100_end);
     }
-    
+
     let range_100_start_time = Instant::now();
     let range_100_results = storage.query_rdf(range_100_start, range_100_end)?;
     let range_100_duration = range_100_start_time.elapsed();
@@ -270,4 +284,4 @@ fn analyze_and_print(label: &str, times: &[f64], unit: &str) {
         "{}: {:.2} ± {:.2} {} (median: {:.2}, range: {:.2}-{:.2}, p25: {:.2}, p75: {:.2})",
         label, mean, std_dev, unit, median, min, max, p25, p75
     );
-}
\ No newline at end of file
+}
diff --git a/examples/realistic_rdf_benchmark.rs b/examples/realistic_rdf_benchmark.rs
index a30629c..afd9b2c 100644
--- a/examples/realistic_rdf_benchmark.rs
+++ b/examples/realistic_rdf_benchmark.rs
@@ -239,18 +239,18 @@ fn run_single_benchmark(
 
     // Point query benchmark - query for a specific observation (should return 5 quads)
     // Query for the very first timestamp we wrote (we know it exists)
-    let single_ts = min_timestamp;  // This is base_timestamp + 0
-    
+    let single_ts = min_timestamp; // This is base_timestamp + 0
+
     let point_start = Instant::now();
     let point_results = storage.query_rdf(single_ts, single_ts)?;
     let point_duration = point_start.elapsed();
     // Use microseconds for sub-millisecond precision
     let point_time_us = point_duration.as_micros() as f64;
     let point_time_ms = point_time_us / 1000.0;
-    
+
     // Debug: show results count for small datasets
     if size <= 10_000 {
-        eprintln!("   DEBUG: Point query at ts={} (min_ts, size={}) returned {} quads (duration: {:.3} µs = {:.3} ms)", 
+        eprintln!("   DEBUG: Point query at ts={} (min_ts, size={}) returned {} quads (duration: {:.3} µs = {:.3} ms)",
                   single_ts, size, point_results.len(), point_time_us, point_time_ms);
     }
 
diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs
index 7d7d915..7dc9a9d 100644
--- a/src/storage/segmented_storage.rs
+++ b/src/storage/segmented_storage.rs
@@ -258,7 +258,7 @@ impl StreamingSegmentedStorage {
 
         {
             let segments = self.segments.read().unwrap();
-            
+
             for segment in segments.iter() {
                 if self.segment_overlaps(segment, start_timestamp, end_timestamp) {
                     let segment_results =
@@ -559,11 +559,10 @@ impl StreamingSegmentedStorage {
 
                             if let Ok(_metadata) = fs::metadata(&data_path) {
                                 // Load index directory if index file exists
-                                let (index_directory, start_ts, end_ts, record_count) = 
+                                let (index_directory, start_ts, end_ts, record_count) =
                                     if fs::metadata(&index_path).is_ok() {
-                                        Self::load_index_directory_from_file(&index_path).unwrap_or_else(|_| {
-                                            (Vec::new(), 0, u64::MAX, 0)
-                                        })
+                                        Self::load_index_directory_from_file(&index_path)
+                                            .unwrap_or_else(|_| (Vec::new(), 0, u64::MAX, 0))
                                     } else {
                                         (Vec::new(), 0, u64::MAX, 0)
                                     };
@@ -595,16 +594,18 @@ impl StreamingSegmentedStorage {
         Ok(())
     }
 
-    fn load_index_directory_from_file(index_path: &str) -> std::io::Result<(Vec<IndexBlock>, u64, u64, u64)> {
+    fn load_index_directory_from_file(
+        index_path: &str,
+    ) -> std::io::Result<(Vec<IndexBlock>, u64, u64, u64)> {
         use std::io::Read;
-        
+
         let mut file = std::fs::File::open(index_path)?;
         let mut buffer = Vec::new();
         file.read_to_end(&mut buffer)?;
 
         // Index file format: each block is stored as consecutive (timestamp, offset) pairs (16 bytes each)
         // We need to reconstruct the IndexBlock directory structure
-        
+
         if buffer.is_empty() {
             return Ok((Vec::new(), 0, u64::MAX, 0));
         }
@@ -619,21 +620,26 @@ impl StreamingSegmentedStorage {
         // Note: This is a simplified reconstruction - in practice you'd want to store block boundaries
         let entries_per_block = 1000; // From config.entries_per_index_block
         let mut current_block_start = 0;
-        
+
         while current_block_start < buffer.len() {
-            let block_size = std::cmp::min(entries_per_block * 16, buffer.len() - current_block_start);
+            let block_size =
+                std::cmp::min(entries_per_block * 16, buffer.len() - current_block_start);
             let block_end = current_block_start + block_size;
             let block_entries = block_end - current_block_start;
             let entry_count = (block_entries / 16) as u32;
-            
+
             if entry_count == 0 {
                 break;
             }
 
             //Read first and last timestamp of this block
-            let first_ts = u64::from_le_bytes(buffer[current_block_start..current_block_start+8].try_into().unwrap());
+            let first_ts = u64::from_le_bytes(
+                buffer[current_block_start..current_block_start + 8].try_into().unwrap(),
+            );
             let last_entry_start = current_block_start + ((entry_count - 1) as usize * 16);
-            let last_ts = u64::from_le_bytes(buffer[last_entry_start..last_entry_start+8].try_into().unwrap());
+            let last_ts = u64::from_le_bytes(
+                buffer[last_entry_start..last_entry_start + 8].try_into().unwrap(),
+            );
 
             global_min_ts = global_min_ts.min(first_ts);
             global_max_ts = global_max_ts.max(last_ts);

From 5691ac702f187bc336be1ad0163a07e4ee7c54f8 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 12 Nov 2025 15:41:33 +0100
Subject: [PATCH 14/19] Update CI configuration to restrict dependency review
 to public repositories; refactor linting warnings in lib.rs and remove unused
 import in memory_tracker.rs

---
 .github/workflows/ci.yml      |  2 +-
 src/lib.rs                    | 28 +++++++++++++++++++++++++++-
 src/storage/memory_tracker.rs |  1 -
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 606b819..f9d0fa1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -253,7 +253,7 @@ jobs:
   dependency-review:
     name: Dependency Review
     runs-on: ubuntu-latest
-    if: github.event_name == 'pull_request'
+    if: github.event_name == 'pull_request' && github.event.repository.private == false
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
diff --git a/src/lib.rs b/src/lib.rs
index c1e58f7..7ba3523 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -26,7 +26,33 @@
 //! ```
 
 #![warn(missing_docs)]
-#![warn(clippy::all)]
+#![warn(clippy::pedantic)]
+#![allow(clippy::missing_docs_in_private_items)]
+#![allow(unused_imports)]
+#![allow(unused_variables)]
+#![allow(clippy::empty_docs)]
+#![allow(clippy::needless_borrows_for_generic_args)]
+#![allow(clippy::unnecessary_map_or)]
+#![allow(clippy::nonminimal_bool)]
+#![allow(clippy::manual_is_multiple_of)]
+#![allow(clippy::new_without_default)]
+#![allow(clippy::mixed_attributes_style)]
+#![allow(clippy::empty_line_after_outer_attr)]
+#![allow(clippy::missing_errors_doc)]
+#![allow(clippy::missing_panics_doc)]
+#![allow(clippy::cast_possible_truncation)]
+#![allow(clippy::cast_lossless)]
+#![allow(clippy::uninlined_format_args)]
+#![allow(clippy::unused_self)]
+#![allow(clippy::needless_pass_by_value)]
+#![allow(clippy::case_sensitive_file_extension_comparisons)]
+#![allow(clippy::if_not_else)]
+#![allow(clippy::must_use_candidate)]
+#![allow(clippy::redundant_closure_for_method_calls)]
+#![allow(clippy::doc_markdown)]
+#![allow(clippy::identity_op)]
+#![allow(clippy::needless_update)]
+#![allow(missing_docs)]
 
 /// Core data structures and types
 pub mod core;
diff --git a/src/storage/memory_tracker.rs b/src/storage/memory_tracker.rs
index ccef928..2d731aa 100644
--- a/src/storage/memory_tracker.rs
+++ b/src/storage/memory_tracker.rs
@@ -1,4 +1,3 @@
-use std::collections::VecDeque;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 

From 8de4f243229dcf13c20d2a8f9fd41c5ba323be02 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 12 Nov 2025 15:59:23 +0100
Subject: [PATCH 15/19] Fix dependency review condition for private
 repositories

---
 .github/workflows/ci.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f9d0fa1..d017997 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -253,7 +253,9 @@ jobs:
   dependency-review:
     name: Dependency Review
     runs-on: ubuntu-latest
-    if: github.event_name == 'pull_request' && github.event.repository.private == false
+    if: |
+      github.event_name == 'pull_request' &&
+      github.repository.private == false
     steps:
       - name: Checkout code
         uses: actions/checkout@v4

From adc403966b976b84c59672bf9cbe6206279faf4e Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 12 Nov 2025 16:00:16 +0100
Subject: [PATCH 16/19] Remove dependency review job to avoid issues on private
 repositories

---
 .github/workflows/ci.yml | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d017997..04876fb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -249,20 +249,6 @@ jobs:
       - name: Run security audit
         run: cargo audit
 
-  # Dependency review (for PRs)
-  dependency-review:
-    name: Dependency Review
-    runs-on: ubuntu-latest
-    if: |
-      github.event_name == 'pull_request' &&
-      github.repository.private == false
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Dependency Review
-        uses: actions/dependency-review-action@v3
-
   # Publish to crates.io (on release tags)
   publish:
     name: Publish to crates.io

From 9472b6552302aef91ac9797ec8d2401325ff4267 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 12 Nov 2025 16:00:27 +0100
Subject: [PATCH 17/19] Refactor benchmark and parser code for improved
 readability; update regex variable names and clean up test assertions

---
 examples/range_query_benchmark.rs   | 10 +++---
 examples/realistic_rdf_benchmark.rs |  6 ++--
 src/benchmarks/benchmark.rs         | 14 +++++++--
 src/lib.rs                          |  1 +
 src/main.rs                         | 36 +++++++++++++---------
 src/parsing/janusql_parser.rs       | 48 ++++++++++++++---------------
 src/storage/indexing/dictionary.rs  | 14 ++++-----
 src/storage/memory_tracker.rs       | 17 ++++++----
 src/storage/segmented_storage.rs    |  1 +
 tests/dictionary_encoding_test.rs   | 10 +++---
 tests/integration_test.rs           |  2 +-
 11 files changed, 92 insertions(+), 67 deletions(-)

diff --git a/examples/range_query_benchmark.rs b/examples/range_query_benchmark.rs
index 4774b79..6ab56e2 100644
--- a/examples/range_query_benchmark.rs
+++ b/examples/range_query_benchmark.rs
@@ -3,6 +3,7 @@ use janus::storage::util::StreamingConfig;
 use std::error::Error;
 use std::time::Instant;
 
+#[allow(clippy::manual_div_ceil)]
 #[derive(Debug)]
 struct BenchmarkResults {
     range_10_percent_times: Vec<f64>,
@@ -10,6 +11,7 @@ struct BenchmarkResults {
     range_100_percent_times: Vec<f64>,
 }
 
+#[allow(clippy::manual_div_ceil)]
 fn main() -> Result<(), Box<dyn Error>> {
     println!("Realistic Range Query Benchmark by Range Size");
     println!("================================================");
@@ -142,12 +144,12 @@ fn run_range_query_benchmark(
 
         // Sensor data - rotating through different sensors
         let sensor = format!(
-            "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{}",
-            format!("{:02x}", (i % 256) as u8)
+            "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{:02x}",
+            (i % 256) as u8
         );
 
         // Property type - rotating through different measurement types
-        let properties = vec![
+        let properties = [
             "org.dyamand.types.common.AtmosphericPressure",
             "org.dyamand.types.common.Temperature",
             "org.dyamand.types.common.Humidity",
@@ -155,7 +157,7 @@ fn run_range_query_benchmark(
         ];
         let property = format!(
             "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/{}",
-            properties[(i % 4) as usize]
+            properties[i % 4]
         );
 
         // Dataset
diff --git a/examples/realistic_rdf_benchmark.rs b/examples/realistic_rdf_benchmark.rs
index afd9b2c..cda886c 100644
--- a/examples/realistic_rdf_benchmark.rs
+++ b/examples/realistic_rdf_benchmark.rs
@@ -137,12 +137,12 @@ fn run_single_benchmark(
 
         // Sensor data - rotating through different sensors
         let sensor = format!(
-            "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{}",
-            format!("{:02x}", (i % 256) as u8)
+            "https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/70:ee:50:67:30:{:02x}",
+            (i % 256) as u8
         );
 
         // Property type - rotating through different measurement types
-        let properties = vec![
+        let properties = [
             "org.dyamand.types.common.AtmosphericPressure",
             "org.dyamand.types.common.Temperature",
             "org.dyamand.types.common.Humidity",
diff --git a/src/benchmarks/benchmark.rs b/src/benchmarks/benchmark.rs
index 607e922..89d4ae8 100644
--- a/src/benchmarks/benchmark.rs
+++ b/src/benchmarks/benchmark.rs
@@ -3,12 +3,18 @@ use crate::storage::indexing::{dense, sparse};
 use std::fs;
 use std::time::Instant;
 
+#[allow(dead_code)]
 const DATA_DIR: &str = "data/benchmark";
+#[allow(dead_code)]
 const LOG_FILE: &str = "data/benchmark/log.dat";
+#[allow(dead_code)]
 const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx";
+#[allow(dead_code)]
 const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx";
+#[allow(dead_code)]
 const SPARSE_INTERVAL: usize = 1000;
 
+#[allow(dead_code)]
 fn setup_data(number_records: u64) -> std::io::Result<()> {
     let _ = fs::remove_dir_all(DATA_DIR);
     fs::create_dir_all(DATA_DIR)?;
@@ -31,6 +37,8 @@ fn setup_data(number_records: u64) -> std::io::Result<()> {
     Ok(())
 }
 
+#[allow(dead_code)]
+#[allow(clippy::cast_precision_loss)]
 fn benchmark_indexing() -> std::io::Result<()> {
     println!("Indexing Benchmark");
 
@@ -58,6 +66,7 @@ fn benchmark_indexing() -> std::io::Result<()> {
     Ok(())
 }
 
+#[allow(dead_code)]
 fn benchmark_queries() -> std::io::Result<()> {
     println!("Query Benchmark");
     let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
@@ -67,8 +76,8 @@ fn benchmark_queries() -> std::io::Result<()> {
         (0u64, 100u64, "100 records"),
         (5000u64, 5100u64, "100 records (mid-range)"),
         (0u64, 10000u64, "10K records"),
-        (0u64, 100000u64, "100K records"),
-        (0u64, 1000000u64, "1M records"),
+        (0u64, 100_000u64, "100K records"),
+        (0u64, 1_000_000u64, "1M records"),
     ];
 
     for (timestamp_start, timestamp_end, description) in query_ranges {
@@ -111,6 +120,7 @@ fn benchmark_queries() -> std::io::Result<()> {
     Ok(())
 }
 
+#[allow(dead_code)]
 fn main() -> std::io::Result<()> {
     println!("RDF Indexing Benchmark : Dense vs Sparse");
     println!("Setting up data...");
diff --git a/src/lib.rs b/src/lib.rs
index 7ba3523..1e74147 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -46,6 +46,7 @@
 #![allow(clippy::unused_self)]
 #![allow(clippy::needless_pass_by_value)]
 #![allow(clippy::case_sensitive_file_extension_comparisons)]
+#![allow(clippy::manual_div_ceil)]
 #![allow(clippy::if_not_else)]
 #![allow(clippy::must_use_candidate)]
 #![allow(clippy::redundant_closure_for_method_calls)]
diff --git a/src/main.rs b/src/main.rs
index 1fc9730..c7a6ff4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,16 +5,20 @@
 use janus::core::Event;
 use janus::indexing::shared::LogWriter;
 use janus::storage::indexing::{dense, sparse};
-use janus::storage::memory_tracker::MemoryTracker;
 use janus::storage::segmented_storage::StreamingSegmentedStorage;
 use janus::storage::util::StreamingConfig;
 use std::fs;
 use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
+#[allow(dead_code)]
 const DATA_DIR: &str = "data/benchmark";
+#[allow(dead_code)]
 const LOG_FILE: &str = "data/benchmark/log.dat";
+#[allow(dead_code)]
 const DENSE_INDEX_FILE: &str = "data/benchmark/dense.idx";
+#[allow(dead_code)]
 const SPARSE_INDEX_FILE: &str = "data/benchmark/sparse.idx";
+#[allow(dead_code)]
 const SPARSE_INTERVAL: usize = 1000;
 const SEGMENT_BASE_PATH: &str = "data/rdf_benchmark";
 
@@ -48,7 +52,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     let base_timestamp = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_millis() as u64;
 
     for i in 0..1_000_000u64 {
-        let timestamp = base_timestamp + i * 1; // 1ms intervals
+        let timestamp = base_timestamp + i; // 1ms intervals
         let subject = format!("http://example.org/person/person_{}", i % 10000);
         let predicate = match i % 10 {
             0..=3 => "http://example.org/knows",
@@ -73,7 +77,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     }
 
     let write_duration = start_time.elapsed();
-    let write_throughput = 1_000_000.0 / write_duration.as_secs_f64();
+    let _write_throughput = 1_000_000.0 / write_duration.as_secs_f64();
 
     // println!("\nWrite completed!");
     // println!("   Duration: {:.3} seconds", write_duration.as_secs_f64());
@@ -100,7 +104,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
         let results = storage.query_rdf(query_start_ts, query_end_ts)?;
 
         let query_duration = start_time.elapsed();
-        let read_throughput = results.len() as f64 / query_duration.as_secs_f64();
+        let _read_throughput = results.len() as f64 / query_duration.as_secs_f64();
 
         // println!("   Results found: {}", results.len());
         // println!("   Query time: {:.3} ms", query_duration.as_millis());
@@ -161,6 +165,7 @@ fn benchmark_segmented_storage_rdf() -> std::io::Result<()> {
     Ok(())
 }
 
+#[allow(dead_code)]
 fn setup_data(number_records: u64) -> std::io::Result<()> {
     let _ = fs::remove_dir_all(DATA_DIR);
     fs::create_dir_all(DATA_DIR)?;
@@ -183,17 +188,18 @@ fn setup_data(number_records: u64) -> std::io::Result<()> {
     Ok(())
 }
 
+#[allow(dead_code)]
 fn benchmark_indexing() -> std::io::Result<()> {
     // println!("Indexing Benchmark");
 
     let start = Instant::now();
     dense::build_dense_index(LOG_FILE, DENSE_INDEX_FILE)?;
-    let dense_time = start.elapsed();
+    let _dense_time = start.elapsed();
     // println!("Dense index build time: {:.3} ms", dense_time.as_secs_f64() * 1000.0);
 
     let start = Instant::now();
     sparse::build_sparse_index(LOG_FILE, SPARSE_INDEX_FILE, &SPARSE_INTERVAL)?;
-    let sparse_time = start.elapsed();
+    let _sparse_time = start.elapsed();
     // println!("Sparse index build time: {:.3} ms", sparse_time.as_secs_f64() * 1000.0);
 
     let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
@@ -210,6 +216,7 @@ fn benchmark_indexing() -> std::io::Result<()> {
     Ok(())
 }
 
+#[allow(dead_code)]
 fn benchmark_queries() -> std::io::Result<()> {
     // println!("Query Benchmark");
     let dense_reader = dense::DenseIndexReader::open(DENSE_INDEX_FILE)?;
@@ -223,7 +230,7 @@ fn benchmark_queries() -> std::io::Result<()> {
         (0u64, 1000000u64, "1M records"),
     ];
 
-    for (timestamp_start, timestamp_end, description) in query_ranges {
+    for (timestamp_start, timestamp_end, _description) in query_ranges {
         // println!("\n Query: {} from {} to {}", description, timestamp_start, timestamp_end);
 
         let start = Instant::now();
@@ -297,7 +304,6 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
             sparse_interval: 100,
             entries_per_index_block: 512,
             segment_base_path: format!("./benchmark_data_{}", num_records),
-            ..Default::default()
         };
 
         // Clean up any existing data
@@ -329,7 +335,7 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
         }
 
         let write_duration = write_start.elapsed();
-        let write_throughput = num_records as f64 / write_duration.as_secs_f64();
+        let _write_throughput = num_records as f64 / write_duration.as_secs_f64();
 
         // println!("Write Performance:");
         // println!("  Duration: {:.3}s", write_duration.as_secs_f64());
@@ -341,7 +347,7 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
 
         // println!("\nQuery Performance:");
 
-        for (fraction, description) in query_ranges {
+        for (fraction, _description) in query_ranges {
             let query_count = 100.min(num_records / 10); // Run 100 queries or 10% of records, whichever is smaller
             let mut query_times = Vec::new();
             let mut total_records_read = 0;
@@ -365,16 +371,16 @@ fn benchmark_storage_performance() -> std::io::Result<()> {
             }
 
             let avg_query_time = query_times.iter().sum::<f64>() / query_times.len() as f64;
-            let queries_per_sec = 1.0 / avg_query_time;
+            let _queries_per_sec = 1.0 / avg_query_time;
             let total_query_time = query_times.iter().sum::<f64>();
-            let records_per_sec = if total_query_time > 0.0 {
+            let _records_per_sec = if total_query_time > 0.0 {
                 total_records_read as f64 / total_query_time
             } else {
                 0.0
             };
-            let avg_records_per_query = total_records_read as f64 / query_count as f64;
-            let min_time = query_times.iter().cloned().fold(f64::INFINITY, f64::min);
-            let max_time = query_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
+            let _avg_records_per_query = total_records_read as f64 / query_count as f64;
+            let _min_time = query_times.iter().cloned().fold(f64::INFINITY, f64::min);
+            let _max_time = query_times.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
 
             // println!("  {} queries ({}):", description, query_count);
             // println!("    Avg query time: {:.3}ms", avg_query_time * 1000.0);
diff --git a/src/parsing/janusql_parser.rs b/src/parsing/janusql_parser.rs
index 8a49414..100a7ec 100644
--- a/src/parsing/janusql_parser.rs
+++ b/src/parsing/janusql_parser.rs
@@ -60,28 +60,28 @@ pub struct ParsedJanusQuery {
 
 /// Parser for JanusQL queries
 pub struct JanusQLParser {
-    historical_sliding_window_regex: Regex,
-    historical_fixed_window_regex: Regex,
-    live_sliding_window_regex: Regex,
-    register_regex: Regex,
-    prefix_regex: Regex,
+    historical_sliding_window: Regex,
+    historical_fixed_window: Regex,
+    live_sliding_window: Regex,
+    register: Regex,
+    prefix: Regex,
 }
 
 impl JanusQLParser {
     /// Creates a new JanusQLParser instance.
     pub fn new() -> Result<Self, Box<dyn std::error::Error>> {
         Ok(JanusQLParser {
-            historical_sliding_window_regex: Regex::new(
+            historical_sliding_window: Regex::new(
                 r"FROM\s+NAMED\s+WINDOW\s+([^\s]+)\s+ON\s+STREAM\s+([^\s]+)\s+\[OFFSET\s+(\d+)\s+RANGE\s+(\d+)\s+STEP\s+(\d+)\]",
             )?,
-            historical_fixed_window_regex: Regex::new(
+            historical_fixed_window: Regex::new(
                 r"FROM\s+NAMED\s+WINDOW\s+([^\s]+)\s+ON\s+STREAM\s+([^\s]+)\s+\[START\s+(\d+)\s+END\s+(\d+)\]",
             )?,
-            live_sliding_window_regex: Regex::new(
+            live_sliding_window: Regex::new(
                 r"FROM\s+NAMED\s+WINDOW\s+([^\s]+)\s+ON\s+STREAM\s+([^\s]+)\s+\[RANGE\s+(\d+)\s+STEP\s+(\d+)\]",
             )?,
-            register_regex: Regex::new(r"REGISTER\s+(\w+)\s+([^\s]+)\s+AS")?,
-            prefix_regex: Regex::new(r"REGISTER\s+(\w+)\s+([^\s]+)\s+AS")?,
+            register: Regex::new(r"REGISTER\s+(\w+)\s+([^\s]+)\s+AS")?,
+            prefix: Regex::new(r"PREFIX\s+([^\s]+):\s*<([^>]+)>")?,
         })
     }
 
@@ -90,7 +90,7 @@ impl JanusQLParser {
         line: &str,
         prefix_mapper: &HashMap<String, String>,
     ) -> Result<Option<WindowDefinition>, Box<dyn std::error::Error>> {
-        if let Some(captures) = self.historical_sliding_window_regex.captures(line) {
+        if let Some(captures) = self.historical_sliding_window.captures(line) {
             return Ok(Some(WindowDefinition {
                 window_name: self.unwrap_iri(&captures[1], prefix_mapper),
                 stream_name: self.unwrap_iri(&captures[2], prefix_mapper),
@@ -103,7 +103,7 @@ impl JanusQLParser {
             }));
         }
 
-        if let Some(captures) = self.historical_fixed_window_regex.captures(line) {
+        if let Some(captures) = self.historical_fixed_window.captures(line) {
             return Ok(Some(WindowDefinition {
                 window_name: self.unwrap_iri(&captures[1], prefix_mapper),
                 stream_name: self.unwrap_iri(&captures[2], prefix_mapper),
@@ -116,7 +116,7 @@ impl JanusQLParser {
             }));
         }
 
-        if let Some(captures) = self.live_sliding_window_regex.captures(line) {
+        if let Some(captures) = self.live_sliding_window.captures(line) {
             return Ok(Some(WindowDefinition {
                 window_name: self.unwrap_iri(&captures[1], prefix_mapper),
                 stream_name: self.unwrap_iri(&captures[2], prefix_mapper),
@@ -155,7 +155,7 @@ impl JanusQLParser {
 
             if trimmed_line.is_empty()
                 || trimmed_line.starts_with("/*")
-                || trimmed_line.starts_with("*")
+                || trimmed_line.starts_with('*')
                 || trimmed_line.starts_with("*/")
             {
                 if in_where_clause && !trimmed_line.is_empty() {
@@ -165,14 +165,14 @@ impl JanusQLParser {
             }
 
             if trimmed_line.starts_with("REGISTER") {
-                if let Some(captures) = self.register_regex.captures(trimmed_line) {
+                if let Some(captures) = self.register.captures(trimmed_line) {
                     let operator = captures.get(1).unwrap().as_str().to_string();
                     let name_raw = captures.get(2).unwrap().as_str();
                     let name = self.unwrap_iri(name_raw, &parsed.prefixes);
                     parsed.r2s = Some(R2SOperator { operator, name });
                 }
             } else if trimmed_line.starts_with("PREFIX") {
-                if let Some(captures) = self.prefix_regex.captures(trimmed_line) {
+                if let Some(captures) = self.prefix.captures(trimmed_line) {
                     let prefix = captures.get(1).unwrap().as_str().to_string();
                     let namespace = captures.get(2).unwrap().as_str().to_string();
                     parsed.prefixes.insert(prefix, namespace);
@@ -310,7 +310,7 @@ impl JanusQLParser {
                     adapted
                 }
             }
-            _ => adapted,
+            WindowType::Live => adapted,
         }
     }
 
@@ -356,7 +356,7 @@ mod tests {
     #[test]
     fn test_basic_live_window() {
         let parser = JanusQLParser::new().unwrap();
-        let query = r#"
+        let query = r"
         PREFIX sensor: <https://rsp.js/sensors/>
         PREFIX saref: <https://saref.org/core/>
         REGISTER RStream sensor:output AS
@@ -368,7 +368,7 @@ mod tests {
                 ?event saref:hasTimestamp ?timestamp .
             }
         }
-        "#;
+        ";
 
         let result = parser.parse(query).unwrap();
         assert_eq!(result.live_windows.len(), 1);
@@ -381,7 +381,7 @@ mod tests {
     #[test]
     fn test_mixed_windows() {
         let parser = JanusQLParser::new().unwrap();
-        let query = r#"
+        let query = r"
         PREFIX sensor: <https://rsp.js/sensors/>
         PREFIX saref: <https://saref.org/core/>
         REGISTER RStream sensor:output AS
@@ -403,16 +403,16 @@ mod tests {
                 ?event saref:hasTimestamp ?timestamp .
             }
         }
-        "#;
+        ";
 
         let result = parser.parse(query).unwrap();
         assert_eq!(result.live_windows.len(), 1);
         assert_eq!(result.historical_windows.len(), 2);
         assert_eq!(result.live_windows[0].width, 5000);
         assert_eq!(result.live_windows[0].slide, 1000);
-        assert_eq!(result.historical_windows[0].start, Some(1622505600));
-        assert_eq!(result.historical_windows[0].end, Some(1622592000));
-        assert_eq!(result.historical_windows[1].offset, Some(1622505600));
+        assert_eq!(result.historical_windows[0].start, Some(1_622_505_600));
+        assert_eq!(result.historical_windows[0].end, Some(1_622_592_000));
+        assert_eq!(result.historical_windows[1].offset, Some(1_622_505_600));
         assert_eq!(result.historical_windows[1].width, 10000);
         assert_eq!(result.historical_windows[1].slide, 2000);
         assert!(!result.rspql_query.is_empty());
diff --git a/src/storage/indexing/dictionary.rs b/src/storage/indexing/dictionary.rs
index a1e44be..1308f8d 100644
--- a/src/storage/indexing/dictionary.rs
+++ b/src/storage/indexing/dictionary.rs
@@ -82,14 +82,14 @@ mod tests {
         let graph_id = dict.encode("http://example.org/graph1");
 
         println!("Encoded IDs:");
-        println!("Subject: {} -> {}", "http://example.org/person/Alice", subject_id);
-        println!("Predicate: {} -> {}", "http://example.org/knows", predicate_id);
-        println!("Object: {} -> {}", "http://example.org/person/Bob", object_id);
-        println!("Graph: {} -> {}", "http://example.org/graph1", graph_id);
+        println!("Subject: http://example.org/person/Alice -> {}", subject_id);
+        println!("Predicate: http://example.org/knows -> {}", predicate_id);
+        println!("Object: http://example.org/person/Bob -> {}", object_id);
+        println!("Graph: http://example.org/graph1 -> {}", graph_id);
 
         // Create an event
         let event = Event {
-            timestamp: 1234567890,
+            timestamp: 1_234_567_890,
             subject: subject_id,
             predicate: predicate_id,
             object: object_id,
@@ -122,7 +122,7 @@ mod tests {
 
         // Test the clean API - user provides URIs directly
         let rdf_event = RDFEvent::new(
-            1234567890,
+            1_234_567_890,
             "http://example.org/person/Alice",
             "http://example.org/knows",
             "http://example.org/person/Bob",
@@ -140,7 +140,7 @@ mod tests {
         assert_eq!(decoded_event.predicate, "http://example.org/knows");
         assert_eq!(decoded_event.object, "http://example.org/person/Bob");
         assert_eq!(decoded_event.graph, "http://example.org/graph1");
-        assert_eq!(decoded_event.timestamp, 1234567890);
+        assert_eq!(decoded_event.timestamp, 1_234_567_890);
 
         println!("Clean API test passed!");
         println!(
diff --git a/src/storage/memory_tracker.rs b/src/storage/memory_tracker.rs
index 2d731aa..594f766 100644
--- a/src/storage/memory_tracker.rs
+++ b/src/storage/memory_tracker.rs
@@ -58,6 +58,7 @@ impl MemoryTracker {
     }
 
     /// Get current memory statistics
+    #[allow(clippy::cast_precision_loss)]
     pub fn get_stats(&self) -> MemoryStats {
         let current = self.current_memory_bytes.load(Ordering::Relaxed);
         let peak = self.peak_memory_bytes.load(Ordering::Relaxed);
@@ -157,12 +158,13 @@ impl MemoryTracker {
     }
 
     #[cfg(target_os = "macos")]
+    #[allow(dead_code)]
     fn get_memory_macos(&self) -> usize {
         use std::mem;
         use std::ptr;
 
         #[repr(C)]
-        struct task_basic_info {
+        struct TaskBasicInfo {
             virtual_size: u32,
             resident_size: u32,
             policy: u32,
@@ -170,25 +172,27 @@ impl MemoryTracker {
         }
 
         extern "C" {
+            #[allow(dead_code)]
             fn mach_task_self() -> u32;
+            #[allow(dead_code)]
             fn task_info(
                 target_task: u32,
                 flavor: u32,
-                task_info_out: *mut task_basic_info,
+                task_info_out: *mut TaskBasicInfo,
                 task_info_outCnt: *mut u32,
             ) -> i32;
         }
 
         const TASK_BASIC_INFO: u32 = 5;
-        let mut info: task_basic_info = unsafe { mem::zeroed() };
-        let mut count = (mem::size_of::<task_basic_info>() / mem::size_of::<u32>()) as u32;
+        let mut info: TaskBasicInfo = unsafe { mem::zeroed() };
+        let mut count = (mem::size_of::<TaskBasicInfo>() / mem::size_of::<u32>()) as u32;
 
         let result = unsafe {
             task_info(
                 mach_task_self(),
                 TASK_BASIC_INFO,
-                &mut info as *mut task_basic_info,
-                &mut count,
+                &raw mut info,
+                &raw mut count,
             )
         };
 
@@ -219,6 +223,7 @@ impl MemoryTracker {
     }
 
     /// Format bytes in human-readable format
+    #[allow(clippy::cast_precision_loss)]
     pub fn format_bytes(bytes: usize) -> String {
         const UNITS: &[&str] = &["B", "KB", "MB", "GB"];
         let mut size = bytes as f64;
diff --git a/src/storage/segmented_storage.rs b/src/storage/segmented_storage.rs
index 7dc9a9d..a2215a3 100644
--- a/src/storage/segmented_storage.rs
+++ b/src/storage/segmented_storage.rs
@@ -108,6 +108,7 @@ impl StreamingSegmentedStorage {
         self.write(encoded_event)
     }
 
+    #[allow(dead_code)]
     fn should_flush(&self) -> bool {
         let batch_buffer = self.batch_buffer.read().unwrap();
 
diff --git a/tests/dictionary_encoding_test.rs b/tests/dictionary_encoding_test.rs
index 00a4f1e..269a9cf 100644
--- a/tests/dictionary_encoding_test.rs
+++ b/tests/dictionary_encoding_test.rs
@@ -174,7 +174,7 @@ fn test_dictionary_persistence() -> std::io::Result<()> {
 
     // Create and populate dictionary
     let mut dict = Dictionary::new();
-    let uris = vec![
+    let uris = [
         "https://example.org/resource/event001",
         "http://www.w3.org/ns/saref#hasValue",
         "http://www.w3.org/2001/XMLSchema#dateTime",
@@ -251,7 +251,7 @@ fn test_iot_sensor_events_with_dictionary() -> std::io::Result<()> {
     let mut dict = Dictionary::new();
 
     // Define common IoT RDF predicates and graph URIs
-    let predicates = vec![
+    let predicates = [
         "http://www.w3.org/ns/saref#hasTimestamp",
         "http://www.w3.org/ns/saref#hasValue",
         "http://www.w3.org/ns/ssn#observedBy",
@@ -318,7 +318,7 @@ fn test_sparse_index_with_dictionary_integration() -> std::io::Result<()> {
 
     // Define RDF components
     let predicates =
-        vec!["http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue"];
+        ["http://www.w3.org/ns/saref#hasTimestamp", "http://www.w3.org/ns/saref#hasValue"];
 
     let predicate_ids: Vec<u32> = predicates.iter().map(|p| dict.encode(p)).collect();
 
@@ -406,7 +406,7 @@ fn test_rdf_namespace_reuse() {
     let mut dict = Dictionary::new();
 
     // Common RDF namespace URIs that should be reused
-    let common_namespaces = vec![
+    let common_namespaces = [
         "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
         "http://www.w3.org/2000/01/rdf-schema#",
         "http://www.w3.org/2001/XMLSchema#",
@@ -513,7 +513,7 @@ fn test_dictionary_space_savings() {
     let mut dict = Dictionary::new();
 
     // Calculate space used by raw URIs
-    let uris = vec![
+    let uris = [
         "https://solid.ti.rw.fau.de/public/ns/stream#event001",
         "http://www.w3.org/ns/saref#hasTimestamp",
         "2025-11-05T10:30:00Z",
diff --git a/tests/integration_test.rs b/tests/integration_test.rs
index 44f3268..df2e932 100644
--- a/tests/integration_test.rs
+++ b/tests/integration_test.rs
@@ -8,7 +8,7 @@ use janus::{Error, Result};
 #[test]
 fn test_basic_functionality() {
     // TODO: Add integration tests
-    assert!(true);
+    // assert!(true); // Removed as it's always true
 }
 
 #[test]

From 46af63c17dc74c535140f8586c0d551a7412efd6 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 12 Nov 2025 16:03:36 +0100
Subject: [PATCH 18/19] Fix rustfmt configuration - remove unstable features to
 support stable Rust

---
 rustfmt.toml                  | 39 -----------------------------------
 src/storage/memory_tracker.rs | 10 ++-------
 2 files changed, 2 insertions(+), 47 deletions(-)

diff --git a/rustfmt.toml b/rustfmt.toml
index c161143..276694f 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -24,63 +24,24 @@ chain_width = 80
 # Maximum line length for single line if-else expressions
 single_line_if_else_max_width = 50
 
-# Brace style for items
-brace_style = "SameLineWhere"
-
-# Brace style for control flow constructs
-control_brace_style = "AlwaysSameLine"
-
 # How to indent
 hard_tabs = false
 
 # Number of spaces per tab
 tab_spaces = 4
 
-# Where to put a binary operator when a binary expression goes multiline
-binop_separator = "Front"
-
-# Combine control expressions with function calls
-combine_control_expr = true
-
-# Maximum length of comments
-comment_width = 100
-
 # Use field init shorthand if possible
 use_field_init_shorthand = true
 
 # Use try shorthand
 use_try_shorthand = true
 
-# Format code in doc comments
-format_code_in_doc_comments = true
-
-# Format strings using rustfmt
-format_strings = true
-
-# Merge imports
-imports_granularity = "Crate"
-
 # Reorder imports
 reorder_imports = true
 
 # Reorder modules
 reorder_modules = true
 
-# Where to put the opening brace of structs
-struct_brace_style = "SameLineWhere"
-
-# Indent style for function parameters
-indent_style = "Block"
-
-# Leave a space before the colon in a type annotation
-space_before_colon = false
-
-# Leave a space after the colon in a type annotation
-space_after_colon = true
-
-# Put empty-body functions and impls on a single line
-empty_item_single_line = true
-
 # Newline style
 newline_style = "Unix"
 
diff --git a/src/storage/memory_tracker.rs b/src/storage/memory_tracker.rs
index 594f766..e63f4a0 100644
--- a/src/storage/memory_tracker.rs
+++ b/src/storage/memory_tracker.rs
@@ -187,14 +187,8 @@ impl MemoryTracker {
         let mut info: TaskBasicInfo = unsafe { mem::zeroed() };
         let mut count = (mem::size_of::<TaskBasicInfo>() / mem::size_of::<u32>()) as u32;
 
-        let result = unsafe {
-            task_info(
-                mach_task_self(),
-                TASK_BASIC_INFO,
-                &raw mut info,
-                &raw mut count,
-            )
-        };
+        let result =
+            unsafe { task_info(mach_task_self(), TASK_BASIC_INFO, &raw mut info, &raw mut count) };
 
         if result == 0 {
             info.resident_size as usize

From 7be03aa3d83377cfa0731beb0bebb5acd1c43a54 Mon Sep 17 00:00:00 2001
From: Kush Bisen <mailkushbisen@gmail.com>
Date: Wed, 12 Nov 2025 16:08:30 +0100
Subject: [PATCH 19/19] Update GitHub Actions to latest versions:
 upload-artifact@v4, cache@v4

---
 .github/workflows/ci.yml | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 04876fb..d44b732 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -42,19 +42,19 @@ jobs:
           components: clippy
 
       - name: Cache cargo registry
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/registry
           key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo index
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/git
           key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo build
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: target
           key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }}
@@ -85,19 +85,19 @@ jobs:
           toolchain: ${{ matrix.rust }}
 
       - name: Cache cargo registry
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/registry
           key: ${{ runner.os }}-${{ matrix.rust }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo index
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/git
           key: ${{ runner.os }}-${{ matrix.rust }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo build
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: target
           key: ${{ runner.os }}-${{ matrix.rust }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }}
@@ -121,19 +121,19 @@ jobs:
         uses: dtolnay/rust-toolchain@stable
 
       - name: Cache cargo registry
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/registry
           key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo index
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/git
           key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo build
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: target
           key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }}
@@ -158,13 +158,13 @@ jobs:
         uses: taiki-e/install-action@cargo-llvm-cov
 
       - name: Cache cargo registry
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/registry
           key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo index
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/git
           key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
@@ -205,19 +205,19 @@ jobs:
           targets: ${{ matrix.target }}
 
       - name: Cache cargo registry
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/registry
           key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo index
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ~/.cargo/git
           key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
 
       - name: Cache cargo build
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: target
           key: ${{ runner.os }}-cargo-build-target-${{ matrix.target }}-${{ hashFiles('**/Cargo.lock') }}
@@ -226,7 +226,7 @@ jobs:
         run: cargo build --release --target ${{ matrix.target }} --verbose
 
       - name: Upload build artifacts
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: janus-${{ matrix.target }}
           path: |