From 74cc11b4a2660c784381f649ad7ad4c28b7e2b31 Mon Sep 17 00:00:00 2001
From: Mazdak Farrokhzad
Date: Wed, 10 Dec 2025 14:19:11 +0100
Subject: [PATCH 1/4] adds point scan ABIs to rust bindings, and uses more
 point scans internally

---
 crates/bindings-macro/src/table.rs            |   2 +
 crates/bindings-sys/src/lib.rs                | 141 ++++++-
 crates/bindings/src/table.rs                  | 150 +++++--
 crates/core/src/db/relational_db.rs           |  15 +-
 crates/core/src/host/instance_env.rs          | 111 ++++--
 crates/core/src/host/mod.rs                   |   2 +
 crates/core/src/host/wasm_common.rs           |   3 +
 .../src/host/wasmtime/wasm_instance_env.rs    | 106 +++++
 crates/core/src/subscription/tx.rs            |  24 +-
 .../locking_tx_datastore/committed_state.rs   |  69 +++-
 .../src/locking_tx_datastore/datastore.rs     |  25 +-
 .../src/locking_tx_datastore/mut_tx.rs        | 377 +++++++++++++-----
 .../src/locking_tx_datastore/state_view.rs    | 151 +++----
 .../datastore/src/locking_tx_datastore/tx.rs  |  57 ++-
 .../src/locking_tx_datastore/tx_state.rs      |  27 +-
 crates/execution/src/lib.rs                   |  23 +-
 crates/execution/src/pipelined.rs             |   6 +-
 crates/lib/src/filterable_value.rs            |  16 +
 crates/standalone/src/subcommands/start.rs    |   2 +-
 crates/table/src/table_index/mod.rs           |   6 +-
 20 files changed, 971 insertions(+), 342 deletions(-)

diff --git a/crates/bindings-macro/src/table.rs b/crates/bindings-macro/src/table.rs
index 68eb2f11777..519a347c11c 100644
--- a/crates/bindings-macro/src/table.rs
+++ b/crates/bindings-macro/src/table.rs
@@ -466,11 +466,13 @@ impl ValidatedIndex<'_> {
         };
         let vis = superize_vis(vis);
 
+        let num_cols = cols.len();
         let mut decl = quote! {
             #typeck_direct_index
             #vis struct #index_ident;
             impl spacetimedb::table::Index for #index_ident {
+                const NUM_COLS_INDEXED: usize = #num_cols;
                 fn index_id() -> spacetimedb::table::IndexId {
                     static INDEX_ID: std::sync::OnceLock<spacetimedb::table::IndexId> = std::sync::OnceLock::new();
                     *INDEX_ID.get_or_init(|| {
diff --git a/crates/bindings-sys/src/lib.rs b/crates/bindings-sys/src/lib.rs
index 751b6ba8cb0..1409998351b 100644
--- a/crates/bindings-sys/src/lib.rs
+++ b/crates/bindings-sys/src/lib.rs
@@ -588,7 +588,6 @@ pub mod raw {
         ///
         /// - `out_ptr` is NULL or `out` is not in bounds of WASM memory.
         pub fn identity(out_ptr: *mut u8);
-
     }
 
     // See comment on previous `extern "C"` block re: ABI version.
@@ -775,6 +774,86 @@ pub mod raw {
         ) -> u16;
     }
 
+    #[link(wasm_import_module = "spacetime_10.4")]
+    extern "C" {
+        /// Finds all rows in the index identified by `index_id`,
+        /// according to `point = point_ptr[..point_len]` in WASM memory.
+        ///
+        /// The index itself has a schema/type.
+        /// Matching is defined by first BSATN-decoding `point` to that `AlgebraicType`
+        /// and then comparing the decoded `point` to the keys in the index
+        /// using `Ord for AlgebraicValue`.
+        /// A match happens when `Ordering::Equal` is returned from `fn cmp`.
+        /// This occurs exactly when the row's BSATN-encoding
+        /// is equal to the encoding of the `AlgebraicValue`.
+        ///
+        /// This ABI is not limited to single column indices.
+        /// Multi-column indices can be queried by providing
+        /// a BSATN-encoded `ProductValue`
+        /// that is typed at the `ProductType` of the index.
+        ///
+        /// The relevant table for the index is found implicitly via the `index_id`,
+        /// which is unique for the module.
+        ///
+        /// On success, the iterator handle is written to the `out` pointer.
+        /// This handle can be advanced by [`row_iter_bsatn_advance`].
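+        /// and disposed of with [`row_iter_bsatn_close`].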
+        ///
+        /// # Traps
+        ///
+        /// Traps if:
+        /// - `point_ptr` is NULL or `point` is not in bounds of WASM memory.
+        /// - `out` is NULL or `out[..size_of::<RowIter>()]` is not in bounds of WASM memory.
+        ///
+        /// # Errors
+        ///
+        /// Returns an error:
+        ///
+        /// - `NOT_IN_TRANSACTION`, when called outside of a transaction.
+        /// - `NO_SUCH_INDEX`, when `index_id` is not a known ID of an index.
+        /// - `WRONG_INDEX_ALGO` if the index is not a range-scan compatible index.
+        /// - `BSATN_DECODE_ERROR`, when `point` cannot be decoded to an `AlgebraicValue`
+        ///   typed at the index's key type (`AlgebraicType`).
+        pub fn datastore_index_scan_point_bsatn(
+            index_id: IndexId,
+            point_ptr: *const u8, // AlgebraicValue
+            point_len: usize,
+            out: *mut RowIter,
+        ) -> u16;
+
+        /// Deletes all rows found in the index identified by `index_id`,
+        /// according to `point = point_ptr[..point_len]` in WASM memory.
+        ///
+        /// This syscall will delete all the rows found by
+        /// [`datastore_index_scan_point_bsatn`] with the same arguments passed.
+        /// See `datastore_index_scan_point_bsatn` for details.
+        ///
+        /// The number of rows deleted is written to the WASM pointer `out`.
+        ///
+        /// # Traps
+        ///
+        /// Traps if:
+        /// - `point_ptr` is NULL or `point` is not in bounds of WASM memory.
+        /// - `out` is NULL or `out[..size_of::<u32>()]` is not in bounds of WASM memory.
+        ///
+        /// # Errors
+        ///
+        /// Returns an error:
+        ///
+        /// - `NOT_IN_TRANSACTION`, when called outside of a transaction.
+        /// - `NO_SUCH_INDEX`, when `index_id` is not a known ID of an index.
+        /// - `WRONG_INDEX_ALGO` if the index is not a range-compatible index.
+        /// - `BSATN_DECODE_ERROR`, when `point` cannot be decoded to an `AlgebraicValue`
+        ///   typed at the index's key type (`AlgebraicType`).
+        pub fn datastore_delete_by_index_scan_point_bsatn(
+            index_id: IndexId,
+            point_ptr: *const u8, // AlgebraicValue
+            point_len: usize,
+            out: *mut u32,
+        ) -> u16;
+    }
+
 /// What strategy does the database index use?
 ///
 /// See also:
@@ -1095,6 +1174,44 @@ pub fn datastore_table_scan_bsatn(table_id: TableId) -> Result<RowIter> {
     Ok(RowIter { raw })
 }
 
+/// Finds all rows in the index identified by `index_id`,
+/// according to the `point`.
+///
+/// The index itself has a schema/type.
+/// Matching is defined by first BSATN-decoding `point` to that `AlgebraicType`
+/// and then comparing the decoded `point` to the keys in the index
+/// using `Ord for AlgebraicValue`.
+/// A match happens when `Ordering::Equal` is returned from `fn cmp`.
+/// This occurs exactly when the row's BSATN-encoding
+/// is equal to the encoding of the `AlgebraicValue`.
+///
+/// This ABI is not limited to single column indices.
+/// Multi-column indices can be queried by providing
+/// a BSATN-encoded `ProductValue`
+/// that is typed at the `ProductType` of the index.
+///
+/// The relevant table for the index is found implicitly via the `index_id`,
+/// which is unique for the module.
+///
+/// On success, an iterator handle is returned.
+/// This handle can be advanced by [`RowIter::read`].
+///
+/// # Errors
+///
+/// Returns an error:
+///
+/// - `NOT_IN_TRANSACTION`, when called outside of a transaction.
+/// - `NO_SUCH_INDEX`, when `index_id` is not a known ID of an index.
+/// - `WRONG_INDEX_ALGO` if the index is not a range-compatible index.
+/// - `BSATN_DECODE_ERROR`, when `point` cannot be decoded to an `AlgebraicValue`
+///   typed at the index's key type (`AlgebraicType`).
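+///
+/// # Example
+///
+/// A minimal sketch (not a compiled doctest), assuming a single-column `u32`
+/// index and a hypothetical `index_id` previously looked up by the module;
+/// the BSATN encoding helper (e.g. `bsatn::to_vec` from `spacetimedb_sats`)
+/// is likewise an assumption of this sketch:
+///
+/// ```ignore
+/// // BSATN-encode the key; a `u32` encodes as its 4 little-endian bytes.
+/// let point = bsatn::to_vec(&42u32).unwrap();
+/// let iter = datastore_index_scan_point_bsatn(index_id, &point).unwrap();
+/// // Drain `iter` via `RowIter::read` to obtain the matching BSATN-encoded rows.
+/// ```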
+pub fn datastore_index_scan_point_bsatn(index_id: IndexId, point: &[u8]) -> Result<RowIter> {
+    let raw = unsafe { call(|out| raw::datastore_index_scan_point_bsatn(index_id, point.as_ptr(), point.len(), out))? };
+    Ok(RowIter { raw })
+}
+
 /// Finds all rows in the index identified by `index_id`,
 /// according to the `prefix`, `rstart`, and `rend`.
 ///
@@ -1169,6 +1286,28 @@ pub fn datastore_index_scan_range_bsatn(
     Ok(RowIter { raw })
 }
 
+/// Deletes all rows found in the index identified by `index_id`,
+/// according to the `point`.
+///
+/// This syscall will delete all the rows found by
+/// [`datastore_index_scan_point_bsatn`] with the same arguments passed.
+/// See `datastore_index_scan_point_bsatn` for details.
+///
+/// The number of rows deleted is returned on success.
+///
+/// # Errors
+///
+/// Returns an error:
+///
+/// - `NOT_IN_TRANSACTION`, when called outside of a transaction.
+/// - `NO_SUCH_INDEX`, when `index_id` is not a known ID of an index.
+/// - `WRONG_INDEX_ALGO` if the index is not a range-compatible index.
+/// - `BSATN_DECODE_ERROR`, when `point` cannot be decoded to an `AlgebraicValue`
+///   typed at the index's key type (`AlgebraicType`).
+pub fn datastore_delete_by_index_scan_point_bsatn(index_id: IndexId, point: &[u8]) -> Result<u32> {
+    unsafe { call(|out| raw::datastore_delete_by_index_scan_point_bsatn(index_id, point.as_ptr(), point.len(), out)) }
+}
+
 /// Deletes all rows found in the index identified by `index_id`,
 /// according to the `prefix`, `rstart`, and `rend`.
 ///
diff --git a/crates/bindings/src/table.rs b/crates/bindings/src/table.rs
index 0dd7a6edc17..8a093ab0dd2 100644
--- a/crates/bindings/src/table.rs
+++ b/crates/bindings/src/table.rs
@@ -346,15 +346,12 @@ impl> UniqueColumn (bool, IterBuf) {
         let index_id = Col::index_id();
-        let args = get_args::(col_val);
-        let (prefix, prefix_elems, rstart, rend) = args.args_for_syscall();
-
-        let n_del = sys::datastore_delete_by_index_scan_range_bsatn(index_id, prefix, prefix_elems, rstart, rend)
-            .unwrap_or_else(|e| {
-                panic!("unique: unexpected error from datastore_delete_by_index_scan_range_bsatn: {e}")
-            });
+        let point = IterBuf::serialize(col_val).unwrap();
+        let n_del = sys::datastore_delete_by_index_scan_point_bsatn(index_id, &point).unwrap_or_else(|e| {
+            panic!("unique: unexpected error from datastore_delete_by_index_scan_point_bsatn: {e}")
+        });
 
-        (n_del > 0, args.data)
+        (n_del > 0, point)
     }
 
     /// Deletes the row where the value in the unique column matches that in the corresponding field of `new_row`, and
@@ -404,32 +401,21 @@ impl> UniqueColumn
-fn get_args>(col_val: &Col::ColType) -> IndexScanRangeArgs {
-    IndexScanRangeArgs {
-        data: IterBuf::serialize(&std::ops::Bound::Included(col_val)).unwrap(),
-        prefix_elems: 0,
-        rstart_idx: 0,
-        rend_idx: None,
-    }
-}
-
 #[inline]
 fn find>(col_val: &Col::ColType) -> Option {
     // Find the row with a match.
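+    // A point scan takes the BSATN-encoded column value itself as the key,
+    // with no `Bound` wrapping, unlike the removed `get_args` range path.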
     let index_id = Col::index_id();
-    let args = get_args::(col_val);
-    let (prefix, prefix_elems, rstart, rend) = args.args_for_syscall();
+    let point = IterBuf::serialize(col_val).unwrap();
 
-    let iter = sys::datastore_index_scan_range_bsatn(index_id, prefix, prefix_elems, rstart, rend)
-        .unwrap_or_else(|e| panic!("unique: unexpected error from `datastore_index_scan_range_bsatn`: {e}"));
-    let mut iter = TableIter::new_with_buf(iter, args.data);
+    let iter = sys::datastore_index_scan_point_bsatn(index_id, &point)
+        .unwrap_or_else(|e| panic!("unique: unexpected error from `datastore_index_scan_point_bsatn`: {e}"));
+    let mut iter = TableIter::new_with_buf(iter, point);
 
     // We will always find either 0 or 1 rows here due to the unique constraint.
     let row = iter.next();
     assert!(
         iter.is_exhausted(),
-        "`datastore_index_scan_range_bsatn` on unique field cannot return >1 rows"
+        "`datastore_index_scan_point_bsatn` on unique field cannot return >1 rows"
     );
     row
 }
@@ -461,7 +447,20 @@ impl> UniqueColumnReadOnly
 pub trait Index {
+    /// The number of columns indexed by this index.
+    const NUM_COLS_INDEXED: usize;
+
     fn index_id() -> IndexId;
 }
@@ -676,11 +675,21 @@ impl RangedIndex {
         B: IndexScanRangeBounds,
     {
         let index_id = Idx::index_id();
-        let args = b.get_args();
-        let (prefix, prefix_elems, rstart, rend) = args.args_for_syscall();
-        sys::datastore_delete_by_index_scan_range_bsatn(index_id, prefix, prefix_elems, rstart, rend)
-            .unwrap_or_else(|e| panic!("unexpected error from `datastore_delete_by_index_scan_range_bsatn`: {e}"))
-            .into()
+        if const { is_point_scan::() } {
+            b.with_point_arg(|point| {
+                sys::datastore_delete_by_index_scan_point_bsatn(index_id, point)
+                    .unwrap_or_else(|e| {
+                        panic!("unexpected error from `datastore_delete_by_index_scan_point_bsatn`: {e}")
+                    })
+                    .into()
+            })
+        } else {
+            let args = b.get_range_args();
+            let (prefix, prefix_elems, rstart, rend) = args.args_for_syscall();
+            sys::datastore_delete_by_index_scan_range_bsatn(index_id, prefix, prefix_elems, rstart, rend)
+                .unwrap_or_else(|e| panic!("unexpected error from `datastore_delete_by_index_scan_range_bsatn`: {e}"))
+                .into()
+        }
     }
 }
 
@@ -691,10 +700,19 @@
 where
     B: IndexScanRangeBounds,
 {
     let index_id = Idx::index_id();
-    let args = b.get_args();
-    let (prefix, prefix_elems, rstart, rend) = args.args_for_syscall();
-    let iter = sys::datastore_index_scan_range_bsatn(index_id, prefix, prefix_elems, rstart, rend)
-        .unwrap_or_else(|e| panic!("unexpected error from `datastore_index_scan_range_bsatn`: {e}"));
+
+    let iter = if const { is_point_scan::() } {
+        b.with_point_arg(|point| {
+            sys::datastore_index_scan_point_bsatn(index_id, point)
+                .unwrap_or_else(|e| panic!("unexpected error from `datastore_index_scan_point_bsatn`: {e}"))
+        })
+    } else {
+        let args = b.get_range_args();
+        let (prefix, prefix_elems, rstart, rend) = args.args_for_syscall();
+        sys::datastore_index_scan_range_bsatn(index_id, prefix, prefix_elems, rstart, rend)
+            .unwrap_or_else(|e| panic!("unexpected error from `datastore_index_scan_range_bsatn`: {e}"))
+    };
+
     TableIter::new(iter)
 }
 
@@ -723,11 +741,30 @@ impl RangedIndexReadOnly
     }
 }
 
+/// Returns whether `B` is a point scan on `I`.
+const fn is_point_scan<I: Index, B: IndexScanRangeBounds<T, K>, T, K>() -> bool {
+    B::POINT && B::COLS_PROVIDED == I::NUM_COLS_INDEXED
+}
+
 /// Trait used for overloading methods on [`RangedIndex`].
 /// See [`RangedIndex`] for more information.
 pub trait IndexScanRangeBounds {
+    /// True if these bounds pin every mentioned column to a single value,
+    /// i.e., no actual range occurs in them.
+    #[doc(hidden)]
+    const POINT: bool;
+
+    /// The number of columns mentioned in this range bounds.
+    /// For `(42, 12..24)` it's `2`.
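+    /// For a bare value or a 1-tuple like `(42,)` it's `1`.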
+    #[doc(hidden)]
+    const COLS_PROVIDED: usize;
+
+    // TODO(perf, centril): once we have stable specialization,
+    // just use `to_le_bytes` internally instead.
+    #[doc(hidden)]
+    fn with_point_arg<R>(&self, run: impl FnOnce(&[u8]) -> R) -> R;
+
     #[doc(hidden)]
-    fn get_args(&self) -> IndexScanRangeArgs;
+    fn get_range_args(&self) -> IndexScanRangeArgs;
 }
 
 #[doc(hidden)]
@@ -812,8 +849,15 @@ macro_rules! impl_index_scan_range_bounds {
             Term: IndexScanRangeBoundsTerminator,
             $ArgTerminator: FilterableValue,
         > IndexScanRangeBounds<($ColTerminator, $($ColUnused,)*)> for (Term,) {
-            fn get_args(&self) -> IndexScanRangeArgs {
-                IndexScanRangeBounds::<($ColTerminator, $($ColUnused,)*), SingleBound>::get_args(&self.0)
+            const POINT: bool = Term::POINT;
+            const COLS_PROVIDED: usize = 1;
+
+            fn with_point_arg<R>(&self, run: impl FnOnce(&[u8]) -> R) -> R {
+                IndexScanRangeBounds::<($ColTerminator, $($ColUnused,)*), SingleBound>::with_point_arg(&self.0, run)
+            }
+
+            fn get_range_args(&self) -> IndexScanRangeArgs {
+                IndexScanRangeBounds::<($ColTerminator, $($ColUnused,)*), SingleBound>::get_range_args(&self.0)
             }
         }
         // Implementation for bare values: serialize the value as the terminating bounds.
@@ -823,7 +867,15 @@ macro_rules! impl_index_scan_range_bounds {
             Term: IndexScanRangeBoundsTerminator,
             $ArgTerminator: FilterableValue,
         > IndexScanRangeBounds<($ColTerminator, $($ColUnused,)*), SingleBound> for Term {
-            fn get_args(&self) -> IndexScanRangeArgs {
+            const POINT: bool = Term::POINT;
+            const COLS_PROVIDED: usize = 1;
+
+            fn with_point_arg<R>(&self, run: impl FnOnce(&[u8]) -> R) -> R {
+                // We can assume here that we have a point bound.
+                run(&IterBuf::serialize(self.point()).unwrap())
+            }
+
+            fn get_range_args(&self) -> IndexScanRangeArgs {
                 let mut data = IterBuf::take();
                 let rend_idx = self.bounds().serialize_into(&mut data);
                 IndexScanRangeArgs { data, prefix_elems: 0, rstart_idx: 0, rend_idx }
@@ -854,7 +906,27 @@ macro_rules! impl_index_scan_range_bounds {
                 $ColTerminator, $($ColUnused,)*)
             > for ($($ArgPrefix,)+ Term,) {
-            fn get_args(&self) -> IndexScanRangeArgs {
+            const POINT: bool = Term::POINT;
+            const COLS_PROVIDED: usize = 1 + impl_index_scan_range_bounds!(@count $($ColPrefix)+);
+
+            fn with_point_arg<R>(&self, run: impl FnOnce(&[u8]) -> R) -> R {
+                // We can assume here that we have a point bound.
+                let mut data = IterBuf::take();
+
+                // Destructure the argument tuple into variables with the same names as their types.
+                #[allow(non_snake_case)]
+                let ($($ArgPrefix,)+ term,) = self;
+
+                // For each part in the tuple queried, serialize it into the `data` buffer.
+                Ok(())
+                    $(.and_then(|()| data.serialize_into($ArgPrefix)))+
+                    .and_then(|()| data.serialize_into(term.point()))
+                    .unwrap();
+
+                run(&*data)
+            }
+
+            fn get_range_args(&self) -> IndexScanRangeArgs {
                 let mut data = IterBuf::take();
 
                 // Get the number of prefix elements.
@@ -864,7 +936,7 @@ macro_rules! impl_index_scan_range_bounds {
                 #[allow(non_snake_case)]
                 let ($($ArgPrefix,)+ term,) = self;
 
-                // For each prefix queried, zerialize it into the `data` buffer.
+                // For each prefix queried, serialize it into the `data` buffer.
                 Ok(())
                     $(.and_then(|()| data.serialize_into($ArgPrefix)))+
                     .unwrap();
diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs
index c84f24e42d0..99f57806247 100644
--- a/crates/core/src/db/relational_db.rs
+++ b/crates/core/src/db/relational_db.rs
@@ -16,7 +16,7 @@ use spacetimedb_datastore::error::{DatastoreError, TableError, ViewError};
 use spacetimedb_datastore::execution_context::{ReducerContext, Workload, WorkloadType};
 use spacetimedb_datastore::locking_tx_datastore::datastore::TxMetrics;
 use spacetimedb_datastore::locking_tx_datastore::state_view::{
-    IterByColEqMutTx, IterByColRangeMutTx, IterMutTx, IterTx, StateView,
+    IterByColEqMutTx, IterByColRangeMutTx, IterMutTx, StateView,
 };
 use spacetimedb_datastore::locking_tx_datastore::{MutTxId, TxId};
 use spacetimedb_datastore::system_tables::{
@@ -53,7 +53,7 @@ use spacetimedb_schema::schema::{
 use spacetimedb_snapshot::{ReconstructedSnapshot, SnapshotError, SnapshotRepository};
 use spacetimedb_table::indexes::RowPointer;
 use spacetimedb_table::page_pool::PagePool;
-use spacetimedb_table::table::RowRef;
+use spacetimedb_table::table::{RowRef, TableScanIter};
 use spacetimedb_vm::errors::{ErrorType, ErrorVm};
 use spacetimedb_vm::ops::parse;
 use std::borrow::Cow;
@@ -1363,7 +1363,7 @@ impl RelationalDB {
         Ok(self.inner.iter_mut_tx(tx, table_id)?)
     }
 
-    pub fn iter<'a>(&'a self, tx: &'a Tx, table_id: TableId) -> Result<IterTx<'a>, DBError> {
+    pub fn iter<'a>(&'a self, tx: &'a Tx, table_id: TableId) -> Result<TableScanIter<'a>, DBError> {
         Ok(self.inner.iter_tx(tx, table_id)?)
     }
 
@@ -1442,6 +1442,15 @@ impl RelationalDB {
         Ok(tx.index_scan_range(index_id, prefix, prefix_elems, rstart, rend)?)
     }
 
+    pub fn index_scan_point<'a>(
+        &'a self,
+        tx: &'a MutTx,
+        index_id: IndexId,
+        point: &[u8],
+    ) -> Result<(TableId, AlgebraicValue, impl Iterator<Item = RowRef<'a>>), DBError> {
+        Ok(tx.index_scan_point(index_id, point)?)
+    }
+
     pub fn insert<'a>(
         &'a self,
         tx: &'a mut MutTx,
diff --git a/crates/core/src/host/instance_env.rs b/crates/core/src/host/instance_env.rs
index 933834432c0..4b2d95d7ac1 100644
--- a/crates/core/src/host/instance_env.rs
+++ b/crates/core/src/host/instance_env.rs
@@ -160,9 +160,11 @@ impl ChunkedWriter {
     pub fn collect_iter(
         pool: &mut ChunkPool,
         iter: impl Iterator,
-        rows_scanned: &mut usize,
-        bytes_scanned: &mut usize,
-    ) -> Vec<Box<[u8]>> {
+    ) -> (Vec<Box<[u8]>>, usize, usize) {
+        // Track the number of rows and the number of bytes scanned by the iterator.
+        let mut rows_scanned = 0;
+        let mut bytes_scanned = 0;
+
         let mut chunked_writer = Self::new(pool);
         // Consume the iterator, serializing each `item`,
         // while allowing a chunk to be created at boundaries.
@@ -171,16 +173,16 @@ impl ChunkedWriter {
             item.to_bsatn_extend(&mut chunked_writer.curr).unwrap();
             // Flush at item boundaries.
             chunked_writer.flush(pool);
-            // Update rows scanned
-            *rows_scanned += 1;
+            // Update rows scanned.
+            rows_scanned += 1;
         }
 
         let chunks = chunked_writer.into_chunks();
 
         // Update (BSATN) bytes scanned
-        *bytes_scanned += chunks.iter().map(|chunk| chunk.len()).sum::<usize>();
+        bytes_scanned += chunks.iter().map(|chunk| chunk.len()).sum::<usize>();
 
-        chunks
+        (chunks, rows_scanned, bytes_scanned)
     }
 }
@@ -410,6 +412,23 @@ impl InstanceEnv {
         Ok(row_len)
     }
 
+    #[tracing::instrument(level = "trace", skip_all)]
+    pub fn datastore_delete_by_index_scan_point_bsatn(
+        &self,
+        index_id: IndexId,
+        point: &[u8],
+    ) -> Result<u32, NodesError> {
+        let stdb = self.relational_db();
+        let tx = &mut *self.get_tx()?;
+
+        // Find all rows in the table to delete.
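+        // `index_scan_point` BSATN-decodes `point` against the index's key type
+        // and yields matching rows from both the committed and the tx state.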
+        let (table_id, _, iter) = stdb.index_scan_point(tx, index_id, point)?;
+        // Re. `SmallVec`, `delete_by_field` only cares about 1 element, so optimize for that.
+        let rows_to_delete = iter.map(|row_ref| row_ref.pointer()).collect::<SmallVec<_>>();
+
+        Ok(Self::datastore_delete_by_index_scan(stdb, tx, table_id, rows_to_delete))
+    }
+
     #[tracing::instrument(level = "trace", skip_all)]
     pub fn datastore_delete_by_index_scan_range_bsatn(
         &self,
@@ -427,7 +446,18 @@ impl InstanceEnv {
         // Re. `SmallVec`, `delete_by_field` only cares about 1 element, so optimize for that.
         let rows_to_delete = iter.map(|row_ref| row_ref.pointer()).collect::<SmallVec<_>>();
 
-        // Note, we're deleting rows based on the result of a btree scan.
+        Ok(Self::datastore_delete_by_index_scan(stdb, tx, table_id, rows_to_delete))
+    }
+
+    /// Deletes `rows_to_delete` in `tx`
+    /// and assumes `rows_to_delete` came from an index scan.
+    fn datastore_delete_by_index_scan(
+        stdb: &RelationalDB,
+        tx: &mut MutTxId,
+        table_id: TableId,
+        rows_to_delete: SmallVec<[RowPointer; 1]>,
+    ) -> u32 {
+        // Note, we're deleting rows based on the result of an index scan.
         // Hence we must update our `index_seeks` and `rows_scanned` metrics.
         //
         // Note that we're not updating `bytes_scanned` at all,
@@ -436,7 +466,7 @@
         tx.metrics.rows_scanned += rows_to_delete.len();
 
         // Delete them and count how many we deleted.
-        Ok(stdb.delete(tx, table_id, rows_to_delete))
+        stdb.delete(tx, table_id, rows_to_delete)
     }
 
     /// Deletes all rows in the table identified by `table_id`
@@ -521,25 +551,44 @@ impl InstanceEnv {
         pool: &mut ChunkPool,
         table_id: TableId,
     ) -> Result<Vec<Box<[u8]>>, NodesError> {
-        let stdb = self.relational_db();
         let tx = &mut *self.get_tx()?;
 
-        // Track the number of rows and the number of bytes scanned by the iterator
-        let mut rows_scanned = 0;
-        let mut bytes_scanned = 0;
+        // Open the iterator.
+        let iter = self.relational_db().iter_mut(tx, table_id)?;
 
-        // Scan table and serialize rows to bsatn
-        let chunks = ChunkedWriter::collect_iter(
-            pool,
-            stdb.iter_mut(tx, table_id)?,
-            &mut rows_scanned,
-            &mut bytes_scanned,
-        );
+        // Scan the table and serialize rows to BSATN.
+        let (chunks, rows_scanned, bytes_scanned) = ChunkedWriter::collect_iter(pool, iter);
+
+        // Record the number of rows and the number of bytes scanned by the iterator.
+        tx.metrics.bytes_scanned += bytes_scanned;
+        tx.metrics.rows_scanned += rows_scanned;
 
         tx.record_table_scan(&self.func_type, table_id);
 
-        tx.metrics.rows_scanned += rows_scanned;
-        tx.metrics.bytes_scanned += bytes_scanned;
-
         Ok(chunks)
     }
+
+    #[tracing::instrument(level = "trace", skip_all)]
+    pub fn datastore_index_scan_point_bsatn_chunks(
+        &self,
+        pool: &mut ChunkPool,
+        index_id: IndexId,
+        point: &[u8],
+    ) -> Result<Vec<Box<[u8]>>, NodesError> {
+        let tx = &mut *self.get_tx()?;
+
+        // Open the index iterator.
+        let (table_id, point, iter) = self.relational_db().index_scan_point(tx, index_id, point)?;
+
+        // Scan the index and serialize rows to BSATN.
+        let (chunks, rows_scanned, bytes_scanned) = ChunkedWriter::collect_iter(pool, iter);
+
+        // Record the number of rows and the number of bytes scanned by the iterator.
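+        // A point lookup counts as exactly one index seek, however many rows it yields.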
+        tx.metrics.index_seeks += 1;
+        tx.metrics.bytes_scanned += bytes_scanned;
+        tx.metrics.rows_scanned += rows_scanned;
+
+        tx.record_index_scan_point(&self.func_type, table_id, index_id, point);
 
         Ok(chunks)
     }
@@ -554,24 +603,22 @@ impl InstanceEnv {
         rstart: &[u8],
         rend: &[u8],
     ) -> Result<Vec<Box<[u8]>>, NodesError> {
-        let stdb = self.relational_db();
         let tx = &mut *self.get_tx()?;
 
-        // Track rows and bytes scanned by the iterator
-        let mut rows_scanned = 0;
-        let mut bytes_scanned = 0;
-
         // Open the index iterator.
-        let (table_id, lower, upper, iter) = stdb.index_scan_range(tx, index_id, prefix, prefix_elems, rstart, rend)?;
-
-        // Scan the index and serialize rows to bsatn
-        let chunks = ChunkedWriter::collect_iter(pool, iter, &mut rows_scanned, &mut bytes_scanned);
+        let (table_id, lower, upper, iter) =
+            self.relational_db()
+                .index_scan_range(tx, index_id, prefix, prefix_elems, rstart, rend)?;
 
-        tx.record_index_scan(&self.func_type, table_id, index_id, lower, upper);
+        // Scan the index and serialize rows to BSATN.
+        let (chunks, rows_scanned, bytes_scanned) = ChunkedWriter::collect_iter(pool, iter);
 
+        // Record the number of rows and the number of bytes scanned by the iterator.
         tx.metrics.index_seeks += 1;
-        tx.metrics.rows_scanned += rows_scanned;
         tx.metrics.bytes_scanned += bytes_scanned;
+        tx.metrics.rows_scanned += rows_scanned;
+
+        tx.record_index_scan_range(&self.func_type, table_id, index_id, lower, upper);
 
         Ok(chunks)
     }
diff --git a/crates/core/src/host/mod.rs b/crates/core/src/host/mod.rs
index fef8512144e..c928c63261f 100644
--- a/crates/core/src/host/mod.rs
+++ b/crates/core/src/host/mod.rs
@@ -167,11 +167,13 @@ pub enum AbiCall {
     IndexIdFromName,
     DatastoreTableRowCount,
     DatastoreTableScanBsatn,
+    DatastoreIndexScanPointBsatn,
     DatastoreIndexScanRangeBsatn,
     RowIterBsatnAdvance,
     RowIterBsatnClose,
     DatastoreInsertBsatn,
     DatastoreUpdateBsatn,
+    DatastoreDeleteByIndexScanPointBsatn,
     DatastoreDeleteByIndexScanRangeBsatn,
     DatastoreDeleteAllByEqBsatn,
     BytesSourceRead,
diff --git a/crates/core/src/host/wasm_common.rs b/crates/core/src/host/wasm_common.rs
index 408f6b84e61..a48d5e4d512 100644
--- a/crates/core/src/host/wasm_common.rs
+++ b/crates/core/src/host/wasm_common.rs
@@ -421,6 +421,9 @@ macro_rules! abi_funcs {
             "spacetime_10.1"::bytes_source_remaining_length,
 
             "spacetime_10.2"::get_jwt,
+
+            "spacetime_10.4"::datastore_index_scan_point_bsatn,
+            "spacetime_10.4"::datastore_delete_by_index_scan_point_bsatn,
         }
 
         $link_async! {
diff --git a/crates/core/src/host/wasmtime/wasm_instance_env.rs b/crates/core/src/host/wasmtime/wasm_instance_env.rs
index e375c6b51bb..059d4ac3cf3 100644
--- a/crates/core/src/host/wasmtime/wasm_instance_env.rs
+++ b/crates/core/src/host/wasmtime/wasm_instance_env.rs
@@ -529,6 +529,69 @@ impl WasmInstanceEnv {
         })
     }
 
+    /// Finds all rows in the index identified by `index_id`,
+    /// according to `point = point_ptr[..point_len]` in WASM memory.
+    ///
+    /// The index itself has a schema/type.
+    /// Matching is defined by first BSATN-decoding `point` to that `AlgebraicType`
+    /// and then comparing the decoded `point` to the keys in the index
+    /// using `Ord for AlgebraicValue`.
+    /// A match happens when `Ordering::Equal` is returned from `fn cmp`.
+    /// This occurs exactly when the row's BSATN-encoding
+    /// is equal to the encoding of the `AlgebraicValue`.
+    ///
+    /// This ABI is not limited to single column indices.
+    /// Multi-column indices can be queried by providing
+    /// a BSATN-encoded `ProductValue`
+    /// that is typed at the `ProductType` of the index.
+    ///
+    /// The relevant table for the index is found implicitly via the `index_id`,
+    /// which is unique for the module.
+    ///
+    /// On success, the iterator handle is written to the `out` pointer.
+    /// This handle can be advanced by [`row_iter_bsatn_advance`].
+    ///
+    /// # Traps
+    ///
+    /// Traps if:
+    /// - `point_ptr` is NULL or `point` is not in bounds of WASM memory.
+    /// - `out` is NULL or `out[..size_of::<RowIter>()]` is not in bounds of WASM memory.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error:
+    ///
+    /// - `NOT_IN_TRANSACTION`, when called outside of a transaction.
+    /// - `NO_SUCH_INDEX`, when `index_id` is not a known ID of an index.
+    /// - `WRONG_INDEX_ALGO` if the index is not a range-scan compatible index.
+    /// - `BSATN_DECODE_ERROR`, when `point` cannot be decoded to an `AlgebraicValue`
+    ///   typed at the index's key type (`AlgebraicType`).
+    pub fn datastore_index_scan_point_bsatn(
+        caller: Caller<'_, Self>,
+        index_id: u32,
+        point_ptr: WasmPtr<u8>, // AlgebraicValue
+        point_len: u32,
+        out: WasmPtr<RowIter>,
+    ) -> RtResult<u32> {
+        Self::cvt_ret(caller, AbiCall::DatastoreIndexScanPointBsatn, out, |caller| {
+            let (mem, env) = Self::mem_env(caller);
+            // Read the `point` from WASM memory.
+            let point = mem.deref_slice(point_ptr, point_len)?;
+
+            // Find the relevant rows.
+            let chunks = env.instance_env.datastore_index_scan_point_bsatn_chunks(
+                &mut env.chunk_pool,
+                index_id.into(),
+                point,
+            )?;
+
+            // Insert the encoded + concatenated rows into a new buffer and return its id.
+            Ok(env.iters.insert(chunks.into_iter()))
+        })
+    }
+
     /// Finds all rows in the index identified by `index_id`,
     /// according to the:
     /// - `prefix = prefix_ptr[..prefix_len]`,
@@ -879,6 +942,49 @@ impl WasmInstanceEnv {
         })
     }
 
+    /// Deletes all rows found in the index identified by `index_id`,
+    /// according to `point = point_ptr[..point_len]` in WASM memory.
+    ///
+    /// This syscall will delete all the rows found by
+    /// [`datastore_index_scan_point_bsatn`] with the same arguments passed.
+    /// See `datastore_index_scan_point_bsatn` for details.
+    ///
+    /// The number of rows deleted is written to the WASM pointer `out`.
+    ///
+    /// # Traps
+    ///
+    /// Traps if:
+    /// - `point_ptr` is NULL or `point` is not in bounds of WASM memory.
+    /// - `out` is NULL or `out[..size_of::<u32>()]` is not in bounds of WASM memory.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error:
+    ///
+    /// - `NOT_IN_TRANSACTION`, when called outside of a transaction.
+    /// - `NO_SUCH_INDEX`, when `index_id` is not a known ID of an index.
+    /// - `WRONG_INDEX_ALGO` if the index is not a range-compatible index.
+    /// - `BSATN_DECODE_ERROR`, when `point` cannot be decoded to an `AlgebraicValue`
+    ///   typed at the index's key type (`AlgebraicType`).
+    pub fn datastore_delete_by_index_scan_point_bsatn(
+        caller: Caller<'_, Self>,
+        index_id: u32,
+        point_ptr: WasmPtr<u8>, // AlgebraicValue
+        point_len: u32,
+        out: WasmPtr<u32>,
+    ) -> RtResult<u32> {
+        Self::cvt_ret(caller, AbiCall::DatastoreDeleteByIndexScanPointBsatn, out, |caller| {
+            let (mem, env) = Self::mem_env(caller);
+            // Read the `point` from WASM memory.
+            let point = mem.deref_slice(point_ptr, point_len)?;
+
+            // Delete the relevant rows.
+            Ok(env
+                .instance_env
+                .datastore_delete_by_index_scan_point_bsatn(index_id.into(), point)?)
+ }) + } + /// Deletes all rows found in the index identified by `index_id`, /// according to the: /// - `prefix = prefix_ptr[..prefix_len]`, diff --git a/crates/core/src/subscription/tx.rs b/crates/core/src/subscription/tx.rs index cdaece0ae97..c8fb779d0e7 100644 --- a/crates/core/src/subscription/tx.rs +++ b/crates/core/src/subscription/tx.rs @@ -9,7 +9,7 @@ use spacetimedb_datastore::{ use spacetimedb_execution::{Datastore, DeltaStore, Row}; use spacetimedb_lib::{query::Delta, AlgebraicValue, ProductValue}; use spacetimedb_primitives::{IndexId, TableId}; -use spacetimedb_table::table::{IndexScanRangeIter, TableScanIter}; +use spacetimedb_table::table::{IndexScanPointIter, IndexScanRangeIter, TableScanIter}; use std::{ collections::BTreeMap, ops::{Deref, RangeBounds}, @@ -121,11 +121,16 @@ impl Datastore for DeltaTx<'_> { where Self: 'a; - type IndexIter<'a> + type RangeIndexIter<'a> = IndexScanRangeIter<'a> where Self: 'a; + type PointIndexIter<'a> + = IndexScanPointIter<'a> + where + Self: 'a; + fn row_count(&self, table_id: TableId) -> u64 { self.tx.row_count(table_id) } @@ -134,13 +139,22 @@ impl Datastore for DeltaTx<'_> { self.tx.table_scan(table_id) } - fn index_scan<'a>( + fn index_scan_range<'a>( &'a self, table_id: TableId, index_id: IndexId, range: &impl RangeBounds, - ) -> anyhow::Result> { - self.tx.index_scan(table_id, index_id, range) + ) -> anyhow::Result> { + self.tx.index_scan_range(table_id, index_id, range) + } + + fn index_scan_point<'a>( + &'a self, + table_id: TableId, + index_id: IndexId, + point: &AlgebraicValue, + ) -> anyhow::Result> { + self.tx.index_scan_point(table_id, index_id, point) } } diff --git a/crates/datastore/src/locking_tx_datastore/committed_state.rs b/crates/datastore/src/locking_tx_datastore/committed_state.rs index d3b98399bff..41e5615c18e 100644 --- a/crates/datastore/src/locking_tx_datastore/committed_state.rs +++ b/crates/datastore/src/locking_tx_datastore/committed_state.rs @@ -2,7 +2,7 @@ use super::{ datastore::Result, delete_table::DeleteTable, sequence::{Sequence, SequencesState}, - state_view::{IterByColRangeTx, IterTx, ScanIterByColRangeTx, StateView}, + state_view::StateView, tx_state::{IndexIdMap, PendingSchemaChange, TxState}, IterByColEqTx, }; @@ -10,7 +10,11 @@ use crate::{ db_metrics::DB_METRICS, error::{DatastoreError, IndexError, TableError, ViewError}, execution_context::ExecutionContext, - locking_tx_datastore::{mut_tx::ViewReadSets, state_view::iter_st_column_for_table}, + locking_tx_datastore::{ + mut_tx::ViewReadSets, + state_view::{iter_st_column_for_table, ApplyFilter, EqOnColumn, RangeOnColumn, ScanOrIndex}, + IterByColRangeTx, + }, system_tables::{ system_tables, StColumnRow, StConstraintData, StConstraintRow, StIndexRow, StSequenceRow, StTableFields, StTableRow, StViewRow, SystemTable, ST_CLIENT_ID, ST_CLIENT_IDX, ST_COLUMN_ID, ST_COLUMN_IDX, ST_COLUMN_NAME, @@ -44,7 +48,7 @@ use spacetimedb_table::{ blob_store::{BlobStore, HashMapBlobStore}, indexes::{RowPointer, SquashedOffset}, page_pool::PagePool, - table::{IndexScanRangeIter, InsertError, RowRef, Table, TableAndIndex}, + table::{IndexScanPointIter, IndexScanRangeIter, InsertError, RowRef, Table, TableAndIndex, TableScanIter}, }; use std::collections::BTreeMap; use std::sync::Arc; @@ -128,7 +132,7 @@ impl MemoryUsage for CommittedState { } impl StateView for CommittedState { - type Iter<'a> = IterTx<'a>; + type Iter<'a> = TableScanIter<'a>; type IterByColRange<'a, R: RangeBounds> = IterByColRangeTx<'a, R>; type IterByColEq<'a, 'r> = IterByColEqTx<'a, 'r> @@ 
-144,11 +148,10 @@ impl StateView for CommittedState { } fn iter(&self, table_id: TableId) -> Result> { - if self.table_name(table_id).is_some() { - return Ok(IterTx::new(table_id, self)); - } - Err(TableError::IdNotFound(SystemTable::st_table, table_id.0).into()) + self.table_scan(table_id) + .ok_or_else(|| TableError::IdNotFound(SystemTable::st_table, table_id.0).into()) } + /// Returns an iterator, /// yielding every row in the table identified by `table_id`, /// where the values of `cols` are contained in `range`. @@ -158,12 +161,11 @@ impl StateView for CommittedState { cols: ColList, range: R, ) -> Result> { - match self.index_seek(table_id, &cols, &range) { - Some(iter) => Ok(IterByColRangeTx::Index(iter)), - None => Ok(IterByColRangeTx::Scan(ScanIterByColRangeTx::new( + match self.index_seek_range(table_id, &cols, &range) { + Some(iter) => Ok(ScanOrIndex::Index(iter)), + None => Ok(ScanOrIndex::Scan(ApplyFilter::new( + RangeOnColumn { cols, range }, self.iter(table_id)?, - cols, - range, ))), } } @@ -172,9 +174,16 @@ impl StateView for CommittedState { &'a self, table_id: TableId, cols: impl Into, - value: &'r AlgebraicValue, + val: &'r AlgebraicValue, ) -> Result> { - self.iter_by_col_range(table_id, cols.into(), value) + let cols = cols.into(); + match self.index_seek_point(table_id, &cols, val) { + Some(iter) => Ok(ScanOrIndex::Index(iter)), + None => Ok(ScanOrIndex::Scan(ApplyFilter::new( + EqOnColumn { cols, val }, + self.iter(table_id)?, + ))), + } } } @@ -599,15 +608,21 @@ impl CommittedState { Ok(()) } + /// Returns an iterator doing a full table scan on `table_id`. + pub(super) fn table_scan<'a>(&'a self, table_id: TableId) -> Option> { + Some(self.get_table(table_id)?.scan_rows(&self.blob_store)) + } + /// When there's an index on `cols`, /// returns an iterator over the [TableIndex] that yields all the [`RowRef`]s /// that match the specified `range` in the indexed column. /// /// Matching is defined by `Ord for AlgebraicValue`. /// - /// For a unique index this will always yield at most one `RowRef`. + /// For a unique index this will always yield at most one `RowRef` + /// when `range` is a point. /// When there is no index this returns `None`. - pub(super) fn index_seek<'a>( + pub(super) fn index_seek_range<'a>( &'a self, table_id: TableId, cols: &ColList, @@ -619,6 +634,26 @@ impl CommittedState { .map(|i| i.seek_range(range)) } + /// When there's an index on `cols`, + /// returns an iterator over the [TableIndex] that yields all the [`RowRef`]s + /// that equal `value` in the indexed column. + /// + /// Matching is defined by `Eq for AlgebraicValue`. + /// + /// For a unique index this will always yield at most one `RowRef`. + /// When there is no index this returns `None`. + pub(super) fn index_seek_point<'a>( + &'a self, + table_id: TableId, + cols: &ColList, + value: &AlgebraicValue, + ) -> Option> { + self.tables + .get(&table_id)? + .get_index_by_cols_with_table(&self.blob_store, cols) + .map(|i| i.seek_point(value)) + } + /// Returns the table associated with the given `index_id`, if any. 
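+    /// This is a direct map lookup via the committed state's `index_id_map`.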
pub(super) fn get_table_for_index(&self, index_id: IndexId) -> Option { self.index_id_map.get(&index_id).copied() diff --git a/crates/datastore/src/locking_tx_datastore/datastore.rs b/crates/datastore/src/locking_tx_datastore/datastore.rs index 481f3a1becb..3f694ad5b3f 100644 --- a/crates/datastore/src/locking_tx_datastore/datastore.rs +++ b/crates/datastore/src/locking_tx_datastore/datastore.rs @@ -1,15 +1,14 @@ use super::{ - committed_state::CommittedState, - mut_tx::MutTxId, - sequence::SequencesState, - state_view::{IterByColRangeTx, StateView}, - tx::TxId, + committed_state::CommittedState, mut_tx::MutTxId, sequence::SequencesState, state_view::StateView, tx::TxId, tx_state::TxState, }; use crate::{ db_metrics::DB_METRICS, error::{DatastoreError, TableError}, - locking_tx_datastore::state_view::{IterByColRangeMutTx, IterMutTx, IterTx}, + locking_tx_datastore::{ + state_view::{IterByColEqMutTx, IterByColRangeMutTx, IterMutTx}, + IterByColEqTx, IterByColRangeTx, + }, traits::{InsertFlags, UpdateFlags}, }; use crate::{ @@ -42,7 +41,11 @@ use spacetimedb_sats::{ use spacetimedb_sats::{memory_usage::MemoryUsage, Deserialize}; use spacetimedb_schema::schema::{ColumnSchema, IndexSchema, SequenceSchema, TableSchema}; use spacetimedb_snapshot::{ReconstructedSnapshot, SnapshotRepository}; -use spacetimedb_table::{indexes::RowPointer, page_pool::PagePool, table::RowRef}; +use spacetimedb_table::{ + indexes::RowPointer, + page_pool::PagePool, + table::{RowRef, TableScanIter}, +}; use std::borrow::Cow; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -380,7 +383,7 @@ impl Tx for Locking { impl TxDatastore for Locking { type IterTx<'a> - = IterTx<'a> + = TableScanIter<'a> where Self: 'a; type IterByColRangeTx<'a, R: RangeBounds> @@ -388,7 +391,7 @@ impl TxDatastore for Locking { where Self: 'a; type IterByColEqTx<'a, 'r> - = IterByColRangeTx<'a, &'r AlgebraicValue> + = IterByColEqTx<'a, 'r> where Self: 'a; @@ -467,7 +470,7 @@ impl MutTxDatastore for Locking { Self: 'a; type IterByColRangeMutTx<'a, R: RangeBounds> = IterByColRangeMutTx<'a, R>; type IterByColEqMutTx<'a, 'r> - = IterByColRangeMutTx<'a, &'r AlgebraicValue> + = IterByColEqMutTx<'a, 'r> where Self: 'a; @@ -2688,7 +2691,7 @@ mod tests { let index_id = datastore.index_id_from_name_mut_tx(&tx, "index")?.unwrap(); let find_row_by_key = |tx: &MutTxId, key: u32| { let key: AlgebraicValue = key.into(); - tx.index_scan(table_id, index_id, &key) + Datastore::index_scan_range(tx, table_id, index_id, &key) .unwrap() .map(|row| row.pointer()) .collect::>() diff --git a/crates/datastore/src/locking_tx_datastore/mut_tx.rs b/crates/datastore/src/locking_tx_datastore/mut_tx.rs index cf7ac5d011c..452c4e3edcc 100644 --- a/crates/datastore/src/locking_tx_datastore/mut_tx.rs +++ b/crates/datastore/src/locking_tx_datastore/mut_tx.rs @@ -3,14 +3,14 @@ use super::{ datastore::{Result, TxMetrics}, delete_table::DeleteTable, sequence::{Sequence, SequencesState}, - state_view::{IterByColEqMutTx, IterByColRangeMutTx, IterMutTx, ScanIterByColRangeMutTx, StateView}, + state_view::{IterByColEqMutTx, IterByColRangeMutTx, IterMutTx, StateView}, tx::TxId, tx_state::{IndexIdMap, PendingSchemaChange, TxState, TxTableForInsertion}, SharedMutexGuard, SharedWriteGuard, }; -use crate::traits::{InsertFlags, RowTypeForTable, TxData, UpdateFlags}; use crate::{ error::ViewError, + locking_tx_datastore::state_view::EqOnColumn, system_tables::{ system_tables, ConnectionIdViaU128, IdentityViaU256, StConnectionCredentialsFields, StConnectionCredentialsRow, 
StViewColumnFields, StViewFields, StViewParamFields, StViewParamRow, StViewSubFields, StViewSubRow, @@ -29,6 +29,10 @@ use crate::{ }; use crate::{execution_context::ExecutionContext, system_tables::StViewColumnRow}; use crate::{execution_context::Workload, system_tables::StViewRow}; +use crate::{ + locking_tx_datastore::state_view::{ApplyFilter, RangeOnColumn, ScanOrIndex}, + traits::{InsertFlags, RowTypeForTable, TxData, UpdateFlags}, +}; use core::ops::RangeBounds; use core::{cell::RefCell, mem}; use core::{iter, ops::Bound}; @@ -60,8 +64,8 @@ use spacetimedb_table::{ indexes::{RowPointer, SquashedOffset}, static_assert_size, table::{ - BlobNumBytes, DuplicateError, IndexScanRangeIter, InsertError, RowRef, Table, TableAndIndex, - UniqueConstraintViolation, + BlobNumBytes, DuplicateError, IndexScanPointIter, IndexScanRangeIter, InsertError, RowRef, Table, + TableAndIndex, UniqueConstraintViolation, }, table_index::TableIndex, }; @@ -254,8 +258,9 @@ impl MutTxId { } } - /// Record that a view performs an index scan in this transaction's read set - pub fn record_index_scan( + /// Record that a view performs a ranged index scan in this transaction's read set. + #[inline] + pub fn record_index_scan_range( &mut self, op: &FuncCallType, table_id: TableId, @@ -263,29 +268,67 @@ impl MutTxId { lower: Bound, upper: Bound, ) { - let FuncCallType::View(view) = op else { - return; + if let FuncCallType::View(view) = op { + self.record_index_scan_range_inner(view, table_id, index_id, lower, upper); }; + } - // Check for precise index seek + // This is cold as we don't want it to be inlined in case it doesn't end up getting called. + // This previously showed up in flamegraphs. + #[cold] + pub fn record_index_scan_range_inner( + &mut self, + view: &ViewCallInfo, + table_id: TableId, + index_id: IndexId, + lower: Bound, + upper: Bound, + ) { + // Check for precise index seek. if let (Bound::Included(low_val), Bound::Included(up_val)) = (&lower, &upper) { if low_val == up_val { - // Fetch index metadata - let Some((_, idx, _)) = self.get_table_and_index(index_id) else { - return; - }; - - let cols = idx.index().indexed_columns.clone(); - self.read_sets - .insert_index_scan(table_id, cols, low_val.clone(), view.clone()); + self.record_index_scan_point_inner(view, table_id, index_id, low_val.clone()); return; } } - // Everything else is treated as a table scan + // Everything else is treated as a table scan. self.read_sets.insert_full_table_scan(table_id, view.clone()); } + /// Record that a view performs a point index scan in this transaction's read set. + #[inline] + pub fn record_index_scan_point( + &mut self, + op: &FuncCallType, + table_id: TableId, + index_id: IndexId, + val: AlgebraicValue, + ) { + if let FuncCallType::View(view) = op { + self.record_index_scan_point_inner(view, table_id, index_id, val); + }; + } + + // This is cold as we don't want it to be inlined in case it doesn't end up getting called. + // This previously showed up in flamegraphs. 
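+    // Keeping the body out-of-line means the `#[inline]` wrapper above
+    // reduces to a cheap check on `op` when the caller is not a view.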
+ #[cold] + fn record_index_scan_point_inner( + &mut self, + view: &ViewCallInfo, + table_id: TableId, + index_id: IndexId, + val: AlgebraicValue, + ) { + // Fetch index metadata + let Some((_, idx, _)) = self.get_table_and_index(index_id) else { + return; + }; + + let cols = idx.index().indexed_columns.clone(); + self.read_sets.insert_index_scan(table_id, cols, val, view.clone()); + } + /// Returns the views whose read sets overlaps with this transaction's write set pub fn view_for_update(&self) -> impl Iterator + '_ { let mut res = self @@ -356,11 +399,16 @@ impl Datastore for MutTxId { where Self: 'a; - type IndexIter<'a> + type RangeIndexIter<'a> = IndexScanRanged<'a> where Self: 'a; + type PointIndexIter<'a> + = IndexScanPoint<'a> + where + Self: 'a; + fn row_count(&self, table_id: TableId) -> u64 { self.table_row_count(table_id).unwrap_or_default() } @@ -369,25 +417,32 @@ impl Datastore for MutTxId { Ok(self.iter(table_id)?) } - fn index_scan<'a>( + fn index_scan_range<'a>( &'a self, table_id: TableId, index_id: IndexId, range: &impl RangeBounds, - ) -> anyhow::Result> { + ) -> anyhow::Result> { // Extract the table id, and commit/tx indices. let (_, commit_index, tx_index) = self .get_table_and_index(index_id) .ok_or_else(|| IndexError::NotFound(index_id))?; - // Get an index seek iterator for the tx and committed state. - let tx_iter = tx_index.map(|i| i.seek_range(range)); - let commit_iter = commit_index.seek_range(range); + Ok(self.index_scan_range_inner(table_id, tx_index, commit_index, range)) + } - let dt = self.tx_state.get_delete_table(table_id); - let iter = combine_range_index_iters(dt, tx_iter, commit_iter); + fn index_scan_point<'a>( + &'a self, + table_id: TableId, + index_id: IndexId, + point: &AlgebraicValue, + ) -> anyhow::Result> { + // Extract the table id, and commit/tx indices. + let (_, commit_index, tx_index) = self + .get_table_and_index(index_id) + .ok_or_else(|| IndexError::NotFound(index_id))?; - Ok(iter) + Ok(self.index_scan_point_inner(table_id, tx_index, commit_index, point)) } } @@ -1237,11 +1292,58 @@ impl MutTxId { Ok(row.map(|row| row.read_col(StIndexFields::IndexId).unwrap())) } + /// Returns an iterator yielding rows by performing a point index scan + /// on the index identified by `index_id`. + pub fn index_scan_point<'a>( + &'a self, + index_id: IndexId, + mut point: &[u8], + ) -> Result<(TableId, AlgebraicValue, IndexScanPoint<'a>)> { + // Extract the table id, and commit/tx indices. + let (table_id, commit_index, tx_index) = self + .get_table_and_index(index_id) + .ok_or_else(|| IndexError::NotFound(index_id))?; + // Extract the index type. + let index_ty = &commit_index.index().key_type; + + // We have the index key type, so we can decode the key. + let index_ty = WithTypespace::empty(index_ty); + let point = index_ty + .deserialize(Deserializer::new(&mut point)) + .map_err(IndexError::Decode)?; + + // Get an index seek iterator for the tx and committed state. + let tx_iter = tx_index.map(|i| i.seek_point(&point)); + let commit_iter = commit_index.seek_point(&point); + + let dt = self.tx_state.get_delete_table(table_id); + let iter = ScanMutTx::combine(dt, tx_iter, commit_iter); + + Ok((table_id, point, iter)) + } + + /// See [`MutTxId::index_scan_point`]. + fn index_scan_point_inner<'a>( + &'a self, + table_id: TableId, + tx_index: Option>, + commit_index: TableAndIndex<'a>, + point: &AlgebraicValue, + ) -> IndexScanPoint<'a> { + // Get an index seek iterator for the tx and committed state. 
+        let tx_iter = tx_index.map(|i| i.seek_point(point));
+        let commit_iter = commit_index.seek_point(point);
+
+        // Combine it all.
+        let dt = self.tx_state.get_delete_table(table_id);
+        ScanMutTx::combine(dt, tx_iter, commit_iter)
+    }
+
     /// Returns an iterator yielding rows by performing a range index scan
     /// on the range-scan-compatible index identified by `index_id`.
     ///
     /// The `prefix` is equated to the first `prefix_elems` values of the index key
-    /// and then `prefix_elem`th value is bounded to the left by by `rstart`
+    /// and then the `prefix_elem`th value is bounded to the left by `rstart`
     /// and to the right by `rend`.
     pub fn index_scan_range<'a>(
         &'a self,
@@ -1270,16 +1372,27 @@
         let bounds =
             Self::range_scan_decode_bounds(index_ty, prefix, prefix_elems, rstart, rend).map_err(IndexError::Decode)?;
 
-        // Get an index seek iterator for the tx and committed state.
-        let tx_iter = tx_index.map(|i| i.seek_range(&bounds));
-        let commit_iter = commit_index.seek_range(&bounds);
+        let iter = self.index_scan_range_inner(table_id, tx_index, commit_index, &bounds);
         let (lower, upper) = bounds;
+        Ok((table_id, lower, upper, iter))
+    }
 
-        let dt = self.tx_state.get_delete_table(table_id);
-        let iter = combine_range_index_iters(dt, tx_iter, commit_iter);
+    /// See [`MutTxId::index_scan_range`].
+    fn index_scan_range_inner<'a>(
+        &'a self,
+        table_id: TableId,
+        tx_index: Option<TableAndIndex<'a>>,
+        commit_index: TableAndIndex<'a>,
+        bounds: &impl RangeBounds<AlgebraicValue>,
+    ) -> IndexScanRanged<'a> {
+        // Get an index seek iterator for the tx and committed state.
+        let tx_iter = tx_index.map(|i| i.seek_range(bounds));
+        let commit_iter = commit_index.seek_range(bounds);
 
-        Ok((table_id, lower, upper, iter))
+        // Combine it all.
+        let dt = self.tx_state.get_delete_table(table_id);
+        ScanMutTx::combine(dt, tx_iter, commit_iter)
     }
 
     /// Translate `index_id` to the table id, and commit/tx indices.
@@ -1953,15 +2066,26 @@ impl<'a> RowRefInsertion<'a> {
     }
 
 /// The iterator returned by [`MutTxId::index_scan_range`].
-pub struct IndexScanRanged<'a> {
-    inner: IndexScanRangedInner<'a>,
-}
+pub type IndexScanRanged<'a> = ScanMutTx<'a, IndexScanRangeIter<'a>>;
+
+/// The iterator returned by [`MutTxId::index_scan_point`].
+pub type IndexScanPoint<'a> = ScanMutTx<'a, IndexScanPointIter<'a>>;
+
+/// The iterator returned by, e.g., [`MutTxId::index_scan_range`]
+/// and [`MutTxId::index_scan_point`].
+///
+/// This layer only handles the transactionality aspects of [`MutTxId`].
+/// The specifics of a point vs. range scan or a non-index scan
+/// are dealt with by `I`.
+pub struct ScanMutTx<'a, I> {
+    inner: ScanMutTxInner<'a, I>,
+}
 
-enum IndexScanRangedInner<'a> {
-    CommitOnly(IndexScanRangeIter<'a>),
-    CommitOnlyWithDeletes(FilterDeleted<'a, IndexScanRangeIter<'a>>),
-    Both(iter::Chain<IndexScanRangeIter<'a>, IndexScanRangeIter<'a>>),
-    BothWithDeletes(iter::Chain<IndexScanRangeIter<'a>, FilterDeleted<'a, IndexScanRangeIter<'a>>>),
+enum ScanMutTxInner<'a, I> {
+    CommitOnly(I),
+    CommitOnlyWithDeletes(FilterDeleted<'a, I>),
+    Both(iter::Chain<I, I>),
+    BothWithDeletes(iter::Chain<I, FilterDeleted<'a, I>>),
 }
 
 pub(super) struct FilterDeleted<'a, I> {
     pub(super) iter: I,
     pub(super) deletes: &'a DeleteTable,
 }
 
-impl<'a> Iterator for IndexScanRanged<'a> {
+impl<'a, I: Iterator<Item = RowRef<'a>>> ScanMutTx<'a, I> {
+    /// Combines a `tx_iter`, together with the table's optional `delete_table`,
+    /// with a `commit_iter`, creating a single iterator.
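+    /// Rows yielded by `tx_iter` are never in `delete_table`, so only `commit_iter` is filtered.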
+    fn combine(delete_table: Option<&'a DeleteTable>, tx_iter: Option<I>, commit_iter: I) -> Self {
+        // Chain together the indexed rows in the tx and committed state,
+        // but don't yield rows deleted in the tx state.
+        use itertools::Either::*;
+        use ScanMutTxInner::*;
+        let commit_iter = match delete_table {
+            None => Left(commit_iter),
+            Some(deletes) => Right(FilterDeleted {
+                iter: commit_iter,
+                deletes,
+            }),
+        };
+        // This is effectively just `tx_iter.into_iter().flatten().chain(commit_iter)`,
+        // but with all the branching and `Option`s flattened to just one layer.
+        let iter = match (tx_iter, commit_iter) {
+            (None, Left(commit_iter)) => CommitOnly(commit_iter),
+            (None, Right(commit_iter)) => CommitOnlyWithDeletes(commit_iter),
+            (Some(tx_iter), Left(commit_iter)) => Both(tx_iter.chain(commit_iter)),
+            (Some(tx_iter), Right(commit_iter)) => BothWithDeletes(tx_iter.chain(commit_iter)),
+        };
+        ScanMutTx { inner: iter }
+    }
+}
+
+impl<'a, I: Iterator<Item = RowRef<'a>>> Iterator for ScanMutTx<'a, I> {
     type Item = RowRef<'a>;
 
     fn next(&mut self) -> Option<Self::Item> {
+        use ScanMutTxInner::*;
         match &mut self.inner {
-            IndexScanRangedInner::CommitOnly(it) => it.next(),
-            IndexScanRangedInner::CommitOnlyWithDeletes(it) => it.next(),
-            IndexScanRangedInner::Both(it) => it.next(),
-            IndexScanRangedInner::BothWithDeletes(it) => it.next(),
+            CommitOnly(it) => it.next(),
+            CommitOnlyWithDeletes(it) => it.next(),
+            Both(it) => it.next(),
+            BothWithDeletes(it) => it.next(),
         }
     }
 }
@@ -3013,18 +3165,36 @@ fn iter_by_col_range<'a, R: RangeBounds<AlgebraicValue>>(
     // If there's an index, use that.
     // It's sufficient to check that the committed state has an index
     // as index schema changes are applied immediately.
-    if let Some(commit_iter) = committed_state.index_seek(table_id, &cols, &range) {
-        let tx_iter = tx_state.index_seek_by_cols(table_id, &cols, &range);
+    if let Some(commit_iter) = committed_state.index_seek_range(table_id, &cols, &range) {
+        let tx_iter = tx_state.index_seek_range_by_cols(table_id, &cols, &range);
         let delete_table = tx_state.get_delete_table(table_id);
-        let iter = combine_range_index_iters(delete_table, tx_iter, commit_iter);
-        Ok(IterByColRangeMutTx::Index(iter))
+        let iter = ScanMutTx::combine(delete_table, tx_iter, commit_iter);
+        Ok(ScanOrIndex::Index(iter))
     } else {
         unindexed_iter_by_col_range_warn(tx_state, committed_state, table_id, &cols);
         let iter = iter(tx_state, committed_state, table_id)?;
+        let filter = RangeOnColumn { cols, range };
+        let iter = ApplyFilter::new(filter, iter);
+        Ok(ScanOrIndex::Scan(iter))
+    }
+}
+
+#[cfg(not(feature = "unindexed_iter_by_col_range_warn"))]
+fn unindexed_iter_by_col_range_warn(_: &TxState, _: &CommittedState, _: TableId, _: &ColList) {}
+
+#[cfg(feature = "unindexed_iter_by_col_range_warn")]
+fn unindexed_iter_by_col_range_warn(
+    tx_state: &TxState,
+    committed_state: &CommittedState,
+    table_id: TableId,
+    cols: &ColList,
+) {
+    match table_row_count(tx_state, committed_state, table_id) {
+        // TODO(ux): log these warnings to the module logs rather than host logs.
+        None => log::error!("iter_by_col_range on unindexed column, but couldn't fetch table `{table_id}`'s row count",),
+        Some(num_rows) => too_many_rows_for_scan_do(committed_state, num_rows, table_id, cols, |name, cols| {
+            log::warn!("iter_by_col_range without index: table {name} has {num_rows} rows; scanning columns {cols:?}",);
+        }),
     }
 }
 
@@ -3033,43 +3203,31 @@
 fn iter_by_col_eq<'a, 'r>(
     committed_state: &'a CommittedState,
     table_id: TableId,
     cols: impl Into<ColList>,
-    value: &'r AlgebraicValue,
+    val: &'r AlgebraicValue,
 ) -> Result<IterByColEqMutTx<'a, 'r>> {
-    iter_by_col_range(tx_state, committed_state, table_id, cols.into(), value)
-}
-
-fn combine_range_index_iters<'a>(
-    delete_table: Option<&'a DeleteTable>,
-    tx_iter: Option<IndexScanRangeIter<'a>>,
-    commit_iter: IndexScanRangeIter<'a>,
-) -> IndexScanRanged<'a> {
-    // Chain together the indexed rows in the tx and committed state,
-    // but don't yield rows deleted in the tx state.
-    use itertools::Either::*;
-    use IndexScanRangedInner::*;
-    let commit_iter = match delete_table {
-        None => Left(commit_iter),
-        Some(deletes) => Right(FilterDeleted {
-            iter: commit_iter,
-            deletes,
-        }),
-    };
-    // This is effectively just `tx_iter.into_iter().flatten().chain(commit_iter)`,
-    // but with all the branching and `Option`s flattened to just one layer.
-    let iter = match (tx_iter, commit_iter) {
-        (None, Left(commit_iter)) => CommitOnly(commit_iter),
-        (None, Right(commit_iter)) => CommitOnlyWithDeletes(commit_iter),
-        (Some(tx_iter), Left(commit_iter)) => Both(tx_iter.chain(commit_iter)),
-        (Some(tx_iter), Right(commit_iter)) => BothWithDeletes(tx_iter.chain(commit_iter)),
-    };
-    IndexScanRanged { inner: iter }
+    // If there's an index, use that.
+    // It's sufficient to check that the committed state has an index
+    // as index schema changes are applied immediately.
+    let cols = cols.into();
+    if let Some(commit_iter) = committed_state.index_seek_point(table_id, &cols, val) {
+        let tx_iter = tx_state.index_seek_point_by_cols(table_id, &cols, val);
+        let delete_table = tx_state.get_delete_table(table_id);
+        let iter = ScanMutTx::combine(delete_table, tx_iter, commit_iter);
+        Ok(ScanOrIndex::Index(iter))
+    } else {
+        unindexed_iter_by_col_eq_warn(tx_state, committed_state, table_id, &cols);
+        let iter = iter(tx_state, committed_state, table_id)?;
+        let filter = EqOnColumn { cols, val };
+        let iter = ApplyFilter::new(filter, iter);
+        Ok(ScanOrIndex::Scan(iter))
+    }
 }
 
 #[cfg(not(feature = "unindexed_iter_by_col_range_warn"))]
-fn unindexed_iter_by_col_range_warn(_: &TxState, _: &CommittedState, _: TableId, _: &ColList) {}
+fn unindexed_iter_by_col_eq_warn(_: &TxState, _: &CommittedState, _: TableId, _: &ColList) {}
 
 #[cfg(feature = "unindexed_iter_by_col_range_warn")]
-fn unindexed_iter_by_col_range_warn(
+fn unindexed_iter_by_col_eq_warn(
     tx_state: &TxState,
     committed_state: &CommittedState,
     table_id: TableId,
@@ -3077,26 +3235,35 @@
 ) {
     match table_row_count(tx_state, committed_state, table_id) {
         // TODO(ux): log these warnings to the module logs rather than host logs.
-        None => log::error!("iter_by_col_range on unindexed column, but couldn't fetch table `{table_id}`s row count",),
-        Some(num_rows) => {
-            const TOO_MANY_ROWS_FOR_SCAN: u64 = 1000;
-            if num_rows >= TOO_MANY_ROWS_FOR_SCAN {
-                let schema = committed_state.get_schema(table_id).unwrap();
-                let table_name = &schema.table_name;
-                let col_names = cols
-                    .iter()
-                    .map(|col_id| {
-                        schema
-                            .columns()
-                            .get(col_id.idx())
-                            .map(|col| &col.col_name[..])
-                            .unwrap_or("[unknown column]")
-                    })
-                    .collect::<Vec<_>>();
-                log::warn!(
-                    "iter_by_col_range without index: table {table_name} has {num_rows} rows; scanning columns {col_names:?}",
-                );
-            }
-        }
+        None => log::error!("iter_by_col_eq on unindexed column, but couldn't fetch table `{table_id}`'s row count",),
+        Some(num_rows) => too_many_rows_for_scan_do(committed_state, num_rows, table_id, cols, |name, cols| {
+            log::warn!("iter_by_col_eq without index: table {name} has {num_rows} rows; scanning columns {cols:?}",);
+        }),
     }
 }
 
+#[cfg(feature = "unindexed_iter_by_col_range_warn")]
+fn too_many_rows_for_scan_do(
+    committed_state: &CommittedState,
+    num_rows: u64,
+    table_id: TableId,
+    cols: &ColList,
+    logic: impl FnOnce(&str, &[&str]),
+) {
+    const TOO_MANY_ROWS_FOR_SCAN: u64 = 1000;
+    if num_rows >= TOO_MANY_ROWS_FOR_SCAN {
+        let schema: &Arc<TableSchema> = committed_state.get_schema(table_id).unwrap();
+        let table_name = &schema.table_name;
+        let col_names = cols
+            .iter()
+            .map(|col_id| {
+                schema
+                    .columns()
+                    .get(col_id.idx())
+                    .map(|col| &col.col_name[..])
+                    .unwrap_or("[unknown column]")
+            })
+            .collect::<Vec<_>>();
+        logic(table_name, &col_names);
    }
}
diff --git a/crates/datastore/src/locking_tx_datastore/state_view.rs b/crates/datastore/src/locking_tx_datastore/state_view.rs
index ec3cd0560b2..3244a5ca92f 100644
--- a/crates/datastore/src/locking_tx_datastore/state_view.rs
+++ b/crates/datastore/src/locking_tx_datastore/state_view.rs
@@ -1,6 +1,7 @@
-use super::mut_tx::{FilterDeleted, IndexScanRanged};
+use super::mut_tx::FilterDeleted;
 use super::{committed_state::CommittedState, datastore::Result, tx_state::TxState};
 use crate::error::{DatastoreError, TableError};
+use crate::locking_tx_datastore::mut_tx::{IndexScanPoint, IndexScanRanged};
 use crate::system_tables::{
     ConnectionIdViaU128, StColumnFields, StColumnRow, StConnectionCredentialsFields, StConnectionCredentialsRow,
     StConstraintFields, StConstraintRow, StIndexFields, StIndexRow, StScheduledFields, StScheduledRow,
@@ -14,6 +15,7 @@ use spacetimedb_lib::ConnectionId;
 use spacetimedb_primitives::{ColList, TableId};
 use spacetimedb_sats::AlgebraicValue;
 use spacetimedb_schema::schema::{ColumnSchema, TableSchema, ViewDefInfo};
+use spacetimedb_table::table::IndexScanPointIter;
 use spacetimedb_table::{
     blob_store::HashMapBlobStore,
     table::{IndexScanRangeIter, RowRef, Table, TableScanIter},
@@ -317,132 +319,85 @@ impl<'a> Iterator for IterMutTx<'a> {
     }
 }
 
-pub struct IterTx<'a> {
-    iter: TableScanIter<'a>,
+/// A filter on a row.
+pub trait RowFilter {
+    /// Does this filter include `row`?
+    fn filter<'a>(&self, row: RowRef<'a>) -> bool;
 }
 
-impl<'a> IterTx<'a> {
-    pub(super) fn new(table_id: TableId, committed_state: &'a CommittedState) -> Self {
-        // The table_id was validated to exist in the committed state.
-        let table = committed_state
-            .tables
-            .get(&table_id)
-            .expect("table_id must exist in committed state");
-        let iter = table.scan_rows(&committed_state.blob_store);
-        Self { iter }
-    }
+/// A row filter that keeps rows whose `cols` projection lies within `range`.
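+///
+/// E.g., with `cols = [0]` and `range = 5..10`, a row passes
+/// exactly when its first column projects to a value in `5..10`.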
+pub struct RangeOnColumn<R> {
+    pub cols: ColList,
+    pub range: R,
 }
 
-impl<'a> Iterator for IterTx<'a> {
-    type Item = RowRef<'a>;
-
-    #[inline]
-    fn next(&mut self) -> Option<Self::Item> {
-        self.iter.next()
+impl<R: RangeBounds<AlgebraicValue>> RowFilter for RangeOnColumn<R> {
+    fn filter<'a>(&self, row: RowRef<'a>) -> bool {
+        self.range.contains(&row.project(&self.cols).unwrap())
     }
 }
 
-/// An [IterByColRangeTx] for an individual column value.
-pub type IterByColEqTx<'a, 'r> = IterByColRangeTx<'a, &'r AlgebraicValue>;
-/// An [IterByColRangeMutTx] for an individual column value.
-pub type IterByColEqMutTx<'a, 'r> = IterByColRangeMutTx<'a, &'r AlgebraicValue>;
-
-/// An iterator for a range of values in a column.
-pub enum IterByColRangeTx<'a, R: RangeBounds<AlgebraicValue>> {
-    /// When the column in question does not have an index.
-    Scan(ScanIterByColRangeTx<'a, R>),
-
-    /// When the column has an index.
-    Index(IndexScanRangeIter<'a>),
+/// A row filter that matches rows whose projection onto `cols` equals `val`.
+pub struct EqOnColumn<'r> {
+    pub cols: ColList,
+    pub val: &'r AlgebraicValue,
 }
 
-impl<'a, R: RangeBounds<AlgebraicValue>> Iterator for IterByColRangeTx<'a, R> {
-    type Item = RowRef<'a>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        match self {
-            IterByColRangeTx::Scan(iter) => iter.next(),
-            IterByColRangeTx::Index(iter) => iter.next(),
-        }
+impl RowFilter for EqOnColumn<'_> {
    fn filter<'a>(&self, row: RowRef<'a>) -> bool {
        self.val == &row.project(&self.cols).unwrap()
     }
 }
 
-/// An iterator for a range of values in a column.
-pub enum IterByColRangeMutTx<'a, R: RangeBounds<AlgebraicValue>> {
-    /// When the column in question does not have an index.
-    Scan(ScanIterByColRangeMutTx<'a, R>),
-
-    /// When the column has an index.
-    Index(IndexScanRanged<'a>),
-
-    /// When the range itself is empty.
-    RangeEmpty,
+/// Applies filter `F` to `I`, producing another iterator.
+pub struct ApplyFilter<F, I> {
+    iter: I,
+    filter: F,
 }
 
-impl<'a, R: RangeBounds<AlgebraicValue>> Iterator for IterByColRangeMutTx<'a, R> {
-    type Item = RowRef<'a>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        match self {
-            Self::Scan(range) => range.next(),
-            Self::Index(range) => range.next(),
-            Self::RangeEmpty => None,
-        }
-    }
-}
-
-pub struct ScanIterByColRangeTx<'a, R: RangeBounds<AlgebraicValue>> {
-    scan_iter: IterTx<'a>,
-    cols: ColList,
-    range: R,
-}
-
-impl<'a, R: RangeBounds<AlgebraicValue>> ScanIterByColRangeTx<'a, R> {
-    // TODO(perf, centril): consider taking `cols` by reference.
-    pub(super) fn new(scan_iter: IterTx<'a>, cols: ColList, range: R) -> Self {
-        Self { scan_iter, cols, range }
+impl<F, I> ApplyFilter<F, I> {
+    /// Returns an iterator that applies `filter` to `iter`.
+    pub(super) fn new(filter: F, iter: I) -> Self {
+        Self { iter, filter }
     }
 }
 
-impl<'a, R: RangeBounds<AlgebraicValue>> Iterator for ScanIterByColRangeTx<'a, R> {
+impl<'a, F: RowFilter, I: Iterator<Item = RowRef<'a>>> Iterator for ApplyFilter<F, I> {
     type Item = RowRef<'a>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        for row_ref in &mut self.scan_iter {
-            let value = row_ref.project(&self.cols).unwrap();
-            if self.range.contains(&value) {
-                return Some(row_ref);
-            }
-        }
-
-        None
+        self.iter.find(|row| self.filter.filter(*row))
     }
 }
 
-pub struct ScanIterByColRangeMutTx<'a, R: RangeBounds<AlgebraicValue>> {
-    scan_iter: IterMutTx<'a>,
-    cols: ColList,
-    range: R,
-}
+type ScanFilterTx<'a, F> = ApplyFilter<F, TableScanIter<'a>>;
+pub type IterByColRangeTx<'a, R> = ScanOrIndex<ScanFilterTx<'a, RangeOnColumn<R>>, IndexScanRangeIter<'a>>;
+pub type IterByColEqTx<'a, 'r> = ScanOrIndex<ScanFilterTx<'a, EqOnColumn<'r>>, IndexScanPointIter<'a>>;
 
-impl<'a, R: RangeBounds<AlgebraicValue>> ScanIterByColRangeMutTx<'a, R> {
-    // TODO(perf, centril): consider taking `cols` by reference.
-    pub(super) fn new(scan_iter: IterMutTx<'a>, cols: ColList, range: R) -> Self {
-        Self { scan_iter, cols, range }
-    }
+type ScanFilterMutTx<'a, F> = ApplyFilter<F, IterMutTx<'a>>;
+pub type IterByColRangeMutTx<'a, R> = ScanOrIndex<ScanFilterMutTx<'a, RangeOnColumn<R>>, IndexScanRanged<'a>>;
+pub type IterByColEqMutTx<'a, 'r> = ScanOrIndex<ScanFilterMutTx<'a, EqOnColumn<'r>>, IndexScanPoint<'a>>;
+
+/// An iterator that either scans or index scans.
+pub enum ScanOrIndex<S, I> {
+    /// When the column in question does not have an index.
+    Scan(S),
+
+    /// When the column has an index.
+    Index(I),
 }
 
-impl<'a, R: RangeBounds<AlgebraicValue>> Iterator for ScanIterByColRangeMutTx<'a, R> {
+impl<'a, S, I> Iterator for ScanOrIndex<S, I>
+where
+    S: Iterator<Item = RowRef<'a>>,
+    I: Iterator<Item = RowRef<'a>>,
+{
     type Item = RowRef<'a>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        for row_ref in &mut self.scan_iter {
-            let value = row_ref.project(&self.cols).unwrap();
-            if self.range.contains(&value) {
-                return Some(row_ref);
-            }
+        match self {
+            Self::Scan(iter) => iter.next(),
+            Self::Index(iter) => iter.next(),
         }
-
-        None
     }
 }
diff --git a/crates/datastore/src/locking_tx_datastore/tx.rs b/crates/datastore/src/locking_tx_datastore/tx.rs
index e411c858237..d3e418276f9 100644
--- a/crates/datastore/src/locking_tx_datastore/tx.rs
+++ b/crates/datastore/src/locking_tx_datastore/tx.rs
@@ -5,14 +5,13 @@ use super::{
     IterByColEqTx, SharedReadGuard,
 };
 use crate::execution_context::ExecutionContext;
-use crate::locking_tx_datastore::state_view::IterTx;
 use spacetimedb_durability::TxOffset;
 use spacetimedb_execution::Datastore;
 use spacetimedb_lib::metrics::ExecutionMetrics;
 use spacetimedb_primitives::{ColList, IndexId, TableId};
 use spacetimedb_sats::AlgebraicValue;
 use spacetimedb_schema::schema::TableSchema;
-use spacetimedb_table::table::{IndexScanRangeIter, TableScanIter};
+use spacetimedb_table::table::{IndexScanPointIter, IndexScanRangeIter, TableAndIndex, TableScanIter};
 use std::sync::Arc;
 use std::{future, num::NonZeroU64};
 use std::{
@@ -37,11 +36,16 @@ impl Datastore for TxId {
     where
         Self: 'a;
 
-    type IndexIter<'a>
+    type RangeIndexIter<'a>
         = IndexScanRangeIter<'a>
     where
         Self: 'a;
 
+    type PointIndexIter<'a>
+        = IndexScanPointIter<'a>
+    where
+        Self: 'a;
+
     fn row_count(&self, table_id: TableId) -> u64 {
         self.committed_state_shared_lock
             .table_row_count(table_id)
@@ -50,31 +54,31 @@ impl Datastore for TxId {
 
     fn table_scan<'a>(&'a self, table_id: TableId) -> anyhow::Result<Self::TableIter<'a>> {
         self.committed_state_shared_lock
-            .get_table(table_id)
-            .map(|table| table.scan_rows(&self.committed_state_shared_lock.blob_store))
+            .table_scan(table_id)
             .ok_or_else(|| anyhow::anyhow!("TableId `{table_id}` does not exist"))
     }
 
-    fn index_scan<'a>(
+    fn index_scan_range<'a>(
         &'a self,
         table_id: TableId,
         index_id: IndexId,
         range: &impl RangeBounds<AlgebraicValue>,
-    ) -> anyhow::Result<Self::IndexIter<'a>> {
-        self.committed_state_shared_lock
-            .get_table(table_id)
-            .ok_or_else(|| anyhow::anyhow!("TableId `{table_id}` does not exist"))
-            .and_then(|table| {
-                table
-                    .get_index_by_id_with_table(&self.committed_state_shared_lock.blob_store, index_id)
-                    .map(|i| i.seek_range(range))
-                    .ok_or_else(|| anyhow::anyhow!("IndexId `{index_id}` does not exist"))
-            })
+    ) -> anyhow::Result<Self::RangeIndexIter<'a>> {
+        self.with_index(table_id, index_id, |i| i.seek_range(range))
+    }
+
+    fn index_scan_point<'a>(
+        &'a self,
+        table_id: TableId,
+        index_id: IndexId,
+        point: &AlgebraicValue,
+    ) -> anyhow::Result<Self::PointIndexIter<'a>> {
+        self.with_index(table_id, index_id, |i| i.seek_point(point))
     }
 }
 
 impl StateView for TxId {
-    type Iter<'a> = IterTx<'a>;
+    type Iter<'a> = TableScanIter<'a>;
 
     type IterByColRange<'a, R: RangeBounds<AlgebraicValue>> = IterByColRangeTx<'a, R>;
 
     type IterByColEq<'a, 'r>
        = IterByColEqTx<'a, 'r>
 
@@ -112,11 +116,28 @@ impl StateView for TxId {
         cols: impl Into<ColList>,
         value: &'r AlgebraicValue,
     ) -> Result<Self::IterByColEq<'a, 'r>> {
-        self.iter_by_col_range(table_id, cols.into(), value)
+        self.committed_state_shared_lock.iter_by_col_eq(table_id, cols, value)
     }
 }
 
 impl TxId {
+    fn with_index<'a, R>(
+        &'a self,
+        table_id: TableId,
+        index_id: IndexId,
+        seek: impl FnOnce(TableAndIndex<'a>) -> R,
+    ) -> anyhow::Result<R> {
+        self.committed_state_shared_lock
+            .get_table(table_id)
+            .ok_or_else(|| anyhow::anyhow!("TableId `{table_id}` does not exist"))
+            .and_then(|table| {
+                table
+                    .get_index_by_id_with_table(&self.committed_state_shared_lock.blob_store, index_id)
+                    .map(seek)
+                    .ok_or_else(|| anyhow::anyhow!("IndexId `{index_id}` does not exist"))
+            })
+    }
+
     /// Release this read-only transaction,
     /// allowing new mutable transactions to start if this was the last read-only transaction.
     ///
diff --git a/crates/datastore/src/locking_tx_datastore/tx_state.rs b/crates/datastore/src/locking_tx_datastore/tx_state.rs
index 2dcd277a018..d67e19fcc06 100644
--- a/crates/datastore/src/locking_tx_datastore/tx_state.rs
+++ b/crates/datastore/src/locking_tx_datastore/tx_state.rs
@@ -10,7 +10,7 @@ use spacetimedb_table::{
     indexes::{RowPointer, SquashedOffset},
     pointer_map::PointerMap,
     static_assert_size,
-    table::{IndexScanRangeIter, RowRef, Table, TableAndIndex},
+    table::{IndexScanPointIter, IndexScanRangeIter, RowRef, Table, TableAndIndex},
     table_index::TableIndex,
 };
 use std::collections::{btree_map, BTreeMap};
@@ -134,9 +134,10 @@ impl TxState {
     ///
     /// Matching is defined by `Ord for AlgebraicValue`.
     ///
-    /// For a unique index this will always yield at most one `RowRef`.
+    /// For a unique index this will always yield at most one `RowRef`
+    /// when `range` is a point.
     /// When there is no index this returns `None`.
-    pub(super) fn index_seek_by_cols<'a>(
+    pub(super) fn index_seek_range_by_cols<'a>(
         &'a self,
         table_id: TableId,
         cols: &ColList,
@@ -148,6 +149,26 @@ impl TxState {
             .map(|i| i.seek_range(range))
     }
 
+    /// When there's an index on `cols`,
+    /// returns an iterator over the `TableIndex` that yields all the [`RowRef`]s
+    /// that match the specified `point` in the indexed column.
+    ///
+    /// Matching is defined by `Eq for AlgebraicValue`.
+    ///
+    /// For a unique index this will always yield at most one `RowRef`.
+    /// When there is no index this returns `None`.
+    pub(super) fn index_seek_point_by_cols<'a>(
+        &'a self,
+        table_id: TableId,
+        cols: &ColList,
+        point: &AlgebraicValue,
+    ) -> Option<IndexScanPointIter<'a>> {
+        self.insert_tables
+            .get(&table_id)?
+            .get_index_by_cols_with_table(&self.blob_store, cols)
+            .map(|i| i.seek_point(point))
+    }
+
     /// Returns the table for `table_id` combined with the index for `index_id`, if both exist.
     pub(super) fn get_index_by_id_with_table(&self, table_id: TableId, index_id: IndexId) -> Option<TableAndIndex<'_>> {
         self.insert_tables
diff --git a/crates/execution/src/lib.rs b/crates/execution/src/lib.rs
index 79d782272b9..d9583cbcef7 100644
--- a/crates/execution/src/lib.rs
+++ b/crates/execution/src/lib.rs
@@ -24,8 +24,13 @@ pub trait Datastore {
     where
         Self: 'a;
 
-    /// Iterator type for ranged index scans
-    type IndexIter<'a>: Iterator<Item = RowRef<'a>> + 'a
+    /// Iterator type for ranged index scans.
+    type RangeIndexIter<'a>: Iterator<Item = RowRef<'a>> + 'a
+    where
+        Self: 'a;
+
+    /// Iterator type for point index scans.
+    type PointIndexIter<'a>: Iterator<Item = RowRef<'a>> + 'a
     where
         Self: 'a;
 
@@ -35,13 +40,21 @@ pub trait Datastore {
     /// Scans and returns all of the rows in a table
     fn table_scan<'a>(&'a self, table_id: TableId) -> Result<Self::TableIter<'a>>;
 
-    /// Scans a range of keys from an index returning a [`RowRef`] iterator
-    fn index_scan<'a>(
+    /// Scans a range of keys from an index returning a [`RowRef`] iterator.
+    fn index_scan_range<'a>(
         &'a self,
         table_id: TableId,
         index_id: IndexId,
         range: &impl RangeBounds<AlgebraicValue>,
-    ) -> Result<Self::IndexIter<'a>>;
+    ) -> Result<Self::RangeIndexIter<'a>>;
+
+    /// Scans a single key from an index returning a [`RowRef`] iterator.
+    fn index_scan_point<'a>(
+        &'a self,
+        table_id: TableId,
+        index_id: IndexId,
+        point: &AlgebraicValue,
+    ) -> Result<Self::PointIndexIter<'a>>;
 }
 
 pub trait DeltaStore {
diff --git a/crates/execution/src/pipelined.rs b/crates/execution/src/pipelined.rs
index 9a9a1550f04..1c60bc066b0 100644
--- a/crates/execution/src/pipelined.rs
+++ b/crates/execution/src/pipelined.rs
@@ -726,7 +726,7 @@ impl PipelinedIxScan {
     ) -> Result<()> {
         // A single column index scan
         let single_col_scan = || {
-            tx.index_scan(
+            tx.index_scan_range(
                 self.table_id,
                 self.index_id,
                 &(self.lower.as_ref(), self.upper.as_ref()),
@@ -739,7 +739,7 @@ impl PipelinedIxScan {
         };
         // A multi-column index scan
         let multi_col_scan = |prefix: &[AlgebraicValue]| {
-            tx.index_scan(
+            tx.index_scan_range(
                 self.table_id,
                 self.index_id,
                 &(
@@ -832,7 +832,7 @@ impl PipelinedIxJoin {
         let iter_rhs = |u: &Tuple, lhs_field: &TupleField, bytes_scanned: &mut usize| -> Result<_> {
             let key = project(u, lhs_field, bytes_scanned);
             Ok(tx
-                .index_scan(self.rhs_table, self.rhs_index, &key)?
+                .index_scan_point(self.rhs_table, self.rhs_index, &key)?
                 .map(Row::Ptr)
                 .map(Tuple::Row))
         };
diff --git a/crates/lib/src/filterable_value.rs b/crates/lib/src/filterable_value.rs
index 82a90eb7c4e..d0039823d09 100644
--- a/crates/lib/src/filterable_value.rs
+++ b/crates/lib/src/filterable_value.rs
@@ -126,12 +126,28 @@ impl TermBound<&Bound> {
     }
 }
 pub trait IndexScanRangeBoundsTerminator {
+    /// Whether this bound terminator is a point.
+    const POINT: bool = false;
+
+    /// The key type of the bound.
     type Arg;
+
+    /// Returns the point bound, assuming `POINT == true`.
+    fn point(&self) -> &Self::Arg {
+        unimplemented!()
+    }
+
+    /// Returns the terminal bound for the range scan.
+    /// This bound is either a point, as in most cases, or an actual bound.
     fn bounds(&self) -> TermBound<&Self::Arg>;
 }
 
 impl<Arg: FilterableValue> IndexScanRangeBoundsTerminator for Arg {
+    const POINT: bool = true;
     type Arg = Arg;
+    fn point(&self) -> &Arg {
+        self
+    }
     fn bounds(&self) -> TermBound<&Arg> {
         TermBound::Single(ops::Bound::Included(self))
     }
diff --git a/crates/standalone/src/subcommands/start.rs b/crates/standalone/src/subcommands/start.rs
index 811c3538117..cb7e9f60209 100644
--- a/crates/standalone/src/subcommands/start.rs
+++ b/crates/standalone/src/subcommands/start.rs
@@ -119,7 +119,7 @@ pub async fn exec(args: &ArgMatches, db_cores: JobCores) -> anyhow::Result<()> {
         Storage::Disk
     };
     let page_pool_max_size = args
-        .get_one::<&str>("page_pool_max_size")
+        .get_one::<String>("page_pool_max_size")
         .map(|size| parse_size::Config::new().with_binary().parse_size(size))
         .transpose()
         .context("unrecognized format in `page_pool_max_size`")?
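
The `ScanOrIndex` iterator introduced in state_view.rs above is a static-dispatch fallback: one concrete iterator type that is either an index seek or a filtered full scan, chosen at runtime without boxing. A minimal, self-contained sketch of the same shape follows; `rows_eq` and the map-backed "index" are illustrative stand-ins, not types from these crates.

use std::collections::BTreeMap;

/// Either an index lookup or a filtered full scan, as one iterator type.
/// Mirrors the shape of `ScanOrIndex { Scan, Index }` in state_view.rs.
enum ScanOrIndex<S, I> {
    Scan(S),
    Index(I),
}

impl<T, S: Iterator<Item = T>, I: Iterator<Item = T>> Iterator for ScanOrIndex<S, I> {
    type Item = T;
    fn next(&mut self) -> Option<T> {
        match self {
            Self::Scan(s) => s.next(),
            Self::Index(i) => i.next(),
        }
    }
}

/// Point lookup: use the index when one exists, else scan and filter.
fn rows_eq<'a>(
    index: Option<&'a BTreeMap<u64, Vec<&'a str>>>,
    all_rows: &'a [(u64, &'a str)],
    key: u64,
) -> impl Iterator<Item = &'a str> + 'a {
    match index {
        Some(ix) => ScanOrIndex::Index(ix.get(&key).into_iter().flatten().copied()),
        None => ScanOrIndex::Scan(all_rows.iter().filter(move |(k, _)| *k == key).map(|(_, v)| *v)),
    }
}

fn main() {
    let rows = [(1, "a"), (2, "b"), (1, "c")];
    let mut ix: BTreeMap<u64, Vec<&str>> = BTreeMap::new();
    for (k, v) in rows {
        ix.entry(k).or_default().push(v);
    }
    // Same results whether or not the index is present.
    assert_eq!(rows_eq(Some(&ix), &rows, 1).collect::<Vec<_>>(), ["a", "c"]);
    assert_eq!(rows_eq(None, &rows, 1).collect::<Vec<_>>(), ["a", "c"]);
    println!("ok");
}

Because both arms are one enum, callers get a single unboxed `impl Iterator`, which is why `iter_by_col_eq` and `iter_by_col_range` can share the `ApplyFilter` fallback path.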
diff --git a/crates/table/src/table_index/mod.rs b/crates/table/src/table_index/mod.rs
index 1f817a703eb..f89cda695e0 100644
--- a/crates/table/src/table_index/mod.rs
+++ b/crates/table/src/table_index/mod.rs
@@ -1552,7 +1552,10 @@ mod test {
     }
 
     fn test_seek(index: &TableIndex, val_to_ptr: &HashMap, range: impl RangeBounds, expect: impl IntoIterator) -> TestCaseResult {
-        let mut ptrs_in_index = index.seek_range(&range).collect::<Vec<_>>();
+        check_seek(index.seek_range(&range).collect(), val_to_ptr, expect)
+    }
+
+    fn check_seek(mut ptrs_in_index: Vec<RowPointer>, val_to_ptr: &HashMap, expect: impl IntoIterator) -> TestCaseResult {
         ptrs_in_index.sort();
         let mut expected_ptrs = expect.into_iter().map(|expected| val_to_ptr.get(&expected).unwrap()).copied().collect::<Vec<_>>();
         expected_ptrs.sort();
@@ -1566,6 +1569,7 @@ mod test {
         // Test point ranges.
         for x in range.clone() {
             test_seek(&index, &val_to_ptr, V(x), [x])?;
+            check_seek(index.seek_point(&V(x)).collect(), &val_to_ptr, [x])?;
         }
 
         // Test `..` (`RangeFull`).

From b278fd253a6cf7c034c40a094f538e827901e36f Mon Sep 17 00:00:00 2001
From: Mazdak Farrokhzad
Date: Thu, 11 Dec 2025 00:28:14 +0100
Subject: [PATCH 2/4] move work of do_durability into a DurabilityWorker

---
 crates/core/src/db/durability.rs                   | 148 ++++++++++++++++++
 crates/core/src/db/mod.rs                          |   1 +
 crates/core/src/db/relational_db.rs                |  93 ++---------
 .../subscription/module_subscription_actor.rs      |   4 +-
 crates/datastore/src/execution_context.rs          |  14 +-
 .../locking_tx_datastore/committed_state.rs        |   2 +-
 crates/datastore/src/traits.rs                     |   8 +-
 7 files changed, 177 insertions(+), 93 deletions(-)
 create mode 100644 crates/core/src/db/durability.rs

diff --git a/crates/core/src/db/durability.rs b/crates/core/src/db/durability.rs
new file mode 100644
index 00000000000..05b962cd9fa
--- /dev/null
+++ b/crates/core/src/db/durability.rs
@@ -0,0 +1,148 @@
+use crate::db::persistence::Durability;
+use futures::{channel::mpsc, StreamExt};
+use spacetimedb_commitlog::payload::{
+    txdata::{Mutations, Ops},
+    Txdata,
+};
+use spacetimedb_data_structures::map::IntSet;
+use spacetimedb_datastore::{execution_context::ReducerContext, traits::TxData};
+use spacetimedb_durability::DurableOffset;
+use spacetimedb_primitives::TableId;
+use std::sync::Arc;
+
+/// A request to persist a transaction.
+pub struct DurabilityRequest {
+    reducer_context: Option<ReducerContext>,
+    tx_data: Arc<TxData>,
+}
+
+/// Represents a handle to a background task that persists transactions
+/// according to the [`Durability`] policy provided.
+///
+/// This exists to avoid doing some preparatory work
+/// before sending over to the `Durability` layer.
+#[derive(Clone)]
+pub struct DurabilityWorker {
+    request_tx: mpsc::UnboundedSender<DurabilityRequest>,
+    durability: Arc<Durability>,
+}
+
+impl DurabilityWorker {
+    /// Create a new [`DurabilityWorker`] using the given `durability` policy.
+    pub fn new(durability: Arc<Durability>) -> Self {
+        let (request_tx, request_rx) = mpsc::unbounded();
+
+        let actor = DurabilityWorkerActor {
+            request_rx,
+            durability: durability.clone(),
+        };
+        tokio::spawn(actor.run());
+
+        Self { request_tx, durability }
+    }
+
+    /// Request that a transaction be made durable.
+    /// That is, if `(tx_data, ctx)` should be appended to the commitlog, do so.
+    ///
+    /// Note that by this stage,
+    /// [`spacetimedb_datastore::locking_tx_datastore::committed_state::tx_consumes_offset`]
+    /// has already decided based on the reducer and operations whether the transaction should be appended;
+    /// this method is responsible only for reading its decision out of the `tx_data`
+    /// and calling `durability.append_tx`.
+    ///
+    /// This method does not block,
+    /// and sends the work to an actor that collects data and calls `durability.append_tx`.
+    ///
+    /// Panics if the durability worker has closed the receive end of its queue(s),
+    /// which is likely due to it having panicked.
+    pub fn request_durability(&self, reducer_context: Option<ReducerContext>, tx_data: &Arc<TxData>) {
+        self.request_tx
+            .unbounded_send(DurabilityRequest {
+                reducer_context,
+                tx_data: tx_data.clone(),
+            })
+            .expect("durability worker panicked");
+    }
+
+    /// Get the [`DurableOffset`] of this database.
+    pub fn durable_tx_offset(&self) -> DurableOffset {
+        self.durability.durable_tx_offset()
+    }
+}
+
+pub struct DurabilityWorkerActor {
+    request_rx: mpsc::UnboundedReceiver<DurabilityRequest>,
+    durability: Arc<Durability>,
+}
+
+impl DurabilityWorkerActor {
+    /// Processes requests to do durability.
+    async fn run(mut self) {
+        while let Some(DurabilityRequest {
+            reducer_context,
+            tx_data,
+        }) = self.request_rx.next().await
+        {
+            Self::do_durability(&*self.durability, reducer_context, &tx_data);
+        }
+    }
+
+    pub fn do_durability(durability: &Durability, reducer_context: Option<ReducerContext>, tx_data: &TxData) {
+        if tx_data.tx_offset().is_none() {
+            let name = reducer_context.as_ref().map(|rcx| &*rcx.name);
+            debug_assert!(
+                !tx_data.has_rows_or_connect_disconnect(name),
+                "tx_data has no rows but has connect/disconnect: `{name:?}`"
+            );
+            return;
+        }
+
+        let is_not_ephemeral_table = |table_id: &TableId| -> bool {
+            tx_data
+                .ephemeral_tables()
+                .map(|etables| !etables.contains(table_id))
+                .unwrap_or(true)
+        };
+
+        let inserts: Box<_> = tx_data
+            .inserts()
+            // Skip ephemeral tables
+            .filter(|(table_id, _)| is_not_ephemeral_table(table_id))
+            .map(|(table_id, rowdata)| Ops {
+                table_id: *table_id,
+                rowdata: rowdata.clone(),
+            })
+            .collect();
+
+        let truncates: IntSet<TableId> = tx_data.truncates().collect();
+
+        let deletes: Box<_> = tx_data
+            .deletes()
+            .filter(|(table_id, _)| is_not_ephemeral_table(table_id))
+            .map(|(table_id, rowdata)| Ops {
+                table_id: *table_id,
+                rowdata: rowdata.clone(),
+            })
+            // Filter out deletes for tables that are truncated in the same transaction.
+            .filter(|ops| !truncates.contains(&ops.table_id))
+            .collect();
+
+        let truncates = truncates.into_iter().filter(is_not_ephemeral_table).collect();
+
+        let inputs = reducer_context.map(|rcx| rcx.into());
+
+        let txdata = Txdata {
+            inputs,
+            outputs: None,
+            mutations: Some(Mutations {
+                inserts,
+                deletes,
+                truncates,
+            }),
+        };
+
+        // TODO: Should measure queuing time + actual write.
+        // This does not block, as per trait docs.
+        durability.append_tx(txdata);
+    }
+}
diff --git a/crates/core/src/db/mod.rs b/crates/core/src/db/mod.rs
index 62acd17a78c..3453324b892 100644
--- a/crates/core/src/db/mod.rs
+++ b/crates/core/src/db/mod.rs
@@ -11,6 +11,7 @@ pub mod persistence;
 pub mod relational_db;
 pub mod snapshot;
 pub mod update;
+mod durability;
 
 /// Whether SpacetimeDB is run in memory, or persists objects and
 /// a message log to disk.
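
The worker above follows the usual actor shape: an unbounded queue with a single consumer, so `commit_tx` enqueues and returns instead of waiting on the commitlog. A minimal std-only sketch of that shape follows, using a thread in place of a tokio task; `Worker`, `Request`, and the `append` callback are illustrative stand-ins, not the real `Durability` API.

use std::sync::mpsc;
use std::thread;

/// Stand-in for the real request type: just the bytes to append.
struct Request(Vec<u8>);

/// Handle that enqueues work; a background thread drains the queue.
#[derive(Clone)]
struct Worker {
    tx: mpsc::Sender<Request>,
}

impl Worker {
    fn new(mut append: impl FnMut(Vec<u8>) + Send + 'static) -> Self {
        let (tx, rx) = mpsc::channel::<Request>();
        // Single consumer: preserves commit order while decoupling producers.
        thread::spawn(move || {
            for Request(payload) in rx {
                append(payload);
            }
            // Channel closed: all senders dropped, worker exits.
        });
        Self { tx }
    }

    /// Non-blocking enqueue, mirroring `request_durability`.
    fn request(&self, payload: Vec<u8>) {
        self.tx.send(Request(payload)).expect("durability worker exited");
    }
}

fn main() {
    let (done_tx, done_rx) = mpsc::channel();
    let worker = Worker::new(move |payload| {
        // The real actor would build a `Txdata` and call `durability.append_tx` here.
        done_tx.send(payload.len()).unwrap();
    });
    worker.request(b"tx #1".to_vec());
    worker.request(b"tx #2".to_vec());
    drop(worker); // close the queue so the consumer can finish
    assert_eq!(done_rx.iter().sum::<usize>(), 10);
    println!("ok");
}

As in the patch, a panic on the consumer side surfaces to producers as a failed send, which is why `request_durability` treats a closed queue as "worker panicked".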
diff --git a/crates/core/src/db/relational_db.rs b/crates/core/src/db/relational_db.rs index 99f57806247..ecf8cef6daf 100644 --- a/crates/core/src/db/relational_db.rs +++ b/crates/core/src/db/relational_db.rs @@ -1,3 +1,4 @@ +use crate::db::durability::DurabilityWorker; use crate::db::MetricsRecorderQueue; use crate::error::{DBError, DatabaseError, RestoreSnapshotError}; use crate::messages::control_db::HostType; @@ -10,10 +11,9 @@ use fs2::FileExt; use log::info; use spacetimedb_commitlog::repo::OnNewSegmentFn; use spacetimedb_commitlog::{self as commitlog, SizeOnDisk}; -use spacetimedb_data_structures::map::IntSet; use spacetimedb_datastore::db_metrics::DB_METRICS; use spacetimedb_datastore::error::{DatastoreError, TableError, ViewError}; -use spacetimedb_datastore::execution_context::{ReducerContext, Workload, WorkloadType}; +use spacetimedb_datastore::execution_context::{Workload, WorkloadType}; use spacetimedb_datastore::locking_tx_datastore::datastore::TxMetrics; use spacetimedb_datastore::locking_tx_datastore::state_view::{ IterByColEqMutTx, IterByColRangeMutTx, IterMutTx, StateView, @@ -103,7 +103,7 @@ pub struct RelationalDB { owner_identity: Identity, inner: Locking, - durability: Option>, + durability: Option, snapshot_worker: Option, row_count_fn: RowCountFn, @@ -154,6 +154,7 @@ impl RelationalDB { Arc::new(EnumMap::from_fn(|ty| ExecutionCounters::new(&ty, &database_identity))); let (durability, disk_size_fn, snapshot_worker) = Persistence::unzip(persistence); + let durability = durability.map(DurabilityWorker::new); Self { inner, @@ -766,19 +767,21 @@ impl RelationalDB { } #[tracing::instrument(level = "trace", skip_all)] - pub fn commit_tx(&self, tx: MutTx) -> Result, DBError> { + #[allow(clippy::type_complexity)] + pub fn commit_tx(&self, tx: MutTx) -> Result, TxMetrics, String)>, DBError> { log::trace!("COMMIT MUT TX"); - // TODO: Never returns `None` -- should it? let reducer_context = tx.ctx.reducer_context().cloned(); + // TODO: Never returns `None` -- should it? let Some((tx_offset, tx_data, tx_metrics, reducer)) = self.inner.commit_mut_tx(tx)? else { return Ok(None); }; self.maybe_do_snapshot(&tx_data); + let tx_data = Arc::new(tx_data); if let Some(durability) = &self.durability { - Self::do_durability(&**durability, reducer_context.as_ref(), &tx_data) + durability.request_durability(reducer_context, &tx_data); } Ok(Some((tx_offset, tx_data, tx_metrics, reducer))) @@ -789,7 +792,7 @@ impl RelationalDB { &self, tx: MutTx, workload: Workload, - ) -> Result, DBError> { + ) -> Result, TxMetrics, Tx)>, DBError> { log::trace!("COMMIT MUT TX"); let Some((tx_data, tx_metrics, tx)) = self.inner.commit_mut_tx_downgrade(tx, workload)? else { @@ -798,82 +801,14 @@ impl RelationalDB { self.maybe_do_snapshot(&tx_data); + let tx_data = Arc::new(tx_data); if let Some(durability) = &self.durability { - Self::do_durability(&**durability, tx.ctx.reducer_context(), &tx_data) + durability.request_durability(tx.ctx.reducer_context().cloned(), &tx_data); } Ok(Some((tx_data, tx_metrics, tx))) } - /// If `(tx_data, ctx)` should be appended to the commitlog, do so. - /// - /// Note that by this stage, - /// [`spacetimedb_datastore::locking_tx_datastore::committed_state::tx_consumes_offset`] - /// has already decided based on the reducer and operations whether the transaction should be appended; - /// this method is responsible only for reading its decision out of the `tx_data` - /// and calling `durability.append_tx`. 
- fn do_durability(durability: &Durability, reducer_context: Option<&ReducerContext>, tx_data: &TxData) { - use commitlog::payload::{ - txdata::{Mutations, Ops}, - Txdata, - }; - - let is_not_ephemeral_table = |table_id: &TableId| -> bool { - tx_data - .ephemeral_tables() - .map(|etables| !etables.contains(table_id)) - .unwrap_or(true) - }; - - if tx_data.tx_offset().is_some() { - let inserts: Box<_> = tx_data - .inserts() - // Skip ephemeral tables - .filter(|(table_id, _)| is_not_ephemeral_table(table_id)) - .map(|(table_id, rowdata)| Ops { - table_id: *table_id, - rowdata: rowdata.clone(), - }) - .collect(); - - let truncates: IntSet = tx_data.truncates().collect(); - - let deletes: Box<_> = tx_data - .deletes() - .filter(|(table_id, _)| is_not_ephemeral_table(table_id)) - .map(|(table_id, rowdata)| Ops { - table_id: *table_id, - rowdata: rowdata.clone(), - }) - // filter out deletes for tables that are truncated in the same transaction. - .filter(|ops| !truncates.contains(&ops.table_id)) - .collect(); - - let truncates = truncates.into_iter().filter(is_not_ephemeral_table).collect(); - - let inputs = reducer_context.map(|rcx| rcx.into()); - - let txdata = Txdata { - inputs, - outputs: None, - mutations: Some(Mutations { - inserts, - deletes, - truncates, - }), - }; - - // TODO: Should measure queuing time + actual write - durability.append_tx(txdata); - } else { - debug_assert!( - !tx_data.has_rows_or_connect_disconnect(reducer_context), - "tx_data has no rows but has connect/disconnect: `{:?}`", - reducer_context.map(|rcx| &rcx.name), - ); - } - } - /// Get the [`DurableOffset`] of this database, or `None` if this is an /// in-memory instance. pub fn durable_tx_offset(&self) -> Option { @@ -1520,8 +1455,8 @@ impl RelationalDB { } /// Reports the metrics for `reducer`, using counters provided by `db`. - pub fn report_mut_tx_metrics(&self, reducer: String, metrics: TxMetrics, tx_data: Option) { - self.report_tx_metrics(reducer, tx_data.map(Arc::new), Some(metrics), None); + pub fn report_mut_tx_metrics(&self, reducer: String, metrics: TxMetrics, tx_data: Option>) { + self.report_tx_metrics(reducer, tx_data, Some(metrics), None); } /// Reports subscription metrics for `reducer`, using counters provided by `db`. diff --git a/crates/core/src/subscription/module_subscription_actor.rs b/crates/core/src/subscription/module_subscription_actor.rs index 8238ba29288..02e10dfea94 100644 --- a/crates/core/src/subscription/module_subscription_actor.rs +++ b/crates/core/src/subscription/module_subscription_actor.rs @@ -1031,7 +1031,7 @@ impl ModuleSubscriptions { return Ok(Err(WriteConflict)); }; *db_update = DatabaseUpdate::from_writes(&tx_data); - (read_tx, Arc::new(tx_data), tx_metrics) + (read_tx, tx_data, tx_metrics) } EventStatus::Failed(_) | EventStatus::OutOfEnergy => { // If the transaction failed, we need to rollback the mutable tx. 
@@ -1198,7 +1198,7 @@ impl ModuleSubscriptions { let _ = extra.send(tx_offset); } self.relational_db - .report_tx_metrics(reducer, Some(Arc::new(tx_data)), Some(tx_metrics_mut), None); + .report_tx_metrics(reducer, Some(tx_data), Some(tx_metrics_mut), None); } }); (guard, offset_rx) diff --git a/crates/datastore/src/execution_context.rs b/crates/datastore/src/execution_context.rs index d6f17a9503a..25343f3d1fc 100644 --- a/crates/datastore/src/execution_context.rs +++ b/crates/datastore/src/execution_context.rs @@ -39,7 +39,7 @@ pub struct ReducerContext { pub arg_bsatn: Bytes, } -impl From<&ReducerContext> for txdata::Inputs { +impl From for txdata::Inputs { fn from( ReducerContext { name, @@ -47,9 +47,9 @@ impl From<&ReducerContext> for txdata::Inputs { caller_connection_id, timestamp, arg_bsatn, - }: &ReducerContext, + }: ReducerContext, ) -> Self { - let reducer_name = Arc::new(Varchar::from_str_truncate(name)); + let reducer_name = Arc::new(Varchar::from_string_truncate(name)); let cap = arg_bsatn.len() /* caller_identity */ + 32 @@ -58,10 +58,10 @@ impl From<&ReducerContext> for txdata::Inputs { /* timestamp */ + 8; let mut buf = Vec::with_capacity(cap); - bsatn::to_writer(&mut buf, caller_identity).unwrap(); - bsatn::to_writer(&mut buf, caller_connection_id).unwrap(); - bsatn::to_writer(&mut buf, timestamp).unwrap(); - buf.extend_from_slice(arg_bsatn); + bsatn::to_writer(&mut buf, &caller_identity).unwrap(); + bsatn::to_writer(&mut buf, &caller_connection_id).unwrap(); + bsatn::to_writer(&mut buf, ×tamp).unwrap(); + buf.extend_from_slice(&arg_bsatn); txdata::Inputs { reducer_name, diff --git a/crates/datastore/src/locking_tx_datastore/committed_state.rs b/crates/datastore/src/locking_tx_datastore/committed_state.rs index 41e5615c18e..370135a100f 100644 --- a/crates/datastore/src/locking_tx_datastore/committed_state.rs +++ b/crates/datastore/src/locking_tx_datastore/committed_state.rs @@ -710,7 +710,7 @@ impl CommittedState { // Note that this may change in the future: some analytics and/or // timetravel queries may benefit from seeing all inputs, even if // the database state did not change. - tx_data.has_rows_or_connect_disconnect(ctx.reducer_context()) + tx_data.has_rows_or_connect_disconnect(ctx.reducer_context().map(|rcx| &*rcx.name)) } pub(super) fn drop_view_from_read_sets(&mut self, view_id: ViewId, sender: Option) { diff --git a/crates/datastore/src/traits.rs b/crates/datastore/src/traits.rs index e1161b5a11b..7e49e56141f 100644 --- a/crates/datastore/src/traits.rs +++ b/crates/datastore/src/traits.rs @@ -6,7 +6,7 @@ use std::{ops::RangeBounds, sync::Arc}; use super::locking_tx_datastore::datastore::TxMetrics; use super::system_tables::ModuleKind; use super::Result; -use crate::execution_context::{ReducerContext, Workload}; +use crate::execution_context::Workload; use crate::system_tables::ST_TABLE_ID; use spacetimedb_data_structures::map::{IntMap, IntSet}; use spacetimedb_durability::TxOffset; @@ -236,7 +236,7 @@ impl TxData { /// Determines which ephemeral tables were modified in this transaction. /// /// Iterates over the tables updated in this transaction and records those that - /// also appear in `all_ephemeral_tables`. + /// also appear in `all_ephemeral_tables`. /// `self.ephemeral_tables` remains `None` if no ephemeral tables were modified. 
pub fn set_ephemeral_tables(&mut self, all_ephemeral_tables: &EphemeralTables) { for tid in self.tables.keys() { @@ -307,11 +307,11 @@ impl TxData { /// Check if this [`TxData`] contains any `inserted | deleted` rows or `connect/disconnect` operations. /// /// This is used to determine if a transaction should be written to disk. - pub fn has_rows_or_connect_disconnect(&self, reducer_context: Option<&ReducerContext>) -> bool { + pub fn has_rows_or_connect_disconnect(&self, reducer_name: Option<&str>) -> bool { self.inserts().any(|(_, inserted_rows)| !inserted_rows.is_empty()) || self.deletes().any(|(.., deleted_rows)| !deleted_rows.is_empty()) || matches!( - reducer_context.map(|rcx| rcx.name.strip_prefix("__identity_")), + reducer_name.map(|rn| rn.strip_prefix("__identity_")), Some(Some("connected__" | "disconnected__")) ) } From d23f3ffc3f34cb065f85b18619d10a6afbf05ec7 Mon Sep 17 00:00:00 2001 From: Mazdak Farrokhzad Date: Fri, 4 Jul 2025 11:32:10 +0200 Subject: [PATCH 3/4] subscriptions: reuse buffers in `ServerMessage` via global pool --- crates/bench/benches/subscription.rs | 21 +-- crates/client-api-messages/src/websocket.rs | 5 + crates/client-api/src/routes/subscribe.rs | 36 +++-- crates/core/src/client.rs | 1 + crates/core/src/client/client_connection.rs | 4 + crates/core/src/client/consume_each_list.rs | 80 +++++++++++ crates/core/src/client/messages.rs | 7 + crates/core/src/host/host_controller.rs | 20 ++- crates/core/src/host/module_host.rs | 16 ++- .../core/src/subscription/execution_unit.rs | 5 +- crates/core/src/subscription/mod.rs | 129 ++++++++++-------- .../subscription/module_subscription_actor.rs | 82 ++++++++--- .../module_subscription_manager.rs | 18 ++- crates/core/src/subscription/query.rs | 5 +- .../src/subscription/row_list_builder_pool.rs | 90 ++++++++++++ crates/core/src/subscription/subscription.rs | 5 +- .../src/subscription/websocket_building.rs | 29 +++- crates/core/src/worker_metrics/mod.rs | 49 +++++++ crates/execution/src/lib.rs | 19 +-- crates/sats/src/bsatn.rs | 55 +++++++- crates/standalone/src/lib.rs | 5 + crates/standalone/src/subcommands/start.rs | 1 + crates/table/src/static_layout.rs | 40 ++---- crates/table/src/table.rs | 7 +- crates/vm/src/relation.rs | 5 +- 25 files changed, 575 insertions(+), 159 deletions(-) create mode 100644 crates/core/src/client/consume_each_list.rs create mode 100644 crates/core/src/subscription/row_list_builder_pool.rs diff --git a/crates/bench/benches/subscription.rs b/crates/bench/benches/subscription.rs index 0f022010193..065916f78b2 100644 --- a/crates/bench/benches/subscription.rs +++ b/crates/bench/benches/subscription.rs @@ -1,10 +1,12 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use spacetimedb::client::consume_each_list::ConsumeEachBuffer; use spacetimedb::error::DBError; use spacetimedb::host::module_host::DatabaseTableUpdate; use spacetimedb::identity::AuthCtx; use spacetimedb::messages::websocket::BsatnFormat; use spacetimedb::sql::ast::SchemaViewer; use spacetimedb::subscription::query::compile_read_only_queryset; +use spacetimedb::subscription::row_list_builder_pool::BsatnRowListBuilderPool; use spacetimedb::subscription::subscription::ExecutionSet; use spacetimedb::subscription::tx::DeltaTx; use spacetimedb::subscription::{collect_table_update, TableUpdateType}; @@ -119,6 +121,8 @@ fn eval(c: &mut Criterion) { let ins_rhs = insert_op(rhs, "location", new_rhs_row); let update = [&ins_lhs, &ins_rhs]; + let bsatn_rlb_pool = black_box(BsatnRowListBuilderPool::new()); + // A 
benchmark runner for the new query engine let bench_query = |c: &mut Criterion, name, sql| { c.bench_function(name, |b| { @@ -134,13 +138,17 @@ fn eval(c: &mut Criterion) { let tx = DeltaTx::from(&tx); b.iter(|| { - drop(black_box(collect_table_update::<_, BsatnFormat>( + let updates = black_box(collect_table_update::( &plans, table_id, table_name.clone(), &tx, TableUpdateType::Subscribe, - ))) + &bsatn_rlb_pool, + )); + if let Ok((updates, _)) = updates { + updates.consume_each_list(&mut |buffer| bsatn_rlb_pool.try_put(buffer)); + } }) }); }; @@ -152,12 +160,9 @@ fn eval(c: &mut Criterion) { let query: ExecutionSet = query.into(); b.iter(|| { - drop(black_box(query.eval::( - &raw.db, - &tx, - None, - Compression::None, - ))) + let updates = + black_box(query.eval::(&raw.db, &tx, &bsatn_rlb_pool, None, Compression::None)); + updates.consume_each_list(&mut |buffer| bsatn_rlb_pool.try_put(buffer)); }) }); }; diff --git a/crates/client-api-messages/src/websocket.rs b/crates/client-api-messages/src/websocket.rs index 4620f508809..f0ccbe2bb7d 100644 --- a/crates/client-api-messages/src/websocket.rs +++ b/crates/client-api-messages/src/websocket.rs @@ -941,6 +941,11 @@ impl BsatnRowList { let data_range = self.size_hint.index_to_range(index, data_end)?; Some(self.rows_data.slice(data_range)) } + + /// Consumes the list and returns the parts. + pub fn into_inner(self) -> (RowSizeHint, Bytes) { + (self.size_hint, self.rows_data) + } } /// An iterator over all the elements in a [`BsatnRowList`]. diff --git a/crates/client-api/src/routes/subscribe.rs b/crates/client-api/src/routes/subscribe.rs index 377c598fb14..9a80e96c7f7 100644 --- a/crates/client-api/src/routes/subscribe.rs +++ b/crates/client-api/src/routes/subscribe.rs @@ -32,6 +32,7 @@ use spacetimedb::client::{ }; use spacetimedb::host::module_host::ClientConnectedError; use spacetimedb::host::NoSuchModule; +use spacetimedb::subscription::row_list_builder_pool::BsatnRowListBuilderPool; use spacetimedb::util::spawn_rayon; use spacetimedb::worker_metrics::WORKER_METRICS; use spacetimedb::Identity; @@ -404,6 +405,8 @@ async fn ws_client_actor_inner( let (idle_tx, idle_rx) = watch::channel(state.next_idle_deadline()); let idle_timer = ws_idle_timer(idle_rx); + let bsatn_rlb_pool = client.module().subscriptions().bsatn_rlb_pool.clone(); + // Spawn a task to send outgoing messages // obtained from `sendrx` and `unordered_rx`. let send_task = tokio::spawn(ws_send_loop( @@ -412,6 +415,7 @@ async fn ws_client_actor_inner( ws_send, sendrx, unordered_rx, + bsatn_rlb_pool, )); // Spawn a task to handle incoming messages. let recv_task = tokio::spawn(ws_recv_task( @@ -1050,10 +1054,11 @@ async fn ws_send_loop( ws: impl Sink + Unpin, messages: impl Receiver, unordered: mpsc::UnboundedReceiver, + bsatn_rlb_pool: BsatnRowListBuilderPool, ) { let metrics = SendMetrics::new(state.database); - ws_send_loop_inner(state, ws, messages, unordered, |encode_rx, frames_tx| { - ws_encode_task(metrics, config, encode_rx, frames_tx) + ws_send_loop_inner(state, ws, messages, unordered, move |encode_rx, frames_tx| { + ws_encode_task(metrics, config, encode_rx, frames_tx, bsatn_rlb_pool) }) .await } @@ -1231,6 +1236,7 @@ async fn ws_encode_task( config: ClientConfig, mut messages: mpsc::UnboundedReceiver, outgoing_frames: mpsc::UnboundedSender, + bsatn_rlb_pool: BsatnRowListBuilderPool, ) { // Serialize buffers can be reclaimed once all frames of a message are // copied to the wire. 
Since we don't know when that will happen, we prepare @@ -1249,7 +1255,7 @@ async fn ws_encode_task( let in_use_buf = match message { OutboundMessage::Error(message) => { - let (stats, in_use, mut frames) = ws_encode_message(config, buf, message, false).await; + let (stats, in_use, mut frames) = ws_encode_message(config, buf, message, false, &bsatn_rlb_pool).await; metrics.report(None, None, stats); if frames.try_for_each(|frame| outgoing_frames.send(frame)).is_err() { break; @@ -1262,7 +1268,8 @@ async fn ws_encode_task( let num_rows = message.num_rows(); let is_large = num_rows.is_some_and(|n| n > 1024); - let (stats, in_use, mut frames) = ws_encode_message(config, buf, message, is_large).await; + let (stats, in_use, mut frames) = + ws_encode_message(config, buf, message, is_large, &bsatn_rlb_pool).await; metrics.report(workload, num_rows, stats); if frames.try_for_each(|frame| outgoing_frames.send(frame)).is_err() { break; @@ -1319,18 +1326,25 @@ async fn ws_encode_message( buf: SerializeBuffer, message: impl ToProtocol + Send + 'static, is_large_message: bool, + bsatn_rlb_pool: &BsatnRowListBuilderPool, ) -> (EncodeMetrics, InUseSerializeBuffer, impl Iterator) { const FRAGMENT_SIZE: usize = 4096; - let serialize_and_compress = |serialize_buf, message, config| { + fn serialize_and_compress( + bsatn_rlb_pool: &BsatnRowListBuilderPool, + serialize_buf: SerializeBuffer, + message: impl ToProtocol + Send + 'static, + config: ClientConfig, + ) -> (Duration, InUseSerializeBuffer, DataMessage) { let start = Instant::now(); - let (msg_alloc, msg_data) = serialize(serialize_buf, message, config); + let (msg_alloc, msg_data) = serialize(bsatn_rlb_pool, serialize_buf, message, config); (start.elapsed(), msg_alloc, msg_data) - }; + } let (timing, msg_alloc, msg_data) = if is_large_message { - spawn_rayon(move || serialize_and_compress(buf, message, config)).await + let bsatn_rlb_pool = bsatn_rlb_pool.clone(); + spawn_rayon(move || serialize_and_compress(&bsatn_rlb_pool, buf, message, config)).await } else { - serialize_and_compress(buf, message, config) + serialize_and_compress(bsatn_rlb_pool, buf, message, config) }; let metrics = EncodeMetrics { @@ -1630,6 +1644,7 @@ mod tests { sink::drain(), messages_rx, unordered_rx, + BsatnRowListBuilderPool::new(), ); pin_mut!(send_loop); @@ -1653,6 +1668,7 @@ mod tests { sink::drain(), messages_rx, unordered_rx, + BsatnRowListBuilderPool::new(), ); pin_mut!(send_loop); @@ -1703,6 +1719,7 @@ mod tests { UnfeedableSink, messages_rx, unordered_rx, + BsatnRowListBuilderPool::new(), ); pin_mut!(send_loop); @@ -1749,6 +1766,7 @@ mod tests { UnflushableSink, messages_rx, unordered_rx, + BsatnRowListBuilderPool::new(), ); pin_mut!(send_loop); diff --git a/crates/core/src/client.rs b/crates/core/src/client.rs index c37d0f58b45..4322587dd41 100644 --- a/crates/core/src/client.rs +++ b/crates/core/src/client.rs @@ -3,6 +3,7 @@ use std::fmt; mod client_connection; mod client_connection_index; +pub mod consume_each_list; mod message_handlers; pub mod messages; diff --git a/crates/core/src/client/client_connection.rs b/crates/core/src/client/client_connection.rs index 00c0f3024b3..37628fb2325 100644 --- a/crates/core/src/client/client_connection.rs +++ b/crates/core/src/client/client_connection.rs @@ -15,6 +15,7 @@ use crate::host::module_host::ClientConnectedError; use crate::host::{CallProcedureReturn, FunctionArgs, ModuleHost, NoSuchModule, ReducerCallError, ReducerCallResult}; use crate::messages::websocket::Subscribe; use 
crate::subscription::module_subscription_manager::BroadcastError; +use crate::subscription::row_list_builder_pool::JsonRowListBuilderFakePool; use crate::util::asyncify; use crate::util::prometheus_handle::IntGaugeExt; use crate::worker_metrics::WORKER_METRICS; @@ -954,6 +955,7 @@ impl ClientConnection { self.sender.clone(), message_id.to_owned(), timer, + JsonRowListBuilderFakePool, |msg: OneOffQueryResponseMessage| msg.into(), ) .await @@ -965,6 +967,7 @@ impl ClientConnection { message_id: &[u8], timer: Instant, ) -> Result<(), anyhow::Error> { + let bsatn_rlb_pool = self.module().replica_ctx().subscriptions.bsatn_rlb_pool.clone(); self.module() .one_off_query::( self.auth.clone(), @@ -972,6 +975,7 @@ impl ClientConnection { self.sender.clone(), message_id.to_owned(), timer, + bsatn_rlb_pool, |msg: OneOffQueryResponseMessage| msg.into(), ) .await diff --git a/crates/core/src/client/consume_each_list.rs b/crates/core/src/client/consume_each_list.rs new file mode 100644 index 00000000000..191b726c570 --- /dev/null +++ b/crates/core/src/client/consume_each_list.rs @@ -0,0 +1,80 @@ +use bytes::Bytes; +use spacetimedb_client_api_messages::websocket::{ + BsatnFormat, BsatnRowList, CompressableQueryUpdate, DatabaseUpdate, OneOffQueryResponse, QueryUpdate, + ServerMessage, TableUpdate, UpdateStatus, +}; + +/// Moves each buffer in `self` into a closure. +pub trait ConsumeEachBuffer { + /// Consumes `self`, moving each `Bytes` buffer in `self` into the closure `each`. + fn consume_each_list(self, each: &mut impl FnMut(Bytes)); +} + +impl ConsumeEachBuffer for ServerMessage { + fn consume_each_list(self, each: &mut impl FnMut(Bytes)) { + use ServerMessage::*; + match self { + InitialSubscription(x) => x.database_update.consume_each_list(each), + TransactionUpdate(x) => x.status.consume_each_list(each), + TransactionUpdateLight(x) => x.update.consume_each_list(each), + IdentityToken(_) | ProcedureResult(_) | SubscriptionError(_) => {} + OneOffQueryResponse(x) => x.consume_each_list(each), + SubscribeApplied(x) => x.rows.table_rows.consume_each_list(each), + UnsubscribeApplied(x) => x.rows.table_rows.consume_each_list(each), + SubscribeMultiApplied(x) => x.update.consume_each_list(each), + UnsubscribeMultiApplied(x) => x.update.consume_each_list(each), + } + } +} + +impl ConsumeEachBuffer for OneOffQueryResponse { + fn consume_each_list(self, each: &mut impl FnMut(Bytes)) { + Vec::from(self.tables) + .into_iter() + .for_each(|x| x.rows.consume_each_list(each)); + } +} + +impl ConsumeEachBuffer for UpdateStatus { + fn consume_each_list(self, each: &mut impl FnMut(Bytes)) { + match self { + Self::Committed(x) => x.consume_each_list(each), + Self::Failed(_) | UpdateStatus::OutOfEnergy => {} + } + } +} + +impl ConsumeEachBuffer for DatabaseUpdate { + fn consume_each_list(self, each: &mut impl FnMut(Bytes)) { + self.tables.into_iter().for_each(|x| x.consume_each_list(each)); + } +} + +impl ConsumeEachBuffer for TableUpdate { + fn consume_each_list(self, each: &mut impl FnMut(Bytes)) { + self.updates.into_iter().for_each(|x| x.consume_each_list(each)); + } +} + +impl ConsumeEachBuffer for CompressableQueryUpdate { + fn consume_each_list(self, each: &mut impl FnMut(Bytes)) { + match self { + Self::Uncompressed(x) => x.consume_each_list(each), + Self::Brotli(bytes) | Self::Gzip(bytes) => each(bytes), + } + } +} + +impl ConsumeEachBuffer for QueryUpdate { + fn consume_each_list(self, each: &mut impl FnMut(Bytes)) { + self.deletes.consume_each_list(each); + self.inserts.consume_each_list(each); + } +} + 
+impl ConsumeEachBuffer for BsatnRowList { + fn consume_each_list(self, each: &mut impl FnMut(Bytes)) { + let (_, buffer) = self.into_inner(); + each(buffer); + } +} diff --git a/crates/core/src/client/messages.rs b/crates/core/src/client/messages.rs index 811d87dc1cb..4511afb9ec2 100644 --- a/crates/core/src/client/messages.rs +++ b/crates/core/src/client/messages.rs @@ -1,7 +1,9 @@ use super::{ClientConfig, DataMessage, Protocol}; +use crate::client::consume_each_list::ConsumeEachBuffer; use crate::host::module_host::{EventStatus, ModuleEvent, ProcedureCallError}; use crate::host::{ArgsTuple, ProcedureCallResult}; use crate::messages::websocket as ws; +use crate::subscription::row_list_builder_pool::BsatnRowListBuilderPool; use crate::subscription::websocket_building::{brotli_compress, decide_compression, gzip_compress}; use bytes::{BufMut, Bytes, BytesMut}; use bytestring::ByteString; @@ -133,6 +135,7 @@ impl InUseSerializeBuffer { /// If `protocol` is [`Protocol::Binary`], /// the message will be conditionally compressed by this method according to `compression`. pub fn serialize( + bsatn_rlb_pool: &BsatnRowListBuilderPool, mut buffer: SerializeBuffer, msg: impl ToProtocol, config: ClientConfig, @@ -155,6 +158,10 @@ pub fn serialize( bsatn::to_writer(w.into_inner(), &msg).unwrap() }); + // At this point, we no longer have a use for `msg`, + // so try to reclaim its buffers. + msg.consume_each_list(&mut |buffer| bsatn_rlb_pool.try_put(buffer)); + // Conditionally compress the message. let (in_use, msg_bytes) = match decide_compression(srv_msg.len(), config.compression) { Compression::None => buffer.uncompressed(), diff --git a/crates/core/src/host/host_controller.rs b/crates/core/src/host/host_controller.rs index 4bf60054c28..5469677207b 100644 --- a/crates/core/src/host/host_controller.rs +++ b/crates/core/src/host/host_controller.rs @@ -16,6 +16,7 @@ use crate::module_host_context::ModuleCreationContext; use crate::replica_context::ReplicaContext; use crate::subscription::module_subscription_actor::ModuleSubscriptions; use crate::subscription::module_subscription_manager::{spawn_send_worker, SubscriptionManager, TransactionOffset}; +use crate::subscription::row_list_builder_pool::BsatnRowListBuilderPool; use crate::util::asyncify; use crate::util::jobs::{JobCores, SingleCoreExecutor}; use crate::worker_metrics::WORKER_METRICS; @@ -105,6 +106,8 @@ pub struct HostController { runtimes: Arc, /// The CPU cores that are reserved for ModuleHost operations to run on. db_cores: JobCores, + /// The pool of buffers used to build `BsatnRowList`s in subscriptions. + pub bsatn_rlb_pool: BsatnRowListBuilderPool, } struct HostRuntimes { @@ -205,6 +208,7 @@ impl HostController { runtimes: HostRuntimes::new(Some(&data_dir)), data_dir, page_pool: PagePool::new(default_config.page_pool_max_size), + bsatn_rlb_pool: BsatnRowListBuilderPool::new(), db_cores, } } @@ -332,6 +336,7 @@ impl HostController { // core - there's not a concern that we'll only end up using 1/2 // of the actual cores. 
self.db_cores.take(), + self.bsatn_rlb_pool.clone(), ) .await } @@ -617,6 +622,7 @@ async fn make_replica_ctx( database: Database, replica_id: u64, relational_db: Arc, + bsatn_rlb_pool: BsatnRowListBuilderPool, ) -> anyhow::Result { let logger = tokio::task::block_in_place(move || Arc::new(DatabaseLogger::open_today(path.module_logs()))); let send_worker_queue = spawn_send_worker(Some(database.database_identity)); @@ -624,7 +630,8 @@ async fn make_replica_ctx( send_worker_queue.clone(), ))); let downgraded = Arc::downgrade(&subscriptions); - let subscriptions = ModuleSubscriptions::new(relational_db.clone(), subscriptions, send_worker_queue); + let subscriptions = + ModuleSubscriptions::new(relational_db.clone(), subscriptions, send_worker_queue, bsatn_rlb_pool); // If an error occurs when evaluating a subscription, // we mark each client that was affected, @@ -721,11 +728,12 @@ async fn launch_module( replica_dir: ReplicaDir, runtimes: Arc, executor: SingleCoreExecutor, + bsatn_rlb_pool: BsatnRowListBuilderPool, ) -> anyhow::Result<(Program, LaunchedModule)> { let db_identity = database.database_identity; let host_type = database.host_type; - let replica_ctx = make_replica_ctx(replica_dir, database, replica_id, relational_db) + let replica_ctx = make_replica_ctx(replica_dir, database, replica_id, relational_db, bsatn_rlb_pool) .await .map(Arc::new)?; let (scheduler, scheduler_starter) = Scheduler::open(replica_ctx.relational_db.clone()); @@ -831,6 +839,7 @@ impl Host { runtimes, persistence, page_pool, + bsatn_rlb_pool, .. } = host_controller; let on_panic = host_controller.unregister_fn(replica_id); @@ -897,6 +906,7 @@ impl Host { replica_dir, runtimes.clone(), host_controller.db_cores.take(), + bsatn_rlb_pool.clone(), ) .await?; @@ -968,6 +978,7 @@ impl Host { database: Database, program: Program, executor: SingleCoreExecutor, + bsatn_rlb_pool: BsatnRowListBuilderPool, ) -> anyhow::Result> { // Even in-memory databases acquire a lockfile. // Grab a tempdir to put that lockfile in. 
@@ -1000,6 +1011,7 @@ impl Host { phony_replica_dir, runtimes.clone(), executor, + bsatn_rlb_pool, ) .await?; @@ -1202,7 +1214,9 @@ pub async fn extract_schema(program_bytes: Box<[u8]>, host_type: HostType) -> an let runtimes = HostRuntimes::new(None); let page_pool = PagePool::new(None); let core = SingleCoreExecutor::in_current_tokio_runtime(); - let module_info = Host::try_init_in_memory_to_check(&runtimes, page_pool, database, program, core).await?; + let bsatn_rlb_pool = BsatnRowListBuilderPool::new(); + let module_info = + Host::try_init_in_memory_to_check(&runtimes, page_pool, database, program, core, bsatn_rlb_pool).await?; // this should always succeed, but sometimes it doesn't let module_def = match Arc::try_unwrap(module_info) { Ok(info) => info.module_def, diff --git a/crates/core/src/host/module_host.rs b/crates/core/src/host/module_host.rs index f896b998e4a..aaafeb337f1 100644 --- a/crates/core/src/host/module_host.rs +++ b/crates/core/src/host/module_host.rs @@ -23,7 +23,7 @@ use crate::sql::ast::SchemaViewer; use crate::sql::parser::RowLevelExpr; use crate::subscription::module_subscription_actor::ModuleSubscriptions; use crate::subscription::tx::DeltaTx; -use crate::subscription::websocket_building::BuildableWebsocketFormat; +use crate::subscription::websocket_building::{BuildableWebsocketFormat, RowListBuilderSource}; use crate::subscription::{execute_plan, execute_plan_for_view}; use crate::util::jobs::{SingleCoreExecutor, WeakSingleCoreExecutor}; use crate::vm::check_row_limit; @@ -152,9 +152,12 @@ impl UpdatesRelValue<'_> { !(self.deletes.is_empty() && self.inserts.is_empty()) } - pub fn encode(&self) -> (F::QueryUpdate, u64, usize) { - let (deletes, nr_del) = F::encode_list(self.deletes.iter()); - let (inserts, nr_ins) = F::encode_list(self.inserts.iter()); + pub fn encode( + &self, + rlb_pool: &impl RowListBuilderSource, + ) -> (F::QueryUpdate, u64, usize) { + let (deletes, nr_del) = F::encode_list(rlb_pool.take_row_list_builder(), self.deletes.iter()); + let (inserts, nr_ins) = F::encode_list(rlb_pool.take_row_list_builder(), self.inserts.iter()); let num_rows = nr_del + nr_ins; let num_bytes = deletes.num_bytes() + inserts.num_bytes(); let qu = QueryUpdate { deletes, inserts }; @@ -1855,6 +1858,7 @@ impl ModuleHost { client: Arc, message_id: Vec, timer: Instant, + rlb_pool: impl 'static + Send + RowListBuilderSource, // We take this because we only have a way to convert with the concrete types (Bsatn and Json) into_message: impl FnOnce(OneOffQueryResponseMessage) -> SerializableMessage + Send + 'static, ) -> Result<(), anyhow::Error> { @@ -1921,13 +1925,13 @@ impl ModuleHost { .map(|plan| ViewProject::new(plan, num_cols, num_private_cols)) .collect::>(); // Execute the union and return the results - return execute_plan_for_view::<_, F>(&optimized, &DeltaTx::from(&*tx)) + return execute_plan_for_view::(&optimized, &DeltaTx::from(&*tx), &rlb_pool) .map(|(rows, _, metrics)| (OneOffTable { table_name, rows }, metrics)) .context("One-off queries are not allowed to modify the database"); } // Execute the union and return the results - execute_plan::<_, F>(&optimized, &DeltaTx::from(&*tx)) + execute_plan::(&optimized, &DeltaTx::from(&*tx), &rlb_pool) .map(|(rows, _, metrics)| (OneOffTable { table_name, rows }, metrics)) .context("One-off queries are not allowed to modify the database") })(); diff --git a/crates/core/src/subscription/execution_unit.rs b/crates/core/src/subscription/execution_unit.rs index 794495d38c7..daf071133f7 100644 --- 
a/crates/core/src/subscription/execution_unit.rs +++ b/crates/core/src/subscription/execution_unit.rs @@ -5,7 +5,7 @@ use crate::error::DBError; use crate::estimation; use crate::host::module_host::{DatabaseTableUpdate, DatabaseTableUpdateRelValue, UpdatesRelValue}; use crate::messages::websocket::TableUpdate; -use crate::subscription::websocket_building::BuildableWebsocketFormat; +use crate::subscription::websocket_building::{BuildableWebsocketFormat, RowListBuilderSource}; use crate::util::slow::SlowQueryLogger; use crate::vm::{build_query, TxMode}; use spacetimedb_client_api_messages::websocket::{Compression, QueryUpdate, RowListLen as _, SingleQueryUpdate}; @@ -240,6 +240,7 @@ impl ExecutionUnit { &self, db: &RelationalDB, tx: &Tx, + rlb_pool: &impl RowListBuilderSource, sql: &str, slow_query_threshold: Option, compression: Compression, @@ -250,7 +251,7 @@ impl ExecutionUnit { let tx = &tx.into(); let mut inserts = build_query(db, tx, &self.eval_plan, &mut NoInMemUsed); let inserts = inserts.iter(); - let (inserts, num_rows) = F::encode_list(inserts); + let (inserts, num_rows) = F::encode_list(rlb_pool.take_row_list_builder(), inserts); (!inserts.is_empty()).then(|| { let deletes = F::List::default(); diff --git a/crates/core/src/subscription/mod.rs b/crates/core/src/subscription/mod.rs index f893de157cc..32abd7fec9d 100644 --- a/crates/core/src/subscription/mod.rs +++ b/crates/core/src/subscription/mod.rs @@ -1,4 +1,4 @@ -use crate::subscription::websocket_building::{BuildableWebsocketFormat, RowListBuilder as _}; +use crate::subscription::websocket_building::{BuildableWebsocketFormat, RowListBuilder as _, RowListBuilderSource}; use crate::{error::DBError, worker_metrics::WORKER_METRICS}; use anyhow::Result; use metrics::QueryMetrics; @@ -16,6 +16,8 @@ use spacetimedb_execution::{pipelined::PipelinedProject, Datastore, DeltaStore}; use spacetimedb_lib::identity::AuthCtx; use spacetimedb_lib::{metrics::ExecutionMetrics, Identity}; use spacetimedb_primitives::TableId; +use spacetimedb_sats::bsatn::ToBsatn; +use spacetimedb_sats::Serialize; use std::sync::Arc; pub mod delta; @@ -24,6 +26,7 @@ pub mod metrics; pub mod module_subscription_actor; pub mod module_subscription_manager; pub mod query; +pub mod row_list_builder_pool; #[allow(clippy::module_inception)] // it's right this isn't ideal :/ pub mod subscription; pub mod tx; @@ -103,46 +106,48 @@ impl MetricsRecorder for ExecutionCounters { /// /// NOTE: This method was largely copied from [`execute_plan`]. /// TODO: Merge with [`execute_plan`]. 
-pub fn execute_plan_for_view(plan_fragments: &[ViewProject], tx: &Tx) -> Result<(F::List, u64, ExecutionMetrics)> -where - Tx: Datastore + DeltaStore, - F: BuildableWebsocketFormat, -{ - let mut count = 0; - let mut list = F::ListBuilder::default(); - let mut metrics = ExecutionMetrics::default(); - - for fragment in plan_fragments { - fragment.execute(tx, &mut metrics, &mut |row| { - count += 1; - list.push(row); - Ok(()) - })?; - } - - let list = list.finish(); - metrics.bytes_scanned += list.num_bytes(); - metrics.bytes_sent_to_clients += list.num_bytes(); - Ok((list, count, metrics)) +pub fn execute_plan_for_view( + plan_fragments: &[ViewProject], + tx: &(impl Datastore + DeltaStore), + rlb_pool: &impl RowListBuilderSource, +) -> Result<(F::List, u64, ExecutionMetrics)> { + build_list_with_executor(rlb_pool, |metrics, add| { + for fragment in plan_fragments { + fragment.execute(tx, metrics, add)?; + } + Ok(()) + }) } /// Execute a subscription query -pub fn execute_plan(plan_fragments: &[PipelinedProject], tx: &Tx) -> Result<(F::List, u64, ExecutionMetrics)> -where - Tx: Datastore + DeltaStore, - F: BuildableWebsocketFormat, -{ +pub fn execute_plan( + plan_fragments: &[PipelinedProject], + tx: &(impl Datastore + DeltaStore), + rlb_pool: &impl RowListBuilderSource, +) -> Result<(F::List, u64, ExecutionMetrics)> { + build_list_with_executor(rlb_pool, |metrics, add| { + for fragment in plan_fragments { + fragment.execute(tx, metrics, add)?; + } + Ok(()) + }) +} + +/// Returns a list built by passing a function `add` to `driver`, +/// which will call the former for every row it processes. +pub fn build_list_with_executor( + rlb_pool: &impl RowListBuilderSource, + driver: impl FnOnce(&mut ExecutionMetrics, &mut dyn FnMut(R) -> Result<()>) -> Result<()>, +) -> Result<(F::List, u64, ExecutionMetrics)> { let mut count = 0; - let mut list = F::ListBuilder::default(); + let mut list = rlb_pool.take_row_list_builder(); let mut metrics = ExecutionMetrics::default(); - for fragment in plan_fragments { - fragment.execute(tx, &mut metrics, &mut |row| { - count += 1; - list.push(row); - Ok(()) - })?; - } + driver(&mut metrics, &mut |row| { + count += 1; + list.push(row); + Ok(()) + })?; let list = list.finish(); metrics.bytes_scanned += list.num_bytes(); @@ -172,12 +177,13 @@ pub fn collect_table_update_for_view( table_name: Box, tx: &Tx, update_type: TableUpdateType, + rlb_pool: &impl RowListBuilderSource, ) -> Result<(TableUpdate, ExecutionMetrics)> where Tx: Datastore + DeltaStore, F: BuildableWebsocketFormat, { - execute_plan_for_view::(plan_fragments, tx).map(|(rows, num_rows, metrics)| { + execute_plan_for_view::(plan_fragments, tx, rlb_pool).map(|(rows, num_rows, metrics)| { let empty = F::List::default(); let qu = match update_type { TableUpdateType::Subscribe => QueryUpdate { @@ -201,18 +207,15 @@ where } /// Execute a subscription query and collect the results in a [TableUpdate] -pub fn collect_table_update( +pub fn collect_table_update( plan_fragments: &[PipelinedProject], table_id: TableId, table_name: Box, - tx: &Tx, + tx: &(impl Datastore + DeltaStore), update_type: TableUpdateType, -) -> Result<(TableUpdate, ExecutionMetrics)> -where - Tx: Datastore + DeltaStore, - F: BuildableWebsocketFormat, -{ - execute_plan::(plan_fragments, tx).map(|(rows, num_rows, metrics)| { + rlb_pool: &impl RowListBuilderSource, +) -> Result<(TableUpdate, ExecutionMetrics)> { + execute_plan::(plan_fragments, tx, rlb_pool).map(|(rows, num_rows, metrics)| { let empty = F::List::default(); let qu = match 
update_type { TableUpdateType::Subscribe => QueryUpdate { @@ -236,16 +239,13 @@ where } /// Execute a collection of subscription queries in parallel -pub fn execute_plans( +pub fn execute_plans( auth: &AuthCtx, plans: &[Arc], - tx: &Tx, + tx: &(impl Datastore + DeltaStore + Sync), update_type: TableUpdateType, -) -> Result<(DatabaseUpdate, ExecutionMetrics, Vec), DBError> -where - Tx: Datastore + DeltaStore + Sync, - F: BuildableWebsocketFormat, -{ + rlb_pool: &(impl Sync + RowListBuilderSource), +) -> Result<(DatabaseUpdate, ExecutionMetrics, Vec), DBError> { plans .par_iter() .flat_map_iter(|plan| plan.plans_fragments().map(|fragment| (plan.sql(), fragment))) @@ -265,14 +265,35 @@ where if let Some(schema) = plan.return_table() { let pipelined_plan = PipelinedProject::from(plan.clone()); let view_plan = ViewProject::new(pipelined_plan, schema.num_cols(), schema.num_private_cols()); - collect_table_update_for_view(&[view_plan], table_id, (&**table_name).into(), tx, update_type)? + collect_table_update_for_view( + &[view_plan], + table_id, + (&**table_name).into(), + tx, + update_type, + rlb_pool, + )? } else { let pipelined_plan = PipelinedProject::from(plan.clone()); - collect_table_update(&[pipelined_plan], table_id, (&**table_name).into(), tx, update_type)? + collect_table_update( + &[pipelined_plan], + table_id, + (&**table_name).into(), + tx, + update_type, + rlb_pool, + )? } } else { let pipelined_plan = PipelinedProject::from(plan.clone()); - collect_table_update(&[pipelined_plan], table_id, (&**table_name).into(), tx, update_type)? + collect_table_update( + &[pipelined_plan], + table_id, + (&**table_name).into(), + tx, + update_type, + rlb_pool, + )? }; let elapsed = start_time.elapsed(); diff --git a/crates/core/src/subscription/module_subscription_actor.rs b/crates/core/src/subscription/module_subscription_actor.rs index 02e10dfea94..bb778e4f9e2 100644 --- a/crates/core/src/subscription/module_subscription_actor.rs +++ b/crates/core/src/subscription/module_subscription_actor.rs @@ -19,6 +19,7 @@ use crate::host::module_host::{DatabaseUpdate, EventStatus, ModuleEvent}; use crate::host::ModuleHost; use crate::messages::websocket::Subscribe; use crate::subscription::query::is_subscribe_to_all_tables; +use crate::subscription::row_list_builder_pool::{BsatnRowListBuilderPool, JsonRowListBuilderFakePool}; use crate::subscription::{collect_table_update_for_view, execute_plans}; use crate::util::prometheus_handle::IntGaugeExt; use crate::vm::check_row_limit; @@ -56,6 +57,7 @@ pub struct ModuleSubscriptions { /// You will deadlock otherwise. 
subscriptions: Subscriptions, broadcast_queue: BroadcastQueue, + pub bsatn_rlb_pool: BsatnRowListBuilderPool, stats: Arc, metrics: Arc, } @@ -236,6 +238,7 @@ impl ModuleSubscriptions { relational_db: Arc, subscriptions: Subscriptions, broadcast_queue: BroadcastQueue, + bsatn_rlb_pool: BsatnRowListBuilderPool, ) -> Self { let db = &relational_db.database_identity(); let stats = Arc::new(SubscriptionGauges::new(db)); @@ -247,6 +250,7 @@ impl ModuleSubscriptions { broadcast_queue, stats, metrics, + bsatn_rlb_pool, } } @@ -266,6 +270,7 @@ impl ModuleSubscriptions { db, SubscriptionManager::for_test_without_metrics_arc_rwlock(), send_worker_queue, + BsatnRowListBuilderPool::new(), ) } @@ -342,13 +347,27 @@ impl ModuleSubscriptions { .map(PipelinedProject::from) .map(|plan| ViewProject::new(plan, num_cols, view_info.num_private_cols())) .collect::>(); - collect_table_update_for_view(&plans, table_id, table_name.into(), &tx, update_type) - .map(|(table_update, metrics)| (FormatSwitch::Bsatn(table_update), metrics)) + collect_table_update_for_view( + &plans, + table_id, + table_name.into(), + &tx, + update_type, + &self.bsatn_rlb_pool, + ) + .map(|(table_update, metrics)| (FormatSwitch::Bsatn(table_update), metrics)) } (Protocol::Binary, None) => { let plans = plans.into_iter().map(PipelinedProject::from).collect::>(); - collect_table_update(&plans, table_id, table_name.into(), &tx, update_type) - .map(|(table_update, metrics)| (FormatSwitch::Bsatn(table_update), metrics)) + collect_table_update( + &plans, + table_id, + table_name.into(), + &tx, + update_type, + &self.bsatn_rlb_pool, + ) + .map(|(table_update, metrics)| (FormatSwitch::Bsatn(table_update), metrics)) } (Protocol::Text, Some(view_info)) => { let plans = plans @@ -356,13 +375,27 @@ impl ModuleSubscriptions { .map(PipelinedProject::from) .map(|plan| ViewProject::new(plan, num_cols, view_info.num_private_cols())) .collect::>(); - collect_table_update_for_view(&plans, table_id, table_name.into(), &tx, update_type) - .map(|(table_update, metrics)| (FormatSwitch::Json(table_update), metrics)) + collect_table_update_for_view( + &plans, + table_id, + table_name.into(), + &tx, + update_type, + &JsonRowListBuilderFakePool, + ) + .map(|(table_update, metrics)| (FormatSwitch::Json(table_update), metrics)) } (Protocol::Text, None) => { let plans = plans.into_iter().map(PipelinedProject::from).collect::>(); - collect_table_update(&plans, table_id, table_name.into(), &tx, update_type) - .map(|(table_update, metrics)| (FormatSwitch::Json(table_update), metrics)) + collect_table_update( + &plans, + table_id, + table_name.into(), + &tx, + update_type, + &JsonRowListBuilderFakePool, + ) + .map(|(table_update, metrics)| (FormatSwitch::Json(table_update), metrics)) } }?) 
} @@ -391,11 +424,13 @@ impl ModuleSubscriptions { let tx = DeltaTx::from(tx); let (update, metrics, query_metrics) = match sender.config.protocol { Protocol::Binary => { - let (update, metrics, query_metrics) = execute_plans(auth, queries, &tx, update_type)?; + let (update, metrics, query_metrics) = + execute_plans(auth, queries, &tx, update_type, &self.bsatn_rlb_pool)?; (FormatSwitch::Bsatn(update), metrics, query_metrics) } Protocol::Text => { - let (update, metrics, query_metrics) = execute_plans(auth, queries, &tx, update_type)?; + let (update, metrics, query_metrics) = + execute_plans(auth, queries, &tx, update_type, &JsonRowListBuilderFakePool)?; (FormatSwitch::Json(update), metrics, query_metrics) } }; @@ -938,12 +973,18 @@ impl ModuleSubscriptions { let tx = DeltaTx::from(&*tx); let (database_update, metrics, query_metrics) = match sender.config.protocol { - Protocol::Binary => execute_plans(&auth, &queries, &tx, TableUpdateType::Subscribe).map( - |(table_update, metrics, query_metrics)| (FormatSwitch::Bsatn(table_update), metrics, query_metrics), - )?, - Protocol::Text => execute_plans(&auth, &queries, &tx, TableUpdateType::Subscribe).map( - |(table_update, metrics, query_metrics)| (FormatSwitch::Json(table_update), metrics, query_metrics), - )?, + Protocol::Binary => execute_plans(&auth, &queries, &tx, TableUpdateType::Subscribe, &self.bsatn_rlb_pool) + .map(|(table_update, metrics, query_metrics)| { + (FormatSwitch::Bsatn(table_update), metrics, query_metrics) + })?, + Protocol::Text => execute_plans( + &auth, + &queries, + &tx, + TableUpdateType::Subscribe, + &JsonRowListBuilderFakePool, + ) + .map(|(table_update, metrics, query_metrics)| (FormatSwitch::Json(table_update), metrics, query_metrics))?, }; record_query_metrics(&self.relational_db.database_identity(), query_metrics); @@ -1068,7 +1109,12 @@ impl ModuleSubscriptions { ); // Create the delta transaction we'll use to eval updates against. 
let delta_read_tx = DeltaTx::new(&read_tx, tx_data.as_ref(), subscriptions.index_ids_for_subscriptions()); - let update_metrics = subscriptions.eval_updates_sequential((&delta_read_tx, tx_offset), event.clone(), caller); + let update_metrics = subscriptions.eval_updates_sequential( + (&delta_read_tx, tx_offset), + &self.bsatn_rlb_pool, + event.clone(), + caller, + ); read_tx.metrics.merge(update_metrics); Ok(Ok(CommitAndBroadcastEventSuccess { tx_offset: extra_tx_offset, @@ -1267,6 +1313,7 @@ mod tests { use crate::subscription::module_subscription_actor::commit_and_broadcast_event; use crate::subscription::module_subscription_manager::{spawn_send_worker, SubscriptionManager}; use crate::subscription::query::compile_read_only_query; + use crate::subscription::row_list_builder_pool::BsatnRowListBuilderPool; use crate::subscription::TableUpdateType; use core::fmt; use itertools::Itertools; @@ -1311,6 +1358,7 @@ mod tests { db.clone(), SubscriptionManager::for_test_without_metrics_arc_rwlock(), send_worker_queue, + BsatnRowListBuilderPool::new(), ); let auth = AuthCtx::new(owner, sender.auth.claims.identity); diff --git a/crates/core/src/subscription/module_subscription_manager.rs b/crates/core/src/subscription/module_subscription_manager.rs index 05ce5a2bc28..50ccf45a1c8 100644 --- a/crates/core/src/subscription/module_subscription_manager.rs +++ b/crates/core/src/subscription/module_subscription_manager.rs @@ -9,7 +9,8 @@ use crate::error::DBError; use crate::host::module_host::{DatabaseTableUpdate, ModuleEvent, UpdatesRelValue}; use crate::messages::websocket::{self as ws, TableUpdate}; use crate::subscription::delta::eval_delta; -use crate::subscription::websocket_building::BuildableWebsocketFormat; +use crate::subscription::row_list_builder_pool::{BsatnRowListBuilderPool, JsonRowListBuilderFakePool}; +use crate::subscription::websocket_building::{BuildableWebsocketFormat, RowListBuilderSource}; use crate::worker_metrics::WORKER_METRICS; use core::mem; use parking_lot::RwLock; @@ -1141,6 +1142,7 @@ impl SubscriptionManager { pub fn eval_updates_sequential( &self, (tx, tx_offset): (&DeltaTx, TransactionOffset), + bsatn_rlb_pool: &BsatnRowListBuilderPool, event: Arc, caller: Option>, ) -> ExecutionMetrics { @@ -1230,12 +1232,13 @@ impl SubscriptionManager { updates: &UpdatesRelValue<'_>, memory: &mut Option<(F::QueryUpdate, u64, usize)>, metrics: &mut ExecutionMetrics, + rlb_pool: &impl RowListBuilderSource, ) -> SingleQueryUpdate { let (update, num_rows, num_bytes) = memory .get_or_insert_with(|| { // TODO(centril): consider pushing the encoding of each row into // `eval_delta` instead, to avoid building the temporary `Vec`s in `UpdatesRelValue`. - let encoded = updates.encode::(); + let encoded = updates.encode::(rlb_pool); // The first time we insert into this map, we call encode. // This is when we serialize the rows to BSATN/JSON. // Hence this is where we increment `bytes_scanned`. 
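An aside on the `memo_encode` change above: the reason a builder source is threaded through is that the update for a given query is encoded exactly once and then shared, by refcount, with every client subscribed to that query. The following is a minimal, self-contained sketch of that memoize-once pattern; the types here (`Vec<u8>` rows, a closure standing in for `RowListBuilderSource::take_row_list_builder`) are simplified stand-ins, not the real API:

    // Sketch only: `memory` caches the encoded update across clients.
    fn memo_encode(
        rows: &[Vec<u8>],
        memory: &mut Option<(Vec<u8>, u64)>,
        take_buf: impl FnOnce() -> Vec<u8>,
    ) -> (Vec<u8>, u64) {
        memory
            .get_or_insert_with(|| {
                // Encode once; later callers for the same query reuse this buffer.
                let mut buf = take_buf();
                for row in rows {
                    buf.extend_from_slice(row);
                }
                (buf, rows.len() as u64)
            })
            .clone()
    }

    fn main() {
        let rows = vec![vec![1, 2], vec![3]];
        let mut memo = None;
        let (first, n) = memo_encode(&rows, &mut memo, Vec::new);
        // The second client hits the memo; no new buffer is taken.
        let (second, _) = memo_encode(&rows, &mut memo, || unreachable!());
        assert_eq!((first, n), (second, 2));
    }
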
@@ -1280,11 +1283,13 @@ impl SubscriptionManager {
                             &delta_updates,
                             &mut ops_bin_uncompressed,
                             &mut acc.metrics,
+                            bsatn_rlb_pool,
                         )),
                         Protocol::Text => Json(memo_encode::(
                             &delta_updates,
                             &mut ops_json,
                             &mut acc.metrics,
+                            &JsonRowListBuilderFakePool,
                         )),
                     };
                     ClientUpdate {
@@ -1655,6 +1660,7 @@ mod tests {
     use crate::host::module_host::DatabaseTableUpdate;
     use crate::sql::ast::SchemaViewer;
     use crate::subscription::module_subscription_manager::ClientQueryId;
+    use crate::subscription::row_list_builder_pool::BsatnRowListBuilderPool;
     use crate::subscription::tx::DeltaTx;
     use crate::{
         client::{ClientActorId, ClientConfig, ClientConnectionSender, ClientName},
@@ -2524,7 +2530,13 @@ mod tests {
             db.report_read_tx_metrics(reducer, tx_metrics);
         });
         let delta_tx = DeltaTx::from(&*tx);
-        subscriptions.eval_updates_sequential((&delta_tx, offset_rx), event, Some(Arc::new(client0)));
+        let bsatn_rlb_pool = BsatnRowListBuilderPool::new();
+        subscriptions.eval_updates_sequential(
+            (&delta_tx, offset_rx),
+            &bsatn_rlb_pool,
+            event,
+            Some(Arc::new(client0)),
+        );
     }
 
     runtime.block_on(async move {
diff --git a/crates/core/src/subscription/query.rs b/crates/core/src/subscription/query.rs
index 143f0ea96e6..328ec96dee8 100644
--- a/crates/core/src/subscription/query.rs
+++ b/crates/core/src/subscription/query.rs
@@ -156,6 +156,7 @@ mod tests {
     use crate::sql::execute::collect_result;
     use crate::sql::execute::tests::run_for_testing;
     use crate::subscription::module_subscription_manager::QueriedTableIndexIds;
+    use crate::subscription::row_list_builder_pool::BsatnRowListBuilderPool;
     use crate::subscription::subscription::{legacy_get_all, ExecutionSet};
     use crate::subscription::tx::DeltaTx;
     use crate::vm::tests::create_table_with_rows;
@@ -355,7 +356,9 @@ mod tests {
         total_tables: usize,
         rows: &[ProductValue],
     ) -> ResultTest<()> {
-        let result = s.eval::(db, tx, None, Compression::None).tables;
+        let result = s
+            .eval::(db, tx, &BsatnRowListBuilderPool::new(), None, Compression::None)
+            .tables;
         assert_eq!(
             result.len(),
             total_tables,
diff --git a/crates/core/src/subscription/row_list_builder_pool.rs b/crates/core/src/subscription/row_list_builder_pool.rs
new file mode 100644
index 00000000000..3deab7016fb
--- /dev/null
+++ b/crates/core/src/subscription/row_list_builder_pool.rs
@@ -0,0 +1,90 @@
+use crate::subscription::websocket_building::{BsatnRowListBuilder, BuildableWebsocketFormat, RowListBuilderSource};
+use bytes::{Bytes, BytesMut};
+use core::sync::atomic::{AtomicUsize, Ordering};
+use derive_more::Deref;
+use spacetimedb_client_api_messages::websocket::{BsatnFormat, JsonFormat};
+use spacetimedb_data_structures::object_pool::{Pool, PooledObject};
+use spacetimedb_memory_usage::MemoryUsage;
+
+/// The default buffer capacity, currently 4 KiB.
+const DEFAULT_BUFFER_CAPACITY: usize = 4096;
+
+/// The pool can store at most 4 MiB worth of buffers.
+/// NOTE(centril): This hasn't been measured yet,
+/// but this should be a fairly good initial guesstimate
+/// as the server would need to handle half as many tables in total.
+/// If there are two queries mentioning the same table,
+/// that counts as two tables.
+const DEFAULT_POOL_CAPACITY: usize = 1024;
+
+/// New-type for `BytesMut` to deal with the orphan check.
+pub struct PooledBuffer(BytesMut);
+
+impl MemoryUsage for PooledBuffer {
+    fn heap_usage(&self) -> usize {
+        self.0.heap_usage()
+    }
+}
+
+impl PooledObject for PooledBuffer {
+    type ResidentBytesStorage = AtomicUsize;
+
+    fn resident_object_bytes(storage: &Self::ResidentBytesStorage, _: usize) -> usize {
+        storage.load(Ordering::Relaxed)
+    }
+
+    fn add_to_resident_object_bytes(storage: &Self::ResidentBytesStorage, bytes: usize) {
+        storage.fetch_add(bytes, Ordering::Relaxed);
+    }
+
+    fn sub_from_resident_object_bytes(storage: &Self::ResidentBytesStorage, bytes: usize) {
+        storage.fetch_sub(bytes, Ordering::Relaxed);
+    }
+}
+
+/// The pool for [`BsatnRowListBuilder`]s.
+#[derive(Clone, Deref, Debug)]
+pub struct BsatnRowListBuilderPool {
+    pool: Pool,
+}
+
+impl BsatnRowListBuilderPool {
+    /// Returns a new pool with the default maximum capacity.
+    #[allow(clippy::new_without_default)]
+    pub fn new() -> Self {
+        let pool = Pool::new(DEFAULT_POOL_CAPACITY);
+        Self { pool }
+    }
+
+    /// Tries to reclaim the allocation of `buffer` into the pool
+    /// to be used when building a new list.
+    ///
+    /// In most calls, this method will do nothing,
+    /// as `buffer` will be shared between clients subscribing to the same query.
+    /// It's only for the last client that the refcount will be 1,
+    /// which will then cause `put` to add the allocation to the pool.
+    pub fn try_put(&self, buffer: Bytes) {
+        if let Ok(bytes) = buffer.try_into_mut() {
+            self.put(PooledBuffer(bytes));
+        }
+    }
+}
+
+impl RowListBuilderSource for BsatnRowListBuilderPool {
+    fn take_row_list_builder(&self) -> BsatnRowListBuilder {
+        let PooledBuffer(buffer) = self.pool.take(
+            |buffer| buffer.0.clear(),
+            || PooledBuffer(BytesMut::with_capacity(DEFAULT_BUFFER_CAPACITY)),
+        );
+        BsatnRowListBuilder::new_from_bytes(buffer)
+    }
+}
+
+/// A fake "pool" for the [`JsonFormat`] list builder; it just hands out a fresh builder each time.
+pub(crate) struct JsonRowListBuilderFakePool;
+
+impl RowListBuilderSource for JsonRowListBuilderFakePool {
+    fn take_row_list_builder(&self) -> ::ListBuilder {
+        Vec::new()
+    }
+}
diff --git a/crates/core/src/subscription/subscription.rs b/crates/core/src/subscription/subscription.rs
index 975fa2527fc..d96aee8fc93 100644
--- a/crates/core/src/subscription/subscription.rs
+++ b/crates/core/src/subscription/subscription.rs
@@ -28,7 +28,7 @@ use crate::error::{DBError, SubscriptionError};
 use crate::host::module_host::{DatabaseTableUpdate, DatabaseUpdateRelValue, UpdatesRelValue};
 use crate::messages::websocket as ws;
 use crate::sql::ast::SchemaViewer;
-use crate::subscription::websocket_building::BuildableWebsocketFormat;
+use crate::subscription::websocket_building::{BuildableWebsocketFormat, RowListBuilderSource};
 use crate::vm::{build_query, TxMode};
 use anyhow::Context;
 use itertools::Either;
@@ -517,6 +517,7 @@ impl ExecutionSet {
         &self,
         db: &RelationalDB,
         tx: &Tx,
+        rlb_pool: &impl RowListBuilderSource,
         slow_query_threshold: Option,
         compression: Compression,
     ) -> ws::DatabaseUpdate {
@@ -525,7 +526,7 @@
             .exec_units
             // if you need eval to run single-threaded for debugging, change this to .iter()
             .iter()
-            .filter_map(|unit| unit.eval(db, tx, &unit.sql, slow_query_threshold, compression))
+            .filter_map(|unit| unit.eval(db, tx, rlb_pool, &unit.sql, slow_query_threshold, compression))
             .collect();
         ws::DatabaseUpdate { tables }
     }
diff --git a/crates/core/src/subscription/websocket_building.rs b/crates/core/src/subscription/websocket_building.rs
index e4061eb6b83..43944e2df34 100644
--- a/crates/core/src/subscription/websocket_building.rs
+++ b/crates/core/src/subscription/websocket_building.rs
@@ -1,3 +1,4 @@
+use bytes::BytesMut;
 use bytestring::ByteString;
 use core::mem;
 use spacetimedb_client_api_messages::websocket::{
@@ -10,6 +11,12 @@ use spacetimedb_sats::Serialize;
 use std::io;
 use std::io::Write as _;
 
+/// A source of row list builders for a given [`BuildableWebsocketFormat`].
+pub trait RowListBuilderSource {
+    /// Returns a row list builder from the source `self`.
+    fn take_row_list_builder(&self) -> F::ListBuilder;
+}
+
 /// A list of rows being built.
 pub trait RowListBuilder: Default {
     type FinishedList;
@@ -22,13 +29,17 @@
 }
 
 pub trait BuildableWebsocketFormat: WebsocketFormat {
-    /// The builder for [`Self::List`].
+    /// The builder for [`WebsocketFormat::List`].
     type ListBuilder: RowListBuilder;
 
     /// Encodes the `elems` to a list in the format and also returns the length of the list.
-    fn encode_list(elems: impl Iterator) -> (Self::List, u64) {
+    ///
+    /// Needs to be provided with an empty [`Self::ListBuilder`].
+    fn encode_list(
+        mut list: Self::ListBuilder,
+        elems: impl Iterator,
+    ) -> (Self::List, u64) {
         let mut num_rows = 0;
-        let mut list = Self::ListBuilder::default();
         for elem in elems {
             num_rows += 1;
             list.push(elem);
@@ -36,7 +47,7 @@ pub trait BuildableWebsocketFormat: WebsocketFormat {
         (list.finish(), num_rows)
     }
 
-    /// Convert a `QueryUpdate` into `Self::QueryUpdate`.
+    /// Convert a `QueryUpdate` into [`WebsocketFormat::QueryUpdate`].
     /// This allows some formats to e.g., compress the update.
     fn into_query_update(qu: QueryUpdate, compression: Compression) -> Self::QueryUpdate;
 }
@@ -67,7 +78,7 @@ pub struct BsatnRowListBuilder {
     /// intended to facilitate parallel decode purposes on large initial updates.
     size_hint: RowSizeHintBuilder,
     /// The flattened byte array for a list of rows.
-    rows_data: Vec,
+    rows_data: BytesMut,
 }
 
 /// A [`RowSizeHint`] under construction.
@@ -88,6 +99,14 @@ pub enum RowSizeHintBuilder {
     RowOffsets(Vec),
 }
 
+impl BsatnRowListBuilder {
+    /// Returns a new builder using an empty [`BytesMut`] for the `rows_data` buffer.
+    pub fn new_from_bytes(rows_data: BytesMut) -> Self {
+        let size_hint = <_>::default();
+        Self { size_hint, rows_data }
+    }
+}
+
 impl Default for RowSizeHintBuilder {
     fn default() -> Self {
         Self::Empty
diff --git a/crates/core/src/worker_metrics/mod.rs b/crates/core/src/worker_metrics/mod.rs
index a53ed1b3097..3c0fb7aab8d 100644
--- a/crates/core/src/worker_metrics/mod.rs
+++ b/crates/core/src/worker_metrics/mod.rs
@@ -1,5 +1,6 @@
 use crate::hash::Hash;
 use crate::messages::control_db::HostType;
+use crate::subscription::row_list_builder_pool::BsatnRowListBuilderPool;
 use once_cell::sync::Lazy;
 use prometheus::{GaugeVec, HistogramVec, IntCounterVec, IntGaugeVec};
 use spacetimedb_datastore::execution_context::WorkloadType;
@@ -82,6 +83,31 @@ metrics_group!(
         #[labels(node_id: str)]
         pub page_pool_pages_returned: IntGaugeVec,
 
+        #[name = bsatn_rlb_pool_resident_bytes]
+        #[help = "Total memory used by the `BsatnRowListBuilderPool`"]
+        #[labels(node_id: str)]
+        pub bsatn_rlb_pool_resident_bytes: IntGaugeVec,
+
+        #[name = bsatn_rlb_pool_dropped]
+        #[help = "Total number of buffers dropped by the `BsatnRowListBuilderPool`"]
+        #[labels(node_id: str)]
+        pub bsatn_rlb_pool_dropped: IntGaugeVec,
+
+        #[name = bsatn_rlb_pool_new_allocated]
+        #[help = "Total number of fresh buffers allocated by the `BsatnRowListBuilderPool`"]
+        #[labels(node_id: str)]
+        pub bsatn_rlb_pool_new_allocated: IntGaugeVec,
+
+        #[name = bsatn_rlb_pool_reused]
+        #[help = "Total number of buffers reused by the `BsatnRowListBuilderPool`"]
+        #[labels(node_id: str)]
+        pub bsatn_rlb_pool_reused: IntGaugeVec,
+
+        #[name = bsatn_rlb_pool_returned]
+        #[help = "Total number of buffers returned to the `BsatnRowListBuilderPool`"]
+        #[labels(node_id: str)]
+        pub bsatn_rlb_pool_returned: IntGaugeVec,
+
         #[name = tokio_num_workers]
         #[help = "Number of core tokio workers"]
         #[labels(node_id: str)]
@@ -468,6 +494,29 @@ pub fn spawn_page_pool_stats(node_id: String, page_pool: PagePool) {
     });
 }
 
+static SPAWN_BSATN_RLB_POOL_GUARD: Once = Once::new();
+pub fn spawn_bsatn_rlb_pool_stats(node_id: String, pool: BsatnRowListBuilderPool) {
+    SPAWN_BSATN_RLB_POOL_GUARD.call_once(|| {
+        spawn(async move {
+            let resident_bytes = WORKER_METRICS.bsatn_rlb_pool_resident_bytes.with_label_values(&node_id);
+            let dropped_pages = WORKER_METRICS.bsatn_rlb_pool_dropped.with_label_values(&node_id);
+            let new_pages = WORKER_METRICS.bsatn_rlb_pool_new_allocated.with_label_values(&node_id);
+            let reused_pages = WORKER_METRICS.bsatn_rlb_pool_reused.with_label_values(&node_id);
+            let returned_pages = WORKER_METRICS.bsatn_rlb_pool_returned.with_label_values(&node_id);
+
+            loop {
+                resident_bytes.set(pool.heap_usage() as i64);
+                dropped_pages.set(pool.dropped_count() as i64);
+                new_pages.set(pool.new_allocated_count() as i64);
+                reused_pages.set(pool.reused_count() as i64);
+                returned_pages.set(pool.returned_count() as i64);
+
+                sleep(Duration::from_secs(10)).await;
+            }
+        });
+    });
+}
+
 // How frequently to update the tokio stats.
#[cfg(all(target_has_atomic = "64", tokio_unstable))]
 const TOKIO_STATS_INTERVAL: Duration = Duration::from_secs(10);
diff --git a/crates/execution/src/lib.rs b/crates/execution/src/lib.rs
index d9583cbcef7..f239d165771 100644
--- a/crates/execution/src/lib.rs
+++ b/crates/execution/src/lib.rs
@@ -1,18 +1,13 @@
-use std::{
-    hash::{Hash, Hasher},
-    ops::RangeBounds,
-};
-
 use anyhow::Result;
-use spacetimedb_lib::{
-    bsatn::{EncodeError, ToBsatn},
-    query::Delta,
-    sats::impl_serialize,
-    AlgebraicValue, ProductValue,
-};
+use core::hash::{Hash, Hasher};
+use core::ops::RangeBounds;
+use spacetimedb_lib::query::Delta;
 use spacetimedb_physical_plan::plan::{ProjectField, TupleField};
 use spacetimedb_primitives::{ColList, IndexId, TableId};
+use spacetimedb_sats::bsatn::{BufReservedFill, EncodeError, ToBsatn};
+use spacetimedb_sats::buffer::BufWriter;
 use spacetimedb_sats::product_value::InvalidFieldError;
+use spacetimedb_sats::{impl_serialize, AlgebraicValue, ProductValue};
 use spacetimedb_table::{static_assert_size, table::RowRef};
 
 pub mod dml;
@@ -157,7 +152,7 @@ impl ToBsatn for Row<'_> {
         }
     }
 
-    fn to_bsatn_extend(&self, buf: &mut Vec) -> std::result::Result<(), EncodeError> {
+    fn to_bsatn_extend(&self, buf: &mut (impl BufWriter + BufReservedFill)) -> std::result::Result<(), EncodeError> {
         match self {
             Self::Ptr(ptr) => ptr.to_bsatn_extend(buf),
             Self::Ref(val) => val.to_bsatn_extend(buf),
diff --git a/crates/sats/src/bsatn.rs b/crates/sats/src/bsatn.rs
index d1909db952f..aab203f8e75 100644
--- a/crates/sats/src/bsatn.rs
+++ b/crates/sats/src/bsatn.rs
@@ -1,7 +1,10 @@
+use core::mem::MaybeUninit;
+
 use crate::buffer::{BufReader, BufWriter, CountWriter};
 use crate::de::{BasicSmallVecVisitor, Deserialize, DeserializeSeed, Deserializer as _};
 use crate::ser::Serialize;
 use crate::{ProductValue, Typespace, WithTypespace};
+use bytes::BytesMut;
 use ser::BsatnError;
 use smallvec::SmallVec;
 
@@ -106,6 +109,52 @@ codec_funcs!(val: crate::AlgebraicValue);
 codec_funcs!(val: crate::ProductValue);
 codec_funcs!(val: crate::SumValue);
 
+/// Provides a view over a buffer that can reserve an additional `len` bytes
+/// and then provide those as an uninitialized buffer to write into.
+pub trait BufReservedFill {
+    /// Reserves space for `len` in `self` and then runs `fill` to fill it,
+    /// adding `len` to the total length of `self`.
+    ///
+    /// # Safety
+    ///
+    /// `fill` must initialize every byte in the slice.
+    unsafe fn reserve_and_fill(&mut self, len: usize, fill: impl FnOnce(&mut [MaybeUninit]));
+}
+
+impl BufReservedFill for Vec {
+    unsafe fn reserve_and_fill(&mut self, len: usize, fill: impl FnOnce(&mut [MaybeUninit])) {
+        // Get an uninitialized slice within `self` of `len` bytes.
+        let start = self.len();
+        self.reserve(len);
+        let sink = &mut self.spare_capacity_mut()[..len];
+
+        // Run the filling logic.
+        fill(sink);
+
+        // SAFETY: Caller promised that `sink` was fully initialized,
+        // which entails that we initialized `start .. start + len`,
+        // so now we have initialized up to `start + len`.
+        unsafe { self.set_len(start + len) }
+    }
+}
+
+impl BufReservedFill for BytesMut {
+    unsafe fn reserve_and_fill(&mut self, len: usize, fill: impl FnOnce(&mut [MaybeUninit])) {
+        // Get an uninitialized slice within `self` of `len` bytes.
+        let start = self.len();
+        self.reserve(len);
+        let sink = &mut self.spare_capacity_mut()[..len];
+
+        // Run the filling logic.
+        fill(sink);
+
+        // SAFETY: Caller promised that `sink` was fully initialized,
+        // which entails that we initialized `start ..
start + len`, + // so now we have initialized up to `start + len`. + unsafe { self.set_len(start + len) } + } +} + /// Types that can be encoded to BSATN. /// /// Implementations of this trait may be more efficient than directly calling [`to_vec`]. @@ -117,7 +166,7 @@ pub trait ToBsatn { /// BSATN-encode the row referred to by `self` into `buf`, /// pushing `self`'s bytes onto the end of `buf`, similar to [`Vec::extend`]. - fn to_bsatn_extend(&self, buf: &mut Vec) -> Result<(), BsatnError>; + fn to_bsatn_extend(&self, buf: &mut (impl BufWriter + BufReservedFill)) -> Result<(), BsatnError>; /// Returns the static size of the type of this object. /// @@ -129,7 +178,7 @@ impl ToBsatn for &T { fn to_bsatn_vec(&self) -> Result, BsatnError> { T::to_bsatn_vec(*self) } - fn to_bsatn_extend(&self, buf: &mut Vec) -> Result<(), BsatnError> { + fn to_bsatn_extend(&self, buf: &mut (impl BufWriter + BufReservedFill)) -> Result<(), BsatnError> { T::to_bsatn_extend(*self, buf) } fn static_bsatn_size(&self) -> Option { @@ -142,7 +191,7 @@ impl ToBsatn for ProductValue { to_vec(self) } - fn to_bsatn_extend(&self, buf: &mut Vec) -> Result<(), BsatnError> { + fn to_bsatn_extend(&self, buf: &mut (impl BufWriter + BufReservedFill)) -> Result<(), BsatnError> { to_writer(buf, self) } diff --git a/crates/standalone/src/lib.rs b/crates/standalone/src/lib.rs index bc18f5c67cf..cd2da537a54 100644 --- a/crates/standalone/src/lib.rs +++ b/crates/standalone/src/lib.rs @@ -16,6 +16,7 @@ use spacetimedb::energy::{EnergyBalance, EnergyQuanta, NullEnergyMonitor}; use spacetimedb::host::{DiskStorage, HostController, MigratePlanResult, UpdateDatabaseResult}; use spacetimedb::identity::{AuthCtx, Identity}; use spacetimedb::messages::control_db::{Database, Node, Replica}; +use spacetimedb::subscription::row_list_builder_pool::BsatnRowListBuilderPool; use spacetimedb::util::jobs::JobCores; use spacetimedb::worker_metrics::WORKER_METRICS; use spacetimedb_client_api::auth::{self, LOCALHOST}; @@ -107,6 +108,10 @@ impl StandaloneEnv { pub fn page_pool(&self) -> &PagePool { &self.host_controller.page_pool } + + pub fn bsatn_rlb_pool(&self) -> &BsatnRowListBuilderPool { + &self.host_controller.bsatn_rlb_pool + } } #[async_trait] diff --git a/crates/standalone/src/subcommands/start.rs b/crates/standalone/src/subcommands/start.rs index cb7e9f60209..20627b7e886 100644 --- a/crates/standalone/src/subcommands/start.rs +++ b/crates/standalone/src/subcommands/start.rs @@ -181,6 +181,7 @@ pub async fn exec(args: &ArgMatches, db_cores: JobCores) -> anyhow::Result<()> { worker_metrics::spawn_jemalloc_stats(listen_addr.clone()); worker_metrics::spawn_tokio_stats(listen_addr.clone()); worker_metrics::spawn_page_pool_stats(listen_addr.clone(), ctx.page_pool().clone()); + worker_metrics::spawn_bsatn_rlb_pool_stats(listen_addr.clone(), ctx.bsatn_rlb_pool().clone()); let mut db_routes = DatabaseRoutes::default(); db_routes.root_post = db_routes.root_post.layer(DefaultBodyLimit::disable()); db_routes.db_put = db_routes.db_put.layer(DefaultBodyLimit::disable()); diff --git a/crates/table/src/static_layout.rs b/crates/table/src/static_layout.rs index b414189d3a4..de633e203d2 100644 --- a/crates/table/src/static_layout.rs +++ b/crates/table/src/static_layout.rs @@ -29,6 +29,7 @@ use core::mem::MaybeUninit; use core::ptr; use smallvec::SmallVec; use spacetimedb_data_structures::slim_slice::SlimSmallSliceBox; +use spacetimedb_sats::bsatn::BufReservedFill; use spacetimedb_sats::layout::{ AlgebraicTypeLayout, HasLayout, PrimitiveType, 
ProductTypeElementLayout, ProductTypeLayoutView, RowTypeLayout, SumTypeLayout, SumTypeVariantLayout, @@ -86,22 +87,11 @@ impl StaticLayout { /// As a consequence of this, for every `field` in `self.fields`, /// `row[field.bflatn_offset .. field.bflatn_offset + length]` will be initialized. pub(crate) unsafe fn serialize_row_into_vec(&self, row: &Bytes) -> Vec { - // Create an uninitialized buffer `buf` of the correct length. - let bsatn_len = self.bsatn_length as usize; - let mut buf = Vec::with_capacity(bsatn_len); - let sink = buf.spare_capacity_mut(); + let mut buf = Vec::new(); - // (1) Write the row into the slice using a series of `memcpy`s. - // SAFETY: - // - Caller promised that `row` is valid for `self`. - // - `sink` was constructed with exactly the correct length above. - unsafe { - self.serialize_row_into(sink, row); - } + // SAFETY: Forward caller requirements. + unsafe { self.serialize_row_extend(&mut buf, row) }; - // SAFETY: In (1), we initialized `0..len` - // as `row` was valid for `self` per caller requirements. - unsafe { buf.set_len(bsatn_len) } buf } @@ -113,26 +103,18 @@ impl StaticLayout { /// for which `self` was computed. /// As a consequence of this, for every `field` in `self.fields`, /// `row[field.bflatn_offset .. field.bflatn_offset + length]` will be initialized. - pub(crate) unsafe fn serialize_row_extend(&self, buf: &mut Vec, row: &Bytes) { - // Get an uninitialized slice within `buf` of the correct length. - let start = buf.len(); + pub(crate) unsafe fn serialize_row_extend(&self, buf: &mut impl BufReservedFill, row: &Bytes) { let len = self.bsatn_length as usize; - buf.reserve(len); - let sink = &mut buf.spare_capacity_mut()[..len]; - - // (1) Write the row into the slice using a series of `memcpy`s. + // Writes the row into the slice using a series of `memcpy`s. // SAFETY: // - Caller promised that `row` is valid for `self`. // - `sink` was constructed with exactly the correct length above. - unsafe { + let filler = |sink: &mut _| unsafe { self.serialize_row_into(sink, row); - } - - // SAFETY: In (1), we initialized `start .. start + len` - // as `row` was valid for `self` per caller requirements - // and we had initialized up to `start` before, - // so now we have initialized up to `start + len`. - unsafe { buf.set_len(start + len) } + }; + // SAFETY: + // The closure `filler` will write exactly `len` bytes. + unsafe { buf.reserve_and_fill(len, filler) }; } #[allow(unused)] diff --git a/crates/table/src/table.rs b/crates/table/src/table.rs index 92a3d1a8703..05572c29773 100644 --- a/crates/table/src/table.rs +++ b/crates/table/src/table.rs @@ -29,12 +29,13 @@ use core::{mem, ops::RangeBounds}; use derive_more::{Add, AddAssign, From, Sub, SubAssign}; use enum_as_inner::EnumAsInner; use smallvec::SmallVec; -use spacetimedb_lib::{bsatn::DecodeError, de::DeserializeOwned}; use spacetimedb_primitives::{ColId, ColList, IndexId, SequenceId, TableId}; use spacetimedb_sats::memory_usage::MemoryUsage; use spacetimedb_sats::{ algebraic_value::ser::ValueSerializer, - bsatn::{self, ser::BsatnError, ToBsatn}, + bsatn::{self, ser::BsatnError, BufReservedFill, DecodeError, ToBsatn}, + buffer::BufWriter, + de::DeserializeOwned, i256, product_value::InvalidFieldError, satn::Satn, @@ -1913,7 +1914,7 @@ impl ToBsatn for RowRef<'_> { /// /// This method will use a [`StaticLayout`] if one is available, /// and may therefore be faster than calling [`bsatn::to_writer`]. 
- fn to_bsatn_extend(&self, buf: &mut Vec) -> Result<(), BsatnError> { + fn to_bsatn_extend(&self, buf: &mut (impl BufWriter + BufReservedFill)) -> Result<(), BsatnError> { if let Some(static_layout) = self.static_layout() { // Use fast path, by first fetching the row data and then using the static layout. let row = self.get_row_data(); diff --git a/crates/vm/src/relation.rs b/crates/vm/src/relation.rs index 2d625369f87..a56eb4286be 100644 --- a/crates/vm/src/relation.rs +++ b/crates/vm/src/relation.rs @@ -2,7 +2,8 @@ use core::hash::{Hash, Hasher}; use derive_more::From; use spacetimedb_execution::Row; use spacetimedb_lib::db::auth::StAccess; -use spacetimedb_sats::bsatn::{ser::BsatnError, ToBsatn}; +use spacetimedb_sats::bsatn::{ser::BsatnError, BufReservedFill, ToBsatn}; +use spacetimedb_sats::buffer::BufWriter; use spacetimedb_sats::product_value::ProductValue; use spacetimedb_sats::{impl_serialize, AlgebraicValue}; use spacetimedb_schema::relation::{ColExpr, ColExprRef, Header}; @@ -172,7 +173,7 @@ impl ToBsatn for RelValue<'_> { RelValue::ProjRef(this) => (*this).to_bsatn_vec(), } } - fn to_bsatn_extend(&self, buf: &mut Vec) -> Result<(), BsatnError> { + fn to_bsatn_extend(&self, buf: &mut (impl BufWriter + BufReservedFill)) -> Result<(), BsatnError> { match self { RelValue::Row(this) => this.to_bsatn_extend(buf), RelValue::Projection(this) => this.to_bsatn_extend(buf), From 5e619ca33b6fd94196dda91d4db6948315b802af Mon Sep 17 00:00:00 2001 From: Mazdak Farrokhzad Date: Fri, 5 Dec 2025 14:20:19 +0100 Subject: [PATCH 4/4] reuse TxState, stashing in CommittedState btw txes --- Cargo.lock | 1 + .../locking_tx_datastore/committed_state.rs | 74 ++++++++++--------- .../src/locking_tx_datastore/datastore.rs | 7 +- .../src/locking_tx_datastore/delete_table.rs | 23 +++++- .../src/locking_tx_datastore/tx_state.rs | 44 ++++++++++- crates/lib/src/db/auth.rs | 6 +- crates/schema/Cargo.toml | 5 +- crates/schema/src/def.rs | 35 +++++++++ crates/schema/src/schema.rs | 59 +++++++++++++++ crates/table/src/blob_store.rs | 11 +++ crates/table/src/pages.rs | 7 +- crates/table/src/pointer_map.rs | 7 ++ crates/table/src/table.rs | 26 ++++--- .../src/table_index/unique_direct_index.rs | 10 ++- 14 files changed, 256 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2d20cd009f0..c670be4ce33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8080,6 +8080,7 @@ dependencies = [ "smallvec", "spacetimedb-data-structures", "spacetimedb-lib 1.11.0", + "spacetimedb-memory-usage", "spacetimedb-primitives 1.11.0", "spacetimedb-sats 1.11.0", "spacetimedb-sql-parser", diff --git a/crates/datastore/src/locking_tx_datastore/committed_state.rs b/crates/datastore/src/locking_tx_datastore/committed_state.rs index 370135a100f..5286b9a9fe6 100644 --- a/crates/datastore/src/locking_tx_datastore/committed_state.rs +++ b/crates/datastore/src/locking_tx_datastore/committed_state.rs @@ -1,6 +1,5 @@ use super::{ datastore::Result, - delete_table::DeleteTable, sequence::{Sequence, SequencesState}, state_view::StateView, tx_state::{IndexIdMap, PendingSchemaChange, TxState}, @@ -33,7 +32,7 @@ use crate::{ }, }; use anyhow::anyhow; -use core::{convert::Infallible, ops::RangeBounds}; +use core::{convert::Infallible, mem, ops::RangeBounds}; use spacetimedb_data_structures::map::{HashSet, IntMap, IntSet}; use spacetimedb_durability::TxOffset; use spacetimedb_lib::{db::auth::StTableType, Identity}; @@ -50,9 +49,7 @@ use spacetimedb_table::{ page_pool::PagePool, table::{IndexScanPointIter, IndexScanRangeIter, InsertError, 
RowRef, Table, TableAndIndex, TableScanIter},
 };
-use std::collections::BTreeMap;
 use std::sync::Arc;
-use thin_vec::ThinVec;
 
 /// Contains the live, in-memory snapshot of a database. This structure
 /// is exposed in order to support tools wanting to process the commit
@@ -90,6 +87,12 @@ pub struct CommittedState {
     /// - system tables: `st_view_sub`, `st_view_arg`
     /// - Tables which back views.
     pub(super) ephemeral_tables: EphemeralTables,
+
+    /// After a [`CommittedState::merge`],
+    /// the merged [`TxState`] is stored here,
+    /// until its allocations are needed for the next mutable transaction.
+    /// This is a performance optimization.
+    saved_tx_state: Option>,
 }
 
 impl CommittedState {
@@ -119,6 +122,7 @@ impl MemoryUsage for CommittedState {
             table_dropped,
             read_sets,
             ephemeral_tables,
+            saved_tx_state,
         } = self;
         // NOTE(centril): We do not want to include the heap usage of `page_pool` as it's a shared resource.
         next_tx_offset.heap_usage()
@@ -128,6 +132,7 @@
             + table_dropped.heap_usage()
             + read_sets.heap_usage()
             + ephemeral_tables.heap_usage()
+            + saved_tx_state.heap_usage()
     }
 }
 
@@ -198,6 +203,7 @@ impl CommittedState {
             read_sets: <_>::default(),
             page_pool,
             ephemeral_tables: <_>::default(),
+            saved_tx_state: <_>::default(),
         }
     }
 
@@ -717,26 +723,25 @@ impl CommittedState {
         self.read_sets.remove_view(view_id, sender)
     }
 
-    pub(super) fn merge(&mut self, tx_state: TxState, read_sets: ViewReadSets, ctx: &ExecutionContext) -> TxData {
+    /// Returns the saved tx state or creates a new one.
+    /// The tx state is ready for use by a new mutable transaction.
+    pub(super) fn take_tx_state(&mut self) -> TxState {
+        *self.saved_tx_state.take().unwrap_or_default()
+    }
+
+    pub(super) fn merge(&mut self, mut tx_state: TxState, read_sets: ViewReadSets, ctx: &ExecutionContext) -> TxData {
         let mut tx_data = TxData::default();
         let mut truncates = IntSet::default();
 
         // First, apply deletes. This will free up space in the committed tables.
-        self.merge_apply_deletes(
-            &mut tx_data,
-            tx_state.delete_tables,
-            tx_state.pending_schema_changes,
-            &mut truncates,
-        );
+        self.merge_apply_deletes(&mut tx_data, &mut tx_state, &mut truncates);
 
         // Then, apply inserts. This will re-fill the holes freed by deletions
         // before allocating new pages.
-        self.merge_apply_inserts(
-            &mut tx_data,
-            tx_state.insert_tables,
-            tx_state.blob_store,
-            &mut truncates,
-        );
+        self.merge_apply_inserts(&mut tx_data, &mut tx_state, &mut truncates);
+
+        // We're done with `tx_state`. Save it for reuse.
+        self.saved_tx_state = Some(Box::new(tx_state));
 
         // Record any truncated tables in the `TxData`.
         tx_data.add_truncates(truncates);
@@ -763,13 +768,7 @@
         self.read_sets.merge(read_sets)
     }
 
-    fn merge_apply_deletes(
-        &mut self,
-        tx_data: &mut TxData,
-        delete_tables: BTreeMap,
-        pending_schema_changes: ThinVec,
-        truncates: &mut IntSet,
-    ) {
+    fn merge_apply_deletes(&mut self, tx_data: &mut TxData, tx_state: &mut TxState, truncates: &mut IntSet) {
         fn delete_rows(
             tx_data: &mut TxData,
             table_id: TableId,
@@ -805,7 +804,7 @@
             }
         }
 
-        for (table_id, row_ptrs) in delete_tables {
+        for (&table_id, row_ptrs) in &mut tx_state.delete_tables {
             match self.get_table_and_blob_store_mut(table_id) {
                 Ok((table, blob_store, ..)) => delete_rows(
                     tx_data,
@@ -819,12 +818,14 @@
                 Err(_) if !row_ptrs.is_empty() => panic!("Deletion for non-existent table {table_id:?}... huh?"),
                 Err(_) => {}
             }
+
+            row_ptrs.clear();
         }
 
         // Delete all tables marked for deletion.
// The order here does not matter as once a `table_id` has been dropped
         // it will never be re-created.
-        for change in pending_schema_changes {
+        for change in mem::take(&mut tx_state.pending_schema_changes) {
             if let PendingSchemaChange::TableRemoved(table_id, mut table) = change {
                 let row_ptrs = table.scan_all_row_ptrs();
                 truncates.insert(table_id);
@@ -841,13 +842,7 @@
             }
         }
 
-    fn merge_apply_inserts(
-        &mut self,
-        tx_data: &mut TxData,
-        insert_tables: BTreeMap,
-        tx_blob_store: impl BlobStore,
-        truncates: &mut IntSet,
-    ) {
+    fn merge_apply_inserts(&mut self, tx_data: &mut TxData, tx_state: &mut TxState, truncates: &mut IntSet) {
         // TODO(perf): Consider moving whole pages from the `insert_tables` into the committed state,
         // rather than copying individual rows out of them.
         // This will require some magic to get the indexes right,
@@ -856,13 +851,17 @@
         // based on the available holes in the committed state
         // and the fullness of the page.
 
-        for (table_id, tx_table) in insert_tables {
+        for (&table_id, tx_table) in &mut tx_state.insert_tables {
+            // TODO(perf, centril): Optimize the case where there is no commit table.
+            // In that case, all we need to do is transfer the tx table over
+            // and construct `inserts`.
+
             let (commit_table, commit_blob_store, page_pool) =
                 self.get_table_and_blob_store_or_create(table_id, tx_table.get_schema());
 
             // For each newly-inserted row, insert it into the committed state.
             let mut inserts = Vec::with_capacity(tx_table.row_count as usize);
-            for row_ref in tx_table.scan_rows(&tx_blob_store) {
+            for row_ref in tx_table.scan_rows(&tx_state.blob_store) {
                 let pv = row_ref.to_product_value();
                 commit_table
                     .insert(page_pool, commit_blob_store, &pv)
@@ -882,7 +881,8 @@
                 }
             }
 
-            let (schema, _indexes, pages) = tx_table.consume_for_merge();
+            // Clean up `tx_table` and steal its pages.
+            let (schema, pages) = tx_table.drain_for_merge();
 
             // The schema may have been modified in the transaction.
             // Update this last to placate borrowck and avoid a clone.
@@ -892,6 +892,8 @@
             // Put all the pages in the table back into the pool.
             self.page_pool.put_many(pages);
         }
+
+        tx_state.blob_store.clear();
     }
 
     /// Rolls back the changes immediately made to the committed state during a transaction.
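The shape of this stash-and-reuse cycle is easier to see outside the diff. Below is a minimal, self-contained sketch under simplified assumptions: the toy `TxState` and `CommittedState` here model only the parts relevant to allocation reuse, not the real datastore types:

    /// Toy stand-in for the real `TxState`: what matters is that its
    /// containers keep their capacity across `clear()`.
    #[derive(Default)]
    struct TxState {
        inserts: Vec<u64>,
        deletes: Vec<u64>,
    }

    #[derive(Default)]
    struct CommittedState {
        rows: Vec<u64>,
        /// The merged tx state, stashed between transactions.
        saved_tx_state: Option<Box<TxState>>,
    }

    impl CommittedState {
        /// Returns the saved tx state, or a fresh one on first use.
        fn take_tx_state(&mut self) -> TxState {
            *self.saved_tx_state.take().unwrap_or_default()
        }

        /// Applies `tx_state`, then stashes it with its contents cleared
        /// but its allocations retained, ready for the next transaction.
        fn merge(&mut self, mut tx_state: TxState) {
            self.rows.retain(|row| !tx_state.deletes.contains(row));
            self.rows.append(&mut tx_state.inserts); // leaves `inserts` empty
            tx_state.deletes.clear(); // keeps capacity
            self.saved_tx_state = Some(Box::new(tx_state));
        }
    }

    fn main() {
        let mut committed = CommittedState::default();
        let mut tx = committed.take_tx_state(); // fresh on first use
        tx.inserts.push(42);
        committed.merge(tx);

        let tx2 = committed.take_tx_state(); // reuses the stashed allocations
        assert!(tx2.inserts.is_empty() && tx2.deletes.is_empty());
        assert_eq!(committed.rows, vec![42]);
    }
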
diff --git a/crates/datastore/src/locking_tx_datastore/datastore.rs b/crates/datastore/src/locking_tx_datastore/datastore.rs index 3f694ad5b3f..7ec9a05d604 100644 --- a/crates/datastore/src/locking_tx_datastore/datastore.rs +++ b/crates/datastore/src/locking_tx_datastore/datastore.rs @@ -1,6 +1,5 @@ use super::{ committed_state::CommittedState, mut_tx::MutTxId, sequence::SequencesState, state_view::StateView, tx::TxId, - tx_state::TxState, }; use crate::{ db_metrics::DB_METRICS, @@ -917,14 +916,16 @@ impl MutTx for Locking { let ctx = ExecutionContext::with_workload(self.database_identity, workload); let timer = Instant::now(); - let committed_state_write_lock = self.committed_state.write_arc(); + let mut committed_state_write_lock = self.committed_state.write_arc(); let sequence_state_lock = self.sequence_state.lock_arc(); let lock_wait_time = timer.elapsed(); + let tx_state = committed_state_write_lock.take_tx_state(); + MutTxId { committed_state_write_lock, sequence_state_lock, - tx_state: TxState::default(), + tx_state, lock_wait_time, read_sets: <_>::default(), timer, diff --git a/crates/datastore/src/locking_tx_datastore/delete_table.rs b/crates/datastore/src/locking_tx_datastore/delete_table.rs index 517e525afbc..a048c95f3d8 100644 --- a/crates/datastore/src/locking_tx_datastore/delete_table.rs +++ b/crates/datastore/src/locking_tx_datastore/delete_table.rs @@ -1,4 +1,4 @@ -use spacetimedb_sats::layout::Size; +use spacetimedb_sats::{layout::Size, memory_usage::MemoryUsage}; use spacetimedb_table::{ fixed_bit_set::FixedBitSet, indexes::{max_rows_in_page, PageIndex, PageOffset, RowPointer, SquashedOffset}, @@ -16,6 +16,17 @@ pub struct DeleteTable { fixed_row_size: Size, } +impl MemoryUsage for DeleteTable { + fn heap_usage(&self) -> usize { + let Self { + deleted, + len, + fixed_row_size, + } = self; + deleted.heap_usage() + len.heap_usage() + fixed_row_size.heap_usage() + } +} + impl DeleteTable { /// Returns a new deletion table where the rows have `fixed_row_size`. /// @@ -126,6 +137,16 @@ impl DeleteTable { pub fn is_empty(&self) -> bool { self.len == 0 } + + /// Clears this deletion table, + /// enabling it for reuse for the same `fixed_row_size`. 
+ pub fn clear(&mut self) { + self.len = 0; + + for set in self.deleted.iter_mut().filter_map(|set| set.as_mut()) { + set.clear(); + } + } } #[cfg(test)] diff --git a/crates/datastore/src/locking_tx_datastore/tx_state.rs b/crates/datastore/src/locking_tx_datastore/tx_state.rs index d67e19fcc06..fa64c7f2c89 100644 --- a/crates/datastore/src/locking_tx_datastore/tx_state.rs +++ b/crates/datastore/src/locking_tx_datastore/tx_state.rs @@ -3,7 +3,7 @@ use core::ops::RangeBounds; use spacetimedb_data_structures::map::IntMap; use spacetimedb_lib::db::auth::StAccess; use spacetimedb_primitives::{ColList, ConstraintId, IndexId, SequenceId, TableId}; -use spacetimedb_sats::AlgebraicValue; +use spacetimedb_sats::{memory_usage::MemoryUsage, AlgebraicValue}; use spacetimedb_schema::schema::{ColumnSchema, ConstraintSchema, IndexSchema, SequenceSchema}; use spacetimedb_table::{ blob_store::{BlobStore, HashMapBlobStore}, @@ -77,6 +77,23 @@ pub(super) struct TxState { pub(super) pending_schema_changes: ThinVec, } +static_assert_size!(TxState, 88); + +impl MemoryUsage for TxState { + fn heap_usage(&self) -> usize { + let Self { + insert_tables, + delete_tables, + blob_store, + pending_schema_changes, + } = self; + insert_tables.heap_usage() + + delete_tables.heap_usage() + + blob_store.heap_usage() + + pending_schema_changes.heap_usage() + } +} + /// A pending schema change is a change to a `TableSchema` /// that has been applied immediately to the [`CommittedState`](super::committed_state::CommittedState) /// and which need to be reverted if the transaction fails. @@ -117,7 +134,30 @@ pub enum PendingSchemaChange { SequenceAdded(TableId, SequenceId), } -static_assert_size!(TxState, 88); +impl MemoryUsage for PendingSchemaChange { + fn heap_usage(&self) -> usize { + match self { + Self::IndexRemoved(table_id, index_id, table_index, index_schema) => { + table_id.heap_usage() + index_id.heap_usage() + table_index.heap_usage() + index_schema.heap_usage() + } + Self::IndexAdded(table_id, index_id, pointer_map) => { + table_id.heap_usage() + index_id.heap_usage() + pointer_map.heap_usage() + } + Self::TableRemoved(table_id, table) => table_id.heap_usage() + table.heap_usage(), + Self::TableAdded(table_id) => table_id.heap_usage(), + Self::TableAlterAccess(table_id, st_access) => table_id.heap_usage() + st_access.heap_usage(), + Self::TableAlterRowType(table_id, column_schemas) => table_id.heap_usage() + column_schemas.heap_usage(), + Self::ConstraintRemoved(table_id, constraint_schema) => { + table_id.heap_usage() + constraint_schema.heap_usage() + } + Self::ConstraintAdded(table_id, constraint_id) => table_id.heap_usage() + constraint_id.heap_usage(), + Self::SequenceRemoved(table_id, sequence, sequence_schema) => { + table_id.heap_usage() + sequence.heap_usage() + sequence_schema.heap_usage() + } + Self::SequenceAdded(table_id, sequence_id) => table_id.heap_usage() + sequence_id.heap_usage(), + } + } +} impl TxState { /// Returns the row count in insert tables diff --git a/crates/lib/src/db/auth.rs b/crates/lib/src/db/auth.rs index ae2810d49d1..50000314226 100644 --- a/crates/lib/src/db/auth.rs +++ b/crates/lib/src/db/auth.rs @@ -1,6 +1,5 @@ -use spacetimedb_sats::{impl_deserialize, impl_serialize, impl_st, AlgebraicType}; - use crate::de::Error; +use spacetimedb_sats::{impl_deserialize, impl_serialize, impl_st, AlgebraicType}; /// Describe the visibility of the table #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -11,6 +10,9 @@ pub enum StAccess { Private, } +#[cfg(feature = 
"memory-usage")] +impl spacetimedb_memory_usage::MemoryUsage for StAccess {} + impl StAccess { pub fn as_str(&self) -> &'static str { match self { diff --git a/crates/schema/Cargo.toml b/crates/schema/Cargo.toml index 08d9be2c448..c49482f70b5 100644 --- a/crates/schema/Cargo.toml +++ b/crates/schema/Cargo.toml @@ -11,9 +11,10 @@ test = [] [dependencies] spacetimedb-lib = { workspace = true, features = ["enum-map"] } -spacetimedb-primitives.workspace = true -spacetimedb-sats.workspace = true +spacetimedb-primitives = { workspace = true, features = ["memory-usage"] } +spacetimedb-sats = { workspace = true, features = ["memory-usage"] } spacetimedb-data-structures.workspace = true +spacetimedb-memory-usage.workspace = true spacetimedb-sql-parser.workspace = true anyhow.workspace = true diff --git a/crates/schema/src/def.rs b/crates/schema/src/def.rs index 2c0a97d9ef4..e26717fbf71 100644 --- a/crates/schema/src/def.rs +++ b/crates/schema/src/def.rs @@ -654,6 +654,15 @@ pub enum IndexAlgorithm { Direct(DirectAlgorithm), } +impl spacetimedb_memory_usage::MemoryUsage for IndexAlgorithm { + fn heap_usage(&self) -> usize { + match self { + Self::BTree(a) => a.heap_usage(), + Self::Direct(a) => a.heap_usage(), + } + } +} + impl IndexAlgorithm { /// Get the columns of the index. pub fn columns(&self) -> ColOrCols<'_> { @@ -686,6 +695,12 @@ pub struct BTreeAlgorithm { pub columns: ColList, } +impl spacetimedb_memory_usage::MemoryUsage for BTreeAlgorithm { + fn heap_usage(&self) -> usize { + self.columns.heap_usage() + } +} + impl> From for BTreeAlgorithm { fn from(columns: CL) -> Self { let columns = columns.into(); @@ -706,6 +721,12 @@ pub struct DirectAlgorithm { pub column: ColId, } +impl spacetimedb_memory_usage::MemoryUsage for DirectAlgorithm { + fn heap_usage(&self) -> usize { + self.column.heap_usage() + } +} + impl> From for DirectAlgorithm { fn from(column: C) -> Self { let column = column.into(); @@ -859,6 +880,14 @@ pub enum ConstraintData { Unique(UniqueConstraintData), } +impl spacetimedb_memory_usage::MemoryUsage for ConstraintData { + fn heap_usage(&self) -> usize { + match self { + ConstraintData::Unique(d) => d.heap_usage(), + } + } +} + impl ConstraintData { /// If this is a unique constraint, returns the columns that must be unique. /// Otherwise, returns `None`. 
@@ -886,6 +915,12 @@ pub struct UniqueConstraintData { pub columns: ColSet, } +impl spacetimedb_memory_usage::MemoryUsage for UniqueConstraintData { + fn heap_usage(&self) -> usize { + self.columns.heap_usage() + } +} + impl From for RawUniqueConstraintDataV9 { fn from(val: UniqueConstraintData) -> Self { RawUniqueConstraintDataV9 { diff --git a/crates/schema/src/schema.rs b/crates/schema/src/schema.rs index 51826292454..39306bee282 100644 --- a/crates/schema/src/schema.rs +++ b/crates/schema/src/schema.rs @@ -1068,6 +1068,18 @@ pub struct ColumnSchema { pub col_type: AlgebraicType, } +impl spacetimedb_memory_usage::MemoryUsage for ColumnSchema { + fn heap_usage(&self) -> usize { + let Self { + table_id, + col_pos, + col_name, + col_type, + } = self; + table_id.heap_usage() + col_pos.heap_usage() + col_name.heap_usage() + col_type.heap_usage() + } +} + impl ColumnSchema { pub fn for_test(pos: impl Into, name: impl Into>, ty: AlgebraicType) -> Self { Self { @@ -1182,6 +1194,29 @@ pub struct SequenceSchema { pub max_value: i128, } +impl spacetimedb_memory_usage::MemoryUsage for SequenceSchema { + fn heap_usage(&self) -> usize { + let Self { + sequence_id, + sequence_name, + table_id, + col_pos, + increment, + start, + min_value, + max_value, + } = self; + sequence_id.heap_usage() + + sequence_name.heap_usage() + + table_id.heap_usage() + + col_pos.heap_usage() + + increment.heap_usage() + + start.heap_usage() + + min_value.heap_usage() + + max_value.heap_usage() + } +} + impl Schema for SequenceSchema { type Def = SequenceDef; type Id = SequenceId; @@ -1296,6 +1331,18 @@ pub struct IndexSchema { pub index_algorithm: IndexAlgorithm, } +impl spacetimedb_memory_usage::MemoryUsage for IndexSchema { + fn heap_usage(&self) -> usize { + let Self { + index_id, + table_id, + index_name, + index_algorithm, + } = self; + index_id.heap_usage() + table_id.heap_usage() + index_name.heap_usage() + index_algorithm.heap_usage() + } +} + impl IndexSchema { pub fn for_test(name: impl Into>, algo: impl Into) -> Self { Self { @@ -1347,6 +1394,18 @@ pub struct ConstraintSchema { pub data: ConstraintData, // this reuses the type from Def, which is fine, neither of `schema` nor `def` are ABI modules. } +impl spacetimedb_memory_usage::MemoryUsage for ConstraintSchema { + fn heap_usage(&self) -> usize { + let Self { + table_id, + constraint_id, + constraint_name, + data, + } = self; + table_id.heap_usage() + constraint_id.heap_usage() + constraint_name.heap_usage() + data.heap_usage() + } +} + impl ConstraintSchema { pub fn unique_for_test(name: impl Into>, cols: impl Into) -> Self { Self { diff --git a/crates/table/src/blob_store.rs b/crates/table/src/blob_store.rs index 6155d884b5d..85b06e91ec5 100644 --- a/crates/table/src/blob_store.rs +++ b/crates/table/src/blob_store.rs @@ -94,6 +94,11 @@ pub trait BlobStore: Sync { /// but rather just decrement a reference count. fn free_blob(&mut self, hash: &BlobHash) -> Result<(), NoSuchBlobError>; + /// Clear the blob store, leaving it with no blobs. + /// + /// TODO(perf,centril): this will likely become a draining method in the future. + fn clear(&mut self); + /// Iterate over all blobs present in the blob store. 
/// /// Each element is a tuple `(hash, uses, data)`, @@ -152,6 +157,8 @@ impl BlobStore for NullBlobStore { unimplemented!("NullBlobStore doesn't do anything") } + fn clear(&mut self) {} + fn iter_blobs(&self) -> BlobsIter<'_> { unimplemented!("NullBlobStore doesn't do anything") } @@ -228,6 +235,10 @@ impl BlobStore for HashMapBlobStore { Ok(()) } + fn clear(&mut self) { + self.map.clear(); + } + fn iter_blobs(&self) -> BlobsIter<'_> { Box::new(self.map.iter().map(|(hash, obj)| (hash, obj.uses, &obj.blob[..]))) } diff --git a/crates/table/src/pages.rs b/crates/table/src/pages.rs index ede2da47850..ef1d9c1bb73 100644 --- a/crates/table/src/pages.rs +++ b/crates/table/src/pages.rs @@ -363,9 +363,10 @@ impl Pages { self.pages = pages; } - /// Consumes the page manager, returning all the pages it held. - pub fn into_page_iter(self) -> impl Iterator> { - self.pages.into_iter() + /// Drains the pages of the page manager, returning all the pages it held. + pub fn drain(&mut self) -> impl Iterator> + use<'_> { + self.non_full_pages.clear(); + self.pages.drain(..) } } diff --git a/crates/table/src/pointer_map.rs b/crates/table/src/pointer_map.rs index ea5523b8813..1a9f5f4c497 100644 --- a/crates/table/src/pointer_map.rs +++ b/crates/table/src/pointer_map.rs @@ -313,6 +313,13 @@ impl PointerMap { ret } + + /// Removes all the row pointers from the map. + pub fn clear(&mut self) { + self.map.clear(); + self.colliders.clear(); + self.emptied_collider_slots.clear(); + } } impl FromIterator<(RowHash, RowPointer)> for PointerMap { diff --git a/crates/table/src/table.rs b/crates/table/src/table.rs index 05572c29773..9faac008428 100644 --- a/crates/table/src/table.rs +++ b/crates/table/src/table.rs @@ -1562,15 +1562,23 @@ impl Table { self.pointer_map = Some(self.rebuild_pointer_map(blob_store)); } - /// Consumes the table, returning some constituents needed for merge. - pub fn consume_for_merge( - self, - ) -> ( - Arc, - impl Iterator, - impl Iterator>, - ) { - (self.schema, self.indexes.into_iter(), self.inner.pages.into_page_iter()) + /// Steals the pages of the table for merging. + pub fn drain_for_merge(&mut self) -> (Arc, impl Iterator> + use<'_>) { + // Reset statistics. + self.blob_store_bytes = BlobNumBytes::default(); + self.row_count = 0; + + // Clear indices. + for index in self.indexes.values_mut() { + index.clear(); + } + + // Clear pointer map. + if let Some(pm) = &mut self.pointer_map { + pm.clear(); + } + + (self.schema.clone(), self.inner.pages.drain()) } /// Returns the number of rows resident in this table. diff --git a/crates/table/src/table_index/unique_direct_index.rs b/crates/table/src/table_index/unique_direct_index.rs index 53b8805dcc0..07ea26b3f91 100644 --- a/crates/table/src/table_index/unique_direct_index.rs +++ b/crates/table/src/table_index/unique_direct_index.rs @@ -90,6 +90,11 @@ impl InnerIndex { // SAFETY: `self.inner.len() = KEYS_PER_INNER` and `key.0 < KEYS_PER_INNER`. unsafe { self.inner.get_unchecked_mut(key.0) } } + + /// Clears the inner index. + fn clear(&mut self) { + self.inner.fill(NONE_PTR); + } } impl UniqueDirectIndex { @@ -201,8 +206,11 @@ impl UniqueDirectIndex { /// Deletes all entries from the index, leaving it empty. /// This will not deallocate the outer index. pub fn clear(&mut self) { - self.outer.clear(); self.len = 0; + self.outer + .iter_mut() + .filter_map(|i| i.as_mut()) + .for_each(InnerIndex::clear); } /// Returns whether `other` can be merged into `self`
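A closing note on this last hunk: the rewritten `clear` intentionally stops calling `self.outer.clear()`. Instead it resets each allocated inner block in place, so both the outer level and every inner allocation survive for reuse by the next transaction, in line with the rest of this patch. A minimal sketch with simplified stand-ins for `UniqueDirectIndex` and `InnerIndex` (constants and types here are illustrative, not the real layout):

    const KEYS_PER_INNER: usize = 8;
    const NONE_PTR: u64 = u64::MAX;

    struct InnerIndex {
        inner: [u64; KEYS_PER_INNER],
    }

    impl InnerIndex {
        /// Resets every slot without deallocating the block.
        fn clear(&mut self) {
            self.inner.fill(NONE_PTR);
        }
    }

    struct UniqueDirectIndex {
        outer: Vec<Option<Box<InnerIndex>>>,
        len: usize,
    }

    impl UniqueDirectIndex {
        /// Empties the index while keeping the outer level
        /// and every allocated inner block.
        fn clear(&mut self) {
            self.len = 0;
            self.outer
                .iter_mut()
                .filter_map(|inner| inner.as_mut())
                .for_each(|inner| inner.clear());
        }
    }

    fn main() {
        let mut index = UniqueDirectIndex {
            outer: vec![None, Some(Box::new(InnerIndex { inner: [3; KEYS_PER_INNER] }))],
            len: KEYS_PER_INNER,
        };
        index.clear();
        assert_eq!(index.len, 0);
        // The inner block is still allocated, just reset.
        let block = index.outer[1].as_ref().unwrap();
        assert!(block.inner.iter().all(|&ptr| ptr == NONE_PTR));
    }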