Skip to content

Commit 1e8d9a6

Browse files
authored
disk poc delete vector (#843)
* First step * comment * Restore vectors * Working serialization * Load vectors from disk * load from vector * Add serializer script * Change the gitignore * mark delete * Remove old tests * add mark as deleted * pr changes * fix the bm * Fixed the replace entry point * change order * Remove prints * added delete vectors and unit tests * Added bm and tests * Removed the freeidvector * Add repair updates * remove comments * change to if continue * get cached vector data * remove cache for now * back to origin * Fix the stale * choose candidate by distance * Add the tests * reduce call for staged nodes * Change to map for better performance * PR suggestions * improvements * PR changes * Added to ci * Added download to ci * Added track delete time to bm * Added the bm filename * Change the name of file * added no gt bm * PR fix * Add more validations * More pr changes * Added more benchmarks * Change to l2 * PR
1 parent 8cc034c commit 1e8d9a6

File tree

9 files changed

+1804
-106
lines changed

9 files changed

+1804
-106
lines changed

.github/workflows/benchmark.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ on:
4141
- bm-batch-iter-uint8-single
4242
- bm-batch-iter-uint8-multi
4343
- bm-updated-fp32-single
44+
- bm-hnsw-disk-fp32-single
4445
- bm-spaces
4546
description: 'Benchmarks set to run'
4647
default: benchmarks-all

src/VecSim/algorithms/hnsw/hnsw_disk.h

Lines changed: 500 additions & 92 deletions
Large diffs are not rendered by default.

src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h

Lines changed: 20 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -58,8 +58,10 @@ HNSWDiskIndex<DataType, DistType>::HNSWDiskIndex(
5858
indexDataGuard(), visitedNodesHandlerPool(INITIAL_CAPACITY, this->allocator),
5959
delta_list(), new_elements_meta_data(this->allocator), batchThreshold(0), // Will be restored from file
6060
pendingVectorIds(this->allocator), pendingMetadata(this->allocator),
61-
pendingVectorCount(0), stagedGraphUpdates(this->allocator),
62-
stagedNeighborUpdates(this->allocator) {
61+
pendingVectorCount(0), pendingDeleteIds(this->allocator),
62+
stagedInsertUpdates(this->allocator),
63+
stagedDeleteUpdates(this->allocator), stagedRepairUpdates(this->allocator),
64+
stagedInsertNeighborUpdates(this->allocator) {
6365

6466
// Restore index fields from file (including batchThreshold)
6567
this->restoreIndexFields(input);
@@ -264,18 +266,27 @@ void HNSWDiskIndex<DataType, DistType>::saveIndexIMP(std::ofstream &output) {
264266
if (!pendingVectorIds.empty()) {
265267
throw std::runtime_error("Serialization error: pendingVectorIds not empty after flush");
266268
}
267-
if (!stagedGraphUpdates.empty()) {
268-
throw std::runtime_error("Serialization error: stagedGraphUpdates not empty after flush");
269+
if (!stagedInsertUpdates.empty()) {
270+
throw std::runtime_error("Serialization error: stagedInsertUpdates not empty after flush");
269271
}
270-
if (!stagedNeighborUpdates.empty()) {
271-
throw std::runtime_error("Serialization error: stagedNeighborUpdates not empty after flush");
272+
if (!stagedDeleteUpdates.empty()) {
273+
throw std::runtime_error("Serialization error: stagedDeleteUpdates not empty after flush");
274+
}
275+
if (!stagedInsertNeighborUpdates.empty()) {
276+
throw std::runtime_error("Serialization error: stagedInsertNeighborUpdates not empty after flush");
272277
}
273278
if (!rawVectorsInRAM.empty()) {
274279
throw std::runtime_error("Serialization error: rawVectorsInRAM not empty after flush");
275280
}
276281
if (pendingVectorCount != 0) {
277282
throw std::runtime_error("Serialization error: pendingVectorCount not zero after flush");
278283
}
284+
if (!stagedRepairUpdates.empty()) {
285+
throw std::runtime_error("Serialization error: stagedRepairUpdates not empty after flush");
286+
}
287+
if (pendingDeleteIds.size() != 0) {
288+
throw std::runtime_error("Serialization error: pendingDeleteIds not empty after flush");
289+
}
279290
// Note: delta_list and new_elements_meta_data are currently unused legacy variables
280291
// but we verify them for future-proofing
281292
if (!delta_list.empty()) {
@@ -692,8 +703,9 @@ void HNSWDiskIndex<DataType, DistType>::restoreGraph(std::ifstream &input,
692703
this->pendingVectorIds.clear();
693704
this->pendingMetadata.clear();
694705
this->pendingVectorCount = 0;
695-
this->stagedGraphUpdates.clear();
696-
this->stagedNeighborUpdates.clear();
706+
this->stagedInsertUpdates.clear();
707+
this->stagedDeleteUpdates.clear();
708+
this->stagedInsertNeighborUpdates.clear();
697709

698710
// Resize visited nodes handler pool
699711
this->visitedNodesHandlerPool.resize(this->curElementCount);

0 commit comments

Comments
 (0)