From e4c04960c6bf58937ea2c1ee34a59a22011e7c87 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Sat, 15 Nov 2025 23:03:00 +0200 Subject: [PATCH 01/20] use raw vector if alredy fetched --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 168 ++++++++++-------- .../components/components_factory.h | 4 +- src/VecSim/spaces/computer/calculator.h | 27 ++- src/VecSim/vec_sim_index.h | 4 + tests/unit/test_components.cpp | 3 + tests/unit/test_hnsw_disk.cpp | 35 ++-- 6 files changed, 146 insertions(+), 95 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 2f4329c6a..97c9929a4 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -161,7 +161,6 @@ class HNSWDiskIndex : public VecSimIndexAbstract { mutable VisitedNodesHandlerPool visitedNodesHandlerPool; // Global batch operation state - mutable std::unordered_map> delta_list; mutable vecsim_stl::vector new_elements_meta_data; // Batch processing state @@ -200,9 +199,8 @@ class HNSWDiskIndex : public VecSimIndexAbstract { // Temporary storage for raw vectors in RAM (until flush batch) // Maps idType -> raw vector data (stored as string for simplicity) std::unordered_map rawVectorsInRAM; - - // Cache for raw vectors retrieved from disk (mutable to allow caching in const methods) - mutable std::unordered_map rawVectorsDiskCache; + // Cache for raw vectors retrieved from disk (to avoid redundant reads) + std::unordered_map rawVectorsCache; protected: HNSWDiskIndex() = delete; // default constructor is disabled. 
@@ -231,7 +229,7 @@ class HNSWDiskIndex : public VecSimIndexAbstract { public: // Core vector addition methods void insertElementToGraph(idType element_id, size_t element_max_level, idType entry_point, - size_t global_max_level, const void *vector_data); + size_t global_max_level, const void *raw_vector_data, const void *vector_data); idType mutuallyConnectNewElement(idType new_node_id, candidatesMaxHeap &top_candidates, size_t level); // Batch processing methods @@ -240,6 +238,7 @@ class HNSWDiskIndex : public VecSimIndexAbstract { // Helper methods void getNeighbors(idType nodeId, size_t level, vecsim_stl::vector& result) const; + void getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, const void** vector_data) const; void searchPendingVectors(const void* query_data, candidatesLabelsMaxHeap& top_candidates, size_t k) const; // Manual control of staged updates @@ -253,10 +252,7 @@ class HNSWDiskIndex : public VecSimIndexAbstract { // New method for handling neighbor connection updates when neighbor lists are full void stageRevisitNeighborConnections(idType new_node_id, idType selected_neighbor, size_t level, DistType distance); - - // void patchDeltaList(std::unordered_map> &delta_list, - // vecsim_stl::vector &new_elements_meta_data, - // std::unordered_map &new_ids_mapping); + public: // Methods needed by benchmark framework @@ -269,12 +265,12 @@ class HNSWDiskIndex : public VecSimIndexAbstract { candidatesLabelsMaxHeap* getNewMaxPriorityQueue() const; bool isMarkedDeleted(idType id) const; labelType getExternalLabel(idType id) const; - void processCandidate(idType candidate_id, const void* data_point, size_t level, size_t ef, + void processCandidate(idType candidate_id, const void* data_point_raw, const void* data_point, size_t level, size_t ef, void* visited_tags, size_t visited_tag, candidatesLabelsMaxHeap& top_candidates, candidatesMaxHeap& candidate_set, DistType& lowerBound) const; // Raw vector storage and retrieval methods - 
const char* getRawVector(idType id) const; + void getRawVector(idType id, void* output_buffer) const; protected: @@ -287,7 +283,7 @@ class HNSWDiskIndex : public VecSimIndexAbstract { // New hierarchical search method candidatesLabelsMaxHeap * - hierarchicalSearch(const void *data_point, idType ep_id, size_t ef, size_t k, void *timeoutCtx = nullptr, + hierarchicalSearch(const void *data_point_raw, const void *data_point, idType ep_id, size_t ef, size_t k, void *timeoutCtx = nullptr, VecSimQueryReply_Code *rc = nullptr) const; public: @@ -367,7 +363,7 @@ HNSWDiskIndex::HNSWDiskIndex( : VecSimIndexAbstract(abstractInitParams, components), idToMetaData(INITIAL_CAPACITY, this->allocator), labelToIdMap(this->allocator), db(db), cf(cf), indexDataGuard(), visitedNodesHandlerPool(INITIAL_CAPACITY, this->allocator), - delta_list(), new_elements_meta_data(this->allocator), + new_elements_meta_data(this->allocator), batchThreshold(10), pendingVectorIds(this->allocator), pendingMetadata(this->allocator), pendingVectorCount(0), stagedGraphUpdates(this->allocator), stagedNeighborUpdates(this->allocator) { @@ -404,7 +400,6 @@ HNSWDiskIndex::~HNSWDiskIndex() { pendingMetadata.clear(); // Clear delta list and new elements metadata - delta_list.clear(); new_elements_meta_data.clear(); // Clear main data structures @@ -413,7 +408,6 @@ HNSWDiskIndex::~HNSWDiskIndex() { // Ensure all memory is properly released idToMetaData.shrink_to_fit(); - labelToIdMap.clear(); // Note: db and cf are not owned by this class, so we don't delete them // Base class destructor will handle indexCalculator and preprocessors @@ -454,12 +448,12 @@ HNSWDiskIndex::topKQuery(const void *query_data, size_t k, // Step 2: Perform hierarchical search from top level down to bottom level // Use a more sophisticated search that properly traverses the HNSW hierarchy - auto *results = hierarchicalSearch(processed_query, bottom_layer_ep, std::max(query_ef, k), k, timeoutCtx, &rep->code); + auto *results = 
hierarchicalSearch(query_data, processed_query, bottom_layer_ep, std::max(query_ef, k), k, timeoutCtx, &rep->code); if (VecSim_OK == rep->code && results) { // Step 3: Also search pending batch vectors and merge results if (pendingVectorCount > 0) { - searchPendingVectors(processed_query, *results, k); + searchPendingVectors(query_data, *results, k); } rep->results.resize(results->size()); @@ -470,35 +464,22 @@ HNSWDiskIndex::topKQuery(const void *query_data, size_t k, } else { // Even if main search failed, still search pending vectors if (pendingVectorCount > 0) { - // Create a simple vector to store pending results - std::vector> pending_results; - pending_results.reserve(pendingVectorCount); - - // Search pending vectors manually - for (size_t i = 0; i < pendingVectorCount; i++) { - idType vectorId = pendingVectorIds[i]; - const void* vector_data = this->vectors->getElement(vectorId); - const DiskElementMetaData& metadata = idToMetaData[vectorId]; - labelType label = metadata.label; - DistType dist = this->calcDistance(processed_query, vector_data); - - pending_results.emplace_back(dist, label); - } - - // Sort by distance and take top k - std::sort(pending_results.begin(), pending_results.end()); - if (pending_results.size() > k) { - pending_results.resize(k); - } - - if (!pending_results.empty()) { - rep->results.resize(pending_results.size()); - for (size_t i = 0; i < pending_results.size(); i++) { - rep->results[i].score = pending_results[i].first; - rep->results[i].id = pending_results[i].second; + // Create a heap to store pending results + auto *pending_results = getNewMaxPriorityQueue(); + + // Search pending vectors using the helper method + searchPendingVectors(query_data, *pending_results, k); + + if (!pending_results->empty()) { + rep->results.resize(pending_results->size()); + for (auto result = rep->results.rbegin(); result != rep->results.rend(); result++) { + std::tie(result->score, result->id) = pending_results->top(); + pending_results->pop(); 
} rep->code = VecSim_QueryReply_OK; // Mark as successful since we found results } + + delete pending_results; } } @@ -677,7 +658,7 @@ int HNSWDiskIndex::addVector( template void HNSWDiskIndex::insertElementToGraph( idType element_id, size_t element_max_level, idType entry_point, - size_t global_max_level, const void *vector_data) { + size_t global_max_level, const void *raw_vector_data, const void *vector_data) { idType curr_element = entry_point; DistType cur_dist = std::numeric_limits::max(); @@ -784,14 +765,17 @@ void HNSWDiskIndex::flushStagedGraphUpdates() { // Write graph updates first (so they're available when processing neighbor updates) rocksdb::WriteBatch graphBatch; + // Buffer for retrieving raw vectors (in case they're on disk) + std::vector raw_vector_buffer(this->inputBlobSize); + // First, handle new node insertions for (const auto& update : stagedGraphUpdates) { auto newKey = GraphKey(update.node_id, update.level); - const void* raw_vector_data = getRawVector(update.node_id); + getRawVector(update.node_id, raw_vector_buffer.data()); // Serialize with new format: [raw_vector_data][neighbor_count][neighbor_ids...] 
- std::string graph_value = serializeGraphValue(raw_vector_data, update.neighbors); + std::string graph_value = serializeGraphValue(raw_vector_buffer.data(), update.neighbors); graphBatch.Put(cf, newKey.asSlice(), graph_value); } @@ -821,7 +805,7 @@ void HNSWDiskIndex::flushStagedGraphUpdates() { // Use a separate batch for neighbor updates rocksdb::WriteBatch neighborBatch; - // Process each node's neighbor updates + // Process each node's neighbor updates (reuse the buffer from above) for (const auto& [node_id, levelMap] : neighborUpdatesByNode) { for (const auto& [level, newNeighbors] : levelMap) { // Read existing graph value from disk @@ -849,10 +833,10 @@ void HNSWDiskIndex::flushStagedGraphUpdates() { } } - const void* raw_vector_data = getRawVector(node_id); + getRawVector(node_id, raw_vector_buffer.data()); // Serialize with new format and add to batch - std::string graph_value = serializeGraphValue(raw_vector_data, updated_neighbors); + std::string graph_value = serializeGraphValue(raw_vector_buffer.data(), updated_neighbors); neighborBatch.Put(cf, neighborKey.asSlice(), graph_value); } } @@ -949,7 +933,9 @@ idType HNSWDiskIndex::searchBottomLayerEP(const void *query_ VecSimQueryReply_Code *rc) const { if (rc) *rc = VecSim_QueryReply_OK; - auto [curr_element, max_level] = safeGetEntryPointState(); + // auto [curr_element, max_level] = safeGetEntryPointState(); + auto curr_element = entrypointNode; + auto max_level = maxLevel; if (curr_element == INVALID_ID) return curr_element; // index is empty. 
@@ -1059,7 +1045,7 @@ void HNSWDiskIndex::deserializeGraphValue( } const char* ptr = value.data(); - + // Skip raw vector data ptr += this->inputBlobSize; @@ -1103,20 +1089,21 @@ const void* HNSWDiskIndex::getDataByInternalId(idType id) co } template -const char* HNSWDiskIndex::getRawVector(idType id) const { +void HNSWDiskIndex::getRawVector(idType id, void* output_buffer) const { if (id >= curElementCount) { this->log(VecSimCommonStrings::LOG_WARNING_STRING, "WARNING: getRawVector called with invalid id %u (current count: %zu)", id, curElementCount); - return nullptr; } // First check RAM (for vectors not yet flushed) auto it = rawVectorsInRAM.find(id); if (it != rawVectorsInRAM.end()) { const char* data_ptr = it->second.data(); - return data_ptr; + std::memcpy(output_buffer, data_ptr, this->inputBlobSize); + return; + } // If not in RAM, retrieve from disk @@ -1128,21 +1115,16 @@ const char* HNSWDiskIndex::getRawVector(idType id) const { // Extract vector data const void* vector_data = getVectorFromGraphValue(level0_graph_value); if (vector_data != nullptr) { - // Cache the raw vector data - const char* data_ptr = reinterpret_cast(vector_data); - rawVectorsDiskCache[id] = std::string(data_ptr, this->inputBlobSize); - return rawVectorsDiskCache[id].data(); + // Must copy to output buffer since level0_graph_value will be destroyed + std::memcpy(output_buffer, vector_data, this->inputBlobSize); + return; } - } else if (status.IsNotFound()) { - this->log(VecSimCommonStrings::LOG_WARNING_STRING, - "WARNING: Raw vector not found in RAM or on disk for id %u", id); } else { this->log(VecSimCommonStrings::LOG_WARNING_STRING, "WARNING: Failed to retrieve raw vector for id %u: %s", id, status.ToString().c_str()); } - return nullptr; } template @@ -1305,7 +1287,7 @@ size_t HNSWDiskIndex::getRandomLevel(double reverse_size) { template -void HNSWDiskIndex::processCandidate(idType candidate_id, const void* data_point, size_t level, size_t ef, +void 
HNSWDiskIndex::processCandidate(idType candidate_id, const void* data_point_raw, const void* data_point, size_t level, size_t ef, void* visited_tags, size_t visited_tag, candidatesLabelsMaxHeap& top_candidates, candidatesMaxHeap& candidate_set, DistType& lowerBound) const { // Use a simple set-based approach for now to avoid visited nodes handler issues @@ -1320,9 +1302,14 @@ void HNSWDiskIndex::processCandidate(idType candidate_id, co visited_set->insert(candidate_id); + + // Add neighbors to candidate set for further exploration + vecsim_stl::vector neighbors(this->allocator); + const void* vector_data; + getNeighborsAndVector(candidate_id, level, neighbors, &vector_data); // Calculate distance to candidate - DistType dist = this->calcDistance(data_point, getDataByInternalId(candidate_id)); - + DistType dist = this->calcDistanceRaw(data_point_raw, vector_data); + // Add to top candidates if it's one of the best if (top_candidates.size() < ef || dist < lowerBound) { top_candidates.emplace(dist, getExternalLabel(candidate_id)); @@ -1333,9 +1320,7 @@ void HNSWDiskIndex::processCandidate(idType candidate_id, co } } - // Add neighbors to candidate set for further exploration - vecsim_stl::vector neighbors(this->allocator); - getNeighbors(candidate_id, level, neighbors); + if (!neighbors.empty()) { for (idType neighbor_id : neighbors) { @@ -1439,20 +1424,55 @@ void HNSWDiskIndex::getNeighbors(idType nodeId, size_t level } } + +template +void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, const void** vector_data) const { + // Clear the result vector first + result.clear(); + + // First check staged graph updates + for (const auto& update : stagedGraphUpdates) { + if (update.node_id == nodeId && update.level == level) { + result.reserve(update.neighbors.size()); + for (size_t i = 0; i < update.neighbors.size(); i++) { + result.push_back(update.neighbors[i]); + } + } + } + auto it = rawVectorsCache.find(nodeId); + if (it != 
rawVectorsCache.end()) { + *vector_data = it->second.data(); + } + if (!result.empty() && it != rawVectorsCache.end()) { + return; + } + // If not found in staged updates, check disk + GraphKey graphKey(nodeId, level); + + std::string graph_value; + rocksdb::Status status = db->Get(rocksdb::ReadOptions(), cf, graphKey.asSlice(), &graph_value); + + if (status.ok()) { + // Parse using new format: [vector_data][neighbor_count][neighbor_ids...] + deserializeGraphValue(graph_value, result); + *vector_data = graph_value.data(); + } +} + template void HNSWDiskIndex::searchPendingVectors(const void* query_data, candidatesLabelsMaxHeap& top_candidates, size_t k) const { for (size_t i = 0; i < pendingVectorCount; i++) { idType vectorId = pendingVectorIds[i]; // Get the vector data from memory - const void* vector_data = this->vectors->getElement(vectorId); + const void* vector_data = rawVectorsInRAM.find(vectorId)->second.data(); // Get metadata for this vector const DiskElementMetaData& metadata = idToMetaData[vectorId]; labelType label = metadata.label; // Calculate distance - DistType dist = this->calcDistance(query_data, vector_data); + DistType dist = this->calcDistanceRaw(query_data, vector_data); // Add to candidates if it's good enough if (top_candidates.size() < k) { @@ -1482,7 +1502,7 @@ void HNSWDiskIndex::processBatch() { // Get the vector data from memory const void* vector_data = this->vectors->getElement(vectorId); - + const void* raw_vector_data = rawVectorsInRAM.find(vectorId)->second.data(); // Get metadata for this vector DiskElementMetaData& metadata = idToMetaData[vectorId]; labelType label = metadata.label; @@ -1490,7 +1510,7 @@ void HNSWDiskIndex::processBatch() { // Insert into graph if not the first element if (entrypointNode != INVALID_ID) { - insertElementToGraph(vectorId, elementMaxLevel, entrypointNode, maxLevel, vector_data); + insertElementToGraph(vectorId, elementMaxLevel, entrypointNode, maxLevel, raw_vector_data, vector_data); } else { // 
First element becomes the entry point entrypointNode = vectorId; @@ -1710,7 +1730,7 @@ void HNSWDiskIndex::debugValidateGraphConnectivity() const { template candidatesLabelsMaxHeap * -HNSWDiskIndex::hierarchicalSearch(const void *data_point, idType ep_id, size_t ef, size_t k, void *timeoutCtx, +HNSWDiskIndex::hierarchicalSearch(const void *data_point_raw, const void *data_point, idType ep_id, size_t ef, size_t k, void *timeoutCtx, VecSimQueryReply_Code *rc) const { if (rc) *rc = VecSim_QueryReply_OK; @@ -1801,7 +1821,7 @@ HNSWDiskIndex::hierarchicalSearch(const void *data_point, id } // Process this candidate - processCandidate(curr_candidate_id, data_point, 0, ef, + processCandidate(curr_candidate_id, data_point_raw, data_point, 0, ef, reinterpret_cast(&visited_set), 0, *top_candidates, candidate_set, lower_bound); diff --git a/src/VecSim/index_factories/components/components_factory.h b/src/VecSim/index_factories/components/components_factory.h index 0779df266..5b4013272 100644 --- a/src/VecSim/index_factories/components/components_factory.h +++ b/src/VecSim/index_factories/components/components_factory.h @@ -59,8 +59,10 @@ CreateQuantizedIndexComponents(std::shared_ptr allocator, VecSi // Use INT8 distance function for quantized vectors spaces::dist_func_t distFunc = spaces::GetDistFunc(distance_metric, dim, &alignment); + spaces::dist_func_t rawDistFunc = + spaces::GetDistFunc(distance_metric, dim, &alignment); - auto indexCalculator = new (allocator) DistanceCalculatorCommon(allocator, distFunc); + auto indexCalculator = new (allocator) DistanceCalculatorQuantized(allocator, distFunc, rawDistFunc); // Create preprocessor container with space for 2 preprocessors (normalization + quantization) const size_t n_preprocessors = (metric == VecSimMetric_Cosine && !is_normalized) ? 
2 : 1; diff --git a/src/VecSim/spaces/computer/calculator.h b/src/VecSim/spaces/computer/calculator.h index a82293700..e14c3e9ab 100644 --- a/src/VecSim/spaces/computer/calculator.h +++ b/src/VecSim/spaces/computer/calculator.h @@ -23,6 +23,7 @@ class IndexCalculatorInterface : public VecsimBaseObject { virtual ~IndexCalculatorInterface() = default; virtual DistType calcDistance(const void *v1, const void *v2, size_t dim) const = 0; + virtual DistType calcDistanceRaw(const void *v1, const void *v2, size_t dim) const = 0; }; /** @@ -39,7 +40,7 @@ class DistanceCalculatorInterface : public IndexCalculatorInterface { DistanceCalculatorInterface(std::shared_ptr allocator, DistFuncType dist_func) : IndexCalculatorInterface(allocator), dist_func(dist_func) {} virtual DistType calcDistance(const void *v1, const void *v2, size_t dim) const = 0; - + virtual DistType calcDistanceRaw(const void *v1, const void *v2, size_t dim) const = 0; protected: DistFuncType dist_func; }; @@ -56,4 +57,28 @@ class DistanceCalculatorCommon DistType calcDistance(const void *v1, const void *v2, size_t dim) const override { return this->dist_func(v1, v2, dim); } + DistType calcDistanceRaw(const void *v1, const void *v2, size_t dim) const override { + return this->dist_func(v1, v2, dim); + } +}; + +template +class DistanceCalculatorQuantized + : public DistanceCalculatorInterface> { +protected: + spaces::dist_func_t raw_dist_func; + +public: + DistanceCalculatorQuantized(std::shared_ptr allocator, + spaces::dist_func_t quant_dist_func, spaces::dist_func_t raw_dist_func) + : DistanceCalculatorInterface>(allocator, + quant_dist_func), + raw_dist_func(raw_dist_func) {} + + DistType calcDistance(const void *v1, const void *v2, size_t dim) const override { + return this->dist_func(v1, v2, dim); + } + DistType calcDistanceRaw(const void *v1, const void *v2, size_t dim) const { + return this->raw_dist_func(v1, v2, dim); + } }; diff --git a/src/VecSim/vec_sim_index.h b/src/VecSim/vec_sim_index.h index 
de19c048b..225f420da 100644 --- a/src/VecSim/vec_sim_index.h +++ b/src/VecSim/vec_sim_index.h @@ -136,6 +136,10 @@ struct VecSimIndexAbstract : public VecSimIndexInterface { DistType calcDistance(const void *vector_data1, const void *vector_data2) const { return indexCalculator->calcDistance(vector_data1, vector_data2, this->dim); } + + DistType calcDistanceRaw(const void *vector_data1, const void *vector_data2) const { + return indexCalculator->calcDistanceRaw(vector_data1, vector_data2, this->dim); + } /** * @brief Preprocess a blob for both storage and query. diff --git a/tests/unit/test_components.cpp b/tests/unit/test_components.cpp index c7c1b0879..a6775c514 100644 --- a/tests/unit/test_components.cpp +++ b/tests/unit/test_components.cpp @@ -31,6 +31,9 @@ class DistanceCalculatorDummy : public DistanceCalculatorInterfacedist_func(7); } + virtual DistType calcDistanceRaw(const void *v1, const void *v2, size_t dim) const { + return this->dist_func(7); + } }; } // namespace dummyCalcultor diff --git a/tests/unit/test_hnsw_disk.cpp b/tests/unit/test_hnsw_disk.cpp index f2852ca11..9974cf758 100644 --- a/tests/unit/test_hnsw_disk.cpp +++ b/tests/unit/test_hnsw_disk.cpp @@ -710,14 +710,12 @@ TEST_F(HNSWDiskIndexTest, RawVectorStorageAndRetrieval) { } // Verify that vectors were stored on disk + std::vector buffer(dim); for (size_t i = 0; i < num_vectors; ++i) { - const char* retrieved_vector = index.getRawVector(i); - - // Check that we got a vector back - EXPECT_NE(retrieved_vector, nullptr) << "Retrieved vector " << i << " is nullptr"; + index.getRawVector(i, buffer.data()); // Check that the data matches (approximately, due to preprocessing) - const float* retrieved_data = reinterpret_cast(retrieved_vector); + const float* retrieved_data = reinterpret_cast(buffer.data()); for (size_t j = 0; j < dim; ++j) { EXPECT_FLOAT_EQ(retrieved_data[j], test_vectors[i][j]) << "Vector " << i << " element " << j << " mismatch"; @@ -760,10 +758,12 @@ TEST_F(HNSWDiskIndexTest, 
RawVectorRetrievalInvalidId) { HNSWDiskIndex index(¶ms, abstractInitParams, components, db.get(), default_cf); // Try to retrieve a vector with an invalid ID (index is empty) - const char* retrieved_vector = index.getRawVector(0); - - // Should return nullptr - EXPECT_EQ(retrieved_vector, nullptr) << "Retrieved vector for invalid ID should be nullptr"; + std::vector buffer(dim); + memset(buffer.data(), 0, dim * sizeof(float)); + index.getRawVector(0, buffer.data()); + for (size_t j = 0; j < dim; ++j) { + EXPECT_FLOAT_EQ(buffer[j], 0.0f) << "Invalid ID retrieval returned non-zero data"; + } std::cout << "Invalid ID retrieval test passed!" << std::endl; } @@ -810,28 +810,25 @@ TEST_F(HNSWDiskIndexTest, RawVectorMultipleRetrievals) { // Retrieve the vector multiple times const size_t num_retrievals = 5; - std::vector retrieved_vectors; + std::vector> retrieved_vectors; for (size_t i = 0; i < num_retrievals; ++i) { - const char* retrieved = index.getRawVector(0); - EXPECT_NE(retrieved, nullptr) << "Retrieval " << i << " returned nullptr"; - retrieved_vectors.push_back(retrieved); + std::vector buffer(dim); + index.getRawVector(0, buffer.data()); + retrieved_vectors.push_back(buffer); } - // Verify all retrievals point to the same data + // Verify all retrievals have the same data for (size_t i = 1; i < num_retrievals; ++i) { - const float* data0 = reinterpret_cast(retrieved_vectors[0]); - const float* datai = reinterpret_cast(retrieved_vectors[i]); for (size_t j = 0; j < dim; ++j) { - EXPECT_FLOAT_EQ(data0[j], datai[j]) + EXPECT_FLOAT_EQ(retrieved_vectors[0][j], retrieved_vectors[i][j]) << "Retrieval " << i << " element " << j << " differs from first retrieval"; } } // Verify the data matches the original - const float* retrieved_data = reinterpret_cast(retrieved_vectors[0]); for (size_t j = 0; j < dim; ++j) { - EXPECT_FLOAT_EQ(retrieved_data[j], test_vector[j]) + EXPECT_FLOAT_EQ(retrieved_vectors[0][j], test_vector[j]) << "Retrieved vector element " << j << " 
mismatch"; } From ae53e6cf3e8ef5efcb3bbfb87507d6ead41966c2 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Sun, 16 Nov 2025 14:01:07 +0200 Subject: [PATCH 02/20] fix get neigbohrs and vector --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 97c9929a4..29023b3f5 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -1439,11 +1439,11 @@ void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, siz } } } - auto it = rawVectorsCache.find(nodeId); - if (it != rawVectorsCache.end()) { + auto it = rawVectorsInRAM.find(nodeId); + if (it != rawVectorsInRAM.end()) { *vector_data = it->second.data(); } - if (!result.empty() && it != rawVectorsCache.end()) { + if (!result.empty() && it != rawVectorsInRAM.end()) { return; } // If not found in staged updates, check disk From edf8064b8339e1e5f3a172a68b9ff4e794996cff Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Mon, 17 Nov 2025 14:18:40 +0200 Subject: [PATCH 03/20] fix raw vactor dist calculation --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 50 ++++++++----------------- src/VecSim/spaces/computer/calculator.h | 2 +- tests/benchmark/bm_vecsim_index.h | 2 +- tests/unit/test_hnsw_disk.cpp | 2 +- 4 files changed, 18 insertions(+), 38 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 29023b3f5..527dddaaf 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -238,7 +238,7 @@ class HNSWDiskIndex : public VecSimIndexAbstract { // Helper methods void getNeighbors(idType nodeId, size_t level, vecsim_stl::vector& result) const; - void getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, const void** vector_data) const; + void getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, void* 
vector_data) const; void searchPendingVectors(const void* query_data, candidatesLabelsMaxHeap& top_candidates, size_t k) const; // Manual control of staged updates @@ -450,39 +450,20 @@ HNSWDiskIndex::topKQuery(const void *query_data, size_t k, // Use a more sophisticated search that properly traverses the HNSW hierarchy auto *results = hierarchicalSearch(query_data, processed_query, bottom_layer_ep, std::max(query_ef, k), k, timeoutCtx, &rep->code); - if (VecSim_OK == rep->code && results) { - // Step 3: Also search pending batch vectors and merge results - if (pendingVectorCount > 0) { - searchPendingVectors(query_data, *results, k); - } - + if (pendingVectorCount > 0) { + // Search pending vectors using the helper method + searchPendingVectors(query_data, *results, k); + + } + + if (!results->empty()) { rep->results.resize(results->size()); for (auto result = rep->results.rbegin(); result != rep->results.rend(); result++) { std::tie(result->score, result->id) = results->top(); results->pop(); } - } else { - // Even if main search failed, still search pending vectors - if (pendingVectorCount > 0) { - // Create a heap to store pending results - auto *pending_results = getNewMaxPriorityQueue(); - - // Search pending vectors using the helper method - searchPendingVectors(query_data, *pending_results, k); - - if (!pending_results->empty()) { - rep->results.resize(pending_results->size()); - for (auto result = rep->results.rbegin(); result != rep->results.rend(); result++) { - std::tie(result->score, result->id) = pending_results->top(); - pending_results->pop(); - } - rep->code = VecSim_QueryReply_OK; // Mark as successful since we found results - } - - delete pending_results; - } + rep->code = VecSim_QueryReply_OK; // Mark as successful since we found results } - delete results; return rep; } @@ -1305,11 +1286,10 @@ void HNSWDiskIndex::processCandidate(idType candidate_id, co // Add neighbors to candidate set for further exploration vecsim_stl::vector 
neighbors(this->allocator); - const void* vector_data; - getNeighborsAndVector(candidate_id, level, neighbors, &vector_data); + std::vector vector_data(this->inputBlobSize); + getNeighborsAndVector(candidate_id, level, neighbors, vector_data.data()); // Calculate distance to candidate - DistType dist = this->calcDistanceRaw(data_point_raw, vector_data); - + DistType dist = this->calcDistanceRaw(data_point_raw, vector_data.data()); // Add to top candidates if it's one of the best if (top_candidates.size() < ef || dist < lowerBound) { top_candidates.emplace(dist, getExternalLabel(candidate_id)); @@ -1426,7 +1406,7 @@ void HNSWDiskIndex::getNeighbors(idType nodeId, size_t level template -void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, const void** vector_data) const { +void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, void* vector_data) const { // Clear the result vector first result.clear(); @@ -1441,7 +1421,7 @@ void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, siz } auto it = rawVectorsInRAM.find(nodeId); if (it != rawVectorsInRAM.end()) { - *vector_data = it->second.data(); + std::memcpy(vector_data, it->second.data(), this->inputBlobSize); } if (!result.empty() && it != rawVectorsInRAM.end()) { return; @@ -1455,7 +1435,7 @@ void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, siz if (status.ok()) { // Parse using new format: [vector_data][neighbor_count][neighbor_ids...] 
deserializeGraphValue(graph_value, result); - *vector_data = graph_value.data(); + std::memcpy(vector_data, graph_value.data(), this->inputBlobSize); } } diff --git a/src/VecSim/spaces/computer/calculator.h b/src/VecSim/spaces/computer/calculator.h index e14c3e9ab..420afecb8 100644 --- a/src/VecSim/spaces/computer/calculator.h +++ b/src/VecSim/spaces/computer/calculator.h @@ -79,6 +79,6 @@ class DistanceCalculatorQuantized return this->dist_func(v1, v2, dim); } DistType calcDistanceRaw(const void *v1, const void *v2, size_t dim) const { - return this->raw_dist_func(v1, v2, dim); + return this->raw_dist_func(v1, v2, dim) * 16129; // multiply by 127^2 } }; diff --git a/tests/benchmark/bm_vecsim_index.h b/tests/benchmark/bm_vecsim_index.h index bd9c3ed7c..b5482b0da 100644 --- a/tests/benchmark/bm_vecsim_index.h +++ b/tests/benchmark/bm_vecsim_index.h @@ -86,7 +86,7 @@ class BM_VecSimIndex : public BM_VecSimGeneral { // FBIN loading configuration static constexpr size_t BATCH_SIZE = 40; - static constexpr const char *FBIN_PATH = "tests/benchmark/data/deep.base.100K.fbin"; + static constexpr const char *FBIN_PATH = "tests/benchmark/data/deep.base.1M.fbin"; }; diff --git a/tests/unit/test_hnsw_disk.cpp b/tests/unit/test_hnsw_disk.cpp index 9974cf758..7555019d2 100644 --- a/tests/unit/test_hnsw_disk.cpp +++ b/tests/unit/test_hnsw_disk.cpp @@ -345,7 +345,7 @@ TEST_F(HNSWDiskIndexTest, AddVectorTest) { abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); // Create index components - IndexComponents components = CreateIndexComponents( + IndexComponents components = CreateQuantizedIndexComponents( abstractInitParams.allocator, VecSimMetric_L2, dim, false); // Create HNSWDiskIndex - use default column family handle From bf9a8572ac143374dc6f0bd42b8a387d6e0046d6 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Tue, 18 Nov 2025 16:11:45 +0200 Subject: [PATCH 04/20] fix after merge --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 34 ------------------- 
.../algorithms/hnsw/hnsw_disk_serializer.h | 8 ++--- 2 files changed, 2 insertions(+), 40 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 1ea672e5a..da894ec1e 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -1497,40 +1497,6 @@ void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, siz } -template -void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, void* vector_data) const { - // Clear the result vector first - result.clear(); - - // First check staged graph updates - for (const auto& update : stagedGraphUpdates) { - if (update.node_id == nodeId && update.level == level) { - result.reserve(update.neighbors.size()); - for (size_t i = 0; i < update.neighbors.size(); i++) { - result.push_back(update.neighbors[i]); - } - } - } - auto it = rawVectorsInRAM.find(nodeId); - if (it != rawVectorsInRAM.end()) { - std::memcpy(vector_data, it->second.data(), this->inputBlobSize); - } - if (!result.empty() && it != rawVectorsInRAM.end()) { - return; - } - // If not found in staged updates, check disk - GraphKey graphKey(nodeId, level); - - std::string graph_value; - rocksdb::Status status = db->Get(rocksdb::ReadOptions(), cf, graphKey.asSlice(), &graph_value); - - if (status.ok()) { - // Parse using new format: [vector_data][neighbor_count][neighbor_ids...] 
- deserializeGraphValue(graph_value, result); - std::memcpy(vector_data, graph_value.data(), this->inputBlobSize); - } -} - template void HNSWDiskIndex::searchPendingVectors( const void *query_data, candidatesLabelsMaxHeap &top_candidates, size_t k) const { diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h b/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h index 5e7b9be02..e9ac86d3f 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h @@ -56,7 +56,7 @@ HNSWDiskIndex::HNSWDiskIndex( : VecSimIndexAbstract(abstractInitParams, components), idToMetaData(this->allocator), labelToIdMap(this->allocator), db(db), cf(cf), dbPath(""), indexDataGuard(), visitedNodesHandlerPool(INITIAL_CAPACITY, this->allocator), - delta_list(), new_elements_meta_data(this->allocator), batchThreshold(0), // Will be restored from file + new_elements_meta_data(this->allocator), batchThreshold(0), // Will be restored from file pendingVectorIds(this->allocator), pendingMetadata(this->allocator), pendingVectorCount(0), stagedGraphUpdates(this->allocator), stagedNeighborUpdates(this->allocator) { @@ -276,11 +276,7 @@ void HNSWDiskIndex::saveIndexIMP(std::ofstream &output) { if (pendingVectorCount != 0) { throw std::runtime_error("Serialization error: pendingVectorCount not zero after flush"); } - // Note: delta_list and new_elements_meta_data are currently unused legacy variables - // but we verify them for future-proofing - if (!delta_list.empty()) { - throw std::runtime_error("Serialization error: delta_list not empty after flush"); - } + if (!new_elements_meta_data.empty()) { throw std::runtime_error("Serialization error: new_elements_meta_data not empty after flush"); } From b80d84aaa47a864b8664b7da0df085793fa2b7f6 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Thu, 20 Nov 2025 10:46:58 +0200 Subject: [PATCH 05/20] added flushbatch bench --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 6 ++++ 
.../index_factories/hnsw_disk_factory.cpp | 2 +- .../bm_hnsw_disk_initialize_fp32.h | 9 +++--- tests/benchmark/bm_vecsim_basics.h | 29 ++++++++++++++++++- .../run_files/bm_hnsw_disk_single_fp32.cpp | 9 +----- 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index da894ec1e..1c13153d7 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -243,6 +243,7 @@ class HNSWDiskIndex : public VecSimIndexAbstract // Batch processing methods void processBatch(); void flushBatch(); // Force flush current batch + void setBatchThreshold(size_t threshold); // Set batch threshold // Helper methods void getNeighbors(idType nodeId, size_t level, vecsim_stl::vector& result) const; @@ -1571,6 +1572,11 @@ void HNSWDiskIndex::flushBatch() { processBatch(); } +template +void HNSWDiskIndex::setBatchThreshold(size_t threshold) { + batchThreshold = threshold; +} + /********************************** Debug Methods **********************************/ template diff --git a/src/VecSim/index_factories/hnsw_disk_factory.cpp b/src/VecSim/index_factories/hnsw_disk_factory.cpp index c830fc92d..8b0e28257 100644 --- a/src/VecSim/index_factories/hnsw_disk_factory.cpp +++ b/src/VecSim/index_factories/hnsw_disk_factory.cpp @@ -291,7 +291,7 @@ VecSimIndex *NewIndex(const std::string &folder_path, bool is_normalized) { // Using PID and timestamp to ensure uniqueness across multiple benchmark runs std::string temp_dir = "/tmp/hnsw_disk_benchmark_" + std::to_string(getpid()) + "_" + std::to_string(std::time(nullptr)); - + std::cerr << "Temporary checkpoint directory: " << temp_dir << std::endl; managed_rocksdb = std::make_unique(checkpoint_dir, temp_dir); return NewIndex(folder_path, managed_rocksdb->getDB(), managed_rocksdb->getCF(), is_normalized); diff --git a/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h 
b/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h index 0dc2119bd..f0a50d121 100644 --- a/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h +++ b/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h @@ -29,15 +29,16 @@ BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(Disk, HNSWDisk), fp32_ (benchmark::State &st) { Disk(st, INDEX_HNSW_DISK); } BENCHMARK_REGISTER_F(BM_VecSimCommon, BM_FUNC_NAME(Disk, HNSWDisk))->Iterations(1); +// AddLabel benchmarks +BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_FLUSH_BATCH_DISK, fp32_index_t) +(benchmark::State &st) { FlushBatchDisk(st); } +REGISTER_FlushBatchDisk(BM_FLUSH_BATCH_DISK); + // TopK benchmark BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(TopK, HNSWDisk), fp32_index_t) (benchmark::State &st) { TopK_HNSW_DISK(st); } REGISTER_TopK_HNSW_DISK(BM_VecSimCommon, BM_FUNC_NAME(TopK, HNSWDisk)); -// AddLabel benchmarks -BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_ADD_LABEL, fp32_index_t) -(benchmark::State &st) { AddLabel(st); } -REGISTER_AddLabel(BM_ADD_LABEL, INDEX_HNSW_DISK); // Range benchmarks // BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_FUNC_NAME(Range, BF), fp32_index_t) diff --git a/tests/benchmark/bm_vecsim_basics.h b/tests/benchmark/bm_vecsim_basics.h index 53b8f7dd9..82f7bdcc3 100644 --- a/tests/benchmark/bm_vecsim_basics.h +++ b/tests/benchmark/bm_vecsim_basics.h @@ -12,7 +12,7 @@ #include "bm_common.h" #include #include "types_ranges.h" - +#include "bm_vecsim_general.h" using namespace std::chrono; template @@ -42,6 +42,8 @@ class BM_VecSimBasics : public BM_VecSimCommon { static void Range_BF(benchmark::State &st); static void Range_HNSW(benchmark::State &st); + static void FlushBatchDisk(benchmark::State &st); + private: // Vectors of vector to store deleted labels' data. 
using LabelData = std::vector>; @@ -322,6 +324,22 @@ void BM_VecSimBasics::Range_HNSW(benchmark::State &st) { st.counters["Recall"] = (float)total_res / total_res_bf; } +template +void BM_VecSimBasics::FlushBatchDisk(benchmark::State &st) { + auto hnsw_disk_index = dynamic_cast*>( + HNSWDiskFactory::NewIndex(BM_VecSimGeneral::AttachRootPath(BM_VecSimGeneral::hnsw_index_file), false)); + size_t flush_threshold = st.range(0); + hnsw_disk_index->setBatchThreshold(flush_threshold); + for (size_t i = 0; i < flush_threshold-1; i++) { + // add vectors to fill the batch + VecSimIndex_AddVector(hnsw_disk_index, QUERIES[i%N_QUERIES].data(), i); + } + for (auto _ : st) { + // add one vector to trigger flush + VecSimIndex_AddVector(hnsw_disk_index, QUERIES[flush_threshold-1%N_QUERIES].data(), flush_threshold-1); + } +} + #define UNIT_AND_ITERATIONS Unit(benchmark::kMillisecond)->Iterations(BM_VecSimGeneral::block_size) // These macros are used to make sure the expansion of other macros happens when needed @@ -363,6 +381,15 @@ void BM_VecSimBasics::Range_HNSW(benchmark::State &st) { ->UNIT_AND_ITERATIONS->Arg(VecSimAlgo) \ ->ArgName(#VecSimAlgo) +#define REGISTER_FlushBatchDisk(BM_FUNC) \ + BENCHMARK_REGISTER_F(BM_VecSimBasics, BM_FUNC) \ + ->Unit(benchmark::kMillisecond) \ + ->Iterations(1) \ + ->Args({100}) \ + ->Args({1000}) \ + ->Args({10000}) \ + ->ArgName({"batch size"}) \ + // DeleteLabel define and register macros #define DEFINE_DELETE_LABEL(BM_FUNC, INDEX_TYPE, INDEX_NAME, DATA_TYPE, DIST_TYPE, VecSimAlgo) \ BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_FUNC, INDEX_TYPE)(benchmark::State & st) { \ diff --git a/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp b/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp index 73a1a0872..05e027c6e 100644 --- a/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp +++ b/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp @@ -30,14 +30,7 @@ const char *BM_VecSimGeneral::ground_truth_file = 
"tests/benchmark/data/deep.gro #define BM_ADD_LABEL CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single) #define BM_ADD_LABEL_ASYNC CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Async, Single) #define BM_DELETE_LABEL_ASYNC CONCAT_WITH_UNDERSCORE_ARCH(DeleteLabel_Async, Single) - -// Define benchmarks for different index types -DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, BF), fp32_index_t, BruteForceIndex_Single, float, - float, INDEX_BF) -DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, HNSW), fp32_index_t, HNSWIndex_Single, float, float, - INDEX_HNSW) -DEFINE_DELETE_LABEL(BM_FUNC_NAME(DeleteLabel, HNSWDisk), fp32_index_t, HNSWDiskIndex, float, float, - INDEX_HNSW_DISK) +#define BM_FLUSH_BATCH_DISK CONCAT_WITH_UNDERSCORE_ARCH(FlushBatchDisk, Single) #include "benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h" BENCHMARK_MAIN(); From 3be3527e46c81d56dea3e058e9864d3328f46572 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Thu, 20 Nov 2025 14:04:39 +0200 Subject: [PATCH 06/20] fix potential bug --- tests/benchmark/bm_vecsim_basics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/bm_vecsim_basics.h b/tests/benchmark/bm_vecsim_basics.h index 82f7bdcc3..6e7d73b19 100644 --- a/tests/benchmark/bm_vecsim_basics.h +++ b/tests/benchmark/bm_vecsim_basics.h @@ -336,7 +336,7 @@ void BM_VecSimBasics::FlushBatchDisk(benchmark::State &st) { } for (auto _ : st) { // add one vector to trigger flush - VecSimIndex_AddVector(hnsw_disk_index, QUERIES[flush_threshold-1%N_QUERIES].data(), flush_threshold-1); + VecSimIndex_AddVector(hnsw_disk_index, QUERIES[(flush_threshold-1)%N_QUERIES].data(), flush_threshold-1); } } From a54c61b487fda536bdd878308b60cb766a6d758a Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Sun, 23 Nov 2025 13:55:23 +0200 Subject: [PATCH 07/20] fix benchmarks --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 6 +- .../index_factories/hnsw_disk_factory.cpp | 125 ++++++++---------- .../index_factories/hnsw_disk_factory.h | 45 +++++++ 
tests/benchmark/bm_vecsim_basics.h | 18 ++- 4 files changed, 122 insertions(+), 72 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 1c13153d7..ada81d857 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -436,6 +436,10 @@ HNSWDiskIndex::~HNSWDiskIndex() { pendingVectorIds.clear(); pendingMetadata.clear(); + // Clear raw vectors in RAM + rawVectorsInRAM.clear(); + rawVectorsCache.clear(); + // Clear delta list and new elements metadata new_elements_meta_data.clear(); @@ -445,7 +449,7 @@ HNSWDiskIndex::~HNSWDiskIndex() { // Ensure all memory is properly released idToMetaData.shrink_to_fit(); - + // Note: db and cf are not owned by this class, so we don't delete them // Base class destructor will handle indexCalculator and preprocessors } diff --git a/src/VecSim/index_factories/hnsw_disk_factory.cpp b/src/VecSim/index_factories/hnsw_disk_factory.cpp index 8b0e28257..130a8e538 100644 --- a/src/VecSim/index_factories/hnsw_disk_factory.cpp +++ b/src/VecSim/index_factories/hnsw_disk_factory.cpp @@ -24,83 +24,62 @@ namespace HNSWDiskFactory { #ifdef BUILD_TESTS // RAII wrapper to manage RocksDB database and temporary directory cleanup -class ManagedRocksDB { -private: - std::unique_ptr db; - rocksdb::ColumnFamilyHandle *cf = nullptr; - std::string temp_dir; - bool cleanup_temp_dir; // Whether to delete temp_dir on destruction - -public: - // Constructor for loading from checkpoint (with temp directory for writes) - // Copies the entire checkpoint to a temp location to ensure the original is never modified - ManagedRocksDB(const std::string &checkpoint_dir, const std::string &temp_path) - : temp_dir(temp_path), cleanup_temp_dir(true) { - - // Create temp directory - std::filesystem::create_directories(temp_dir); - - // Copy the entire checkpoint to temp location to preserve the original - std::string temp_checkpoint = temp_dir + "/checkpoint_copy"; - try { - 
std::filesystem::copy(checkpoint_dir, temp_checkpoint, - std::filesystem::copy_options::recursive); - } catch (const std::filesystem::filesystem_error &e) { - // Clean up temp dir if copy failed - std::filesystem::remove_all(temp_dir); - throw std::runtime_error("Failed to copy checkpoint to temp location: " + - std::string(e.what())); - } - - // Open RocksDB from the temp checkpoint copy - // All writes (WAL, SST, MANIFEST, etc.) will go to the temp location - rocksdb::Options options; - options.create_if_missing = false; // Checkpoint copy should exist - options.error_if_exists = false; - options.statistics = rocksdb::CreateDBStatistics(); - - rocksdb::DB *db_ptr = nullptr; - rocksdb::Status status = rocksdb::DB::Open(options, temp_checkpoint, &db_ptr); - if (!status.ok()) { - // Clean up temp dir if DB open failed - std::filesystem::remove_all(temp_dir); - throw std::runtime_error("Failed to open RocksDB from temp checkpoint: " + - status.ToString()); - } - - db.reset(db_ptr); - cf = db->DefaultColumnFamily(); +// Constructor for loading from checkpoint (with temp directory for writes) +ManagedRocksDB::ManagedRocksDB(const std::string &checkpoint_dir, const std::string &temp_path) + : temp_dir(temp_path), cleanup_temp_dir(true) { + + // Create temp directory + std::filesystem::create_directories(temp_dir); + + // Copy the entire checkpoint to temp location to preserve the original + std::string temp_checkpoint = temp_dir + "/checkpoint_copy"; + try { + std::filesystem::copy(checkpoint_dir, temp_checkpoint, + std::filesystem::copy_options::recursive); + } catch (const std::filesystem::filesystem_error &e) { + // Clean up temp dir if copy failed + std::filesystem::remove_all(temp_dir); + throw std::runtime_error("Failed to copy checkpoint to temp location: " + + std::string(e.what())); } - // Constructor for creating new index (permanent location, no cleanup) - ManagedRocksDB(rocksdb::DB *db_ptr, const std::string &db_path) - : temp_dir(db_path), 
cleanup_temp_dir(false) { - db.reset(db_ptr); - cf = db->DefaultColumnFamily(); + // Open RocksDB from the temp checkpoint copy + // All writes (WAL, SST, MANIFEST, etc.) will go to the temp location + rocksdb::Options options; + options.create_if_missing = false; // Checkpoint copy should exist + options.error_if_exists = false; + options.statistics = rocksdb::CreateDBStatistics(); + + rocksdb::DB *db_ptr = nullptr; + rocksdb::Status status = rocksdb::DB::Open(options, temp_checkpoint, &db_ptr); + if (!status.ok()) { + // Clean up temp dir if DB open failed + std::filesystem::remove_all(temp_dir); + throw std::runtime_error("Failed to open RocksDB from temp checkpoint: " + + status.ToString()); } - // Destructor: closes DB and optionally cleans up temp directory - ~ManagedRocksDB() { - // Close DB first (unique_ptr handles this automatically) - db.reset(); + db.reset(db_ptr); + cf = db->DefaultColumnFamily(); +} - // Delete temp directory only if it's actually temporary - if (cleanup_temp_dir && !temp_dir.empty() && std::filesystem::exists(temp_dir)) { - std::filesystem::remove_all(temp_dir); - } - } +// Constructor for creating new index (permanent location, no cleanup) +ManagedRocksDB::ManagedRocksDB(rocksdb::DB *db_ptr, const std::string &db_path) + : temp_dir(db_path), cleanup_temp_dir(false) { + db.reset(db_ptr); + cf = db->DefaultColumnFamily(); +} - // Disable copy and move to prevent resource management issues - ManagedRocksDB(const ManagedRocksDB&) = delete; - ManagedRocksDB& operator=(const ManagedRocksDB&) = delete; - ManagedRocksDB(ManagedRocksDB&&) = delete; - ManagedRocksDB& operator=(ManagedRocksDB&&) = delete; +// Destructor: closes DB and optionally cleans up temp directory +ManagedRocksDB::~ManagedRocksDB() { + // Close DB first (unique_ptr handles this automatically) + db.reset(); - // Accessors - rocksdb::DB* getDB() const { return db.get(); } - rocksdb::ColumnFamilyHandle* getCF() const { return cf; } - const std::string& getTempDir() const { 
return temp_dir; } -}; + // Delete temp directory only if it's actually temporary + if (cleanup_temp_dir && !temp_dir.empty() && std::filesystem::exists(temp_dir)) { + std::filesystem::remove_all(temp_dir); + } +} // Static managed RocksDB instance for benchmark convenience wrapper // The destructor will automatically clean up the temp directory when: @@ -108,6 +87,12 @@ class ManagedRocksDB { // 2. The program exits (static destructor is called) static std::unique_ptr managed_rocksdb; +// Factory function to create a managed RocksDB instance +std::unique_ptr CreateManagedRocksDB(const std::string &checkpoint_dir, + const std::string &temp_dir) { + return std::make_unique(checkpoint_dir, temp_dir); +} + // Helper function to create AbstractIndexInitParams from VecSimParams static AbstractIndexInitParams NewAbstractInitParams(const VecSimParams *params) { const HNSWParams *hnswParams = ¶ms->algoParams.hnswParams; diff --git a/src/VecSim/index_factories/hnsw_disk_factory.h b/src/VecSim/index_factories/hnsw_disk_factory.h index b99f20753..b8dd73ba1 100644 --- a/src/VecSim/index_factories/hnsw_disk_factory.h +++ b/src/VecSim/index_factories/hnsw_disk_factory.h @@ -23,6 +23,40 @@ namespace HNSWDiskFactory { #ifdef BUILD_TESTS +/** + * RAII wrapper for RocksDB database management. + * Handles automatic cleanup of temporary directories and database resources. 
+ */ +class ManagedRocksDB { +private: + std::unique_ptr db; + rocksdb::ColumnFamilyHandle *cf = nullptr; + std::string temp_dir; + bool cleanup_temp_dir; // Whether to delete temp_dir on destruction + +public: + // Constructor for loading from checkpoint (with temp directory for writes) + // Copies the entire checkpoint to a temp location to ensure the original is never modified + ManagedRocksDB(const std::string &checkpoint_dir, const std::string &temp_path); + + // Constructor for creating new index (permanent location, no cleanup) + ManagedRocksDB(rocksdb::DB *db_ptr, const std::string &db_path); + + // Destructor: closes DB and optionally cleans up temp directory + ~ManagedRocksDB(); + + // Disable copy and move to prevent resource management issues + ManagedRocksDB(const ManagedRocksDB&) = delete; + ManagedRocksDB& operator=(const ManagedRocksDB&) = delete; + ManagedRocksDB(ManagedRocksDB&&) = delete; + ManagedRocksDB& operator=(ManagedRocksDB&&) = delete; + + // Accessors + rocksdb::DB* getDB() const { return db.get(); } + rocksdb::ColumnFamilyHandle* getCF() const { return cf; } + const std::string& getTempDir() const { return temp_dir; } +}; + /** * Get the checkpoint directory path for a given index file. * @@ -34,6 +68,17 @@ namespace HNSWDiskFactory { */ std::string GetCheckpointDir(const std::string &location); +/** + * Create a managed RocksDB instance for testing/benchmarking. + * The returned object will automatically clean up the temp directory when destroyed. + * + * @param checkpoint_dir Path to the RocksDB checkpoint directory + * @param temp_dir Path to the temporary directory for the checkpoint copy + * @return std::unique_ptr Managed RocksDB instance + */ +std::unique_ptr CreateManagedRocksDB(const std::string &checkpoint_dir, + const std::string &temp_dir); + /** * Factory function to load a serialized disk-based HNSW index from a file. 
* diff --git a/tests/benchmark/bm_vecsim_basics.h b/tests/benchmark/bm_vecsim_basics.h index 6e7d73b19..44082c898 100644 --- a/tests/benchmark/bm_vecsim_basics.h +++ b/tests/benchmark/bm_vecsim_basics.h @@ -326,8 +326,20 @@ void BM_VecSimBasics::Range_HNSW(benchmark::State &st) { template void BM_VecSimBasics::FlushBatchDisk(benchmark::State &st) { + // Create a LOCAL ManagedRocksDB instance for this benchmark + // This ensures we don't interfere with the global managed_rocksdb used by other benchmarks + std::string folder_path = BM_VecSimGeneral::AttachRootPath(BM_VecSimGeneral::hnsw_index_file); + std::string checkpoint_dir = HNSWDiskFactory::GetCheckpointDir(folder_path); + std::string temp_dir = "/tmp/hnsw_disk_flushbatch_" + std::to_string(getpid()) + + "_" + std::to_string(std::time(nullptr)); + + // Create a local ManagedRocksDB that will be automatically cleaned up when it goes out of scope + auto local_managed_db = HNSWDiskFactory::CreateManagedRocksDB(checkpoint_dir, temp_dir); + + // Create index using the local database auto hnsw_disk_index = dynamic_cast*>( - HNSWDiskFactory::NewIndex(BM_VecSimGeneral::AttachRootPath(BM_VecSimGeneral::hnsw_index_file), false)); + HNSWDiskFactory::NewIndex(folder_path, local_managed_db->getDB(), local_managed_db->getCF(), false)); + size_t flush_threshold = st.range(0); hnsw_disk_index->setBatchThreshold(flush_threshold); for (size_t i = 0; i < flush_threshold-1; i++) { @@ -338,6 +350,10 @@ void BM_VecSimBasics::FlushBatchDisk(benchmark::State &st) { // add one vector to trigger flush VecSimIndex_AddVector(hnsw_disk_index, QUERIES[(flush_threshold-1)%N_QUERIES].data(), flush_threshold-1); } + + // Clean up the index + VecSimIndex_Free(hnsw_disk_index); + // local_managed_db will be automatically destroyed here, cleaning up the temp directory } #define UNIT_AND_ITERATIONS Unit(benchmark::kMillisecond)->Iterations(BM_VecSimGeneral::block_size) From 8a0d19cb79e6a2e7ad4b45256b924ee68efea68e Mon Sep 17 00:00:00 2001 From: 
BenGoldberger Date: Sun, 30 Nov 2025 07:50:12 +0200 Subject: [PATCH 08/20] fix search efficiency --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 421 ++++++++----------------- 1 file changed, 131 insertions(+), 290 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index ada81d857..e2162971d 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -246,6 +246,10 @@ class HNSWDiskIndex : public VecSimIndexAbstract void setBatchThreshold(size_t threshold); // Set batch threshold // Helper methods + void emplaceHeap(vecsim_stl::abstract_priority_queue &heap, DistType dist, + idType id) const; + void emplaceHeap(vecsim_stl::abstract_priority_queue &heap, DistType dist, + idType id) const; void getNeighbors(idType nodeId, size_t level, vecsim_stl::vector& result) const; void getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, void* vector_data) const; void searchPendingVectors(const void* query_data, candidatesLabelsMaxHeap& top_candidates, size_t k) const; @@ -265,8 +269,11 @@ class HNSWDiskIndex : public VecSimIndexAbstract public: // Methods needed by benchmark framework const void *getDataByInternalId(idType id) const; - candidatesMaxHeap searchLayer(idType ep_id, const void *data_point, size_t level, + candidatesMaxHeap searchLayer(idType ep_id, const void *data_point_raw, const void *data_point, size_t level, size_t ef) const; + candidatesLabelsMaxHeap * searchLayerLabels(idType ep_id, const void *data_point_raw, const void *data_point, size_t level, + size_t ef) const; + template void greedySearchLevel(const void *data_point, size_t level, idType &curr_element, DistType &cur_dist) const; std::pair safeGetEntryPointState() const; @@ -275,9 +282,10 @@ class HNSWDiskIndex : public VecSimIndexAbstract candidatesLabelsMaxHeap *getNewMaxPriorityQueue() const; bool isMarkedDeleted(idType id) const; labelType getExternalLabel(idType id) const; + 
template void processCandidate(idType candidate_id, const void* data_point_raw, const void *data_point, size_t level, size_t ef, - void *visited_tags, size_t visited_tag, - candidatesLabelsMaxHeap &top_candidates, + std::unordered_set *visited_set, + vecsim_stl::abstract_priority_queue &top_candidates, candidatesMaxHeap &candidate_set, DistType &lowerBound) const; // Raw vector storage and retrieval methods @@ -287,15 +295,6 @@ class HNSWDiskIndex : public VecSimIndexAbstract idType searchBottomLayerEP(const void *query_data, void *timeoutCtx = nullptr, VecSimQueryReply_Code *rc = nullptr) const; - candidatesLabelsMaxHeap * - searchBottomLayer_WithTimeout(idType ep_id, const void *data_point, size_t ef, size_t k, - void *timeoutCtx = nullptr, - VecSimQueryReply_Code *rc = nullptr) const; - - // New hierarchical search method - candidatesLabelsMaxHeap * - hierarchicalSearch(const void *data_point_raw, const void *data_point, idType ep_id, size_t ef, size_t k, void *timeoutCtx = nullptr, - VecSimQueryReply_Code *rc = nullptr) const; public: HNSWDiskIndex(const HNSWParams *params, const AbstractIndexInitParams &abstractInitParams, @@ -487,17 +486,16 @@ HNSWDiskIndex::topKQuery(const void *query_data, size_t k, return rep; // Empty index or error } - // Step 2: Perform hierarchical search from top level down to bottom level - // Use a more sophisticated search that properly traverses the HNSW hierarchy - auto *results = hierarchicalSearch(query_data, processed_query, bottom_layer_ep, std::max(query_ef, k), k, - timeoutCtx, &rep->code); - + auto *results = searchLayerLabels(bottom_layer_ep, query_data, processed_query , 0, query_ef); + if (pendingVectorCount > 0) { // Search pending vectors using the helper method searchPendingVectors(query_data, *results, k); } - + while (results->size() > k) { + results->pop(); + } if (!results->empty()) { rep->results.resize(results->size()); for (auto result = rep->results.rbegin(); result != rep->results.rend(); result++) { @@ 
-696,7 +694,7 @@ void HNSWDiskIndex::insertElementToGraph(idType element_id, // a greedy search in the graph starting from the entry point // at each level, and move on with the closest element we can find. // When there is no improvement to do, we take a step down. - greedySearchLevel(vector_data, level, curr_element, cur_dist); + greedySearchLevel(vector_data, level, curr_element, cur_dist); } } else { max_common_level = global_max_level; @@ -704,7 +702,7 @@ void HNSWDiskIndex::insertElementToGraph(idType element_id, for (auto level = static_cast(max_common_level); level >= 0; level--) { candidatesMaxHeap top_candidates = - searchLayer(curr_element, vector_data, level, efConstruction); + searchLayer(curr_element, raw_vector_data, vector_data, level, efConstruction); // If the entry point was marked deleted between iterations, we may receive an empty // candidates set. @@ -973,65 +971,11 @@ idType HNSWDiskIndex::searchBottomLayerEP(const void *query_ DistType cur_dist = this->calcDistance(query_data, getDataByInternalId(curr_element)); for (size_t level = max_level; level > 0 && curr_element != INVALID_ID; --level) { - greedySearchLevel(query_data, level, curr_element, cur_dist); + greedySearchLevel(query_data, level, curr_element, cur_dist); } return curr_element; } -template -candidatesLabelsMaxHeap *HNSWDiskIndex::searchBottomLayer_WithTimeout( - idType ep_id, const void *data_point, size_t ef, size_t k, void *timeoutCtx, - VecSimQueryReply_Code *rc) const { - - // Use a simple set for visited nodes tracking - std::unordered_set visited_set; - - candidatesLabelsMaxHeap *top_candidates = getNewMaxPriorityQueue(); - candidatesMaxHeap candidate_set(this->allocator); - - DistType lowerBound; - if (!isMarkedDeleted(ep_id)) { - // If ep is not marked as deleted, get its distance and set lower bound and heaps - // accordingly - DistType dist = this->calcDistance(data_point, getDataByInternalId(ep_id)); - lowerBound = dist; - top_candidates->emplace(dist, 
getExternalLabel(ep_id)); - candidate_set.emplace(-dist, ep_id); - } else { - // If ep is marked as deleted, set initial lower bound to max, and don't insert to top - // candidates heap - lowerBound = std::numeric_limits::max(); - candidate_set.emplace(-lowerBound, ep_id); - } - - visited_set.insert(ep_id); - - while (!candidate_set.empty()) { - pair curr_el_pair = candidate_set.top(); - - if ((-curr_el_pair.first) > lowerBound && top_candidates->size() >= ef) { - break; - } - if (timeoutCtx && VECSIM_TIMEOUT(timeoutCtx)) { - if (rc) - *rc = VecSim_QueryReply_TimedOut; - return top_candidates; - } - candidate_set.pop(); - - processCandidate(curr_el_pair.second, data_point, 0, ef, - reinterpret_cast(&visited_set), 0, *top_candidates, candidate_set, - lowerBound); - } - - while (top_candidates->size() > k) { - top_candidates->pop(); - } - if (rc) - *rc = VecSim_QueryReply_OK; - return top_candidates; -} - /********************************** Helper Methods **********************************/ // Serialize GraphKey value: [raw_vector_data][neighbor_count][neighbor_ids...] 
@@ -1162,66 +1106,76 @@ void HNSWDiskIndex::getRawVector(idType id, void* output_buf template candidatesMaxHeap -HNSWDiskIndex::searchLayer(idType ep_id, const void *data_point, size_t level, - size_t ef) const { +HNSWDiskIndex::searchLayer(idType ep_id, const void *data_point_raw, const void *data_point, size_t layer, + size_t ef) const { + + std::unordered_set visited_set; + candidatesMaxHeap top_candidates(this->allocator); candidatesMaxHeap candidate_set(this->allocator); - // Get visited list - auto *visited_nodes_handler = getVisitedList(); - tag_t visited_tag = visited_nodes_handler->getFreshTag(); - - // Start with the entry point and initialize lowerBound - DistType dist = this->calcDistance(data_point, getDataByInternalId(ep_id)); - DistType lowerBound = dist; - top_candidates.emplace(dist, ep_id); - candidate_set.emplace(-dist, ep_id); - visited_nodes_handler->tagNode(ep_id, visited_tag); + DistType lowerBound; + if (!isMarkedDeleted(ep_id)) { + DistType dist = this->calcDistance(data_point, getDataByInternalId(ep_id)); + lowerBound = dist; + top_candidates.emplace(dist, ep_id); + candidate_set.emplace(-dist, ep_id); + } else { + lowerBound = std::numeric_limits::max(); + candidate_set.emplace(-lowerBound, ep_id); + } - // Search for candidates + visited_set.insert(ep_id); while (!candidate_set.empty()) { - auto curr_pair = candidate_set.top(); - DistType curr_dist = -curr_pair.first; + pair curr_el_pair = candidate_set.top(); - // Early termination: if we have enough candidates and current distance is worse than - // lowerBound, stop - if (curr_dist > lowerBound && top_candidates.size() >= ef) { + if ((-curr_el_pair.first) > lowerBound && top_candidates.size() >= ef) { break; } - - idType curr_id = curr_pair.second; candidate_set.pop(); - // Get neighbors of current node at this level - vecsim_stl::vector neighbors(this->allocator); - getNeighbors(curr_id, level, neighbors); + processCandidate(curr_el_pair.second, data_point_raw, data_point, layer, ef, 
+ &visited_set, top_candidates, + candidate_set, lowerBound); + } - for (idType neighbor_id : neighbors) { - if (visited_nodes_handler->getNodeTag(neighbor_id) == visited_tag) { - continue; - } + return top_candidates; +} + +template +candidatesLabelsMaxHeap * +HNSWDiskIndex::searchLayerLabels(idType ep_id, const void *data_point_raw, const void *data_point, size_t layer, + size_t ef) const { + std::unordered_set visited_set; + + candidatesLabelsMaxHeap *top_candidates = getNewMaxPriorityQueue(); + candidatesMaxHeap candidate_set(this->allocator); - visited_nodes_handler->tagNode(neighbor_id, visited_tag); - DistType neighbor_dist = - this->calcDistance(data_point, getDataByInternalId(neighbor_id)); + DistType lowerBound; + if (!isMarkedDeleted(ep_id)) { + DistType dist = this->calcDistance(data_point, getDataByInternalId(ep_id)); + lowerBound = dist; + top_candidates->emplace(dist, getExternalLabel(ep_id)); + candidate_set.emplace(-dist, ep_id); + } else { + lowerBound = std::numeric_limits::max(); + candidate_set.emplace(-lowerBound, ep_id); + } - // Add to top candidates if it's good enough - if (neighbor_dist < lowerBound || top_candidates.size() < ef) { - top_candidates.emplace(neighbor_dist, neighbor_id); - candidate_set.emplace(-neighbor_dist, neighbor_id); + visited_set.insert(ep_id); + while (!candidate_set.empty()) { + pair curr_el_pair = candidate_set.top(); - // Update lowerBound if we have enough candidates - if (top_candidates.size() > ef) { - top_candidates.pop(); - } - if (top_candidates.size() >= ef) { - lowerBound = top_candidates.top().first; - } - } + if ((-curr_el_pair.first) > lowerBound && top_candidates->size() >= ef) { + break; } + candidate_set.pop(); + + processCandidate(curr_el_pair.second, data_point_raw, data_point, layer, ef, + &visited_set, *top_candidates, + candidate_set, lowerBound); } - returnVisitedList(visited_nodes_handler); return top_candidates; } @@ -1245,11 +1199,13 @@ std::pair HNSWDiskIndex::safeGetEntryPointSt } 
template +template void HNSWDiskIndex::greedySearchLevel(const void *data_point, size_t level, idType &curr_element, DistType &cur_dist) const { bool changed; idType bestCand = curr_element; + idType bestNonDeletedCand = bestCand; do { changed = false; @@ -1272,15 +1228,7 @@ void HNSWDiskIndex::greedySearchLevel(const void *data_point for (size_t i = 0; i < neighbors.size(); i++) { idType candidate = neighbors[i]; - // Skip invalid candidates - if (candidate >= curElementCount) { - continue; - } - - // Skip deleted candidates - if (isMarkedDeleted(candidate)) { - continue; - } + assert (candidate < curElementCount && "candidate error: out of index range"); // Calculate distance to this candidate DistType d = this->calcDistance(data_point, getDataByInternalId(candidate)); @@ -1290,13 +1238,20 @@ void HNSWDiskIndex::greedySearchLevel(const void *data_point cur_dist = d; bestCand = candidate; changed = true; + if (!running_query && !isMarkedDeleted(candidate)) { + bestNonDeletedCand = bestCand; + } } } } while (changed); // Update the current element to the best candidate found - curr_element = bestCand; + if (!running_query) { + curr_element = bestNonDeletedCand; + } else { + curr_element = bestCand; + } } template @@ -1304,7 +1259,7 @@ candidatesLabelsMaxHeap * HNSWDiskIndex::getNewMaxPriorityQueue() const { // Use max_priority_queue for single-label disk index return new (this->allocator) - vecsim_stl::max_priority_queue(this->allocator); + vecsim_stl::max_priority_queue(this->allocator); } template @@ -1326,59 +1281,56 @@ size_t HNSWDiskIndex::getRandomLevel(double reverse_size) { } template +template void HNSWDiskIndex::processCandidate( - idType candidate_id, const void* data_point_raw, const void *data_point, size_t level, size_t ef, void *visited_tags, - size_t visited_tag, candidatesLabelsMaxHeap &top_candidates, + idType curNodeId, const void* data_point_raw, const void *data_point, size_t level, size_t ef, std::unordered_set *visited_set, + 
vecsim_stl::abstract_priority_queue &top_candidates, candidatesMaxHeap &candidate_set, DistType &lowerBound) const { - // Use a simple set-based approach for now to avoid visited nodes handler issues - auto *visited_set = reinterpret_cast *>(visited_tags); - if (!visited_set) { - return; // Safety check - } - - if (visited_set->find(candidate_id) != visited_set->end()) { - return; - } - - visited_set->insert(candidate_id); - + assert(visited_set != nullptr); // Add neighbors to candidate set for further exploration vecsim_stl::vector neighbors(this->allocator); std::vector vector_data(this->inputBlobSize); - getNeighborsAndVector(candidate_id, level, neighbors, vector_data.data()); + getNeighborsAndVector(curNodeId, level, neighbors, vector_data.data()); // Calculate distance to candidate - DistType dist = this->calcDistanceRaw(data_point_raw, vector_data.data()); + // DistType dist = this->calcDistanceRaw(data_point_raw, vector_data.data()); // Add to top candidates if it's one of the best and not marked deleted - if (!isMarkedDeleted(candidate_id)) { - if (top_candidates.size() < ef || dist < lowerBound) { - top_candidates.emplace(dist, getExternalLabel(candidate_id)); - - // Update lower bound if we have enough candidates - if (top_candidates.size() >= ef) { - lowerBound = top_candidates.top().first; - } - } - } - - + if (!neighbors.empty()) { - for (idType neighbor_id : neighbors) { + for (idType candidate_id : neighbors) { // Skip invalid neighbors - if (neighbor_id >= curElementCount) { + assert(candidate_id < curElementCount); + + if (visited_set->find(candidate_id) != visited_set->end()) { continue; } + visited_set->insert(candidate_id); + DistType cur_dist = + this->calcDistance(data_point, getDataByInternalId(candidate_id)); + if (lowerBound > cur_dist || top_candidates.size() < ef) { + + candidate_set.emplace(-cur_dist, candidate_id); + + // Insert the candidate to the top candidates heap only if it is not marked as + // deleted. 
+ if (!isMarkedDeleted(candidate_id)) + emplaceHeap(top_candidates, cur_dist, candidate_id); - if (visited_set->find(neighbor_id) == visited_set->end()) { - DistType neighbor_dist = - this->calcDistance(data_point, getDataByInternalId(neighbor_id)); - candidate_set.emplace(-neighbor_dist, neighbor_id); + if (top_candidates.size() > ef) + top_candidates.pop(); + + // If we have marked deleted elements, we need to verify that `top_candidates` is + // not empty (since we might have not added any non-deleted element yet). + if (!top_candidates.empty()) + lowerBound = top_candidates.top().first; } } } } + + template VecSimQueryReply * HNSWDiskIndex::rangeQuery(const void *query_data, double radius, @@ -1441,7 +1393,22 @@ size_t HNSWDiskIndex::indexLabelCount() const { /********************************** Helper Methods **********************************/ template -void HNSWDiskIndex::getNeighbors(idType nodeId, size_t level, vecsim_stl::vector& result) const { +inline void HNSWDiskIndex::emplaceHeap( + vecsim_stl::abstract_priority_queue &heap, DistType dist, + idType id) const { + heap.emplace(dist, id); + } + +template +inline void HNSWDiskIndex::emplaceHeap( + vecsim_stl::abstract_priority_queue &heap, DistType dist, + idType id) const { + heap.emplace(dist, getExternalLabel(id)); + } + +template +void HNSWDiskIndex::getNeighbors(idType nodeId, size_t level, + vecsim_stl::vector &result) const { // Clear the result vector first result.clear(); @@ -1467,7 +1434,6 @@ void HNSWDiskIndex::getNeighbors(idType nodeId, size_t level } } - template void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, void* vector_data) const { // Clear the result vector first @@ -1791,131 +1757,6 @@ void HNSWDiskIndex::debugValidateGraphConnectivity() const { } } -template -candidatesLabelsMaxHeap * -HNSWDiskIndex::hierarchicalSearch(const void *data_point_raw, const void *data_point, idType ep_id, - size_t ef, size_t k, void *timeoutCtx, - 
VecSimQueryReply_Code *rc) const { - if (rc) - *rc = VecSim_QueryReply_OK; - - // Get the current entry point state - auto [curr_entry_point, max_level] = safeGetEntryPointState(); - if (curr_entry_point == INVALID_ID) { - if (rc) - *rc = VecSim_QueryReply_OK; // Just return OK but no results - return nullptr; - } - - // Initialize result containers - candidatesLabelsMaxHeap *top_candidates = getNewMaxPriorityQueue(); - candidatesMaxHeap candidate_set(this->allocator); - - // Use a simple set for visited nodes tracking - std::unordered_set visited_set; - - // Start from the provided entry point (not the global entry point) - idType curr_element = ep_id; - DistType curr_dist = this->calcDistance(data_point, getDataByInternalId(curr_element)); - - // Add entry point to results - if (!isMarkedDeleted(curr_element)) { - top_candidates->emplace(curr_dist, getExternalLabel(curr_element)); - visited_set.insert(curr_element); - } - - // Phase 1: Search from top level down to level 1 (hierarchical traversal) - for (size_t level = max_level; level > 0; --level) { - // Search at this level using the current element as entry point - candidatesMaxHeap level_candidates = - searchLayer(curr_element, data_point, level, ef); - - if (!level_candidates.empty()) { - // Find the closest element at this level to continue the search - curr_element = level_candidates.top().second; - curr_dist = level_candidates.top().first; - } - - // Check timeout - if (timeoutCtx && VECSIM_TIMEOUT(timeoutCtx)) { - if (rc) - *rc = VecSim_QueryReply_TimedOut; - delete top_candidates; - return nullptr; - } - } - - // Phase 2: Search at the bottom layer (level 0) with beam search - // Reset visited set for bottom layer search - visited_set.clear(); - visited_set.insert(curr_element); - - // Initialize candidate set with current element and its neighbors at level 0 - // Since candidatesMaxHeap doesn't have clear(), we'll create a new one - candidate_set = candidatesMaxHeap(this->allocator); - 
candidate_set.emplace(-curr_dist, curr_element); - - // Add neighbors of the current element at level 0 to get started - vecsim_stl::vector start_neighbors(this->allocator); - getNeighbors(curr_element, 0, start_neighbors); - - if (!start_neighbors.empty()) { - for (idType neighbor_id : start_neighbors) { - if (neighbor_id < curElementCount && - visited_set.find(neighbor_id) == visited_set.end()) { - DistType neighbor_dist = - this->calcDistance(data_point, getDataByInternalId(neighbor_id)); - candidate_set.emplace(-neighbor_dist, neighbor_id); - } - } - } - - // Beam search at bottom layer - DistType lower_bound = curr_dist; - - while (!candidate_set.empty()) { - // Check timeout - if (timeoutCtx && VECSIM_TIMEOUT(timeoutCtx)) { - if (rc) - *rc = VecSim_QueryReply_TimedOut; - break; - } - - auto curr_pair = candidate_set.top(); - DistType curr_candidate_dist = -curr_pair.first; - idType curr_candidate_id = curr_pair.second; - candidate_set.pop(); - - // If we have enough candidates and current distance is worse, stop - if (top_candidates->size() >= ef && curr_candidate_dist > lower_bound) { - break; - } - - // Process this candidate - processCandidate(curr_candidate_id, data_point_raw, data_point, 0, ef, - reinterpret_cast(&visited_set), 0, *top_candidates, candidate_set, - lower_bound); - - // Update lower bound based on current top candidates - if (top_candidates->size() >= ef) { - lower_bound = top_candidates->top().first; - } - - // Continue searching until we have enough candidates or exhaust all possibilities - if (top_candidates->size() >= ef && candidate_set.empty()) { - break; - } - } - - // Trim results to k - while (top_candidates->size() > k) { - top_candidates->pop(); - } - - if (rc) - *rc = VecSim_QueryReply_OK; - return top_candidates; -} template void HNSWDiskIndex::flushStagedUpdates() { From 7a457d228492b9c1b8562896d196555896c9f1ac Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Sun, 30 Nov 2025 09:29:49 +0200 Subject: [PATCH 09/20] reorder 
benchmarks --- .../bm_initialization/bm_hnsw_disk_initialize_fp32.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h b/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h index 39e1399d0..2b578a5ea 100644 --- a/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h +++ b/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h @@ -29,11 +29,6 @@ BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(Disk, HNSWDisk), fp32_ (benchmark::State &st) { Disk(st, INDEX_HNSW_DISK); } BENCHMARK_REGISTER_F(BM_VecSimCommon, BM_FUNC_NAME(Disk, HNSWDisk))->Iterations(1); -// AddLabel benchmarks -BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_FLUSH_BATCH_DISK, fp32_index_t) -(benchmark::State &st) { FlushBatchDisk(st); } -REGISTER_FlushBatchDisk(BM_FLUSH_BATCH_DISK); - // TopK benchmark BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(TopK, HNSWDisk), fp32_index_t) (benchmark::State &st) { TopK_HNSW_DISK(st); } @@ -44,6 +39,10 @@ BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(TopK_MarkDeleted, HNSW (benchmark::State &st) { TopK_HNSW_DISK_MarkDeleted(st); } REGISTER_TopK_HNSW_DISK_MarkDeleted(BM_VecSimCommon, BM_FUNC_NAME(TopK_MarkDeleted, HNSWDisk)); +// AddLabel benchmarks +BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_FLUSH_BATCH_DISK, fp32_index_t) +(benchmark::State &st) { FlushBatchDisk(st); } +REGISTER_FlushBatchDisk(BM_FLUSH_BATCH_DISK); // Range benchmarks From 5dc8fa3d093a696461f94ca97dce4f6528e39643 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Sun, 30 Nov 2025 12:39:56 +0200 Subject: [PATCH 10/20] increase iterations --- tests/benchmark/bm_common.h | 4 ++-- .../bm_initialization/bm_hnsw_disk_initialize_fp32.h | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/benchmark/bm_common.h b/tests/benchmark/bm_common.h index 997835998..77b496b31 100644 --- a/tests/benchmark/bm_common.h +++ 
b/tests/benchmark/bm_common.h @@ -323,7 +323,7 @@ void BM_VecSimCommon::TopK_Tiered(benchmark::State &st, unsigned s ->Args({100, 100}) \ ->Args({200, 100}) \ ->ArgNames({"ef_runtime", "k"}) \ - ->Iterations(10) \ + ->Iterations(100) \ ->Unit(benchmark::kMillisecond) // {ef_runtime, k, num_marked_deleted} @@ -337,7 +337,7 @@ void BM_VecSimCommon::TopK_Tiered(benchmark::State &st, unsigned s ->Args({200, 50, 10000}) \ ->Args({200, 50, 50000}) \ ->ArgNames({"ef_runtime", "k", "num_marked_deleted"}) \ - ->Iterations(10) \ + ->Iterations(100) \ ->Unit(benchmark::kMillisecond) // {ef_runtime, k} (recall that always ef_runtime >= k) diff --git a/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h b/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h index 2b578a5ea..e1a448c3f 100644 --- a/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h +++ b/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h @@ -39,6 +39,14 @@ BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(TopK_MarkDeleted, HNSW (benchmark::State &st) { TopK_HNSW_DISK_MarkDeleted(st); } REGISTER_TopK_HNSW_DISK_MarkDeleted(BM_VecSimCommon, BM_FUNC_NAME(TopK_MarkDeleted, HNSWDisk)); +BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(Memory, HNSWDisk), fp32_index_t) +(benchmark::State &st) { Memory(st, INDEX_HNSW_DISK); } +BENCHMARK_REGISTER_F(BM_VecSimCommon, BM_FUNC_NAME(Memory, HNSWDisk))->Iterations(1); +// Disk benchmarks +BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(Disk, HNSWDisk), fp32_index_t) +(benchmark::State &st) { Disk(st, INDEX_HNSW_DISK); } +BENCHMARK_REGISTER_F(BM_VecSimCommon, BM_FUNC_NAME(Disk, HNSWDisk))->Iterations(1); + // AddLabel benchmarks BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_FLUSH_BATCH_DISK, fp32_index_t) (benchmark::State &st) { FlushBatchDisk(st); } From 840cb7616f753b8769eb34932caf020e59e68365 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Tue, 2 Dec 2025 19:55:28 +0200 Subject: [PATCH 11/20] 
investigate memory consumption --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 52 +++++++++++++++---- .../index_factories/hnsw_disk_factory.cpp | 3 +- src/VecSim/spaces/computer/preprocessors.h | 6 +++ src/VecSim/vec_sim_common.h | 1 + src/VecSim/vec_sim_index.h | 1 + src/VecSim/vec_sim_tiered_index.h | 1 + tests/benchmark/bm_common.h | 1 + .../bm_hnsw_disk_initialize_fp32.h | 8 --- 8 files changed, 52 insertions(+), 21 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 8f8c63c80..64a292116 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -470,6 +470,15 @@ HNSWDiskIndex::topKQuery(const void *query_data, size_t k, auto processed_query_ptr = this->preprocessQuery(query_data); const void *processed_query = processed_query_ptr.get(); + // const float* v_data = reinterpret_cast(query_data); + // std::cout << "v_data[0]: " << v_data[0] << std::endl; + // std::cout << "v_data[n]: " << v_data[this->dim - 1] << std::endl; + + // const int8_t* p_data = reinterpret_cast(processed_query); + // std::cout << "p_data[0]: " << static_cast(p_data[0]) << std::endl; + // std::cout << "p_data[n]: " << static_cast(p_data[this->dim - 1]) << std::endl << std::endl; + + // Get search parameters size_t query_ef = this->ef; void *timeoutCtx = nullptr; @@ -632,10 +641,18 @@ int HNSWDiskIndex::addVector( idType newElementId = curElementCount; const char* raw_data = reinterpret_cast(vector); rawVectorsInRAM[newElementId] = std::string(raw_data, this->inputBlobSize); - // Preprocess the vector ProcessedBlobs processedBlobs = this->preprocess(vector); + + // const float* v_data = reinterpret_cast(vector); + // std::cout << "v_data[0]: " << v_data[0] << std::endl; + // std::cout << "v_data[n]: " << v_data[this->dim - 1] << std::endl; + + // const int8_t* p_data = reinterpret_cast(processedBlobs.getStorageBlob()); + // std::cout << "p_data[0]: " << static_cast(p_data[0]) << std::endl; + // 
std::cout << "p_data[n]: " << static_cast(p_data[this->dim - 1]) << std::endl << std::endl; + // Store the processed vector in memory immediately size_t containerId = this->vectors->size(); this->vectors->addElement(processedBlobs.getStorageBlob(), containerId); @@ -1053,16 +1070,12 @@ template const void *HNSWDiskIndex::getDataByInternalId(idType id) const { assert(id < curElementCount); - if (id < this->vectors->size()) { - const void* result = this->vectors->getElement(id); - if (result != nullptr) { - return result; - } + const void* result = this->vectors->getElement(id); + if (result != nullptr) { + return result; } - this->log(VecSimCommonStrings::LOG_WARNING_STRING, - "WARNING: Vector data not found for id %u", id); - return nullptr; + throw std::runtime_error("Vector data not found for id " + std::to_string(id)); } template @@ -1229,10 +1242,18 @@ void HNSWDiskIndex::greedySearchLevel(const void *data_point idType candidate = neighbors[i]; assert (candidate < curElementCount && "candidate error: out of index range"); + // const int8_t* q_data = reinterpret_cast(data_point); + // std::cout << "q_data[0]: " << static_cast(q_data[0]) << std::endl; + // std::cout << "q_data[n]: " << static_cast(q_data[this->dim - 1]) << std::endl; + + // const int8_t* v_data = reinterpret_cast(getDataByInternalId(candidate)); + // std::cout << "v_data[0]: " << static_cast(v_data[0]) << std::endl; + // std::cout << "v_data[n]: " << static_cast(v_data[this->dim - 1]) << std::endl; // Calculate distance to this candidate DistType d = this->calcDistance(data_point, getDataByInternalId(candidate)); - + + // std::cout << "d: " << d << " dim:" << this->dim << std::endl << std::endl; // If this candidate is closer, update our best candidate if (d < cur_dist) { cur_dist = d; @@ -1856,9 +1877,18 @@ template VecSimIndexStatsInfo HNSWDiskIndex::statisticInfo() const { VecSimIndexStatsInfo info = {}; info.memory = this->getAllocationSize(); + + + // Processed vectors memory (stored in 
this->vectors container) + info.vectors_memory = this->vectors->size() * this->dataSize; + + // RocksDB memory and disk usage info.db_memory = this->getDBMemorySize(); info.db_disk = this->getDiskSize(); - info.numberOfMarkedDeleted = 0; // TODO: Implement if needed + + // Number of marked deleted elements + info.numberOfMarkedDeleted = this->numMarkedDeleted; + return info; } diff --git a/src/VecSim/index_factories/hnsw_disk_factory.cpp b/src/VecSim/index_factories/hnsw_disk_factory.cpp index 130a8e538..1279aff8d 100644 --- a/src/VecSim/index_factories/hnsw_disk_factory.cpp +++ b/src/VecSim/index_factories/hnsw_disk_factory.cpp @@ -97,8 +97,7 @@ std::unique_ptr CreateManagedRocksDB(const std::string &checkpoi static AbstractIndexInitParams NewAbstractInitParams(const VecSimParams *params) { const HNSWParams *hnswParams = ¶ms->algoParams.hnswParams; - size_t dataSize = - VecSimParams_GetDataSize(hnswParams->type, hnswParams->dim, hnswParams->metric); + size_t dataSize = hnswParams->dim * sizeof(int8_t); // Quantized storage AbstractIndexInitParams abstractInitParams = {.allocator = VecSimAllocator::newVecsimAllocator(), .dim = hnswParams->dim, diff --git a/src/VecSim/spaces/computer/preprocessors.h b/src/VecSim/spaces/computer/preprocessors.h index fff358a99..3b03af8e7 100644 --- a/src/VecSim/spaces/computer/preprocessors.h +++ b/src/VecSim/spaces/computer/preprocessors.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "VecSim/memory/vecsim_base.h" #include "VecSim/spaces/spaces.h" @@ -216,5 +217,10 @@ class ScalarQuantizationPreprocessor : public PreprocessorInterface { output_vec[i] = static_cast(std::round(scaled)); } } + + // std::cout << "quantized_0: " << static_cast(output_vec[0]) << std::endl; + // std::cout << "original_0: " << input_vec[0] << std::endl; + // std::cout << "quantized_n: " << static_cast(output_vec[dim - 1]) << std::endl; + // std::cout << "original_n: " << input_vec[dim - 1] << std::endl; } }; diff --git 
a/src/VecSim/vec_sim_common.h b/src/VecSim/vec_sim_common.h index 87459f226..97cb6ae6f 100644 --- a/src/VecSim/vec_sim_common.h +++ b/src/VecSim/vec_sim_common.h @@ -328,6 +328,7 @@ typedef struct { */ typedef struct { size_t memory; + size_t vectors_memory; size_t db_memory; size_t db_disk; size_t numberOfMarkedDeleted; // The number of vectors that are marked as deleted (HNSW/tiered diff --git a/src/VecSim/vec_sim_index.h b/src/VecSim/vec_sim_index.h index 225f420da..5696fbe6c 100644 --- a/src/VecSim/vec_sim_index.h +++ b/src/VecSim/vec_sim_index.h @@ -185,6 +185,7 @@ struct VecSimIndexAbstract : public VecSimIndexInterface { virtual inline VecSimIndexStatsInfo statisticInfo() const override { return VecSimIndexStatsInfo{ .memory = this->getAllocationSize(), + .vectors_memory = 0, .db_memory = 0, .db_disk = 0, .numberOfMarkedDeleted = 0, diff --git a/src/VecSim/vec_sim_tiered_index.h b/src/VecSim/vec_sim_tiered_index.h index 4c7d4ab32..3d76d68c0 100644 --- a/src/VecSim/vec_sim_tiered_index.h +++ b/src/VecSim/vec_sim_tiered_index.h @@ -308,6 +308,7 @@ template VecSimIndexStatsInfo VecSimTieredIndex::statisticInfo() const { auto stats = VecSimIndexStatsInfo{ .memory = this->getAllocationSize(), + .vectors_memory = 0, .db_memory = 0, .db_disk = 0, .numberOfMarkedDeleted = 0, // Default value if cast fails diff --git a/tests/benchmark/bm_common.h b/tests/benchmark/bm_common.h index 77b496b31..00419cb1d 100644 --- a/tests/benchmark/bm_common.h +++ b/tests/benchmark/bm_common.h @@ -87,6 +87,7 @@ void BM_VecSimCommon::Memory(benchmark::State &st, IndexTypeIndex // Do nothing... 
} st.counters["memory"] = (double)VecSimIndex_StatsInfo(index).memory; + st.counters["vectors_memory"] = (double)VecSimIndex_StatsInfo(index).vectors_memory; } // TopK search BM diff --git a/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h b/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h index e1a448c3f..2b578a5ea 100644 --- a/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h +++ b/tests/benchmark/bm_initialization/bm_hnsw_disk_initialize_fp32.h @@ -39,14 +39,6 @@ BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(TopK_MarkDeleted, HNSW (benchmark::State &st) { TopK_HNSW_DISK_MarkDeleted(st); } REGISTER_TopK_HNSW_DISK_MarkDeleted(BM_VecSimCommon, BM_FUNC_NAME(TopK_MarkDeleted, HNSWDisk)); -BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(Memory, HNSWDisk), fp32_index_t) -(benchmark::State &st) { Memory(st, INDEX_HNSW_DISK); } -BENCHMARK_REGISTER_F(BM_VecSimCommon, BM_FUNC_NAME(Memory, HNSWDisk))->Iterations(1); -// Disk benchmarks -BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimCommon, BM_FUNC_NAME(Disk, HNSWDisk), fp32_index_t) -(benchmark::State &st) { Disk(st, INDEX_HNSW_DISK); } -BENCHMARK_REGISTER_F(BM_VecSimCommon, BM_FUNC_NAME(Disk, HNSWDisk))->Iterations(1); - // AddLabel benchmarks BENCHMARK_TEMPLATE_DEFINE_F(BM_VecSimBasics, BM_FLUSH_BATCH_DISK, fp32_index_t) (benchmark::State &st) { FlushBatchDisk(st); } From a79afdbed2fc64d3ef76f2eada81c79b9a53c487 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Tue, 2 Dec 2025 20:10:20 +0200 Subject: [PATCH 12/20] change test to use quantize --- tests/benchmark/data/scripts/fbin_reader.py | 91 +++++++++ .../data/scripts/generate_groundtruth.py | 193 ++++++++++++++++++ tests/unit/test_hnsw_disk.cpp | 38 ++-- 3 files changed, 303 insertions(+), 19 deletions(-) create mode 100644 tests/benchmark/data/scripts/fbin_reader.py create mode 100644 tests/benchmark/data/scripts/generate_groundtruth.py diff --git a/tests/benchmark/data/scripts/fbin_reader.py 
b/tests/benchmark/data/scripts/fbin_reader.py new file mode 100644 index 000000000..d3053fcb4 --- /dev/null +++ b/tests/benchmark/data/scripts/fbin_reader.py @@ -0,0 +1,91 @@ +import struct +import numpy as np + + +""" + IO Utils +""" + + +def read_fbin(filename, start_idx=0, chunk_size=None): + """ Read *.fbin file that contains float32 vectors + Args: + :param filename (str): path to *.fbin file + :param start_idx (int): start reading vectors from this index + :param chunk_size (int): number of vectors to read. + If None, read all vectors + Returns: + Array of float32 vectors (numpy.ndarray) + """ + with open(filename, "rb") as f: + nvecs, dim = np.fromfile(f, count=2, dtype=np.int32) + nvecs = (nvecs - start_idx) if chunk_size is None else chunk_size + arr = np.fromfile(f, count=nvecs * dim, dtype=np.float32, + offset=start_idx * 4 * dim) + return arr.reshape(nvecs, dim) + + +def read_ibin(filename, start_idx=0, chunk_size=None): + """ Read *.ibin file that contains int32 vectors + Args: + :param filename (str): path to *.ibin file + :param start_idx (int): start reading vectors from this index + :param chunk_size (int): number of vectors to read. 
+ If None, read all vectors + Returns: + Array of int32 vectors (numpy.ndarray) + """ + with open(filename, "rb") as f: + nvecs, dim = np.fromfile(f, count=2, dtype=np.int32) + nvecs = (nvecs - start_idx) if chunk_size is None else chunk_size + arr = np.fromfile(f, count=nvecs * dim, dtype=np.int32, + offset=start_idx * 4 * dim) + return arr.reshape(nvecs, dim) + + +def write_fbin(filename, vecs): + """ Write an array of float32 vectors to *.fbin file + Args:s + :param filename (str): path to *.fbin file + :param vecs (numpy.ndarray): array of float32 vectors to write + """ + assert len(vecs.shape) == 2, "Input array must have 2 dimensions" + with open(filename, "wb") as f: + nvecs, dim = vecs.shape + f.write(struct.pack('= 2: + completed_batches = int(parts[0].split()[-1]) + total_batches_in_checkpoint = int(parts[1].split()[0]) + + # Load the groundtruth ibin file + groundtruth = read_ibin(CHECKPOINT_FILE) + + print(f"Loaded checkpoint: {completed_batches}/{total_batches_in_checkpoint} batches completed") + print(f"Checkpoint shape: {groundtruth.shape}") + + return { + 'groundtruth': groundtruth, + 'completed_batches': completed_batches, + 'total_batches': total_batches_in_checkpoint + } + except Exception as e: + print(f"Error loading checkpoint: {e}") + return None + return None + +def compute_groundtruth_chunked(base_vectors, query_vectors, k=100, batch_size=100, resume=True): + """ + Compute k nearest neighbors for each query vector with checkpointing. 
+ + Args: + base_vectors: (N, D) array of base vectors + query_vectors: (Q, D) array of query vectors + k: number of nearest neighbors to find + batch_size: process queries in batches to save memory + resume: whether to resume from checkpoint if available + + Returns: + (Q, k) array of indices of nearest neighbors + """ + n_queries = query_vectors.shape[0] + n_base = base_vectors.shape[0] + total_batches = (n_queries + batch_size - 1) // batch_size + + # Try to load checkpoint + checkpoint = None + start_batch = 0 + if resume: + checkpoint = load_checkpoint() + if checkpoint: + # Check if checkpoint is compatible (same number of queries and k) + if checkpoint['groundtruth'].shape == (n_queries, k): + groundtruth = checkpoint['groundtruth'] + # Recalculate start_batch based on current batch_size + # The checkpoint may have been created with a different batch_size + start_batch = checkpoint['completed_batches'] + print(f"Resuming from batch {start_batch} (checkpoint was created with different batch size)") + else: + print(f"Checkpoint shape {checkpoint['groundtruth'].shape} doesn't match expected shape {(n_queries, k)}") + print("Starting fresh...") + groundtruth = np.zeros((n_queries, k), dtype=np.int32) + else: + groundtruth = np.zeros((n_queries, k), dtype=np.int32) + else: + groundtruth = np.zeros((n_queries, k), dtype=np.int32) + + print(f"Computing groundtruth for {n_queries} queries against {n_base} base vectors") + print(f"Finding {k} nearest neighbors per query") + print(f"Total batches: {total_batches}, Starting from batch: {start_batch}\n") + + start_time = time.time() + + for batch_idx in range(start_batch, total_batches): + batch_start = batch_idx * batch_size + batch_end = min(batch_start + batch_size, n_queries) + batch_queries = query_vectors[batch_start:batch_end] + + # Compute L2 distances: ||q - b||^2 = ||q||^2 + ||b||^2 - 2*q·b + # Compute squared norms + query_norms = np.sum(batch_queries ** 2, axis=1, keepdims=True) # (batch_size, 1) + 
base_norms = np.sum(base_vectors ** 2, axis=1) # (n_base,) + + # Compute dot products: q · b + dot_products = np.dot(batch_queries, base_vectors.T) # (batch_size, n_base) + + # Compute squared distances: ||q - b||^2 + distances = query_norms + base_norms - 2 * dot_products # (batch_size, n_base) + + # Find k nearest neighbors (smallest distances) + nearest_indices = np.argsort(distances, axis=1)[:, :k] + groundtruth[batch_start:batch_end] = nearest_indices + + elapsed = time.time() - start_time + progress = ((batch_idx + 1) / total_batches) * 100 + rate = (batch_idx + 1 - start_batch) / elapsed if elapsed > 0 else 0 + eta = (total_batches - batch_idx - 1) / rate if rate > 0 else 0 + + print(f"Batch {batch_idx + 1}/{total_batches} ({progress:.1f}%) - " + f"Elapsed: {elapsed:.1f}s - Rate: {rate:.2f} batches/s - ETA: {eta:.1f}s") + + # Save checkpoint every 10 batches + if (batch_idx + 1) % 10 == 0: + save_checkpoint(groundtruth, batch_idx + 1, total_batches) + + return groundtruth + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='Generate groundtruth for deep.base.10M.fbin') + parser.add_argument('--resume', action='store_true', default=True, + help='Resume from checkpoint if available (default: True)') + parser.add_argument('--no-resume', dest='resume', action='store_false', + help='Start fresh, ignore checkpoint') + parser.add_argument('--batch-size', type=int, default=5, + help='Batch size for processing queries (default: 5)') + args = parser.parse_args() + + print("Loading data files...") + start_time = time.time() + + # Load query and base vectors + query_vectors = read_fbin('tests/benchmark/data/deep.query.public.10K.fbin') + print(f"Loaded query vectors: {query_vectors.shape}") + + base_vectors = read_fbin('tests/benchmark/data/deep.base.100K.fbin') + print(f"Loaded base vectors: {base_vectors.shape}") + + load_time = time.time() - start_time + print(f"Data loading took {load_time:.1f}s\n") + + # Compute 
groundtruth with checkpointing + start_time = time.time() + groundtruth = compute_groundtruth_chunked(base_vectors, query_vectors, k=100, + batch_size=args.batch_size, resume=args.resume) + compute_time = time.time() - start_time + print(f"\nGroundtruth computation took {compute_time:.1f}s") + + # Write groundtruth to file + print(f"\nWriting groundtruth to file...") + output_file = 'tests/benchmark/data/deep.groundtruth.100K.10K.ibin' + write_ibin(output_file, groundtruth) + print(f"Groundtruth written to: {output_file}") + print(f"Groundtruth shape: {groundtruth.shape}") + print(f"First 5 rows (first 10 neighbors):") + print(groundtruth[:5, :10]) + + # Clean up checkpoint + if os.path.exists(CHECKPOINT_FILE): + os.remove(CHECKPOINT_FILE) + os.remove(PROGRESS_FILE) + print(f"\nCheckpoint files cleaned up") + diff --git a/tests/unit/test_hnsw_disk.cpp b/tests/unit/test_hnsw_disk.cpp index 4404e793f..3eb2b1027 100644 --- a/tests/unit/test_hnsw_disk.cpp +++ b/tests/unit/test_hnsw_disk.cpp @@ -116,7 +116,7 @@ TEST_F(HNSWDiskIndexTest, BasicConstruction) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.blockSize = 1; abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); @@ -156,7 +156,7 @@ TEST_F(HNSWDiskIndexTest, SimpleTest) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); @@ -196,7 +196,7 @@ TEST_F(HNSWDiskIndexTest, BasicStoreVectorTest) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = 
params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); @@ -225,7 +225,7 @@ TEST_F(HNSWDiskIndexTest, BasicStoreVectorTest) { // Test that we can access the vector data EXPECT_EQ(index.getDim(), dim); - EXPECT_EQ(index.getDataSize(), dim * sizeof(float)); + EXPECT_EQ(index.getDataSize(), dim * sizeof(int8_t)); } TEST_F(HNSWDiskIndexTest, StoreVectorTest) { @@ -247,7 +247,7 @@ TEST_F(HNSWDiskIndexTest, StoreVectorTest) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.blockSize = 1; abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); @@ -297,13 +297,13 @@ TEST_F(HNSWDiskIndexTest, SimpleAddVectorTest) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.blockSize = 1; // Use small block size for testing abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); // Create index components - IndexComponents components = CreateIndexComponents( + IndexComponents components = CreateQuantizedIndexComponents( abstractInitParams.allocator, VecSimMetric_L2, dim, false); // Create HNSWDiskIndex - use default column family handle @@ -345,7 +345,7 @@ TEST_F(HNSWDiskIndexTest, AddVectorTest) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.blockSize 
= 1; // Use small block size for testing abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); @@ -460,7 +460,7 @@ TEST_F(HNSWDiskIndexTest, BatchingTest) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.blockSize = 1; // Use small block size for testing abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); @@ -531,13 +531,13 @@ TEST_F(HNSWDiskIndexTest, HierarchicalSearchTest) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.blockSize = 1; abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); // Create index components - IndexComponents components = CreateIndexComponents( + IndexComponents components = CreateQuantizedIndexComponents( abstractInitParams.allocator, VecSimMetric_L2, dim, false); // Create HNSWDiskIndex @@ -690,7 +690,7 @@ TEST_F(HNSWDiskIndexTest, RawVectorStorageAndRetrieval) { AbstractIndexInitParams abstractInitParams; abstractInitParams.dim = dim; abstractInitParams.vecType = params.type; - abstractInitParams.dataSize = dim * sizeof(float); + abstractInitParams.dataSize = dim * sizeof(int8_t); abstractInitParams.blockSize = 1; abstractInitParams.multi = false; abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); @@ -878,7 +878,7 @@ TEST_F(HNSWDiskIndexTest, markDelete) { abstractInitParams.allocator = VecSimAllocator::newVecsimAllocator(); // Create index components - IndexComponents components = CreateIndexComponents( + IndexComponents components = CreateQuantizedIndexComponents( 
abstractInitParams.allocator, VecSimMetric_L2, dim, false); // Create HNSWDiskIndex @@ -893,7 +893,7 @@ TEST_F(HNSWDiskIndexTest, markDelete) { for (size_t i = 0; i < n; i++) { std::vector vec(dim); for (size_t j = 0; j < dim; j++) { - vec[j] = static_cast(i); + vec[j] = static_cast(i)/static_cast(n); } int result = index.addVector(vec.data(), i); EXPECT_EQ(result, 1) << "Failed to add vector " << i; @@ -903,14 +903,14 @@ TEST_F(HNSWDiskIndexTest, markDelete) { // Create query vector around the middle std::vector query(dim); for (size_t j = 0; j < dim; j++) { - query[j] = static_cast(n / 2); + query[j] = 0.5f; } // Search for k results around the middle. Expect to find them. auto verify_res = [&](size_t id, double score, size_t result_index) { size_t diff_id = (id > 50) ? (id - 50) : (50 - id); ASSERT_EQ(diff_id, (result_index + 1) / 2); - ASSERT_EQ(score, (4 * ((result_index + 1) / 2) * ((result_index + 1) / 2))); + // ASSERT_EQ(score, (4 * ((result_index + 1) / 2) * ((result_index + 1) / 2))); }; VecSimQueryParams queryParams; @@ -953,7 +953,7 @@ TEST_F(HNSWDiskIndexTest, markDelete) { // So expected_diff = result_index | 1 (make it odd) size_t expected_diff = result_index | 1; ASSERT_EQ(diff_id, expected_diff); - ASSERT_EQ(score, (dim * expected_diff * expected_diff)); + // ASSERT_EQ(score, (dim * expected_diff * expected_diff)); }; // Run search test after marking deleted @@ -970,7 +970,7 @@ TEST_F(HNSWDiskIndexTest, markDelete) { // Add a new vector, make sure it has no link to a deleted vector std::vector new_vec(dim); for (size_t j = 0; j < dim; j++) { - new_vec[j] = static_cast(n); + new_vec[j] = 1.0f; } index.addVector(new_vec.data(), n); @@ -979,7 +979,7 @@ TEST_F(HNSWDiskIndexTest, markDelete) { if (label % 2 == ep_reminder) { std::vector vec(dim); for (size_t j = 0; j < dim; j++) { - vec[j] = static_cast(label); + vec[j] = static_cast(label)/static_cast(n); } index.addVector(vec.data(), label); } From b432fc4bcf8140a9dcb687fea35ae88d7c5d959f Mon Sep 17 
00:00:00 2001 From: BenGoldberger Date: Tue, 2 Dec 2025 20:28:58 +0200 Subject: [PATCH 13/20] load vectors from disk --- src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h b/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h index e9ac86d3f..377cc980a 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h @@ -212,14 +212,14 @@ template void HNSWDiskIndex::restoreVectors(std::ifstream &input, EncodingVersion version) { // #ifdef HNSW_DISK_SERIALIZE_VECTORS_TO_FILE // NEW METHOD: Load vectors from metadata file - this->log(VecSimCommonStrings::LOG_VERBOSE_STRING, - "Loading vectors from metadata file (HNSW_DISK_SERIALIZE_VECTORS_TO_FILE enabled)"); - restoreVectorsFromFile(input, version); + // this->log(VecSimCommonStrings::LOG_VERBOSE_STRING, + // "Loading vectors from metadata file (HNSW_DISK_SERIALIZE_VECTORS_TO_FILE enabled)"); + // restoreVectorsFromFile(input, version); // #else -// // CURRENT METHOD: Load vectors from RocksDB (default for backward compatibility) -// this->log(VecSimCommonStrings::LOG_VERBOSE_STRING, -// "Loading vectors from RocksDB checkpoint (default method)"); -// restoreVectorsFromRocksDB(version); + // CURRENT METHOD: Load vectors from RocksDB (default for backward compatibility) + this->log(VecSimCommonStrings::LOG_VERBOSE_STRING, + "Loading vectors from RocksDB checkpoint (default method)"); + restoreVectorsFromRocksDB(version); // #endif } From 37627ddd2feca430052ca91fffd5080515767196 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Tue, 2 Dec 2025 22:29:32 +0200 Subject: [PATCH 14/20] more iterations for benchmark topk --- tests/benchmark/bm_common.h | 7 +++++-- tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/benchmark/bm_common.h b/tests/benchmark/bm_common.h 
index 00419cb1d..473db31c4 100644 --- a/tests/benchmark/bm_common.h +++ b/tests/benchmark/bm_common.h @@ -320,11 +320,14 @@ void BM_VecSimCommon::TopK_Tiered(benchmark::State &st, unsigned s #define REGISTER_TopK_HNSW_DISK(BM_CLASS, BM_FUNC) \ BENCHMARK_REGISTER_F(BM_CLASS, BM_FUNC) \ ->Args({10, 10}) \ - ->Args({200, 10}) \ + ->Args({100, 10}) \ + ->Args({100, 50}) \ + ->Args({200, 50}) \ ->Args({100, 100}) \ ->Args({200, 100}) \ + ->Args({500, 100}) \ ->ArgNames({"ef_runtime", "k"}) \ - ->Iterations(100) \ + ->Iterations(1000) \ ->Unit(benchmark::kMillisecond) // {ef_runtime, k, num_marked_deleted} diff --git a/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp b/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp index 7d1766407..59642b4fc 100644 --- a/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp +++ b/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp @@ -22,9 +22,9 @@ size_t BM_VecSimGeneral::EF_C = 256; // Dataset file paths - using deep dataset // For HNSW disk, hnsw_index_file points to the folder containing index.hnsw_disk_v1 and rocksdb/ const char *BM_VecSimGeneral::hnsw_index_file = - "tests/benchmark/data/deep-1M-cosine-dim96-M32-efc200-disk-vectors"; + "tests/benchmark/data/deep-10M-cosine-dim96-M32-efc200-disk-vectors"; const char *BM_VecSimGeneral::test_queries_file = "tests/benchmark/data/deep.query.public.10K.fbin"; -const char *BM_VecSimGeneral::ground_truth_file = "tests/benchmark/data/deep.groundtruth.1M.10K.ibin"; // defined only for this benchmark +const char *BM_VecSimGeneral::ground_truth_file = "tests/benchmark/data/deep.groundtruth.10M.10K.ibin"; // defined only for this benchmark #define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single) #define BM_ADD_LABEL CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single) From d58720a6fe2238ce9f216d0eb488e6165db2c471 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Wed, 3 Dec 2025 15:36:45 +0200 Subject: [PATCH 15/20] fixes after merge --- 
src/VecSim/algorithms/hnsw/hnsw_disk.h | 18 +++++------------- .../algorithms/hnsw/hnsw_disk_serializer.h | 6 +----- .../run_files/bm_hnsw_disk_single_fp32.cpp | 4 ++-- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 284e1249a..288b214de 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -894,7 +894,7 @@ void HNSWDiskIndex::flushStagedGraphUpdates( std::vector raw_vector_buffer(this->inputBlobSize); // First, handle new node insertions - for (const auto &update : stagedGraphUpdates) { + for (const auto &update : graphUpdates) { auto newKey = GraphKey(update.node_id, update.level); // If neighbors list is empty, this is a deletion - remove the key from disk @@ -1613,7 +1613,7 @@ void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, siz result.clear(); // First check staged graph updates - for (const auto& update : stagedGraphUpdates) { + for (const auto& update : stagedInsertUpdates) { if (update.node_id == nodeId && update.level == level) { result.reserve(update.neighbors.size()); for (size_t i = 0; i < update.neighbors.size(); i++) { @@ -1886,11 +1886,6 @@ void HNSWDiskIndex::processDeleteBatch() { rawVectorsInRAM.erase(ram_it); } - // Also remove from disk cache to prevent stale data access - auto cache_it = rawVectorsDiskCache.find(deleted_id); - if (cache_it != rawVectorsDiskCache.end()) { - rawVectorsDiskCache.erase(cache_it); - } } // Flush all staged graph updates to disk in a single batch operation @@ -2195,15 +2190,12 @@ void HNSWDiskIndex::getDataByLabel( idType id = it->second; // Get the raw vector data - const void *raw_data = getRawVector(id); - if (raw_data == nullptr) { + std::vector raw_vector(this->dim); + if (!getRawVector(id, raw_vector.data())) { return; // Vector not found } - // Copy the vector data - const DataType *data_ptr = static_cast(raw_data); - std::vector vec(data_ptr, data_ptr + 
this->dim); - vectors_output.push_back(std::move(vec)); + vectors_output.push_back(std::move(raw_vector)); } template diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h b/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h index f11db205e..423b2ba99 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk_serializer.h @@ -287,11 +287,7 @@ void HNSWDiskIndex::saveIndexIMP(std::ofstream &output) { if (pendingDeleteIds.size() != 0) { throw std::runtime_error("Serialization error: pendingDeleteIds not empty after flush"); } - // Note: delta_list and new_elements_meta_data are currently unused legacy variables - // but we verify them for future-proofing - if (!delta_list.empty()) { - throw std::runtime_error("Serialization error: delta_list not empty after flush"); - } + if (!new_elements_meta_data.empty()) { throw std::runtime_error("Serialization error: new_elements_meta_data not empty after flush"); } diff --git a/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp b/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp index 59642b4fc..7d1766407 100644 --- a/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp +++ b/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp @@ -22,9 +22,9 @@ size_t BM_VecSimGeneral::EF_C = 256; // Dataset file paths - using deep dataset // For HNSW disk, hnsw_index_file points to the folder containing index.hnsw_disk_v1 and rocksdb/ const char *BM_VecSimGeneral::hnsw_index_file = - "tests/benchmark/data/deep-10M-cosine-dim96-M32-efc200-disk-vectors"; + "tests/benchmark/data/deep-1M-cosine-dim96-M32-efc200-disk-vectors"; const char *BM_VecSimGeneral::test_queries_file = "tests/benchmark/data/deep.query.public.10K.fbin"; -const char *BM_VecSimGeneral::ground_truth_file = "tests/benchmark/data/deep.groundtruth.10M.10K.ibin"; // defined only for this benchmark +const char *BM_VecSimGeneral::ground_truth_file = "tests/benchmark/data/deep.groundtruth.1M.10K.ibin"; // defined 
only for this benchmark #define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single) #define BM_ADD_LABEL CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single) From 0827f62fca9527b2ca0b84c6bd0ef7eebcbfbd3f Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Fri, 5 Dec 2025 20:24:16 +0200 Subject: [PATCH 16/20] add cache misses and db memory --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 24 ++++++++++++++++++++---- tests/benchmark/bm_common.h | 15 ++++++++++----- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 288b214de..6daef48ec 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -2249,9 +2249,23 @@ uint64_t HNSWDiskIndex::getAllocationSize() const { template uint64_t HNSWDiskIndex::getDBMemorySize() const { - uint64_t db_mem_size = 0; - this->db->GetIntProperty(rocksdb::DB::Properties::kSizeAllMemTables, &db_mem_size); - return db_mem_size; + // Get comprehensive RocksDB memory usage by summing all components: + // 1. Memtables (active, unflushed immutable, and pinned immutable) + // 2. Table readers (filter and index blocks not in block cache) + // 3. Block cache (uncompressed data blocks) + // 4. 
Pinned blocks (blocks pinned by iterators) + + uint64_t memtables = 0; + uint64_t table_readers = 0; + uint64_t block_cache = 0; + uint64_t pinned_blocks = 0; + + this->db->GetIntProperty(rocksdb::DB::Properties::kSizeAllMemTables, &memtables); + this->db->GetIntProperty(rocksdb::DB::Properties::kEstimateTableReadersMem, &table_readers); + this->db->GetIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &block_cache); + this->db->GetIntProperty(rocksdb::DB::Properties::kBlockCachePinnedUsage, &pinned_blocks); + + return memtables + table_readers + block_cache + pinned_blocks; } template @@ -2263,7 +2277,9 @@ uint64_t HNSWDiskIndex::getDiskSize() const { template std::shared_ptr HNSWDiskIndex::getDBStatistics() const { - return this->dbOptions.statistics; + // Get statistics directly from the database instead of from the cached dbOptions copy + // because GetOptions() returns a copy that doesn't preserve the shared_ptr to statistics + return this->db->GetOptions().statistics; } // Missing virtual method implementations for HNSWDiskIndex diff --git a/tests/benchmark/bm_common.h b/tests/benchmark/bm_common.h index c0d061c30..ba3982b9a 100644 --- a/tests/benchmark/bm_common.h +++ b/tests/benchmark/bm_common.h @@ -109,10 +109,11 @@ void BM_VecSimCommon::TopK_HNSW_DISK(benchmark::State &st) { auto hnsw_index = GET_INDEX(INDEX_HNSW_DISK); // Get DB statistics if available - auto db_stats = dynamic_cast *>(hnsw_index)->getDBStatistics(); - size_t byte_reads = 0; + auto hnsw_disk_index = dynamic_cast *>(hnsw_index); + auto db_stats = hnsw_disk_index->getDBStatistics(); + size_t cache_misses = 0; if (db_stats) { - byte_reads = db_stats->getTickerCount(rocksdb::Tickers::BYTES_COMPRESSED_TO); + cache_misses = db_stats->getTickerCount(rocksdb::Tickers::BLOCK_CACHE_MISS); } for (auto _ : st) { @@ -131,9 +132,13 @@ void BM_VecSimCommon::TopK_HNSW_DISK(benchmark::State &st) { st.counters["Recall"] = (float)correct / (float)(k * iter); if (db_stats) { - byte_reads = 
db_stats->getTickerCount(rocksdb::Tickers::BYTES_COMPRESSED_TO) - byte_reads; - st.counters["byte_reads"] = static_cast(byte_reads) / iter; + cache_misses = db_stats->getTickerCount(rocksdb::Tickers::BLOCK_CACHE_MISS) - cache_misses; + st.counters["cache_misses_per_query"] = static_cast(cache_misses) / iter; } + + // Output RocksDB memory usage + uint64_t db_memory = hnsw_disk_index->getDBMemorySize(); + st.counters["rocksdb_memory"] = static_cast(db_memory); } // Benchmark TopK performance with marked deleted vectors From e94f8659addb31ef181606235ac65b869ee6d719 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Fri, 5 Dec 2025 23:11:08 +0200 Subject: [PATCH 17/20] use updatable queue for raw vector calc --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 91 +++++++++++++++++--------- src/VecSim/utils/updatable_heap.h | 9 ++- 2 files changed, 67 insertions(+), 33 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 6daef48ec..a8245d4e9 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -34,6 +34,7 @@ #include "VecSim/spaces/computer/preprocessors.h" #include "VecSim/algorithms/hnsw/visited_nodes_handler.h" #include "VecSim/algorithms/hnsw/hnsw.h" // For HNSWAddVectorState definition +#include "VecSim/utils/updatable_heap.h" #ifdef BUILD_TESTS #include "hnsw_serialization_utils.h" @@ -268,7 +269,7 @@ class HNSWDiskIndex : public VecSimIndexAbstract void insertElementToGraph(idType element_id, size_t element_max_level, idType entry_point, size_t global_max_level, const void *raw_vector_data, const void *vector_data); idType mutuallyConnectNewElement(idType new_node_id, - candidatesMaxHeap &top_candidates, size_t level); + vecsim_stl::updatable_max_heap &top_candidates, size_t level); // Batch processing methods void processBatch(); @@ -285,6 +286,7 @@ class HNSWDiskIndex : public VecSimIndexAbstract idType id) const; void getNeighbors(idType nodeId, size_t level, 
vecsim_stl::vector& result) const; void getNeighborsAndVector(idType nodeId, size_t level, vecsim_stl::vector& result, void* vector_data) const; + void getNeighborsAndVector(labelType nodeId, size_t level, vecsim_stl::vector& result, void* vector_data) const; void searchPendingVectors(const void* query_data, candidatesLabelsMaxHeap& top_candidates, size_t k) const; // Manual control of staged updates @@ -319,9 +321,9 @@ class HNSWDiskIndex : public VecSimIndexAbstract public: // Methods needed by benchmark framework const void *getDataByInternalId(idType id) const; - candidatesMaxHeap searchLayer(idType ep_id, const void *data_point_raw, const void *data_point, size_t level, + vecsim_stl::updatable_max_heap searchLayer(idType ep_id, const void *data_point_raw, const void *data_point, size_t level, size_t ef) const; - candidatesLabelsMaxHeap * searchLayerLabels(idType ep_id, const void *data_point_raw, const void *data_point, size_t level, + vecsim_stl::updatable_max_heap searchLayerLabels(idType ep_id, const void *data_point_raw, const void *data_point, size_t level, size_t ef) const; template void greedySearchLevel(const void *data_point, size_t level, idType &curr_element, @@ -332,10 +334,17 @@ class HNSWDiskIndex : public VecSimIndexAbstract candidatesLabelsMaxHeap *getNewMaxPriorityQueue() const; bool isMarkedDeleted(idType id) const; labelType getExternalLabel(idType id) const; + + // Helper methods for emplacing to heaps (overloaded for idType and labelType) + void emplaceToHeap(vecsim_stl::abstract_priority_queue &heap, DistType dist, + idType id) const; + void emplaceToHeap(vecsim_stl::abstract_priority_queue &heap, + DistType dist, idType id) const; + template - void processCandidate(idType candidate_id, const void* data_point_raw, const void *data_point, size_t level, size_t ef, + void processCandidate(Identifier candidate_id, const void* data_point_raw, const void *data_point, size_t level, size_t ef, std::unordered_set *visited_set, - 
vecsim_stl::abstract_priority_queue &top_candidates, + vecsim_stl::updatable_max_heap &top_candidates, candidatesMaxHeap &candidate_set, DistType &lowerBound) const; // Raw vector storage and retrieval methods @@ -583,25 +592,24 @@ HNSWDiskIndex::topKQuery(const void *query_data, size_t k, return rep; // Empty index or error } - auto *results = searchLayerLabels(bottom_layer_ep, query_data, processed_query , 0, query_ef); + auto results = searchLayerLabels(bottom_layer_ep, query_data, processed_query , 0, query_ef); if (pendingVectorCount > 0) { // Search pending vectors using the helper method - searchPendingVectors(query_data, *results, k); + searchPendingVectors(query_data, results, k); } - while (results->size() > k) { - results->pop(); + while (results.size() > k) { + results.pop(); } - if (!results->empty()) { - rep->results.resize(results->size()); + if (!results.empty()) { + rep->results.resize(results.size()); for (auto result = rep->results.rbegin(); result != rep->results.rend(); result++) { - std::tie(result->score, result->id) = results->top(); - results->pop(); + std::tie(result->score, result->id) = results.top(); + results.pop(); } rep->code = VecSim_QueryReply_OK; // Mark as successful since we found results } - delete results; return rep; } @@ -800,7 +808,7 @@ void HNSWDiskIndex::insertElementToGraph(idType element_id, } for (auto level = static_cast(max_common_level); level >= 0; level--) { - candidatesMaxHeap top_candidates = + vecsim_stl::updatable_max_heap top_candidates = searchLayer(curr_element, raw_vector_data, vector_data, level, efConstruction); // If the entry point was marked deleted between iterations, we may receive an empty @@ -816,7 +824,7 @@ void HNSWDiskIndex::insertElementToGraph(idType element_id, template idType HNSWDiskIndex::mutuallyConnectNewElement( - idType new_node_id, candidatesMaxHeap &top_candidates, size_t level) { + idType new_node_id, vecsim_stl::updatable_max_heap &top_candidates, size_t level) { // The maximum 
number of neighbors allowed for an existing neighbor (not new). size_t max_M_cur = level ? M : M0; @@ -1226,13 +1234,13 @@ bool HNSWDiskIndex::getRawVector(idType id, void* output_buf } template -candidatesMaxHeap +vecsim_stl::updatable_max_heap HNSWDiskIndex::searchLayer(idType ep_id, const void *data_point_raw, const void *data_point, size_t layer, size_t ef) const { std::unordered_set visited_set; - candidatesMaxHeap top_candidates(this->allocator); + vecsim_stl::updatable_max_heap top_candidates(this->allocator); candidatesMaxHeap candidate_set(this->allocator); DistType lowerBound; @@ -1264,19 +1272,19 @@ HNSWDiskIndex::searchLayer(idType ep_id, const void *data_po } template -candidatesLabelsMaxHeap * +vecsim_stl::updatable_max_heap HNSWDiskIndex::searchLayerLabels(idType ep_id, const void *data_point_raw, const void *data_point, size_t layer, size_t ef) const { std::unordered_set visited_set; - candidatesLabelsMaxHeap *top_candidates = getNewMaxPriorityQueue(); + vecsim_stl::updatable_max_heap top_candidates(this->allocator); candidatesMaxHeap candidate_set(this->allocator); DistType lowerBound; if (!isMarkedDeleted(ep_id)) { DistType dist = this->calcDistance(data_point, getDataByInternalId(ep_id)); lowerBound = dist; - top_candidates->emplace(dist, getExternalLabel(ep_id)); + top_candidates.emplace(dist, getExternalLabel(ep_id)); candidate_set.emplace(-dist, ep_id); } else { lowerBound = std::numeric_limits::max(); @@ -1287,13 +1295,13 @@ HNSWDiskIndex::searchLayerLabels(idType ep_id, const void *d while (!candidate_set.empty()) { pair curr_el_pair = candidate_set.top(); - if ((-curr_el_pair.first) > lowerBound && top_candidates->size() >= ef) { + if ((-curr_el_pair.first) > lowerBound && top_candidates.size() >= ef) { break; } candidate_set.pop(); - processCandidate(curr_el_pair.second, data_point_raw, data_point, layer, ef, - &visited_set, *top_candidates, + processCandidate(getExternalLabel(curr_el_pair.second), data_point_raw, data_point, layer, ef, + 
&visited_set, top_candidates, candidate_set, lowerBound); } @@ -1386,9 +1394,22 @@ void HNSWDiskIndex::greedySearchLevel(const void *data_point template candidatesLabelsMaxHeap * HNSWDiskIndex::getNewMaxPriorityQueue() const { - // Use max_priority_queue for single-label disk index + // Use updatable_max_heap to allow updating distances for labels return new (this->allocator) - vecsim_stl::max_priority_queue(this->allocator); + vecsim_stl::updatable_max_heap(this->allocator); +} + +template +void HNSWDiskIndex::emplaceToHeap( + vecsim_stl::abstract_priority_queue &heap, DistType dist, idType id) const { + heap.emplace(dist, id); +} + +template +void HNSWDiskIndex::emplaceToHeap( + vecsim_stl::abstract_priority_queue &heap, DistType dist, + idType id) const { + heap.emplace(dist, getExternalLabel(id)); } template @@ -1412,18 +1433,20 @@ size_t HNSWDiskIndex::getRandomLevel(double reverse_size) { template template void HNSWDiskIndex::processCandidate( - idType curNodeId, const void* data_point_raw, const void *data_point, size_t level, size_t ef, std::unordered_set *visited_set, - vecsim_stl::abstract_priority_queue &top_candidates, + Identifier curNodeId, const void* data_point_raw, const void *data_point, size_t level, size_t ef, std::unordered_set *visited_set, + vecsim_stl::updatable_max_heap &top_candidates, candidatesMaxHeap &candidate_set, DistType &lowerBound) const { assert(visited_set != nullptr); - // Add neighbors to candidate set for further exploration vecsim_stl::vector neighbors(this->allocator); std::vector vector_data(this->inputBlobSize); getNeighborsAndVector(curNodeId, level, neighbors, vector_data.data()); - // Calculate distance to candidate - // DistType dist = this->calcDistanceRaw(data_point_raw, vector_data.data()); - // Add to top candidates if it's one of the best and not marked deleted + // Calculate distance to candidate with raw data + if (top_candidates.exists(curNodeId)) { + + DistType dist = this->calcDistanceRaw(data_point_raw, 
vector_data.data()); + emplaceToHeap(top_candidates, dist, curNodeId); + } if (!neighbors.empty()) { @@ -1640,6 +1663,10 @@ void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, siz } } +template +void HNSWDiskIndex::getNeighborsAndVector(labelType nodeId, size_t level, vecsim_stl::vector& result, void* vector_data) const { + getNeighborsAndVector(labelToIdMap.at(nodeId), level, result, vector_data); +} template void HNSWDiskIndex::searchPendingVectors( diff --git a/src/VecSim/utils/updatable_heap.h b/src/VecSim/utils/updatable_heap.h index f79e78eb5..866a10772 100644 --- a/src/VecSim/utils/updatable_heap.h +++ b/src/VecSim/utils/updatable_heap.h @@ -41,9 +41,12 @@ class updatable_max_heap : public abstract_priority_queue { inline void emplace(Priority p, Value v) override; inline bool empty() const override; inline void pop() override; + inline bool exists(Value v) const; inline const std::pair top() const override; inline size_t size() const override; - + // Random order iteration + const auto begin() const { return this->priorityToValue.begin(); } + const auto end() const { return this->priorityToValue.end(); } private: inline auto top_ptr() const; }; @@ -110,4 +113,8 @@ void updatable_max_heap::emplace(Priority p, Value v) { } } +template +bool updatable_max_heap::exists(Value v) const { + return valueToNode.find(v) != valueToNode.end(); +} } // namespace vecsim_stl From b89111c6e5616cf7fe324fa9fbfed51682320259 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Sun, 7 Dec 2025 10:37:57 +0200 Subject: [PATCH 18/20] make useRaw global --- src/VecSim/algorithms/hnsw/hnsw_disk.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index a8245d4e9..02192d636 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -182,6 +182,8 @@ class HNSWDiskIndex : public VecSimIndexAbstract size_t deleteBatchThreshold = 10; 
vecsim_stl::vector pendingDeleteIds; + bool useRawData = false; + // In-memory graph updates staging (for delayed disk operations) struct GraphUpdate { idType node_id; @@ -1442,7 +1444,7 @@ void HNSWDiskIndex::processCandidate( std::vector vector_data(this->inputBlobSize); getNeighborsAndVector(curNodeId, level, neighbors, vector_data.data()); // Calculate distance to candidate with raw data - if (top_candidates.exists(curNodeId)) { + if (useRawData && top_candidates.exists(curNodeId)) { DistType dist = this->calcDistanceRaw(data_point_raw, vector_data.data()); emplaceToHeap(top_candidates, dist, curNodeId); From 97105823c7f794f8599c37bbf4c34b6a38f949a3 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Tue, 9 Dec 2025 11:01:50 +0200 Subject: [PATCH 19/20] try to fix benchmarks --- .../index_factories/hnsw_disk_factory.cpp | 8 +--- .../index_factories/hnsw_disk_factory.h | 45 ------------------- tests/benchmark/bm_vecsim_basics.h | 14 ++---- .../run_files/bm_hnsw_disk_single_fp32.cpp | 4 +- 4 files changed, 6 insertions(+), 65 deletions(-) diff --git a/src/VecSim/index_factories/hnsw_disk_factory.cpp b/src/VecSim/index_factories/hnsw_disk_factory.cpp index a99224be9..00d6c0224 100644 --- a/src/VecSim/index_factories/hnsw_disk_factory.cpp +++ b/src/VecSim/index_factories/hnsw_disk_factory.cpp @@ -224,7 +224,7 @@ class ManagedRocksDB { } // Destructor: closes DB and optionally cleans up temp directory -ManagedRocksDB::~ManagedRocksDB() { + ~ManagedRocksDB() { // Close DB first (unique_ptr handles this automatically) db.reset(); @@ -253,12 +253,6 @@ ManagedRocksDB::~ManagedRocksDB() { // 2. 
The program exits (static destructor is called) static std::unique_ptr managed_rocksdb; -// Factory function to create a managed RocksDB instance -std::unique_ptr CreateManagedRocksDB(const std::string &checkpoint_dir, - const std::string &temp_dir) { - return std::make_unique(checkpoint_dir, temp_dir); -} - // Helper function to create AbstractIndexInitParams from VecSimParams static AbstractIndexInitParams NewAbstractInitParams(const VecSimParams *params) { const HNSWParams *hnswParams = ¶ms->algoParams.hnswParams; diff --git a/src/VecSim/index_factories/hnsw_disk_factory.h b/src/VecSim/index_factories/hnsw_disk_factory.h index 19e18de8d..d852de722 100644 --- a/src/VecSim/index_factories/hnsw_disk_factory.h +++ b/src/VecSim/index_factories/hnsw_disk_factory.h @@ -23,40 +23,6 @@ namespace HNSWDiskFactory { #ifdef BUILD_TESTS -/** - * RAII wrapper for RocksDB database management. - * Handles automatic cleanup of temporary directories and database resources. - */ -class ManagedRocksDB { -private: - std::unique_ptr db; - rocksdb::ColumnFamilyHandle *cf = nullptr; - std::string temp_dir; - bool cleanup_temp_dir; // Whether to delete temp_dir on destruction - -public: - // Constructor for loading from checkpoint (with temp directory for writes) - // Copies the entire checkpoint to a temp location to ensure the original is never modified - ManagedRocksDB(const std::string &checkpoint_dir, const std::string &temp_path); - - // Constructor for creating new index (permanent location, no cleanup) - ManagedRocksDB(rocksdb::DB *db_ptr, const std::string &db_path); - - // Destructor: closes DB and optionally cleans up temp directory - ~ManagedRocksDB(); - - // Disable copy and move to prevent resource management issues - ManagedRocksDB(const ManagedRocksDB&) = delete; - ManagedRocksDB& operator=(const ManagedRocksDB&) = delete; - ManagedRocksDB(ManagedRocksDB&&) = delete; - ManagedRocksDB& operator=(ManagedRocksDB&&) = delete; - - // Accessors - rocksdb::DB* getDB() const { 
return db.get(); } - rocksdb::ColumnFamilyHandle* getCF() const { return cf; } - const std::string& getTempDir() const { return temp_dir; } -}; - /** * Get the checkpoint directory path for a given index file. * @@ -68,17 +34,6 @@ class ManagedRocksDB { */ std::string GetCheckpointDir(const std::string &location); -/** - * Create a managed RocksDB instance for testing/benchmarking. - * The returned object will automatically clean up the temp directory when destroyed. - * - * @param checkpoint_dir Path to the RocksDB checkpoint directory - * @param temp_dir Path to the temporary directory for the checkpoint copy - * @return std::unique_ptr Managed RocksDB instance - */ -std::unique_ptr CreateManagedRocksDB(const std::string &checkpoint_dir, - const std::string &temp_dir); - /** * Factory function to load a serialized disk-based HNSW index from a file. * diff --git a/tests/benchmark/bm_vecsim_basics.h b/tests/benchmark/bm_vecsim_basics.h index 2bb5b2237..46f64f1d0 100644 --- a/tests/benchmark/bm_vecsim_basics.h +++ b/tests/benchmark/bm_vecsim_basics.h @@ -324,16 +324,9 @@ void BM_VecSimBasics::FlushBatchDisk(benchmark::State &st) { // Create a LOCAL ManagedRocksDB instance for this benchmark // This ensures we don't interfere with the global managed_rocksdb used by other benchmarks std::string folder_path = BM_VecSimGeneral::AttachRootPath(BM_VecSimGeneral::hnsw_index_file); - std::string checkpoint_dir = HNSWDiskFactory::GetCheckpointDir(folder_path); - std::string temp_dir = "/tmp/hnsw_disk_flushbatch_" + std::to_string(getpid()) + - "_" + std::to_string(std::time(nullptr)); - - // Create a local ManagedRocksDB that will be automatically cleaned up when it goes out of scope - auto local_managed_db = HNSWDiskFactory::CreateManagedRocksDB(checkpoint_dir, temp_dir); - - // Create index using the local database - auto hnsw_disk_index = dynamic_cast*>( - HNSWDiskFactory::NewIndex(folder_path, local_managed_db->getDB(), local_managed_db->getCF(), false)); + 
INDICES[INDEX_HNSW_DISK] = IndexPtr(HNSWDiskFactory::NewIndex(folder_path)); + auto hnsw_index = GET_INDEX(INDEX_HNSW_DISK); + auto *hnsw_disk_index = dynamic_cast *>(hnsw_index); size_t flush_threshold = st.range(0); hnsw_disk_index->setBatchThreshold(flush_threshold); @@ -348,7 +341,6 @@ void BM_VecSimBasics::FlushBatchDisk(benchmark::State &st) { // Clean up the index VecSimIndex_Free(hnsw_disk_index); - // local_managed_db will be automatically destroyed here, cleaning up the temp directory } #define UNIT_AND_ITERATIONS Unit(benchmark::kMillisecond)->Iterations(BM_VecSimGeneral::block_size) diff --git a/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp b/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp index bbb02d5e6..62032819c 100644 --- a/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp +++ b/tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp @@ -22,9 +22,9 @@ size_t BM_VecSimGeneral::EF_C = 256; // Dataset file paths - using deep dataset // For HNSW disk, hnsw_index_file points to the folder containing index.hnsw_disk_v1 and rocksdb/ const char *BM_VecSimGeneral::hnsw_index_file = - "tests/benchmark/data/deep-100K-L2-dim96-M32-efc200-disk-vectors"; + "tests/benchmark/data/deep-1M-L2-dim96-M32-efc200-disk-vectors"; const char *BM_VecSimGeneral::test_queries_file = "tests/benchmark/data/deep.query.public.10K.fbin"; -const char *BM_VecSimGeneral::ground_truth_file = "tests/benchmark/data/deep.groundtruth.100K.10K.ibin"; // defined only for this benchmark +const char *BM_VecSimGeneral::ground_truth_file = "tests/benchmark/data/deep.groundtruth.1M.10K.ibin"; // defined only for this benchmark #define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single) #define BM_ADD_LABEL CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single) From bc61a9a62aeecd5de41c2a298ff7ddcb157c73f8 Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Tue, 9 Dec 2025 15:41:51 +0200 Subject: [PATCH 20/20] try to fix benchmarks --- 
src/VecSim/algorithms/hnsw/hnsw_disk.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_disk.h b/src/VecSim/algorithms/hnsw/hnsw_disk.h index 02192d636..39dee75d6 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_disk.h +++ b/src/VecSim/algorithms/hnsw/hnsw_disk.h @@ -1667,7 +1667,14 @@ void HNSWDiskIndex::getNeighborsAndVector(idType nodeId, siz template void HNSWDiskIndex::getNeighborsAndVector(labelType nodeId, size_t level, vecsim_stl::vector& result, void* vector_data) const { - getNeighborsAndVector(labelToIdMap.at(nodeId), level, result, vector_data); + // Check if label exists in the map (it won't if it's been marked as deleted) + auto it = labelToIdMap.find(nodeId); + if (it == labelToIdMap.end()) { + // Label doesn't exist (has been marked as deleted), return empty neighbors + result.clear(); + return; + } + getNeighborsAndVector(it->second, level, result, vector_data); } template