Skip to content

Commit aad853d

Browse files
authored
Introduce tiered HNSW index MOD-4300 MOD-4299 (#278)
* Implement + test abstract and instance of HNSW tiered index
1 parent 0e8d38a commit aad853d

16 files changed

+336
-12
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
cmake_minimum_required(VERSION 3.10)
22
cmake_policy(SET CMP0077 NEW)
33

4+
set(CMAKE_CXX_STANDARD 20)
45
option(VECSIM_BUILD_TESTS "Build tests" ON)
56

67
# option(VECSIM_STATIC "Build as static library" OFF)

src/VecSim/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.10)
22
cmake_policy(SET CMP0077 NEW)
33
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
44

5+
set(CMAKE_CXX_STANDARD 20)
6+
57
project(VecsimLib)
68
file(GLOB_RECURSE headers ./**.h)
79
set(HEADER_LIST "${headers}")

src/VecSim/algorithms/brute_force/brute_force.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ class BruteForceIndex : public VecSimIndexAbstract<DistType> {
3939
vecsim_stl::vector<DistType> computeBlockScores(VectorBlock *block, const void *queryBlob,
4040
void *timeoutCtx,
4141
VecSimQueryResult_Code *rc) const;
42+
inline DataType *getDataByInternalId(idType id) const {
43+
return (DataType *)vectorBlocks.at(id / this->blockSize)->getVector(id % this->blockSize);
44+
}
4245
virtual VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k,
4346
VecSimQueryParams *queryParams) override;
4447
VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
@@ -60,9 +63,6 @@ class BruteForceIndex : public VecSimIndexAbstract<DistType> {
6063
// Private internal function that implements generic single vector deletion.
6164
virtual int removeVector(idType id);
6265

63-
inline DataType *getDataByInternalId(idType id) const {
64-
return (DataType *)vectorBlocks.at(id / this->blockSize)->getVector(id % this->blockSize);
65-
}
6666
inline VectorBlock *getVectorVectorBlock(idType id) const {
6767
return vectorBlocks.at(id / this->blockSize);
6868
}

src/VecSim/algorithms/hnsw/hnsw_batch_iterator.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ HNSW_BatchIterator<DataType, DistType>::HNSW_BatchIterator(
7676
index(index), depleted(false), top_candidates_extras(this->allocator),
7777
candidates(this->allocator) {
7878

79-
this->dist_func = index->GetDistFunc();
80-
this->dim = index->GetDim();
79+
this->dist_func = index->getDistFunc();
80+
this->dim = index->getDim();
8181
this->entry_point = index->getEntryPointId();
8282
// Use "fresh" tag to mark nodes that were visited along the search in some iteration.
8383
this->visited_list = index->getVisitedList();

src/VecSim/algorithms/hnsw/hnsw_factory.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@
88
#include "VecSim/algorithms/hnsw/hnsw_multi.h"
99
#include "VecSim/algorithms/hnsw/hnsw_factory.h"
1010
#include "VecSim/algorithms/hnsw/hnsw.h"
11+
#include "hnsw_tiered.h"
1112

1213
namespace HNSWFactory {
1314

1415
template <typename DataType, typename DistType = DataType>
15-
inline VecSimIndex *NewIndex_ChooseMultiOrSingle(const HNSWParams *params,
16-
std::shared_ptr<VecSimAllocator> allocator) {
16+
inline HNSWIndex<DataType, DistType> *
17+
NewIndex_ChooseMultiOrSingle(const HNSWParams *params, std::shared_ptr<VecSimAllocator> allocator) {
1718
// check if single and return new hnsw_index
1819
if (params->multi)
1920
return new (allocator) HNSWIndex_Multi<DataType, DistType>(params, allocator);
@@ -127,6 +128,21 @@ size_t EstimateElementSize(const HNSWParams *params) {
127128
return size_meta_data + size_total_data_per_element;
128129
}
129130

131+
VecSimIndex *NewTieredIndex(const TieredHNSWParams *params,
132+
std::shared_ptr<VecSimAllocator> allocator) {
133+
if (params->hnswParams.type == VecSimType_FLOAT32) {
134+
auto *hnsw_index =
135+
NewIndex_ChooseMultiOrSingle<float, float>(&params->hnswParams, allocator);
136+
return new (allocator) TieredHNSWIndex<float, float>(hnsw_index, params->tieredParams);
137+
} else if (params->hnswParams.type == VecSimType_FLOAT64) {
138+
auto *hnsw_index =
139+
NewIndex_ChooseMultiOrSingle<double, double>(&params->hnswParams, allocator);
140+
return new (allocator) TieredHNSWIndex<double, double>(hnsw_index, params->tieredParams);
141+
} else {
142+
return nullptr; // Invalid type
143+
}
144+
}
145+
130146
#ifdef BUILD_TESTS
131147

132148
template <typename DataType, typename DistType = DataType>

src/VecSim/algorithms/hnsw/hnsw_factory.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
namespace HNSWFactory {
1717

1818
VecSimIndex *NewIndex(const HNSWParams *params, std::shared_ptr<VecSimAllocator> allocator);
19+
VecSimIndex *NewTieredIndex(const TieredHNSWParams *params,
20+
std::shared_ptr<VecSimAllocator> allocator);
1921
size_t EstimateInitialSize(const HNSWParams *params);
2022
size_t EstimateElementSize(const HNSWParams *params);
2123

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#pragma once
2+
3+
#include "VecSim/vec_sim_tiered_index.h"
4+
#include "hnsw.h"
5+
#include "hnsw_factory.h"
6+
7+
#include <unordered_map>
8+
9+
template <typename DataType, typename DistType>
10+
class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
11+
private:
12+
/// Mappings from id/label to associated jobs, for invalidating and update ids if necessary.
13+
// In MULTI, we can have more than one insert job pending per label
14+
std::unordered_map<labelType, std::vector<HNSWInsertJob *>> labelToInsertJobs;
15+
std::unordered_map<idType, std::vector<HNSWRepairJob *>> idToRepairJobs;
16+
std::unordered_map<idType, HNSWSwapJob *> idToSwapJob;
17+
18+
// Todo: implement these methods later on
19+
void executeInsertJob(HNSWInsertJob *job) {}
20+
void executeRepairJob(HNSWRepairJob *job) {}
21+
22+
// To be executed synchronously upon deleting a vector, doesn't require a wrapper.
23+
void executeSwapJob(HNSWSwapJob *job) {}
24+
25+
// Wrappers static functions to be sent as callbacks upon creating the jobs (since members
26+
// functions cannot serve as callback, this serve as the "gateway" to the appropriate index).
27+
static void executeInsertJobWrapper(HNSWInsertJob *job) {
28+
reinterpret_cast<TieredHNSWIndex<DataType, DistType> *>(job->index)->executeInsertJob(job);
29+
}
30+
static void executeRepairJobWrapper(HNSWRepairJob *job) {
31+
reinterpret_cast<TieredHNSWIndex<DataType, DistType> *>(job->index)->executeRepairJob(job);
32+
}
33+
34+
#ifdef BUILD_TESTS
35+
#include "VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h"
36+
#endif
37+
38+
public:
39+
TieredHNSWIndex(HNSWIndex<DataType, DistType> *hnsw_index, TieredIndexParams tieredParams)
40+
: VecSimTieredIndex<DataType, DistType>(hnsw_index, tieredParams) {}
41+
virtual ~TieredHNSWIndex() = default;
42+
43+
// TODO: Implement the actual methods instead of these temporary ones.
44+
int addVector(const void *blob, labelType id) override {
45+
return this->index->addVector(blob, id);
46+
}
47+
int deleteVector(labelType id) override { return this->index->deleteVector(id); }
48+
double getDistanceFrom(labelType id, const void *blob) const override {
49+
return this->index->getDistanceFrom(id, blob);
50+
}
51+
size_t indexSize() const override { return this->index->indexSize(); }
52+
size_t indexLabelCount() const override { return this->index->indexLabelCount(); }
53+
VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k,
54+
VecSimQueryParams *queryParams) override {
55+
return this->index->topKQuery(queryBlob, k, queryParams);
56+
}
57+
VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
58+
VecSimQueryParams *queryParams) override {
59+
return this->index->rangeQuery(queryBlob, radius, queryParams);
60+
}
61+
VecSimIndexInfo info() const override { return this->index->info(); }
62+
VecSimInfoIterator *infoIterator() const override { return this->index->infoIterator(); }
63+
VecSimBatchIterator *newBatchIterator(const void *queryBlob,
64+
VecSimQueryParams *queryParams) const override {
65+
return this->index->newBatchIterator(queryBlob, queryParams);
66+
}
67+
bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override {
68+
return this->index->preferAdHocSearch(subsetSize, k, initial_check);
69+
}
70+
inline void setLastSearchMode(VecSearchMode mode) override {
71+
return this->index->setLastSearchMode(mode);
72+
}
73+
};
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#include "VecSim/friend_test_decl.h"
2+
INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_CreateIndexInstance_Test)

src/VecSim/vec_sim_common.h

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,13 @@ typedef enum {
6666
VecSimParamResolverErr_InvalidPolicy_AdHoc_With_EfRuntime
6767
} VecSimResolveCode;
6868

69+
/**
70+
* Callback signatures for asynchronous tiered index.
71+
*/
72+
typedef int (*SubmitCB)(void *job_queue, void **jobs, size_t jobs_len);
73+
typedef int (*UpdateMemoryCB)(void *memory_ctx, size_t memory);
74+
typedef void (*JobCallback)(void *);
75+
6976
/**
7077
* @brief Index initialization parameters.
7178
*
@@ -92,14 +99,81 @@ typedef struct {
9299
size_t blockSize;
93100
} BFParams;
94101

102+
// A struct that contains the common tiered index params.
103+
typedef struct {
104+
void *jobQueue; // External queue that holds the jobs.
105+
SubmitCB submitCb; // A callback that submits an array of jobs into a given jobQueue.
106+
void *memoryCtx; // External context that stores the index memory consumption.
107+
UpdateMemoryCB UpdateMemCb; // A callback that updates the memoryCtx
108+
// with a given memory (number).
109+
} TieredIndexParams;
110+
111+
typedef struct {
112+
HNSWParams hnswParams;
113+
TieredIndexParams tieredParams;
114+
} TieredHNSWParams;
115+
95116
typedef struct {
96117
VecSimAlgo algo; // Algorithm to use.
97118
union {
98119
HNSWParams hnswParams;
99120
BFParams bfParams;
121+
TieredHNSWParams tieredHNSWParams;
100122
};
101123
} VecSimParams;
102124

125+
/**
126+
* The specific job types in use (to be extended in the future by demand)
127+
*/
128+
typedef enum {
129+
HNSW_INSERT_VECTOR_JOB,
130+
HNSW_REPAIR_NODE_CONNECTIONS_JOB,
131+
HNSW_SEARCH_JOB,
132+
HNSW_SWAP_JOB,
133+
INVALID_JOB // to indicate that finding a JobType >= INVALID_JOB is an error
134+
} JobType;
135+
136+
/**
137+
* Definition of generic job structure for asynchronous tiered index.
138+
*/
139+
typedef struct AsyncJob {
140+
JobType jobType;
141+
JobCallback Execute; // A callback that receives a job as its input and executes the job.
142+
} AsyncJob;
143+
144+
/**
145+
* Definition of a job that inserts a new vector from flat into HNSW Index.
146+
*/
147+
typedef struct HNSWInsertJob {
148+
AsyncJob base;
149+
void *index;
150+
labelType label;
151+
idType id;
152+
} HNSWInsertJob;
153+
154+
/**
155+
* Definition of a job that swaps last id with a deleted id in HNSW Index after delete operation.
156+
*/
157+
typedef struct HNSWSwapJob {
158+
AsyncJob base;
159+
void *index;
160+
idType deleted_id;
161+
long pending_repair_jobs_counter; // number of repair jobs left to complete before this job
162+
// is ready to be executed (atomic counter).
163+
} HNSWSwapJob;
164+
165+
/**
166+
* Definition of a job that repairs a certain node's connection in HNSW Index after delete
167+
* operation.
168+
*/
169+
typedef struct HNSWRepairJob {
170+
AsyncJob base;
171+
void *index;
172+
idType node_id;
173+
unsigned short level;
174+
HNSWSwapJob *assosiated_swap_job;
175+
} HNSWRepairJob;
176+
103177
typedef struct {
104178
size_t efRuntime; // EF parameter for HNSW graph accuracy/latency for search.
105179
double epsilon; // Epsilon parameter for HNSW graph accuracy/latency for range search.

src/VecSim/vec_sim_index.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,10 @@ struct VecSimIndexAbstract : public VecSimIndexInterface {
5454
*/
5555
virtual ~VecSimIndexAbstract() {}
5656

57-
inline dist_func_t<DistType> GetDistFunc() const { return dist_func; }
58-
inline size_t GetDim() const { return dim; }
57+
inline dist_func_t<DistType> getDistFunc() const { return dist_func; }
58+
inline size_t getDim() const { return dim; }
5959
inline void setLastSearchMode(VecSearchMode mode) override { this->last_mode = mode; }
6060
inline bool isMultiValue() const { return isMulti; }
61+
inline VecSimType getType() const { return vecType; }
62+
inline VecSimMetric getMetric() const { return metric; }
6163
};

0 commit comments

Comments
 (0)