Skip to content

Commit dd4a4bc

Browse files
authored
Read bm from zip (#846)
* Added support in zip bm * Change to 1M * install rocksdb in the ci * Change to apt get * stop service * Added unzip * Change the temp folder name * PR review
1 parent fc9b51b commit dd4a4bc

File tree

6 files changed

+207
-38
lines changed

6 files changed

+207
-38
lines changed

.github/workflows/benchmark-runner.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ jobs:
6161
- name: Install benchmark dependencies
6262
run: |
6363
sudo .install/install_script.sh
64-
sudo apt install python3-pip -y
64+
./install_rocksdb_simple.sh
65+
sudo apt-get install -y python3-pip
6566
pip3 install --upgrade pip PyYAML setuptools redisbench-admin
6667
pip3 install -r requirements.txt
6768
- name: Download pre-generated indices

install_rocksdb_simple.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,11 @@ while [[ $# -gt 0 ]]; do
5151
done
5252

5353
echo "Installing RocksDB 10.5.1..."
54-
54+
sudo systemctl stop unattended-upgrades || true
55+
sudo pkill -9 unattended-upgr || true
5556
# Install dependencies
56-
sudo apt update
57-
sudo apt install -y build-essential cmake git wget \
57+
sudo apt-get update
58+
sudo apt-get install -y build-essential cmake git wget unzip \
5859
libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev \
5960
liblz4-dev libzstd-dev libgtest-dev pkg-config
6061

src/VecSim/index_factories/hnsw_disk_factory.cpp

Lines changed: 180 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,38 +16,177 @@
1616
#include <fstream>
1717
#include <filesystem>
1818
#include <unistd.h>
19+
#include <sys/wait.h>
1920
#include <ctime>
2021
#include <cstdlib>
22+
#include <array>
23+
#include <memory>
24+
#include <random>
2125

2226
namespace HNSWDiskFactory {
2327

2428
#ifdef BUILD_TESTS
2529

30+
/**
 * @brief Build a random string drawn from the characters 0-9, A-Z and a-z
 *
 * A fresh std::mt19937 engine is seeded from std::random_device on every call.
 *
 * @param length Number of characters to produce (0 yields an empty string)
 * @return A string of exactly `length` alphanumeric characters
 */
static std::string generate_random_string(size_t length) {
    static constexpr char kAlphabet[] =
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    // sizeof counts the terminating NUL; subtract it, then one more for a zero-based index.
    static constexpr size_t kLastIndex = sizeof(kAlphabet) - 2;

    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_int_distribution<size_t> pick(0, kLastIndex);

    // Allocate the final size once, then overwrite every slot with a random character.
    std::string text(length, '\0');
    for (char &slot : text) {
        slot = kAlphabet[pick(engine)];
    }
    return text;
}
50+
51+
/**
 * @brief Tell whether a file begins with the zip local-file-header signature
 *
 * Only the first four bytes are inspected; the rest of the file is not validated.
 *
 * @param file_path Path to the file to inspect
 * @return true when the file can be opened and its first four bytes are "PK\x03\x04";
 *         false when it cannot be opened, is shorter than four bytes, or starts differently
 */
static bool isZipFile(const std::string &file_path) {
    static constexpr std::array<char, 4> kZipSignature{'P', 'K', '\x03', '\x04'};

    std::ifstream input(file_path, std::ios::binary);
    if (!input.is_open()) {
        return false;
    }

    std::array<char, 4> header{};
    input.read(header.data(), header.size());

    // A short read means the file cannot hold a complete signature.
    if (input.gcount() != 4) {
        return false;
    }
    return header == kZipSignature;
}
67+
68+
/**
 * @brief Quote a string so a POSIX shell passes it through as one literal argument
 *
 * The value is wrapped in single quotes, inside which the shell performs no expansion.
 * Each embedded single quote is emitted as '\'' (close quote, escaped quote, reopen).
 *
 * @param arg Raw argument text
 * @return The single-quoted form of `arg`
 */
static std::string shellSingleQuote(const std::string &arg) {
    std::string quoted;
    quoted.reserve(arg.size() + 2);
    quoted += '\'';
    for (const char ch : arg) {
        if (ch == '\'') {
            quoted += "'\\''";
        } else {
            quoted += ch;
        }
    }
    quoted += '\'';
    return quoted;
}

/**
 * @brief Extract a zip file to a target directory using the system unzip command
 *
 * Both paths are single-quoted before being handed to the shell, so characters such as
 * `$`, backticks, double quotes and backslashes in a path are not interpreted by it.
 *
 * @param zip_path Path to the zip file
 * @param target_dir Directory where the zip contents will be extracted (created if missing)
 * @throws std::runtime_error if the command cannot be launched, exits with a non-zero
 *         code, or is terminated by a signal
 */
static void extractZipToDirectory(const std::string &zip_path, const std::string &target_dir) {
    // Create the target directory if it doesn't exist
    std::filesystem::create_directories(target_dir);

    // -o: overwrite files without prompting
    // -q: quiet mode
    // -d: extract to specified directory
    const std::string command =
        "unzip -o -q " + shellSingleQuote(zip_path) + " -d " + shellSingleQuote(target_dir);

    const int status = std::system(command.c_str());
    if (status == 0) {
        return;
    }

    // std::system returns a wait status, not the exit code itself; decode it so the
    // message shows the real exit code instead of an encoded value.
    std::string reason;
    if (status == -1) {
        reason = "could not launch the shell";
    } else if (WIFEXITED(status)) {
        reason = "exit code: " + std::to_string(WEXITSTATUS(status));
    } else if (WIFSIGNALED(status)) {
        reason = "terminated by signal " + std::to_string(WTERMSIG(status));
    } else {
        reason = "wait status: " + std::to_string(status);
    }

    throw std::runtime_error("Failed to extract zip file: " + zip_path + " (" + reason + ")");
}
91+
2692
// RAII wrapper to manage RocksDB database and temporary directory cleanup
2793
class ManagedRocksDB {
2894
private:
2995
std::unique_ptr<rocksdb::DB> db;
3096
rocksdb::ColumnFamilyHandle *cf = nullptr;
3197
std::string temp_dir;
98+
std::string extracted_folder_path; // Path to the extracted index folder within temp_dir
3299
bool cleanup_temp_dir; // Whether to delete temp_dir on destruction
33100

101+
// Private constructor - use static factory methods
102+
ManagedRocksDB() : cleanup_temp_dir(false) {}
103+
34104
public:
35-
// Constructor for loading from checkpoint (with temp directory for writes)
36-
// Copies the entire checkpoint to a temp location to ensure the original is never modified
37-
ManagedRocksDB(const std::string &checkpoint_dir, const std::string &temp_path)
38-
: temp_dir(temp_path), cleanup_temp_dir(true) {
105+
// Factory method for loading from a zip file (extracts to temp directory)
106+
static std::unique_ptr<ManagedRocksDB> fromZipFile(const std::string &zip_path,
107+
const std::string &temp_path) {
108+
auto instance = std::unique_ptr<ManagedRocksDB>(new ManagedRocksDB());
109+
instance->temp_dir = temp_path;
110+
instance->extracted_folder_path = temp_path;
111+
instance->cleanup_temp_dir = true;
39112

40113
// Create temp directory
41-
std::filesystem::create_directories(temp_dir);
114+
std::filesystem::create_directories(instance->temp_dir);
115+
116+
// Extract the zip file to temp directory
117+
try {
118+
extractZipToDirectory(zip_path, instance->temp_dir);
119+
} catch (const std::exception &e) {
120+
std::filesystem::remove_all(instance->temp_dir);
121+
throw std::runtime_error("Failed to extract zip file: " + std::string(e.what()));
122+
}
123+
124+
// Find the extracted folder - it should contain index.hnsw_disk_v1 and rocksdb/
125+
// The zip might contain the folder at root level or directly contain the files
126+
std::string index_file = instance->temp_dir + "/index.hnsw_disk_v1";
127+
std::string rocksdb_dir = instance->temp_dir + "/rocksdb";
128+
129+
if (!std::filesystem::exists(index_file) || !std::filesystem::exists(rocksdb_dir)) {
130+
// Check if there's a single subdirectory containing the files
131+
for (const auto &entry : std::filesystem::directory_iterator(instance->temp_dir)) {
132+
if (entry.is_directory()) {
133+
std::string sub_index = entry.path().string() + "/index.hnsw_disk_v1";
134+
std::string sub_rocksdb = entry.path().string() + "/rocksdb";
135+
if (std::filesystem::exists(sub_index) &&
136+
std::filesystem::exists(sub_rocksdb)) {
137+
instance->extracted_folder_path = entry.path().string();
138+
break;
139+
}
140+
}
141+
}
142+
}
143+
144+
// Verify the structure exists
145+
if (!std::filesystem::exists(instance->extracted_folder_path + "/index.hnsw_disk_v1") ||
146+
!std::filesystem::exists(instance->extracted_folder_path + "/rocksdb")) {
147+
std::filesystem::remove_all(instance->temp_dir);
148+
throw std::runtime_error(
149+
"Invalid zip structure: expected index.hnsw_disk_v1 and rocksdb/ directory");
150+
}
151+
152+
// Open RocksDB from the extracted checkpoint
153+
std::string checkpoint_dir = instance->extracted_folder_path + "/rocksdb";
154+
rocksdb::Options options;
155+
options.create_if_missing = false;
156+
options.error_if_exists = false;
157+
options.statistics = rocksdb::CreateDBStatistics();
158+
159+
rocksdb::DB *db_ptr = nullptr;
160+
rocksdb::Status status = rocksdb::DB::Open(options, checkpoint_dir, &db_ptr);
161+
if (!status.ok()) {
162+
std::filesystem::remove_all(instance->temp_dir);
163+
throw std::runtime_error("Failed to open RocksDB from extracted checkpoint: " +
164+
status.ToString());
165+
}
166+
167+
instance->db.reset(db_ptr);
168+
instance->cf = instance->db->DefaultColumnFamily();
169+
return instance;
170+
}
171+
172+
// Factory method for loading from checkpoint directory (copies to temp location)
173+
static std::unique_ptr<ManagedRocksDB> fromCheckpointDir(const std::string &checkpoint_dir,
174+
const std::string &temp_path) {
175+
auto instance = std::unique_ptr<ManagedRocksDB>(new ManagedRocksDB());
176+
instance->temp_dir = temp_path;
177+
instance->cleanup_temp_dir = true;
178+
179+
// Create temp directory
180+
std::filesystem::create_directories(instance->temp_dir);
42181

43182
// Copy the entire checkpoint to temp location to preserve the original
44-
std::string temp_checkpoint = temp_dir + "/checkpoint_copy";
183+
std::string temp_checkpoint = instance->temp_dir + "/checkpoint_copy";
45184
try {
46185
std::filesystem::copy(checkpoint_dir, temp_checkpoint,
47186
std::filesystem::copy_options::recursive);
48187
} catch (const std::filesystem::filesystem_error &e) {
49188
// Clean up temp dir if copy failed
50-
std::filesystem::remove_all(temp_dir);
189+
std::filesystem::remove_all(instance->temp_dir);
51190
throw std::runtime_error("Failed to copy checkpoint to temp location: " +
52191
std::string(e.what()));
53192
}
@@ -63,20 +202,25 @@ class ManagedRocksDB {
63202
rocksdb::Status status = rocksdb::DB::Open(options, temp_checkpoint, &db_ptr);
64203
if (!status.ok()) {
65204
// Clean up temp dir if DB open failed
66-
std::filesystem::remove_all(temp_dir);
205+
std::filesystem::remove_all(instance->temp_dir);
67206
throw std::runtime_error("Failed to open RocksDB from temp checkpoint: " +
68207
status.ToString());
69208
}
70209

71-
db.reset(db_ptr);
72-
cf = db->DefaultColumnFamily();
210+
instance->db.reset(db_ptr);
211+
instance->cf = instance->db->DefaultColumnFamily();
212+
return instance;
73213
}
74214

75-
// Constructor for creating new index (permanent location, no cleanup)
76-
ManagedRocksDB(rocksdb::DB *db_ptr, const std::string &db_path)
77-
: temp_dir(db_path), cleanup_temp_dir(false) {
78-
db.reset(db_ptr);
79-
cf = db->DefaultColumnFamily();
215+
// Factory method for creating new index (permanent location, no cleanup)
216+
static std::unique_ptr<ManagedRocksDB> fromExistingDB(rocksdb::DB *db_ptr,
217+
const std::string &db_path) {
218+
auto instance = std::unique_ptr<ManagedRocksDB>(new ManagedRocksDB());
219+
instance->temp_dir = db_path;
220+
instance->cleanup_temp_dir = false;
221+
instance->db.reset(db_ptr);
222+
instance->cf = instance->db->DefaultColumnFamily();
223+
return instance;
80224
}
81225

82226
// Destructor: closes DB and optionally cleans up temp directory
@@ -100,6 +244,7 @@ class ManagedRocksDB {
100244
rocksdb::DB* getDB() const { return db.get(); }
101245
rocksdb::ColumnFamilyHandle* getCF() const { return cf; }
102246
const std::string& getTempDir() const { return temp_dir; }
247+
const std::string& getExtractedFolderPath() const { return extracted_folder_path; }
103248
};
104249

105250
// Static managed RocksDB instance for benchmark convenience wrapper
@@ -180,7 +325,7 @@ VecSimIndex *NewIndex(const VecSimParams *params) {
180325
}
181326

182327
// Store in RAII wrapper (will close DB on exit, but won't delete directory)
183-
managed_rocksdb = std::make_unique<ManagedRocksDB>(db_ptr, dbPath);
328+
managed_rocksdb = ManagedRocksDB::fromExistingDB(db_ptr, dbPath);
184329

185330
// Create AbstractIndexInitParams
186331
AbstractIndexInitParams abstractInitParams = NewAbstractInitParams(params);
@@ -278,7 +423,24 @@ VecSimIndex *NewIndex(const std::string &folder_path, rocksdb::DB *db,
278423
}
279424

280425
VecSimIndex *NewIndex(const std::string &folder_path, bool is_normalized) {
281-
// Get the checkpoint directory path
426+
// Create a temporary directory
427+
// Using PID, timestamp, and a random suffix to ensure uniqueness across multiple benchmark runs
428+
std::string temp_dir = (std::filesystem::temp_directory_path() /
429+
("hnsw_disk_benchmark_" + std::to_string(getpid()) + "_" +
430+
std::to_string(std::time(nullptr)) + "_" + generate_random_string(8))).string();
431+
432+
// Check if the input is a zip file
433+
if (isZipFile(folder_path)) {
434+
// Load from zip file - extract and open RocksDB from extracted location
435+
managed_rocksdb = ManagedRocksDB::fromZipFile(folder_path, temp_dir);
436+
437+
// Use the extracted folder path for loading the index
438+
std::string extracted_path = managed_rocksdb->getExtractedFolderPath();
439+
return NewIndex(extracted_path, managed_rocksdb->getDB(), managed_rocksdb->getCF(),
440+
is_normalized);
441+
}
442+
443+
// Not a zip file - treat as folder path (original behavior)
282444
std::string checkpoint_dir = GetCheckpointDir(folder_path);
283445

284446
if (!std::filesystem::exists(checkpoint_dir)) {
@@ -287,12 +449,7 @@ VecSimIndex *NewIndex(const std::string &folder_path, bool is_normalized) {
287449
"\nMake sure the index was saved with the checkpoint-based format.");
288450
}
289451

290-
// Create a temporary directory for the checkpoint copy
291-
// Using PID and timestamp to ensure uniqueness across multiple benchmark runs
292-
std::string temp_dir = "/tmp/hnsw_disk_benchmark_" + std::to_string(getpid()) +
293-
"_" + std::to_string(std::time(nullptr));
294-
295-
managed_rocksdb = std::make_unique<ManagedRocksDB>(checkpoint_dir, temp_dir);
452+
managed_rocksdb = ManagedRocksDB::fromCheckpointDir(checkpoint_dir, temp_dir);
296453

297454
return NewIndex(folder_path, managed_rocksdb->getDB(), managed_rocksdb->getCF(), is_normalized);
298455
}

src/VecSim/index_factories/hnsw_disk_factory.h

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,18 +70,28 @@ VecSimIndex *NewIndex(const std::string &folder_path, rocksdb::DB *db,
7070

7171
/**
7272
* Convenience wrapper to load a disk-based HNSW index with automatic database management.
73-
* Opens the checkpoint database and loads the index from the specified folder.
73+
* Opens the checkpoint database and loads the index from the specified folder or zip file.
7474
* The original checkpoint is NEVER modified - all operations use a temporary copy.
7575
*
76-
* @param folder_path Path to the folder containing the index
76+
* @param folder_path Path to the folder containing the index, OR path to a zip file
77+
* containing the index folder structure
7778
* @param is_normalized Whether vectors are already normalized (for Cosine metric optimization)
7879
* @return VecSimIndex* Pointer to the loaded HNSWDiskIndex, or throws on error
7980
*
81+
* @note ZIP FILE SUPPORT:
82+
* - If folder_path points to a zip file (detected by magic bytes), it will be
83+
* extracted to a temporary directory before loading
84+
* - The zip file should contain:
85+
* - index.hnsw_disk_v1 (index metadata file)
86+
* - rocksdb/ (RocksDB checkpoint directory)
87+
* Either at root level or inside a single subdirectory
88+
*
8089
* @note CHECKPOINT PRESERVATION:
81-
* - The entire checkpoint is copied to /tmp/hnsw_disk_benchmark_<pid>_<timestamp>/checkpoint_copy
82-
* - All RocksDB operations (reads and writes) use the temporary copy
83-
* - The original checkpoint remains completely unchanged across all benchmark runs
84-
* - This ensures consistent benchmark results when running the same benchmark multiple times
90+
* - For folder input: The checkpoint is copied to <temp_dir>/hnsw_disk_benchmark_<pid>_<timestamp>_<random>/checkpoint_copy
91+
* - For zip input: The zip is extracted to <temp_dir>/hnsw_disk_benchmark_<pid>_<timestamp>_<random>/
92+
* - <temp_dir> is the system temporary directory (std::filesystem::temp_directory_path())
93+
* - All RocksDB operations use the temporary copy
94+
* - The original files remain completely unchanged
8595
*
8696
* @note CLEANUP GUARANTEES:
8797
* - Temporary directory is automatically cleaned up via RAII (ManagedRocksDB destructor)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/deep1b/deep-1M-cosine-dim96-M32-efc200-disk-vectors.zip
1+
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/deep1b/deep-1M-L2-dim96-M32-efc200-disk-vectors.zip
22
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/deep1b/deep.query.public.10K.fbin
33
https://dev.cto.redis.s3.amazonaws.com/VectorSimilarity/deep1b/deep.groundtruth.1M.10K.ibin

tests/benchmark/run_files/bm_hnsw_disk_single_fp32.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,19 @@ bool BM_VecSimGeneral::is_multi = false;
1212
// Only enable HNSW_DISK for this benchmark
1313
uint32_t BM_VecSimGeneral::enabled_index_types = IndexTypeFlags::INDEX_MASK_HNSW_DISK;
1414

15-
// Configure using deep dataset (1M vectors, 96 dimensions)
15+
// Configure using deep dataset (100K vectors, 96 dimensions)
1616
size_t BM_VecSimGeneral::n_queries = 100;
17-
size_t BM_VecSimGeneral::n_vectors = 1000000;
17+
size_t BM_VecSimGeneral::n_vectors = 100000;
1818
size_t BM_VecSimGeneral::dim = 96;
1919
size_t BM_VecSimGeneral::M = 32;
2020
size_t BM_VecSimGeneral::EF_C = 256;
2121

2222
// Dataset file paths - using deep dataset
2323
// For HNSW disk, hnsw_index_file points to the folder containing index.hnsw_disk_v1 and rocksdb/
2424
const char *BM_VecSimGeneral::hnsw_index_file =
25-
"tests/benchmark/data/deep-1M-cosine-dim96-M32-efc200-disk-vectors";
25+
"tests/benchmark/data/deep-100K-L2-dim96-M32-efc200-disk-vectors";
2626
const char *BM_VecSimGeneral::test_queries_file = "tests/benchmark/data/deep.query.public.10K.fbin";
27-
const char *BM_VecSimGeneral::ground_truth_file = "tests/benchmark/data/deep.groundtruth.1M.10K.ibin"; // defined only for this benchmark
27+
const char *BM_VecSimGeneral::ground_truth_file = "tests/benchmark/data/deep.groundtruth.100K.10K.ibin"; // defined only for this benchmark
2828

2929
#define BM_FUNC_NAME(bm_func, algo) CONCAT_WITH_UNDERSCORE_ARCH(bm_func, algo, Single)
3030
#define BM_ADD_LABEL CONCAT_WITH_UNDERSCORE_ARCH(AddLabel, Single)

0 commit comments

Comments
 (0)