1616#include < fstream>
1717#include < filesystem>
1818#include < unistd.h>
19+ #include < sys/wait.h>
1920#include < ctime>
2021#include < cstdlib>
22+ #include < array>
23+ #include < memory>
24+ #include < random>
2125
2226namespace HNSWDiskFactory {
2327
2428#ifdef BUILD_TESTS
2529
/**
 * @brief Build a string of randomly chosen alphanumeric characters.
 *
 * Every call creates a new std::mt19937 engine seeded from std::random_device
 * and draws each character uniformly from the set [0-9A-Za-z].
 *
 * @param length Number of characters to produce (0 yields an empty string)
 * @return A string of exactly @p length alphanumeric characters
 */
static std::string generate_random_string(size_t length) {
    static const char alphabet[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    // sizeof counts the terminating '\0', so the last usable index is sizeof - 2.
    static const size_t last_index = sizeof(alphabet) - 2;

    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_int_distribution<size_t> pick(0, last_index);

    // Allocate the final size up front, then overwrite every slot.
    std::string out(length, '\0');
    for (char &slot : out) {
        slot = alphabet[pick(engine)];
    }
    return out;
}
50+
/**
 * @brief Tell whether a file looks like a zip archive, judging by its first four bytes.
 * @param file_path Path of the file to inspect
 * @return true only when the file can be opened, four bytes can be read from it,
 *         and those bytes are "PK\x03\x04"; false in every other case
 *         (including a file that cannot be opened or is shorter than four bytes)
 */
static bool isZipFile(const std::string &file_path) {
    // Bytes a zip archive starts with: "PK\x03\x04" (0x504B0304).
    const std::array<char, 4> zip_signature{{'P', 'K', '\x03', '\x04'}};

    std::ifstream input(file_path, std::ios::binary);
    if (!input.is_open()) {
        return false;
    }

    std::array<char, 4> header{};
    input.read(header.data(), 4);
    if (input.gcount() != 4) {
        return false; // fewer bytes available than the signature needs
    }
    return header == zip_signature;
}
67+
/**
 * @brief Quote a string so a POSIX shell passes it on as one literal argument.
 *
 * The value is wrapped in single quotes; each embedded single quote is written
 * as '\'' (close quote, escaped quote, reopen quote). Inside single quotes the
 * shell performs no expansion, so characters such as ", $, ` and \ stay literal.
 *
 * @param arg Raw argument text
 * @return The quoted form, safe to splice into a command line for std::system
 */
static std::string shellQuoteArgument(const std::string &arg) {
    std::string quoted;
    quoted.reserve(arg.size() + 2);
    quoted += '\'';
    for (const char c : arg) {
        if (c == '\'') {
            quoted += "'\\''";
        } else {
            quoted += c;
        }
    }
    quoted += '\'';
    return quoted;
}

/**
 * @brief Extract a zip file to a target directory using the system unzip command
 *
 * Both paths are shell-quoted before being placed on the command line, so paths
 * containing quotes, '$', backticks or spaces cannot alter the command.
 *
 * @param zip_path Path to the zip file
 * @param target_dir Directory where the zip contents will be extracted
 *                   (created, including parents, if it does not exist)
 * @throws std::runtime_error if the command cannot be run, is killed by a
 *         signal, or exits with a non-zero code; the message carries the
 *         decoded exit code rather than the raw wait status
 */
static void extractZipToDirectory(const std::string &zip_path, const std::string &target_dir) {
    // Create the target directory if it doesn't exist
    std::filesystem::create_directories(target_dir);

    // Build the unzip command
    // -o: overwrite files without prompting
    // -q: quiet mode
    // -d: extract to specified directory
    const std::string command =
        "unzip -o -q " + shellQuoteArgument(zip_path) + " -d " + shellQuoteArgument(target_dir);

    // std::system returns a wait status on POSIX, not the child's exit code.
    const int status = std::system(command.c_str());
    if (status == 0) {
        return;
    }

    std::string reason;
    if (status == -1) {
        reason = "could not run the shell";
    } else if (WIFEXITED(status)) {
        reason = "exit code: " + std::to_string(WEXITSTATUS(status));
    } else if (WIFSIGNALED(status)) {
        reason = "terminated by signal " + std::to_string(WTERMSIG(status));
    } else {
        reason = "wait status: " + std::to_string(status);
    }
    throw std::runtime_error("Failed to extract zip file: " + zip_path + " (" + reason + ")");
}
91+
2692// RAII wrapper to manage RocksDB database and temporary directory cleanup
2793class ManagedRocksDB {
2894private:
2995 std::unique_ptr<rocksdb::DB> db;
3096 rocksdb::ColumnFamilyHandle *cf = nullptr ;
3197 std::string temp_dir;
98+ std::string extracted_folder_path; // Path to the extracted index folder within temp_dir
3299 bool cleanup_temp_dir; // Whether to delete temp_dir on destruction
33100
101+ // Private constructor - use static factory methods
102+ ManagedRocksDB () : cleanup_temp_dir(false ) {}
103+
34104public:
35- // Constructor for loading from checkpoint (with temp directory for writes)
36- // Copies the entire checkpoint to a temp location to ensure the original is never modified
37- ManagedRocksDB (const std::string &checkpoint_dir, const std::string &temp_path)
38- : temp_dir(temp_path), cleanup_temp_dir(true ) {
105+ // Factory method for loading from a zip file (extracts to temp directory)
106+ static std::unique_ptr<ManagedRocksDB> fromZipFile (const std::string &zip_path,
107+ const std::string &temp_path) {
108+ auto instance = std::unique_ptr<ManagedRocksDB>(new ManagedRocksDB ());
109+ instance->temp_dir = temp_path;
110+ instance->extracted_folder_path = temp_path;
111+ instance->cleanup_temp_dir = true ;
39112
40113 // Create temp directory
41- std::filesystem::create_directories (temp_dir);
114+ std::filesystem::create_directories (instance->temp_dir );
115+
116+ // Extract the zip file to temp directory
117+ try {
118+ extractZipToDirectory (zip_path, instance->temp_dir );
119+ } catch (const std::exception &e) {
120+ std::filesystem::remove_all (instance->temp_dir );
121+ throw std::runtime_error (" Failed to extract zip file: " + std::string (e.what ()));
122+ }
123+
124+ // Find the extracted folder - it should contain index.hnsw_disk_v1 and rocksdb/
125+ // The zip might contain the folder at root level or directly contain the files
126+ std::string index_file = instance->temp_dir + " /index.hnsw_disk_v1" ;
127+ std::string rocksdb_dir = instance->temp_dir + " /rocksdb" ;
128+
129+ if (!std::filesystem::exists (index_file) || !std::filesystem::exists (rocksdb_dir)) {
130+ // Check if there's a single subdirectory containing the files
131+ for (const auto &entry : std::filesystem::directory_iterator (instance->temp_dir )) {
132+ if (entry.is_directory ()) {
133+ std::string sub_index = entry.path ().string () + " /index.hnsw_disk_v1" ;
134+ std::string sub_rocksdb = entry.path ().string () + " /rocksdb" ;
135+ if (std::filesystem::exists (sub_index) &&
136+ std::filesystem::exists (sub_rocksdb)) {
137+ instance->extracted_folder_path = entry.path ().string ();
138+ break ;
139+ }
140+ }
141+ }
142+ }
143+
144+ // Verify the structure exists
145+ if (!std::filesystem::exists (instance->extracted_folder_path + " /index.hnsw_disk_v1" ) ||
146+ !std::filesystem::exists (instance->extracted_folder_path + " /rocksdb" )) {
147+ std::filesystem::remove_all (instance->temp_dir );
148+ throw std::runtime_error (
149+ " Invalid zip structure: expected index.hnsw_disk_v1 and rocksdb/ directory" );
150+ }
151+
152+ // Open RocksDB from the extracted checkpoint
153+ std::string checkpoint_dir = instance->extracted_folder_path + " /rocksdb" ;
154+ rocksdb::Options options;
155+ options.create_if_missing = false ;
156+ options.error_if_exists = false ;
157+ options.statistics = rocksdb::CreateDBStatistics ();
158+
159+ rocksdb::DB *db_ptr = nullptr ;
160+ rocksdb::Status status = rocksdb::DB::Open (options, checkpoint_dir, &db_ptr);
161+ if (!status.ok ()) {
162+ std::filesystem::remove_all (instance->temp_dir );
163+ throw std::runtime_error (" Failed to open RocksDB from extracted checkpoint: " +
164+ status.ToString ());
165+ }
166+
167+ instance->db .reset (db_ptr);
168+ instance->cf = instance->db ->DefaultColumnFamily ();
169+ return instance;
170+ }
171+
172+ // Factory method for loading from checkpoint directory (copies to temp location)
173+ static std::unique_ptr<ManagedRocksDB> fromCheckpointDir (const std::string &checkpoint_dir,
174+ const std::string &temp_path) {
175+ auto instance = std::unique_ptr<ManagedRocksDB>(new ManagedRocksDB ());
176+ instance->temp_dir = temp_path;
177+ instance->cleanup_temp_dir = true ;
178+
179+ // Create temp directory
180+ std::filesystem::create_directories (instance->temp_dir );
42181
43182 // Copy the entire checkpoint to temp location to preserve the original
44- std::string temp_checkpoint = temp_dir + " /checkpoint_copy" ;
183+ std::string temp_checkpoint = instance-> temp_dir + " /checkpoint_copy" ;
45184 try {
46185 std::filesystem::copy (checkpoint_dir, temp_checkpoint,
47186 std::filesystem::copy_options::recursive);
48187 } catch (const std::filesystem::filesystem_error &e) {
49188 // Clean up temp dir if copy failed
50- std::filesystem::remove_all (temp_dir);
189+ std::filesystem::remove_all (instance-> temp_dir );
51190 throw std::runtime_error (" Failed to copy checkpoint to temp location: " +
52191 std::string (e.what ()));
53192 }
@@ -63,20 +202,25 @@ class ManagedRocksDB {
63202 rocksdb::Status status = rocksdb::DB::Open (options, temp_checkpoint, &db_ptr);
64203 if (!status.ok ()) {
65204 // Clean up temp dir if DB open failed
66- std::filesystem::remove_all (temp_dir);
205+ std::filesystem::remove_all (instance-> temp_dir );
67206 throw std::runtime_error (" Failed to open RocksDB from temp checkpoint: " +
68207 status.ToString ());
69208 }
70209
71- db.reset (db_ptr);
72- cf = db->DefaultColumnFamily ();
210+ instance->db .reset (db_ptr);
211+ instance->cf = instance->db ->DefaultColumnFamily ();
212+ return instance;
73213 }
74214
75- // Constructor for creating new index (permanent location, no cleanup)
76- ManagedRocksDB (rocksdb::DB *db_ptr, const std::string &db_path)
77- : temp_dir(db_path), cleanup_temp_dir(false ) {
78- db.reset (db_ptr);
79- cf = db->DefaultColumnFamily ();
215+ // Factory method for creating new index (permanent location, no cleanup)
216+ static std::unique_ptr<ManagedRocksDB> fromExistingDB (rocksdb::DB *db_ptr,
217+ const std::string &db_path) {
218+ auto instance = std::unique_ptr<ManagedRocksDB>(new ManagedRocksDB ());
219+ instance->temp_dir = db_path;
220+ instance->cleanup_temp_dir = false ;
221+ instance->db .reset (db_ptr);
222+ instance->cf = instance->db ->DefaultColumnFamily ();
223+ return instance;
80224 }
81225
82226 // Destructor: closes DB and optionally cleans up temp directory
@@ -100,6 +244,7 @@ class ManagedRocksDB {
100244 rocksdb::DB* getDB () const { return db.get (); }
101245 rocksdb::ColumnFamilyHandle* getCF () const { return cf; }
102246 const std::string& getTempDir () const { return temp_dir; }
247+ const std::string& getExtractedFolderPath () const { return extracted_folder_path; }
103248};
104249
105250// Static managed RocksDB instance for benchmark convenience wrapper
@@ -180,7 +325,7 @@ VecSimIndex *NewIndex(const VecSimParams *params) {
180325 }
181326
182327 // Store in RAII wrapper (will close DB on exit, but won't delete directory)
183- managed_rocksdb = std::make_unique<ManagedRocksDB> (db_ptr, dbPath);
328+ managed_rocksdb = ManagedRocksDB::fromExistingDB (db_ptr, dbPath);
184329
185330 // Create AbstractIndexInitParams
186331 AbstractIndexInitParams abstractInitParams = NewAbstractInitParams (params);
@@ -278,7 +423,24 @@ VecSimIndex *NewIndex(const std::string &folder_path, rocksdb::DB *db,
278423}
279424
280425VecSimIndex *NewIndex (const std::string &folder_path, bool is_normalized) {
281- // Get the checkpoint directory path
426+ // Create a temporary directory
427+ // Using PID and timestamp to ensure uniqueness across multiple benchmark runs
428+ std::string temp_dir = (std::filesystem::temp_directory_path () /
429+ (" hnsw_disk_benchmark_" + std::to_string (getpid ()) + " _" +
430+ std::to_string (std::time (nullptr )) + " _" + generate_random_string (8 ))).string ();
431+
432+ // Check if the input is a zip file
433+ if (isZipFile (folder_path)) {
434+ // Load from zip file - extract and open RocksDB from extracted location
435+ managed_rocksdb = ManagedRocksDB::fromZipFile (folder_path, temp_dir);
436+
437+ // Use the extracted folder path for loading the index
438+ std::string extracted_path = managed_rocksdb->getExtractedFolderPath ();
439+ return NewIndex (extracted_path, managed_rocksdb->getDB (), managed_rocksdb->getCF (),
440+ is_normalized);
441+ }
442+
443+ // Not a zip file - treat as folder path (original behavior)
282444 std::string checkpoint_dir = GetCheckpointDir (folder_path);
283445
284446 if (!std::filesystem::exists (checkpoint_dir)) {
@@ -287,12 +449,7 @@ VecSimIndex *NewIndex(const std::string &folder_path, bool is_normalized) {
287449 " \n Make sure the index was saved with the checkpoint-based format." );
288450 }
289451
290- // Create a temporary directory for the checkpoint copy
291- // Using PID and timestamp to ensure uniqueness across multiple benchmark runs
292- std::string temp_dir = " /tmp/hnsw_disk_benchmark_" + std::to_string (getpid ()) +
293- " _" + std::to_string (std::time (nullptr ));
294-
295- managed_rocksdb = std::make_unique<ManagedRocksDB>(checkpoint_dir, temp_dir);
452+ managed_rocksdb = ManagedRocksDB::fromCheckpointDir (checkpoint_dir, temp_dir);
296453
297454 return NewIndex (folder_path, managed_rocksdb->getDB (), managed_rocksdb->getCF (), is_normalized);
298455}
0 commit comments