Skip to content

Commit 0f495df

Browse files
authored
updates for time (#27)
Update total_index time calculation to be more reliable.
1 parent 534f84c commit 0f495df

File tree

11 files changed

+1692
-703
lines changed

11 files changed

+1692
-703
lines changed

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ optimization_settings:
178178
# defines weight of each metric in optimization function
179179
metric_weights:
180180
f1: 1
181-
total_indexing_time: 1
181+
total_indexing_time: 1 # weight for total indexing time (seconds to reach 100% indexed)
182182
algorithms: ["hnsw"] # indexing algorithm to be included in the study
183183
vector_data_types: ["float16", "float32"] # data types to be included in the study
184184
distance_metrics: ["cosine"] # distance metrics to be included in the study
@@ -202,6 +202,12 @@ embedding_models:
202202

203203
```
204204

205+
The `total_indexing_time` metric is measured in **seconds** using wall-clock time
206+
from when indexing starts until Redis reports `percent_indexed == 1`. When a
207+
study reuses an existing index without reloading data, the previously measured
208+
indexing time is reused instead of querying `index.info()["total_indexing_time"]`.
209+
210+
205211
#### Code
206212
```python
207213
import os

docs/examples/grid_study/00_grid_study.ipynb

Lines changed: 721 additions & 688 deletions
Large diffs are not rendered by default.

docs/examples/search_study/search_study_walkthrough.ipynb

Lines changed: 847 additions & 0 deletions
Large diffs are not rendered by default.
redis_retrieval_optimizer/__init__.py (filename missing in extraction; inferred from the `__version__` bump — verify against the commit)

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
__version__ = "0.4.2"
1+
__version__ = "0.4.3"
22

33
__all__ = ["__version__"]

redis_retrieval_optimizer/bayes_study.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -122,22 +122,38 @@ def objective(trial, study_config, redis_url, corpus_processor, search_method_ma
122122
dtype=trial_settings.index_settings.vector_data_type,
123123
)
124124

125+
125126
if recreate_data:
126127
logging.info("Recreating index...")
127128
corpus = utils.load_json(study_config.corpus)
128129
corpus_data = corpus_processor(corpus, emb_model)
129130
corpus_size = len(corpus_data)
130131
logging.info(f"Corpus size: {corpus_size}")
131132

132-
# reload data
133+
# reload data and measure wall-clock time until indexing completes
134+
indexing_start_time = time.time()
133135
trial_index.load(corpus_data)
134136

135-
while float(trial_index.info()["percent_indexed"]) < 1:
136-
time.sleep(1)
137-
logging.info(f"Indexing progress: {trial_index.info()['percent_indexed']}")
137+
while float(trial_index.info()["percent_indexed"]) < 1:
138+
time.sleep(1)
139+
logging.info(f"Indexing progress: {trial_index.info()['percent_indexed']}")
140+
else:
141+
# Only wait if index is not fully indexed
142+
if float(trial_index.info()["percent_indexed"]) < 1:
143+
while float(trial_index.info()["percent_indexed"]) < 1:
144+
time.sleep(1)
145+
logging.info(f"Indexing progress: {trial_index.info()['percent_indexed']}")
146+
147+
if recreate_data:
148+
assert indexing_start_time is not None
149+
total_indexing_time = time.time() - indexing_start_time
150+
utils.set_last_indexing_time(redis_url, total_indexing_time)
151+
else:
152+
last_indexing_time = utils.get_last_indexing_time(redis_url)
153+
total_indexing_time = (
154+
last_indexing_time if last_indexing_time is not None else 0.0
155+
)
138156

139-
# capture index metrics
140-
total_indexing_time = float(trial_index.info()["total_indexing_time"])
141157
num_docs = trial_index.info()["num_docs"]
142158

143159
logging.info(f"Data indexed {total_indexing_time=}s, {num_docs=}")

redis_retrieval_optimizer/grid_study.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,12 +112,16 @@ def init_index_from_grid_settings(
112112
# corpus processing functions should be user defined
113113
corpus_data = corpus_processor(corpus, emb_model)
114114

115+
indexing_start_time = time.time()
115116
index.load(corpus_data)
116117

117118
while float(index.info()["percent_indexed"]) < 1:
118119
time.sleep(1)
119120
logging.info(f"Indexing progress: {index.info()['percent_indexed']}")
120121

122+
total_indexing_time = time.time() - indexing_start_time
123+
utils.set_last_indexing_time(redis_url, total_indexing_time)
124+
121125
index_settings["embedding"] = embed_settings.model_dump()
122126
utils.set_last_index_settings(redis_url, index_settings)
123127

@@ -188,6 +192,7 @@ def run_grid_study(
188192

189193
# corpus processing functions should be user defined
190194
corpus_data = corpus_processor(corpus, emb_model)
195+
indexing_start_time = time.time()
191196
index.load(corpus_data)
192197

193198
while float(index.info()["percent_indexed"]) < 1:
@@ -196,6 +201,9 @@ def run_grid_study(
196201
f"Indexing progress: {index.info()['percent_indexed']}"
197202
)
198203

204+
total_indexing_time = time.time() - indexing_start_time
205+
utils.set_last_indexing_time(redis_url, total_indexing_time)
206+
199207
# Get embedding model with current dtype
200208
emb_model = utils.get_embedding_model(
201209
embedding_model, redis_url, dtype=dtype
@@ -220,9 +228,11 @@ def run_grid_study(
220228
qrels, search_method_output.run
221229
)
222230

223-
index_info = index.info()
231+
last_indexing_time = utils.get_last_indexing_time(redis_url)
224232

225-
trial_metrics["total_indexing_time"] = index_info["total_indexing_time"]
233+
trial_metrics["total_indexing_time"] = (
234+
last_indexing_time if last_indexing_time is not None else 0.0
235+
)
226236

227237
memory_stats = utils.get_index_memory_stats(
228238
grid_study_config.index_settings.name,

redis_retrieval_optimizer/search_study.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,10 @@ def run_search_study(
9090

9191
trial_metrics = utils.eval_trial_metrics(qrels, search_method_output.run)
9292

93-
trial_metrics["total_indexing_time"] = index_info["total_indexing_time"]
93+
last_indexing_time = utils.get_last_indexing_time(redis_url)
94+
trial_metrics["total_indexing_time"] = (
95+
last_indexing_time if last_indexing_time is not None else 0.0
96+
)
9497

9598
memory_stats = utils.get_index_memory_stats(
9699
search_study_config.index_name,

redis_retrieval_optimizer/utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,27 @@ def set_last_index_settings(redis_url, index_settings):
6969
client.json().set("ret-opt:last_schema", Path.root_path(), index_settings)
7070

7171

72+
def get_last_indexing_time(redis_url: str) -> float | None:
73+
"""Return the last recorded total indexing time in seconds, if any.
74+
75+
This is stored under a dedicated JSON key so we can reuse the
76+
indexing time across runs where we do not reload data.
77+
"""
78+
client = Redis.from_url(redis_url)
79+
value = client.json().get("ret-opt:last_indexing_time")
80+
return float(value) if value is not None else None
81+
82+
83+
def set_last_indexing_time(redis_url: str, indexing_time: float) -> None:
84+
"""Persist the total indexing time (in seconds) for the current index.
85+
86+
This is used when subsequent runs reuse the existing indexed data
87+
and therefore should reuse the previously measured indexing time.
88+
"""
89+
client = Redis.from_url(redis_url)
90+
client.json().set("ret-opt:last_indexing_time", Path.root_path(), indexing_time)
91+
92+
7293
def check_recreate(index_settings, last_index_settings):
7394
embedding_settings = index_settings.pop("embedding") if index_settings else None
7495
last_embedding_settings = (

tests/integration/test_bayes.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22

3+
import pytest
34
import yaml
45
from redisvl.index import SearchIndex
56

@@ -31,6 +32,16 @@ def test_run_bayes_study(redis_url):
3132

3233
assert metrics.shape[0] == study_config["optimization_settings"]["n_trials"]
3334

35+
# total_indexing_time should be recorded for each trial and persisted
36+
assert "total_indexing_time" in metrics.columns
37+
38+
last_indexing_time = utils.get_last_indexing_time(redis_url)
39+
assert last_indexing_time is not None
40+
assert last_indexing_time > 0.0
41+
42+
# The last trial's recorded indexing time should match the persisted value
43+
assert metrics["total_indexing_time"].iloc[-1] == pytest.approx(last_indexing_time)
44+
3445
for score in metrics["f1"].tolist():
3546
assert score > 0.0
3647

@@ -43,4 +54,5 @@ def test_run_bayes_study(redis_url):
4354

4455
# clean up
4556
index.client.json().delete("ret-opt:last_schema")
57+
index.client.json().delete("ret-opt:last_indexing_time")
4658
index.delete(drop=True)

tests/integration/test_grid.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22

3+
import pytest
34
import yaml
45
from redisvl.index import SearchIndex
56

@@ -44,6 +45,19 @@ def test_run_grid_study(redis_url):
4445
for score in metrics["f1"].tolist():
4546
assert score > 0.0
4647

48+
# total_indexing_time should be recorded and reused across trials
49+
assert "total_indexing_time" in metrics.columns
50+
51+
# With a single vector data type, all trials should share the same
52+
# positive indexing time value.
53+
unique_times = metrics["total_indexing_time"].unique()
54+
assert len(unique_times) == 1
55+
assert unique_times[0] > 0.0
56+
57+
last_indexing_time = utils.get_last_indexing_time(redis_url)
58+
assert last_indexing_time is not None
59+
assert unique_times[0] == pytest.approx(last_indexing_time)
60+
4761
last_schema = utils.get_last_index_settings(redis_url)
4862
assert last_schema is not None
4963

@@ -53,6 +67,7 @@ def test_run_grid_study(redis_url):
5367

5468
# clean up
5569
index.client.json().delete("ret-opt:last_schema")
70+
index.client.json().delete("ret-opt:last_indexing_time")
5671
index.delete(drop=True)
5772

5873

@@ -96,6 +111,17 @@ def test_run_grid_study_with_multiple_dtypes(redis_url):
96111
for score in metrics["f1"].tolist():
97112
assert score > 0.0
98113

114+
# total_indexing_time should be recorded for each dtype and reused
115+
# across search methods for that dtype.
116+
assert "total_indexing_time" in metrics.columns
117+
118+
for dtype in unique_dtypes:
119+
dtype_times = metrics.loc[
120+
metrics["vector_data_type"] == dtype, "total_indexing_time"
121+
]
122+
assert dtype_times.nunique() == 1
123+
assert dtype_times.iloc[0] > 0.0
124+
99125
last_schema = utils.get_last_index_settings(redis_url)
100126
assert last_schema is not None
101127

@@ -105,4 +131,5 @@ def test_run_grid_study_with_multiple_dtypes(redis_url):
105131

106132
# clean up
107133
index.client.json().delete("ret-opt:last_schema")
134+
index.client.json().delete("ret-opt:last_indexing_time")
108135
index.delete(drop=True)

0 commit comments

Comments (0)