8 changes: 7 additions & 1 deletion README.md
@@ -178,7 +178,7 @@ optimization_settings:
   # defines weight of each metric in optimization function
   metric_weights:
     f1: 1
-    total_indexing_time: 1
+    total_indexing_time: 1 # weight for total indexing time (seconds to reach 100% indexed)
   algorithms: ["hnsw"] # indexing algorithm to be included in the study
   vector_data_types: ["float16", "float32"] # data types to be included in the study
   distance_metrics: ["cosine"] # distance metrics to be included in the study
@@ -202,6 +202,12 @@ embedding_models:

 ```
 
+The `total_indexing_time` metric is measured in **seconds** using wall-clock time
+from when indexing starts until Redis reports `percent_indexed == 1`. When a
+study reuses an existing index without reloading data, the previously measured
+indexing time is reused instead of querying `index.info()["total_indexing_time"]`.
+
 #### Code
 ```python
 import os
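The README paragraph above describes the wall-clock measurement. A minimal sketch of that loop, assuming a `redisvl` `SearchIndex` already configured as `index` and pre-embedded documents in `corpus_data` (both names stand in for whatever the study config produces):

```python
import time

# Start the clock just before loading documents into the index.
indexing_start_time = time.time()
index.load(corpus_data)

# Poll until Redis reports the index is fully built.
while float(index.info()["percent_indexed"]) < 1:
    time.sleep(1)

# Wall-clock seconds from load start to fully indexed.
total_indexing_time = time.time() - indexing_start_time
```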
1,409 changes: 721 additions & 688 deletions docs/examples/grid_study/00_grid_study.ipynb

Large diffs are not rendered by default.

847 changes: 847 additions & 0 deletions docs/examples/search_study/search_study_walkthrough.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion redis_retrieval_optimizer/__init__.py
@@ -1,3 +1,3 @@
__version__ = "0.4.2"
__version__ = "0.4.3"

all = ["__version__"]
28 changes: 22 additions & 6 deletions redis_retrieval_optimizer/bayes_study.py
@@ -122,22 +122,38 @@ def objective(trial, study_config, redis_url, corpus_processor, search_method_ma
         dtype=trial_settings.index_settings.vector_data_type,
     )
 
     if recreate_data:
         logging.info("Recreating index...")
         corpus = utils.load_json(study_config.corpus)
         corpus_data = corpus_processor(corpus, emb_model)
         corpus_size = len(corpus_data)
         logging.info(f"Corpus size: {corpus_size}")
 
-        # reload data
+        # reload data and measure wall-clock time until indexing completes
+        indexing_start_time = time.time()
         trial_index.load(corpus_data)
 
-    while float(trial_index.info()["percent_indexed"]) < 1:
-        time.sleep(1)
-        logging.info(f"Indexing progress: {trial_index.info()['percent_indexed']}")
+        while float(trial_index.info()["percent_indexed"]) < 1:
+            time.sleep(1)
+            logging.info(f"Indexing progress: {trial_index.info()['percent_indexed']}")
+    else:
+        # Only wait if index is not fully indexed
+        if float(trial_index.info()["percent_indexed"]) < 1:
+            while float(trial_index.info()["percent_indexed"]) < 1:
+                time.sleep(1)
+                logging.info(f"Indexing progress: {trial_index.info()['percent_indexed']}")
+
+    if recreate_data:
+        assert indexing_start_time is not None
+        total_indexing_time = time.time() - indexing_start_time
+        utils.set_last_indexing_time(redis_url, total_indexing_time)
+    else:
+        last_indexing_time = utils.get_last_indexing_time(redis_url)
+        total_indexing_time = (
+            last_indexing_time if last_indexing_time is not None else 0.0
+        )
 
     # capture index metrics
-    total_indexing_time = float(trial_index.info()["total_indexing_time"])
     num_docs = trial_index.info()["num_docs"]
 
     logging.info(f"Data indexed {total_indexing_time=}s, {num_docs=}")
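The hunk above implements a measure-or-reuse policy for `total_indexing_time`. The sketch below condenses it into a single hypothetical helper (the study code inlines this logic rather than defining such a function) and assumes `utils` is importable from the package as in the tests:

```python
import time

from redis_retrieval_optimizer import utils


def resolve_indexing_time(index, corpus_data, redis_url, recreate_data):
    """Measure indexing time when data is reloaded; otherwise reuse the persisted value."""
    if recreate_data:
        start = time.time()
        index.load(corpus_data)
        # Wall-clock measurement: poll until the index reports fully built.
        while float(index.info()["percent_indexed"]) < 1:
            time.sleep(1)
        elapsed = time.time() - start
        utils.set_last_indexing_time(redis_url, elapsed)
        return elapsed
    # No reload: reuse the previously persisted measurement, defaulting to 0.0.
    last = utils.get_last_indexing_time(redis_url)
    return last if last is not None else 0.0
```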
14 changes: 12 additions & 2 deletions redis_retrieval_optimizer/grid_study.py
@@ -112,12 +112,16 @@ def init_index_from_grid_settings(
     # corpus processing functions should be user defined
     corpus_data = corpus_processor(corpus, emb_model)
 
+    indexing_start_time = time.time()
     index.load(corpus_data)
 
     while float(index.info()["percent_indexed"]) < 1:
         time.sleep(1)
         logging.info(f"Indexing progress: {index.info()['percent_indexed']}")
 
+    total_indexing_time = time.time() - indexing_start_time
+    utils.set_last_indexing_time(redis_url, total_indexing_time)
+
     index_settings["embedding"] = embed_settings.model_dump()
     utils.set_last_index_settings(redis_url, index_settings)

@@ -188,6 +192,7 @@ def run_grid_study(

         # corpus processing functions should be user defined
         corpus_data = corpus_processor(corpus, emb_model)
+        indexing_start_time = time.time()
         index.load(corpus_data)
 
         while float(index.info()["percent_indexed"]) < 1:
@@ -196,6 +201,9 @@
f"Indexing progress: {index.info()['percent_indexed']}"
)

total_indexing_time = time.time() - indexing_start_time
utils.set_last_indexing_time(redis_url, total_indexing_time)

# Get embedding model with current dtype
emb_model = utils.get_embedding_model(
embedding_model, redis_url, dtype=dtype
@@ -220,9 +228,11 @@
                qrels, search_method_output.run
            )
 
-            index_info = index.info()
+            last_indexing_time = utils.get_last_indexing_time(redis_url)
 
-            trial_metrics["total_indexing_time"] = index_info["total_indexing_time"]
+            trial_metrics["total_indexing_time"] = (
+                last_indexing_time if last_indexing_time is not None else 0.0
+            )
 
             memory_stats = utils.get_index_memory_stats(
                 grid_study_config.index_settings.name,
5 changes: 4 additions & 1 deletion redis_retrieval_optimizer/search_study.py
@@ -90,7 +90,10 @@ def run_search_study(

     trial_metrics = utils.eval_trial_metrics(qrels, search_method_output.run)
 
-    trial_metrics["total_indexing_time"] = index_info["total_indexing_time"]
+    last_indexing_time = utils.get_last_indexing_time(redis_url)
+    trial_metrics["total_indexing_time"] = (
+        last_indexing_time if last_indexing_time is not None else 0.0
+    )
 
     memory_stats = utils.get_index_memory_stats(
         search_study_config.index_name,
21 changes: 21 additions & 0 deletions redis_retrieval_optimizer/utils.py
@@ -69,6 +69,27 @@ def set_last_index_settings(redis_url, index_settings):
     client.json().set("ret-opt:last_schema", Path.root_path(), index_settings)
 
 
+def get_last_indexing_time(redis_url: str) -> float | None:
+    """Return the last recorded total indexing time in seconds, if any.
+
+    This is stored under a dedicated JSON key so we can reuse the
+    indexing time across runs where we do not reload data.
+    """
+    client = Redis.from_url(redis_url)
+    value = client.json().get("ret-opt:last_indexing_time")
+    return float(value) if value is not None else None
+
+
+def set_last_indexing_time(redis_url: str, indexing_time: float) -> None:
+    """Persist the total indexing time (in seconds) for the current index.
+
+    This is used when subsequent runs reuse the existing indexed data
+    and therefore should reuse the previously measured indexing time.
+    """
+    client = Redis.from_url(redis_url)
+    client.json().set("ret-opt:last_indexing_time", Path.root_path(), indexing_time)
+
+
 def check_recreate(index_settings, last_index_settings):
     embedding_settings = index_settings.pop("embedding") if index_settings else None
     last_embedding_settings = (
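A quick usage sketch of the round trip these helpers implement, assuming a local Redis instance with the RedisJSON module loaded and `utils` importable from the package (the URL is illustrative):

```python
from redis_retrieval_optimizer import utils

redis_url = "redis://localhost:6379"

# A run that reloads data measures and persists the indexing time...
utils.set_last_indexing_time(redis_url, 12.5)

# ...and a later run that reuses the index reads it back instead of re-measuring.
assert utils.get_last_indexing_time(redis_url) == 12.5

# If the key has never been written, get_last_indexing_time returns None.
```

Storing the value under its own `ret-opt:last_indexing_time` key mirrors the existing `ret-opt:last_schema` convention used by `set_last_index_settings`.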
12 changes: 12 additions & 0 deletions tests/integration/test_bayes.py
@@ -1,5 +1,6 @@
 import os
 
+import pytest
 import yaml
 from redisvl.index import SearchIndex
 
@@ -31,6 +32,16 @@ def test_run_bayes_study(redis_url):

     assert metrics.shape[0] == study_config["optimization_settings"]["n_trials"]
 
+    # total_indexing_time should be recorded for each trial and persisted
+    assert "total_indexing_time" in metrics.columns
+
+    last_indexing_time = utils.get_last_indexing_time(redis_url)
+    assert last_indexing_time is not None
+    assert last_indexing_time > 0.0
+
+    # The last trial's recorded indexing time should match the persisted value
+    assert metrics["total_indexing_time"].iloc[-1] == pytest.approx(last_indexing_time)
+
     for score in metrics["f1"].tolist():
         assert score > 0.0
 
@@ -43,4 +54,5 @@ def test_run_bayes_study(redis_url):

     # clean up
     index.client.json().delete("ret-opt:last_schema")
+    index.client.json().delete("ret-opt:last_indexing_time")
     index.delete(drop=True)
27 changes: 27 additions & 0 deletions tests/integration/test_grid.py
@@ -1,5 +1,6 @@
 import os
 
+import pytest
 import yaml
 from redisvl.index import SearchIndex
 
@@ -44,6 +45,19 @@ def test_run_grid_study(redis_url):
     for score in metrics["f1"].tolist():
         assert score > 0.0
 
+    # total_indexing_time should be recorded and reused across trials
+    assert "total_indexing_time" in metrics.columns
+
+    # With a single vector data type, all trials should share the same
+    # positive indexing time value.
+    unique_times = metrics["total_indexing_time"].unique()
+    assert len(unique_times) == 1
+    assert unique_times[0] > 0.0
+
+    last_indexing_time = utils.get_last_indexing_time(redis_url)
+    assert last_indexing_time is not None
+    assert unique_times[0] == pytest.approx(last_indexing_time)
+
     last_schema = utils.get_last_index_settings(redis_url)
     assert last_schema is not None
 
@@ -53,6 +67,7 @@ def test_run_grid_study(redis_url):

     # clean up
     index.client.json().delete("ret-opt:last_schema")
+    index.client.json().delete("ret-opt:last_indexing_time")
     index.delete(drop=True)
 
 
@@ -96,6 +111,17 @@ def test_run_grid_study_with_multiple_dtypes(redis_url):
     for score in metrics["f1"].tolist():
         assert score > 0.0
 
+    # total_indexing_time should be recorded for each dtype and reused
+    # across search methods for that dtype.
+    assert "total_indexing_time" in metrics.columns
+
+    for dtype in unique_dtypes:
+        dtype_times = metrics.loc[
+            metrics["vector_data_type"] == dtype, "total_indexing_time"
+        ]
+        assert dtype_times.nunique() == 1
+        assert dtype_times.iloc[0] > 0.0
+
     last_schema = utils.get_last_index_settings(redis_url)
     assert last_schema is not None
 
@@ -105,4 +131,5 @@ def test_run_grid_study_with_multiple_dtypes(redis_url):

     # clean up
     index.client.json().delete("ret-opt:last_schema")
+    index.client.json().delete("ret-opt:last_indexing_time")
     index.delete(drop=True)
22 changes: 18 additions & 4 deletions tests/integration/test_search_study.py
@@ -1,5 +1,7 @@
 import os
+import time
 
+import pytest
 import yaml
 from redisvl.index import SearchIndex
 from redisvl.utils.vectorize.text.huggingface import HFTextVectorizer
@@ -38,14 +40,20 @@ def test_run_search_study(redis_url):
     # Load corpus data
     corpus = utils.load_json(f"{TEST_DIR}/search_data/corpus.json")
     corpus_data = eval_beir.process_corpus(corpus, emb_model)
+    indexing_start_time = time.time()
     index.load(corpus_data)
 
     # Wait for indexing to complete
     while float(index.info()["percent_indexed"]) < 1:
-        import time
-
         time.sleep(1)
 
+    total_indexing_time = time.time() - indexing_start_time
+    # Sanity check: indexing time should be positive for a small test corpus.
+    assert total_indexing_time > 0.0
+
+    # Persist the measured indexing time so search_study can reuse it.
+    utils.set_last_indexing_time(redis_url, total_indexing_time)
+
     # Run search study
     metrics = run_search_study(
         config_path=search_config_path,
@@ -57,6 +65,13 @@ def test_run_search_study(redis_url):

     assert metrics.shape[0] == expected_trials
 
+    # total_indexing_time should be present and match the value we measured.
+    assert "total_indexing_time" in metrics.columns
+
+    unique_indexing_times = metrics["total_indexing_time"].unique()
+    assert len(unique_indexing_times) == 1
+    assert unique_indexing_times[0] == pytest.approx(total_indexing_time)
+
     for score in metrics["f1"].tolist():
         assert score > 0.0
 
@@ -67,13 +82,12 @@ def test_run_search_study(redis_url):
     assert method in unique_methods
 
     # Clean up
+    index.client.json().delete("ret-opt:last_indexing_time")
     index.delete(drop=True)
 
 
 def test_search_study_requires_embedding_model(redis_url):
     """Test that search study requires embedding_model in config."""
-    import pytest
-
     # Create a config without embedding_model
     config_path = f"{TEST_DIR}/search_data/test_search_study_config_no_embedding.yaml"
 

Expand Down