Skip to content

Commit 6127e84

Browse files
Optimize query cycling for large query counts using generators
1 parent 2eda5b7 commit 6127e84

File tree

1 file changed

+51
-19
lines changed

1 file changed

+51
-19
lines changed

engine/base_client/search.py

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -84,36 +84,69 @@ def search_all(
8484

8585
# Handle num_queries parameter
8686
if num_queries > 0:
87-
# If we need more queries than available, cycle through the list
87+
# If we need more queries than available, use a cycling generator
8888
if num_queries > len(queries_list) and len(queries_list) > 0:
8989
print(f"Requested {num_queries} queries but only {len(queries_list)} are available.")
90-
print(f"Extending queries by cycling through the available ones.")
91-
# Calculate how many complete cycles and remaining items we need
92-
complete_cycles = num_queries // len(queries_list)
93-
remaining = num_queries % len(queries_list)
94-
95-
# Create the extended list
96-
extended_queries = []
97-
for _ in range(complete_cycles):
98-
extended_queries.extend(queries_list)
99-
extended_queries.extend(queries_list[:remaining])
100-
101-
used_queries = extended_queries
90+
print(f"Using a cycling generator to efficiently process queries.")
91+
92+
# Create a cycling generator function
93+
def cycling_query_generator(queries, total_count):
    """Yield exactly *total_count* queries, cycling through *queries*.

    Streams lazily instead of materializing an extended list, so memory
    stays O(len(queries)) regardless of how large total_count is.

    Args:
        queries: The available query objects. Must be re-iterable
            (e.g. a list) — a one-shot iterator would be exhausted
            after the first pass and the cycle would stall.
        total_count: Total number of queries to yield.

    Yields:
        Query objects from *queries*, in order, wrapping around until
        *total_count* have been produced.

    Raises:
        ValueError: If *queries* is empty while total_count > 0.
            (Without this guard the loop below would spin forever,
            since the inner for-loop would never advance the counter.)
    """
    if total_count > 0 and not queries:
        raise ValueError("Cannot cycle through an empty query list")
    emitted = 0
    while emitted < total_count:
        for query in queries:
            # Early return replaces the original if/else-break pair:
            # stop as soon as the requested count is reached.
            if emitted >= total_count:
                return
            yield query
            emitted += 1
103+
104+
# Use the generator instead of creating a full list
105+
used_queries = cycling_query_generator(queries_list, num_queries)
106+
# We need to know the total count for the progress bar
107+
total_query_count = num_queries
102108
else:
103109
used_queries = queries_list[:num_queries]
110+
total_query_count = len(used_queries)
104111
print(f"Using {num_queries} queries")
105112
else:
106113
used_queries = queries_list
114+
total_query_count = len(used_queries)
107115

108116
if parallel == 1:
109117
# Single-threaded execution
110118
start = time.perf_counter()
111-
results = [search_one(query) for query in tqdm.tqdm(used_queries)]
119+
120+
# Create a progress bar with the correct total
121+
pbar = tqdm.tqdm(total=total_query_count, desc="Processing queries", unit="queries")
122+
123+
# Process queries with progress updates
124+
results = []
125+
for query in used_queries:
126+
results.append(search_one(query))
127+
pbar.update(1)
128+
129+
# Close the progress bar
130+
pbar.close()
131+
112132
total_time = time.perf_counter() - start
113133
else:
114-
# Dynamically calculate chunk size
115-
chunk_size = max(1, len(used_queries) // parallel)
116-
query_chunks = list(chunked_iterable(used_queries, chunk_size))
134+
# Dynamically calculate chunk size based on total_query_count
135+
chunk_size = max(1, total_query_count // parallel)
136+
137+
# If used_queries is a generator, we need to handle it differently
138+
if hasattr(used_queries, '__next__'):
139+
# For generators, we'll create chunks on-the-fly
140+
query_chunks = []
141+
remaining = total_query_count
142+
while remaining > 0:
143+
current_chunk_size = min(chunk_size, remaining)
144+
chunk = [next(used_queries) for _ in range(current_chunk_size)]
145+
query_chunks.append(chunk)
146+
remaining -= current_chunk_size
147+
else:
148+
# For lists, we can use the chunked_iterable function
149+
query_chunks = list(chunked_iterable(used_queries, chunk_size))
117150

118151
# Function to be executed by each worker process
119152
def worker_function(chunk, result_queue):
@@ -141,8 +174,7 @@ def worker_function(chunk, result_queue):
141174
start = time.perf_counter()
142175

143176
# Create a progress bar for the total number of queries
144-
total_queries = len(used_queries)
145-
pbar = tqdm.tqdm(total=total_queries, desc="Processing queries", unit="queries")
177+
pbar = tqdm.tqdm(total=total_query_count, desc="Processing queries", unit="queries")
146178

147179
# Collect results from all worker processes
148180
results = []

0 commit comments

Comments
 (0)