Skip to content

Commit 6127e84

Browse files
Optimize query cycling for large query counts using generators
1 parent 2eda5b7 commit 6127e84

File tree

1 file changed

+51
-19
lines changed

1 file changed

+51
-19
lines changed

engine/base_client/search.py

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -84,36 +84,69 @@ def search_all(
8484

8585
# Handle num_queries parameter
8686
if num_queries > 0:
87-
# If we need more queries than available, cycle through the list
87+
# If we need more queries than available, use a cycling generator
8888
if num_queries > len(queries_list) and len(queries_list) > 0:
8989
print(f"Requested {num_queries} queries but only {len(queries_list)} are available.")
90-
print(f"Extending queries by cycling through the available ones.")
91-
# Calculate how many complete cycles and remaining items we need
92-
complete_cycles = num_queries // len(queries_list)
93-
remaining = num_queries % len(queries_list)
94-
95-
# Create the extended list
96-
extended_queries = []
97-
for _ in range(complete_cycles):
98-
extended_queries.extend(queries_list)
99-
extended_queries.extend(queries_list[:remaining])
100-
101-
used_queries = extended_queries
90+
print(f"Using a cycling generator to efficiently process queries.")
91+
92+
# Create a cycling generator function
93+
def cycling_query_generator(queries, total_count):
    """Yield exactly *total_count* queries, cycling through *queries*.

    Streams lazily instead of materializing an extended list, so memory
    stays O(len(queries)) regardless of how large total_count is.

    Args:
        queries: The available query objects. Must be re-iterable
            (e.g. a list) — a one-shot iterator would be exhausted
            after the first pass and the cycle would stall.
        total_count: Total number of queries to yield.

    Yields:
        Query objects from *queries*, in order, wrapping around until
        *total_count* have been produced.

    Raises:
        ValueError: If *queries* is empty while total_count > 0.
            (Without this guard the loop below would spin forever,
            since the inner for-loop would never advance the counter.)
    """
    if total_count > 0 and not queries:
        raise ValueError("Cannot cycle through an empty query list")
    emitted = 0
    while emitted < total_count:
        for query in queries:
            # Early return replaces the original if/else-break pair:
            # stop as soon as the requested count is reached.
            if emitted >= total_count:
                return
            yield query
            emitted += 1
103+
104+
# Use the generator instead of creating a full list
105+
used_queries = cycling_query_generator(queries_list, num_queries)
106+
# We need to know the total count for the progress bar
107+
total_query_count = num_queries
102108
else:
103109
used_queries = queries_list[:num_queries]
110+
total_query_count = len(used_queries)
104111
print(f"Using {num_queries} queries")
105112
else:
106113
used_queries = queries_list
114+
total_query_count = len(used_queries)
107115

108116
if parallel == 1:
109117
# Single-threaded execution
110118
start = time.perf_counter()
111-
results = [search_one(query) for query in tqdm.tqdm(used_queries)]
119+
120+
# Create a progress bar with the correct total
121+
pbar = tqdm.tqdm(total=total_query_count, desc="Processing queries", unit="queries")
122+
123+
# Process queries with progress updates
124+
results = []
125+
for query in used_queries:
126+
results.append(search_one(query))
127+
pbar.update(1)
128+
129+
# Close the progress bar
130+
pbar.close()
131+
112132
total_time = time.perf_counter() - start
113133
else:
114-
# Dynamically calculate chunk size
115-
chunk_size = max(1, len(used_queries) // parallel)
116-
query_chunks = list(chunked_iterable(used_queries, chunk_size))
134+
# Dynamically calculate chunk size based on total_query_count
135+
chunk_size = max(1, total_query_count // parallel)
136+
137+
# If used_queries is a generator, we need to handle it differently
138+
if hasattr(used_queries, '__next__'):
139+
# For generators, we'll create chunks on-the-fly
140+
query_chunks = []
141+
remaining = total_query_count
142+
while remaining > 0:
143+
current_chunk_size = min(chunk_size, remaining)
144+
chunk = [next(used_queries) for _ in range(current_chunk_size)]
145+
query_chunks.append(chunk)
146+
remaining -= current_chunk_size
147+
else:
148+
# For lists, we can use the chunked_iterable function
149+
query_chunks = list(chunked_iterable(used_queries, chunk_size))
117150

118151
# Function to be executed by each worker process
119152
def worker_function(chunk, result_queue):
@@ -141,8 +174,7 @@ def worker_function(chunk, result_queue):
141174
start = time.perf_counter()
142175

143176
# Create a progress bar for the total number of queries
144-
total_queries = len(used_queries)
145-
pbar = tqdm.tqdm(total=total_queries, desc="Processing queries", unit="queries")
177+
pbar = tqdm.tqdm(total=total_query_count, desc="Processing queries", unit="queries")
146178

147179
# Collect results from all worker processes
148180
results = []

0 commit comments

Comments
 (0)