From ac59668cb46ddaeb37d79a0b2a240531504aeec7 Mon Sep 17 00:00:00 2001 From: lorenzovaccarini Date: Sun, 28 Dec 2025 16:07:33 +0100 Subject: [PATCH 1/2] Implement double config for AdaptiveCrawler --- crawl4ai/adaptive_crawler.py | 82 +++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/crawl4ai/adaptive_crawler.py b/crawl4ai/adaptive_crawler.py index b7c649b00..9034fd7da 100644 --- a/crawl4ai/adaptive_crawler.py +++ b/crawl4ai/adaptive_crawler.py @@ -179,6 +179,7 @@ class AdaptiveConfig: # Embedding strategy parameters embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2" embedding_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for embeddings + query_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for query generation n_query_variations: int = 10 coverage_threshold: float = 0.85 alpha_shape_alpha: float = 0.5 @@ -252,7 +253,7 @@ def validate(self): assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1" @property - def _embedding_llm_config_dict(self) -> Optional[Dict]: + def _llm_config_dict(self) -> Optional[Dict]: """Convert LLMConfig to dict format for backward compatibility.""" if self.embedding_llm_config is None: return None @@ -614,12 +615,19 @@ def _get_document_terms(self, crawl_result: CrawlResult) -> List[str]: return self._tokenize(content.lower()) +# strategy = EmbeddingStrategy( +# embedding_model=self.config.embedding_model, +# llm_config=self.config.embedding_llm_config +# ) +# -> Forwards the two arguments in AdaptiveConfig class EmbeddingStrategy(CrawlStrategy): """Embedding-based adaptive crawling using semantic space coverage""" - def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dict] = None): + def __init__(self, embedding_model: str = None, embedding_llm_config: Union[LLMConfig, Dict] = None, query_llm_config: Union[LLMConfig, Dict] = None): 
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2" - self.llm_config = llm_config + self.embedding_llm_config = embedding_llm_config # For embeddings only + self.query_llm_config = query_llm_config # For query generation only + self._embedding_cache = {} self._link_embedding_cache = {} # Cache for link embeddings self._validation_passed = False # Track if validation passed @@ -632,6 +640,19 @@ def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dic def _get_embedding_llm_config_dict(self) -> Dict: """Get embedding LLM config as dict with fallback to default.""" + # First check if we have a direct embedding_llm_config + if self.embedding_llm_config: + if isinstance(self.embedding_llm_config, dict): + return self.embedding_llm_config + else: + # Convert LLMConfig object to dict + return { + 'provider': self.embedding_llm_config.provider, + 'api_token': self.embedding_llm_config.api_token, + 'base_url': self.embedding_llm_config.base_url + } + + # Then check if we have it from AdaptiveConfig if hasattr(self, 'config') and self.config: config_dict = self.config._embedding_llm_config_dict if config_dict: @@ -642,11 +663,41 @@ def _get_embedding_llm_config_dict(self) -> Dict: 'provider': 'openai/text-embedding-3-small', 'api_token': os.getenv('OPENAI_API_KEY') } + + def _get_query_llm_config_dict(self) -> Dict: + """Get query generation LLM config as dict with fallback to default.""" + # First check if we have a direct query_llm_config + if self.query_llm_config: + if isinstance(self.query_llm_config, dict): + return self.query_llm_config + else: + # Convert LLMConfig object to dict + return { + 'provider': self.query_llm_config.provider, + 'api_token': self.query_llm_config.api_token, + 'base_url': self.query_llm_config.base_url + } + + # Then check if we have it from AdaptiveConfig + if hasattr(self, 'config') and self.config: + config_dict = self.config._query_llm_config_dict + if config_dict: + return 
config_dict + + # Fallback to default if no config provided + return { + 'provider': 'openai/gpt-4o-mini', + 'api_token': os.getenv('OPENAI_API_KEY') + } async def _get_embeddings(self, texts: List[str]) -> Any: """Get embeddings using configured method""" from .utils import get_text_embeddings embedding_llm_config = self._get_embedding_llm_config_dict() + + print("EMBEDDING LLM CONFIGGGG") + print(embedding_llm_config) + print("\n\n\n\n") return await get_text_embeddings( texts, embedding_llm_config, @@ -712,27 +763,17 @@ async def map_query_semantic_space(self, query: str, n_synthetic: int = 10) -> A Return as a JSON array of strings.""" - # Use the LLM for query generation - # Convert LLMConfig to dict if needed - llm_config_dict = None - if self.llm_config: - if isinstance(self.llm_config, dict): - llm_config_dict = self.llm_config - else: - # Convert LLMConfig object to dict - llm_config_dict = { - 'provider': self.llm_config.provider, - 'api_token': self.llm_config.api_token - } - - provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini' - api_token = llm_config_dict.get('api_token') if llm_config_dict else None + query_llm_config_dict = self._get_query_llm_config_dict() + provider = query_llm_config_dict.get('provider', 'openai/gpt-4o-mini') + api_token = query_llm_config_dict.get('api_token') + base_url = query_llm_config_dict.get('base_url') response = perform_completion_with_backoff( provider=provider, prompt_with_variables=prompt, api_token=api_token, - json_response=True + json_response=True, + base_url=base_url, ) variations = json.loads(response.choices[0].message.content) @@ -1298,7 +1339,8 @@ def _create_strategy(self, strategy_name: str) -> CrawlStrategy: elif strategy_name == "embedding": strategy = EmbeddingStrategy( embedding_model=self.config.embedding_model, - llm_config=self.config.embedding_llm_config + embedding_llm_config=self.config.embedding_llm_config, + 
query_llm_config=self.config.query_llm_config # Pass both configs ) strategy.config = self.config # Pass config to strategy return strategy From a8523d17dceb3683d9ce810237434668b7f5d44a Mon Sep 17 00:00:00 2001 From: lorenzovaccarini Date: Sun, 28 Dec 2025 16:40:38 +0100 Subject: [PATCH 2/2] Remove useless print statements --- crawl4ai/adaptive_crawler.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/crawl4ai/adaptive_crawler.py b/crawl4ai/adaptive_crawler.py index 9034fd7da..ce59b1fe0 100644 --- a/crawl4ai/adaptive_crawler.py +++ b/crawl4ai/adaptive_crawler.py @@ -695,9 +695,6 @@ async def _get_embeddings(self, texts: List[str]) -> Any: from .utils import get_text_embeddings embedding_llm_config = self._get_embedding_llm_config_dict() - print("EMBEDDING LLM CONFIGGGG") - print(embedding_llm_config) - print("\n\n\n\n") return await get_text_embeddings( texts, embedding_llm_config,