33import asyncio
44import logging
55import warnings
6+ from datetime import timedelta
67from functools import partial
78from typing import TYPE_CHECKING , Any , Generic , Literal
89
10+ import playwright .async_api
911from more_itertools import partition
1012from pydantic import ValidationError
1113from typing_extensions import NotRequired , TypedDict , TypeVar
1214
1315from crawlee import service_locator
1416from crawlee ._request import Request , RequestOptions
15- from crawlee ._types import ConcurrencySettings
17+ from crawlee ._types import (
18+ BasicCrawlingContext ,
19+ ConcurrencySettings ,
20+ )
1621from crawlee ._utils .blocked import RETRY_CSS_SELECTORS
1722from crawlee ._utils .docs import docs_group
1823from crawlee ._utils .robots import RobotsTxtFile
24+ from crawlee ._utils .time import SharedTimeout
1925from crawlee ._utils .urls import to_absolute_url_iterator
2026from crawlee .browsers import BrowserPool
2127from crawlee .crawlers ._basic import BasicCrawler , BasicCrawlerOptions , ContextPipeline
4450
4551 from crawlee import RequestTransformAction
4652 from crawlee ._types import (
47- BasicCrawlingContext ,
4853 EnqueueLinksKwargs ,
4954 ExtractLinksFunction ,
5055 HttpHeaders ,
@@ -106,6 +111,7 @@ def __init__(
106111 fingerprint_generator : FingerprintGenerator | None | Literal ['default' ] = 'default' ,
107112 headless : bool | None = None ,
108113 use_incognito_pages : bool | None = None ,
114+ navigation_timeout : timedelta | None = None ,
109115 ** kwargs : Unpack [BasicCrawlerOptions [PlaywrightCrawlingContext , StatisticsState ]],
110116 ) -> None :
111117 """Initialize a new instance.
@@ -134,12 +140,16 @@ def __init__(
134140 use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
135141 own context that is destroyed once the page is closed or crashes.
136142 This option should not be used if `browser_pool` is provided.
143+ navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
144+ the request handler). Defaults to one minute if not provided.
137145 kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
138146 """
139147 configuration = kwargs .pop ('configuration' , None )
140148 if configuration is not None :
141149 service_locator .set_configuration (configuration )
142150
151+ self ._shared_navigation_timeouts : dict [int , SharedTimeout ] = {}
152+
143153 if browser_pool :
144154 # Raise an exception if browser_pool is provided together with other browser-related arguments.
145155 if any (
@@ -202,6 +212,8 @@ def __init__(
202212 if 'concurrency_settings' not in kwargs or kwargs ['concurrency_settings' ] is None :
203213 kwargs ['concurrency_settings' ] = ConcurrencySettings (desired_concurrency = 1 )
204214
215+ self ._navigation_timeout = navigation_timeout or timedelta (minutes = 1 )
216+
205217 super ().__init__ (** kwargs )
206218
207219 async def _open_page (
@@ -228,10 +240,18 @@ async def _open_page(
228240 block_requests = partial (block_requests , page = crawlee_page .page ),
229241 )
230242
231- async with browser_page_context (crawlee_page .page ):
232- for hook in self ._pre_navigation_hooks :
233- await hook (pre_navigation_context )
234- yield pre_navigation_context
243+ context_id = id (pre_navigation_context )
244+ self ._shared_navigation_timeouts [context_id ] = SharedTimeout (self ._navigation_timeout )
245+
246+ try :
247+ async with browser_page_context (crawlee_page .page ):
248+ for hook in self ._pre_navigation_hooks :
249+ async with self ._shared_navigation_timeouts [context_id ]:
250+ await hook (pre_navigation_context )
251+
252+ yield pre_navigation_context
253+ finally :
254+ self ._shared_navigation_timeouts .pop (context_id , None )
235255
236256 def _prepare_request_interceptor (
237257 self ,
@@ -266,6 +286,7 @@ async def _navigate(
266286 Raises:
267287 ValueError: If the browser pool is not initialized.
268288 SessionError: If the URL cannot be loaded by the browser.
289+ TimeoutError: If navigation does not succeed within the navigation timeout.
269290
270291 Yields:
271292 The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -297,7 +318,13 @@ async def _navigate(
297318 # Set route_handler only for current request
298319 await context .page .route (context .request .url , route_handler )
299320
300- response = await context .page .goto (context .request .url )
321+ try :
322+ async with self ._shared_navigation_timeouts [id (context )] as remaining_timeout :
323+ response = await context .page .goto (
324+ context .request .url , timeout = remaining_timeout .total_seconds () * 1000
325+ )
326+ except playwright .async_api .TimeoutError as exc :
327+ raise asyncio .TimeoutError from exc
301328
302329 if response is None :
303330 raise SessionError (f'Failed to load the URL: { context .request .url } ' )
0 commit comments