
Commit 0dfb6c2

janbuchar and vdusek authored
fix: Only apply requestHandlerTimeout to request handler (#1474)
Co-authored-by: Vlada Dusek <v.dusek96@gmail.com>
1 parent 131f1f0 commit 0dfb6c2

File tree: 20 files changed, +385 -49 lines


pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ keywords = [
     "scraping",
 ]
 dependencies = [
+    "async-timeout>=5.0.1",
     "cachetools>=5.5.0",
     "colorama>=0.4.0",
     "impit>=0.8.0",

src/crawlee/_utils/time.py

Lines changed: 41 additions & 1 deletion
@@ -3,11 +3,14 @@
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass
+from datetime import timedelta
 from typing import TYPE_CHECKING

+from async_timeout import Timeout, timeout
+
 if TYPE_CHECKING:
     from collections.abc import Iterator
-    from datetime import timedelta
+    from types import TracebackType

 _SECONDS_PER_MINUTE = 60
 _SECONDS_PER_HOUR = 3600
@@ -35,6 +38,43 @@ def measure_time() -> Iterator[TimerResult]:
         result.cpu = after_cpu - before_cpu


+class SharedTimeout:
+    """Keeps track of a time budget shared by multiple independent async operations.
+
+    Provides a reusable, non-reentrant context manager interface.
+    """
+
+    def __init__(self, timeout: timedelta) -> None:
+        self._remaining_timeout = timeout
+        self._active_timeout: Timeout | None = None
+        self._activation_timestamp: float | None = None
+
+    async def __aenter__(self) -> timedelta:
+        if self._active_timeout is not None or self._activation_timestamp is not None:
+            raise RuntimeError('A shared timeout context cannot be entered twice at the same time')
+
+        self._activation_timestamp = time.monotonic()
+        self._active_timeout = new_timeout = timeout(self._remaining_timeout.total_seconds())
+        await new_timeout.__aenter__()
+        return self._remaining_timeout
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        if self._active_timeout is None or self._activation_timestamp is None:
+            raise RuntimeError('Logic error')
+
+        await self._active_timeout.__aexit__(exc_type, exc_value, exc_traceback)
+        elapsed = time.monotonic() - self._activation_timestamp
+        self._remaining_timeout = self._remaining_timeout - timedelta(seconds=elapsed)
+
+        self._active_timeout = None
+        self._activation_timestamp = None
+
+
 def format_duration(duration: timedelta | None) -> str:
     """Format a timedelta into a human-readable string with appropriate units."""
     if duration is None:
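
`SharedTimeout` keeps one time budget across several awaited operations: each `async with` entry arms an `async_timeout.timeout()` with whatever budget remains, and each exit subtracts the elapsed wall-clock time. A minimal standalone sketch of how such a budget behaves (not part of the commit; the two-second budget and the sleeps are illustrative):

    import asyncio
    from datetime import timedelta

    from crawlee._utils.time import SharedTimeout


    async def main() -> None:
        shared = SharedTimeout(timedelta(seconds=2))  # one budget for both blocks below

        async with shared:
            await asyncio.sleep(0.5)  # spends part of the budget

        async with shared as remaining:
            # Roughly 1.5 s remain; exceeding them raises asyncio.TimeoutError.
            print(f'remaining: {remaining.total_seconds():.1f}s')
            await asyncio.sleep(0.5)


    asyncio.run(main())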

src/crawlee/crawlers/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,7 @@
 from crawlee._utils.try_import import install_import_hook as _install_import_hook
 from crawlee._utils.try_import import try_import as _try_import

-from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
+from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
 from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
 from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -51,6 +51,7 @@
     'BeautifulSoupParserType',
     'ContextPipeline',
     'HttpCrawler',
+    'HttpCrawlerOptions',
     'HttpCrawlingContext',
     'HttpCrawlingResult',
     'ParsedHttpCrawlingContext',
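
Exporting `HttpCrawlerOptions` from `crawlee.crawlers` lets user code type forwarded constructor keywords, including the new `navigation_timeout`. A hypothetical thin subclass, purely to illustrate the typing:

    from typing_extensions import Unpack

    from crawlee.crawlers import HttpCrawlerOptions, ParselCrawler, ParselCrawlingContext


    class MyParselCrawler(ParselCrawler):
        """Hypothetical subclass that forwards typed keyword arguments unchanged."""

        def __init__(self, **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]]) -> None:
            super().__init__(**kwargs)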
src/crawlee/crawlers/_abstract_http/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,9 +1,10 @@
-from ._abstract_http_crawler import AbstractHttpCrawler
+from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
 from ._abstract_http_parser import AbstractHttpParser
 from ._http_crawling_context import ParsedHttpCrawlingContext

 __all__ = [
     'AbstractHttpCrawler',
     'AbstractHttpParser',
+    'HttpCrawlerOptions',
     'ParsedHttpCrawlingContext',
 ]

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 38 additions & 10 deletions
@@ -3,14 +3,16 @@
 import asyncio
 import logging
 from abc import ABC
+from datetime import timedelta
 from typing import TYPE_CHECKING, Any, Generic

 from more_itertools import partition
 from pydantic import ValidationError
-from typing_extensions import TypeVar
+from typing_extensions import NotRequired, TypeVar

 from crawlee._request import Request, RequestOptions
 from crawlee._utils.docs import docs_group
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
 from crawlee.errors import SessionError
@@ -32,6 +34,19 @@
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


+class HttpCrawlerOptions(
+    BasicCrawlerOptions[TCrawlingContext, TStatisticsState],
+    Generic[TCrawlingContext, TStatisticsState],
+):
+    """Arguments for the `AbstractHttpCrawler` constructor.
+
+    It is intended for typing forwarded `__init__` arguments in the subclasses.
+    """
+
+    navigation_timeout: NotRequired[timedelta | None]
+    """Timeout for the HTTP request."""
+
+
 @docs_group('Crawlers')
 class AbstractHttpCrawler(
     BasicCrawler[TCrawlingContext, StatisticsState],
@@ -56,10 +71,13 @@ def __init__(
         self,
         *,
         parser: AbstractHttpParser[TParseResult, TSelectResult],
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
     ) -> None:
         self._parser = parser
+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
         self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}

         if '_context_pipeline' not in kwargs:
             raise ValueError(
@@ -112,9 +130,17 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC
     async def _execute_pre_navigation_hooks(
         self, context: BasicCrawlingContext
     ) -> AsyncGenerator[BasicCrawlingContext, None]:
-        for hook in self._pre_navigation_hooks:
-            await hook(context)
-        yield context
+        context_id = id(context)
+        self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
+
+        try:
+            for hook in self._pre_navigation_hooks:
+                async with self._shared_navigation_timeouts[context_id]:
+                    await hook(context)
+
+            yield context
+        finally:
+            self._shared_navigation_timeouts.pop(context_id, None)

     async def _parse_http_response(
         self, context: HttpCrawlingContext
@@ -222,12 +248,14 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
         Yields:
             The original crawling context enhanced by HTTP response.
         """
-        result = await self._http_client.crawl(
-            request=context.request,
-            session=context.session,
-            proxy_info=context.proxy_info,
-            statistics=self._statistics,
-        )
+        async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+            result = await self._http_client.crawl(
+                request=context.request,
+                session=context.session,
+                proxy_info=context.proxy_info,
+                statistics=self._statistics,
+                timeout=remaining_timeout,
+            )

         yield HttpCrawlingContext.from_basic_crawling_context(context=context, http_response=result.http_response)

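With these changes, the pre-navigation hooks and the HTTP request draw on a single per-context `SharedTimeout`: each hook runs inside the shared budget, and whatever remains is forwarded to the HTTP client as its `timeout`. A simplified, hypothetical rendering of that flow (the helper name is made up; the real logic lives in the two methods above):

    from collections.abc import Awaitable, Callable, Sequence
    from datetime import timedelta

    from crawlee._utils.time import SharedTimeout


    async def navigate_with_budget(
        hooks: Sequence[Callable[[], Awaitable[None]]],
        do_request: Callable[[timedelta], Awaitable[object]],
        budget: timedelta = timedelta(minutes=1),
    ) -> object:
        shared = SharedTimeout(budget)

        for hook in hooks:
            async with shared:  # each hook spends part of the navigation budget
                await hook()

        async with shared as remaining:  # the request gets only what is left
            return await do_request(remaining)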
src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 9 additions & 6 deletions
@@ -1507,12 +1507,15 @@ async def __run_task_function(self) -> None:
            raise

     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await wait_for(
-            lambda: self._context_pipeline(context, self.router),
-            timeout=self._request_handler_timeout,
-            timeout_message=f'{self._request_handler_timeout_text}'
-            f' {self._request_handler_timeout.total_seconds()} seconds',
-            logger=self._logger,
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )

     def _raise_for_error_status_code(self, status_code: int) -> None:
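
This is the core of the fix: `request_handler_timeout` now wraps only the final `router` call instead of the entire context pipeline, so time spent navigating (pre-navigation hooks, the HTTP request or page load) no longer counts against the handler's budget. A hedged usage example showing the two now-independent budgets (URL and durations are illustrative; `navigation_timeout` comes from the `HttpCrawlerOptions` added above):

    import asyncio
    from datetime import timedelta

    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler(
        navigation_timeout=timedelta(seconds=30),       # pre-navigation hooks + HTTP request
        request_handler_timeout=timedelta(seconds=10),  # the handler below only
    )


    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Even a slow response no longer eats into the handler's 10-second budget.
        await context.push_data({'url': context.request.url, 'title': context.selector.css('title::text').get()})


    asyncio.run(crawler.run(['https://crawlee.dev']))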

src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 from bs4 import BeautifulSoup, Tag

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ def __init__(
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

src/crawlee/crawlers/_parsel/_parsel_crawler.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 from parsel import Selector

 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:

     def __init__(
         self,
-        **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 34 additions & 7 deletions
@@ -3,19 +3,25 @@
 import asyncio
 import logging
 import warnings
+from datetime import timedelta
 from functools import partial
 from typing import TYPE_CHECKING, Any, Generic, Literal

+import playwright.async_api
 from more_itertools import partition
 from pydantic import ValidationError
 from typing_extensions import NotRequired, TypedDict, TypeVar

 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
-from crawlee._types import ConcurrencySettings
+from crawlee._types import (
+    BasicCrawlingContext,
+    ConcurrencySettings,
+)
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
+from crawlee._utils.time import SharedTimeout
 from crawlee._utils.urls import to_absolute_url_iterator
 from crawlee.browsers import BrowserPool
 from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
@@ -44,7 +50,6 @@

     from crawlee import RequestTransformAction
     from crawlee._types import (
-        BasicCrawlingContext,
         EnqueueLinksKwargs,
         ExtractLinksFunction,
         HttpHeaders,
@@ -106,6 +111,7 @@ def __init__(
         fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
         headless: bool | None = None,
         use_incognito_pages: bool | None = None,
+        navigation_timeout: timedelta | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
     ) -> None:
         """Initialize a new instance.
@@ -134,12 +140,16 @@ def __init__(
             use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
                 own context that is destroyed once the page is closed or crashes.
                 This option should not be used if `browser_pool` is provided.
+            navigation_timeout: Timeout for navigation (the process between opening a Playwright page and calling
+                the request handler)
             kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
         """
         configuration = kwargs.pop('configuration', None)
         if configuration is not None:
             service_locator.set_configuration(configuration)

+        self._shared_navigation_timeouts: dict[int, SharedTimeout] = {}
+
         if browser_pool:
             # Raise an exception if browser_pool is provided together with other browser-related arguments.
             if any(
@@ -202,6 +212,8 @@ def __init__(
         if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
             kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)

+        self._navigation_timeout = navigation_timeout or timedelta(minutes=1)
+
         super().__init__(**kwargs)

     async def _open_page(
async def _open_page(
@@ -228,10 +240,18 @@ async def _open_page(
228240
block_requests=partial(block_requests, page=crawlee_page.page),
229241
)
230242

231-
async with browser_page_context(crawlee_page.page):
232-
for hook in self._pre_navigation_hooks:
233-
await hook(pre_navigation_context)
234-
yield pre_navigation_context
243+
context_id = id(pre_navigation_context)
244+
self._shared_navigation_timeouts[context_id] = SharedTimeout(self._navigation_timeout)
245+
246+
try:
247+
async with browser_page_context(crawlee_page.page):
248+
for hook in self._pre_navigation_hooks:
249+
async with self._shared_navigation_timeouts[context_id]:
250+
await hook(pre_navigation_context)
251+
252+
yield pre_navigation_context
253+
finally:
254+
self._shared_navigation_timeouts.pop(context_id, None)
235255

236256
def _prepare_request_interceptor(
237257
self,
@@ -266,6 +286,7 @@ async def _navigate(
         Raises:
             ValueError: If the browser pool is not initialized.
             SessionError: If the URL cannot be loaded by the browser.
+            TimeoutError: If navigation does not succeed within the navigation timeout.

         Yields:
             The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links,
@@ -297,7 +318,13 @@ async def _navigate(
         # Set route_handler only for current request
         await context.page.route(context.request.url, route_handler)

-        response = await context.page.goto(context.request.url)
+        try:
+            async with self._shared_navigation_timeouts[id(context)] as remaining_timeout:
+                response = await context.page.goto(
+                    context.request.url, timeout=remaining_timeout.total_seconds() * 1000
+                )
+        except playwright.async_api.TimeoutError as exc:
+            raise asyncio.TimeoutError from exc

         if response is None:
             raise SessionError(f'Failed to load the URL: {context.request.url}')
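
For `PlaywrightCrawler`, the same shared budget covers the pre-navigation hooks and `page.goto()`: hook time is deducted first, the remainder is passed to `goto` in milliseconds, and Playwright's own `TimeoutError` is re-raised as `asyncio.TimeoutError`. A hedged usage sketch (the viewport call merely stands in for work a hook might do):

    from datetime import timedelta

    from crawlee.crawlers import (
        PlaywrightCrawler,
        PlaywrightCrawlingContext,
        PlaywrightPreNavCrawlingContext,
    )

    crawler = PlaywrightCrawler(navigation_timeout=timedelta(seconds=45))


    @crawler.pre_navigation_hook
    async def prepare_page(context: PlaywrightPreNavCrawlingContext) -> None:
        # Time spent here is subtracted from the 45-second budget before page.goto() runs.
        await context.page.set_viewport_size({'width': 1280, 'height': 720})


    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        # Governed separately by request_handler_timeout.
        await context.push_data({'url': context.request.url, 'title': await context.page.title()})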

src/crawlee/crawlers/_playwright/_playwright_http_client.py

Lines changed: 7 additions & 1 deletion
@@ -59,6 +59,7 @@ async def crawl(
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
         statistics: Statistics | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpCrawlingResult:
         raise NotImplementedError('The `crawl` method should not be used for `PlaywrightHttpClient`')

@@ -72,6 +73,7 @@ async def send_request(
         payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
+        timeout: timedelta | None = None,
     ) -> HttpResponse:
         # `proxy_info` are not used because `APIRequestContext` inherits the proxy from `BrowserContext`
         # TODO: Use `session` to restore all the fingerprint headers according to the `BrowserContext`, after resolved
@@ -87,7 +89,11 @@ async def send_request(

         # Proxies appropriate to the browser context are used
         response = await browser_context.request.fetch(
-            url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
+            url_or_request=url,
+            method=method.lower(),
+            headers=dict(headers) if headers else None,
+            data=payload,
+            timeout=timeout.total_seconds() if timeout else None,
         )

         return await PlaywrightHttpResponse.from_playwright_response(response, protocol='')
