From 234a4dbacdb75f9183bbb97197d459815c100aa8 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 1 Apr 2025 16:55:34 +0000 Subject: [PATCH 1/2] first iteration for playwright http client --- src/crawlee/crawlers/_basic/_basic_crawler.py | 1 + src/crawlee/http_clients/__init__.py | 3 + src/crawlee/http_clients/_base.py | 15 ++ src/crawlee/http_clients/_curl_impersonate.py | 16 ++ src/crawlee/http_clients/_httpx.py | 16 ++ src/crawlee/http_clients/_playwright.py | 225 ++++++++++++++++++ src/crawlee/sessions/_cookies.py | 4 + .../unit/crawlers/_http/test_http_crawler.py | 7 +- 8 files changed, 285 insertions(+), 2 deletions(-) create mode 100644 src/crawlee/http_clients/_playwright.py diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 8c76da798e..5a1132d0bb 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -615,6 +615,7 @@ async def _run_crawler(self) -> None: event_manager, self._snapshotter, self._statistics, + self._http_client, self._session_pool if self._use_session_pool else None, *self._additional_context_managers, ) diff --git a/src/crawlee/http_clients/__init__.py b/src/crawlee/http_clients/__init__.py index 94df9980ec..39f38987b1 100644 --- a/src/crawlee/http_clients/__init__.py +++ b/src/crawlee/http_clients/__init__.py @@ -12,6 +12,8 @@ with _try_import(__name__, 'CurlImpersonateHttpClient'): from ._curl_impersonate import CurlImpersonateHttpClient +with _try_import(__name__, 'PlaywrightHttpClient'): + from ._playwright import PlaywrightHttpClient __all__ = [ 'CurlImpersonateHttpClient', @@ -19,4 +21,5 @@ 'HttpCrawlingResult', 'HttpResponse', 'HttpxHttpClient', + 'PlaywrightHttpClient', ] diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index efbd09b88e..29cfda2392 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -7,6 +7,8 @@ from crawlee._utils.docs 
import docs_group if TYPE_CHECKING: + from types import TracebackType + from crawlee import Request from crawlee._types import HttpHeaders, HttpMethod, HttpPayload from crawlee.proxy_configuration import ProxyInfo @@ -119,3 +121,16 @@ async def send_request( Returns: The HTTP response received from the server. """ + + @abstractmethod + async def __aenter__(self) -> HttpClient: + """Enter the async context, acquiring any resources the client needs, and return the client.""" + + @abstractmethod + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + """Exit the async context, releasing any resources held by the client.""" diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index 847681d5b3..b45ff54fcd 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from http.cookiejar import Cookie + from types import TracebackType from curl_cffi import Curl from curl_cffi.requests import Request as CurlRequest @@ -245,3 +246,18 @@ def _get_cookies(curl: Curl) -> list[Cookie]: cookie = curl_morsel.to_cookiejar_cookie() cookies.append(cookie) return cookies + + @override + async def __aenter__(self) -> CurlImpersonateHttpClient: + return self + + @override + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + for client in self._client_by_proxy_url.values(): + await client.close() + self._client_by_proxy_url.clear() diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index aebf833a12..3adeea7710 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -15,6 +15,7 @@ if TYPE_CHECKING: from ssl import SSLContext + from types import TracebackType from crawlee import Request from crawlee._types import HttpMethod, HttpPayload @@ -262,3 +263,18 @@ def _is_proxy_error(error: httpx.TransportError) -> bool: 
return True return False + + @override + async def __aenter__(self) -> HttpxHttpClient: + return self + + @override + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + for client in self._client_by_proxy_url.values(): + await client.aclose() + self._client_by_proxy_url.clear() diff --git a/src/crawlee/http_clients/_playwright.py b/src/crawlee/http_clients/_playwright.py new file mode 100644 index 0000000000..0fffa57040 --- /dev/null +++ b/src/crawlee/http_clients/_playwright.py @@ -0,0 +1,225 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from playwright.async_api import APIRequestContext, APIResponse, Playwright, ProxySettings, async_playwright +from typing_extensions import override + +from crawlee._types import HttpHeaders +from crawlee._utils.docs import docs_group +from crawlee.fingerprint_suite import HeaderGenerator +from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse + +if TYPE_CHECKING: + from types import TracebackType + + from crawlee import Request + from crawlee._types import HttpMethod, HttpPayload + from crawlee.proxy_configuration import ProxyInfo + from crawlee.sessions import Session + from crawlee.statistics import Statistics + + +class _PlaywrightResponse: + """Adapter class for `playwright.APIResponse` to conform to the `HttpResponse` protocol.""" + + def __init__(self, response: APIResponse, content: bytes) -> None: + self._response = response + self._content = content + + @property + def http_version(self) -> str: + return 'unidentified' + + @property + def status_code(self) -> int: + return self._response.status + + @property + def headers(self) -> HttpHeaders: + return HttpHeaders(dict(self._response.headers)) + + def read(self) -> bytes: + return self._content + + +@docs_group('Classes') +class PlaywrightHttpClient(HttpClient): + """HTTP client based on the Playwright 
library. + + This client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses) + and to manage sessions, proxies, and error handling. + + See the `HttpClient` class for more common information about HTTP clients. + + ### Usage + + ```python + from crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler + from crawlee.http_clients import PlaywrightHttpClient + + http_client = PlaywrightHttpClient() + crawler = HttpCrawler(http_client=http_client) + ``` + """ + + _DEFAULT_HEADER_GENERATOR = HeaderGenerator() + + def __init__( + self, + *, + persist_cookies_per_session: bool = True, + header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR, + **request_context_kwargs: Any, + ) -> None: + """Initialize a new instance. + + Args: + persist_cookies_per_session: Whether to persist cookies per HTTP session. + header_generator: Header generator instance to use for generating common headers. + request_context_kwargs: Additional keyword arguments for Playwright's APIRequestContext. 
+ """ + super().__init__( + persist_cookies_per_session=persist_cookies_per_session, + ) + + self._request_context_kwargs = request_context_kwargs + self._header_generator = header_generator + + self._playwright_context_manager = async_playwright() + self._playwright: Playwright | None = None + + @override + async def crawl( + self, + request: Request, + *, + session: Session | None = None, + proxy_info: ProxyInfo | None = None, + statistics: Statistics | None = None, + ) -> HttpCrawlingResult: + client = await self._get_client(proxy_info, session) + headers = self._combine_headers(request.headers) + + response = await client.fetch( + url_or_request=request.url, + method=request.method.lower(), + headers=dict(headers) if headers else None, + data=request.payload, + ) + + if statistics: + statistics.register_status_code(response.status) + + if self._persist_cookies_per_session and session: + await self._store_cookies_in_session(client, session) + + request.loaded_url = response.url + content = await response.body() + + await client.dispose() + + return HttpCrawlingResult( + http_response=_PlaywrightResponse(response, content=content), + ) + + @override + async def send_request( + self, + url: str, + *, + method: HttpMethod = 'GET', + headers: HttpHeaders | dict[str, str] | None = None, + payload: HttpPayload | None = None, + session: Session | None = None, + proxy_info: ProxyInfo | None = None, + ) -> HttpResponse: + if isinstance(headers, dict) or headers is None: + headers = HttpHeaders(headers or {}) + + # Создаем новый контекст для каждого запроса + client = await self._get_client(proxy_info, session) + headers = self._combine_headers(headers) + + response = await client.fetch( + url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload + ) + + # Обновляем cookie в сессии + if self._persist_cookies_per_session and session: + await self._store_cookies_in_session(client, session) + + content = await response.body() + + 
await client.dispose() + + return _PlaywrightResponse(response, content=content) + + def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders | None: + """Merge default headers with explicit headers for an HTTP request. + + Generate a final set of request headers by combining default headers, a random User-Agent header, + and any explicitly provided headers. + """ + common_headers = self._header_generator.get_common_headers() if self._header_generator else HttpHeaders() + user_agent_header = ( + self._header_generator.get_random_user_agent_header() if self._header_generator else HttpHeaders() + ) + explicit_headers = explicit_headers or HttpHeaders() + headers = common_headers | user_agent_header | explicit_headers + return headers if headers else None + + async def _get_client(self, proxy_info: ProxyInfo | None, session: Session | None) -> APIRequestContext: + """Create a new Playwright APIRequestContext. + + Creates a new context for each request, configured with the appropriate + proxy settings and cookies from the session. 
+ + Args: + proxy_info: The proxy configuration, if any + session: The session object, if any + + Returns: + A newly created Playwright APIRequestContext + """ + kwargs: dict[str, Any] = {} + + if proxy_info: + kwargs['proxy'] = ProxySettings( + server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', + username=proxy_info.username, + password=proxy_info.password, + ) + + if self._persist_cookies_per_session and session and session.cookies: + pw_cookies = session.cookies.get_cookies_as_playwright_format() + if pw_cookies: + kwargs['storage_state'] = {'cookies': pw_cookies, 'origins': []} + + kwargs.update(self._request_context_kwargs) + + if not self._playwright: + raise RuntimeError(f'The {self.__class__.__name__} is not started.') + + return await self._playwright.request.new_context(**kwargs) + + async def _store_cookies_in_session(self, client: APIRequestContext, session: Session) -> None: + """Store cookies from the Playwright request context in the session.""" + storage_state = await client.storage_state() + session.cookies.set_cookies_from_playwright_format(storage_state.get('cookies', [])) + + @override + async def __aenter__(self) -> PlaywrightHttpClient: + self._playwright = await self._playwright_context_manager.__aenter__() + return self + + @override + async def __aexit__( + self, + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, + ) -> None: + await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback) + self._playwright = None + self._playwright_context_manager = async_playwright() diff --git a/src/crawlee/sessions/_cookies.py b/src/crawlee/sessions/_cookies.py index 33bf63d155..c025c577cf 100644 --- a/src/crawlee/sessions/_cookies.py +++ b/src/crawlee/sessions/_cookies.py @@ -162,8 +162,12 @@ def _to_playwright(self, cookie_dict: CookieParam) -> PlaywrightCookieParam: result['httpOnly'] = result.pop('http_only') if 'same_site' in result: 
result['sameSite'] = result.pop('same_site') + else: + result['sameSite'] = 'None' if 'expires' in result: result['expires'] = float(result['expires']) + else: + result['expires'] = -1 return PlaywrightCookieParam(**result) diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py index 8e20bf7f9d..b4cfa05380 100644 --- a/tests/unit/crawlers/_http/test_http_crawler.py +++ b/tests/unit/crawlers/_http/test_http_crawler.py @@ -12,7 +12,7 @@ from crawlee import ConcurrencySettings, Request from crawlee.crawlers import HttpCrawler -from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient +from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, PlaywrightHttpClient from crawlee.sessions import SessionPool if TYPE_CHECKING: @@ -61,11 +61,14 @@ async def crawler_without_retries( params=[ pytest.param('curl', id='curl'), pytest.param('httpx', id='httpx'), + pytest.param('playwright', id='playwright'), ] ) -async def http_client(request: pytest.FixtureRequest) -> CurlImpersonateHttpClient | HttpxHttpClient: +async def http_client(request: pytest.FixtureRequest) -> HttpClient: if request.param == 'curl': return CurlImpersonateHttpClient(http_version=CurlHttpVersion.V1_1) + if request.param == 'playwright': + return PlaywrightHttpClient() return HttpxHttpClient(http2=False) From b81fce59c70115716e292f9c1ab41dc61f63084c Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 18 Apr 2025 15:50:57 +0000 Subject: [PATCH 2/2] fix --- src/crawlee/http_clients/_playwright.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/crawlee/http_clients/_playwright.py b/src/crawlee/http_clients/_playwright.py index 0fffa57040..0bc89deead 100644 --- a/src/crawlee/http_clients/_playwright.py +++ b/src/crawlee/http_clients/_playwright.py @@ -137,7 +137,6 @@ async def send_request( if isinstance(headers, dict) or headers is None: headers = HttpHeaders(headers or {}) - # Создаем новый контекст для 
каждого запроса client = await self._get_client(proxy_info, session) headers = self._combine_headers(headers) @@ -145,7 +144,6 @@ async def send_request( url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload ) - # Обновляем cookie в сессии if self._persist_cookies_per_session and session: await self._store_cookies_in_session(client, session)