feat: add playwright-based HttpClient #1128

Closed · wants to merge 2 commits
1 change: 1 addition & 0 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -615,6 +615,7 @@ async def _run_crawler(self) -> None:
event_manager,
self._snapshotter,
self._statistics,
self._http_client,
self._session_pool if self._use_session_pool else None,
*self._additional_context_managers,
)
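The one-line change above registers the HTTP client as one of the crawler's async context managers, so it is started and shut down together with the event manager, snapshotter, statistics, and session pool. A minimal sketch of how such a composition typically works; crawlee's actual helper is not shown in this diff, so the `AsyncExitStack` approach below is an assumption for illustration:

```python
# Sketch only: assumes the crawler enters its components with an
# AsyncExitStack-like mechanism; the real helper is outside this diff.
from contextlib import AsyncExitStack


async def run_with_components(*context_managers: object) -> None:
    async with AsyncExitStack() as stack:
        for cm in context_managers:
            # Components (event manager, snapshotter, statistics, the HTTP
            # client, session pool, ...) are entered in order and exited
            # in reverse order when the crawl finishes or fails.
            await stack.enter_async_context(cm)
```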
3 changes: 3 additions & 0 deletions src/crawlee/http_clients/__init__.py
@@ -12,11 +12,14 @@
with _try_import(__name__, 'CurlImpersonateHttpClient'):
from ._curl_impersonate import CurlImpersonateHttpClient

with _try_import(__name__, 'PlaywrightHttpClient'):
from ._playwright import PlaywrightHttpClient

__all__ = [
'CurlImpersonateHttpClient',
'HttpClient',
'HttpCrawlingResult',
'HttpResponse',
'HttpxHttpClient',
'PlaywrightHttpClient',
]
15 changes: 15 additions & 0 deletions src/crawlee/http_clients/_base.py
@@ -7,6 +7,8 @@
from crawlee._utils.docs import docs_group

if TYPE_CHECKING:
from types import TracebackType

from crawlee import Request
from crawlee._types import HttpHeaders, HttpMethod, HttpPayload
from crawlee.proxy_configuration import ProxyInfo
@@ -119,3 +121,16 @@ async def send_request(
Returns:
The HTTP response received from the server.
"""

@abstractmethod
async def __aenter__(self) -> HttpClient:
"""Make."""

@abstractmethod
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
exc_traceback: TracebackType | None,
) -> None:
"""Make."""
16 changes: 16 additions & 0 deletions src/crawlee/http_clients/_curl_impersonate.py
@@ -20,6 +20,7 @@

if TYPE_CHECKING:
from http.cookiejar import Cookie
from types import TracebackType

from curl_cffi import Curl
from curl_cffi.requests import Request as CurlRequest
@@ -245,3 +246,18 @@ def _get_cookies(curl: Curl) -> list[Cookie]:
cookie = curl_morsel.to_cookiejar_cookie()
cookies.append(cookie)
return cookies

@override
async def __aenter__(self) -> CurlImpersonateHttpClient:
return self

@override
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
exc_traceback: TracebackType | None,
) -> None:
for client in self._client_by_proxy_url.values():
await client.close()
self._client_by_proxy_url.clear()
16 changes: 16 additions & 0 deletions src/crawlee/http_clients/_httpx.py
@@ -15,6 +15,7 @@

if TYPE_CHECKING:
from ssl import SSLContext
from types import TracebackType

from crawlee import Request
from crawlee._types import HttpMethod, HttpPayload
@@ -262,3 +263,18 @@ def _is_proxy_error(error: httpx.TransportError) -> bool:
return True

return False

@override
async def __aenter__(self) -> HttpxHttpClient:
return self

@override
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
exc_traceback: TracebackType | None,
) -> None:
for client in self._client_by_proxy_url.values():
await client.aclose()
self._client_by_proxy_url.clear()
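Both concrete clients follow the same shape: `__aenter__` returns `self`, and `__aexit__` closes every cached per-proxy client and clears the cache. For standalone use outside a crawler, this presumably means wrapping calls in `async with` (sketch; the URL is a placeholder):

```python
import asyncio

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # On exit, __aexit__ closes all per-proxy clients and clears the cache.
    async with HttpxHttpClient() as client:
        response = await client.send_request('https://example.com')  # placeholder URL
        print(response.status_code)


asyncio.run(main())
```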
223 changes: 223 additions & 0 deletions src/crawlee/http_clients/_playwright.py
@@ -0,0 +1,223 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

from playwright.async_api import APIRequestContext, APIResponse, Playwright, ProxySettings, async_playwright
from typing_extensions import override

from crawlee._types import HttpHeaders
from crawlee._utils.docs import docs_group
from crawlee.fingerprint_suite import HeaderGenerator
from crawlee.http_clients import HttpClient, HttpCrawlingResult, HttpResponse

if TYPE_CHECKING:
from types import TracebackType

from crawlee import Request
from crawlee._types import HttpMethod, HttpPayload
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
from crawlee.statistics import Statistics


class _PlaywrightResponse:
"""Adapter class for `playwright.APIResponse` to conform to the `HttpResponse` protocol."""

def __init__(self, response: APIResponse, content: bytes) -> None:
self._response = response
self._content = content

@property
def http_version(self) -> str:
return 'unidentified'

@property
def status_code(self) -> int:
return self._response.status

@property
def headers(self) -> HttpHeaders:
return HttpHeaders(dict(self._response.headers))

def read(self) -> bytes:
return self._content


@docs_group('Classes')
class PlaywrightHttpClient(HttpClient):
"""HTTP client based on the Playwright library.

This client uses the Playwright library to perform HTTP requests in crawlers (`BasicCrawler` subclasses)
and to manage sessions, proxies, and error handling.

See the `HttpClient` class for information common to all HTTP clients.

### Usage

```python
from crawlee.crawlers import HttpCrawler # or any other HTTP client-based crawler
from crawlee.http_clients import PlaywrightHttpClient

http_client = PlaywrightHttpClient()
crawler = HttpCrawler(http_client=http_client)
```
"""

_DEFAULT_HEADER_GENERATOR = HeaderGenerator()

def __init__(
self,
*,
persist_cookies_per_session: bool = True,
header_generator: HeaderGenerator | None = _DEFAULT_HEADER_GENERATOR,
**request_context_kwargs: Any,
) -> None:
"""Initialize a new instance.

Args:
persist_cookies_per_session: Whether to persist cookies per HTTP session.
header_generator: Header generator instance to use for generating common headers.
request_context_kwargs: Additional keyword arguments for Playwright's APIRequestContext.
"""
super().__init__(
persist_cookies_per_session=persist_cookies_per_session,
)

self._request_context_kwargs = request_context_kwargs
self._header_generator = header_generator

self._playwright_context_manager = async_playwright()
self._playwright: Playwright | None = None

@override
async def crawl(
self,
request: Request,
*,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
statistics: Statistics | None = None,
) -> HttpCrawlingResult:
client = await self._get_client(proxy_info, session)
headers = self._combine_headers(request.headers)

response = await client.fetch(
url_or_request=request.url,
method=request.method.lower(),
headers=dict(headers) if headers else None,
data=request.payload,
)

if statistics:
statistics.register_status_code(response.status)

if self._persist_cookies_per_session and session:
await self._store_cookies_in_session(client, session)

request.loaded_url = response.url
content = await response.body()

await client.dispose()

return HttpCrawlingResult(
http_response=_PlaywrightResponse(response, content=content),
)

@override
async def send_request(
self,
url: str,
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | dict[str, str] | None = None,
payload: HttpPayload | None = None,
session: Session | None = None,
proxy_info: ProxyInfo | None = None,
) -> HttpResponse:
if isinstance(headers, dict) or headers is None:
headers = HttpHeaders(headers or {})

client = await self._get_client(proxy_info, session)
headers = self._combine_headers(headers)

response = await client.fetch(
url_or_request=url, method=method.lower(), headers=dict(headers) if headers else None, data=payload
)

if self._persist_cookies_per_session and session:
await self._store_cookies_in_session(client, session)

content = await response.body()

await client.dispose()

return _PlaywrightResponse(response, content=content)

def _combine_headers(self, explicit_headers: HttpHeaders | None) -> HttpHeaders | None:
"""Merge default headers with explicit headers for an HTTP request.

Generate a final set of request headers by combining default headers, a random User-Agent header,
and any explicitly provided headers.
"""
common_headers = self._header_generator.get_common_headers() if self._header_generator else HttpHeaders()
user_agent_header = (
self._header_generator.get_random_user_agent_header() if self._header_generator else HttpHeaders()
)
explicit_headers = explicit_headers or HttpHeaders()
headers = common_headers | user_agent_header | explicit_headers
return headers if headers else None

async def _get_client(self, proxy_info: ProxyInfo | None, session: Session | None) -> APIRequestContext:
"""Create a new Playwright APIRequestContext.

Creates a new context for each request, configured with the appropriate
proxy settings and cookies from the session.

Args:
proxy_info: The proxy configuration, if any.
session: The session object, if any.

Returns:
A newly created Playwright APIRequestContext.
"""
kwargs: dict[str, Any] = {}

if proxy_info:
kwargs['proxy'] = ProxySettings(
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
username=proxy_info.username,
password=proxy_info.password,
)

if self._persist_cookies_per_session and session and session.cookies:
pw_cookies = session.cookies.get_cookies_as_playwright_format()
if pw_cookies:
kwargs['storage_state'] = {'cookies': pw_cookies, 'origins': []}

kwargs.update(self._request_context_kwargs)

if not self._playwright:
raise RuntimeError(f'The {self.__class__.__name__} is not started.')

return await self._playwright.request.new_context(**kwargs)

async def _store_cookies_in_session(self, client: APIRequestContext, session: Session) -> None:
"""Store cookies from the Playwright request context in the session."""
storage_state = await client.storage_state()
session.cookies.set_cookies_from_playwright_format(storage_state.get('cookies', []))

@override
async def __aenter__(self) -> PlaywrightHttpClient:
self._playwright = await self._playwright_context_manager.__aenter__()
return self

@override
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
exc_traceback: TracebackType | None,
) -> None:
await self._playwright_context_manager.__aexit__(exc_type, exc_value, exc_traceback)
self._playwright = None
self._playwright_context_manager = async_playwright()
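`_get_client` raises a `RuntimeError` when `self._playwright` is unset, so the client is only usable inside its async context: `__aenter__` starts Playwright, and `__aexit__` tears it down and resets the context manager for potential reuse. A standalone usage sketch (placeholder URL):

```python
import asyncio

from crawlee.http_clients import PlaywrightHttpClient


async def main() -> None:
    # Outside the `async with` block, _get_client would raise
    # "The PlaywrightHttpClient is not started."
    async with PlaywrightHttpClient() as client:
        response = await client.send_request('https://example.com')  # placeholder URL
        # The adapter always reports http_version as 'unidentified'.
        print(response.status_code, response.http_version)


asyncio.run(main())
```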
4 changes: 4 additions & 0 deletions src/crawlee/sessions/_cookies.py
@@ -162,8 +162,12 @@ def _to_playwright(self, cookie_dict: CookieParam) -> PlaywrightCookieParam:
result['httpOnly'] = result.pop('http_only')
if 'same_site' in result:
result['sameSite'] = result.pop('same_site')
else:
result['sameSite'] = 'None'
if 'expires' in result:
result['expires'] = float(result['expires'])
else:
result['expires'] = -1

return PlaywrightCookieParam(**result)

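The new `else` branches give Playwright explicit defaults when a cookie lacks these fields: `sameSite` falls back to `'None'`, and `expires` falls back to `-1`, which Playwright treats as a session cookie. A standalone restatement of the same defaulting logic (the real code is the private `_to_playwright` helper above, which operates on `CookieParam`/`PlaywrightCookieParam` types):

```python
# Illustrative re-statement of the defaulting added above; not the
# actual helper.
def to_playwright_cookie(cookie: dict) -> dict:
    result = dict(cookie)
    if 'http_only' in result:
        result['httpOnly'] = result.pop('http_only')
    if 'same_site' in result:
        result['sameSite'] = result.pop('same_site')
    else:
        result['sameSite'] = 'None'  # default added by this PR
    if 'expires' in result:
        result['expires'] = float(result['expires'])
    else:
        result['expires'] = -1  # -1 marks a session cookie for Playwright
    return result
```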
7 changes: 5 additions & 2 deletions tests/unit/crawlers/_http/test_http_crawler.py
@@ -12,7 +12,7 @@

from crawlee import ConcurrencySettings, Request
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient
from crawlee.http_clients import CurlImpersonateHttpClient, HttpxHttpClient, PlaywrightHttpClient
from crawlee.sessions import SessionPool

if TYPE_CHECKING:
@@ -61,11 +61,14 @@ async def crawler_without_retries(
params=[
pytest.param('curl', id='curl'),
pytest.param('httpx', id='httpx'),
pytest.param('playwright', id='playwright'),
]
)
async def http_client(request: pytest.FixtureRequest) -> CurlImpersonateHttpClient | HttpxHttpClient:
async def http_client(request: pytest.FixtureRequest) -> HttpClient:
if request.param == 'curl':
return CurlImpersonateHttpClient(http_version=CurlHttpVersion.V1_1)
if request.param == 'playwright':
return PlaywrightHttpClient()
return HttpxHttpClient(http2=False)

