cloud_run_example.py
# mypy: disable-error-code="misc"
import json
import os

import uvicorn
from litestar import Litestar, get

from crawlee import service_locator
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

# highlight-start
# Disable writing storage data to the file system
configuration = service_locator.get_configuration()
configuration.persist_storage = False
configuration.write_metadata = False
# highlight-end


@get('/')
async def main() -> str:
    """The crawler entry point that will be called when the HTTP endpoint is accessed."""
    crawler = PlaywrightCrawler(
        headless=True,
        max_requests_per_crawl=10,
        browser_type='firefox',
    )

    @crawler.router.default_handler
    async def default_handler(context: PlaywrightCrawlingContext) -> None:
        """Default request handler that processes each page during crawling."""
        context.log.info(f'Processing {context.request.url} ...')
        title = await context.page.query_selector('title')
        await context.push_data(
            {
                'url': context.request.loaded_url,
                'title': await title.inner_text() if title else None,
            }
        )
        await context.enqueue_links()

    await crawler.run(['https://crawlee.dev'])
    data = await crawler.get_data()

    # Return the results as JSON to the client
    return json.dumps(data.items)


# Initialize the Litestar app with our route handler
app = Litestar(route_handlers=[main])

# Start the Uvicorn server using the `PORT` environment variable provided by GCP
# This is crucial - Cloud Run expects your app to listen on this specific port
uvicorn.run(app, host='0.0.0.0', port=int(os.environ.get('PORT', '8080')))  # noqa: S104 # Use all interfaces in a container, safely
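
# A minimal sketch of how the endpoint could be exercised once the server is
# running locally (the httpx client, the localhost URL, and the default port
# 8080 are assumptions for local testing, not part of the example above):
#
#   import httpx
#
#   response = httpx.get('http://localhost:8080/')
#   items = json.loads(response.text)  # the handler returns a JSON string
#   print(items)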