Description
Hello, I'm experiencing performance issues with my web crawler after approximately 1.5 to 2 hours of runtime. Crawling slows to about one page per minute or less, and I'm encountering numerous timeout errors.
Questions:
- What could be causing the performance degradation over time (maybe related to the queue size)?
- Is this behavior expected?
Here is the code I use:
import asyncio

from crawlee.beautifulsoup_crawler import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        url = context.request.url
        context.log.info(f"Processing page: {url}")
        await context.enqueue_links(strategy="all")

    # Start the crawler with the provided URLs
    await crawler.run(["https://crawlee.dev/"])


if __name__ == "__main__":
    asyncio.run(main())
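I suspect the unbounded queue: with strategy="all", enqueue_links follows links to any domain, so the request queue only ever grows. A variant I'm considering caps concurrency and scopes the crawl (a sketch only; ConcurrencySettings, max_concurrency, and max_requests_per_crawl reflect my reading of the current Crawlee docs, and import paths may differ between versions):

import asyncio

from crawlee import ConcurrencySettings
from crawlee.beautifulsoup_crawler import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Cap the autoscaled pool so concurrency cannot climb into the hundreds
        # (assumed parameter names; check your installed Crawlee version).
        concurrency_settings=ConcurrencySettings(max_concurrency=50),
        # Stop after a fixed number of requests instead of crawling the open web.
        max_requests_per_crawl=10_000,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f"Processing page: {context.request.url}")
        # "same-domain" keeps the queue bounded; "all" follows every external link.
        await context.enqueue_links(strategy="same-domain")

    await crawler.run(["https://crawlee.dev/"])


if __name__ == "__main__":
    asyncio.run(main())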
The logs and errors:
[crawlee.beautifulsoup_crawler.beautifulsoup_crawler] ERROR An exception occurred during handling of a request. This places the crawler and its underlying storages into an unknown state and crawling will be terminated.
Traceback (most recent call last):
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/basic_crawler/basic_crawler.py", line 872, in __run_request_handler
    await self._context_pipeline(crawling_context, self.router)
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/basic_crawler/context_pipeline.py", line 62, in __call__
    result = await middleware_instance.__anext__()
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py", line 69, in _make_http_request
    result = await self._http_client.crawl(
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/http_clients/httpx_client.py", line 98, in crawl
    response = await client.send(http_request, follow_redirects=True)
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_client.py", line 1675, in send
    raise exc
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_client.py", line 1669, in send
    await response.aread()
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_models.py", line 911, in aread
    self._content = b"".join([part async for part in self.aiter_bytes()])
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_models.py", line 911, in <listcomp>
    self._content = b"".join([part async for part in self.aiter_bytes()])
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_models.py", line 929, in aiter_bytes
    async for raw_bytes in self.aiter_raw():
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_models.py", line 987, in aiter_raw
    async for raw_stream_bytes in self.stream:
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_client.py", line 149, in __aiter__
    async for chunk in self._stream:
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpx/_transports/default.py", line 254, in __aiter__
    async for part in self._httpcore_stream:
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py", line 367, in __aiter__
    raise exc from None
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py", line 363, in __aiter__
    async for part in self._stream:
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/http11.py", line 349, in __aiter__
    raise exc
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/http11.py", line 341, in __aiter__
    async for chunk in self._connection._receive_response_body(**kwargs):
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/http11.py", line 210, in _receive_response_body
    event = await self._receive_event(timeout=timeout)
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_async/http11.py", line 224, in _receive_event
    data = await self._network_stream.read(
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/httpcore/_backends/anyio.py", line 35, in read
    return await self._stream.receive(max_bytes=max_bytes)
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/anyio/streams/tls.py", line 205, in receive
    data = await self._call_sslobject_method(self._ssl_object.read, max_bytes)
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/anyio/streams/tls.py", line 147, in _call_sslobject_method
    data = await self.transport_stream.receive()
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 1145, in receive
    await AsyncIOBackend.checkpoint()
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2050, in checkpoint
    await sleep(0)
  File "/home/debian/.pyenv/versions/3.10.4/lib/python3.10/asyncio/tasks.py", line 596, in sleep
    await __sleep0()
  File "/home/debian/.pyenv/versions/3.10.4/lib/python3.10/asyncio/tasks.py", line 590, in __sleep0
    yield
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/debian/.pyenv/versions/3.10.4/lib/python3.10/asyncio/tasks.py", line 456, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/_utils/wait.py", line 37, in wait_for
    return await asyncio.wait_for(operation(), timeout.total_seconds())
  File "/home/debian/.pyenv/versions/3.10.4/lib/python3.10/asyncio/tasks.py", line 458, in wait_for
    raise exceptions.TimeoutError() from exc
asyncio.exceptions.TimeoutError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/basic_crawler/basic_crawler.py", line 782, in __run_task_function
    await wait_for(
  File "/home/debian/Crawler/venv/lib/python3.10/site-packages/crawlee/_utils/wait.py", line 39, in wait_for
    raise asyncio.TimeoutError(timeout_message) from ex
asyncio.exceptions.TimeoutError: Request handler timed out after 60.0 seconds
[crawlee.autoscaling.autoscaled_pool] WARN Task timed out after *not set* seconds
[crawlee.statistics.statistics] INFO crawlee.beautifulsoup_crawler.beautifulsoup_crawler request statistics {
  "requests_finished": 5622,
  "requests_failed": 1,
  "retry_histogram": [
    5622,
    0,
    1
  ],
  "request_avg_failed_duration": 0.859078,
  "request_avg_finished_duration": 50.458161,
  "requests_finished_per_minute": 127,
  "requests_failed_per_minute": 0,
  "request_total_duration": 283676.640522,
  "requests_total": 5623,
  "crawler_runtime": 2646.37724
}
[crawlee.autoscaling.autoscaled_pool] INFO current_concurrency = 197; desired_concurrency = 161; cpu = 0.0; mem = 0.0; ev
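The 60.0 seconds in the final error looks like the default request handler timeout. As a stopgap I could raise it, though I assume that only hides the slowdown rather than fixing it (a sketch; passing request_handler_timeout as a timedelta is my reading of the BasicCrawler signature and may differ across versions):

from datetime import timedelta

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

# Allow each request handler up to 3 minutes before Crawlee cancels it
# (assumed keyword; the default appears to be 60 seconds per the error above).
crawler = BeautifulSoupCrawler(
    request_handler_timeout=timedelta(minutes=3),
)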