Commit 2e39490: "Init"
0 parents
11 files changed: +191 −0 lines
.gitignore

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
*.pyc
.~lock*
.DS_Store
.mypy_cache/
*.egg-info/
.tox/
.coverage
htmlcov/
coverage.xml
build/
dist/
examples/
pip-wheel-metadata/

LICENSE

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
Copyright 2020 Eugenio Lacuesta

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Makefile

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
.PHONY: lint types black clean

lint:
	@python -m flake8 --exclude=.git,venv* scrapy_playwright/*.py tests/*.py

types:
	@mypy --ignore-missing-imports --follow-imports=skip scrapy_playwright/*.py tests/*.py

black:
	@black --check scrapy_playwright tests

clean:
	@find . -name "*.pyc" -delete
	@rm -rf .mypy_cache/ .tox/ build/ dist/ htmlcov/ .coverage coverage.xml

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
[tool.black]
line-length = 99

requirements-dev.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
black>=19.10b0
flake8>=3.7.9
mypy==0.780
tox>=3.14

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
playwright>=0.7.0
scrapy>=2.0

scrapy_playwright/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
__version__ = "0.0.1"

scrapy_playwright/handler.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
import logging
from time import time
from typing import Type, TypeVar

from playwright import AsyncPlaywrightContextManager
from scrapy import Spider, signals
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.reactor import verify_installed_reactor
from twisted.internet.defer import Deferred, inlineCallbacks


logger = logging.getLogger("scrapy-playwright")
PlaywrightHandler = TypeVar("PlaywrightHandler", bound="ScrapyPlaywrightDownloadHandler")


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
    def __init__(self, crawler: Crawler) -> None:
        super().__init__(settings=crawler.settings, crawler=crawler)
        verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        crawler.signals.connect(self._engine_started, signals.engine_started)
        self.stats = crawler.stats

    @classmethod
    def from_crawler(cls: Type[PlaywrightHandler], crawler: Crawler) -> PlaywrightHandler:
        return cls(crawler)

    def _engine_started(self) -> Deferred:
        return deferred_from_coro(self._launch_browser())

    async def _launch_browser(self) -> None:
        self.playwright_context_manager = AsyncPlaywrightContextManager()
        self.playwright = await self.playwright_context_manager.start()
        # FIXME: chromium hard-coded during initial development
        self.browser = await self.playwright.chromium.launch()

    @inlineCallbacks
    def close(self) -> Deferred:
        yield super().close()
        if self.browser:
            yield deferred_from_coro(self.browser.close())
        yield deferred_from_coro(self.playwright_context_manager.__aexit__())

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        if request.meta.get("playwright"):
            return deferred_from_coro(self._download_request_playwright(request, spider))
        return super().download_request(request, spider)

    async def _download_request_playwright(self, request: Request, spider: Spider) -> Response:
        page = await self.browser.newPage()  # type: ignore
        self.stats.inc_value("playwright/page_count")

        start_time = time()
        response = await page.goto(request.url)

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - start_time

        await page.screenshot(path="page.png")  # FIXME: only for development
        await page.close()
        self.stats.inc_value("playwright/page_count/closed")

        headers = Headers(response.headers)
        headers.pop("Content-Encoding", None)
        respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
        return respcls(
            url=page.url,
            status=response.status,
            headers=headers,
            body=body,
            request=request,
            flags=["playwright"],
        )

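Usage sketch (not part of this commit): the handler only takes over requests whose meta includes "playwright", and its __init__ calls verify_installed_reactor, so a project would need Scrapy's asyncio reactor plus a DOWNLOAD_HANDLERS override pointing at this class. The settings and spider below are illustrative assumptions, not files from this commit.

# settings.py (assumed project settings, not part of this commit)
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
# Required by verify_installed_reactor() in the handler's __init__
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# spiders/example.py (hypothetical spider)
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # Only requests flagged with meta={"playwright": True} are routed
        # through _download_request_playwright; others use the default handler.
        yield scrapy.Request("https://example.org", meta={"playwright": True})

    def parse(self, response):
        # The response body is the rendered DOM returned by page.content()
        yield {"url": response.url, "title": response.css("title::text").get()}
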
setup.cfg

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 99

setup.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
import setuptools


with open("README.md", "r") as fh:
    long_description = fh.read()


setuptools.setup(
    name="scrapy-playwright",
    version="0.0.1",
    license="BSD",
    description="Playwright integration for Scrapy",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Eugenio Lacuesta",
    author_email="eugenio.lacuesta@gmail.com",
    url="https://github.com/elacuesta/scrapy-playwright",
    packages=["scrapy_playwright"],
    classifiers=[
        "Development Status :: 1 - Planning",
        "License :: OSI Approved :: BSD License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Framework :: Scrapy",
        "Intended Audience :: Developers",
        "Topic :: Internet :: WWW/HTTP",
        "Topic :: Software Development :: Libraries :: Application Frameworks",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    install_requires=["scrapy>=2.0", "playwright>=0.7.0"],
)

tox.ini

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
[tox]
envlist = py36,py37,py38


[testenv]
deps =
    -rrequirements.txt
    -rtests/requirements.txt
commands =
    py.test --reactor=asyncio --cov=scrapy_playwright --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: scrapy_playwright tests}


[testenv:py37]
basepython = python3.7


[testenv:py38]
basepython = python3.8
