Skip to content

Commit e494455

Browse files
authored
Merge pull request #113 from VinciGit00/playwright
2 parents 450291f + 42ab0aa commit e494455

File tree

8 files changed

+28
-8
lines changed

8 files changed

+28
-8
lines changed

README.md

+4
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
2323
```bash
2424
pip install scrapegraphai
2525
```
26+
You will also need to install Playwright for JavaScript-based scraping:
27+
```bash
28+
playwright install
29+
```
2630
## 🔍 Demo
2731
Official streamlit demo:
2832

examples/mixed_models/smart_scraper_mixed.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
"model": "ollama/nomic-embed-text",
2525
"temperature": 0,
2626
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
27-
}
27+
},
28+
"headless": False
2829
}
2930

3031
# ************************************************

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ google = "3.0.0"
3939
minify-html = "0.15.0"
4040
free-proxy = "1.1.1"
4141
langchain-groq = "0.1.3"
42+
playwright = "^1.43.0"
4243

4344
[tool.poetry.dev-dependencies]
4445
pytest = "8.0.0"

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ google==3.0.0
1313
minify-html==0.15.0
1414
free-proxy==1.1.1
1515
langchain-groq==0.1.3
16+
playwright==1.43.0

scrapegraphai/graphs/search_graph.py

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def _create_graph(self):
2929
fetch_node = FetchNode(
3030
input="url | local_dir",
3131
output=["doc"],
32+
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
3233
)
3334
parse_node = ParseNode(
3435
input="doc",

scrapegraphai/graphs/smart_scraper_graph.py

+2
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,15 @@ def __init__(self, prompt: str, source: str, config: dict):
2525

2626
self.input_key = "url" if source.startswith("http") else "local_dir"
2727

28+
2829
def _create_graph(self):
2930
"""
3031
Creates the graph of nodes representing the workflow for web scraping.
3132
"""
3233
fetch_node = FetchNode(
3334
input="url | local_dir",
3435
output=["doc"],
36+
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
3537
)
3638
parse_node = ParseNode(
3739
input="doc",

scrapegraphai/graphs/speech_graph.py

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def _create_graph(self):
3535
fetch_node = FetchNode(
3636
input="url | local_dir",
3737
output=["doc"],
38+
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
3839
)
3940
parse_node = ParseNode(
4041
input="doc",

scrapegraphai/nodes/fetch_node.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
Module for fetching the HTML node
33
"""
44

5-
from typing import List
6-
from langchain_community.document_loaders import AsyncHtmlLoader
5+
from typing import List, Optional
6+
from langchain_community.document_loaders import AsyncChromiumLoader
77
from langchain_core.documents import Document
88
from .base_node import BaseNode
99
from ..utils.remover import remover
@@ -37,7 +37,7 @@ class FetchNode(BaseNode):
3737
to succeed.
3838
"""
3939

40-
def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
40+
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
4141
"""
4242
Initializes the FetchHTMLNode with a node name and node type.
4343
Arguments:
@@ -46,6 +46,8 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
4646
"""
4747
super().__init__(node_name, "node", input, output, 1)
4848

49+
self.headless = True if node_config is None else node_config.get("headless", True)
50+
4951
def execute(self, state):
5052
"""
5153
Executes the node's logic to fetch HTML content from a specified URL and
@@ -79,14 +81,21 @@ def execute(self, state):
7981

8082
else:
8183
if self.node_config is not None and self.node_config.get("endpoint") is not None:
82-
loader = AsyncHtmlLoader(
83-
source, proxies={"http": self.node_config["endpoint"]})
84+
85+
loader = AsyncChromiumLoader(
86+
[source],
87+
proxies={"http": self.node_config["endpoint"]},
88+
headless=self.headless,
89+
)
8490
else:
85-
loader = AsyncHtmlLoader(source)
91+
loader = AsyncChromiumLoader(
92+
[source],
93+
headless=self.headless,
94+
)
8695

8796
document = loader.load()
8897
compressed_document = [
89-
Document(page_content=remover(str(document)))]
98+
Document(page_content=remover(str(document[0].page_content)))]
9099

91100
state.update({self.output[0]: compressed_document})
92101
return state

0 commit comments

Comments
 (0)