feat(fetch): added playwright support #113

Merged 1 commit on Apr 30, 2024
4 changes: 4 additions & 0 deletions README.md
@@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
```bash
pip install scrapegraphai
```
You will also need to install Playwright for JavaScript-based scraping:
```bash
playwright install
```
## 🔍 Demo
Official Streamlit demo:

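As a quick sanity check that the browsers pulled down by `playwright install` are usable, Playwright's sync API can drive Chromium directly (a minimal sketch; the URL is a placeholder):

```python
from playwright.sync_api import sync_playwright

# Launch headless Chromium once; if this runs cleanly,
# `playwright install` put the browser binaries in place.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())
    browser.close()
```
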
3 changes: 2 additions & 1 deletion examples/mixed_models/smart_scraper_mixed.py
@@ -24,7 +24,8 @@
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
}
},
"headless": False
}

# ************************************************
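
For reference, the new top-level `"headless"` key sits alongside the `llm` and `embeddings` entries in the graph config; a minimal end-to-end sketch (model names, prompt, and URL are placeholders, following the examples in this repo):

```python
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "base_url": "http://localhost:11434",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "base_url": "http://localhost:11434",
    },
    # False opens a visible browser window, handy when debugging a scrape.
    "headless": False,
}

smart_scraper = SmartScraperGraph(
    prompt="List all the article titles on the page",  # placeholder prompt
    source="https://example.com",                      # placeholder URL
    config=graph_config,
)
print(smart_scraper.run())
```
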
1 change: 1 addition & 0 deletions pyproject.toml
@@ -39,6 +39,7 @@ google = "3.0.0"
minify-html = "0.15.0"
free-proxy = "1.1.1"
langchain-groq = "0.1.3"
playwright = "^1.43.0"

[tool.poetry.dev-dependencies]
pytest = "8.0.0"
1 change: 1 addition & 0 deletions requirements.txt
@@ -13,3 +13,4 @@ google==3.0.0
minify-html==0.15.0
free-proxy==1.1.1
langchain-groq==0.1.3
playwright==1.43.0
1 change: 1 addition & 0 deletions scrapegraphai/graphs/search_graph.py
@@ -29,6 +29,7 @@ def _create_graph(self):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
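
The same default-resolution expression is repeated in each graph touched by this PR; pulled out as a standalone helper for illustration (the helper name is hypothetical, not part of the PR):

```python
from typing import Optional

def resolve_headless(config: Optional[dict]) -> bool:
    # Mirrors the inline expression: headless defaults to True
    # when no config is given or the "headless" key is absent.
    if config is None:
        return True
    return config.get("headless", True)

assert resolve_headless(None) is True
assert resolve_headless({}) is True
assert resolve_headless({"headless": False}) is False
```
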
2 changes: 2 additions & 0 deletions scrapegraphai/graphs/smart_scraper_graph.py
@@ -25,13 +25,15 @@ def __init__(self, prompt: str, source: str, config: dict):

self.input_key = "url" if source.startswith("http") else "local_dir"


def _create_graph(self):
"""
Creates the graph of nodes representing the workflow for web scraping.
"""
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
1 change: 1 addition & 0 deletions scrapegraphai/graphs/speech_graph.py
@@ -35,6 +35,7 @@ def _create_graph(self):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
23 changes: 16 additions & 7 deletions scrapegraphai/nodes/fetch_node.py
@@ -2,8 +2,8 @@
Module for fetching the HTML node
"""

from typing import List
from langchain_community.document_loaders import AsyncHtmlLoader
from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
from .base_node import BaseNode
from ..utils.remover import remover
@@ -37,7 +37,7 @@ class FetchNode(BaseNode):
to succeed.
"""

def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
"""
Initializes the FetchNode with a node name and node type.
Arguments:
@@ -46,6 +46,8 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
"""
super().__init__(node_name, "node", input, output, 1)

self.headless = True if node_config is None else node_config.get("headless", True)

def execute(self, state):
"""
Executes the node's logic to fetch HTML content from a specified URL and
@@ -79,14 +81,21 @@ def execute(self, state):

else:
if self.node_config is not None and self.node_config.get("endpoint") is not None:
loader = AsyncHtmlLoader(
source, proxies={"http": self.node_config["endpoint"]})

loader = AsyncChromiumLoader(
[source],
proxies={"http": self.node_config["endpoint"]},
headless=self.headless,
)
else:
loader = AsyncHtmlLoader(source)
loader = AsyncChromiumLoader(
[source],
headless=self.headless,
)

document = loader.load()
compressed_document = [
Document(page_content=remover(str(document)))]
Document(page_content=remover(str(document[0].page_content)))]

state.update({self.output[0]: compressed_document})
return state
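
Net effect of the loader swap: `AsyncHtmlLoader` fetched raw HTML over HTTP, while `AsyncChromiumLoader` drives Chromium through Playwright, so JavaScript-rendered pages come back with their content in place. A standalone sketch of the new call, assuming the `headless` keyword shown in this diff and a placeholder URL:

```python
from langchain_community.document_loaders import AsyncChromiumLoader

# Renders the page in headless Chromium before returning a Document.
loader = AsyncChromiumLoader(["https://example.com"], headless=True)
docs = loader.load()
print(docs[0].page_content[:200])
```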