From 42ab0aa1d275b5798ab6fc9feea575fe59b6e767 Mon Sep 17 00:00:00 2001
From: "EURAC\\marperini"
Date: Tue, 30 Apr 2024 04:02:58 +0200
Subject: [PATCH] feat(fetch): add Playwright support

---
 README.md                                    |  4 ++++
 examples/mixed_models/smart_scraper_mixed.py |  3 ++-
 pyproject.toml                               |  1 +
 requirements.txt                             |  1 +
 scrapegraphai/graphs/search_graph.py         |  1 +
 scrapegraphai/graphs/smart_scraper_graph.py  |  2 ++
 scrapegraphai/graphs/speech_graph.py         |  1 +
 scrapegraphai/nodes/fetch_node.py            | 23 ++++++++++++++------
 8 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 5618e572..7ceef975 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of pypy:
 ```bash
 pip install scrapegraphai
 ```
+You will also need to install Playwright for JavaScript-based scraping:
+```bash
+playwright install
+```
 
 ## 🔍 Demo
 Official streamlit demo:
diff --git a/examples/mixed_models/smart_scraper_mixed.py b/examples/mixed_models/smart_scraper_mixed.py
index 33ad3b91..6adb61b5 100644
--- a/examples/mixed_models/smart_scraper_mixed.py
+++ b/examples/mixed_models/smart_scraper_mixed.py
@@ -24,7 +24,8 @@
         "model": "ollama/nomic-embed-text",
         "temperature": 0,
         "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
-    }
+    },
+    "headless": False
 }
 
 # ************************************************
diff --git a/pyproject.toml b/pyproject.toml
index 135672f8..ed8842b2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,7 @@ google = "3.0.0"
 minify-html = "0.15.0"
 free-proxy = "1.1.1"
 langchain-groq = "0.1.3"
+playwright = "^1.43.0"
 
 [tool.poetry.dev-dependencies]
 pytest = "8.0.0"
diff --git a/requirements.txt b/requirements.txt
index 6f17b594..02aadac4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ google==3.0.0
 minify-html==0.15.0
 free-proxy==1.1.1
 langchain-groq==0.1.3
+playwright==1.43.0
\ No newline at end of file
diff --git a/scrapegraphai/graphs/search_graph.py b/scrapegraphai/graphs/search_graph.py
index b48965dd..5ae92ec2 100644
--- a/scrapegraphai/graphs/search_graph.py
+++ b/scrapegraphai/graphs/search_graph.py
@@ -29,6 +29,7 @@ def _create_graph(self):
         fetch_node = FetchNode(
             input="url | local_dir",
             output=["doc"],
+            node_config={"headless": True if self.config is None else self.config.get("headless", True)}
         )
         parse_node = ParseNode(
             input="doc",
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
index 5a520224..ae5a9794 100644
--- a/scrapegraphai/graphs/smart_scraper_graph.py
+++ b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -25,6 +25,7 @@ def __init__(self, prompt: str, source: str, config: dict):
 
         self.input_key = "url" if source.startswith("http") else "local_dir"
 
+
     def _create_graph(self):
         """
         Creates the graph of nodes representing the workflow for web scraping.
@@ -32,6 +33,7 @@ def _create_graph(self):
         fetch_node = FetchNode(
             input="url | local_dir",
             output=["doc"],
+            node_config={"headless": True if self.config is None else self.config.get("headless", True)}
         )
         parse_node = ParseNode(
             input="doc",
diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py
index 2b10077f..96d61a8b 100644
--- a/scrapegraphai/graphs/speech_graph.py
+++ b/scrapegraphai/graphs/speech_graph.py
@@ -35,6 +35,7 @@ def _create_graph(self):
         fetch_node = FetchNode(
             input="url | local_dir",
             output=["doc"],
+            node_config={"headless": True if self.config is None else self.config.get("headless", True)}
         )
         parse_node = ParseNode(
             input="doc",
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
index 76d80929..b22e02c5 100644
--- a/scrapegraphai/nodes/fetch_node.py
+++ b/scrapegraphai/nodes/fetch_node.py
@@ -2,8 +2,8 @@
 Module for fetching the HTML node
 """
 
-from typing import List
-from langchain_community.document_loaders import AsyncHtmlLoader
+from typing import List, Optional
+from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
 from .base_node import BaseNode
 from ..utils.remover import remover
@@ -37,7 +37,7 @@ class FetchNode(BaseNode):
     to succeed.
     """
 
-    def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
+    def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
         """
         Initializes the FetchHTMLNode with a node name and node type.
         Arguments:
@@ -46,6 +46,8 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
         """
         super().__init__(node_name, "node", input, output, 1)
 
+        self.headless = True if node_config is None else node_config.get("headless", True)
+
     def execute(self, state):
         """
         Executes the node's logic to fetch HTML content from a specified URL and
@@ -79,14 +81,21 @@ def execute(self, state):
 
         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:
-                loader = AsyncHtmlLoader(
-                    source, proxies={"http": self.node_config["endpoint"]})
+
+                loader = AsyncChromiumLoader(
+                    [source],
+                    proxies={"http": self.node_config["endpoint"]},
+                    headless=self.headless,
+                )
             else:
-                loader = AsyncHtmlLoader(source)
+                loader = AsyncChromiumLoader(
+                    [source],
+                    headless=self.headless,
+                )
 
             document = loader.load()
             compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=remover(str(document[0].page_content)))]
 
         state.update({self.output[0]: compressed_document})
         return state
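
Note for reviewers: below is a minimal usage sketch of the new `headless` option, assuming only the `SmartScraperGraph` constructor signature shown in this patch (`prompt`, `source`, `config`). The prompt, URL, and LLM settings are illustrative placeholders, not part of the change.

```python
# Sketch: exercising the "headless" flag introduced by this patch.
# Prerequisites: pip install scrapegraphai && playwright install
from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral",             # placeholder model config
        "base_url": "http://localhost:11434",  # assumes a local Ollama server
    },
    # New in this patch: read by the graph, forwarded to FetchNode via
    # node_config, and finally passed to AsyncChromiumLoader. Defaults to
    # True; set False to watch the Chromium window while the page is fetched.
    "headless": False,
}

smart_scraper = SmartScraperGraph(
    prompt="List me all the titles on the page",  # illustrative prompt
    source="https://example.com",                 # illustrative URL
    config=graph_config,
)
print(smart_scraper.run())
```

Because the option defaults to `True` at every level (graph config, `FetchNode`, loader), callers that never set `headless` get a headless Chromium fetch with no config changes.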