Skip to content

Commit e494455

Browse files
authored
Merge pull request #113 from VinciGit00/playwright
2 parents 450291f + 42ab0aa commit e494455

File tree

8 files changed

+28
-8
lines changed

8 files changed

+28
-8
lines changed

README.md

+4
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
2323
```bash
2424
pip install scrapegraphai
2525
```
26+
You will also need to install Playwright for JavaScript-based scraping:
27+
```bash
28+
playwright install
29+
```
2630
## 🔍 Demo
2731
Official streamlit demo:
2832

examples/mixed_models/smart_scraper_mixed.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
"model": "ollama/nomic-embed-text",
2525
"temperature": 0,
2626
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
27-
}
27+
},
28+
"headless": False
2829
}
2930

3031
# ************************************************

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ google = "3.0.0"
3939
minify-html = "0.15.0"
4040
free-proxy = "1.1.1"
4141
langchain-groq = "0.1.3"
42+
playwright = "^1.43.0"
4243

4344
[tool.poetry.dev-dependencies]
4445
pytest = "8.0.0"

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ google==3.0.0
1313
minify-html==0.15.0
1414
free-proxy==1.1.1
1515
langchain-groq==0.1.3
16+
playwright==1.43.0

scrapegraphai/graphs/search_graph.py

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def _create_graph(self):
2929
fetch_node = FetchNode(
3030
input="url | local_dir",
3131
output=["doc"],
32+
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
3233
)
3334
parse_node = ParseNode(
3435
input="doc",

scrapegraphai/graphs/smart_scraper_graph.py

+2
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,15 @@ def __init__(self, prompt: str, source: str, config: dict):
2525

2626
self.input_key = "url" if source.startswith("http") else "local_dir"
2727

28+
2829
def _create_graph(self):
2930
"""
3031
Creates the graph of nodes representing the workflow for web scraping.
3132
"""
3233
fetch_node = FetchNode(
3334
input="url | local_dir",
3435
output=["doc"],
36+
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
3537
)
3638
parse_node = ParseNode(
3739
input="doc",

scrapegraphai/graphs/speech_graph.py

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ def _create_graph(self):
3535
fetch_node = FetchNode(
3636
input="url | local_dir",
3737
output=["doc"],
38+
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
3839
)
3940
parse_node = ParseNode(
4041
input="doc",

scrapegraphai/nodes/fetch_node.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
Module for fetching the HTML node
33
"""
44

5-
from typing import List
6-
from langchain_community.document_loaders import AsyncHtmlLoader
5+
from typing import List, Optional
6+
from langchain_community.document_loaders import AsyncChromiumLoader
77
from langchain_core.documents import Document
88
from .base_node import BaseNode
99
from ..utils.remover import remover
@@ -37,7 +37,7 @@ class FetchNode(BaseNode):
3737
to succeed.
3838
"""
3939

40-
def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
40+
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
4141
"""
4242
Initializes the FetchHTMLNode with a node name and node type.
4343
Arguments:
@@ -46,6 +46,8 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
4646
"""
4747
super().__init__(node_name, "node", input, output, 1)
4848

49+
self.headless = True if node_config is None else node_config.get("headless", True)
50+
4951
def execute(self, state):
5052
"""
5153
Executes the node's logic to fetch HTML content from a specified URL and
@@ -79,14 +81,21 @@ def execute(self, state):
7981

8082
else:
8183
if self.node_config is not None and self.node_config.get("endpoint") is not None:
82-
loader = AsyncHtmlLoader(
83-
source, proxies={"http": self.node_config["endpoint"]})
84+
85+
loader = AsyncChromiumLoader(
86+
[source],
87+
proxies={"http": self.node_config["endpoint"]},
88+
headless=self.headless,
89+
)
8490
else:
85-
loader = AsyncHtmlLoader(source)
91+
loader = AsyncChromiumLoader(
92+
[source],
93+
headless=self.headless,
94+
)
8695

8796
document = loader.load()
8897
compressed_document = [
89-
Document(page_content=remover(str(document)))]
98+
Document(page_content=remover(str(document[0].page_content)))]
9099

91100
state.update({self.output[0]: compressed_document})
92101
return state

0 commit comments

Comments
 (0)