Merge pull request #64 from VinciGit00/generateScraperbranch

PeriniM · web-flow · commit edc439fd0144 · 2024-04-17T11:51:06.000+02:00
add generateScraperBranch
diff --git a/examples/gemini/script_generator_gemini.py b/examples/gemini/script_generator_gemini.py
@@ -0,0 +1,45 @@
+""" 
+Basic example of a scraping pipeline using ScriptCreatorGraph
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+gemini_key = os.getenv("GOOGLE_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": gemini_key,
+        "model": "gemini-pro",  # must be a Gemini model to match the Google API key above
+    },
+}
+
+# ************************************************
+# Create the ScriptCreatorGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = ScriptCreatorGraph(
+    prompt="List me all the news with their description.",
+    # also accepts a string with the already downloaded HTML code
+    source="https://perinim.github.io/projects",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/examples/local_models/Docker/script_generator_docker.py b/examples/local_models/Docker/script_generator_docker.py
@@ -0,0 +1,43 @@
+""" 
+Basic example of a scraping pipeline using ScriptCreatorGraph
+"""
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",
+        # "model_tokens": 2000, # set context length arbitrarily,
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+    }
+}
+
+# ************************************************
+# Create the ScriptCreatorGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = ScriptCreatorGraph(
+    prompt="List me all the news with their description.",
+    # also accepts a string with the already downloaded HTML code
+    source="https://perinim.github.io/projects",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/examples/local_models/Ollama/script_generator_ollama.py b/examples/local_models/Ollama/script_generator_ollama.py
@@ -0,0 +1,44 @@
+""" 
+Basic example of a scraping pipeline using ScriptCreatorGraph
+"""
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+graph_config = {
+    "llm": {
+        "model": "ollama/mistral",
+        "temperature": 0,
+        "format": "json",  # Ollama needs the format to be specified explicitly
+        # "model_tokens": 2000, # set context length arbitrarily,
+        "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    },
+    "embeddings": {
+        "model": "ollama/nomic-embed-text",
+        "temperature": 0,
+        "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
+    }
+}
+
+# ************************************************
+# Create the ScriptCreatorGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = ScriptCreatorGraph(
+    prompt="List me all the news with their description.",
+    # also accepts a string with the already downloaded HTML code
+    source="https://perinim.github.io/projects",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/examples/openai/script_generator_openai.py b/examples/openai/script_generator_openai.py
@@ -0,0 +1,44 @@
+""" 
+Basic example of a scraping pipeline using ScriptCreatorGraph
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import ScriptCreatorGraph
+from scrapegraphai.utils import prettify_exec_info
+
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+openai_key = os.getenv("OPENAI_APIKEY")
+
+graph_config = {
+    "llm": {
+        "api_key": openai_key,
+        "model": "gpt-3.5-turbo",
+    },
+}
+
+# ************************************************
+# Create the ScriptCreatorGraph instance and run it
+# ************************************************
+
+smart_scraper_graph = ScriptCreatorGraph(
+    prompt="List me all the news with their description.",
+    # also accepts a string with the already downloaded HTML code
+    source="https://perinim.github.io/projects",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py
@@ -6,6 +6,7 @@
 from dotenv import load_dotenv
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info
+
 load_dotenv()
 
 
diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
@@ -5,3 +5,4 @@
 from .smart_scraper_graph import SmartScraperGraph
 from .speech_graph import SpeechGraph
 from .search_graph import SearchGraph
+from .script_creator_graph import ScriptCreatorGraph
diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py
@@ -0,0 +1,77 @@
+""" 
+Module for creating the script creator graph
+"""
+from .base_graph import BaseGraph
+from ..nodes import (
+    FetchNode,
+    ParseNode,
+    RAGNode,
+    GenerateScraperNode
+)
+from .abstract_graph import AbstractGraph
+
+
+class ScriptCreatorGraph(AbstractGraph):
+    """
+    ScriptCreatorGraph chains fetch, parse, RAG and GenerateScraperNode steps to build
+    its answer to a natural-language prompt about a web page or a local source.
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict):
+        """
+        Initializes the ScriptCreatorGraph with a prompt, source, and configuration.
+        """
+        super().__init__(prompt, config, source)
+
+        self.input_key = "url" if source.startswith("http") else "local_dir"
+
+    def _create_graph(self):
+        """
+        Creates the graph of nodes representing the workflow for web scraping.
+        """
+        fetch_node = FetchNode(
+            input="url | local_dir",
+            output=["doc"],
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={"chunk_size": self.model_token}
+        )
+        rag_node = RAGNode(
+            input="user_prompt & (parsed_doc | doc)",
+            output=["relevant_chunks"],
+            node_config={
+                "llm": self.llm_model,
+                "embedder_model": self.embedder_model
+            }
+        )
+        generate_scraper_node = GenerateScraperNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={"llm": self.llm_model},
+        )
+
+        return BaseGraph(
+            nodes={
+                fetch_node,
+                parse_node,
+                rag_node,
+                generate_scraper_node,
+            },
+            edges={
+                (fetch_node, parse_node),
+                (parse_node, rag_node),
+                (rag_node, generate_scraper_node)
+            },
+            entry_point=fetch_node
+        )
+
+    def run(self) -> str:
+        """
+        Executes the web scraping process and returns the answer to the prompt.
+        """
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
@@ -11,3 +11,4 @@
 from .text_to_speech_node import TextToSpeechNode
 from .image_to_text_node import ImageToTextNode
 from .search_internet_node import SearchInternetNode
+from .generate_scraper_node import GenerateScraperNode
diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py