diff --git a/examples/ScrapeGraphAI_generated_graph b/examples/ScrapeGraphAI_generated_graph deleted file mode 100644 index acc3232c..00000000 --- a/examples/ScrapeGraphAI_generated_graph +++ /dev/null @@ -1,19 +0,0 @@ -// ScrapeGraphAI Generated Graph -digraph { - node [color=lightblue2 style=filled] - FetchHTMLNode [shape=doublecircle] - GetProbableTagsNode - ParseNode - RAGNode - GenerateAnswerNode - ConditionalNode - ImageToTextNode - TextToSpeechNode - FetchHTMLNode -> GetProbableTagsNode - GetProbableTagsNode -> ParseNode - ParseNode -> RAGNode - RAGNode -> GenerateAnswerNode - RAGNode -> ConditionalNode - ConditionalNode -> ImageToTextNode - ConditionalNode -> TextToSpeechNode -} diff --git a/examples/graph_builder_example.py b/examples/graph_builder_example.py deleted file mode 100644 index 53956e09..00000000 --- a/examples/graph_builder_example.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Example of graph builder -""" -import os -from dotenv import load_dotenv -from scrapegraphai.builders import GraphBuilder - -load_dotenv() -openai_key = os.getenv("OPENAI_APIKEY") - -# Define the configuration for the graph -graph_config = { - "llm": { - "api_key": openai_key, - "model": "gpt-3.5-turbo", - }, -} - -# Example usage of GraphBuilder -graph_builder = GraphBuilder( - user_prompt="Extract the news and generate a text summary with a voiceover.", - config=graph_config -) - -graph_json = graph_builder.build_graph() - -# Convert the resulting JSON to Graphviz format -graphviz_graph = graph_builder.convert_json_to_graphviz(graph_json) - -# Save the graph to a file and open it in the default viewer -graphviz_graph.render('ScrapeGraphAI_generated_graph', view=True) diff --git a/examples/plain_html_example.txt b/examples/inputs/plain_html_example.txt similarity index 100% rename from examples/plain_html_example.txt rename to examples/inputs/plain_html_example.txt diff --git a/examples/result.csv b/examples/results/result.csv similarity index 100% rename from examples/result.csv rename to examples/results/result.csv diff --git a/examples/result.json b/examples/results/result.json similarity index 100% rename from examples/result.json rename to examples/results/result.json diff --git a/examples/scrape_plain_text.py b/examples/scrape_plain_text.py index fb2c0f84..81dee0f9 100644 --- a/examples/scrape_plain_text.py +++ b/examples/scrape_plain_text.py @@ -5,6 +5,7 @@ import os from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json load_dotenv() openai_key = os.getenv("OPENAI_APIKEY") @@ -19,7 +20,7 @@ # It could be also a http request using the request model -text = open('plain_html_example.txt', 'r', encoding="utf-8") +text = open('inputs/plain_html_example.txt', 'r', encoding="utf-8") # Create the SmartScraperGraph instance smart_scraper_graph = SmartScraperGraph( @@ -32,6 +33,5 @@ print(result) # Save to json or csv -onvert_to_csv(result, "result") +convert_to_csv(result, "result") convert_to_json(result, "result") - diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index 0bfda6fc..16e42b81 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -71,8 +71,7 @@ def _create_llm(self, llm_config: dict): return OpenAI(llm_params) elif "gemini" in llm_params["model"]: return Gemini(llm_params) - else: - raise ValueError("Model not supported") + raise ValueError("Model not supported") def _create_graph(self): """ diff --git a/scrapegraphai/models/gemini.py b/scrapegraphai/models/gemini.py index 95ee2d57..e35fd684 100644 --- a/scrapegraphai/models/gemini.py +++ b/scrapegraphai/models/gemini.py @@ -1,3 +1,6 @@ +""" +Gemini module configuration +""" from langchain_google_genai import ChatGoogleGenerativeAI diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py index e9766588..6a85f2d3 100644 --- a/scrapegraphai/nodes/base_node.py +++ b/scrapegraphai/nodes/base_node.py @@ -40,7 +40,8 @@ class BaseNode(ABC): raised to indicate the incorrect usage. """ - def __init__(self, node_name: str, node_type: str, input: str, output: List[str], min_input_len: int = 1, model_config: Optional[dict] = None): + def __init__(self, node_name: str, node_type: str, input: str, output: List[str], + min_input_len: int = 1, model_config: Optional[dict] = None): """ Initialize the node with a unique identifier and a specified node type. @@ -73,7 +74,9 @@ def execute(self, state: dict) -> dict: pass def get_input_keys(self, state: dict) -> List[str]: - # Use the _parse_input_keys method to identify which state keys are needed based on the input attribute + """Use the _parse_input_keys method to identify which state keys are + needed based on the input attribute + """ try: input_keys = self._parse_input_keys(state, self.input) self._validate_input_keys(input_keys) diff --git a/scrapegraphai/nodes/image_to_text_node.py b/scrapegraphai/nodes/image_to_text_node.py index 703355d5..5d0a4949 100644 --- a/scrapegraphai/nodes/image_to_text_node.py +++ b/scrapegraphai/nodes/image_to_text_node.py @@ -1,7 +1,7 @@ -""" +""" Module for the ImageToTextNode class. """ - +from typing import List from .base_node import BaseNode @@ -10,34 +10,43 @@ class ImageToTextNode(BaseNode): A class representing a node that processes an image and returns the text description. Attributes: - llm (OpenAIImageToText): An instance of the OpenAIImageToText class. + llm_model (OpenAIImageToText): An instance of the OpenAIImageToText class. Methods: execute(state, url): Execute the node's logic and return the updated state. """ - def __init__(self, llm, node_name: str): + def __init__(self, input: str, output: List[str], model_config: dict, + node_name: str = "ImageToText"): """ Initializes an instance of the ImageToTextNode class. Args: - llm (OpenAIImageToText): An instance of the OpenAIImageToText class. - node_name (str): name of the node + input (str): The input for the node. + output (List[str]): The output of the node. + model_config (dict): Configuration for the model. + node_name (str): Name of the node. """ - super().__init__(node_name, "node") - self.llm = llm + super().__init__(node_name, "node", input, output, 1, model_config) + self.llm_model = model_config["llm_model"] - def execute(self, state: dict, url: str) -> dict: + def execute(self, state: dict) -> dict: """ Execute the node's logic and return the updated state. + Args: state (dict): The current state of the graph. - url (str): url of the image where to - :return: The updated state after executing this node. - """ + Returns: + dict: The updated state after executing this node. + """ print("---GENERATING TEXT FROM IMAGE---") - text_answer = self.llm.run(url) + input_keys = self.get_input_keys(state) + + input_data = [state[key] for key in input_keys] + url = input_data[0] + + text_answer = self.llm_model.run(url) state.update({"image_text": text_answer}) return state diff --git a/scrapegraphai/utils/parse_state_keys.py b/scrapegraphai/utils/parse_state_keys.py index c5da7e8a..5c99a60f 100644 --- a/scrapegraphai/utils/parse_state_keys.py +++ b/scrapegraphai/utils/parse_state_keys.py @@ -7,6 +7,8 @@ def parse_expression(expression, state: dict): """ Function for parsing the expressions + Args: + state (dict): state to elaborate """ # Check for empty expression if not expression: @@ -69,14 +71,14 @@ def evaluate_expression(expression): '|'.join(sub_result) + expression[end+1:] return evaluate_simple_expression(expression) - result = evaluate_expression(expression) + temp_result = evaluate_expression(expression) - if not result: + if not temp_result: raise ValueError("No state keys matched the expression.") # Remove redundant state keys from the result, without changing their order final_result = [] - for key in result: + for key in temp_result: if key not in final_result: final_result.append(key) diff --git a/scrapegraphai/utils/remover.py b/scrapegraphai/utils/remover.py index 9f765473..1cde0c0f 100644 --- a/scrapegraphai/utils/remover.py +++ b/scrapegraphai/utils/remover.py @@ -18,14 +18,11 @@ def remover(html_content: str) -> str: soup = BeautifulSoup(html_content, 'html.parser') - # Estrai il titolo title_tag = soup.find('title') title = title_tag.get_text() if title_tag else "" - # Rimuovi i tag <script> in tutto il documento [script.extract() for script in soup.find_all('script')] - # Estrai il corpo del documento body_content = soup.find('body') body = str(body_content) if body_content else "" diff --git a/scrapegraphai/utils/save_audio_from_bytes.py b/scrapegraphai/utils/save_audio_from_bytes.py index f250edbf..41c53d7b 100644 --- a/scrapegraphai/utils/save_audio_from_bytes.py +++ b/scrapegraphai/utils/save_audio_from_bytes.py @@ -1,11 +1,11 @@ """ This utility function saves the byte response as an audio file. """ - from pathlib import Path +from typing import Union -def save_audio_from_bytes(byte_response, output_path): +def save_audio_from_bytes(byte_response: bytes, output_path: Union[str, Path]) -> None: """ Saves the byte response as an audio file.