diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index f675162f..464080ff 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -20,4 +20,4 @@ jobs: pip install pylint pip install -r requirements.txt - name: Analysing the code with pylint - run: pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraphai/**/*.py scrapegraphai/*.py examples/**/*.py tests/**/*.py \ No newline at end of file + run: pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraphai/**/*.py scrapegraphai/*.py \ No newline at end of file diff --git a/README.md b/README.md index c86039ce..bd97c42f 100644 --- a/README.md +++ b/README.md @@ -43,14 +43,45 @@ Check out also the docusaurus [documentation](https://scrapegraph-doc.onrender.c You can use the `SmartScraper` class to extract information from a website using a prompt. The `SmartScraper` class is a direct graph implementation that uses the most common nodes present in a web scraping pipeline. For more information, please see the [documentation](https://scrapegraph-ai.readthedocs.io/en/latest/). -### Case 1: Extracting informations using a local LLM +### Case 1: Extracting information using Ollama +Remember to download the model on Ollama separately! 
+```python +from scrapegraphai.graphs import SmartScraperGraph + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + } +} + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +``` + +### Case 2: Extracting information using Docker Note: before using the local model remeber to create the docker container! ```text docker-compose up -d docker exec -it ollama ollama run stablelm-zephyr ``` -You can use which model you want instead of stablelm-zephyr +You can use any of the models available on Ollama, or your own model, instead of stablelm-zephyr ```python from scrapegraphai.graphs import SmartScraperGraph @@ -75,7 +106,7 @@ print(result) ``` -### Case 2: Extracting informations using Openai model +### Case 3: Extracting information using OpenAI model ```python from scrapegraphai.graphs import SmartScraperGraph OPENAI_API_KEY = "YOUR_API_KEY" @@ -98,7 +129,7 @@ result = smart_scraper_graph.run() print(result) ``` -### Case 3: Extracting informations using Gemini +### Case 4: Extracting information using Gemini ```python from scrapegraphai.graphs import SmartScraperGraph GOOGLE_APIKEY = "YOUR_API_KEY" diff --git a/examples/gemini/readme.md b/examples/gemini/readme.md new file mode 100644 index 00000000..7e06773d --- /dev/null +++ b/examples/gemini/readme.md @@ -0,0 +1 @@ +This folder contains an example of how to use ScrapeGraph-AI with Gemini, a large language model (LLM) from Google AI. 
The example shows how to extract information from a website using a natural language prompt. \ No newline at end of file diff --git a/examples/gemini/results/result.csv b/examples/gemini/results/result.csv deleted file mode 100644 index 97ef817e..00000000 --- a/examples/gemini/results/result.csv +++ /dev/null @@ -1,2 +0,0 @@ -0,1,2,3 -"{'title': 'Rotary Pendulum RL', 'description': 'Open Source project aimed at controlling a real life rotary pendulum using RL algorithms'}","{'title': 'DQN Implementation from scratch', 'description': 'Developed a Deep Q-Network algorithm to train a simple and double pendulum'}","{'title': 'Multi Agents HAED', 'description': 'University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.'}","{'title': 'Wireless ESC for Modular Drones', 'description': 'Modular drone architecture proposal and proof of concept. The project received maximum grade.'}" diff --git a/examples/gemini/results/result.json b/examples/gemini/results/result.json deleted file mode 100644 index 8a4e7057..00000000 --- a/examples/gemini/results/result.json +++ /dev/null @@ -1 +0,0 @@ -{"projects": [{"title": "Rotary Pendulum RL", "description": "Open Source project aimed at controlling a real life rotary pendulum using RL algorithms"}, {"title": "DQN Implementation from scratch", "description": "Developed a Deep Q-Network algorithm to train a simple and double pendulum"}, {"title": "Multi Agents HAED", "description": "University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings."}, {"title": "Wireless ESC for Modular Drones", "description": "Modular drone architecture proposal and proof of concept. 
The project received maximum grade."}]} \ No newline at end of file diff --git a/examples/local_models/inputs/books.xml b/examples/local_models/Docker/inputs/books.xml similarity index 100% rename from examples/local_models/inputs/books.xml rename to examples/local_models/Docker/inputs/books.xml diff --git a/examples/local_models/inputs/plain_html_example.txt b/examples/local_models/Docker/inputs/plain_html_example.txt similarity index 100% rename from examples/local_models/inputs/plain_html_example.txt rename to examples/local_models/Docker/inputs/plain_html_example.txt diff --git a/examples/local_models/Docker/readme.md b/examples/local_models/Docker/readme.md new file mode 100644 index 00000000..e69de29b diff --git a/examples/local_models/scrape_plain_text_local.py b/examples/local_models/Docker/scrape_plain_text_docker.py similarity index 100% rename from examples/local_models/scrape_plain_text_local.py rename to examples/local_models/Docker/scrape_plain_text_docker.py diff --git a/examples/local_models/scrape_xml_local.py b/examples/local_models/Docker/scrape_xml_docker.py similarity index 100% rename from examples/local_models/scrape_xml_local.py rename to examples/local_models/Docker/scrape_xml_docker.py diff --git a/examples/local_models/search_graph_local.py b/examples/local_models/Docker/search_graph_docker.py similarity index 100% rename from examples/local_models/search_graph_local.py rename to examples/local_models/Docker/search_graph_docker.py diff --git a/examples/local_models/smart_scraper_local.py b/examples/local_models/Docker/smart_scraper_docker.py similarity index 72% rename from examples/local_models/smart_scraper_local.py rename to examples/local_models/Docker/smart_scraper_docker.py index a1fbbee4..e2c53cdd 100644 --- a/examples/local_models/smart_scraper_local.py +++ b/examples/local_models/Docker/smart_scraper_docker.py @@ -6,14 +6,6 @@ # ************************************************ # Define the configuration for the graph # 
************************************************ -""" - Avaiable models: - - ollama/llama2 - - ollama/mistral - - ollama/codellama - - ollama/dolphin-mixtral - - ollama/mistral-openorca -""" graph_config = { "llm": { @@ -21,12 +13,7 @@ "temperature": 0, "format": "json", # Ollama needs the format to be specified explicitly # "model_tokens": 2000, # set context length arbitrarily, - # "base_url": "http://ollama:11434", # set ollama URL arbitrarily }, - "embeddings": { - "model": "ollama/nomic-embed-text", - "temperature": 0, - } } # ************************************************ diff --git a/examples/local_models/Ollama/inputs/books.xml b/examples/local_models/Ollama/inputs/books.xml new file mode 100644 index 00000000..e3d1fe87 --- /dev/null +++ b/examples/local_models/Ollama/inputs/books.xml @@ -0,0 +1,120 @@ + + + + Gambardella, Matthew + XML Developer's Guide + Computer + 44.95 + 2000-10-01 + An in-depth look at creating applications + with XML. + + + Ralls, Kim + Midnight Rain + Fantasy + 5.95 + 2000-12-16 + A former architect battles corporate zombies, + an evil sorceress, and her own childhood to become queen + of the world. + + + Corets, Eva + Maeve Ascendant + Fantasy + 5.95 + 2000-11-17 + After the collapse of a nanotechnology + society in England, the young survivors lay the + foundation for a new society. + + + Corets, Eva + Oberon's Legacy + Fantasy + 5.95 + 2001-03-10 + In post-apocalypse England, the mysterious + agent known only as Oberon helps to create a new life + for the inhabitants of London. Sequel to Maeve + Ascendant. + + + Corets, Eva + The Sundered Grail + Fantasy + 5.95 + 2001-09-10 + The two daughters of Maeve, half-sisters, + battle one another for control of England. Sequel to + Oberon's Legacy. + + + Randall, Cynthia + Lover Birds + Romance + 4.95 + 2000-09-02 + When Carla meets Paul at an ornithology + conference, tempers fly as feathers get ruffled. 
+ + + Thurman, Paula + Splish Splash + Romance + 4.95 + 2000-11-02 + A deep sea diver finds true love twenty + thousand leagues beneath the sea. + + + Knorr, Stefan + Creepy Crawlies + Horror + 4.95 + 2000-12-06 + An anthology of horror stories about roaches, + centipedes, scorpions and other insects. + + + Kress, Peter + Paradox Lost + Science Fiction + 6.95 + 2000-11-02 + After an inadvertant trip through a Heisenberg + Uncertainty Device, James Salway discovers the problems + of being quantum. + + + O'Brien, Tim + Microsoft .NET: The Programming Bible + Computer + 36.95 + 2000-12-09 + Microsoft's .NET initiative is explored in + detail in this deep programmer's reference. + + + O'Brien, Tim + MSXML3: A Comprehensive Guide + Computer + 36.95 + 2000-12-01 + The Microsoft MSXML3 parser is covered in + detail, with attention to XML DOM interfaces, XSLT processing, + SAX and more. + + + Galos, Mike + Visual Studio 7: A Comprehensive Guide + Computer + 49.95 + 2001-04-16 + Microsoft Visual Studio 7 is explored in depth, + looking at how Visual Basic, Visual C++, C#, and ASP+ are + integrated into a comprehensive development + environment. + + \ No newline at end of file diff --git a/examples/local_models/Ollama/inputs/plain_html_example.txt b/examples/local_models/Ollama/inputs/plain_html_example.txt new file mode 100644 index 00000000..78f814ae --- /dev/null +++ b/examples/local_models/Ollama/inputs/plain_html_example.txt @@ -0,0 +1,105 @@ + +
+ + +
+
+
+
+
+
+

Projects

+

+
+
+ +
+
+
+ +
+ \ No newline at end of file diff --git a/examples/local_models/Ollama/readme.md b/examples/local_models/Ollama/readme.md new file mode 100644 index 00000000..e69de29b diff --git a/examples/local_models/Ollama/scrape_plain_text_ollama.py b/examples/local_models/Ollama/scrape_plain_text_ollama.py new file mode 100644 index 00000000..a9351d70 --- /dev/null +++ b/examples/local_models/Ollama/scrape_plain_text_ollama.py @@ -0,0 +1,55 @@ +""" +Basic example of scraping pipeline using SmartScraper from text +""" + +import os +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json + +# ************************************************ +# Read the text file +# ************************************************ + +FILE_NAME = "inputs/plain_html_example.txt" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +# It could be also a http request using the request model +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily + "base_url": "http://localhost:11434", + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", + } +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + source=text, + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# Save to json or csv +convert_to_csv(result, "result") 
+convert_to_json(result, "result") diff --git a/examples/local_models/Ollama/scrape_xml_ollama.py b/examples/local_models/Ollama/scrape_xml_ollama.py new file mode 100644 index 00000000..9b3838f1 --- /dev/null +++ b/examples/local_models/Ollama/scrape_xml_ollama.py @@ -0,0 +1,54 @@ +""" +Basic example of scraping pipeline using SmartScraper from XML documents +""" +import os +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import convert_to_csv, convert_to_json + +# ************************************************ +# Read the XML file +# ************************************************ + +FILE_NAME = "inputs/books.xml" +curr_dir = os.path.dirname(os.path.realpath(__file__)) +file_path = os.path.join(curr_dir, FILE_NAME) + +with open(file_path, 'r', encoding="utf-8") as file: + text = file.read() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + } +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the authors, title and genres of the books", + source=text, # Pass the content of the file, not the file object + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# Save to json or csv +convert_to_csv(result, "result") +convert_to_json(result, "result") diff --git a/examples/local_models/Ollama/smart_scraper_ollama.py 
b/examples/local_models/Ollama/smart_scraper_ollama.py new file mode 100644 index 00000000..d710b986 --- /dev/null +++ b/examples/local_models/Ollama/smart_scraper_ollama.py @@ -0,0 +1,44 @@ +""" +Basic example of scraping pipeline using SmartScraper +""" +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ollama/mistral", + "temperature": 0, + "format": "json", # Ollama needs the format to be specified explicitly + # "model_tokens": 2000, # set context length arbitrarily, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + }, + "embeddings": { + "model": "ollama/nomic-embed-text", + "temperature": 0, + "base_url": "http://localhost:11434", # set ollama URL arbitrarily + } +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/local_models/results/result.csv b/examples/local_models/results/result.csv deleted file mode 100644 index 97ef817e..00000000 --- a/examples/local_models/results/result.csv +++ /dev/null @@ -1,2 +0,0 @@ -0,1,2,3 -"{'title': 'Rotary Pendulum RL', 'description': 'Open Source project aimed at controlling a real life rotary pendulum using RL 
algorithms'}","{'title': 'DQN Implementation from scratch', 'description': 'Developed a Deep Q-Network algorithm to train a simple and double pendulum'}","{'title': 'Multi Agents HAED', 'description': 'University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.'}","{'title': 'Wireless ESC for Modular Drones', 'description': 'Modular drone architecture proposal and proof of concept. The project received maximum grade.'}" diff --git a/examples/local_models/results/result.json b/examples/local_models/results/result.json deleted file mode 100644 index 8a4e7057..00000000 --- a/examples/local_models/results/result.json +++ /dev/null @@ -1 +0,0 @@ -{"projects": [{"title": "Rotary Pendulum RL", "description": "Open Source project aimed at controlling a real life rotary pendulum using RL algorithms"}, {"title": "DQN Implementation from scratch", "description": "Developed a Deep Q-Network algorithm to train a simple and double pendulum"}, {"title": "Multi Agents HAED", "description": "University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings."}, {"title": "Wireless ESC for Modular Drones", "description": "Modular drone architecture proposal and proof of concept. The project received maximum grade."}]} \ No newline at end of file diff --git a/examples/mixed_models/readme.md b/examples/mixed_models/readme.md new file mode 100644 index 00000000..9e739212 --- /dev/null +++ b/examples/mixed_models/readme.md @@ -0,0 +1 @@ +This folder contains an example of how to use ScrapeGraph-AI with mixed models. The example shows how to extract information from a website using a natural language prompt and a machine learning model. 
\ No newline at end of file diff --git a/examples/mixed_models/results/result.csv b/examples/mixed_models/results/result.csv deleted file mode 100644 index 97ef817e..00000000 --- a/examples/mixed_models/results/result.csv +++ /dev/null @@ -1,2 +0,0 @@ -0,1,2,3 -"{'title': 'Rotary Pendulum RL', 'description': 'Open Source project aimed at controlling a real life rotary pendulum using RL algorithms'}","{'title': 'DQN Implementation from scratch', 'description': 'Developed a Deep Q-Network algorithm to train a simple and double pendulum'}","{'title': 'Multi Agents HAED', 'description': 'University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.'}","{'title': 'Wireless ESC for Modular Drones', 'description': 'Modular drone architecture proposal and proof of concept. The project received maximum grade.'}" diff --git a/examples/mixed_models/results/result.json b/examples/mixed_models/results/result.json deleted file mode 100644 index 8a4e7057..00000000 --- a/examples/mixed_models/results/result.json +++ /dev/null @@ -1 +0,0 @@ -{"projects": [{"title": "Rotary Pendulum RL", "description": "Open Source project aimed at controlling a real life rotary pendulum using RL algorithms"}, {"title": "DQN Implementation from scratch", "description": "Developed a Deep Q-Network algorithm to train a simple and double pendulum"}, {"title": "Multi Agents HAED", "description": "University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings."}, {"title": "Wireless ESC for Modular Drones", "description": "Modular drone architecture proposal and proof of concept. 
The project received maximum grade."}]} \ No newline at end of file diff --git a/examples/openai/readme.md b/examples/openai/readme.md new file mode 100644 index 00000000..9a517ac6 --- /dev/null +++ b/examples/openai/readme.md @@ -0,0 +1 @@ +This folder contains an example of how to use ScrapeGraph-AI with OpenAI, an artificial intelligence platform. The examples show how to extract information from a website using a natural language prompt. \ No newline at end of file diff --git a/examples/openai/results/result.csv b/examples/openai/results/result.csv deleted file mode 100644 index 97ef817e..00000000 --- a/examples/openai/results/result.csv +++ /dev/null @@ -1,2 +0,0 @@ -0,1,2,3 -"{'title': 'Rotary Pendulum RL', 'description': 'Open Source project aimed at controlling a real life rotary pendulum using RL algorithms'}","{'title': 'DQN Implementation from scratch', 'description': 'Developed a Deep Q-Network algorithm to train a simple and double pendulum'}","{'title': 'Multi Agents HAED', 'description': 'University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings.'}","{'title': 'Wireless ESC for Modular Drones', 'description': 'Modular drone architecture proposal and proof of concept. 
The project received maximum grade.'}" diff --git a/examples/openai/results/result.json b/examples/openai/results/result.json deleted file mode 100644 index 8a4e7057..00000000 --- a/examples/openai/results/result.json +++ /dev/null @@ -1 +0,0 @@ -{"projects": [{"title": "Rotary Pendulum RL", "description": "Open Source project aimed at controlling a real life rotary pendulum using RL algorithms"}, {"title": "DQN Implementation from scratch", "description": "Developed a Deep Q-Network algorithm to train a simple and double pendulum"}, {"title": "Multi Agents HAED", "description": "University project which focuses on simulating a multi-agent system to perform environment mapping. Agents, equipped with sensors, explore and record their surroundings, considering uncertainties in their readings."}, {"title": "Wireless ESC for Modular Drones", "description": "Modular drone architecture proposal and proof of concept. The project received maximum grade."}]} \ No newline at end of file diff --git a/manual deployement/commit_and_push.sh b/manual deployement/commit_and_push.sh index 4a0afbee..cb51c968 100755 --- a/manual deployement/commit_and_push.sh +++ b/manual deployement/commit_and_push.sh @@ -21,7 +21,7 @@ cd .. commit_message="$1" # Run Pylint on the specified Python files -pylint scrapegraphai/**/*.py scrapegraphai/*.py examples/**/*.py tests/**/*.py +pylint scrapegraphai/**/*.py scrapegraphai/*.py #Make the pull git pull diff --git a/scrapegraphai/utils/prettify_exec_info.py b/scrapegraphai/utils/prettify_exec_info.py index b34a7f8a..7023d6df 100644 --- a/scrapegraphai/utils/prettify_exec_info.py +++ b/scrapegraphai/utils/prettify_exec_info.py @@ -4,6 +4,7 @@ import pandas as pd + def prettify_exec_info(complete_result: dict) -> pd.DataFrame: """ Transform the execution information of the graph into a DataFrame for better visualization. 
@@ -14,32 +15,34 @@ def prettify_exec_info(complete_result: dict) -> pd.DataFrame: Returns: - pd.DataFrame: The execution information of the graph in a DataFrame. """ - + nodes_info = complete_result['nodes_info'] total_info = { 'total_exec_time': complete_result['total_exec_time'], 'total_model_info': complete_result['total_model_info'] } - + # Convert node-specific information to DataFrame flat_data = [] for node_name, node_info in nodes_info.items(): flat_data.append({ 'Node': node_name, 'Execution Time': node_info['exec_time'], - **node_info['model_info'] # Unpack the model_info dict into the row + # Unpack the model_info dict into the row + **node_info['model_info'] }) - + df_nodes = pd.DataFrame(flat_data) - + # Add a row for the total execution time and total model info total_row = { 'Node': 'Total', 'Execution Time': total_info['total_exec_time'], - **total_info['total_model_info'] # Unpack the total_model_info dict into the row + # Unpack the total_model_info dict into the row + **total_info['total_model_info'] } df_total = pd.DataFrame([total_row]) - + # Combine the nodes DataFrame with the total info DataFrame df_combined_with_total = pd.concat([df_nodes, df_total], ignore_index=True) - return df_combined_with_total \ No newline at end of file + return df_combined_with_total