Skip to content

Commit edc439f

Browse files
authored
Merge pull request #64 from VinciGit00/generateScraperbranch
add generateScraperBranch
2 parents bdac2df + b2d170c commit edc439f

9 files changed

+414
-0
lines changed
+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
"""
Basic example of a scraping pipeline using ScriptCreatorGraph with a Gemini model.

The API key is read from the GOOGLE_APIKEY environment variable, which may be
supplied through a local .env file (loaded below with python-dotenv).
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

gemini_key = os.getenv("GOOGLE_APIKEY")

graph_config = {
    "llm": {
        "api_key": gemini_key,
        # The key above is a Google key, so the model must be a Gemini model;
        # the previous value ("gpt-3.5-turbo") is an OpenAI model name.
        "model": "gemini-pro",
    },
}

# ************************************************
# Create the ScriptCreatorGraph instance and run it
# ************************************************

script_creator_graph = ScriptCreatorGraph(
    prompt="List me all the news with their description.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects",
    config=graph_config,
)

result = script_creator_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = script_creator_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
"""
Minimal ScriptCreatorGraph example driven by local Ollama models.

Uses "ollama/mistral" as the LLM and "ollama/nomic-embed-text" for embeddings.
"""
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

# --- Graph configuration -----------------------------------------------------

llm_settings = {
    "model": "ollama/mistral",
    "temperature": 0,
    "format": "json",
    # Optional: add a "model_tokens" entry (e.g. 2000) to set the context
    # length arbitrarily.
}

embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
}

# --- Build the graph and run it ----------------------------------------------

script_creator_graph = ScriptCreatorGraph(
    prompt="List me all the news with their description.",
    # `source` may also be a string holding already downloaded HTML
    source="https://perinim.github.io/projects",
    config=graph_config,
)

output = script_creator_graph.run()
print(output)

# --- Execution info ----------------------------------------------------------

exec_info = script_creator_graph.get_execution_info()
print(prettify_exec_info(exec_info))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
"""
Minimal ScriptCreatorGraph example driven by Ollama models reached through an
explicit base URL ("http://localhost:11434").
"""
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

# --- Graph configuration -----------------------------------------------------

llm_settings = {
    "model": "ollama/mistral",
    "temperature": 0,
    # Ollama needs the format to be specified explicitly
    "format": "json",
    # Optional: add a "model_tokens" entry (e.g. 2000) to set the context
    # length arbitrarily.
    # Ollama URL, set arbitrarily
    "base_url": "http://localhost:11434",
}

embedding_settings = {
    "model": "ollama/nomic-embed-text",
    "temperature": 0,
    # Ollama URL, set arbitrarily
    "base_url": "http://localhost:11434",
}

graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
}

# --- Build the graph and run it ----------------------------------------------

script_creator_graph = ScriptCreatorGraph(
    prompt="List me all the news with their description.",
    # `source` may also be a string holding already downloaded HTML
    source="https://perinim.github.io/projects",
    config=graph_config,
)

output = script_creator_graph.run()
print(output)

# --- Execution info ----------------------------------------------------------

exec_info = script_creator_graph.get_execution_info()
print(prettify_exec_info(exec_info))
+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
"""
Minimal ScriptCreatorGraph example using an OpenAI model.

The API key is read from the OPENAI_APIKEY environment variable, which may be
supplied through a local .env file (loaded below with python-dotenv).
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# --- Graph configuration -----------------------------------------------------

openai_key = os.getenv("OPENAI_APIKEY")

llm_settings = {
    "api_key": openai_key,
    "model": "gpt-3.5-turbo",
}

graph_config = {"llm": llm_settings}

# --- Build the graph and run it ----------------------------------------------

script_creator_graph = ScriptCreatorGraph(
    prompt="List me all the news with their description.",
    # `source` may also be a string holding already downloaded HTML
    source="https://perinim.github.io/projects",
    config=graph_config,
)

output = script_creator_graph.run()
print(output)

# --- Execution info ----------------------------------------------------------

exec_info = script_creator_graph.get_execution_info()
print(prettify_exec_info(exec_info))

examples/openai/smart_scraper_openai.py

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperGraph
88
from scrapegraphai.utils import prettify_exec_info
9+
910
load_dotenv()
1011

1112

scrapegraphai/graphs/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
from .smart_scraper_graph import SmartScraperGraph
66
from .speech_graph import SpeechGraph
77
from .search_graph import SearchGraph
8+
from .script_creator_graph import ScriptCreatorGraph
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Module for creating the script creator graph
3+
"""
4+
from .base_graph import BaseGraph
5+
from ..nodes import (
6+
FetchNode,
7+
ParseNode,
8+
RAGNode,
9+
GenerateScraperNode
10+
)
11+
from .abstract_graph import AbstractGraph
12+
13+
14+
class ScriptCreatorGraph(AbstractGraph):
    """
    Graph that chains FetchNode -> ParseNode -> RAGNode -> GenerateScraperNode.

    The source is fetched and parsed, the chunks relevant to the user prompt are
    selected, and the final GenerateScraperNode writes its result under the
    "answer" key of the graph state, which `run` returns.
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Set up the graph from a prompt, a source and a configuration.

        Args:
            prompt: The user prompt passed to the graph as "user_prompt".
            source: Where the content comes from. A value starting with "http"
                is fed to the graph as "url"; anything else as "local_dir".
            config: Graph configuration, forwarded to AbstractGraph.
        """
        super().__init__(prompt, config, source)

        # Decide under which state key the source is handed to the fetch node.
        if source.startswith("http"):
            self.input_key = "url"
        else:
            self.input_key = "local_dir"

    def _create_graph(self):
        """
        Build the four-node workflow and return it as a BaseGraph.

        NOTE(review): `self.model_token`, `self.llm_model` and
        `self.embedder_model` are not set in this class; presumably they are
        provided by AbstractGraph - confirm.
        """
        fetcher = FetchNode(
            input="url | local_dir",
            output=["doc"],
        )

        parser = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={"chunk_size": self.model_token},
        )

        rag_settings = {
            "llm": self.llm_model,
            "embedder_model": self.embedder_model,
        }
        retriever = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config=rag_settings,
        )

        script_generator = GenerateScraperNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
            output=["answer"],
            node_config={"llm": self.llm_model},
        )

        # Linear pipeline: fetch -> parse -> rag -> generate.
        pipeline_nodes = {fetcher, parser, retriever, script_generator}
        pipeline_edges = {
            (fetcher, parser),
            (parser, retriever),
            (retriever, script_generator),
        }

        return BaseGraph(
            nodes=pipeline_nodes,
            edges=pipeline_edges,
            entry_point=fetcher,
        )

    def run(self) -> str:
        """
        Execute the graph and return the value stored under "answer".

        The final state and the execution info returned by the graph are kept
        on the instance as `final_state` and `execution_info`.

        Returns:
            The "answer" entry of the final state, or "No answer found." when
            that key is missing.
        """
        initial_state = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(initial_state)

        return self.final_state.get("answer", "No answer found.")

scrapegraphai/nodes/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@
1111
from .text_to_speech_node import TextToSpeechNode
1212
from .image_to_text_node import ImageToTextNode
1313
from .search_internet_node import SearchInternetNode
14+
from .generate_scraper_node import GenerateScraperNode

0 commit comments

Comments
 (0)