add csv scraper #124

Merged: 4 commits, May 1, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -28,7 +28,8 @@ venv/
*.sqlite
*.google-cookie
examples/graph_examples/ScrapeGraphAI_generated_graph
examples/**/*.csv
examples/**/result.csv
examples/**/result.json
main.py
poetry.lock

60 changes: 60 additions & 0 deletions examples/gemini/csv_scraper_gemini.py
@@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
"base_url": "http://localhost:11434",
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434",
}
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
prompt="List me all the last names",
source=str(text), # Pass the content of the file, not the file object
config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
7 changes: 7 additions & 0 deletions examples/gemini/inputs/username.csv
@@ -0,0 +1,7 @@
Username; Identifier;First name;Last name
booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson
jenkins46;9346;Mary;Jenkins
smith79;5079;Jamie;Smith

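Note that the bundled username.csv samples are semicolon-delimited, while pd.read_csv defaults to a comma separator, so the examples above load the header and each row into a single column; the stringified frame still carries all of the text to the model, but a variant that parses the columns properly could look like the following sketch (same relative path as the example; the sep and skipinitialspace arguments are assumptions, not part of the PR):

import pandas as pd

# Hypothetical variant of the read step in the examples: split the
# semicolon-delimited sample into real columns and strip the stray
# space after "Username;".
df = pd.read_csv("inputs/username.csv", sep=";", skipinitialspace=True)
text = df.to_string(index=False)  # plain-text content handed to CSVScraperGraph
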
1 change: 1 addition & 0 deletions examples/gemini/scrape_xml_gemini.py
@@ -6,6 +6,7 @@
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
54 changes: 54 additions & 0 deletions examples/local_models/Docker/csv_scraper_docker.py
@@ -0,0 +1,54 @@
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
}
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
prompt="List me all the last names",
source=str(text), # Pass the content of the file, not the file object
config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
7 changes: 7 additions & 0 deletions examples/local_models/Docker/inputs/username.csv
@@ -0,0 +1,7 @@
Username; Identifier;First name;Last name
booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson
jenkins46;9346;Mary;Jenkins
smith79;5079;Jamie;Smith

56 changes: 56 additions & 0 deletions examples/local_models/Ollama/csv_scraper_ollama.py
@@ -0,0 +1,56 @@
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
"llm": {
"model": "ollama/mistral",
"temperature": 0,
"format": "json", # Ollama needs the format to be specified explicitly
# "model_tokens": 2000, # set context length arbitrarily
"base_url": "http://localhost:11434",
},
"embeddings": {
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434",
}
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
prompt="List me all the last names",
source=str(text), # Pass the content of the file, not the file object
config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
7 changes: 7 additions & 0 deletions examples/local_models/Ollama/inputs/username.csv
@@ -0,0 +1,7 @@
Username; Identifier;First name;Last name
booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson
jenkins46;9346;Mary;Jenkins
smith79;5079;Jamie;Smith

53 changes: 53 additions & 0 deletions examples/openai/csv_scraper_openai.py
@@ -0,0 +1,53 @@
"""
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

load_dotenv()
# ************************************************
# Read the csv file
# ************************************************

text = pd.read_csv("inputs/username.csv")

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
"llm": {
"api_key": openai_key,
"model": "gpt-3.5-turbo",
},
}

# ************************************************
# Create the CSVScraperGraph instance and run it
# ************************************************

csv_scraper_graph = CSVScraperGraph(
prompt="List me all the last names",
source=str(text), # Pass the content of the file, not the file object
config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
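
The OpenAI example expects the key in an OPENAI_APIKEY environment variable, loaded from a .env file via load_dotenv(); if you prefer to skip dotenv, a minimal sketch reading it straight from the environment is shown below (the variable name is taken from the example, everything else unchanged):

import os

# Hypothetical alternative to load_dotenv(): read the key directly from the shell environment.
openai_key = os.environ["OPENAI_APIKEY"]  # raises KeyError if the variable is not set

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
}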
7 changes: 7 additions & 0 deletions examples/openai/inputs/username.csv
@@ -0,0 +1,7 @@
Username; Identifier;First name;Last name
booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson
jenkins46;9346;Mary;Jenkins
smith79;5079;Jamie;Smith

1 change: 1 addition & 0 deletions examples/openai/scrape_plain_text_openai.py
@@ -6,6 +6,7 @@
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
1 change: 1 addition & 0 deletions scrapegraphai/graphs/__init__.py
@@ -8,3 +8,4 @@
from .script_creator_graph import ScriptCreatorGraph
from .xml_scraper_graph import XMLScraperGraph
from .json_scraper_graph import JSONScraperGraph
from .csv_scraper_graph import CSVScraperGraph
88 changes: 88 additions & 0 deletions scrapegraphai/graphs/csv_scraper_graph.py
@@ -0,0 +1,88 @@
"""
Module for creating the CSV scraper
"""
from .base_graph import BaseGraph
from ..nodes import (
FetchNode,
ParseNode,
RAGNode,
GenerateAnswerCSVNode
)
from .abstract_graph import AbstractGraph


class CSVScraperGraph(AbstractGraph):
"""
CSVScraperGraph is a scraping pipeline that automates the process of extracting
information from CSV documents, using a natural language model to interpret and answer prompts.
"""

def __init__(self, prompt: str, source: str, config: dict):
"""
Initializes the CSVScraperGraph with a prompt, source, and configuration.
"""
super().__init__(prompt, config, source)

self.input_key = "csv" if source.endswith("csv") else "csv_dir"

def _create_graph(self):
"""
Creates the graph of nodes representing the workflow for web scraping.
"""
fetch_node = FetchNode(
input="csv_dir",
output=["doc"],
node_config={
"headless": self.headless,
"verbose": self.verbose
}
)
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
node_config={
"chunk_size": self.model_token,
"verbose": self.verbose
}
)
rag_node = RAGNode(
input="user_prompt & (parsed_doc | doc)",
output=["relevant_chunks"],
node_config={
"llm": self.llm_model,
"embedder_model": self.embedder_model,
"verbose": self.verbose
}
)
generate_answer_node = GenerateAnswerCSVNode(
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
output=["answer"],
node_config={
"llm": self.llm_model,
"verbose": self.verbose
}
)

return BaseGraph(
nodes=[
fetch_node,
parse_node,
rag_node,
generate_answer_node,
],
edges=[
(fetch_node, parse_node),
(parse_node, rag_node),
(rag_node, generate_answer_node)
],
entry_point=fetch_node
)

def run(self) -> str:
"""
Executes the scraping process over the CSV input and returns the answer to the prompt.
"""
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
self.final_state, self.execution_info = self.graph.execute(inputs)

return self.final_state.get("answer", "No answer found.")
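
For reference, a minimal end-to-end sketch of the new graph, mirroring the Ollama example above (model names, base_url, and the relative path are taken from the bundled examples; the sep argument is an assumption for the semicolon-delimited sample):

import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph

text = pd.read_csv("inputs/username.csv", sep=";")

csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    source=str(text),  # stringified content, so input_key falls back to "csv_dir"
    config={
        "llm": {
            "model": "ollama/mistral",
            "temperature": 0,
            "format": "json",
            "base_url": "http://localhost:11434",
        },
        "embeddings": {
            "model": "ollama/nomic-embed-text",
            "base_url": "http://localhost:11434",
        },
    },
)

# run() builds {"user_prompt": ..., "csv_dir": str(text)} and walks
# FetchNode -> ParseNode -> RAGNode -> GenerateAnswerCSVNode.
print(csv_scraper_graph.run())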
1 change: 1 addition & 0 deletions scrapegraphai/nodes/__init__.py
Expand Up @@ -14,3 +14,4 @@
from .generate_scraper_node import GenerateScraperNode
from .search_link_node import SearchLinkNode
from .robots_node import RobotsNode
from .generate_answer_csv_node import GenerateAnswerCSVNode