Commit 9723e6c

document ai init

0 parents  commit 9723e6c

File tree: 8 files changed, +194 -0 lines changed

.env

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
# Postgres database address for cocoindex
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
GOOGLE_CLOUD_PROJECT_ID=your-project-id
GOOGLE_CLOUD_PROCESSOR_ID=your-processor-id
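If you don't already have a Postgres instance matching this URL, one possible way to bring one up locally is with Docker and a pgvector-enabled image. This is only a sketch (the image choice and pgvector assumption are not part of this commit; the install guide linked in the README is the authoritative reference):

```bash
# Start a local Postgres with the pgvector extension, matching the .env URL above.
docker run -d --name cocoindex-postgres \
  -e POSTGRES_USER=cocoindex \
  -e POSTGRES_PASSWORD=cocoindex \
  -e POSTGRES_DB=cocoindex \
  -p 5432:5432 \
  pgvector/pgvector:pg16
```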

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
pdf_embedding.egg-info/

README.md

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
# 🥥 CocoIndex ETL with Document AI

[CocoIndex](https://cocoindex.io) is an ETL framework for transforming data for AI, with real-time incremental processing that keeps your index up to date with low latency as the source updates. Like LEGO, it supports custom logic and makes it easy to plug in the modules that best suit your project.

In this example, we walk you through how to build an embedding index from local PDF files, using [Google Document AI](https://cloud.google.com/document-ai?hl=en) as the parser.
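At its core, the flow defined in `main.py` (shown in full later in this commit) parses each PDF to markdown with Document AI, splits the markdown into chunks, embeds each chunk with a SentenceTransformer model, and exports the result to Postgres. A condensed excerpt of that flow, with comments added for orientation (see `main.py` for the full, runnable definition):

```python
with data_scope["documents"].row() as doc:
    # Parse the PDF to markdown via the Google Document AI custom function.
    doc["markdown"] = doc["content"].transform(ToMarkdown())
    # Split the markdown into overlapping chunks.
    doc["chunks"] = doc["markdown"].transform(
        cocoindex.functions.SplitRecursively(),
        language="markdown", chunk_size=2000, chunk_overlap=500)
    with doc["chunks"].row() as chunk:
        # Embed each chunk and collect it for export to Postgres.
        chunk["embedding"] = chunk["text"].call(text_to_embedding)
        doc_embeddings.collect(id=cocoindex.GeneratedField.UUID,
                               filename=doc["filename"], location=chunk["location"],
                               text=chunk["text"], embedding=chunk["embedding"])
```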
🥥 🌴 We are constantly improving - more blogs and examples coming soon. Stay tuned 👀 and **drop a star at [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex)** for the latest updates!
[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)

![Untitled design (9)](https://github.com/user-attachments/assets/5d9d49b9-6aa4-45f1-97cf-c9d16c02f0f4)

## Prerequisite
- [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
- Create a [Google Document AI](https://cloud.google.com/document-ai?hl=en) processor in your Google Cloud project, fill in `GOOGLE_CLOUD_PROJECT_ID` and `GOOGLE_CLOUD_PROCESSOR_ID` in `.env`, and make sure your Google Cloud credentials are available to the client library (see the sketch after this list).
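One possible way to get the Google Cloud side ready, assuming you authenticate with gcloud application default credentials (pointing `GOOGLE_APPLICATION_CREDENTIALS` at a service account key file also works); the IDs below are placeholders for your own values:

```bash
# Authenticate locally so the Document AI client library can find credentials.
gcloud auth application-default login

# Then fill in .env with your own values:
#   GOOGLE_CLOUD_PROJECT_ID=your-project-id
#   GOOGLE_CLOUD_PROCESSOR_ID=your-processor-id
#   GOOGLE_CLOUD_LOCATION=us   # optional; main.py defaults to "us"
```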
## Run

Install dependencies:

```bash
pip install -e .
```

Setup:

```bash
python main.py cocoindex setup
```

Update index:

```bash
python main.py cocoindex update
```

Run:

```bash
python main.py
```

## CocoInsight
CocoInsight is in Early Access now (free) 😊 You found us! Here is a quick 3-minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).

Run CocoInsight to understand your RAG data pipeline:

```bash
python main.py cocoindex server -c https://cocoindex.io
```

Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).

main.py

Lines changed: 126 additions & 0 deletions

@@ -0,0 +1,126 @@
import os

from dotenv import load_dotenv
from google.cloud import documentai_v1 as documentai
from google.api_core.client_options import ClientOptions

import cocoindex

class ToMarkdown(cocoindex.op.FunctionSpec):
    """Convert a PDF to markdown using Google Document AI."""

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class DocumentAIExecutor:
    """Executor for Google Document AI to parse files.
    Supported file types: https://cloud.google.com/document-ai/docs/file-types
    """

    spec: ToMarkdown
    _client: documentai.DocumentProcessorServiceClient
    _processor_name: str

    def prepare(self):
        # Initialize Document AI.
        # You need to set the GOOGLE_APPLICATION_CREDENTIALS environment variable,
        # or explicitly create credentials and set project_id.
        project_id = os.environ.get("GOOGLE_CLOUD_PROJECT_ID")
        location = os.environ.get("GOOGLE_CLOUD_LOCATION", "us")
        processor_id = os.environ.get("GOOGLE_CLOUD_PROCESSOR_ID")

        # You must set the api_endpoint if you use a location other than 'us'.
        opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
        self._client = documentai.DocumentProcessorServiceClient(client_options=opts)
        self._processor_name = self._client.processor_path(project_id, location, processor_id)

    async def __call__(self, content: bytes) -> str:
        # Process the document with the configured Document AI processor.
        request = documentai.ProcessRequest(
            name=self._processor_name,
            raw_document=documentai.RawDocument(content=content, mime_type="application/pdf")
        )
        response = self._client.process_document(request=request)
        document = response.document

        # Extract the text from the document.
        text = document.text

        # Convert to markdown format.
        # This is a simple conversion - you might want to enhance this based on your needs
        # by using document.pages, entities, etc. for more structured markdown.
        return text


def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
    """
    Embed the text using a SentenceTransformer model.
    """
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"))

@cocoindex.flow_def(name="PdfEmbedding")
def pdf_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    """
    Define an example flow that embeds files into a vector database.
    """
    data_scope["documents"] = flow_builder.add_source(cocoindex.sources.LocalFile(path="pdf_files", binary=True))

    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        doc["markdown"] = doc["content"].transform(ToMarkdown())
        doc["chunks"] = doc["markdown"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown", chunk_size=2000, chunk_overlap=500)

        with doc["chunks"].row() as chunk:
            chunk["embedding"] = chunk["text"].call(text_to_embedding)
            doc_embeddings.collect(id=cocoindex.GeneratedField.UUID,
                                   filename=doc["filename"], location=chunk["location"],
                                   text=chunk["text"], embedding=chunk["embedding"])

    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.storages.Postgres(),
        primary_key_fields=["id"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])

query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
    name="SemanticsSearch",
    flow=pdf_embedding_flow,
    target_name="doc_embeddings",
    query_transform_flow=text_to_embedding,
    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)

@cocoindex.main_fn()
def _run():
    # Run queries in a loop to demonstrate the query capabilities.
    while True:
        try:
            query = input("Enter search query (or Enter to quit): ")
            if query == '':
                break
            results, _ = query_handler.search(query, 10)
            print("\nSearch results:")
            for result in results:
                print(f"[{result.score:.3f}] {result.data['filename']}")
                print(f"    {result.data['text']}")
                print("---")
            print()
        except KeyboardInterrupt:
            break

if __name__ == "__main__":
    load_dotenv(override=True)
    _run()
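The executor above returns `document.text` as-is, and its comment suggests using `document.pages` and entities for more structured markdown. One possible sketch of that enhancement is below. It is not part of this commit; the helper names `_layout_text` and `document_to_markdown` are hypothetical, while the Document AI fields used (`pages`, `paragraphs`, `layout.text_anchor.text_segments`) are the standard layout fields of the `Document` message.

```python
from google.cloud import documentai_v1 as documentai

def _layout_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    # A layout references the full document text via text segments (start/end offsets).
    return "".join(
        text[int(seg.start_index):int(seg.end_index)]
        for seg in layout.text_anchor.text_segments
    )

def document_to_markdown(document: documentai.Document) -> str:
    # Sketch: emit one section per page, with each detected paragraph as its own block.
    lines: list[str] = []
    for i, page in enumerate(document.pages, start=1):
        lines.append(f"## Page {i}")
        for paragraph in page.paragraphs:
            lines.append(_layout_text(paragraph.layout, document.text).strip())
    return "\n\n".join(lines)
```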

pdf_files/1706.03762v7.pdf

2.11 MB
Binary file not shown.

pdf_files/1810.04805v2.pdf

757 KB
Binary file not shown.

pdf_files/rfc8259.pdf

22.2 KB
Binary file not shown.

pyproject.toml

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
[project]
name = "pdf-embedding"
version = "0.1.0"
description = "Simple example for cocoindex: build embedding index based on local PDF files."
requires-python = ">=3.10"
dependencies = [
    "cocoindex>=0.1.20",
    "python-dotenv>=1.0.1",
    "google-cloud-documentai>=2.20.1",
]
