custom_extractor.py
from typing import List

from pydantic import BaseModel

from indexify_extractor_sdk import Content, Extractor, Feature

# Extractors can be parameterized by providing a pydantic model as the second
# argument to the Extractor class. The model is exposed in the Indexify API so
# that users can dynamically change the behavior of the extractor for various
# use cases. Examples include the chunk size of audio clips during
# segmentation, or the text-splitter algorithm used for embedding long
# documents.
class InputParams(BaseModel):
    a: int = 0
    b: str = ""
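
# A minimal sketch of supplying parameters (the values below are illustrative,
# not from the SDK): Indexify passes them as JSON and pydantic validates them
# into the model above, equivalent to constructing it directly:
#
#     InputParams(a=5, b="recursive")
#     InputParams(**{"a": 5, "b": "recursive"})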


class MyExtractor(Extractor):
    name = "yourorgname/myextractor"
    description = "Description of the extractor goes here."

    # Any system dependencies that the Python code here depends on need to be
    # listed here. We use Ubuntu base images, so any Ubuntu package can be
    # installed here.
    system_dependencies = []

    # The mime types of content that this extractor can process. Any content
    # ingested into Indexify which does not match one of these mime types will
    # not be sent to this extractor.
    input_mime_types = ["text/plain"]

    def __init__(self):
        super().__init__()

    # The extract method is the main entrypoint for the extractor. It takes a
    # Content object and returns a list of transformed Content objects. Each
    # transformed Content can carry features that will be indexed. For example,
    # a long document can be split into multiple chunks, each carrying features
    # such as an embedding, JSON metadata, etc. If you are only doing ETL
    # (transformation), you can return content with no features.
    def extract(self, content: Content, params: InputParams) -> List[Content]:
        return [
            # If the name of the embedding field in the schema is anything
            # besides "embedding", you must specify the name of the field in
            # the Feature.embedding call:
            # Feature.embedding(values=[1, 2, 3], name="my_embedding")
            Content.from_text(
                text="Hello World", features=[Feature.embedding(values=[1, 2, 3])]
            ),
            Content.from_text(
                text="Pipe Baz", features=[Feature.embedding(values=[1, 2, 3])]
            ),
            Content.from_text(
                text="Hello World",
                features=[Feature.metadata({"key": "value"})],
            ),
        ]

    # Provide some representative sample input that the extractor can process.
    def sample_input(self) -> Content:
        return Content.from_text(text="Hello World")


if __name__ == "__main__":
    # You can run the extractor by invoking this Python file directly:
    #   python custom_extractor.py
    # This runs the extractor with the sample input provided above.
    MyExtractor().extract_sample_input()
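
# A minimal sketch of calling the extractor programmatically; the input text
# and parameter values below are illustrative, not part of the original file:
#
#     extractor = MyExtractor()
#     results = extractor.extract(
#         Content.from_text(text="some text"), InputParams(a=1, b="example")
#     )
#     for item in results:
#         print(item)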