(feat): new examples #28

Open · wants to merge 6 commits into `main`
163 changes: 163 additions & 0 deletions 12_model/2_openai.py
@@ -0,0 +1,163 @@
import hashlib
import os
from functools import lru_cache

from openai import OpenAI

from chalk import online
from chalk.features import DataFrame, before_all, features, has_many

openai_client: OpenAI

# Prompt templates, keyed by question category, that we run against a user's title
TITLE_PROMPTS = dict(
    is_exec="Does the job title `{title}` mean that the person is an executive at their company? Please answer with one word, either: `Yes` or `No`.",
    is_swe="Does the job title `{title}` mean the person is a software engineer? Please answer with one word, either: `Yes` or `No`.",
)


@lru_cache
def hash_prompt(prompt: str, length: int = 16) -> str:
    """Hash a prompt to a short numeric string. This is useful for caching OpenAI API requests."""
    return str(int(hashlib.sha256(prompt.encode("utf-8")).hexdigest(), 16) % 10**length)


def get_openai_yes_no_answer(response: str) -> bool | None:
    """Parses a yes or no answer out of the response. If it is ambiguous, returns None."""
    yes = "yes" in response
    no = "no" in response
    if (yes and no) or len(response) > 50:
        # The answer is ambiguous, so we don't make a decision
        return None
    if yes:
        return True
    if no:
        return False
    return None


@before_all
def initialize_open_ai_client():
    # Note: be cautious when using `global` to give resolvers access to
    # custom data sources: https://docs.chalk.ai/docs/generic#scopes
global openai_client

openai_client = OpenAI(
        # This assumes that OPEN_AI_API_KEY is set in your Chalk
        # deployment: https://docs.chalk.ai/docs/env-vars
api_key=os.environ.get("OPEN_AI_API_KEY"),
)


@features
class OpenAiQuery:
id: str
user_id: str
    # currently, this is one of `is_exec` or `is_swe` (it is the question "type")
category: str
prompt: str
prompt_hash: str
prompt_result: "OpenAiQueryResult"


# Setting max_staleness to infinity caches our OpenAI query results,
# limiting our API calls for users with equivalent titles.
@features(max_staleness="infinity")
class OpenAiQueryResult:
id: str
result: str
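    # Joins back to every query whose prompt hashes to this result's id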
queries: DataFrame[OpenAiQuery] = has_many(
lambda: OpenAiQuery.prompt_hash == OpenAiQueryResult.id
)


@features
class User:
id: str
title: str
is_exec: bool
is_swe: bool
open_ai_queries: DataFrame[OpenAiQuery] = has_many(
lambda: User.id == OpenAiQuery.user_id
)


@online
def get_openai_title_queries(
user_id: User.id,
title: User.title,
) -> User.open_ai_queries:
open_ai_title_queries = []
for category, title_prompt in TITLE_PROMPTS.items():
prompt = title_prompt.format(title=title)
prompt_hash = hash_prompt(prompt)
open_ai_title_queries.append(
OpenAiQuery(
id=f"{user_id}_{prompt_hash}",
user_id=user_id,
prompt=prompt,
category=category,
prompt_hash=prompt_hash,
)
)
return DataFrame(open_ai_title_queries)


# Run queries keyed by the hash of the prompt so that identical prompts hit the cache
@online
def get_openai_answer(
prompt_hash: OpenAiQuery.prompt_hash,
prompt: OpenAiQuery.prompt,
) -> OpenAiQuery.prompt_result:
result = openai_client.chat.completions.create(
messages=[
{
"role": "user",
"content": prompt,
}
],
model="gpt-3.5-turbo",
)

return OpenAiQueryResult(
id=prompt_hash,
result=result.choices[0].message.content,
)


@online
def get_openai_is_exec(
    result: User.open_ai_queries[OpenAiQuery.category == "is_exec"].prompt_result,
) -> User.is_exec:
    """Does OpenAI think our user is an executive?"""
try:
result_cleaned = result[0].result.lower()
return get_openai_yes_no_answer(result_cleaned)
except IndexError:
return None


@online
def get_openai_is_swe(
result: User.open_ai_queries[OpenAiQuery.category == "is_swe"].prompt_result,
) -> User.is_swe:
"""does openai think our user is a software engineer?"""
try:
result_cleaned = result[0].result.lower()
return get_openai_yes_no_answer(result_cleaned)
except IndexError:
return None


@online
def dummy_users() -> DataFrame[User.id, User.title]:
"""Creates some dummy users for us to test"""
return DataFrame(
[
            User(id="1", title="CEO"),
            User(id="2", title="Queen of Scots"),
            User(id="3", title="VP of Finance"),
            User(id="4", title="SWE"),
            User(id="5", title="Principal Software Engineer"),
            User(id="6", title="Ingénieur Logiciel"),
]
)
92 changes: 92 additions & 0 deletions 12_model/3_embeddings.py
@@ -0,0 +1,92 @@
from chalk import embedding, online
from chalk.features import (
    DataFrame,
    Primary,
    Vector,
    features,
    has_many,
)


@features(max_staleness="infinity")
class FAQDocument:
id: int
title: str
body: str
link: str
embedding: Vector = embedding(
input=lambda: FAQDocument.body,
provider="openai",
model="text-embedding-ada-002",
)


@features
class SearchQuery:
query: Primary[str]
    max_runtime: int | None = None
embedding: Vector = embedding(
input=lambda: SearchQuery.query,
provider="openai",
model="text-embedding-ada-002",
)
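    # Nearest-neighbor join: FAQ documents ranked by embedding similarity to the query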
faqs: DataFrame[FAQDocument] = has_many(
lambda: SearchQuery.embedding.is_near(FAQDocument.embedding)
)
response: str


@online
def generate_response(
# Query for the three most relevant documents, and select their links
nearest_faqs: SearchQuery.faqs[FAQDocument.link, :3]
) -> SearchQuery.response:
return "\n".join(nearest_faqs[FAQDocument.link])


@online
def get_faqs() -> (
DataFrame[FAQDocument.id, FAQDocument.title, FAQDocument.body, FAQDocument.link]
):
return DataFrame(
[
FAQDocument(
id=1,
title="What is the difference between the online store and the offline store?",
body="The online store is intended to store features for low-latency retrieval in online query. Typically, the online store is implemented using Redis, DynamoDB, or (in some cases) Postgres. The offline store is intended to store historical logs of all previously ingested or computed features. It is used to compute large historical training sets. It is typically implemented using BigQuery, Snowflake, or other data warehouses.",
link="https://docs.chalk.ai/docs/faq#what-is-the-difference-between-the-online-store-and-the-offline-store",
),
FAQDocument(
id=2,
title="Can we do RBAC (Role Based Access Control) within Chalk?",
body="Yes! Within the dashboard you can assign roles with different permissions to different users. The default roles available are shown below.",
link="https://docs.chalk.ai/docs/faq#can-we-do-rbac-role-based-access-control-within-chalk",
),
FAQDocument(
id=3,
title="What are the necessary steps for us to get Chalk in our system?",
body="Please reach out via your support channel and we’d be happy to walk you through how to get Chalk setup running on your cloud infrastructure!",
link="https://docs.chalk.ai/docs/faq#what-are-the-necessary-steps-for-us-to-get-chalk-in-our-system",
),
FAQDocument(
id=4,
title="Does Chalk have a feature catalog?",
body="Yes! You can view all the features for all namespaces deployed in your environments, along with some metadata on recent activity and updates.",
link="https://docs.chalk.ai/docs/faq#does-chalk-have-a-feature-catalog",
),
FAQDocument(
id=5,
title="Can I upload features into the online store with an API endpoint?",
body="Yes! In addition to streaming and scheduled bulk ingests of features, you can submit requests using the upload_features SDK endpoints to synchronously ingest features into the online or offline stores using API clients.",
link="https://docs.chalk.ai/docs/faq#can-i-upload-features-into-the-online-store-with-an-api-endpoint",
),
FAQDocument(
id=6,
title="How are resources provisioned for my Chalk cluster, and can I modify the configuration?",
body="We have default resource configurations for general environments. You can modify the configuration for your project’s cloud resources by modifying the specs under Settings > Resources > Advanced Resource Configuration. You must hit Save and Apply Changes in order for your configuration changes to go through. If you are not sure how you should configure your cloud resources, please reach out to us in your support channel!",
link="https://docs.chalk.ai/docs/faq#how-are-resources-provisioned-for-my-chalk-cluster-and-can-i-modify-the-configuration",
),
]
)
82 changes: 77 additions & 5 deletions 12_model/README.md
@@ -3,9 +3,9 @@
With Chalk, it's easy to run models in resolvers.

## 1. Models
The example code below, which can be found in its entirety in the **[1_model.py](1_model.py)** file,
shows how to integrate a predictive model into a resolver.

```python
class PredictionModel:
    ...  # class body collapsed in this diff; see 1_model.py for the full implementation

churn_model = PredictionModel("churn_model.skops")

@online
def get_user_churn_probability(
    age: User.age,
    num_friends: User.num_friends,
    viewed_minutes: User.viewed_minutes,
) -> User.probability_of_churn:
"""
This resolver runs a model that has been trained on a user's age, num_friends
and viewed_minutes. It returns a platform churn prediction.
"""
return churn_model.predict(np.array([[age, num_friends, viewed_minutes]]))
```
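
The `PredictionModel` class body is collapsed in this diff; the full version is in
**[1_model.py](1_model.py)**. As a rough sketch of what such a wrapper can look like, assuming the
model was serialized with [skops](https://skops.readthedocs.io/) (suggested by the `.skops`
extension; the `trusted` list below is illustrative and depends on the saved estimator):

```python
import numpy as np
from skops.io import load


class PredictionModel:
    """Loads a persisted scikit-learn model and exposes a prediction helper."""

    def __init__(self, path: str):
        # `trusted` whitelists the types allowed during deserialization; the
        # exact list depends on the estimator that was trained and saved.
        self.model = load(path, trusted=["sklearn.tree._classes.DecisionTreeClassifier"])

    def predict(self, features: np.ndarray) -> float:
        # Probability of the positive (churn) class for a single row
        return float(self.model.predict_proba(features)[0, 1])
```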

## 2. OpenAI

Chalk also makes it easy to integrate third-party models, like ChatGPT, into your resolvers. In the
following example, we use ChatGPT to answer questions about our users.

Additionally, since our questions are often repeated, we cache the results of the queries,
limiting the number of API requests we need to make.
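
This caching comes from the `max_staleness` setting on the result feature set, shown here as it
appears in **[2_openai.py](2_openai.py)**: results are keyed by the prompt hash, so a repeated
prompt is served from the feature store rather than triggering a new API call.

```python
# Setting max_staleness to infinity caches our OpenAI query results,
# limiting our API calls for users with equivalent titles.
@features(max_staleness="infinity")
class OpenAiQueryResult:
    id: str  # the prompt hash
    result: str
```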

The example code below, which can be found in its entirety in the **[2_openai.py](2_openai.py)** file,
shows how to run an API request in a Python resolver:

```python
# Run queries keyed by the hash of the prompt so that identical prompts hit the cache
@online
def get_openai_answer(
prompt_hash: OpenAiQuery.prompt_hash,
prompt: OpenAiQuery.prompt,
) -> OpenAiQuery.prompt_result:
result = openai_client.chat.completions.create(
messages=[
{
            "role": "user",
            "content": prompt,
}
],
model="gpt-3.5-turbo",
)

return OpenAiQueryResult(
id=prompt_hash,
result=result.choices[0].message.content,
)
```
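
Once deployed, these resolvers can be invoked with an online query. A minimal sketch using Chalk's
Python API client (the feature names follow the classes in 2_openai.py; client configuration and
credentials depend on your deployment):

```python
from chalk.client import ChalkClient

client = ChalkClient()

# Resolving User.is_exec triggers the prompt construction, the (cached)
# OpenAI call, and the yes/no parsing defined in 2_openai.py.
result = client.query(
    input={"user.id": "1"},
    output=["user.is_exec", "user.is_swe"],
)
```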

## 3. Embeddings

Chalk supports embeddings for your text features.

The example code below, which can be found in its entirety in the **[3_embeddings.py](3_embeddings.py)** file,
shows how to create embedding features and join feature sets based on their similarity:

```python
@features(max_staleness="infinity")
class FAQDocument:
id: int
title: str
body: str
link: str
    embedding: Vector = embedding(
        input=lambda: FAQDocument.body,
provider="openai",
model="text-embedding-ada-002",
)


@features
class SearchQuery:
query: Primary[str]
    max_runtime: int | None = None
    embedding: Vector = embedding(
        input=lambda: SearchQuery.query,
provider="openai",
model="text-embedding-ada-002",
)
faqs: DataFrame[FAQDocument] = has_many(
lambda: SearchQuery.embedding.is_near(FAQDocument.embedding)
)
response: str
```
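
As with the OpenAI example, the search pipeline can be exercised with an online query. A minimal
sketch, assuming a deployed project with the OpenAI embedding integration configured (the query
string is illustrative):

```python
from chalk.client import ChalkClient

client = ChalkClient()

# Resolving SearchQuery.response embeds the query text, finds the three
# nearest FAQ documents by vector similarity, and joins their links.
result = client.query(
    input={"search_query.query": "How do I configure my cluster's resources?"},
    output=["search_query.response"],
)
```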