(feat): new examples #28

Open · wants to merge 6 commits into `main`
163 changes: 163 additions & 0 deletions 12_model/2_openai.py
@@ -0,0 +1,163 @@
import hashlib
import os
from functools import lru_cache

from openai import OpenAI

from chalk import online
from chalk.features import DataFrame, before_all, features, has_many

openai_client: OpenAI

# Prompt templates, keyed by question category, that we run against a user's title
TITLE_PROMPTS = dict(
    is_exec="Does the job title `{title}` mean that the person is an executive at their company? Please answer with one word, either: `Yes` or `No`.",
    is_swe="Does the job title `{title}` mean the person is a software engineer? Please answer with one word, either: `Yes` or `No`.",
)


@lru_cache
def hash_prompt(prompt: str, length: int = 16) -> str:
    """Hash a prompt to a short numeric string. This is useful for caching OpenAI API requests."""
    return str(int(hashlib.sha256(prompt.encode("utf-8")).hexdigest(), 16) % 10**length)


def get_openai_yes_no_answer(response: str) -> bool | None:
    """Parses a yes or no answer out of the response. If it is ambiguous, returns None."""
    yes = "yes" in response
    no = "no" in response
    if (yes and no) or len(response) > 50:
        # The answer is ambiguous, so we don't make a decision
        return None
    if yes:
        return True
    if no:
        return False
    return None


@before_all
def initialize_open_ai_client():
    # Note: be cautious when using `global` to give resolvers access to
    # custom data sources: https://docs.chalk.ai/docs/generic#scopes
global openai_client

openai_client = OpenAI(
        # This assumes that OPEN_AI_API_KEY is set in your Chalk
        # deployment: https://docs.chalk.ai/docs/env-vars
api_key=os.environ.get("OPEN_AI_API_KEY"),
)


@features
class OpenAiQuery:
id: str
user_id: str
    # currently, this is one of `is_exec` or `is_swe` (it is the question "type")
category: str
prompt: str
prompt_hash: str
prompt_result: "OpenAiQueryResult"


# Setting max_staleness to infinity caches our OpenAI query results,
# limiting our API calls for users with equivalent titles.
@features(max_staleness="infinity")
class OpenAiQueryResult:
id: str
result: str
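    # Joins back to every query whose prompt hashes to this result's id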
queries: DataFrame[OpenAiQuery] = has_many(
lambda: OpenAiQuery.prompt_hash == OpenAiQueryResult.id
)


@features
class User:
id: str
title: str
is_exec: bool
is_swe: bool
open_ai_queries: DataFrame[OpenAiQuery] = has_many(
lambda: User.id == OpenAiQuery.user_id
)


@online
def get_openai_title_queries(
user_id: User.id,
title: User.title,
) -> User.open_ai_queries:
open_ai_title_queries = []
for category, title_prompt in TITLE_PROMPTS.items():
prompt = title_prompt.format(title=title)
prompt_hash = hash_prompt(prompt)
open_ai_title_queries.append(
OpenAiQuery(
id=f"{user_id}_{prompt_hash}",
user_id=user_id,
prompt=prompt,
category=category,
prompt_hash=prompt_hash,
)
)
return DataFrame(open_ai_title_queries)


# Run queries keyed by the hash of the prompt so that identical prompts hit the cache
@online
def get_openai_answer(
prompt_hash: OpenAiQuery.prompt_hash,
prompt: OpenAiQuery.prompt,
) -> OpenAiQuery.prompt_result:
result = openai_client.chat.completions.create(
messages=[
{
"role": "user",
"content": prompt,
}
],
model="gpt-3.5-turbo",
)

return OpenAiQueryResult(
id=prompt_hash,
result=result.choices[0].message.content,
)


@online
def get_openai_is_exec(
    result: User.open_ai_queries[OpenAiQuery.category == "is_exec"].prompt_result,
) -> User.is_exec:
    """Does OpenAI think our user is an executive?"""
try:
result_cleaned = result[0].result.lower()
return get_openai_yes_no_answer(result_cleaned)
except IndexError:
return None


@online
def get_openai_is_swe(
result: User.open_ai_queries[OpenAiQuery.category == "is_swe"].prompt_result,
) -> User.is_swe:
"""does openai think our user is a software engineer?"""
try:
result_cleaned = result[0].result.lower()
return get_openai_yes_no_answer(result_cleaned)
except IndexError:
return None


@online
def dummy_users() -> DataFrame[User.id, User.title]:
"""Creates some dummy users for us to test"""
return DataFrame(
[
            User(id="1", title="CEO"),
            User(id="2", title="Queen of Scots"),
            User(id="3", title="VP of Finance"),
            User(id="4", title="SWE"),
            User(id="5", title="Principal Software Engineer"),
            User(id="6", title="Ingénieur Logiciel"),
]
)
92 changes: 92 additions & 0 deletions 12_model/3_embeddings.py
@@ -0,0 +1,92 @@
from chalk import embedding, online
from chalk.features import (
    DataFrame,
    Primary,
    Vector,
    features,
    has_many,
)


@features(max_staleness="infinity")
class FAQDocument:
id: int
title: str
body: str
link: str
embedding: Vector = embedding(
input=lambda: FAQDocument.body,
provider="openai",
model="text-embedding-ada-002",
)


@features
class SearchQuery:
query: Primary[str]
    max_runtime: int | None = None
embedding: Vector = embedding(
input=lambda: SearchQuery.query,
provider="openai",
model="text-embedding-ada-002",
)
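    # Nearest-neighbor join: FAQ documents ranked by embedding similarity to the query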
faqs: DataFrame[FAQDocument] = has_many(
lambda: SearchQuery.embedding.is_near(FAQDocument.embedding)
)
response: str


@online
def generate_response(
# Query for the three most relevant documents, and select their links
nearest_faqs: SearchQuery.faqs[FAQDocument.link, :3]
) -> SearchQuery.response:
return "\n".join(nearest_faqs[FAQDocument.link])


@online
def get_faqs() -> (
DataFrame[FAQDocument.id, FAQDocument.title, FAQDocument.body, FAQDocument.link]
):
return DataFrame(
[
FAQDocument(
id=1,
title="What is the difference between the online store and the offline store?",
body="The online store is intended to store features for low-latency retrieval in online query. Typically, the online store is implemented using Redis, DynamoDB, or (in some cases) Postgres. The offline store is intended to store historical logs of all previously ingested or computed features. It is used to compute large historical training sets. It is typically implemented using BigQuery, Snowflake, or other data warehouses.",
link="https://docs.chalk.ai/docs/faq#what-is-the-difference-between-the-online-store-and-the-offline-store",
),
FAQDocument(
id=2,
title="Can we do RBAC (Role Based Access Control) within Chalk?",
body="Yes! Within the dashboard you can assign roles with different permissions to different users. The default roles available are shown below.",
link="https://docs.chalk.ai/docs/faq#can-we-do-rbac-role-based-access-control-within-chalk",
),
FAQDocument(
id=3,
title="What are the necessary steps for us to get Chalk in our system?",
body="Please reach out via your support channel and we’d be happy to walk you through how to get Chalk setup running on your cloud infrastructure!",
link="https://docs.chalk.ai/docs/faq#what-are-the-necessary-steps-for-us-to-get-chalk-in-our-system",
),
FAQDocument(
id=4,
title="Does Chalk have a feature catalog?",
body="Yes! You can view all the features for all namespaces deployed in your environments, along with some metadata on recent activity and updates.",
link="https://docs.chalk.ai/docs/faq#does-chalk-have-a-feature-catalog",
),
FAQDocument(
id=5,
title="Can I upload features into the online store with an API endpoint?",
body="Yes! In addition to streaming and scheduled bulk ingests of features, you can submit requests using the upload_features SDK endpoints to synchronously ingest features into the online or offline stores using API clients.",
link="https://docs.chalk.ai/docs/faq#can-i-upload-features-into-the-online-store-with-an-api-endpoint",
),
FAQDocument(
id=6,
title="How are resources provisioned for my Chalk cluster, and can I modify the configuration?",
body="We have default resource configurations for general environments. You can modify the configuration for your project’s cloud resources by modifying the specs under Settings > Resources > Advanced Resource Configuration. You must hit Save and Apply Changes in order for your configuration changes to go through. If you are not sure how you should configure your cloud resources, please reach out to us in your support channel!",
link="https://docs.chalk.ai/docs/faq#how-are-resources-provisioned-for-my-chalk-cluster-and-can-i-modify-the-configuration",
),
]
)
82 changes: 77 additions & 5 deletions 12_model/README.md
@@ -3,9 +3,9 @@
With Chalk, it's easy to run models in resolvers.

## 1. Models
The example code below, which can be found in its entirety in the **[1_model.py](1_model.py)** file,
shows how to integrate a predictive model into a resolver.

```python
class PredictionModel:
    ...  # class body collapsed in this diff; see 1_model.py for the full implementation

churn_model = PredictionModel("churn_model.skops")

@online
def get_user_churn_probability(
    age: User.age,
    num_friends: User.num_friends,
    viewed_minutes: User.viewed_minutes,
) -> User.probability_of_churn:
"""
This resolver runs a model that has been trained on a user's age, num_friends
and viewed_minutes. It returns a platform churn prediction.
"""
return churn_model.predict(np.array([[age, num_friends, viewed_minutes]]))
```
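
The `PredictionModel` class body is collapsed in this diff; the full version is in
**[1_model.py](1_model.py)**. As a rough sketch of what such a wrapper can look like, assuming the
model was serialized with [skops](https://skops.readthedocs.io/) (suggested by the `.skops`
extension; the `trusted` list below is illustrative and depends on the saved estimator):

```python
import numpy as np
from skops.io import load


class PredictionModel:
    """Loads a persisted scikit-learn model and exposes a prediction helper."""

    def __init__(self, path: str):
        # `trusted` whitelists the types allowed during deserialization; the
        # exact list depends on the estimator that was trained and saved.
        self.model = load(path, trusted=["sklearn.tree._classes.DecisionTreeClassifier"])

    def predict(self, features: np.ndarray) -> float:
        # Probability of the positive (churn) class for a single row
        return float(self.model.predict_proba(features)[0, 1])
```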

## 2. OpenAI

Chalk also makes it easy to integrate third-party models, like ChatGPT, into your resolvers. In the
following example, we use ChatGPT to answer questions about our users.

Additionally, since our questions are often repeated, we cache the results of the queries,
limiting the number of API requests we need to make.
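
This caching comes from the `max_staleness` setting on the result feature set, shown here as it
appears in **[2_openai.py](2_openai.py)**: results are keyed by the prompt hash, so a repeated
prompt is served from the feature store rather than triggering a new API call.

```python
# Setting max_staleness to infinity caches our OpenAI query results,
# limiting our API calls for users with equivalent titles.
@features(max_staleness="infinity")
class OpenAiQueryResult:
    id: str  # the prompt hash
    result: str
```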

The example code below, which can be found in its entirety in the **[2_openai.py](2_openai.py)** file,
shows how to run an API request in a Python resolver:

```python
# Run queries keyed by the hash of the prompt so that identical prompts hit the cache
@online
def get_openai_answer(
prompt_hash: OpenAiQuery.prompt_hash,
prompt: OpenAiQuery.prompt,
) -> OpenAiQuery.prompt_result:
result = openai_client.chat.completions.create(
messages=[
{
            "role": "user",
            "content": prompt,
}
],
model="gpt-3.5-turbo",
)

return OpenAiQueryResult(
id=prompt_hash,
result=result.choices[0].message.content,
)
```
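
Once deployed, these resolvers can be invoked with an online query. A minimal sketch using Chalk's
Python API client (the feature names follow the classes in 2_openai.py; client configuration and
credentials depend on your deployment):

```python
from chalk.client import ChalkClient

client = ChalkClient()

# Resolving User.is_exec triggers the prompt construction, the (cached)
# OpenAI call, and the yes/no parsing defined in 2_openai.py.
result = client.query(
    input={"user.id": "1"},
    output=["user.is_exec", "user.is_swe"],
)
```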

## 3. Embeddings

Chalk supports embeddings for your text features.

The example code below, which can be found in its entirety in the **[3_embeddings.py](3_embeddings.py)** file,
shows how to create embedding features and join feature sets based on their similarity:

```python
@features(max_staleness="infinity")
class FAQDocument:
id: int
title: str
body: str
link: str
    embedding: Vector = embedding(
        input=lambda: FAQDocument.body,
provider="openai",
model="text-embedding-ada-002",
)


@features
class SearchQuery:
query: Primary[str]
    max_runtime: int | None = None
    embedding: Vector = embedding(
        input=lambda: SearchQuery.query,
provider="openai",
model="text-embedding-ada-002",
)
faqs: DataFrame[FAQDocument] = has_many(
lambda: SearchQuery.embedding.is_near(FAQDocument.embedding)
)
response: str
```
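
As with the OpenAI example, the search pipeline can be exercised with an online query. A minimal
sketch, assuming a deployed project with the OpenAI embedding integration configured (the query
string is illustrative):

```python
from chalk.client import ChalkClient

client = ChalkClient()

# Resolving SearchQuery.response embeds the query text, finds the three
# nearest FAQ documents by vector similarity, and joins their links.
result = client.query(
    input={"search_query.query": "How do I configure my cluster's resources?"},
    output=["search_query.response"],
)
```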