From 8a0e8304f202157be0047fe62c8197a82c7acc04 Mon Sep 17 00:00:00 2001
From: Samuel Mignot
Date: Mon, 24 Jun 2024 16:02:15 -0700
Subject: [PATCH 1/6] revert new examples

---
 12_model/2_openai.py     | 159 +++++++++++++++++++++++++++++++++++++++
 12_model/3_embeddings.py | 111 +++++++++++++++++++++++++++
 12_model/README.md       |  41 ++++++++--
 3 files changed, 306 insertions(+), 5 deletions(-)
 create mode 100644 12_model/2_openai.py
 create mode 100644 12_model/3_embeddings.py

diff --git a/12_model/2_openai.py b/12_model/2_openai.py
new file mode 100644
index 0000000..4af8978
--- /dev/null
+++ b/12_model/2_openai.py
@@ -0,0 +1,159 @@
+from chalk import online
+from chalk.features import features, has_many, DataFrame, before_all
+
+import hashlib
+import os
+from functools import lru_cache
+
+from openai import OpenAI
+
+openai_client: OpenAI
+
+# A list of prompts that we can run based on our user's titles
+TITLE_PROMPTS = dict(
+    is_exec="Does the job title `{title}` mean that the person is an executive at their company? Please answer with one word, either: `Yes` or `No`.",
+    is_swe="Does the job title `{title}` mean the person is a software engineer? Please answer with one word, either: `Yes` or `No`."
+)
+
+
+@lru_cache
+def hash_prompt(prompt: str, length=16) -> str:
+    """Hash a prompt to a fixed length string. This is useful for caching OpenAI API requests"""
+    return str(int(hashlib.sha256(prompt.encode('utf-8')).hexdigest(), 16) % 10 ** length)
+
+
+def get_openai_yes_no_answer(response: str) -> bool | None:
+    """Tests whether the response is a yes or no answer. If it is ambiguous, returns None."""
+    yes = 'yes' in response
+    no = 'no' in response
+    if (yes and no) or len(response) > 50:
+        # our answer is a bit ambiguous, let's not make a decision
+        return None
+    if yes:
+        return True
+    if no:
+        return False
+
+
+@before_all
+def initialize_open_ai_client():
+    # Note: one should be cautious when using global to give access to custom
+    # data sources: https://docs.chalk.ai/docs/generic#scopes
+    global openai_client
+
+    openai_client = OpenAI(
+        # This assumes that OPEN_AI_API_KEY is set in your Chalk
+        # deployment: https://docs.chalk.ai/docs/env-vars
+        api_key=os.environ.get("OPEN_AI_API_KEY"),
+    )
+
+
+@features
+class OpenAiQuery:
+    id: str
+    user_id: str
+    # currently, this is one of `is_exec` or `is_swe` (it is the question "type")
+    category: str
+    prompt: str
+    prompt_hash: str
+    prompt_result: "OpenAiQueryResult"
+
+
+# Setting max_staleness to infinity caches our OpenAI query results, limiting
+# our API calls for users with equivalent titles.
+@features(max_staleness="infinity")
+class OpenAiQueryResult:
+    id: str
+    result: str
+    queries: DataFrame[OpenAiQuery] = has_many(lambda: OpenAiQuery.prompt_hash == OpenAiQueryResult.id)
+
+
+@features
+class User:
+    id: str
+    title: str
+    is_exec: bool
+    is_swe: bool
+    open_ai_queries: DataFrame[OpenAiQuery] = has_many(lambda: User.id == OpenAiQuery.user_id)
+
+
+@online
+def get_openai_title_queries(
+    user_id: User.id,
+    title: User.title,
+) -> User.open_ai_queries:
+    open_ai_title_queries = []
+    for category, title_prompt in TITLE_PROMPTS.items():
+        prompt = title_prompt.format(title=title)
+        prompt_hash = hash_prompt(prompt)
+        open_ai_title_queries.append(
+            OpenAiQuery(
+                id=f"{user_id}_{prompt_hash}",
+                user_id=user_id,
+                prompt=prompt,
+                category=category,
+                prompt_hash=prompt_hash,
+            )
+        )
+    return DataFrame(open_ai_title_queries)
+
+
+# run queries by the hash of the prompt
+@online
+def get_openai_answer(
+    prompt_hash: OpenAiQuery.prompt_hash,
+    prompt: OpenAiQuery.prompt,
+) -> OpenAiQuery.prompt_result:
+    result = openai_client.chat.completions.create(
+        messages=[
+            {
+                'role': 'user',
+                'content': prompt,
+            }
+        ],
+        model="gpt-3.5-turbo",
+    )
+
+    return OpenAiQueryResult(
+        id=prompt_hash,
+        result=result.choices[0].message.content,
+    )
+
+
+@online
+def get_openai_is_exec(
+    result: User.open_ai_queries[OpenAiQuery.category == "is_exec"].prompt_result,
+) -> User.is_exec:
+    """does openai think our user is an executive?"""
+    try:
+        result_cleaned = result[0].result.lower()
+        return get_openai_yes_no_answer(result_cleaned)
+    except IndexError:
+        return None
+
+
+@online
+def get_openai_is_swe(
+    result: User.open_ai_queries[OpenAiQuery.category == "is_swe"].prompt_result,
+) -> User.is_swe:
+    """does openai think our user is a software engineer?"""
+    try:
+        result_cleaned = result[0].result.lower()
+        return get_openai_yes_no_answer(result_cleaned)
+    except IndexError:
+        return None
+
+
+@online
+def dummy_users() -> DataFrame[User.id, User.title]:
+    """Creates some dummy users for us to test"""
+    return DataFrame(
+        [
+            User(id=1, title="CEO"),
+            User(id=2, title="Queen of Scots"),
+            User(id=3, title='VP of Finance'),
+            User(id=4, title='SWE'),
+            User(id=5, title='Principal Software Engineer'),
+            User(id=6, title='Ingénieur Logiciel'),
+        ]
+    )
diff --git a/12_model/3_embeddings.py b/12_model/3_embeddings.py
new file mode 100644
index 0000000..baf6c0b
--- /dev/null
+++ b/12_model/3_embeddings.py
@@ -0,0 +1,111 @@
+from chalk import Validation, online
+from chalk.features import features, DataFrame, feature, before_all, Vector, Primary
+from sentence_transformers import SentenceTransformer
+
+
+model: SentenceTransformer
+
+
+@before_all
+def load_embedding_model():
+    global model
+    model = SentenceTransformer('intfloat/e5-small-v2')
+
+
+@features
+class Movie:
+    id: int
+    title: str
+    director: str
+    year: int
+    description: str
+    rating: float = feature(min=0.0, max=10.0)
+    runtime: int = feature(min=0)
+    genres: set[str]
+    embedding_text: str
+    embedding: Vector[384]
+
+@features
+class SearchQuery:
+    query: Primary[str]
+    embedding: Vector[384]
+
+
+@online
+def get_movie_embedding_text(description: Movie.description, genres: Movie.genres, director: Movie.director) -> Movie.embedding_text:
+    genres_text = ", ".join(list(genres))
+    return f"passage: {description}. Directed by {director}. 
{genres_text}" + +@online +def get_embedding(embedding_text: Movie.embedding_text) -> Movie.embedding: + return model.encode(embedding_text) + +@online +def get_query_embedding(embedding_text: SearchQuery.query) -> SearchQuery.embedding: + return model.encode(f"query: {embedding_text}") + + +@online +def get_movies() -> DataFrame[Movie]: + return DataFrame([ + Movie( + id=0, + title="High and Low", + year=1963, + director="Akira Kurosawa", + rating=8.4, + runtime=143, + genres={"Crime", "thriller"}, + description="An executive of a Yokohama shoe company becomes a victim of extortion when his chauffeur's son is kidnapped by mistake and held for ransom." + ), + Movie( + id=1, + title="Spirited Away", + year=2001, + director="Hayao Miyazaki", + rating=8.6, + runtime=135, + genres={"Animation", "Adventure", "Fantasy"}, + description="During her family's move to the suburbs, a sullen 10-year-old girl wanders into a world ruled by gods, witches and spirits, and where humans are changed into beasts.", + ), + Movie( + id=2 + title="Anatomy of a Fall", + year=2023, + director="Justine Triet" + rating=7.7, + runtime=152, + genres={"Crime", "Thriller"}, + description="A woman is suspected of murder after her husband's death; their half-blind son faces a moral dilemma as the main witness." + ), + Movie( + id=3, + title="The Cruz Brothers and Miss Malloy", + year=1980, + director="Kathleen Collins", + rating=6.2, + runtime=54, + genres={"Drama"}, + description="Three Puerto Rican brothers retreat to a town in New York following their father's death during a bank robbery. There they are hired by an elderly Irishwoman to renovate her house so she can throw one last house party." + ), + Movie( + id=4, + title="In the Mood for Love", + year=2000, + director="Wong Kar Wai", + rating=8.1, + runtime=98, + genres={"Romance","Drama"}, + description="Two neighbors form a strong bond after both suspect extramarital activities of their spouses. However, they agree to keep their bond platonic so as not to commit similar wrongs." + ), + Movie( + id=5, + title="Mad Max: Fury Road", + year=2015, + director="George Miller", + rating=8.1, + runtime=120, + genres={"Action"} + description="In a post-apocalyptic wasteland, a woman rebels against a tyrannical ruler in search for her homeland with the aid of a group of female prisoners, a psychotic worshipper and a drifter named Max.", + ) + ]) diff --git a/12_model/README.md b/12_model/README.md index cb1305b..a69ddb0 100644 --- a/12_model/README.md +++ b/12_model/README.md @@ -3,9 +3,9 @@ With Chalk, it's easy to run models in resolvers. ## 1. Models -The example code below shows how to integrate a predictive model into a resolver. -**[1_model.py](1_model.py)** +The example code below, which can be found in its entirety in the **[1_model.py](1_model.py)** file, +shows how to integrate a predictive model into a resolver. ```python class PredictionModel: @@ -55,9 +55,9 @@ churn_model = PredictionModel("churn_model.skops") @online def get_user_churn_probability( - age: User.age, - num_friends: User.num_friends, - viewed_minutes: User.viewed_minutes, + age: User.age, + num_friends: User.num_friends, + viewed_minutes: User.viewed_minutes, ) -> User.probability_of_churn: """ This resolver runs a model that has been trained on a user's age, num_friends @@ -65,3 +65,34 @@ def get_user_churn_probability( """ return churn_model.predict(np.array([[age, num_friends, viewed_minutes]])) ``` + +## 2. 
OpenAI
+
+Chalk also makes it easy to integrate third party models, like ChatGPT, into your resolvers. In the
+following example, we use the GPT-3.5 model to answer questions about our Users.
+
+Additionally, since our questions are often repeated, we cache the results of the queries,
+limiting the number of API requests we need to make.
+
+```python
+# run queries by the hash of the prompt
+@online
+def get_openai_answer(
+    prompt_hash: OpenAiQuery.prompt_hash,
+    prompt: OpenAiQuery.prompt,
+) -> OpenAiQuery.prompt_result:
+    result = openai_client.chat.completions.create(
+        messages=[
+            {
+                'role': 'user',
+                'content': prompt,
+            }
+        ],
+        model="gpt-3.5-turbo",
+    )
+
+    return OpenAiQueryResult(
+        id=prompt_hash,
+        result=result.choices[0].message.content,
+    )
+```

From 54492138afee1d65715324627a5bf78c80d77e90 Mon Sep 17 00:00:00 2001
From: Samuel Mignot
Date: Mon, 24 Jun 2024 16:06:38 -0700
Subject: [PATCH 2/6] revert to faq

---
 12_model/3_embeddings.py | 148 ++++++++++++++++++---------------------
 1 file changed, 68 insertions(+), 80 deletions(-)

diff --git a/12_model/3_embeddings.py b/12_model/3_embeddings.py
index baf6c0b..efcef5b 100644
--- a/12_model/3_embeddings.py
+++ b/12_model/3_embeddings.py
@@ -1,111 +1,99 @@
-from chalk import Validation, online
-from chalk.features import features, DataFrame, feature, before_all, Vector, Primary
-from sentence_transformers import SentenceTransformer
+from chalk import Validation, online, embedding
+from chalk.features import features, DataFrame, feature, before_all, Vector, Primary, has_many
 
-
-model: SentenceTransformer
-
-@before_all
-def load_embedding_model():
-    global model
-    model = SentenceTransformer('intfloat/e5-small-v2')
-
-
-@features
-class Movie:
+@features(max_staleness='infinity')
+class FAQDocument:
     id: int
     title: str
-    director: str
-    year: int
-    description: str
-    rating: float = feature(min=0.0, max=10.0)
-    runtime: int = feature(min=0)
-    genres: set[str]
+    body: str
+    link: str
     embedding_text: str
-    embedding: Vector[384]
+    embedding: Vector = embedding(
+        input=lambda: FAQDocument.embedding_text,
+        provider="openai",
+        model="text-embedding-ada-002"
+    )
+
 
 @features
 class SearchQuery:
     query: Primary[str]
-    embedding: Vector[384]
+    max_runtime: int | None = None
+    embedding_text: str
+    embedding: Vector = embedding(
+        input=lambda: SearchQuery.embedding_text,
+        provider="openai",
+        model="text-embedding-ada-002"
+    )
+    faqs: DataFrame[FAQDocument] = has_many(
+        lambda: SearchQuery.embedding.is_near(
+            FAQDocument.embedding
+        )
+    )
+    response: str
 
 
 @online
-def get_movie_embedding_text(description: Movie.description, genres: Movie.genres, director: Movie.director) -> Movie.embedding_text:
-    genres_text = ", ".join(list(genres))
-    return f"passage: {description}. Directed by {director}. {genres_text}"
+def generate_query_embedding_text(query: SearchQuery.query) -> SearchQuery.embedding_text:
+    return f"query: {query}"
 
 
 @online
-def get_embedding(embedding_text: Movie.embedding_text) -> Movie.embedding:
-    return model.encode(embedding_text)
+def get_movie_embedding_text(body: FAQDocument.body, title: FAQDocument.title) -> FAQDocument.embedding_text:
+    return f"passage: {title}. 
{body}" + @online -def get_query_embedding(embedding_text: SearchQuery.query) -> SearchQuery.embedding: - return model.encode(f"query: {embedding_text}") +def generate_response( + # Query for the five most relevant documents, and select their links + nearest_faqs: SearchQuery.faqs[ + FAQDocument.link, + :3 + ] +) -> SearchQuery.response: + return "\n".join(nearest_faqs[FAQDocument.link]) @online -def get_movies() -> DataFrame[Movie]: +def get_movies() -> DataFrame[ + FAQDocument.id, FAQDocument.title, FAQDocument.body, FAQDocument.link]: return DataFrame([ - Movie( - id=0, - title="High and Low", - year=1963, - director="Akira Kurosawa", - rating=8.4, - runtime=143, - genres={"Crime", "thriller"}, - description="An executive of a Yokohama shoe company becomes a victim of extortion when his chauffeur's son is kidnapped by mistake and held for ransom." - ), - Movie( + FAQDocument( id=1, - title="Spirited Away", - year=2001, - director="Hayao Miyazaki", - rating=8.6, - runtime=135, - genres={"Animation", "Adventure", "Fantasy"}, - description="During her family's move to the suburbs, a sullen 10-year-old girl wanders into a world ruled by gods, witches and spirits, and where humans are changed into beasts.", + title="What is the difference between the online store and the offline store?", + body="The online store is intended to store features for low-latency retrieval in online query. Typically, the online store is implemented using Redis, DynamoDB, or (in some cases) Postgres. The offline store is intended to store historical logs of all previously ingested or computed features. It is used to compute large historical training sets. It is typically implemented using BigQuery, Snowflake, or other data warehouses.", + link="https://docs.chalk.ai/docs/faq#what-is-the-difference-between-the-online-store-and-the-offline-store" ), - Movie( - id=2 - title="Anatomy of a Fall", - year=2023, - director="Justine Triet" - rating=7.7, - runtime=152, - genres={"Crime", "Thriller"}, - description="A woman is suspected of murder after her husband's death; their half-blind son faces a moral dilemma as the main witness." + FAQDocument( + id=2, + title="Can we do RBAC (Role Based Access Control) within Chalk?", + body="Yes! Within the dashboard you can assign roles with different permissions to different users. The default roles available are shown below.", + link="https://docs.chalk.ai/docs/faq#can-we-do-rbac-role-based-access-control-within-chalk" ), - Movie( + FAQDocument( id=3, - title="The Cruz Brothers and Miss Malloy", - year=1980, - director="Kathleen Collins", - rating=6.2, - runtime=54, - genres={"Drama"}, - description="Three Puerto Rican brothers retreat to a town in New York following their father's death during a bank robbery. There they are hired by an elderly Irishwoman to renovate her house so she can throw one last house party." + title="What are the necessary steps for us to get Chalk in our system?", + body="Please reach out via your support channel and we’d be happy to walk you through how to get Chalk setup running on your cloud infrastructure!", + link="https://docs.chalk.ai/docs/faq#what-are-the-necessary-steps-for-us-to-get-chalk-in-our-system" ), - Movie( + FAQDocument( id=4, - title="In the Mood for Love", - year=2000, - director="Wong Kar Wai", - rating=8.1, - runtime=98, - genres={"Romance","Drama"}, - description="Two neighbors form a strong bond after both suspect extramarital activities of their spouses. 
However, they agree to keep their bond platonic so as not to commit similar wrongs."
+            title="Does Chalk have a feature catalog?",
+            body="Yes! You can view all the features for all namespaces deployed in your environments, along with some metadata on recent activity and updates.",
+            link="https://docs.chalk.ai/docs/faq#does-chalk-have-a-feature-catalog"
        ),
-        Movie(
+        FAQDocument(
             id=5,
-            title="Mad Max: Fury Road",
-            year=2015,
-            director="George Miller",
-            rating=8.1,
-            runtime=120,
-            genres={"Action"},
-            description="In a post-apocalyptic wasteland, a woman rebels against a tyrannical ruler in search for her homeland with the aid of a group of female prisoners, a psychotic worshipper and a drifter named Max.",
+            title="Can I upload features into the online store with an API endpoint?",
+            body="Yes! In addition to streaming and scheduled bulk ingests of features, you can submit requests using the upload_features SDK endpoints to synchronously ingest features into the online or offline stores using API clients.",
+            link="https://docs.chalk.ai/docs/faq#can-i-upload-features-into-the-online-store-with-an-api-endpoint",
+        ),
+        FAQDocument(
+            id=6,
+            title="How are resources provisioned for my Chalk cluster, and can I modify the configuration?",
+            body="We have default resource configurations for general environments. You can modify the configuration for your project’s cloud resources by modifying the specs under Settings > Resources > Advanced Resource Configuration. You must hit Save and Apply Changes in order for your configuration changes to go through. If you are not sure how you should configure your cloud resources, please reach out to us in your support channel!",
+            link="https://docs.chalk.ai/docs/faq#how-are-resources-provisioned-for-my-chalk-cluster-and-can-i-modify-the-configuration"
+        )
+    ])

From 5f7830cef40ca66a5a15d47423e69d2bf4681c59 Mon Sep 17 00:00:00 2001
From: Samuel Mignot
Date: Mon, 24 Jun 2024 16:08:31 -0700
Subject: [PATCH 3/6] linting for model examples

---
 12_model/2_openai.py     |  34 ++++++-----
 12_model/3_embeddings.py | 119 +++++++++++++++++++++------------------
 2 files changed, 83 insertions(+), 70 deletions(-)

diff --git a/12_model/2_openai.py b/12_model/2_openai.py
index 4af8978..bc34ab4 100644
--- a/12_model/2_openai.py
+++ b/12_model/2_openai.py
@@ -1,31 +1,31 @@
-from chalk import online
-from chalk.features import features, has_many, DataFrame, before_all
-
 import hashlib
 import os
 from functools import lru_cache
 
 from openai import OpenAI
 
+from chalk import online
+from chalk.features import DataFrame, before_all, features, has_many
+
 openai_client: OpenAI
 
 # A list of prompts that we can run based on our user's titles
 TITLE_PROMPTS = dict(
     is_exec="Does the job title `{title}` mean that the person is an executive at their company? Please answer with one word, either: `Yes` or `No`.",
-    is_swe="Does the job title `{title}` mean the person is a software engineer? Please answer with one word, either: `Yes` or `No`."
+    is_swe="Does the job title `{title}` mean the person is a software engineer? Please answer with one word, either: `Yes` or `No`.",
 )
 
 
 @lru_cache
 def hash_prompt(prompt: str, length=16) -> str:
     """Hash a prompt to a fixed length string. 
This is useful for caching OpenAI API requests"""
-    return str(int(hashlib.sha256(prompt.encode('utf-8')).hexdigest(), 16) % 10 ** length)
+    return str(int(hashlib.sha256(prompt.encode("utf-8")).hexdigest(), 16) % 10**length)
 
 
 def get_openai_yes_no_answer(response: str) -> bool | None:
     """Tests whether the response is a yes or no answer. If it is ambiguous, returns None."""
-    yes = 'yes' in response
-    no = 'no' in response
+    yes = "yes" in response
+    no = "no" in response
     if (yes and no) or len(response) > 50:
         # our answer is a bit ambiguous, let's not make a decision
         return None
     if yes:
@@ -65,7 +65,9 @@ class OpenAiQueryResult:
     id: str
     result: str
-    queries: DataFrame[OpenAiQuery] = has_many(lambda: OpenAiQuery.prompt_hash == OpenAiQueryResult.id)
+    queries: DataFrame[OpenAiQuery] = has_many(
+        lambda: OpenAiQuery.prompt_hash == OpenAiQueryResult.id
+    )
 
 
 @features
@@ -74,7 +76,9 @@ class User:
     title: str
     is_exec: bool
     is_swe: bool
-    open_ai_queries: DataFrame[OpenAiQuery] = has_many(lambda: User.id == OpenAiQuery.user_id)
+    open_ai_queries: DataFrame[OpenAiQuery] = has_many(
+        lambda: User.id == OpenAiQuery.user_id
+    )
 
 
 @online
@@ -107,8 +111,8 @@ def get_openai_answer(
     result = openai_client.chat.completions.create(
         messages=[
             {
-                'role': 'user',
-                'content': prompt,
+                "role": "user",
+                "content": prompt,
             }
         ],
         model="gpt-3.5-turbo",
@@ -151,9 +155,9 @@ def dummy_users() -> DataFrame[User.id, User.title]:
         [
             User(id=1, title="CEO"),
             User(id=2, title="Queen of Scots"),
-            User(id=3, title='VP of Finance'),
-            User(id=4, title='SWE'),
-            User(id=5, title='Principal Software Engineer'),
-            User(id=6, title='Ingénieur Logiciel'),
+            User(id=3, title="VP of Finance"),
+            User(id=4, title="SWE"),
+            User(id=5, title="Principal Software Engineer"),
+            User(id=6, title="Ingénieur Logiciel"),
         ]
     )
diff --git a/12_model/3_embeddings.py b/12_model/3_embeddings.py
index efcef5b..3113a79 100644
--- a/12_model/3_embeddings.py
+++ b/12_model/3_embeddings.py
@@ -1,9 +1,16 @@
-from chalk import Validation, online, embedding
-from chalk.features import features, DataFrame, feature, before_all, Vector, Primary, has_many
+from chalk import Validation, embedding, online
+from chalk.features import (
+    DataFrame,
+    Primary,
+    Vector,
+    before_all,
+    feature,
+    features,
+    has_many,
+)
 
-
-@features(max_staleness='infinity')
+@features(max_staleness="infinity")
 class FAQDocument:
     id: int
     title: str
     body: str
     link: str
     embedding_text: str
     embedding: Vector = embedding(
         input=lambda: FAQDocument.embedding_text,
         provider="openai",
-        model="text-embedding-ada-002"
+        model="text-embedding-ada-002",
     )
 
 
 @features
 class SearchQuery:
     query: Primary[str]
     max_runtime: int | None = None
     embedding_text: str
     embedding: Vector = embedding(
         input=lambda: SearchQuery.embedding_text,
         provider="openai",
-        model="text-embedding-ada-002"
+        model="text-embedding-ada-002",
     )
     faqs: DataFrame[FAQDocument] = has_many(
-        lambda: SearchQuery.embedding.is_near(
-            FAQDocument.embedding
-        )
+        lambda: SearchQuery.embedding.is_near(FAQDocument.embedding)
     )
     response: str
 
 
 @online
-def generate_query_embedding_text(query: SearchQuery.query) -> SearchQuery.embedding_text:
+def generate_query_embedding_text(
+    query: SearchQuery.query,
+) -> SearchQuery.embedding_text:
     return f"query: {query}"
 
 
 @online
-def get_movie_embedding_text(body: FAQDocument.body, title: FAQDocument.title) -> FAQDocument.embedding_text:
+def get_movie_embedding_text(
+    body: FAQDocument.body, title: FAQDocument.title
+) -> FAQDocument.embedding_text:
     return f"passage: {title}. 
{body}" @online def generate_response( # Query for the five most relevant documents, and select their links - nearest_faqs: SearchQuery.faqs[ - FAQDocument.link, - :3 - ] + nearest_faqs: SearchQuery.faqs[FAQDocument.link, :3] ) -> SearchQuery.response: return "\n".join(nearest_faqs[FAQDocument.link]) @online -def get_movies() -> DataFrame[ - FAQDocument.id, FAQDocument.title, FAQDocument.body, FAQDocument.link]: - return DataFrame([ - FAQDocument( - id=1, - title="What is the difference between the online store and the offline store?", - body="The online store is intended to store features for low-latency retrieval in online query. Typically, the online store is implemented using Redis, DynamoDB, or (in some cases) Postgres. The offline store is intended to store historical logs of all previously ingested or computed features. It is used to compute large historical training sets. It is typically implemented using BigQuery, Snowflake, or other data warehouses.", - link="https://docs.chalk.ai/docs/faq#what-is-the-difference-between-the-online-store-and-the-offline-store" - ), - FAQDocument( - id=2, - title="Can we do RBAC (Role Based Access Control) within Chalk?", - body="Yes! Within the dashboard you can assign roles with different permissions to different users. The default roles available are shown below.", - link="https://docs.chalk.ai/docs/faq#can-we-do-rbac-role-based-access-control-within-chalk" - ), - FAQDocument( - id=3, - title="What are the necessary steps for us to get Chalk in our system?", - body="Please reach out via your support channel and we’d be happy to walk you through how to get Chalk setup running on your cloud infrastructure!", - link="https://docs.chalk.ai/docs/faq#what-are-the-necessary-steps-for-us-to-get-chalk-in-our-system" - ), - FAQDocument( - id=4, - title="Does Chalk have a feature catalog?", - body="Yes! You can view all the features for all namespaces deployed in your environments, along with some metadata on recent activity and updates.", - link="https://docs.chalk.ai/docs/faq#does-chalk-have-a-feature-catalog" - ), - FAQDocument( - id=5, - title="Can I upload features into the online store with an API endpoint?", - body="Yes! In addition to streaming and scheduled bulk ingests of features, you can submit requests using the upload_features SDK endpoints to synchronously ingest features into the online or offline stores using API clients.", - link="https://docs.chalk.ai/docs/faq#can-i-upload-features-into-the-online-store-with-an-api-endpoint", - ), - FAQDocument( - id=6, - title="How are resources provisioned for my Chalk cluster, and can I modify the configuration?", - body="We have default resource configurations for general environments. You can modify the configuration for your project’s cloud resources by modifying the specs under Settings > Resources > Advanced Resource Configuration. You must hit Save and Apply Changes in order for your configuration changes to go through. 
If you are not sure how you should configure your cloud resources, please reach out to us in your support channel!", - link="https://docs.chalk.ai/docs/faq#how-are-resources-provisioned-for-my-chalk-cluster-and-can-i-modify-the-configuration" - ) - ]) +def get_movies() -> ( + DataFrame[FAQDocument.id, FAQDocument.title, FAQDocument.body, FAQDocument.link] +): + return DataFrame( + [ + FAQDocument( + id=1, + title="What is the difference between the online store and the offline store?", + body="The online store is intended to store features for low-latency retrieval in online query. Typically, the online store is implemented using Redis, DynamoDB, or (in some cases) Postgres. The offline store is intended to store historical logs of all previously ingested or computed features. It is used to compute large historical training sets. It is typically implemented using BigQuery, Snowflake, or other data warehouses.", + link="https://docs.chalk.ai/docs/faq#what-is-the-difference-between-the-online-store-and-the-offline-store", + ), + FAQDocument( + id=2, + title="Can we do RBAC (Role Based Access Control) within Chalk?", + body="Yes! Within the dashboard you can assign roles with different permissions to different users. The default roles available are shown below.", + link="https://docs.chalk.ai/docs/faq#can-we-do-rbac-role-based-access-control-within-chalk", + ), + FAQDocument( + id=3, + title="What are the necessary steps for us to get Chalk in our system?", + body="Please reach out via your support channel and we’d be happy to walk you through how to get Chalk setup running on your cloud infrastructure!", + link="https://docs.chalk.ai/docs/faq#what-are-the-necessary-steps-for-us-to-get-chalk-in-our-system", + ), + FAQDocument( + id=4, + title="Does Chalk have a feature catalog?", + body="Yes! You can view all the features for all namespaces deployed in your environments, along with some metadata on recent activity and updates.", + link="https://docs.chalk.ai/docs/faq#does-chalk-have-a-feature-catalog", + ), + FAQDocument( + id=5, + title="Can I upload features into the online store with an API endpoint?", + body="Yes! In addition to streaming and scheduled bulk ingests of features, you can submit requests using the upload_features SDK endpoints to synchronously ingest features into the online or offline stores using API clients.", + link="https://docs.chalk.ai/docs/faq#can-i-upload-features-into-the-online-store-with-an-api-endpoint", + ), + FAQDocument( + id=6, + title="How are resources provisioned for my Chalk cluster, and can I modify the configuration?", + body="We have default resource configurations for general environments. You can modify the configuration for your project’s cloud resources by modifying the specs under Settings > Resources > Advanced Resource Configuration. You must hit Save and Apply Changes in order for your configuration changes to go through. 
If you are not sure how you should configure your cloud resources, please reach out to us in your support channel!",
+                link="https://docs.chalk.ai/docs/faq#how-are-resources-provisioned-for-my-chalk-cluster-and-can-i-modify-the-configuration",
+            ),
+        ]
+    )

From 3532a53cb88747256a1764b5ad55cf12d2ab8f06 Mon Sep 17 00:00:00 2001
From: Samuel Mignot
Date: Mon, 24 Jun 2024 16:12:50 -0700
Subject: [PATCH 4/6] update readme

---
 12_model/README.md | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/12_model/README.md b/12_model/README.md
index a69ddb0..aa9c83f 100644
--- a/12_model/README.md
+++ b/12_model/README.md
@@ -69,11 +69,14 @@ def get_user_churn_probability(
 ## 2. OpenAI
 
 Chalk also makes it easy to integrate third party models, like ChatGPT, into your resolvers. In the
-following example, we use the GPT-3.5 model to answer questions about our Users.
+following example, we use ChatGPT to answer questions about our Users.
 
 Additionally, since our questions are often repeated, we cache the results of the queries,
 limiting the number of API requests we need to make.
 
+The example code below, which can be found in its entirety in the **[2_openai.py](2_openai.py)** file,
+shows how to run an API request in a Python resolver:
+
 ```python
 # run queries by the hash of the prompt
 @online
 def get_openai_answer(
     prompt_hash: OpenAiQuery.prompt_hash,
     prompt: OpenAiQuery.prompt,
 ) -> OpenAiQuery.prompt_result:
     result = openai_client.chat.completions.create(
         messages=[
             {
                 'role': 'user',
                 'content': prompt,
             }
         ],
         model="gpt-3.5-turbo",
     )
 
     return OpenAiQueryResult(
         id=prompt_hash,
         result=result.choices[0].message.content,
     )
 ```
+
+## 3. Embeddings
+
+Chalk supports embeddings for your text features.
+
+The example code below, which can be found in its entirety in the **[3_embeddings.py](3_embeddings.py)** file,
+shows how to create embedding features and join feature sets based on their similarity:
+
+```python
+@features(max_staleness="infinity")
+class FAQDocument:
+    id: int
+    title: str
+    body: str
+    link: str
+    embedding_text: str
+    embedding: Vector = embedding(
+        input=lambda: FAQDocument.embedding_text,
+        provider="openai",
+        model="text-embedding-ada-002",
+    )
+
+
+@features
+class SearchQuery:
+    query: Primary[str]
+    max_runtime: int | None = None
+    embedding_text: str
+    embedding: Vector = embedding(
+        input=lambda: SearchQuery.embedding_text,
+        provider="openai",
+        model="text-embedding-ada-002",
+    )
+    faqs: DataFrame[FAQDocument] = has_many(
+        lambda: SearchQuery.embedding.is_near(FAQDocument.embedding)
+    )
+    response: str
+```

From 7d5935970eac4dec6d477fda4980da5d1a28939d Mon Sep 17 00:00:00 2001
From: Samuel Mignot <43255992+sjmignot@users.noreply.github.com>
Date: Tue, 25 Jun 2024 09:05:41 -0700
Subject: [PATCH 5/6] Update 12_model/3_embeddings.py

---
 12_model/3_embeddings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/12_model/3_embeddings.py b/12_model/3_embeddings.py
index 3113a79..0ac0e0d 100644
--- a/12_model/3_embeddings.py
+++ b/12_model/3_embeddings.py
@@ -56,7 +56,7 @@ def get_movie_embedding_text(
 
 @online
 def generate_response(
-    # Query for the five most relevant documents, and select their links
+    # Query for the three most relevant documents, and select their links
     nearest_faqs: SearchQuery.faqs[FAQDocument.link, :3]
 ) -> SearchQuery.response:
     return "\n".join(nearest_faqs[FAQDocument.link])

From 3fb5a0f36f98843edfb833e8878351d10b2636ea Mon Sep 17 00:00:00 2001
From: Samuel Mignot
Date: Tue, 25 Jun 2024 09:07:46 -0700
Subject: [PATCH 6/6] fix faq

---
 12_model/3_embeddings.py | 22 +++------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/12_model/3_embeddings.py b/12_model/3_embeddings.py
index 0ac0e0d..88e3ad7 100644
--- 
a/12_model/3_embeddings.py
+++ b/12_model/3_embeddings.py
@@ -16,9 +16,8 @@ class FAQDocument:
     title: str
     body: str
     link: str
-    embedding_text: str
     embedding: Vector = embedding(
-        input=lambda: FAQDocument.embedding_text,
+        input=lambda: FAQDocument.body,
         provider="openai",
         model="text-embedding-ada-002",
     )
@@ -28,9 +27,8 @@ class SearchQuery:
     query: Primary[str]
     max_runtime: int | None = None
-    embedding_text: str
     embedding: Vector = embedding(
-        input=lambda: SearchQuery.embedding_text,
+        input=lambda: SearchQuery.query,
         provider="openai",
         model="text-embedding-ada-002",
     )
@@ -40,20 +38,6 @@ class SearchQuery:
     response: str
 
 
-@online
-def generate_query_embedding_text(
-    query: SearchQuery.query,
-) -> SearchQuery.embedding_text:
-    return f"query: {query}"
-
-
-@online
-def get_movie_embedding_text(
-    body: FAQDocument.body, title: FAQDocument.title
-) -> FAQDocument.embedding_text:
-    return f"passage: {title}. {body}"
-
-
 @online
 def generate_response(
     # Query for the three most relevant documents, and select their links
@@ -63,7 +47,7 @@ def generate_response(
 
 
 @online
-def get_movies() -> (
+def get_faqs() -> (
     DataFrame[FAQDocument.id, FAQDocument.title, FAQDocument.body, FAQDocument.link]
 ):
     return DataFrame(
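
For anyone trying these examples after applying the series, here is a minimal sketch of how the final FAQ search pipeline might be queried once deployed. It is an illustration under stated assumptions, not part of the patches: it presumes a deployed Chalk environment with client credentials configured, uses the standard `ChalkClient` from the `chalkpy` package, and spells feature names with Chalk's usual snake_case namespace convention (`SearchQuery` becomes `search_query`).

```python
from chalk.client import ChalkClient

# Illustrative only: assumes a deployed Chalk environment and credentials
# available to the client (e.g. via environment variables).
client = ChalkClient()

result = client.query(
    # SearchQuery.query is the primary key, so the raw query string is the input
    input={"search_query.query": "Can I upload features into the online store?"},
    output=["search_query.response"],
)

# The response feature holds the links of the three nearest FAQ documents,
# one per line, as produced by the generate_response resolver above.
print(result.get_feature_value("search_query.response"))
```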