From 489d5856320e2d68872c31cec5146f72160f7dfd Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Tue, 15 Apr 2025 12:48:57 -0700
Subject: [PATCH 1/8] default to stagehand LLM clients for evals

---
 evals/args.ts       |   4 ++
 evals/index.eval.ts |  70 +++++------------------
 evals/taskConfig.ts |   2 +-
 evals/utils.ts      | 136 ++++++++++++++++++++++++++++++++++++++++++++
 types/evals.ts      |  12 ++++
 5 files changed, 168 insertions(+), 56 deletions(-)

diff --git a/evals/args.ts b/evals/args.ts
index 4c2f748fc..297d7f845 100644
--- a/evals/args.ts
+++ b/evals/args.ts
@@ -9,6 +9,7 @@ const parsedArgs: {
   concurrency?: number;
   extractMethod?: string;
   provider?: string;
+  useExternalClients?: boolean;
   leftover: string[];
 } = {
   leftover: [],
@@ -31,6 +32,9 @@ for (const arg of rawArgs) {
     parsedArgs.extractMethod = arg.split("=")[1];
   } else if (arg.startsWith("provider=")) {
     parsedArgs.provider = arg.split("=")[1]?.toLowerCase();
+  } else if (arg.startsWith("--useExternalClients=")) {
+    const val = arg.split("=")[1]?.toLowerCase();
+    parsedArgs.useExternalClients = val === "true";
   } else {
     parsedArgs.leftover.push(arg);
   }
diff --git a/evals/index.eval.ts b/evals/index.eval.ts
index 617214e1e..9061f67af 100644
--- a/evals/index.eval.ts
+++ b/evals/index.eval.ts
@@ -20,25 +20,19 @@ import {
   filterByCategory,
   filterByEvalName,
   useTextExtract,
+  parsedArgs,
 } from "./args";
-import { generateExperimentName } from "./utils";
+import { createLLMClient, generateExperimentName } from "./utils";
 import { exactMatch, errorMatch } from "./scoring";
 import { tasksByName, MODELS, tasksConfig } from "./taskConfig";
-import { Eval, wrapAISDKModel, wrapOpenAI } from "braintrust";
+import { Eval } from "braintrust";
 import { EvalFunction, SummaryResult, Testcase } from "@/types/evals";
 import { EvalLogger } from "./logger";
-import { AvailableModel, LLMClient } from "@/dist";
+import { AvailableModel } from "@/dist";
 import { env } from "./env";
 import dotenv from "dotenv";
 import { StagehandEvalError } from "@/types/stagehandErrors";
-import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
-import OpenAI from "openai";
 import { initStagehand } from "./initStagehand";
-import { AISdkClient } from "@/examples/external_clients/aisdk";
-import { google } from "@ai-sdk/google";
-import { anthropic } from "@ai-sdk/anthropic";
-import { groq } from "@ai-sdk/groq";
-import { cerebras } from "@ai-sdk/cerebras";
 dotenv.config();
 
 /**
@@ -273,51 +267,17 @@ const generateFilteredTestcases = (): Testcase[] => {
       }
 
       // Execute the task
-      let llmClient: LLMClient;
-      if (input.modelName.startsWith("gpt")) {
-        llmClient = new CustomOpenAIClient({
-          modelName: input.modelName as AvailableModel,
-          client: wrapOpenAI(
-            new OpenAI({
-              apiKey: process.env.OPENAI_API_KEY,
-            }),
-          ),
-        });
-      } else if (input.modelName.startsWith("gemini")) {
-        llmClient = new AISdkClient({
-          model: wrapAISDKModel(google(input.modelName)),
-        });
-      } else if (input.modelName.startsWith("claude")) {
-        llmClient = new AISdkClient({
-          model: wrapAISDKModel(anthropic(input.modelName)),
-        });
-      } else if (input.modelName.includes("groq")) {
-        llmClient = new AISdkClient({
-          model: wrapAISDKModel(
-            groq(
-              input.modelName.substring(input.modelName.indexOf("/") + 1),
-            ),
-          ),
-        });
-      } else if (input.modelName.includes("cerebras")) {
-        llmClient = new AISdkClient({
-          model: wrapAISDKModel(
-            cerebras(
-              input.modelName.substring(input.modelName.indexOf("/") + 1),
-            ),
-          ),
-        });
-      } else if (input.modelName.includes("/")) {
-        llmClient = new CustomOpenAIClient({
-          modelName: input.modelName as AvailableModel,
-          client: wrapOpenAI(
-            new OpenAI({
-              apiKey: process.env.TOGETHER_AI_API_KEY,
-              baseURL: "https://api.together.xyz/v1",
-            }),
-          ),
-        });
-      }
+      const llmClient = createLLMClient({
+        modelName: input.modelName,
+        useExternalClients: parsedArgs.useExternalClients === true,
+        logger: (msg) => logger.log(msg),
+        openAiKey: process.env.OPENAI_API_KEY,
+        googleKey: process.env.GOOGLE_API_KEY,
+        anthropicKey: process.env.ANTHROPIC_API_KEY,
+        groqKey: process.env.GROQ_API_KEY,
+        cerebrasKey: process.env.CEREBRAS_API_KEY,
+        togetherKey: process.env.TOGETHER_AI_API_KEY,
+      });
       const taskInput = await initStagehand({
         logger,
         llmClient,
diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts
index e5ea48b02..814e8114b 100644
--- a/evals/taskConfig.ts
+++ b/evals/taskConfig.ts
@@ -95,7 +95,7 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) {
  */
 const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
   ? process.env.EVAL_MODELS.split(",")
-  : ["claude-3-5-sonnet-latest", "gpt-4o-mini", "gpt-4o"];
+  : ["gpt-4o-mini"];
 
 /**
  * getModelList:
diff --git a/evals/utils.ts b/evals/utils.ts
index a573d1729..b83943231 100644
--- a/evals/utils.ts
+++ b/evals/utils.ts
@@ -11,6 +11,22 @@ import { LogLine } from "@/dist";
 import stringComparison from "string-comparison";
 const { jaroWinkler } = stringComparison;
 
+import OpenAI from "openai";
+import { wrapAISDKModel, wrapOpenAI } from "braintrust";
+import { anthropic } from "@ai-sdk/anthropic";
+import { google } from "@ai-sdk/google";
+import { groq } from "@ai-sdk/groq";
+import { cerebras } from "@ai-sdk/cerebras";
+import { LLMClient } from "@/dist";
+import { AISdkClient } from "@/examples/external_clients/aisdk";
+import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
+import { OpenAIClient } from "@/lib/llm/OpenAIClient";
+import { AnthropicClient } from "@/lib/llm/AnthropicClient";
+import { GoogleClient } from "@/lib/llm/GoogleClient";
+import { GroqClient } from "@/lib/llm/GroqClient";
+import { CerebrasClient } from "@/lib/llm/CerebrasClient";
+import { CreateLLMClientOptions } from "@/types/evals";
+import { StagehandEvalError } from "@/types/stagehandErrors";
 
 /**
  * normalizeString:
@@ -119,3 +135,123 @@ export function logLineToString(logLine: LogLine): string {
     return "error logging line";
   }
 }
+
+export function createLLMClient({
+  modelName,
+  useExternalClients,
+  logger,
+  openAiKey,
+  googleKey,
+  anthropicKey,
+  groqKey,
+  cerebrasKey,
+  togetherKey,
+}: CreateLLMClientOptions): LLMClient {
+  const isOpenAIModel = modelName.startsWith("gpt") || modelName.includes("/");
+  const isGoogleModel = modelName.startsWith("gemini");
+  const isAnthropicModel = modelName.startsWith("claude");
+  const isGroqModel = modelName.includes("groq");
+  const isCerebrasModel = modelName.includes("cerebras");
+
+  if (useExternalClients) {
+    if (isOpenAIModel) {
+      if (modelName.includes("/")) {
+        return new CustomOpenAIClient({
+          modelName,
+          client: wrapOpenAI(
+            new OpenAI({
+              apiKey: togetherKey,
+              baseURL: "https://api.together.xyz/v1",
+            }),
+          ),
+        });
+      }
+      return new CustomOpenAIClient({
+        modelName,
+        client: wrapOpenAI(
+          new OpenAI({
+            apiKey: openAiKey,
+          }),
+        ),
+      });
+    } else if (isGoogleModel) {
+      return new AISdkClient({
+        model: wrapAISDKModel(google(modelName)),
+      });
+    } else if (isAnthropicModel) {
+      return new AISdkClient({
+        model: wrapAISDKModel(anthropic(modelName)),
+      });
+    } else if (isGroqModel) {
+      const groqModel = modelName.substring(modelName.indexOf("/") + 1);
+      return new AISdkClient({
+        model: wrapAISDKModel(groq(groqModel)),
+      });
+    } else if (isCerebrasModel) {
+      const cerebrasModel = modelName.substring(modelName.indexOf("/") + 1);
+      return new AISdkClient({
+        model: wrapAISDKModel(cerebras(cerebrasModel)),
+      });
+    }
+    throw new StagehandEvalError(`Unknown modelName: ${modelName}`);
+  } else {
+    if (isOpenAIModel) {
+      if (modelName.includes("/")) {
+        return new CustomOpenAIClient({
+          modelName,
+          client: wrapOpenAI(
+            new OpenAI({
+              apiKey: togetherKey,
+              baseURL: "https://api.together.xyz/v1",
+            }),
+          ),
+        });
+      }
+      return new OpenAIClient({
+        logger,
+        modelName,
+        enableCaching: false,
+        clientOptions: {
+          apiKey: openAiKey,
+        },
+      });
+    } else if (isGoogleModel) {
+      return new GoogleClient({
+        logger,
+        modelName,
+        enableCaching: false,
+        clientOptions: {
+          apiKey: googleKey,
+        },
+      });
+    } else if (isAnthropicModel) {
+      return new AnthropicClient({
+        logger,
+        modelName,
+        enableCaching: false,
+        clientOptions: {
+          apiKey: anthropicKey,
+        },
+      });
+    } else if (isGroqModel) {
+      return new GroqClient({
+        logger,
+        modelName,
+        enableCaching: false,
+        clientOptions: {
+          apiKey: groqKey,
+        },
+      });
+    } else if (isCerebrasModel) {
+      return new CerebrasClient({
+        logger,
+        modelName,
+        enableCaching: false,
+        clientOptions: {
+          apiKey: cerebrasKey,
+        },
+      });
+    }
+    throw new StagehandEvalError(`Unknown modelName: ${modelName}`);
+  }
+}
diff --git a/types/evals.ts b/types/evals.ts
index dc672550b..ffd5118b8 100644
--- a/types/evals.ts
+++ b/types/evals.ts
@@ -77,3 +77,15 @@ export interface EvalResult {
 export type LogLineEval = LogLine & {
   parsedAuxiliary?: string | object;
 };
+
+export interface CreateLLMClientOptions {
+  modelName: AvailableModel;
+  useExternalClients: boolean;
+  logger?: (msg: LogLine) => void;
+  openAiKey?: string;
+  googleKey?: string;
+  anthropicKey?: string;
+  groqKey?: string;
+  cerebrasKey?: string;
+  togetherKey?: string;
+}

From b347602002f5aea822254d4be9cf6662d086a58d Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Tue, 15 Apr 2025 12:54:39 -0700
Subject: [PATCH 2/8] revert taskConfig.ts change

---
 evals/taskConfig.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts
index 814e8114b..e5ea48b02 100644
--- a/evals/taskConfig.ts
+++ b/evals/taskConfig.ts
@@ -95,7 +95,7 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) {
  */
 const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
   ? process.env.EVAL_MODELS.split(",")
-  : ["gpt-4o-mini"];
+  : ["claude-3-5-sonnet-latest", "gpt-4o-mini", "gpt-4o"];
 
 /**
  * getModelList:

From 18ae8fc903da8eb966015a166273fca3d0d47283 Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Wed, 16 Apr 2025 12:13:26 -0700
Subject: [PATCH 3/8] use aiSDK for groq and cerebras

---
 evals/index.eval.ts |  2 --
 evals/utils.ts      | 28 +++++-----------------------
 2 files changed, 5 insertions(+), 25 deletions(-)

diff --git a/evals/index.eval.ts b/evals/index.eval.ts
index 9061f67af..fa56db1b9 100644
--- a/evals/index.eval.ts
+++ b/evals/index.eval.ts
@@ -274,8 +274,6 @@ const generateFilteredTestcases = (): Testcase[] => {
         openAiKey: process.env.OPENAI_API_KEY,
         googleKey: process.env.GOOGLE_API_KEY,
         anthropicKey: process.env.ANTHROPIC_API_KEY,
-        groqKey: process.env.GROQ_API_KEY,
-        cerebrasKey: process.env.CEREBRAS_API_KEY,
         togetherKey: process.env.TOGETHER_AI_API_KEY,
       });
       const taskInput = await initStagehand({
diff --git a/evals/utils.ts b/evals/utils.ts
index b83943231..28ee659a6 100644
--- a/evals/utils.ts
+++ b/evals/utils.ts
@@ -23,8 +23,6 @@ import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
 import { OpenAIClient } from "@/lib/llm/OpenAIClient";
 import { AnthropicClient } from "@/lib/llm/AnthropicClient";
 import { GoogleClient } from "@/lib/llm/GoogleClient";
-import { GroqClient } from "@/lib/llm/GroqClient";
-import { CerebrasClient } from "@/lib/llm/CerebrasClient";
 import { CreateLLMClientOptions } from "@/types/evals";
 import { StagehandEvalError } from "@/types/stagehandErrors";
 
@@ -143,11 +141,9 @@ export function createLLMClient({
   openAiKey,
   googleKey,
   anthropicKey,
-  groqKey,
-  cerebrasKey,
   togetherKey,
 }: CreateLLMClientOptions): LLMClient {
-  const isOpenAIModel = modelName.startsWith("gpt") || modelName.includes("/");
+  const isOpenAIModel = modelName.startsWith("gpt");
   const isGoogleModel = modelName.startsWith("gemini");
   const isAnthropicModel = modelName.startsWith("claude");
   const isGroqModel = modelName.includes("groq");
   const isCerebrasModel = modelName.includes("cerebras");
@@ -233,24 +229,10 @@ export function createLLMClient({
         apiKey: anthropicKey,
       },
     });
-  } else if (isGroqModel) {
-    return new GroqClient({
-      logger,
-      modelName,
-      enableCaching: false,
-      clientOptions: {
-        apiKey: groqKey,
-      },
-    });
-  } else if (isCerebrasModel) {
-    return new CerebrasClient({
-      logger,
-      modelName,
-      enableCaching: false,
-      clientOptions: {
-        apiKey: cerebrasKey,
-      },
-    });
+  } else if (isGroqModel || isCerebrasModel) {
+    throw new StagehandEvalError(
+      `${modelName} can only be used when useExternalClients=true`,
+    );
   }
   throw new StagehandEvalError(`Unknown modelName: ${modelName}`);
 }

From 7f3d935b15f5dd2bccbb4270eb04fa615de42fde Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Wed, 16 Apr 2025 12:23:33 -0700
Subject: [PATCH 4/8] update CreateLLMClientOptions interface

---
 types/evals.ts | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/types/evals.ts b/types/evals.ts
index ffd5118b8..7566fe7b8 100644
--- a/types/evals.ts
+++ b/types/evals.ts
@@ -80,12 +80,10 @@ export type LogLineEval = LogLine & {
 
 export interface CreateLLMClientOptions {
   modelName: AvailableModel;
-  useExternalClients: boolean;
+  useExternalClients?: boolean;
   logger?: (msg: LogLine) => void;
   openAiKey?: string;
   googleKey?: string;
   anthropicKey?: string;
-  groqKey?: string;
-  cerebrasKey?: string;
   togetherKey?: string;
 }

From 7ac680c6aa5b10668e95df6d1f56f175e6481373 Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Wed, 16 Apr 2025 12:34:28 -0700
Subject: [PATCH 5/8] single dash for useExternalClients

---
 evals/args.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/args.ts b/evals/args.ts
index 297d7f845..e63ba0136 100644
--- a/evals/args.ts
+++ b/evals/args.ts
@@ -32,7 +32,7 @@ for (const arg of rawArgs) {
     parsedArgs.extractMethod = arg.split("=")[1];
   } else if (arg.startsWith("provider=")) {
     parsedArgs.provider = arg.split("=")[1]?.toLowerCase();
-  } else if (arg.startsWith("--useExternalClients=")) {
+  } else if (arg.startsWith("-useExternalClients=")) {
     const val = arg.split("=")[1]?.toLowerCase();
     parsedArgs.useExternalClients = val === "true";
   } else {

From e1c43a2f26021317fdc693860103730bcaa8c3c3 Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Tue, 22 Apr 2025 11:03:27 -0700
Subject: [PATCH 6/8] support new openai models

---
 evals/utils.ts | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/evals/utils.ts b/evals/utils.ts
index 28ee659a6..3d3436595 100644
--- a/evals/utils.ts
+++ b/evals/utils.ts
@@ -143,7 +143,8 @@ export function createLLMClient({
   anthropicKey,
   togetherKey,
 }: CreateLLMClientOptions): LLMClient {
-  const isOpenAIModel = modelName.startsWith("gpt");
+  const isOpenAIModel =
+    modelName.startsWith("gpt") || modelName.startsWith("o");
   const isGoogleModel = modelName.startsWith("gemini");
   const isAnthropicModel = modelName.startsWith("claude");
   const isGroqModel = modelName.includes("groq");

From 396d772bb09005c22c1697d02bb4a3df3ef12deb Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Tue, 22 Apr 2025 12:21:27 -0700
Subject: [PATCH 7/8] dont use dash

---
 evals/args.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/args.ts b/evals/args.ts
index e63ba0136..e4ad2cf4b 100644
--- a/evals/args.ts
+++ b/evals/args.ts
@@ -32,7 +32,7 @@ for (const arg of rawArgs) {
     parsedArgs.extractMethod = arg.split("=")[1];
   } else if (arg.startsWith("provider=")) {
     parsedArgs.provider = arg.split("=")[1]?.toLowerCase();
-  } else if (arg.startsWith("-useExternalClients=")) {
+  } else if (arg.startsWith("useExternalClients=")) {
     const val = arg.split("=")[1]?.toLowerCase();
     parsedArgs.useExternalClients = val === "true";
   } else {

From 821fc7ce4bdb67261a3d7a22b25425c08d9e1bda Mon Sep 17 00:00:00 2001
From: Sean McGuire
Date: Tue, 22 Apr 2025 12:29:05 -0700
Subject: [PATCH 8/8] aisdk for openai external client

---
 evals/utils.ts | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/evals/utils.ts b/evals/utils.ts
index 3d3436595..e6751c4db 100644
--- a/evals/utils.ts
+++ b/evals/utils.ts
@@ -25,6 +25,7 @@ import { AnthropicClient } from "@/lib/llm/AnthropicClient";
 import { GoogleClient } from "@/lib/llm/GoogleClient";
 import { CreateLLMClientOptions } from "@/types/evals";
 import { StagehandEvalError } from "@/types/stagehandErrors";
+import { openai } from "@ai-sdk/openai";
 
 /**
  * normalizeString:
@@ -163,13 +164,8 @@ export function createLLMClient({
           ),
         });
       }
-      return new CustomOpenAIClient({
-        modelName,
-        client: wrapOpenAI(
-          new OpenAI({
-            apiKey: openAiKey,
-          }),
-        ),
+      return new AISdkClient({
+        model: wrapAISDKModel(openai(modelName)),
       });
     } else if (isGoogleModel) {
       return new AISdkClient({