Skip to content

[BUG] Function Key Handling For Stagehand Agent #686

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 42 additions & 8 deletions lib/agent/AnthropicCUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import {
} from "@/types/agent";
import { AgentClient } from "./AgentClient";
import { AgentScreenshotProviderError } from "@/types/stagehandErrors";
import * as fs from "fs";
import * as path from "path";

export type ResponseInputItem = AnthropicMessage | AnthropicToolResult;

Expand Down Expand Up @@ -838,25 +840,57 @@ export class AnthropicCUAClient extends AgentClient {
base64Image?: string;
currentUrl?: string;
}): Promise<string> {
let imageData = "";

// Use provided options if available
if (options?.base64Image) {
return `data:image/png;base64,${options.base64Image}`;
imageData = `data:image/png;base64,${options.base64Image}`;
}

// Use the screenshot provider if available
if (this.screenshotProvider) {
else if (this.screenshotProvider) {
try {
const base64Image = await this.screenshotProvider();
return `data:image/png;base64,${base64Image}`;
imageData = `data:image/png;base64,${base64Image}`;
} catch (error) {
console.error("Error capturing screenshot:", error);
throw error;
}
} else {
throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
);
}

// Save the screenshot to file if we have valid image data
if (imageData) {
try {
// Create screenshots directory if it doesn't exist
const screenshotsDir = path.resolve("screenshots");
if (!fs.existsSync(screenshotsDir)) {
fs.mkdirSync(screenshotsDir, { recursive: true });
}

// Generate filename with timestamp
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const filename = path.join(
screenshotsDir,
`screenshot-${timestamp}.png`,
);

// Extract base64 data without the data URL prefix
const base64Data = imageData.replace(/^data:image\/png;base64,/, "");

// Write file
fs.writeFileSync(filename, base64Data, "base64");
console.log(`Screenshot saved to ${filename}`);
} catch (saveError) {
// Log error but don't affect the function's behavior
console.error("Error saving screenshot to file:", saveError);
// Intentionally not re-throwing the error to keep function working
}
}

throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
);
return imageData;
}
}
50 changes: 42 additions & 8 deletions lib/agent/OpenAICUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import {
} from "@/types/agent";
import { AgentClient } from "./AgentClient";
import { AgentScreenshotProviderError } from "@/types/stagehandErrors";
import * as fs from "fs";
import * as path from "path";

/**
* Client for OpenAI's Computer Use Assistant API
Expand Down Expand Up @@ -558,25 +560,57 @@ export class OpenAICUAClient extends AgentClient {
base64Image?: string;
currentUrl?: string;
}): Promise<string> {
let imageData = "";

// Use provided options if available
if (options?.base64Image) {
return `data:image/png;base64,${options.base64Image}`;
imageData = `data:image/png;base64,${options.base64Image}`;
}

// Use the screenshot provider if available
if (this.screenshotProvider) {
else if (this.screenshotProvider) {
try {
const base64Image = await this.screenshotProvider();
return `data:image/png;base64,${base64Image}`;
imageData = `data:image/png;base64,${base64Image}`;
} catch (error) {
console.error("Error capturing screenshot:", error);
throw error;
}
} else {
throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
);
}

throw new AgentScreenshotProviderError(
"`screenshotProvider` has not been set. " +
"Please call `setScreenshotProvider()` with a valid function that returns a base64-encoded image",
);
// Save the screenshot to file if we have valid image data
if (imageData) {
try {
// Create screenshots directory if it doesn't exist
const screenshotsDir = path.resolve("screenshots");
if (!fs.existsSync(screenshotsDir)) {
fs.mkdirSync(screenshotsDir, { recursive: true });
}

// Generate filename with timestamp
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
const filename = path.join(
screenshotsDir,
`screenshot-${timestamp}.png`,
);

// Extract base64 data without the data URL prefix
const base64Data = imageData.replace(/^data:image\/png;base64,/, "");

// Write file
fs.writeFileSync(filename, base64Data, "base64");
console.log(`Screenshot saved to ${filename}`);
} catch (saveError) {
// Log error but don't affect the function's behavior
console.error("Error saving screenshot to file:", saveError);
// Intentionally not re-throwing the error to keep function working
}
}

return imageData;
}
}
89 changes: 61 additions & 28 deletions lib/handlers/agentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -277,32 +277,65 @@ export class StagehandAgentHandler {
case "keypress": {
const { keys } = action;
if (Array.isArray(keys)) {
for (const key of keys) {
// Handle special keys
if (key.includes("ENTER")) {
await this.stagehandPage.page.keyboard.press("Enter");
} else if (key.includes("SPACE")) {
await this.stagehandPage.page.keyboard.press(" ");
} else if (key.includes("TAB")) {
await this.stagehandPage.page.keyboard.press("Tab");
} else if (key.includes("ESCAPE") || key.includes("ESC")) {
await this.stagehandPage.page.keyboard.press("Escape");
} else if (key.includes("BACKSPACE")) {
await this.stagehandPage.page.keyboard.press("Backspace");
} else if (key.includes("DELETE")) {
await this.stagehandPage.page.keyboard.press("Delete");
} else if (key.includes("ARROW_UP")) {
await this.stagehandPage.page.keyboard.press("ArrowUp");
} else if (key.includes("ARROW_DOWN")) {
await this.stagehandPage.page.keyboard.press("ArrowDown");
} else if (key.includes("ARROW_LEFT")) {
await this.stagehandPage.page.keyboard.press("ArrowLeft");
} else if (key.includes("ARROW_RIGHT")) {
await this.stagehandPage.page.keyboard.press("ArrowRight");
} else {
// For other keys, use the existing conversion
const playwrightKey = this.convertKeyName(key);
await this.stagehandPage.page.keyboard.press(playwrightKey);
// Check if CTRL or CMD is present in the keys
const hasModifier = keys.some(
(key) =>
key.includes("CTRL") ||
key.includes("CMD") ||
key.includes("COMMAND"),
);

if (hasModifier) {
// Handle key combination - press all keys simultaneously
// Convert all keys first
const playwrightKeys = keys.map((key) => {
if (key.includes("CTRL")) return "Meta";
if (key.includes("CMD") || key.includes("COMMAND"))
return "Meta";
return this.convertKeyName(key);
});

// Press all keys down in sequence
for (const key of playwrightKeys) {
await this.stagehandPage.page.keyboard.down(key);
}

// Small delay to ensure the combination is registered
await new Promise((resolve) => setTimeout(resolve, 100));

// Release all keys in reverse order
for (const key of playwrightKeys.reverse()) {
await this.stagehandPage.page.keyboard.up(key);
}
} else {
// Handle individual keys as before
for (const key of keys) {
// Handle special keys
if (key.includes("ENTER")) {
await this.stagehandPage.page.keyboard.press("Enter");
} else if (key.includes("SPACE")) {
await this.stagehandPage.page.keyboard.press(" ");
} else if (key.includes("TAB")) {
await this.stagehandPage.page.keyboard.press("Tab");
} else if (key.includes("ESCAPE") || key.includes("ESC")) {
await this.stagehandPage.page.keyboard.press("Escape");
} else if (key.includes("BACKSPACE")) {
await this.stagehandPage.page.keyboard.press("Backspace");
} else if (key.includes("DELETE")) {
await this.stagehandPage.page.keyboard.press("Delete");
} else if (key.includes("ARROW_UP")) {
await this.stagehandPage.page.keyboard.press("ArrowUp");
} else if (key.includes("ARROW_DOWN")) {
await this.stagehandPage.page.keyboard.press("ArrowDown");
} else if (key.includes("ARROW_LEFT")) {
await this.stagehandPage.page.keyboard.press("ArrowLeft");
} else if (key.includes("ARROW_RIGHT")) {
await this.stagehandPage.page.keyboard.press("ArrowRight");
} else {
// For other keys, use the existing conversion
const playwrightKey = this.convertKeyName(key);
await this.stagehandPage.page.keyboard.press(playwrightKey);
}
}
}
}
Expand Down Expand Up @@ -649,12 +682,12 @@ export class StagehandAgentHandler {
LEFT: "ArrowLeft",
RIGHT: "ArrowRight",
SHIFT: "Shift",
CONTROL: "Control",
CONTROL: process.platform === "darwin" ? "Meta" : "Control", // Use Meta on macOS
ALT: "Alt",
META: "Meta",
COMMAND: "Meta",
CMD: "Meta",
CTRL: "Control",
CTRL: process.platform === "darwin" ? "Meta" : "Control", // Use Meta on macOS
DELETE: "Delete",
HOME: "Home",
END: "End",
Expand Down
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.