Build Browser-Use Agents with Gbox and LangChain.
Install Dependencies
npm install gbox-sdk @langchain/core @langchain/langgraph @langchain/openai @langchain/community openai playwright dotenv typescript tsx @types/node
Add Environment Variables
Create a `.env` file in your project root and add your Gbox API key and OpenAI API key:
# Gbox API Configuration
GBOX_API_KEY=your_gbox_api_key
# OpenAI API Configuration
OPENAI_API_KEY=your_openai_api_key
OPENAI_ORGANIZATION=your_openai_organization_id
Create Agent Flow
Create `index.ts` with the following content:
import OpenAI from "openai";
import { StateGraph } from "@langchain/langgraph";
import { LinuxBoxOperator, GboxSDK } from "gbox-sdk";
import { Page, Browser, chromium } from "playwright";
import * as dotenv from "dotenv";
dotenv.config();
/** Live handles shared between nodes; each one is optional until a node stores it. */
type AgentEnv = {
  /** OpenAI client used for planning, acting and summarizing. */
  client?: OpenAI;
  /** Playwright page the agent drives. */
  page?: Page;
  /** Playwright browser that owns `page`. */
  browser?: Browser;
  /** Gbox Linux box hosting the browser. */
  box?: LinuxBoxOperator;
};

/** State object carried through every node of the agent graph. */
type AgentState = {
  /** Address the agent opens first. */
  url: string;
  /** Natural-language description of what the agent should accomplish. */
  task: string;
  /** Runtime handles (client, page, browser, box). */
  env: AgentEnv;
  /** Step counter compared against the step limit. */
  step: number;
  /** Human-readable log of what has happened so far. */
  trajectory: string[];
  /** Set once the task is considered complete. */
  isFinished: boolean;
  /** Final summary text. */
  summary: string;
  /** Error description; an empty string means no error. */
  error: string;
};
// Create the state graph
/**
 * Reducer shared by all channels: an incoming update replaces the stored value,
 * while a missing (null/undefined) update leaves the stored value untouched.
 */
const keepLatest = <T>(current: T, update?: T): T => update ?? current;

/**
 * State graph builder for the agent. Every field of `AgentState` gets its own
 * channel with a last-write-wins reducer and an "empty" default value.
 */
const builder = new StateGraph<AgentState>({
  channels: {
    url: { value: keepLatest, default: () => "" },
    task: { value: keepLatest, default: () => "" },
    env: { value: keepLatest, default: () => ({}) },
    step: { value: keepLatest, default: () => 0 },
    trajectory: { value: keepLatest, default: () => [] },
    isFinished: { value: keepLatest, default: () => false },
    summary: { value: keepLatest, default: () => "" },
    error: { value: keepLatest, default: () => "" },
  },
});
// Wire the nodes: __start__ → initialize → takeAction. After every takeAction
// run, shouldContinue returns "takeAction" (loop again) or "finish".
// NOTE(review): no edge leaves "finish"; confirm the installed LangGraph version
// treats such a node as terminal, otherwise add an edge from "finish" to "__end__".
builder
.addNode("initialize", initialize)
.addEdge("__start__", "initialize")
.addNode("takeAction", takeAction)
.addEdge("initialize", "takeAction")
.addNode("finish", finish)
.addConditionalEdges("takeAction", shouldContinue, {
takeAction: "takeAction",
finish: "finish",
});
// Compile the graph into the runnable object that is invoked to run the agent
const graph = builder.compile();
Implement Initialize Node
initialize
/**
 * `initialize` node: sets up the agent's initial state.
 *
 * Validates the requested URL/task and the required environment variables,
 * creates the OpenAI client, creates a Gbox Linux box, connects Playwright to
 * the box's browser over CDP and navigates to the starting URL.
 *
 * @param state - Graph state; `url` and `task` must be non-empty.
 * @returns The initial state with `env` populated and the first trajectory entry.
 * @throws Error when `url`/`task` or a required environment variable is missing.
 *   Failures while connecting or navigating are rethrown after the browser and
 *   box created here have been released.
 */
const initialize = async (state: AgentState): Promise<Partial<AgentState>> => {
  if (!state.url || !state.task) {
    throw new Error(
      "URL and task must be provided to initialize the agent."
    );
  }
  console.log("[Initialize] URL:", state.url);
  console.log("[Initialize] Task:", state.task);

  // Initialize OpenAI client
  const openaiApiKey = process.env.OPENAI_API_KEY;
  const openaiOrganization = process.env.OPENAI_ORGANIZATION;
  if (!openaiApiKey || !openaiOrganization) {
    throw new Error(
      "OpenAI API key or organization are not set in environment variables."
    );
  }
  const client = new OpenAI({
    apiKey: openaiApiKey,
    organization: openaiOrganization,
  });

  // Initialize browser page
  const gboxApiKey = process.env.GBOX_API_KEY;
  if (!gboxApiKey) {
    throw new Error("Gbox API key is not set in environment variables.");
  }
  const gboxSDK = new GboxSDK({ apiKey: gboxApiKey });
  const box = await gboxSDK.create({ type: "linux" });

  let browser: Awaited<ReturnType<typeof chromium.connectOverCDP>> | undefined;
  try {
    const cdpUrl = await box.browser.cdpUrl();
    browser = await chromium.connectOverCDP(cdpUrl);
    const context = await browser.newContext();
    const page = await context.newPage();

    // Navigate to the initial URL
    await page.goto(state.url);
    const trajectory: string[] = [`Navigated to ${state.url}`];
    console.log("[Initialize] Page loaded");

    return {
      url: state.url,
      task: state.task,
      env: { client, page, browser, box },
      step: 0,
      trajectory,
      isFinished: false,
      summary: "",
      // `error` is a string everywhere else (checked via `.length`), so use "" for "no error".
      error: "",
    };
  } catch (error) {
    // When this node throws, the graph never reaches `finish`, so nothing else
    // would release the remote browser and box. Clean up here, then rethrow.
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        console.warn("[Initialize] Failed to close browser:", closeError);
      }
    }
    try {
      await box.terminate();
    } catch (terminateError) {
      console.warn("[Initialize] Failed to terminate box:", terminateError);
    }
    throw error;
  }
};
Implement Take Action Node
takeAction
/**
 * Planning half of the `takeAction` node (the node that performs actions based
 * on the agent's task): asks the model which action to take next.
 *
 * The model is prompted to answer with a JSON object holding `action` and
 * `finished`. The reply may be wrapped in a Markdown code fence and `finished`
 * may arrive as a boolean or as the strings "true"/"false"; both are handled.
 *
 * @param client - OpenAI client used for the request.
 * @param task - Task description inserted into the prompt.
 * @param trajectory - Previous steps, listed in the prompt one per line.
 * @param screenshotBase64 - Base64-encoded PNG of the current page.
 * @returns `message` (the action to take, or a status text), `isFinished`,
 *   and `isError`. Request or parsing failures are reported through `isError`
 *   rather than thrown.
 */
const decideAction = async (
  client: OpenAI,
  task: string,
  trajectory: string[],
  screenshotBase64: string
): Promise<{
  message: string;
  isFinished: boolean;
  isError: boolean;
}> => {
  // Prompt
  const userPrompt = `You are a senior QA engineer with expertise in web automation and testing. You need to perform the following task: ${task}.
Please decide what action to take next based on the current state of the page and return in json format:
\`\`\`json
{
"action": "describe the action to take, if finished, leave empty",
"finished": "determine if the task is complete, true if finished, false otherwise"
}
\`\`\`
# Previous step
${trajectory.join("\n")}
# Current Screenshot`;

  try {
    const response = await client.responses.create({
      model: "o3",
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: userPrompt,
            },
            {
              type: "input_image",
              image_url: `data:image/png;base64,${screenshotBase64}`,
              detail: "high",
            },
          ],
        },
      ],
    });

    // Strip an optional Markdown fence. Trimming first makes the closing fence
    // match even when the model ends its reply with a newline.
    const jsonText = response.output_text
      .trim()
      .replace(/^```(?:json)?\s*/i, "")
      .replace(/\s*```$/, "");

    const actionData: unknown = JSON.parse(jsonText);
    if (!actionData || typeof actionData !== "object") {
      return {
        message: "Invalid action response format.",
        isFinished: false,
        isError: true,
      };
    }

    const { action, finished } = actionData as {
      action?: unknown;
      finished?: unknown;
    };

    // The prompt describes `finished` inside a string, so models sometimes
    // answer "false" (a truthy string). Only real true / "true" count as done.
    const isDone =
      finished === true ||
      (typeof finished === "string" &&
        finished.trim().toLowerCase() === "true");
    if (isDone) {
      console.log("[Take Action] Task is finished.");
      return {
        message: "Task completed successfully.",
        isFinished: true,
        isError: false,
      };
    }

    // Without a usable action there is nothing to execute.
    if (typeof action !== "string" || action.trim() === "") {
      return {
        message: "Invalid action response format.",
        isFinished: false,
        isError: true,
      };
    }

    return {
      message: action,
      isError: false,
      isFinished: false,
    };
  } catch (error) {
    console.error("[Error] Failed to get response from OpenAI:", error);
    return {
      message: "No action taken due to error.",
      isFinished: false,
      isError: true,
    };
  }
};
/**
 * Asks the computer-use model to turn a textual action into a concrete
 * browser operation and performs the first operation it proposes.
 *
 * @param client - OpenAI client used for the request.
 * @param page - Playwright page the operation is applied to.
 * @param task - Overall task, included in the prompt for context.
 * @param screenshotBase64 - Base64-encoded PNG of the current page.
 * @param action - Description of the action to carry out.
 * @returns Resolves once the operation has been handed to `handleModelAction`,
 *   or immediately (after a warning) when the model proposed no operation.
 */
const executeAction = async (
  client: OpenAI,
  page: Page,
  task: string,
  screenshotBase64: string,
  action: string
) => {
  const computerUsePrompt = `You are a senior QA engineer with expertise in web automation and testing. You need to perform the following task: ${task}.
Please take following action without any hesitation
${action}
`;

  // Report the real viewport so the coordinates the model returns line up with
  // the screenshot. Fall back to the previous fixed size when none is set.
  const viewport = page.viewportSize() ?? { width: 1024, height: 768 };

  // The request shape may not match the installed SDK typings, hence the suppression.
  // @ts-ignore
  const response = await client.responses.create({
    model: "computer-use-preview",
    tools: [
      {
        type: "computer_use_preview",
        display_width: viewport.width,
        display_height: viewport.height,
        environment: "browser",
      },
    ],
    input: [
      {
        role: "user",
        content: [
          {
            type: "input_text",
            text: computerUsePrompt,
          },
          {
            type: "input_image",
            image_url: `data:image/png;base64,${screenshotBase64}`,
          },
        ],
      },
    ],
    reasoning: {
      summary: "concise",
    },
    truncation: "auto",
  });

  const toolCalls = response.output
    .filter((item: any) => item.type === "computer_call")
    .map((item: any) => item.action);
  console.log("[Take Action] Tool calls found in response:", toolCalls);

  // `filter`/`map` always yield an array, so emptiness must be checked by
  // length. Without this guard `toolCalls[0]` would be undefined.
  if (toolCalls.length === 0) {
    console.warn(
      `No computer calls found in LLM response: ${JSON.stringify(
        response
      )}`
    );
    return;
  }

  // Only the first proposed operation is executed per step.
  await handleModelAction(page, toolCalls[0]);
};
/**
 * `takeAction` node: performs one plan-then-act step.
 *
 * Takes a screenshot, asks `decideAction` for the next action and, unless the
 * task is finished or planning failed, executes it via `executeAction`.
 * Every attempt (successful or not) counts toward the 25-step budget so that a
 * persistent failure ends the run through the step limit.
 *
 * @param state - Current graph state; `env.client` and `env.page` are required.
 * @returns The updated state. `error` is set when the step limit is reached or
 *   the environment is incomplete; `isFinished` is set only when the planner
 *   reports completion.
 */
const takeAction = async (state: AgentState): Promise<Partial<AgentState>> => {
  if (state.step >= 25) {
    console.error("[Error] Step limit reached, cannot take more actions.");
    return {
      ...state,
      error: "Action limit reached",
    };
  }

  // Decide next action
  const client = state.env?.client;
  const page = state.env?.page;
  if (!client || !page) {
    console.error(
      "[Error] Missing required environment (client or page) for action."
    );
    return {
      ...state,
      error: "Missing required environment (client or page) for action.",
    };
  }

  const screenshot = await page.screenshot();
  if (!screenshot) {
    return {
      ...state,
      step: state.step + 1,
      trajectory: [...state.trajectory, "Failed to take screenshot."],
    };
  }
  const screenshotBase64 = screenshot.toString("base64");

  const { isError, isFinished, message } = await decideAction(
    client,
    state.task,
    state.trajectory,
    screenshotBase64
  );

  if (isFinished) {
    console.log(
      "[Take Action] Task is finished, no further actions needed."
    );
    return {
      ...state,
      trajectory: [...state.trajectory, "Task was marked as finished."],
      isFinished: true,
    };
  }

  if (isError) {
    console.error(
      "[Take Action] Error occurred, logging message:",
      message
    );
    // Count the failed attempt; otherwise a planner that keeps failing would
    // loop without ever reaching the step limit.
    return {
      ...state,
      step: state.step + 1,
      trajectory: [...state.trajectory, message],
    };
  }

  // Execute action
  console.log("[Take Action] Executing action:", message);
  try {
    await executeAction(
      client,
      page,
      state.task,
      screenshotBase64,
      message
    );
  } catch (error) {
    // Record the failure and let the planner react on the next step instead of
    // aborting the whole graph (which would skip the cleanup done in `finish`).
    const reason = error instanceof Error ? error.message : String(error);
    console.error("[Take Action] Action failed:", reason);
    return {
      ...state,
      step: state.step + 1,
      trajectory: [...state.trajectory, message, `Action failed: ${reason}`],
    };
  }
  console.log("[Take Action] Action executed successfully.");

  return {
    ...state,
    step: state.step + 1,
    trajectory: [
      ...state.trajectory,
      message,
      "Action executed successfully.",
    ],
    // A single executed action does not complete the task; only the planner
    // decides that. Leaving this false lets the graph loop to the next step.
    isFinished: false,
  };
};
`handleModelAction` can be implemented to handle specific actions based on the model’s response:
import { Page } from "playwright";
/** Modifier names (upper-cased) mapped to the key names Playwright expects. */
const MODIFIER_KEY_NAMES = new Map<string, string>([
  ["SHIFT", "Shift"],
  ["CTRL", "Control"],
  ["CONTROL", "Control"],
  ["ALT", "Alt"],
  ["OPTION", "Alt"],
  ["META", "Meta"],
  ["CMD", "Meta"],
  ["COMMAND", "Meta"],
  ["WIN", "Meta"],
  ["SUPER", "Meta"],
]);

/** Non-modifier key names (upper-cased) mapped to the key names Playwright expects. */
const NAMED_KEY_NAMES = new Map<string, string>([
  ["ENTER", "Enter"],
  ["RETURN", "Enter"],
  ["SPACE", " "],
  ["PAGEDOWN", "PageDown"],
  ["PAGEUP", "PageUp"],
  ["ESC", "Escape"],
  ["ESCAPE", "Escape"],
  ["TAB", "Tab"],
  ["BACKSPACE", "Backspace"],
  ["DELETE", "Delete"],
  ["DEL", "Delete"],
  ["HOME", "Home"],
  ["END", "End"],
  ["INSERT", "Insert"],
  ["ARROWUP", "ArrowUp"],
  ["ARROWDOWN", "ArrowDown"],
  ["ARROWLEFT", "ArrowLeft"],
  ["ARROWRIGHT", "ArrowRight"],
  ["UP", "ArrowUp"],
  ["DOWN", "ArrowDown"],
  ["LEFT", "ArrowLeft"],
  ["RIGHT", "ArrowRight"],
]);

/**
 * Applies one model-proposed operation to the page.
 *
 * Supported `action.type` values: `click`, `double_click`, `scroll`,
 * `keypress`, `drag`, `type`, `wait` and `screenshot` (a no-op). Unknown types
 * and missing actions are logged and ignored. Errors raised while performing an
 * operation are logged, not rethrown.
 *
 * @param page - Playwright page to operate on.
 * @param action - Operation descriptor; its fields depend on `action.type`.
 */
export async function handleModelAction(
  page: Page,
  action: any
): Promise<void> {
  // Guard before reading `type`: a missing action would otherwise throw
  // outside the try block below.
  if (action == null || typeof action !== "object") {
    console.log(`Error handling action: ${JSON.stringify(action)}, error: no action provided`);
    return;
  }
  const actionType = action["type"] as string;

  try {
    switch (actionType) {
      case "click": {
        const { x, y, button = "left" } = action;
        await page.mouse.click(x, y, { button });
        break;
      }
      case "double_click": {
        const { x, y, button = "left" } = action;
        await page.mouse.dblclick(x, y, { button });
        break;
      }
      case "scroll": {
        const { x, y, scrollX, scrollY, scroll_x, scroll_y } = action;
        // Use the camelCase value if available, otherwise fall back to
        // snake_case; a missing axis means "do not scroll on that axis".
        const effectiveScrollX = scrollX ?? scroll_x ?? 0;
        const effectiveScrollY = scrollY ?? scroll_y ?? 0;
        await page.mouse.move(x, y);
        await page.evaluate(
          ({ scrollX, scrollY }) => window.scrollBy(scrollX, scrollY),
          { scrollX: effectiveScrollX, scrollY: effectiveScrollY }
        );
        break;
      }
      case "keypress": {
        // Handles key combinations (e.g. ["CTRL","A"]) by holding the
        // modifiers down while pressing the remaining key(s).
        const { keys } = action;
        if (!Array.isArray(keys) || keys.length === 0) {
          return;
        }

        const heldModifiers: string[] = [];
        const mainKeys: string[] = [];
        for (const rawKey of keys) {
          const keyText = String(rawKey);
          const modifier = MODIFIER_KEY_NAMES.get(keyText.toUpperCase());
          if (modifier !== undefined) {
            heldModifiers.push(modifier);
          } else {
            mainKeys.push(keyText);
          }
        }

        const pressedModifiers: string[] = [];
        try {
          for (const modifier of heldModifiers) {
            await page.keyboard.down(modifier);
            pressedModifiers.push(modifier);
          }
          for (const key of mainKeys) {
            // Translate well-known names; anything else is pressed as-is.
            await page.keyboard.press(
              NAMED_KEY_NAMES.get(key.toUpperCase()) ?? key
            );
          }
        } finally {
          // Always release, in reverse order, so a failed press cannot leave
          // a modifier stuck down for later actions.
          for (let i = pressedModifiers.length - 1; i >= 0; i--) {
            await page.keyboard.up(pressedModifiers[i]);
          }
        }
        break;
      }
      case "drag": {
        const { path } = action;
        if (!Array.isArray(path) || path.length < 2) {
          return;
        }
        // Press at the first point, move through the rest, then release.
        const [start, ...rest] = path;
        await page.mouse.move(start.x, start.y);
        await page.mouse.down();
        try {
          for (const point of rest) {
            await page.mouse.move(point.x, point.y);
          }
        } finally {
          // Release even if a move fails so the button is not left held.
          await page.mouse.up();
        }
        break;
      }
      case "type": {
        const { text } = action;
        await page.keyboard.type(text);
        break;
      }
      case "wait": {
        await page.waitForTimeout(2000);
        break;
      }
      case "screenshot": {
        // Nothing to do: the caller captures a screenshot on every step.
        return;
      }
      default: {
        console.log(`Unrecognized action: ${JSON.stringify(action)}`);
      }
    }
  } catch (error) {
    console.log(
      `Error handling action: ${JSON.stringify(action)}, error: ${error}`
    );
  }
}
Implement Finish Node
finish
/**
 * `finish` node: summarizes the agent's actions and cleans up resources.
 *
 * When the run ended with an error, the error text becomes the summary and no
 * model request is made. Otherwise the model is asked for a concise summary
 * based on the trajectory and a final screenshot. The browser and the box are
 * released in every case, including when the summary request throws or the
 * environment is incomplete.
 *
 * @param state - Final graph state.
 * @returns The state with `summary` set, or with `error` set when the client
 *   or page is missing.
 * @throws Whatever the screenshot or summary request throws (after cleanup).
 */
const finish = async (state: AgentState): Promise<Partial<AgentState>> => {
  const env = state.env;
  const client = env?.client;
  const page = env?.page;
  const browser = env?.browser;
  const box = env?.box;

  try {
    if (!client || !page) {
      console.error(
        "[Error] Missing required environment (client or page) for finishing."
      );
      return {
        ...state,
        error: "Missing required environment (client or page) for finishing.",
      };
    }

    // An errored run reports the error itself; asking the model for a summary
    // that would be discarded only costs time and money.
    if (state.error.length > 0) {
      console.error("[Finish] Agent encountered an error.");
      return {
        ...state,
        summary: state.error.trim(),
      };
    }

    // Summary of the task
    const userPrompt = `You are a senior QA engineer with expertise in web automation and testing. You need to finish the task: ${state.task}.
Please provide a concise summary of the task and the final result (if any)
# Actions taken
${state.trajectory.join("\n")}
`;
    const screenshot = await page.screenshot();
    const screenshotBase64 = screenshot.toString("base64");

    console.log("[Finish] Generating final summary...");
    const response = await client.responses.create({
      model: "o3",
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: userPrompt,
            },
            {
              type: "input_image",
              image_url: `data:image/png;base64,${screenshotBase64}`,
              detail: "high",
            },
          ],
        },
      ],
    });

    console.log("[Finish] Agent completed successfully.");
    return {
      ...state,
      summary: response.output_text.trim(),
    };
  } finally {
    // Close the browser and terminate the box no matter how we leave this node.
    // Each step is isolated so a failing close cannot prevent box termination.
    if (browser) {
      try {
        await browser.close();
        console.log("[Finish] Browser closed.");
      } catch (closeError) {
        console.error("[Finish] Failed to close browser:", closeError);
      }
    }
    if (box) {
      try {
        await box.terminate();
        console.log("[Finish] Box terminated.");
      } catch (terminateError) {
        console.error("[Finish] Failed to terminate box:", terminateError);
      }
    }
  }
};
Add conditional logic
Add a `shouldContinue` function to determine if the agent should continue taking actions or finish:
// Define the conditional routing logic
/**
 * Routing decision taken after each `takeAction` run.
 *
 * Checks, in order: a recorded error, the finished flag, and the 25-step cap.
 * Any of them routes to `"finish"`; otherwise the agent loops.
 *
 * @param state - Current graph state.
 * @returns `"finish"` or `"takeAction"`.
 */
function shouldContinue(state: AgentState): string {
  // Logs every given line, then yields the "finish" route.
  const finishWith = (...lines: string[]): string => {
    for (const line of lines) {
      console.log(line);
    }
    return "finish";
  };

  const hasError = state.error.length > 0;
  if (hasError) {
    return finishWith("[Info] Task encountered an error, ending.", state.error);
  }

  if (state.isFinished) {
    return finishWith("[Info] Task is finished, no further actions needed.");
  }

  const stepLimitReached = state.step >= 25;
  if (stepLimitReached) {
    return finishWith("[Info] Step limit reached, finishing.");
  }

  console.log("[Info] Continuing to take next action.");
  return "takeAction";
}
Run the Agent
// Example usage:
/**
 * Runs the example task and prints the resulting summary.
 *
 * Wrapped in a function because top-level `await` is only available when the
 * file is loaded as an ES module; this form works in either module mode.
 */
async function main(): Promise<void> {
  const finalState = await graph.invoke(
    {
      url: "https://docs.gbox.ai/",
      task: "Find the latest version of Typescript GboxSDK from Changelog page",
    },
    { recursionLimit: 100 }
  );
  console.log("[Final Result]\n\n", finalState.summary);
}

main().catch((error: unknown) => {
  // Surface the failure and signal it through the exit code instead of
  // leaving an unhandled rejection.
  console.error("[Fatal] Agent run failed:", error);
  process.exitCode = 1;
});
Run the agent with `tsx`:
npx tsx index.ts
Replace `index.ts` with the path to your TypeScript file if it’s located elsewhere.
Was this page helpful?