Build Android-Use Agents with Gbox and LangChain.
Install Dependencies
npm install gbox-sdk @langchain/core @langchain/langgraph @langchain/openai @langchain/community openai dotenv typescript tsx @types/node
Add Environment Variables
Create a `.env` file in your project root and add your Gbox API key and OpenAI API key:
# Gbox API Configuration
GBOX_API_KEY=your_gbox_api_key
# OpenAI API Configuration
OPENAI_API_KEY=your_openai_api_key
OPENAI_ORGANIZATION=your_openai_organization_id
Create Agent Flow
Create `index.ts` with the following content:
import OpenAI from "openai";
import { StateGraph } from "@langchain/langgraph";
import { AndroidBoxOperator, GboxSDK } from "gbox-sdk";
import * as dotenv from "dotenv";
dotenv.config();
/**
 * Shared state that flows through every node of the agent graph.
 */
type AgentState = {
  /** Link to the APK that `initialize` installs on the Android box. */
  apk: string;
  /** Natural-language description of what the agent should do. */
  task: string;
  /** Runtime handles created by `initialize`; both are absent before it runs. */
  env: { client?: OpenAI; box?: AndroidBoxOperator };
  /** Counter compared against the step limit by `takeAction` and `shouldContinue`. */
  step: number;
  /** Chronological log of decided actions and their outcomes. */
  trajectory: string[];
  /** Set once the task is considered complete. */
  isFinished: boolean;
  /** Final summary written by `finish`. */
  summary: string;
  /** Error description; a non-empty value routes the graph to `finish`. */
  error: string;
};
// Create the state graph.
// Every channel uses the same "last write wins" reducer: a value supplied by a
// node replaces the stored one, while an omitted (undefined/null) update keeps
// the previous value. Each channel also declares the value it starts with.
const builder = new StateGraph<AgentState>({
  channels: {
    apk: {
      default: () => "",
      value: (current: string, update?: string) => update ?? current,
    },
    task: {
      default: () => "",
      value: (current: string, update?: string) => update ?? current,
    },
    env: {
      // Starts empty; `initialize` fills in the OpenAI client and the box.
      default: () => ({}),
      value: (current: AgentState["env"], update?: AgentState["env"]) =>
        update ?? current,
    },
    step: {
      default: () => 0,
      value: (current: number, update?: number) => update ?? current,
    },
    trajectory: {
      // The whole array is replaced on update, not appended to.
      default: () => [],
      value: (current: string[], update?: string[]) => update ?? current,
    },
    isFinished: {
      default: () => false,
      value: (current: boolean, update?: boolean) => update ?? current,
    },
    summary: {
      default: () => "",
      value: (current: string, update?: string) => update ?? current,
    },
    error: {
      default: () => "",
      value: (current: string, update?: string) => update ?? current,
    },
  },
});
// Wire the graph: start -> initialize -> takeAction, then loop on takeAction
// until `shouldContinue` routes to finish, and end the run after finish.
// NOTE: the node functions are passed by value here, so if they are declared
// with `const` they must be defined before this statement executes.
builder
  .addNode("initialize", initialize)
  .addNode("takeAction", takeAction)
  .addNode("finish", finish)
  .addEdge("__start__", "initialize")
  .addEdge("initialize", "takeAction")
  .addConditionalEdges("takeAction", shouldContinue, {
    takeAction: "takeAction",
    finish: "finish",
  })
  // Make the terminal transition explicit instead of relying on how the
  // installed LangGraph version treats nodes without outgoing edges.
  .addEdge("finish", "__end__");
// Compile the graph
const graph = builder.compile();
Implement Initialize Node
initialize
/**
 * `initialize` node: sets up the agent's initial state.
 *
 * Validates the inputs and environment variables, creates the OpenAI client,
 * creates an Android box, installs and opens the APK, and prints the live-view
 * URL.
 *
 * Declared as a function declaration (hoisted) so it can be referenced by the
 * graph wiring regardless of where it appears in the file.
 *
 * @param state - Incoming graph state; `apk` and `task` must be non-empty.
 * @returns The reset state including the created `client` and `box`.
 * @throws Error when `apk`/`task` or a required environment variable is
 *   missing, or when setting up the box fails (the box is terminated first).
 */
async function initialize(state: AgentState): Promise<Partial<AgentState>> {
  if (!state.apk || !state.task) {
    throw new Error("Apk and task must be provided to initialize the agent.");
  }
  console.log("[Initialize] APK Link:", state.apk);
  console.log("[Initialize] Task:", state.task);

  // Initialize OpenAI client
  if (!process.env.OPENAI_API_KEY || !process.env.OPENAI_ORGANIZATION) {
    throw new Error(
      "OpenAI API key or organization are not set in environment variables."
    );
  }
  const client = new OpenAI({
    apiKey: process.env.OPENAI_API_KEY,
    organization: process.env.OPENAI_ORGANIZATION,
  });

  // Initialize Application Box
  if (!process.env.GBOX_API_KEY) {
    throw new Error("Gbox API key is not set in environment variables.");
  }
  const gboxSDK = new GboxSDK({ apiKey: process.env.GBOX_API_KEY });
  const box = await gboxSDK.create({ type: "android" });

  try {
    const app = await box.app.install({
      apk: state.apk,
    });
    console.log("[Initialize] App installed successfully:");
    await app.open();
    console.log("[Initialize] App opened successfully.");
    const liveView = await box.liveView();
    console.log(
      "[Initialize] Open the following URL in your browser to see the live view:",
      liveView.url
    );
  } catch (error: unknown) {
    // The box already exists at this point; release it so a failed setup does
    // not leave it running, then surface the original failure.
    try {
      await box.terminate();
    } catch (terminateError: unknown) {
      console.error(
        "[Initialize] Failed to terminate box after setup error:",
        terminateError
      );
    }
    throw error;
  }

  return {
    apk: state.apk,
    task: state.task,
    env: {
      client,
      box,
    },
    step: 0,
    trajectory: [],
    // Empty string (not undefined) keeps the channel a real string and clears
    // any stale error.
    error: "",
    isFinished: false,
    summary: "",
  };
}
Implement Take Action Node
takeAction
/**
 * Helper for the `takeAction` node: asks the model which action to perform
 * next, based on the task, the previous steps and the current screenshot.
 *
 * @param client - OpenAI client used to call the Responses API.
 * @param task - The task the agent is working on.
 * @param trajectory - Previous steps, joined into the prompt one per line.
 * @param screenshotBase64 - Current screenshot, either a full `data:`/`http(s)`
 *   URL or a bare base64 payload (which is wrapped in a PNG data URL).
 * @returns `message` is the action to execute (or a status text),
 *   `isFinished` signals task completion, `isError` signals that no usable
 *   action could be obtained. Never throws; failures are reported via `isError`.
 */
async function decideAction(
  client: OpenAI,
  task: string,
  trajectory: string[],
  screenshotBase64: string
): Promise<{
  message: string;
  isFinished: boolean;
  isError: boolean;
}> {
  // Prompt
  const userPrompt = `You are a senior QA engineer with expertise in web automation and testing. You need to perform the following task: ${task}.
Please decide what action to take next based on the current state of the page and return in json format:
\`\`\`json
{
"action": "describe the action to take, if finished, leave empty",
"finished": "determine if the task is complete, true if finished, false otherwise"
}
\`\`\`
# Previous step
${trajectory.join("\n")}
# Current Screenshot`;

  // Accept both a ready-made URL and a bare base64 payload so the image is
  // never sent without (or with a doubled) data-URL prefix.
  const imageUrl = /^(data:|https?:\/\/)/i.test(screenshotBase64)
    ? screenshotBase64
    : `data:image/png;base64,${screenshotBase64}`;

  try {
    const response = await client.responses.create({
      model: "o3",
      input: [
        {
          role: "user",
          content: [
            {
              type: "input_text",
              text: userPrompt,
            },
            {
              type: "input_image",
              image_url: imageUrl,
              detail: "high",
            },
          ],
        },
      ],
    });

    // Strip an optional Markdown code fence; trimming first makes the closing
    // fence match even when the model appends a trailing newline.
    const actionResponse = response.output_text
      .trim()
      .replace(/^```(?:json)?\s*/i, "")
      .replace(/\s*```$/, "");
    const actionData: unknown = JSON.parse(actionResponse);
    if (!actionData || typeof actionData !== "object") {
      return {
        message: "Invalid action response format.",
        isFinished: false,
        isError: true,
      };
    }

    const { action, finished } = actionData as {
      action?: unknown;
      finished?: unknown;
    };
    // The prompt describes `finished` with a string, so the model may answer
    // "true"/"false" as text; a plain truthiness check would treat "false" as
    // finished.
    const taskFinished =
      finished === true ||
      (typeof finished === "string" &&
        finished.trim().toLowerCase() === "true");
    if (taskFinished) {
      console.log("[Take Action] Task is finished.");
      return {
        message: "Task completed successfully.",
        isFinished: true,
        isError: false,
      };
    }

    // Without a textual action there is nothing to execute.
    if (typeof action !== "string" || action.trim() === "") {
      return {
        message: "Invalid action response format.",
        isFinished: false,
        isError: true,
      };
    }
    return {
      message: action,
      isError: false,
      isFinished: false,
    };
  } catch (error: unknown) {
    console.error("[Error] Failed to get response from OpenAI:", error);
    return {
      message: "No action taken due to error.",
      isFinished: false,
      isError: true,
    };
  }
}
/**
 * `takeAction` node: takes a screenshot, asks `decideAction` for the next
 * step and executes it on the box.
 *
 * Declared as a function declaration (hoisted) so it can be referenced by the
 * graph wiring regardless of where it appears in the file.
 *
 * @param state - Current graph state; `env.client` and `env.box` must be set.
 * @returns The updated state. `error` is set when the step limit is reached,
 *   the environment is missing, or executing the action fails; `isFinished`
 *   is set only when the model reports the task as complete.
 */
async function takeAction(state: AgentState): Promise<Partial<AgentState>> {
  const maxSteps = 25;
  if (state.step >= maxSteps) {
    console.error("[Error] Step limit reached, cannot take more actions.");
    return {
      ...state,
      error: "Action limit reached",
    };
  }

  // Decide next action
  const client = state.env?.client;
  const box = state.env?.box;
  if (!client || !box) {
    console.error(
      "[Error] Missing required environment (client or box) for action."
    );
    return {
      ...state,
      error: "Missing required environment (client or box) for action.",
    };
  }

  const screenshot = await box.action.screenshot({ outputFormat: "base64" });
  const screenshotBase64 = screenshot.uri;
  if (!screenshotBase64) {
    // Count the failed attempt so repeated failures are bounded by the step
    // limit instead of looping until the graph recursion limit.
    return {
      ...state,
      step: state.step + 1,
      trajectory: [...state.trajectory, "Failed to take screenshot."],
    };
  }

  const { isError, isFinished, message } = await decideAction(
    client,
    state.task,
    state.trajectory,
    screenshotBase64
  );
  if (isFinished) {
    console.log(
      "[Take Action] Task is finished, no further actions needed."
    );
    return {
      ...state,
      trajectory: [...state.trajectory, "Task was marked as finished."],
      isFinished: true,
    };
  }
  if (isError) {
    console.error(
      "[Take Action] Error occurred, logging message:",
      message
    );
    // Failed decisions also consume a step, for the same reason as above.
    return {
      ...state,
      step: state.step + 1,
      trajectory: [...state.trajectory, message],
    };
  }

  // Execute action
  console.log("[Take Action] Executing action:", message);
  try {
    await box.action.ai({
      instruction: message,
      background: "You are a QA engineer. You are testing the application.",
    });
  } catch (error: unknown) {
    // Report through `error` so the graph routes to `finish`, which
    // terminates the box, instead of crashing with the box still running.
    console.error("[Take Action] Failed to execute action:", error);
    const reason = error instanceof Error ? error.message : String(error);
    return {
      ...state,
      step: state.step + 1,
      trajectory: [...state.trajectory, message, `Action failed: ${reason}`],
      error: `Failed to execute action: ${reason}`,
    };
  }
  console.log("[Take Action] Action executed successfully.");

  return {
    ...state,
    step: state.step + 1,
    trajectory: [
      ...state.trajectory,
      message,
      "Action executed successfully.",
    ],
    // One executed action does not complete the task; keep looping until the
    // model reports completion or the step limit is hit.
    isFinished: false,
  };
}
Implement Finish Node
finish
/**
 * `finish` node: summarizes the agent's actions and cleans up resources.
 *
 * When the state carries an error, that error becomes the summary and the
 * model is not called. Otherwise the model is asked for a summary based on
 * the trajectory and a final screenshot. The box is terminated in either
 * case, including when summary generation throws.
 *
 * Declared as a function declaration (hoisted) so it can be referenced by the
 * graph wiring regardless of where it appears in the file.
 *
 * @param state - Final graph state; `env.client` and `env.box` must be set.
 * @returns The state with `summary` filled in, or with `error` set when the
 *   environment is missing.
 * @throws Whatever the screenshot or model call throws (after the box has
 *   been terminated).
 */
async function finish(state: AgentState): Promise<Partial<AgentState>> {
  const client = state.env?.client;
  const box = state.env?.box;
  if (!client || !box) {
    console.error(
      "[Error] Missing required environment (client or box) for finishing."
    );
    return {
      ...state,
      error: "Missing required environment (client or box) for finishing.",
    };
  }

  // Summary of the task
  const userPrompt = `You are a senior QA engineer with expertise in application automation and testing. You need to finish the task: ${state.task}.
Please provide a concise summary of the task and the final result (if any)
# Actions taken
${state.trajectory.join("\n")}
`;

  const errorMessage = state.error ?? "";
  const hasError = errorMessage.length > 0;
  let response = "";
  try {
    // The model's summary would be discarded when an error is present, so
    // only ask for it on the success path.
    if (!hasError) {
      const screenshot = await box.action.screenshot({
        outputFormat: "base64",
      });
      const screenshotUri = screenshot.uri;
      // Accept both a ready-made URL and a bare base64 payload so the data-URL
      // prefix is never doubled; skip the image if no screenshot is available.
      const imageContent = screenshotUri
        ? [
            {
              type: "input_image" as const,
              image_url: /^(data:|https?:\/\/)/i.test(screenshotUri)
                ? screenshotUri
                : `data:image/png;base64,${screenshotUri}`,
              detail: "high" as const,
            },
          ]
        : [];
      console.log("[Finish] Generating final summary...");
      const result = await client.responses.create({
        model: "o3",
        input: [
          {
            role: "user",
            content: [
              {
                type: "input_text" as const,
                text: userPrompt,
              },
              ...imageContent,
            ],
          },
        ],
      });
      response = result.output_text;
    }
  } finally {
    // Terminate the box even if the summary could not be generated, so it is
    // never left running.
    await box.terminate();
    console.log("[Finish] Box terminated.");
  }

  let summary = response;
  if (hasError) {
    console.error("[Finish] Agent encountered an error.");
    summary = errorMessage;
  } else {
    console.log("[Finish] Agent completed successfully.");
  }
  return {
    ...state,
    summary: summary.trim(),
  };
}
Add conditional logic
Add a `shouldContinue` function to determine if the agent should continue taking actions or finish:
// Define the conditional routing logic
/**
 * Routing decision taken after each `takeAction` run.
 *
 * Checks, in order: a recorded error, the finished flag, and the step limit.
 * The first one that applies ends the loop.
 *
 * @param state - Current graph state.
 * @returns "finish" to end the loop, otherwise "takeAction".
 */
function shouldContinue(state: AgentState): string {
  const { error, isFinished, step } = state;
  const hasError = error.length > 0;

  let stopNotice: string | undefined;
  if (hasError) {
    stopNotice = "[Info] Task encountered an error, ending.";
  } else if (isFinished) {
    stopNotice = "[Info] Task is finished, no further actions needed.";
  } else if (step >= 25) {
    stopNotice = "[Info] Step limit reached, finishing.";
  }

  if (stopNotice === undefined) {
    console.log("[Info] Continuing to take next action.");
    return "takeAction";
  }

  console.log(stopNotice);
  if (hasError) {
    // The error text is logged on its own line after the notice.
    console.log(error);
  }
  return "finish";
}
Run the Agent
// Example usage.
// The run is wrapped in an async function instead of using top-level await so
// the script also works when the file is executed as CommonJS (for example a
// project without "type": "module"), and so a failed run is reported with a
// non-zero exit code.
async function main(): Promise<void> {
  const finalState = await graph.invoke(
    {
      apk: "https://github.com/gsantner/markor/releases/download/v2.14.1/net.gsantner.markor-v158-2.14.1-flavorDefault-release.apk",
      task: "Test the Markor app functionality",
    },
    // Upper bound on graph steps for a single run.
    { recursionLimit: 100 }
  );
  console.log("[Final Result]\n", finalState.summary);
}

main().catch((error: unknown) => {
  console.error("[Fatal] Agent run failed:", error);
  process.exitCode = 1;
});
Run the agent with `tsx`:
npx tsx index.ts
Replace `index.ts` with the path to your TypeScript file if it’s located elsewhere.