> ## Documentation Index
> Fetch the complete documentation index at: https://docs.gbox.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Android Use Agent

> Build Android-Use Agents with GBOX and LangChain.

## Overview

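LangChain is a framework for building AI agents, and its LangGraph library lets you model an agent as a graph of steps. Combined with GBOX's Android-Use infrastructure, it allows you to create AI assistants that can automate tasks, perform web scraping, and interact with Android applications. In this guide you will build an agent that installs an APK in a GBOX Android box, asks an OpenAI model to choose the next action from a screenshot, executes that action on the device, and finally summarizes the run.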

## Getting Started

<Steps>
  <Step title="Install Dependencies">
    Install the necessary dependencies for your project.

    <CodeGroup>
      ```bash npm theme={null}
      npm install gbox-sdk @langchain/core @langchain/langgraph @langchain/openai @langchain/community openai dotenv typescript tsx @types/node
      ```

      ```bash pnpm theme={null}
      pnpm install gbox-sdk @langchain/core @langchain/langgraph @langchain/openai @langchain/community openai dotenv typescript tsx @types/node
      ```

      ```bash yarn theme={null}
      yarn add gbox-sdk @langchain/core @langchain/langgraph @langchain/openai @langchain/community openai dotenv typescript tsx @types/node
      ```
    </CodeGroup>
  </Step>

  <Step title="Add Environment Variables">
    Create a `.env` file in your project root and add your GBOX API key and OpenAI API key:

    ```dotenv theme={null}
    # GBOX API Configuration
    GBOX_API_KEY=your_gbox_api_key
    # OpenAI API Configuration
    OPENAI_API_KEY=your_openai_api_key
    OPENAI_ORGANIZATION=your_openai_organization_id
    ```
  </Step>

  <Step title="Create Agent Flow">
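    Create `index.ts` with the following content. The graph references three node functions (`initialize`, `takeAction`, `finish`) and a `shouldContinue` router that are implemented in the next steps. In the final file, place those definitions above the graph construction, because functions declared with `const` cannot be referenced before they are defined: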

    ```typescript theme={null}
    import OpenAI from "openai";
    import { StateGraph } from "@langchain/langgraph";
    import { AndroidBoxOperator, GboxSDK } from "gbox-sdk";
    import * as dotenv from "dotenv";

    // Load the variables from the `.env` file into `process.env` before any node reads them.
    dotenv.config();

    /**
     * State shared by every node of the agent graph.
     */
    type AgentState = {
        /** APK reference passed to `box.app.install` by the initialize node. */
        apk: string;
        /** Natural-language description of what the agent should accomplish. */
        task: string;
        /** Runtime handles created by the initialize node; empty until it has run. */
        env: Partial<{
            client: OpenAI;
            box: AndroidBoxOperator;
        }>;
        /** Counter compared against the step limit by `takeAction` and `shouldContinue`. */
        step: number;
        /** Log of planned actions and their outcomes; included in every planning prompt. */
        trajectory: string[];
        /** Set once the planner reports that the task is complete. */
        isFinished: boolean;
        /** Final report produced by the finish node. */
        summary: string;
        /** Non-empty when the run hit an error that should end it. */
        error: string;
    };

    /**
     * Builds a "last write wins" channel: an update from a node replaces the
     * stored value, while a missing (null/undefined) update keeps the stored one.
     */
    function lastValue<T>(initial: () => T) {
        return {
            value: (current: T, update?: T): T => update ?? current,
            default: initial,
        };
    }

    // Create the state graph
    const builder = new StateGraph<AgentState>({
        channels: {
            apk: lastValue<string>(() => ""),
            task: lastValue<string>(() => ""),
            env: lastValue<AgentState["env"]>(() => ({})),
            step: lastValue<number>(() => 0),
            trajectory: lastValue<string[]>(() => []),
            isFinished: lastValue<boolean>(() => false),
            summary: lastValue<string>(() => ""),
            error: lastValue<string>(() => ""),
        },
    });

    // Flow: __start__ -> initialize -> takeAction -> (takeAction | finish).
    // NOTE: `initialize`, `takeAction` and `finish` are declared with `const`,
    // so their definitions must appear above this point in the final file.
    const routes = {
        takeAction: "takeAction",
        finish: "finish",
    };

    builder
        .addNode("initialize", initialize)
        .addEdge("__start__", "initialize")
        .addNode("takeAction", takeAction)
        .addEdge("initialize", "takeAction")
        .addNode("finish", finish)
        .addConditionalEdges("takeAction", shouldContinue, routes);

    // Compile the graph
    const graph = builder.compile();
    ```
  </Step>

  <Step title="Implement Initialize Node">
    Implement the `initialize` node to set up the agent's initial state:

    ```typescript TypeScript expandable theme={null}
    /**
     * Entry node of the graph: validates the inputs and credentials, creates the
     * OpenAI client and an Android box, installs and opens the APK, and prints
     * the live-view URL.
     *
     * @param state - Initial state; `apk` and `task` must be non-empty.
     * @returns The reset state, with `env.client` and `env.box` populated.
     * @throws Error when `apk`/`task` or a required environment variable is
     *         missing, or when setting up the app inside the box fails.
     */
    const initialize = async (state: AgentState): Promise<Partial<AgentState>> => {
        if (!state.apk || !state.task) {
            throw new Error(
                "Apk and task must be provided to initialize the agent."
            );
        }
        console.log("[Initialize] APK Link:", state.apk);
        console.log("[Initialize] Task:", state.task);

        // Initialize OpenAI client
        if (!process.env.OPENAI_API_KEY || !process.env.OPENAI_ORGANIZATION) {
            throw new Error(
                "OpenAI API key or organization are not set in environment variables."
            );
        }

        const client = new OpenAI({
            apiKey: process.env.OPENAI_API_KEY,
            organization: process.env.OPENAI_ORGANIZATION,
        });

        // Initialize Application Box
        if (!process.env.GBOX_API_KEY) {
            throw new Error("GBOX API key is not set in environment variables.");
        }

        const gboxSDK = new GboxSDK({ apiKey: process.env.GBOX_API_KEY });
        const box = await gboxSDK.create({ type: "android" });

        try {
            const app = await box.app.install({
                apk: state.apk,
            });
            console.log("[Initialize] App installed successfully:");
            await app.open();
            console.log("[Initialize] App opened successfully.");
            const liveView = await box.liveView();
            console.log(
                "[Initialize] Open the following URL in your browser to see the live view:",
                liveView.url
            );
        } catch (error: unknown) {
            // The box already exists at this point and no later node will get the
            // chance to release it, so terminate it before surfacing the failure.
            try {
                await box.terminate();
            } catch (terminateError: unknown) {
                console.error(
                    "[Initialize] Failed to terminate box after setup error:",
                    terminateError
                );
            }
            throw error;
        }

        return {
            apk: state.apk,
            task: state.task,
            env: {
                client,
                box,
            },
            step: 0,
            trajectory: [],
            // Empty string (not undefined) so that `state.error.length` checks stay valid.
            error: "",
            isFinished: false,
            summary: "",
        };
    };
    ```
  </Step>

  <Step title="Implement Take Action Node">
    Implement the `takeAction` node to perform actions based on the agent's task:

    ````typescript TypeScript expandable theme={null}
    /**
     * Asks the OpenAI model which action to perform next, given the task, the
     * steps taken so far and the current screenshot.
     *
     * @param client - OpenAI client used for the request.
     * @param task - Description of the overall task.
     * @param trajectory - Previous steps, included verbatim in the prompt.
     * @param screenshotBase64 - Screenshot passed to the model as `image_url`.
     * @returns `message` holds the instruction to execute (or a status text),
     *          `isFinished` is true when the model reports completion, and
     *          `isError` is true when no usable action could be obtained.
     *          Failures are reported through `isError`; this function does not throw.
     */
    const decideAction = async (
        client: OpenAI,
        task: string,
        trajectory: string[],
        screenshotBase64: string
    ): Promise<{
        message: string;
        isFinished: boolean;
        isError: boolean;
    }> => {
        // Prompt
        const userPrompt = `You are a senior QA engineer with expertise in Android application automation and testing. You need to perform the following task: ${task}.
        Please decide what action to take next based on the current state of the screen and return in json format:

        \`\`\`json
        {
            "action": "describe the action to take, if finished, leave empty",
            "finished": "determine if the task is complete, true if finished, false otherwise"
        }
        \`\`\`

        # Previous step
        ${trajectory.join("\n")}

        # Current Screenshot`;

        try {
            const response = await client.responses.create({
                model: "o3",
                input: [
                    {
                        role: "user",
                        content: [
                            {
                                type: "input_text",
                                text: userPrompt,
                            },
                            {
                                type: "input_image",
                                image_url: screenshotBase64,
                                detail: "high",
                            },
                        ],
                    },
                ],
            });

            // The reply is usually wrapped in a Markdown fence and may end with
            // whitespace; strip both before parsing.
            const actionResponse = response.output_text
                .trim()
                .replace(/^```(?:json)?\s*/i, "")
                .replace(/\s*```$/, "");
            const actionData: unknown = JSON.parse(actionResponse);

            if (!actionData || typeof actionData !== "object") {
                return {
                    message: "Invalid action response format.",
                    isFinished: false,
                    isError: true,
                };
            }

            const { action, finished } = actionData as {
                action?: unknown;
                finished?: unknown;
            };

            // The prompt describes "finished" in prose, so the model may answer with
            // the string "false". A plain truthiness check would treat that as done.
            const isFinished =
                finished === true ||
                (typeof finished === "string" &&
                    finished.trim().toLowerCase() === "true");

            if (isFinished) {
                console.log("[Take Action] Task is finished.");
                return {
                    message: "Task completed successfully.",
                    isFinished: true,
                    isError: false,
                };
            }

            // Without a concrete instruction there is nothing to execute.
            if (typeof action !== "string" || action.trim() === "") {
                return {
                    message: "Invalid action response format.",
                    isFinished: false,
                    isError: true,
                };
            }

            return {
                message: action,
                isError: false,
                isFinished: false,
            };
        } catch (error: unknown) {
            console.error("[Error] Failed to get response from OpenAI:", error);
            return {
                message: "No action taken due to error.",
                isFinished: false,
                isError: true,
            };
        }
    };

    /**
     * Loop node of the graph: takes a screenshot, asks `decideAction` for the
     * next step and executes it in the box.
     *
     * Every attempt, successful or not, advances `step`, so a persistent
     * failure is bounded by the step limit instead of looping indefinitely.
     *
     * @param state - Current agent state; `env.client` and `env.box` must be set.
     * @returns The updated state. `isFinished` is set to true only when the
     *          planner reports that the task is complete; `error` is set when
     *          the step limit is hit or the environment is missing.
     */
    const takeAction = async (state: AgentState): Promise<Partial<AgentState>> => {
        if (state.step >= 25) {
            console.error("[Error] Step limit reached, cannot take more actions.");
            return {
                ...state,
                error: "Action limit reached",
            };
        }

        // Decide next action
        if (!state.env || !state.env.client || !state.env.box) {
            console.error(
                "[Error] Missing required environment (client or box) for action."
            );
            return {
                ...state,
                error: "Missing required environment (client or box) for action.",
            };
        }
        const client = state.env.client;
        const box = state.env.box;

        const screenshot = await box.action.screenshot({ outputFormat: "base64" });
        const screenshotBase64 = screenshot.uri;

        if (!screenshotBase64) {
            return {
                ...state,
                // Count the failed attempt so repeated failures hit the step limit.
                step: state.step + 1,
                trajectory: [...state.trajectory, "Failed to take screenshot."],
            };
        }

        const { isError, isFinished, message } = await decideAction(
            client,
            state.task,
            state.trajectory,
            screenshotBase64
        );

        if (isFinished) {
            console.log(
                "[Take Action] Task is finished, no further actions needed."
            );
            return {
                ...state,
                trajectory: [...state.trajectory, "Task was marked as finished."],
                isFinished: true,
            };
        }

        if (isError) {
            console.error(
                "[Take Action] Error occurred, logging message:",
                message
            );
            return {
                ...state,
                // Count the failed attempt so repeated failures hit the step limit.
                step: state.step + 1,
                trajectory: [...state.trajectory, message],
            };
        }

        // Execute action
        console.log("[Take Action] Executing action:", message);
        try {
            await box.action.ai({
                instruction: message,
                background: "You are a QA engineer. You are testing the application.",
            });
        } catch (error: unknown) {
            // Record the failure instead of aborting the run, so the planner can
            // react to it and the finish node still gets to terminate the box.
            const reason = error instanceof Error ? error.message : String(error);
            console.error("[Take Action] Action failed:", reason);
            return {
                ...state,
                step: state.step + 1,
                trajectory: [
                    ...state.trajectory,
                    message,
                    `Action failed: ${reason}`,
                ],
            };
        }
        console.log("[Take Action] Action executed successfully.");

        return {
            ...state,
            step: state.step + 1,
            trajectory: [
                ...state.trajectory,
                message,
                "Action executed successfully.",
            ],
            // Executing one action does not complete the task; only the planner
            // decides that, on a later iteration.
            isFinished: false,
        };
    };
    ````
  </Step>

  <Step title="Implement Finish Node">
    Implement the `finish` node to summarize the agent's actions and clean up resources:

    ```typescript TypeScript expandable theme={null}
    /**
     * Final node of the graph: asks the model for a summary of the run based on
     * the trajectory and a last screenshot, then terminates the box.
     *
     * The box is terminated even when the screenshot or the summary request
     * fails, so a failing summary does not leave the box running.
     *
     * @param state - Final agent state; `env.client` and `env.box` must be set.
     * @returns The state with `summary` set to the model's summary, or to
     *          `state.error` when the run ended with an error.
     * @throws Whatever the screenshot, summary request or termination throws.
     */
    const finish = async (state: AgentState): Promise<Partial<AgentState>> => {
        if (!state.env || !state.env.client || !state.env.box) {
            console.error(
                "[Error] Missing required environment (client or box) for finishing."
            );
            return {
                ...state,
                error: "Missing required environment (client or box) for finishing.",
            };
        }
        const client = state.env.client;
        const box = state.env.box;

        // Summary of the task
        const userPrompt = `You are a senior QA engineer with expertise in application automation and testing. You need to finish the task: ${
            state.task
        }.
        Please provide a concise summary of the task and the final result (if any)

        # Actions taken
        ${state.trajectory.join("\n")}
        `;

        let summary: string;
        try {
            const screenshot = await box.action.screenshot({ outputFormat: "base64" });
            // `takeAction` hands the same `uri` to the model unchanged, so it may
            // already be a data URI; only add the prefix when it is missing.
            const imageUrl = screenshot.uri.startsWith("data:")
                ? screenshot.uri
                : `data:image/png;base64,${screenshot.uri}`;

            console.log("[Finish] Generating final summary...");

            const response = await client.responses.create({
                model: "o3",
                input: [
                    {
                        role: "user",
                        content: [
                            {
                                type: "input_text",
                                text: userPrompt,
                            },
                            {
                                type: "input_image",
                                image_url: imageUrl,
                                detail: "high",
                            },
                        ],
                    },
                ],
            });
            summary = response.output_text;
        } finally {
            // Release the box whether or not the summary could be generated.
            await box.terminate();
            console.log("[Finish] Box terminated.");
        }

        if (state.error.length > 0) {
            console.error("[Finish] Agent encountered an error.");
            summary = state.error || "An error occurred during the task.";
        } else {
            console.log("[Finish] Agent completed successfully.");
        }

        return {
            ...state,
            summary: summary.trim(),
        };
    };
    ```
  </Step>

  <Step title="Add Conditional Logic">
    Implement the `shouldContinue` function to determine if the agent should continue taking actions or finish:

    ```typescript TypeScript expandable theme={null}
    // Define the conditional routing logic
    /**
     * Router evaluated after every `takeAction` run.
     *
     * Sends the graph to `finish` when the state carries an error, when the task
     * is marked as finished, or when 25 steps have been used (checked in that
     * order); otherwise loops back to `takeAction`.
     *
     * @returns Either 'finish' or 'takeAction'.
     */
    function shouldContinue(state: AgentState): string {
        const failed = state.error.length > 0;

        if (failed || state.isFinished || state.step >= 25) {
            if (failed) {
                console.log("[Info] Task encountered an error, ending.");
                console.log(state.error);
            } else if (state.isFinished) {
                console.log("[Info] Task is finished, no further actions needed.");
            } else {
                console.log("[Info] Step limit reached, finishing.");
            }
            return "finish";
        }

        console.log("[Info] Continuing to take next action.");
        return "takeAction";
    }
    ```
  </Step>

  <Step title="Run the Agent">
    Add the following code to the end of `index.ts` to invoke the graph:

    ```typescript theme={null}
    // Example usage:
    // Wrapped in an async function because top-level `await` is only available
    // when the file is executed as an ES module.
    async function main(): Promise<void> {
        const finalState = await graph.invoke(
            {
                apk: "https://github.com/gsantner/markor/releases/download/v2.14.1/net.gsantner.markor-v158-2.14.1-flavorDefault-release.apk",
                task: "Test the Markor app functionality",
            },
            { recursionLimit: 100 }
        );

        console.log("[Final Result]\n", finalState.summary);
    }

    main().catch((error: unknown) => {
        // Report the failure and exit with a non-zero status instead of leaving
        // an unhandled promise rejection.
        console.error("[Fatal] Agent run failed:", error);
        process.exitCode = 1;
    });
    ```

    You can run the TypeScript file using `tsx`:

    ```bash theme={null}
    npx tsx index.ts
    ```

    Make sure to replace `index.ts` with the path to your TypeScript file if it's located elsewhere.
  </Step>
</Steps>
