> ## Documentation Index
> Fetch the complete documentation index at: https://docs.gbox.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Detect UI elements

> Detect and identify interactive UI elements in the current screen. Note: This feature currently only supports element detection within a running browser. If the browser is not running, the `elements` array will be empty.



## OpenAPI

````yaml post /boxes/{boxId}/actions/elements/detect
# OpenAPI 3.0 document describing the GBOX "Detect UI elements" endpoint.
openapi: 3.0.0
info:
  title: GBOX Open API
  version: '1.0'
  description: GBOX Open API Documentation
  contact: {}
# Single production base URL; operation paths are resolved against it.
servers:
  - description: Production Server
    url: https://gbox.ai/api/v1
# No document-wide security requirement; the operation declares its own.
security: []
tags: []
paths:
  /boxes/{boxId}/actions/elements/detect:
    # POST: run UI element detection for the box identified by boxId.
    post:
      tags:
        - UI Action
      summary: Detect UI elements
      description: >-
        Detect and identify interactive UI elements in the current screen. Note:
        This feature currently only supports element detection within a running
        browser. If the browser is not running, the `elements` array will be
        empty.
      operationId: UIActionController_detectElements
      parameters:
        - name: boxId
          required: true
          in: path
          description: Box ID
          schema:
            example: c9bdc193-b54b-4ddb-a035-5ac0c598d32d
            type: string
      # The body must be present, but DetectElements declares no required
      # properties.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/DetectElements'
      responses:
        '200':
          # OpenAPI requires a response description; keep it in line with the
          # DetectedElementsResult schema.
          description: >-
            Detection result: the source and marked screenshots plus the list
            of detected UI elements.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/DetectedElementsResult'
      # Uses the "bearer" scheme declared under components.securitySchemes.
      security:
        - bearer: []
      # Vendor extension: per-language SDK snippets for this operation.
      x-codeSamples:
        # JavaScript sample: creates a linux box, opens https://gbox.ai in a
        # browser tab, runs detection with storageKey screenshots, logs the
        # result, then clicks the element whose id is "1".
        - lang: JavaScript
          source: |-
            import GboxSDK from "gbox-sdk";

            const gboxSDK = new GboxSDK({
              apiKey: process.env["GBOX_API_KEY"] // This is the default and can be omitted
            });

            async function main() {
              const box = await gboxSDK.create({ type: "linux" });

              await box.browser.openTab({
                url: "https://gbox.ai",
              });

              const { screenshot, elements } = await box.action.elements.detect({
                screenshot: {
                  outputFormat: 'storageKey'
                }
              });

              console.info(`Screenshot: ${JSON.stringify(screenshot, null, 2)}`);

              console.info(`Detected elements length: ${elements.list().length}`);

              // You can send the screenshot to an LLM or Agent to decide which element to click
              const firstElement = elements.get("1");
              await box.action.click({
                // here we just click the first element
                target: firstElement,
              });

              console.info(
                `Clicked element: ${JSON.stringify(firstElement, null, 2)}`
              );
            }

            main();
        # Python sample: same flow as the JavaScript sample. The open_tab
        # result is not bound because the sample never reads it.
        - lang: Python
          source: |-
            import os
            import json
            from gbox_sdk import GboxSDK


            def main():
                gbox_sdk = GboxSDK(api_key=os.environ["GBOX_API_KEY"])  # This is the default and can be omitted

                box = gbox_sdk.create(type="linux")

                box.browser.open_tab(url="https://gbox.ai")

                res = box.action.elements.detect(
                    screenshot={
                        "output_format": "storageKey"
                    }
                )

                print(f"Screenshot: {json.dumps(res.screenshot, indent=2)}")

                print(f"Detected elements length: {len(res.elements.list())}")

                # You can send the screenshot to an LLM or Agent to decide which element to click
                first_element = res.elements.get("1")
                box.action.click(target=first_element)

                print(f"Clicked element: {json.dumps(first_element, indent=2)}")


            if __name__ == "__main__":
                main()
        # Go sample: same flow as the other samples. The OpenTab result is
        # discarded with `_` because Go rejects declared-but-unused variables;
        # `err` is reused with `=` since it is already declared above.
        - lang: Go
          source: "package main\n\nimport (\n\t\"context\"\n\t\"encoding/json\"\n\t\"fmt\"\n\t\"log\"\n\t\"os\"\n\n\t\"github.com/gbox/gbox-sdk-go\"\n)\n\nfunc main() {\n\tgboxSDK := gbox.NewGboxSDK(os.Getenv(\"GBOX_API_KEY\"))\n\n\tbox, err := gboxSDK.Create(context.Background(), gbox.CreateBoxRequest{\n\t\tType: \"linux\",\n\t})\n\tif err != nil {\n\t\tlog.Fatalf(\"Failed to create box: %v\", err)\n\t}\n\n\t_, err = box.Browser.OpenTab(context.Background(), gbox.OpenTabRequest{\n\t\tURL: \"https://gbox.ai\",\n\t})\n\tif err != nil {\n\t\tlog.Fatalf(\"Failed to open tab: %v\", err)\n\t}\n\n\tscreenshot, elements, err := box.Action.Elements.Detect(context.Background(), gbox.DetectElementsRequest{\n\t\tScreenshot: &gbox.ScreenshotConfig{\n\t\t\tOutputFormat: \"storageKey\",\n\t\t},\n\t})\n\tif err != nil {\n\t\tlog.Fatalf(\"Failed to detect elements: %v\", err)\n\t}\n\n\tscreenshotJSON, _ := json.MarshalIndent(screenshot, \"\", \"  \")\n\tfmt.Printf(\"Screenshot: %s\\n\", screenshotJSON)\n\n\tfmt.Printf(\"Detected elements length: %d\\n\", len(elements.List()))\n\n\t// You can send the screenshot to an LLM or Agent to decide which element to click\n\tfirstElement, exists := elements.Get(\"1\")\n\tif !exists {\n\t\tlog.Fatalf(\"Element '1' not found\")\n\t}\n\n\terr = box.Action.Click(context.Background(), gbox.ClickRequest{\n\t\tTarget: firstElement,\n\t})\n\tif err != nil {\n\t\tlog.Fatalf(\"Failed to click element: %v\", err)\n\t}\n\n\telementJSON, _ := json.MarshalIndent(firstElement, \"\", \"  \")\n\tfmt.Printf(\"Clicked element: %s\\n\", elementJSON)\n}"
components:
  schemas:
    # Request body of the detect operation. No property is listed as
    # required, so an empty object satisfies this schema.
    DetectElements:
      title: Detect Elements Action
      description: Detect UI elements action configuration
      type: object
      properties:
        screenshot:
          description: Screenshot options
          # Single-entry allOf: wraps the $ref so that description, default
          # and example can sit next to it.
          allOf:
            - $ref: '#/components/schemas/DetectElementsScreenshotOptions'
          # Same values as the per-field defaults declared in
          # DetectElementsScreenshotOptions.
          default:
            outputFormat: base64
            presignedExpiresIn: 30m
          example:
            outputFormat: base64
            presignedExpiresIn: 30m
    # Response body of the detect operation; both properties are required.
    DetectedElementsResult:
      title: Detected Elements Result
      description: >-
        Result containing original screenshot, annotated screenshot, and
        detected elements
      type: object
      required:
        - screenshot
        - elements
      properties:
        screenshot:
          description: Detected elements screenshot
          allOf:
            - $ref: '#/components/schemas/DetectedElementsScreenshot'
        elements:
          description: Detected UI elements
          type: array
          items:
            $ref: '#/components/schemas/DetectedElement'
    # Options controlling how the screenshots in the detection result are
    # returned. Both properties are optional and declare defaults.
    DetectElementsScreenshotOptions:
      type: object
      properties:
        outputFormat:
          type: string
          enum:
            - base64
            - storageKey
          description: >-
            Output format of the returned screenshot URI. Defaults to base64.
          default: base64
          example: base64
        presignedExpiresIn:
          type: string
          # Duration string: a number followed by one of the units listed in
          # the description.
          description: >-
            Expiration time of the presigned URL. Only takes effect when
            outputFormat is storageKey.


            Supported time units: ms (milliseconds), s (seconds), m (minutes), h
            (hours)

            Example formats: "500ms", "30s", "5m", "1h"

            Default: 30m
          example: 30m
          default: 30m
          title: PresignedExpiresIn
      title: Detect Elements Screenshot Options
      description: Detect elements screenshot options
    # Pair of screenshots returned with a detection result: the unmodified
    # capture and a copy with the detected elements highlighted.
    DetectedElementsScreenshot:
      title: Detected Elements Screenshot
      description: Detected elements screenshot
      type: object
      required:
        - source
        - marked
      properties:
        source:
          description: Source screenshot
          allOf:
            - $ref: '#/components/schemas/ScreenshotResult'
        marked:
          description: Marked screenshot with detected elements highlighted
          allOf:
            - $ref: '#/components/schemas/ScreenshotResult'
    # One detected UI element: identity, DOM location, on-screen geometry and
    # a human-readable label. Every property is required.
    DetectedElement:
      title: Detected Element
      description: Detected UI element
      type: object
      required:
        - id
        - source
        - type
        - path
        - width
        - height
        - x
        - 'y'
        - centerX
        - centerY
        - label
      properties:
        id:
          description: Element id
          type: string
          example: '1'
        source:
          description: Element source
          type: string
          example: chromium
        type:
          description: Element type
          type: string
          example: button
        path:
          description: Element path
          type: string
          example: >-
            #root > table > tbody > tr:nth-child(1) > td:nth-child(1) > div >
            button
        width:
          description: Element width
          type: number
          example: 100
        height:
          description: Element height
          type: number
          example: 50
        x:
          description: Element x coordinate relative to screen
          type: number
          example: 100
        # The key is quoted because a bare y is read as a boolean by
        # YAML 1.1 parsers.
        'y':
          description: Element y coordinate relative to screen
          type: number
          example: 100
        # NOTE(review): the example centers equal x + width / 2 and
        # y + height / 2; presumably that is how they are derived — confirm.
        centerX:
          description: Element center x coordinate relative to screen
          type: number
          example: 150
        centerY:
          description: Element center y coordinate relative to screen
          type: number
          example: 125
        label:
          description: >-
            A human-readable identifier generated from the element's visible
            attributes to help understand what this element represents.

            For images, it uses alt text or filename; for links, it uses text
            content or href; for buttons, it uses text content or aria-label;
            for inputs, it uses placeholder or value; etc.
          type: string
          example: Click me
    # A single captured screenshot. Only uri is required; presignedUrl is
    # optional.
    ScreenshotResult:
      type: object
      properties:
        uri:
          type: string
          # "URI" rather than "URL": the example is a data URI, and the
          # request's outputFormat enum offers base64 or storageKey.
          description: >-
            URI of the screenshot; its form (base64 data URI or storage key)
            follows the requested outputFormat.
          example: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
        presignedUrl:
          type: string
          # NOTE(review): presumably only returned when outputFormat is
          # storageKey — confirm against the server implementation.
          description: Presigned URL of the screenshot
          example: https://example.com/xxxxx/xxxxx/xxxxx
      title: Screenshot Result
      description: Result of screenshot capture action
      required:
        - uri
  securitySchemes:
    # HTTP bearer authentication, referenced by the operation's security list.
    bearer:
      type: http
      scheme: bearer
      bearerFormat: JWT
      description: >-
        Enter your API Key in the format: Bearer <token>. Get it from
        https://gbox.ai

````