> ## Documentation Index
> Fetch the complete documentation index at: https://docs.gbox.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Click

> Simulates a click action on the box.



## OpenAPI

````yaml post /boxes/{boxId}/actions/click
# OpenAPI 3.0 document header for the page describing the click endpoint.
openapi: 3.0.0
info:
  title: GBOX Open API
  description: GBOX Open API Documentation
  # Quoted so the version stays the string "1.0" instead of becoming a float.
  version: '1.0'
  contact: {}
servers:
  - url: https://gbox.ai/api/v1
    description: Production Server
# No document-wide security requirement; the click operation declares its own.
security: []
tags: []
paths:
  /boxes/{boxId}/actions/click:
    post:
      tags:
        - UI Action
      summary: Click
      description: Simulates a click action on the box.
      operationId: UIActionController_click
      # Single path parameter; it fills the {boxId} segment of the route.
      parameters:
        - name: boxId
          required: true
          in: path
          description: Box ID
          schema:
            example: c9bdc193-b54b-4ddb-a035-5ac0c598d32d
            type: string
      # Request body: exactly one of three click payload shapes, selected by
      # which fields are sent (coordinates, a natural-language target, or a
      # detected UI element).
      requestBody:
        required: true
        content:
          application/json:
            schema:
              oneOf:
                - $ref: '#/components/schemas/Click'
                - $ref: '#/components/schemas/ClickByNaturalLanguage'
                - $ref: '#/components/schemas/ClickByElement'
            examples:
              # Coordinate-based examples (Click schema). The 'y' key is quoted
              # so YAML 1.1 parsers keep it a string instead of boolean true.
              Basic:
                summary: Basic click
                value:
                  x: 100
                  'y': 100
              RightClick:
                summary: Right click
                value:
                  x: 100
                  'y': 100
                  button: right
              DoubleClick:
                summary: Double click
                value:
                  x: 100
                  'y': 100
                  double: true
              ModifierClick:
                summary: Ctrl + click
                value:
                  x: 100
                  'y': 100
                  modifierKeys:
                    - control
              # Natural-language example (ClickByNaturalLanguage schema).
              NaturalLanguage:
                summary: Click login button
                value:
                  target: login button
              # Element-based example (ClickByElement schema). The target
              # carries every required DetectedElement field, using the example
              # values declared on that schema.
              Element:
                summary: Click a detected element
                value:
                  target:
                    id: '1'
                    source: chromium
                    type: button
                    path: '#root > table > tbody > tr:nth-child(1) > td:nth-child(1) > div > button'
                    width: 100
                    height: 50
                    x: 100
                    'y': 100
                    centerX: 150
                    centerY: 125
                    label: Click me
      # Only the success response is documented for this operation.
      responses:
        '200':
          description: >-
            Click action executed successfully. The response includes the actual
            coordinates where the click was performed, which is especially
            useful when using natural language targeting.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ClickActionResult'
      # Requires the `bearer` scheme declared under components.securitySchemes.
      security:
        - bearer: []
      # SDK samples in JavaScript, Python, and Go. Each one creates an Android
      # box, clicks at (100, 100), then clicks the "login button" target by
      # natural language. The Go sample is a double-quoted scalar because its
      # source is tab-indented; it imports only the packages it uses, since Go
      # rejects unused imports at compile time.
      x-codeSamples:
        - lang: JavaScript
          source: |-
            import GboxSDK from "gbox-sdk";

            const gboxSDK = new GboxSDK({
              apiKey: process.env["GBOX_API_KEY"] // This is the default and can be omitted
            });

            async function main() {
              const box = await gboxSDK.create({ type: "android" });

              await box.action.click({
                x: 100,
                y: 100
              });

              // Natural language click
              await box.action.click({
                target: "login button"
              });
            }

            main();
        - lang: Python
          source: |-
            import os
            from gbox_sdk import GboxSDK


            def main():
                gbox_sdk = GboxSDK(api_key=os.environ["GBOX_API_KEY"])  # This is the default and can be omitted

                # Create Android box
                box = gbox_sdk.create(type="android")

                # Perform click action
                box.action.click(x=100, y=100)

                # Natural language click
                box.action.click(target="login button")


            if __name__ == "__main__":
                main()
        - lang: Go
          source: "package main\n\nimport (\n\t\"context\"\n\t\"log\"\n\t\"os\"\n\n\t\"github.com/gbox/gbox-sdk-go\"\n)\n\nfunc main() {\n\tgboxSDK := gbox.NewGboxSDK(os.Getenv(\"GBOX_API_KEY\"))\n\n\tbox, err := gboxSDK.Create(context.Background(), gbox.CreateBoxRequest{\n\t\tType: \"android\",\n\t})\n\tif err != nil {\n\t\tlog.Fatalf(\"Failed to create box: %v\", err)\n\t}\n\n\terr = box.Action.Click(context.Background(), gbox.ClickRequest{\n\t\tX: 100,\n\t\tY: 100,\n\t})\n\tif err != nil {\n\t\tlog.Fatalf(\"Failed to perform click action: %v\", err)\n\t}\n\n\t// Natural language click\n\terr = box.Action.Click(context.Background(), gbox.ClickRequest{\n\t\tTarget: \"login button\",\n\t})\n\tif err != nil {\n\t\tlog.Fatalf(\"Failed to perform click action: %v\", err)\n\t}\n}"
components:
  schemas:
    # Coordinate-based click payload, the first request-body variant. Only `x`
    # and `y` are required; button, double, modifierKeys and options are
    # optional.
    Click:
      type: object
      properties:
        button:
          type: string
          enum:
            - left
            - right
            - middle
          description: Mouse button to click
          default: left
          example: left
        double:
          type: boolean
          description: Whether to perform a double click
          default: false
          example: false
        modifierKeys:
          type: array
          description: >-
            Modifier keys to hold while performing the click (e.g., control,
            shift, alt). Supports the same key values as the pressKey action.
          example:
            - control
            - shift
          items:
            type: string
            # Despite the property name, the enum lists every keyboard key, not
            # only modifiers. 'n' and 'y' are quoted so YAML 1.1 parsers do not
            # read them as booleans, and the digits are quoted to stay strings.
            # The same list is repeated in the other click schemas.
            enum:
              - a
              - b
              - c
              - d
              - e
              - f
              - g
              - h
              - i
              - j
              - k
              - l
              - m
              - 'n'
              - o
              - p
              - q
              - r
              - s
              - t
              - u
              - v
              - w
              - x
              - 'y'
              - z
              - '0'
              - '1'
              - '2'
              - '3'
              - '4'
              - '5'
              - '6'
              - '7'
              - '8'
              - '9'
              - f1
              - f2
              - f3
              - f4
              - f5
              - f6
              - f7
              - f8
              - f9
              - f10
              - f11
              - f12
              - control
              - alt
              - shift
              - meta
              - win
              - cmd
              - option
              - arrowUp
              - arrowDown
              - arrowLeft
              - arrowRight
              - home
              - end
              - pageUp
              - pageDown
              - enter
              - space
              - tab
              - escape
              - backspace
              - delete
              - insert
              - capsLock
              - numLock
              - scrollLock
              - pause
              - printScreen
              - ;
              - '='
              - ','
              - '-'
              - .
              - /
              - '`'
              - '['
              - \
              - ']'
              - ''''
              - numpad0
              - numpad1
              - numpad2
              - numpad3
              - numpad4
              - numpad5
              - numpad6
              - numpad7
              - numpad8
              - numpad9
              - numpadAdd
              - numpadSubtract
              - numpadMultiply
              - numpadDivide
              - numpadDecimal
              - numpadEnter
              - numpadEqual
              - volumeUp
              - volumeDown
              - volumeMute
              - mediaPlayPause
              - mediaStop
              - mediaNextTrack
              - mediaPreviousTrack
        x:
          type: number
          description: X coordinate of the click
          example: 350
        'y':
          type: number
          description: Y coordinate of the click
          example: 250
        # The ActionCommonOptions reference is wrapped in allOf so this
        # property can carry its own description and example beside it.
        options:
          description: >-
            Action options. When `options.screenshot` is provided, ALL
            deprecated screenshot fields (outputFormat, presignedExpiresIn,
            screenshotDelay, screenshotRange, includeScreenshot) will be
            completely ignored.
          example:
            screenshot:
              outputFormat: base64
              presignedExpiresIn: 30m
              delay: 500ms
              phases:
                - before
                - after
          allOf:
            - $ref: '#/components/schemas/ActionCommonOptions'
      title: Click Action
      description: Mouse click action configuration
      required:
        - x
        - 'y'
    # Natural-language click payload, the second request-body variant. Only
    # `target` (a string) is required; no coordinates are sent.
    ClickByNaturalLanguage:
      type: object
      properties:
        button:
          type: string
          enum:
            - left
            - right
            - middle
          description: Mouse button to click
          default: left
          example: left
        double:
          type: boolean
          description: Whether to perform a double click
          default: false
          example: false
        modifierKeys:
          type: array
          description: >-
            Modifier keys to hold while performing the click (e.g., control,
            shift, alt). Supports the same key values as the pressKey action.
          example:
            - control
            - shift
          items:
            type: string
            # Despite the property name, the enum lists every keyboard key, not
            # only modifiers. 'n' and 'y' are quoted so YAML 1.1 parsers do not
            # read them as booleans, and the digits are quoted to stay strings.
            enum:
              - a
              - b
              - c
              - d
              - e
              - f
              - g
              - h
              - i
              - j
              - k
              - l
              - m
              - 'n'
              - o
              - p
              - q
              - r
              - s
              - t
              - u
              - v
              - w
              - x
              - 'y'
              - z
              - '0'
              - '1'
              - '2'
              - '3'
              - '4'
              - '5'
              - '6'
              - '7'
              - '8'
              - '9'
              - f1
              - f2
              - f3
              - f4
              - f5
              - f6
              - f7
              - f8
              - f9
              - f10
              - f11
              - f12
              - control
              - alt
              - shift
              - meta
              - win
              - cmd
              - option
              - arrowUp
              - arrowDown
              - arrowLeft
              - arrowRight
              - home
              - end
              - pageUp
              - pageDown
              - enter
              - space
              - tab
              - escape
              - backspace
              - delete
              - insert
              - capsLock
              - numLock
              - scrollLock
              - pause
              - printScreen
              - ;
              - '='
              - ','
              - '-'
              - .
              - /
              - '`'
              - '['
              - \
              - ']'
              - ''''
              - numpad0
              - numpad1
              - numpad2
              - numpad3
              - numpad4
              - numpad5
              - numpad6
              - numpad7
              - numpad8
              - numpad9
              - numpadAdd
              - numpadSubtract
              - numpadMultiply
              - numpadDivide
              - numpadDecimal
              - numpadEnter
              - numpadEqual
              - volumeUp
              - volumeDown
              - volumeMute
              - mediaPlayPause
              - mediaStop
              - mediaNextTrack
              - mediaPreviousTrack
        # Free-text description of what to click. ClickByElement uses the same
        # property name but expects a DetectedElement object instead.
        target:
          type: string
          description: >-
            Describe the target to operate using natural language, e.g., 'login
            button' or 'Chrome'.
          example: login button
        options:
          description: >-
            Action options. When `options.screenshot` is provided, ALL
            deprecated screenshot fields (outputFormat, presignedExpiresIn,
            screenshotDelay, screenshotRange, includeScreenshot) will be
            completely ignored.
          example:
            screenshot:
              outputFormat: base64
              presignedExpiresIn: 30m
              delay: 500ms
              phases:
                - before
                - after
          allOf:
            - $ref: '#/components/schemas/ActionCommonOptions'
      title: Click Action with Natural Language
      description: Click action configuration with natural language
      required:
        - target
    # Element-based click payload, the third request-body variant. Only
    # `target` is required, and here it is a DetectedElement object rather
    # than a string.
    ClickByElement:
      type: object
      properties:
        button:
          type: string
          enum:
            - left
            - right
            - middle
          description: Mouse button to click
          default: left
          example: left
        double:
          type: boolean
          description: Whether to perform a double click
          default: false
          example: false
        modifierKeys:
          type: array
          description: >-
            Modifier keys to hold while performing the click (e.g., control,
            shift, alt). Supports the same key values as the pressKey action.
          example:
            - control
            - shift
          items:
            type: string
            # Despite the property name, the enum lists every keyboard key, not
            # only modifiers. 'n' and 'y' are quoted so YAML 1.1 parsers do not
            # read them as booleans, and the digits are quoted to stay strings.
            enum:
              - a
              - b
              - c
              - d
              - e
              - f
              - g
              - h
              - i
              - j
              - k
              - l
              - m
              - 'n'
              - o
              - p
              - q
              - r
              - s
              - t
              - u
              - v
              - w
              - x
              - 'y'
              - z
              - '0'
              - '1'
              - '2'
              - '3'
              - '4'
              - '5'
              - '6'
              - '7'
              - '8'
              - '9'
              - f1
              - f2
              - f3
              - f4
              - f5
              - f6
              - f7
              - f8
              - f9
              - f10
              - f11
              - f12
              - control
              - alt
              - shift
              - meta
              - win
              - cmd
              - option
              - arrowUp
              - arrowDown
              - arrowLeft
              - arrowRight
              - home
              - end
              - pageUp
              - pageDown
              - enter
              - space
              - tab
              - escape
              - backspace
              - delete
              - insert
              - capsLock
              - numLock
              - scrollLock
              - pause
              - printScreen
              - ;
              - '='
              - ','
              - '-'
              - .
              - /
              - '`'
              - '['
              - \
              - ']'
              - ''''
              - numpad0
              - numpad1
              - numpad2
              - numpad3
              - numpad4
              - numpad5
              - numpad6
              - numpad7
              - numpad8
              - numpad9
              - numpadAdd
              - numpadSubtract
              - numpadMultiply
              - numpadDivide
              - numpadDecimal
              - numpadEnter
              - numpadEqual
              - volumeUp
              - volumeDown
              - volumeMute
              - mediaPlayPause
              - mediaStop
              - mediaNextTrack
              - mediaPreviousTrack
        # The DetectedElement reference is wrapped in allOf so the property can
        # carry its own description beside it.
        target:
          description: The element to click
          allOf:
            - $ref: '#/components/schemas/DetectedElement'
        options:
          description: >-
            Action options. When `options.screenshot` is provided, ALL
            deprecated screenshot fields (outputFormat, presignedExpiresIn,
            screenshotDelay, screenshotRange, includeScreenshot) will be
            completely ignored.
          example:
            screenshot:
              outputFormat: base64
              presignedExpiresIn: 30m
              delay: 500ms
              phases:
                - before
                - after
          allOf:
            - $ref: '#/components/schemas/ActionCommonOptions'
      title: Click Action by Element
      description: Click action configuration by element
      required:
        - target
    # Body of the 200 response. `message`, `actionId` and `actual` are
    # required; `screenshot` is the only optional property.
    ClickActionResult:
      type: object
      properties:
        message:
          type: string
          description: message
          example: Action executed successfully
        actionId:
          type: string
          description: >-
            Unique identifier for each action. Use this ID to locate the action
            and report issues.
          example: c9bdc193-b54b-4ddb-a035-5ac0c598d32d
        screenshot:
          description: >-
            Optional screenshot data. Only present when screenshots are
            requested via options.screenshot.phases or deprecated fields
          example:
            trace:
              uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
            before:
              uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
            after:
              uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
          allOf:
            - $ref: '#/components/schemas/ActionResultScreenshot'
        actual:
          description: >-
            Actual parameters used when executing the click action, including
            coordinates (x, y), button type (left/right/middle), modifier keys
            (control/shift/alt), and whether it was a double click. Field names
            match the input parameters.
          example:
            x: 350
            'y': 250
            button: left
            double: false
            modifierKeys:
              - control
          allOf:
            - $ref: '#/components/schemas/ClickActionActual'
      title: Click Action Result
      description: >-
        Result of click action execution with actual parameters used. The actual
        field shows the exact parameters used when performing the click, which
        is especially useful when using natural language or element-based
        targeting to understand exactly what action was performed.
      required:
        - message
        - actionId
        - actual
    # Options shared by the click payloads through their `options` property.
    # Neither property is required.
    ActionCommonOptions:
      type: object
      properties:
        # Accepts either a full ActionScreenshotOptions object or a boolean.
        # NOTE(review): OpenAPI 3.0 tooling ignores keys placed beside a $ref,
        # so the object example under oneOf may not be rendered; confirm
        # whether it should be wrapped in allOf like the other references.
        screenshot:
          description: >-
            Screenshot options. Can be a boolean to enable/disable screenshots,
            or an object to configure screenshot options.
          oneOf:
            - $ref: '#/components/schemas/ActionScreenshotOptions'
              example:
                outputFormat: base64
                presignedExpiresIn: 30m
                delay: 500ms
                phases:
                  - before
                  - after
            - type: boolean
              example: true
        # The description names the same default as the `default` keyword and
        # uses a value that is a member of the enum.
        # NOTE(review): confirm 'gelato' is the default the server applies.
        model:
          type: string
          description: >-
            Model to use for natural-language target resolution. Defaults to
            'gelato'.
          enum:
            - gpt-5
            - gpt-4o
            - gelato
            - ui-tars
            - openai-computer-use
          default: gelato
      title: Action Common Options
      description: Action common options
    # UI element descriptor accepted as `target` by ClickByElement. Every
    # property listed here is required. The example values are mutually
    # consistent: centerX = x + width / 2 and centerY = y + height / 2.
    DetectedElement:
      type: object
      properties:
        id:
          type: string
          description: Element id
          # Quoted so the example stays a string, matching `type: string`.
          example: '1'
        source:
          type: string
          description: Element source
          example: chromium
        type:
          type: string
          description: Element type
          example: button
        path:
          type: string
          description: Element path
          # Folded scalar: the two lines below join into one CSS-selector-style
          # string separated by a single space.
          example: >-
            #root > table > tbody > tr:nth-child(1) > td:nth-child(1) > div >
            button
        width:
          type: number
          description: Element width
          example: 100
        height:
          type: number
          description: Element height
          example: 50
        x:
          type: number
          description: Element x coordinate relative to screen
          example: 100
        'y':
          type: number
          description: Element y coordinate relative to screen
          example: 100
        centerX:
          type: number
          description: Element center x coordinate relative to screen
          example: 150
        centerY:
          type: number
          description: Element center y coordinate relative to screen
          example: 125
        label:
          type: string
          description: >-
            A human-readable identifier generated from the element's visible
            attributes to help understand what this element represents.

            For images, it uses alt text or filename; for links, it uses text
            content or href; for buttons, it uses text content or aria-label;
            for inputs, it uses placeholder or value; etc.
          example: Click me
      title: Detected Element
      description: Detected UI element
      required:
        - id
        - source
        - type
        - path
        - width
        - height
        - x
        - 'y'
        - centerX
        - centerY
        - label
    # Screenshot bundle referenced by ClickActionResult.screenshot. No property
    # is required, so any of the three phases may be absent. Each phase wraps
    # its own schema in allOf to attach a description and example.
    ActionResultScreenshot:
      type: object
      properties:
        trace:
          description: URI of the screenshot before the action with operation trace
          example:
            uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
          allOf:
            - $ref: '#/components/schemas/ActionResultOperationTrace'
        before:
          description: URI of the screenshot before the action
          example:
            uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
          allOf:
            - $ref: '#/components/schemas/ActionResultScreenshotBefore'
        after:
          description: URI of the screenshot after the action
          example:
            uri: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
          allOf:
            - $ref: '#/components/schemas/ActionResultScreenshotAfter'
      title: Action Result Screenshot
      description: Complete screenshot result with operation trace, before and after images
    # Parameters reported back in ClickActionResult.actual. `x`, `y`, `button`
    # and `double` are required; `modifierKeys` is optional. Unlike the request
    # schemas, `button` and `double` declare no defaults here.
    ClickActionActual:
      type: object
      properties:
        x:
          type: number
          description: X coordinate where the click was executed
          example: 350
        'y':
          type: number
          description: Y coordinate where the click was executed
          example: 250
        button:
          type: string
          description: Mouse button that was clicked
          enum:
            - left
            - right
            - middle
          example: left
        double:
          type: boolean
          description: Whether a double click was performed
          example: false
        modifierKeys:
          type: array
          description: >-
            Modifier keys that were pressed during the click (e.g., control,
            shift, alt). Matches the KeyboardKey enum used by pressKey action.
          example:
            - control
          items:
            type: string
            # Same key list as the request schemas. 'n' and 'y' are quoted so
            # YAML 1.1 parsers do not read them as booleans, and the digits are
            # quoted to stay strings.
            enum:
              - a
              - b
              - c
              - d
              - e
              - f
              - g
              - h
              - i
              - j
              - k
              - l
              - m
              - 'n'
              - o
              - p
              - q
              - r
              - s
              - t
              - u
              - v
              - w
              - x
              - 'y'
              - z
              - '0'
              - '1'
              - '2'
              - '3'
              - '4'
              - '5'
              - '6'
              - '7'
              - '8'
              - '9'
              - f1
              - f2
              - f3
              - f4
              - f5
              - f6
              - f7
              - f8
              - f9
              - f10
              - f11
              - f12
              - control
              - alt
              - shift
              - meta
              - win
              - cmd
              - option
              - arrowUp
              - arrowDown
              - arrowLeft
              - arrowRight
              - home
              - end
              - pageUp
              - pageDown
              - enter
              - space
              - tab
              - escape
              - backspace
              - delete
              - insert
              - capsLock
              - numLock
              - scrollLock
              - pause
              - printScreen
              - ;
              - '='
              - ','
              - '-'
              - .
              - /
              - '`'
              - '['
              - \
              - ']'
              - ''''
              - numpad0
              - numpad1
              - numpad2
              - numpad3
              - numpad4
              - numpad5
              - numpad6
              - numpad7
              - numpad8
              - numpad9
              - numpadAdd
              - numpadSubtract
              - numpadMultiply
              - numpadDivide
              - numpadDecimal
              - numpadEnter
              - numpadEqual
              - volumeUp
              - volumeDown
              - volumeMute
              - mediaPlayPause
              - mediaStop
              - mediaNextTrack
              - mediaPreviousTrack
      title: Click Action Actual Parameters
      description: >-
        Actual parameters used when executing the click action, with the same
        field names as input parameters
      required:
        - x
        - 'y'
        - button
        - double
    # Object form of ActionCommonOptions.screenshot. No property is required
    # and every property declares a default. In the folded descriptions below,
    # a blank line produces a line break in the resulting text.
    ActionScreenshotOptions:
      type: object
      properties:
        outputFormat:
          type: string
          enum:
            - base64
            - storageKey
          description: Type of the URI. default is base64.
          default: base64
          example: base64
        presignedExpiresIn:
          type: string
          description: >-
            Presigned url expires in. Only takes effect when outputFormat is
            storageKey.


            Supported time units: ms (milliseconds), s (seconds), m (minutes), h
            (hours)

            Example formats: "500ms", "30s", "5m", "1h"

            Default: 30m
          example: 30m
          default: 30m
          title: PresignedExpiresIn
        # Step 3 of the description refers to this property by its own name,
        # `delay`; `screenshotDelay` is the deprecated top-level field.
        delay:
          type: string
          description: >-
            Delay after performing the action, before taking the final
            screenshot.


            Execution flow:

            1. Take screenshot before action

            2. Perform the action

            3. Wait for `delay` (this parameter)

            4. Take screenshot after action


            Example: '500ms' means wait 500ms after the action before capturing
            the final screenshot.


            Supported time units: ms (milliseconds), s (seconds), m (minutes), h
            (hours)

            Example formats: "500ms", "30s", "5m", "1h"

            Default: 500ms

            Maximum allowed: 30s
          example: 500ms
          default: 500ms
          title: Delay
        phases:
          type: array
          description: >-
            Specify which screenshot phases to capture.


            Available options:

            - before: Screenshot before the action

            - after: Screenshot after the action

            - trace: Screenshot with operation trace


            Default captures all three phases. Can specify one or multiple in an
            array.

            If empty array is provided, no screenshots will be taken.
          default:
            - before
            - after
            - trace
          example:
            - before
            - after
          items:
            type: string
            enum:
              - before
              - after
              - trace
      title: Action Screenshot Options
      description: Action screenshot options
    # Trace-phase screenshot. It exposes only the required `uri`; unlike the
    # before/after schemas, it declares no `presignedUrl`.
    ActionResultOperationTrace:
      type: object
      properties:
        uri:
          type: string
          description: URI of the screenshot with operation trace
          example: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
      title: Action Result Screenshot Operation Trace
      description: Screenshot with action operation trace
      required:
        - uri
    # Before-phase screenshot. `uri` is required; `presignedUrl` is optional.
    # NOTE(review): presumably `presignedUrl` is only returned when
    # outputFormat is storageKey; confirm against the server.
    ActionResultScreenshotBefore:
      type: object
      properties:
        uri:
          type: string
          description: URI of the screenshot before the action
          example: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
        presignedUrl:
          type: string
          description: Presigned url of the screenshot before the action
          example: https://example.com/xxxxx/xxxxx/xxxxx
      title: Action Result Screenshot Before
      description: Screenshot taken before action execution
      required:
        - uri
    # After-phase screenshot. `uri` is required; `presignedUrl` is optional.
    # Both descriptions refer to the screenshot taken after the action.
    ActionResultScreenshotAfter:
      type: object
      properties:
        uri:
          type: string
          description: URI of the screenshot after the action
          example: data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...
        presignedUrl:
          type: string
          description: Presigned url of the screenshot after the action
          example: https://example.com/xxxxx/xxxxx/xxxxx
      title: Action Result Screenshot After
      description: Screenshot taken after action execution
      required:
        - uri
  securitySchemes:
    # HTTP bearer scheme referenced by the click operation's `security` entry.
    bearer:
      scheme: bearer
      bearerFormat: JWT
      type: http
      description: >-
        Enter your API Key in the format: Bearer <token>. Get it from
        https://gbox.ai

````