Claude Computer Use Preview: AI That Can Control Your Desktop
Anthropic has announced a groundbreaking capability: Claude can now use computers the way a human does. The Computer Use preview lets Claude see the screen through screenshots, move the mouse, type on the keyboard, and interact with any desktop application.
What is Computer Use?
Computer Use allows Claude to interact with graphical user interfaces through:
- Screen capture: Claude can see what’s on screen
- Mouse control: Move, click, drag, and scroll
- Keyboard input: Type text and use keyboard shortcuts
- Application interaction: Work with any desktop application
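Here is a minimal request that opts into the beta and declares the three tools (model and beta identifiers as of the October 2024 preview):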
import anthropic
client = anthropic.Anthropic()
# Computer Use requires specific tool definitions
tools = [
    {
        "type": "computer_20241022",
        "name": "computer",
        "display_width_px": 1920,
        "display_height_px": 1080,
        "display_number": 0
    },
    {
        "type": "text_editor_20241022",
        "name": "str_replace_editor"
    },
    {
        "type": "bash_20241022",
        "name": "bash"
    }
]

response = client.beta.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=4096,
    tools=tools,
    messages=[
        {
            "role": "user",
            "content": "Open Firefox and search for 'Anthropic Claude' on Google"
        }
    ],
    betas=["computer-use-2024-10-22"]
)
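Claude responds with tool_use content blocks describing the actions it wants performed. The API never touches your machine directly: your code is responsible for executing each action and reporting the result back.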
The Tool Definitions
# Computer tool for GUI interaction
computer_tool = {
    "type": "computer_20241022",
    "name": "computer",
    "display_width_px": 1920,
    "display_height_px": 1080,
    "display_number": 0  # X11 display number (not the monitor index)
}
# Actions available via the "action" field of the tool input:
# - screenshot, cursor_position
# - mouse_move, left_click, right_click, middle_click, double_click
#   (coordinates are passed as {"coordinate": [x, y]})
# - left_click_drag
# - type  (text to enter)
# - key   (X11-style key names, e.g. "Return", "ctrl+c")
# Scrolling and further actions arrive in later tool versions.
# Text editor tool for file editing
text_editor_tool = {
    "type": "text_editor_20241022",
    "name": "str_replace_editor"
}

# Bash tool for command execution
bash_tool = {
    "type": "bash_20241022",
    "name": "bash"
}
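Note that these definitions only tell Claude what is available; executing them is your job. For reference, this is roughly the shape of a tool_use block Claude sends back (the id value here is invented for the example):

# Illustrative tool_use block from Claude (id is made up)
example_tool_use = {
    "type": "tool_use",
    "id": "toolu_01...",  # needed later for the matching tool_result
    "name": "computer",
    "input": {"action": "mouse_move", "coordinate": [400, 300]}
}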
Handling Computer Use Responses
def handle_computer_use(response) -> dict:
    """Parse Claude's response into a list of requested tool actions."""
    result = {
        "actions": [],
        "complete": True  # stays True only if Claude requests no more tools
    }

    for block in response.content:
        if block.type == "tool_use":
            result["complete"] = False
            if block.name == "computer":
                result["actions"].append({
                    "tool": "computer",
                    "id": block.id,  # needed for the matching tool_result
                    "action": block.input.get("action"),
                    "params": block.input
                })
            elif block.name == "str_replace_editor":
                result["actions"].append({
                    "tool": "editor",
                    "id": block.id,
                    "command": block.input.get("command"),
                    "params": block.input
                })
            elif block.name == "bash":
                result["actions"].append({
                    "tool": "bash",
                    "id": block.id,
                    "command": block.input.get("command")
                })

    # A response with no tool_use blocks (stop_reason == "end_turn") means
    # Claude considers the task finished; keyword-matching on the text
    # ("done", "completed") is far less reliable.
    return result
Building a Computer Use Loop
import base64
import io
from PIL import ImageGrab

def capture_screenshot() -> str:
    """Capture the screen and return it base64-encoded as PNG."""
    screenshot = ImageGrab.grab()  # needs a display: X11, macOS, or Windows
    buffer = io.BytesIO()
    screenshot.save(buffer, format="PNG")
    return base64.standard_b64encode(buffer.getvalue()).decode()
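One practical note: full-resolution screenshots cost tokens, so it can pay to downscale before sending. A minimal sketch, assuming PIL and a target width of 1280px (a choice, not an API requirement):

from PIL import Image

def resize_screenshot(img: Image.Image, max_width: int = 1280) -> Image.Image:
    """Downscale wide screenshots; 1280px is an assumed target, not a limit."""
    if img.width > max_width:
        ratio = max_width / img.width
        img = img.resize((max_width, int(img.height * ratio)), Image.LANCZOS)
    return img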
def execute_computer_action(action: dict):
    """Execute a single computer action with pyautogui."""
    import pyautogui

    if action["action"] == "mouse_move":
        pyautogui.moveTo(*action["coordinate"])
    elif action["action"] == "left_click":
        if "coordinate" in action:
            pyautogui.click(*action["coordinate"])
        else:
            pyautogui.click()  # click at the current cursor position
    elif action["action"] == "type":
        pyautogui.write(action["text"])
    elif action["action"] == "key":
        # Claude sends X11-style names ("Return", "ctrl+c"); a production
        # harness should map these onto pyautogui's key names ("enter").
        pyautogui.hotkey(*action["text"].lower().split("+"))
    elif action["action"] == "screenshot":
        return capture_screenshot()
    elif action["action"] == "scroll":
        # "scroll_direction"/"scroll_amount" follow the newer tool versions;
        # anything other than "up" is treated as scrolling down.
        clicks = action.get("scroll_amount", 3)
        if action.get("scroll_direction", "down") != "up":
            clicks = -clicks
        pyautogui.scroll(clicks)
def computer_use_loop(task: str, max_iterations: int = 20):
    """Run the agent loop until Claude stops requesting tool actions."""
    messages = [{"role": "user", "content": task}]

    for iteration in range(max_iterations):
        response = client.beta.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4096,
            tools=tools,
            messages=messages,
            betas=["computer-use-2024-10-22"]
        )
        messages.append({"role": "assistant", "content": response.content})

        result = handle_computer_use(response)
        if result["complete"]:
            print(f"Task completed in {iteration + 1} iterations")
            return True

        # Every tool_use block must be answered with a tool_result block
        # in the next user message; computer actions get a fresh screenshot
        # so Claude can see the outcome.
        tool_results = []
        for action in result["actions"]:
            if action["tool"] == "computer":
                execute_computer_action(action["params"])
                content = [{
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": capture_screenshot()
                    }
                }]
            else:
                # bash/editor execution is omitted in this sketch
                content = "tool not implemented in this demo"
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": action["id"],
                "content": content
            })
        messages.append({"role": "user", "content": tool_results})

    print("Max iterations reached")
    return False
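Kicking it off with the same task as the first example:

computer_use_loop("Open Firefox and search for 'Anthropic Claude' on Google")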
Safety Considerations
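Handing an AI the mouse and keyboard calls for guardrails. Anthropic recommends running Computer Use in a dedicated VM or container; on top of that, a simple client-side filter can veto obviously dangerous actions: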
class ComputerUseSafety:
    """Safety controls for computer use"""

    BLOCKED_ACTIONS = [
        "format",
        "delete system",
        "rm -rf",
        "shutdown",
        "reboot"
    ]

    SENSITIVE_AREAS = [
        {"name": "system_preferences", "x_range": (0, 100), "y_range": (0, 50)},
        {"name": "dock", "x_range": (0, 1920), "y_range": (1030, 1080)}
    ]

    @classmethod
    def check_action_safe(cls, action: dict) -> tuple[bool, str]:
        """Check if an action is safe to execute"""
        # Check for blocked commands (naive substring match; a real
        # deployment needs a proper allowlist)
        if action.get("tool") == "bash":
            command = (action.get("command") or "").lower()
            for blocked in cls.BLOCKED_ACTIONS:
                if blocked in command:
                    return False, f"Blocked command: {blocked}"

        # Check for clicks in sensitive screen areas
        if action.get("tool") == "computer":
            coord = action.get("params", {}).get("coordinate") or [0, 0]
            for area in cls.SENSITIVE_AREAS:
                if (area["x_range"][0] <= coord[0] <= area["x_range"][1] and
                        area["y_range"][0] <= coord[1] <= area["y_range"][1]):
                    return False, f"Blocked area: {area['name']}"

        return True, "Action allowed"

    @classmethod
    def review_before_execute(cls, actions: list) -> list:
        """Filter a list of parsed actions down to the safe ones"""
        safe_actions = []
        for action in actions:
            is_safe, reason = cls.check_action_safe(action)
            if is_safe:
                safe_actions.append(action)
            else:
                print(f"Blocked action: {reason}")
        return safe_actions
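Dropping the filter into the loop is then a one-line change wherever actions are executed:

# Inside computer_use_loop, before executing:
for action in ComputerUseSafety.review_before_execute(result["actions"]):
    if action["tool"] == "computer":
        execute_computer_action(action["params"])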
Use Cases
Computer Use opens up exciting possibilities:
- UI Testing: Automate testing of any application
- Data Entry: Fill forms across multiple applications
- Legacy Integration: Interact with systems that lack APIs
- Process Automation: Complete multi-step workflows
- Training Data Collection: Record human-like interactions
Computer Use is a significant step toward truly autonomous AI agents. It is also a beta that will make mistakes, so run it in an isolated environment and keep safety controls like the ones above in place.