Claude Computer Use Preview: AI That Can Control Your Desktop
Anthropic has announced a groundbreaking capability: Claude can now use computers the way a human does. The Computer Use preview lets Claude see the screen through screenshots, move the mouse, type on the keyboard, and interact with any desktop application.
What is Computer Use?
Computer Use allows Claude to interact with graphical user interfaces through:
- Screen capture: Claude can see what’s on screen
- Mouse control: Move, click, drag, and scroll
- Keyboard input: Type text and use keyboard shortcuts
- Application interaction: Work with any desktop application
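Here is a minimal request that opts into the beta and declares the three tools (model and beta identifiers as of the October 2024 preview):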
import anthropic
client = anthropic.Anthropic()
# Computer Use requires specific tool definitions
tools = [
    {
        "type": "computer_20241022",
        "name": "computer",
        "display_width_px": 1920,
        "display_height_px": 1080,
        "display_number": 0
    },
    {
        "type": "text_editor_20241022",
        "name": "str_replace_editor"
    },
    {
        "type": "bash_20241022",
        "name": "bash"
    }
]

response = client.beta.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=4096,
    tools=tools,
    messages=[
        {
            "role": "user",
            "content": "Open Firefox and search for 'Anthropic Claude' on Google"
        }
    ],
    betas=["computer-use-2024-10-22"]
)
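Claude responds with tool_use content blocks describing the actions it wants performed. The API never touches your machine directly: your code is responsible for executing each action and reporting the result back.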
The Tool Definitions
# Computer tool for GUI interaction
computer_tool = {
    "type": "computer_20241022",
    "name": "computer",
    "display_width_px": 1920,
    "display_height_px": 1080,
    "display_number": 0  # X11 display number (not the monitor index)
}
# Actions available via the "action" field of the tool input:
# - screenshot, cursor_position
# - mouse_move, left_click, right_click, middle_click, double_click
#   (coordinates are passed as {"coordinate": [x, y]})
# - left_click_drag
# - type  (text to enter)
# - key   (X11-style key names, e.g. "Return", "ctrl+c")
# Scrolling and further actions arrive in later tool versions.
# Text editor tool for file editing
text_editor_tool = {
    "type": "text_editor_20241022",
    "name": "str_replace_editor"
}

# Bash tool for command execution
bash_tool = {
    "type": "bash_20241022",
    "name": "bash"
}
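Note that these definitions only tell Claude what is available; executing them is your job. For reference, this is roughly the shape of a tool_use block Claude sends back (the id value here is invented for the example):

# Illustrative tool_use block from Claude (id is made up)
example_tool_use = {
    "type": "tool_use",
    "id": "toolu_01...",  # needed later for the matching tool_result
    "name": "computer",
    "input": {"action": "mouse_move", "coordinate": [400, 300]}
}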
Handling Computer Use Responses
def handle_computer_use(response) -> dict:
    """Parse Claude's response into a list of requested tool actions."""
    result = {
        "actions": [],
        "complete": True  # stays True only if Claude requests no more tools
    }

    for block in response.content:
        if block.type == "tool_use":
            result["complete"] = False
            if block.name == "computer":
                result["actions"].append({
                    "tool": "computer",
                    "id": block.id,  # needed for the matching tool_result
                    "action": block.input.get("action"),
                    "params": block.input
                })
            elif block.name == "str_replace_editor":
                result["actions"].append({
                    "tool": "editor",
                    "id": block.id,
                    "command": block.input.get("command"),
                    "params": block.input
                })
            elif block.name == "bash":
                result["actions"].append({
                    "tool": "bash",
                    "id": block.id,
                    "command": block.input.get("command")
                })

    # A response with no tool_use blocks (stop_reason == "end_turn") means
    # Claude considers the task finished; keyword-matching on the text
    # ("done", "completed") is far less reliable.
    return result
Building a Computer Use Loop
import base64
import io
from PIL import ImageGrab

def capture_screenshot() -> str:
    """Capture the screen and return it base64-encoded as PNG."""
    screenshot = ImageGrab.grab()  # needs a display: X11, macOS, or Windows
    buffer = io.BytesIO()
    screenshot.save(buffer, format="PNG")
    return base64.standard_b64encode(buffer.getvalue()).decode()
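One practical note: full-resolution screenshots cost tokens, so it can pay to downscale before sending. A minimal sketch, assuming PIL and a target width of 1280px (a choice, not an API requirement):

from PIL import Image

def resize_screenshot(img: Image.Image, max_width: int = 1280) -> Image.Image:
    """Downscale wide screenshots; 1280px is an assumed target, not a limit."""
    if img.width > max_width:
        ratio = max_width / img.width
        img = img.resize((max_width, int(img.height * ratio)), Image.LANCZOS)
    return img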
def execute_computer_action(action: dict):
    """Execute a single computer action with pyautogui."""
    import pyautogui

    if action["action"] == "mouse_move":
        pyautogui.moveTo(*action["coordinate"])
    elif action["action"] == "left_click":
        if "coordinate" in action:
            pyautogui.click(*action["coordinate"])
        else:
            pyautogui.click()  # click at the current cursor position
    elif action["action"] == "type":
        pyautogui.write(action["text"])
    elif action["action"] == "key":
        # Claude sends X11-style names ("Return", "ctrl+c"); a production
        # harness should map these onto pyautogui's key names ("enter").
        pyautogui.hotkey(*action["text"].lower().split("+"))
    elif action["action"] == "screenshot":
        return capture_screenshot()
    elif action["action"] == "scroll":
        # "scroll_direction"/"scroll_amount" follow the newer tool versions;
        # anything other than "up" is treated as scrolling down.
        clicks = action.get("scroll_amount", 3)
        if action.get("scroll_direction", "down") != "up":
            clicks = -clicks
        pyautogui.scroll(clicks)
def computer_use_loop(task: str, max_iterations: int = 20):
    """Run the agent loop until Claude stops requesting tool actions."""
    messages = [{"role": "user", "content": task}]

    for iteration in range(max_iterations):
        response = client.beta.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4096,
            tools=tools,
            messages=messages,
            betas=["computer-use-2024-10-22"]
        )
        messages.append({"role": "assistant", "content": response.content})

        result = handle_computer_use(response)
        if result["complete"]:
            print(f"Task completed in {iteration + 1} iterations")
            return True

        # Every tool_use block must be answered with a tool_result block
        # in the next user message; computer actions get a fresh screenshot
        # so Claude can see the outcome.
        tool_results = []
        for action in result["actions"]:
            if action["tool"] == "computer":
                execute_computer_action(action["params"])
                content = [{
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": capture_screenshot()
                    }
                }]
            else:
                # bash/editor execution is omitted in this sketch
                content = "tool not implemented in this demo"
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": action["id"],
                "content": content
            })
        messages.append({"role": "user", "content": tool_results})

    print("Max iterations reached")
    return False
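Kicking it off with the same task as the first example:

computer_use_loop("Open Firefox and search for 'Anthropic Claude' on Google")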
Safety Considerations
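Handing an AI the mouse and keyboard calls for guardrails. Anthropic recommends running Computer Use in a dedicated VM or container; on top of that, a simple client-side filter can veto obviously dangerous actions: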
class ComputerUseSafety:
    """Safety controls for computer use"""

    BLOCKED_ACTIONS = [
        "format",
        "delete system",
        "rm -rf",
        "shutdown",
        "reboot"
    ]

    SENSITIVE_AREAS = [
        {"name": "system_preferences", "x_range": (0, 100), "y_range": (0, 50)},
        {"name": "dock", "x_range": (0, 1920), "y_range": (1030, 1080)}
    ]

    @classmethod
    def check_action_safe(cls, action: dict) -> tuple[bool, str]:
        """Check if an action is safe to execute"""
        # Check for blocked commands (naive substring match; a real
        # deployment needs a proper allowlist)
        if action.get("tool") == "bash":
            command = (action.get("command") or "").lower()
            for blocked in cls.BLOCKED_ACTIONS:
                if blocked in command:
                    return False, f"Blocked command: {blocked}"

        # Check for clicks in sensitive screen areas
        if action.get("tool") == "computer":
            coord = action.get("params", {}).get("coordinate") or [0, 0]
            for area in cls.SENSITIVE_AREAS:
                if (area["x_range"][0] <= coord[0] <= area["x_range"][1] and
                        area["y_range"][0] <= coord[1] <= area["y_range"][1]):
                    return False, f"Blocked area: {area['name']}"

        return True, "Action allowed"

    @classmethod
    def review_before_execute(cls, actions: list) -> list:
        """Filter a list of parsed actions down to the safe ones"""
        safe_actions = []
        for action in actions:
            is_safe, reason = cls.check_action_safe(action)
            if is_safe:
                safe_actions.append(action)
            else:
                print(f"Blocked action: {reason}")
        return safe_actions
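Dropping the filter into the loop is then a one-line change wherever actions are executed:

# Inside computer_use_loop, before executing:
for action in ComputerUseSafety.review_before_execute(result["actions"]):
    if action["tool"] == "computer":
        execute_computer_action(action["params"])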
Use Cases
Computer Use opens up exciting possibilities:
- UI Testing: Automate testing of any application
- Data Entry: Fill forms across multiple applications
- Legacy Integration: Interact with systems that lack APIs
- Process Automation: Complete multi-step workflows
- Training Data Collection: Record human-like interactions
Computer Use is a significant step toward truly autonomous AI agents. It is also a beta that will make mistakes, so run it in an isolated environment and keep safety controls like the ones above in place.