Skip to content
Back to Blog
1 min read

Desktop Automation with AI: Controlling Any Application

I wrote “Desktop Automation with AI: Controlling Any Application” to share practical, production-minded guidance on this topic.

Cross-Platform Desktop Agent

import pyautogui
import anthropic
import base64
from PIL import ImageGrab
import subprocess
import platform
from typing import Optional, Tuple
import time

client = anthropic.Anthropic()

class DesktopAutomationAgent:
    """AI-powered desktop automation agent.

    Drives the desktop in a loop: capture a screenshot, ask the model for the
    next action as JSON, perform it with ``pyautogui``, and repeat until the
    model reports ``complete`` or ``failed`` or the iteration budget is spent.
    """

    def __init__(self):
        self.system = platform.system()
        # These assignments change pyautogui's module-level settings, so they
        # affect every caller of pyautogui in the process, not just this agent.
        pyautogui.PAUSE = 0.5  # Add small delay between actions
        pyautogui.FAILSAFE = True  # Move mouse to corner to abort

    @staticmethod
    def _encode_png(image) -> str:
        """Serialize an image to PNG and return the bytes base64-encoded.

        Args:
            image: Object exposing ``save(fileobj, format=...)``, such as the
                value returned by ``ImageGrab.grab``.

        Returns:
            The PNG data as a base64 text string.
        """
        import io

        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return base64.standard_b64encode(buffer.getvalue()).decode()

    @staticmethod
    def _parse_json_response(text: str) -> dict:
        """Decode the JSON object contained in a model reply.

        The reply is parsed as-is first. If that fails, the span from the
        first ``{`` to the last ``}`` is parsed instead, which tolerates
        Markdown code fences or explanatory prose around the object.

        Args:
            text: Raw text of the model reply.

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: If no parseable JSON is present.
        """
        import json

        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start = text.find("{")
            end = text.rfind("}")
            if start == -1 or end <= start:
                # Nothing that looks like an object; surface the original error.
                raise
            return json.loads(text[start:end + 1])

    def capture_screen(self) -> str:
        """Capture the full screen and return it as base64-encoded PNG."""
        return self._encode_png(ImageGrab.grab())

    def capture_region(self, x: int, y: int, width: int, height: int) -> str:
        """Capture a rectangular region and return it as base64-encoded PNG.

        Args:
            x: Left edge of the region.
            y: Top edge of the region.
            width: Region width.
            height: Region height.
        """
        # ImageGrab takes (left, top, right, bottom), not (x, y, w, h).
        return self._encode_png(ImageGrab.grab(bbox=(x, y, x + width, y + height)))

    def execute_task(self, task: str, max_iterations: int = 15) -> bool:
        """Execute a desktop automation task.

        Args:
            task: Natural-language description of what to accomplish.
            max_iterations: Maximum number of screenshot/action rounds.

        Returns:
            True if the model reported ``complete``; False if it reported
            ``failed`` or the iteration budget ran out.
        """

        for i in range(max_iterations):
            screenshot = self.capture_screen()

            # Get next action from AI
            action = self._get_next_action(task, screenshot, i)

            if action["type"] == "complete":
                print(f"Task completed in {i + 1} iterations")
                return True

            if action["type"] == "failed":
                print(f"Task failed: {action.get('reason', 'Unknown error')}")
                return False

            # Execute the action
            self._execute_action(action)

            # Small delay for UI to update
            time.sleep(0.5)

        print("Max iterations reached")
        return False

    def _get_next_action(self, task: str, screenshot: str, iteration: int) -> dict:
        """Ask the model for the next action given the current screenshot.

        Args:
            task: The task being executed.
            screenshot: Base64-encoded PNG of the current screen.
            iteration: Zero-based iteration index (shown one-based to the model).

        Returns:
            The action dictionary decoded from the model's JSON reply.

        Raises:
            json.JSONDecodeError: If the reply contains no parseable JSON.
        """

        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": screenshot
                            }
                        },
                        {
                            "type": "text",
                            "text": f"""
                            Task: {task}
                            Iteration: {iteration + 1}
                            Operating System: {self.system}
                            Screen Resolution: {pyautogui.size()}

                            Determine the next action to accomplish this task.
                            Return JSON with:
                            - type: click/double_click/right_click/type/hotkey/scroll/move/complete/failed
                            - x: x coordinate (for mouse actions)
                            - y: y coordinate (for mouse actions)
                            - text: text to type (for type action)
                            - keys: list of keys (for hotkey action, e.g., ["ctrl", "c"])
                            - direction: up/down (for scroll)
                            - amount: scroll amount
                            - reasoning: explanation of why this action
                            """
                        }
                    ]
                }
            ]
        )

        # The prompt does not forbid extra text, so parse tolerantly.
        return self._parse_json_response(response.content[0].text)

    def _execute_action(self, action: dict) -> bool:
        """Perform one model-chosen action with pyautogui.

        Args:
            action: Action dictionary with a ``type`` key plus the fields
                that type needs (``x``/``y``, ``text``, ``keys``, ...).

        Returns:
            True if the action type was recognised and performed, False if
            the type is unknown and nothing was done.
        """

        action_type = action["type"]

        # NOTE(review): coordinates are passed to pyautogui unchanged; if the
        # screenshot's pixel size differs from pyautogui.size() (for example
        # with display scaling) clicks may land off-target - confirm.
        if action_type == "click":
            pyautogui.click(action["x"], action["y"])

        elif action_type == "double_click":
            pyautogui.doubleClick(action["x"], action["y"])

        elif action_type == "right_click":
            pyautogui.rightClick(action["x"], action["y"])

        elif action_type == "type":
            pyautogui.write(action["text"], interval=0.05)

        elif action_type == "hotkey":
            pyautogui.hotkey(*action["keys"])

        elif action_type == "scroll":
            amount = action.get("amount", 3)
            # Any direction other than "up" is treated as "down".
            if action.get("direction") == "up":
                pyautogui.scroll(amount)
            else:
                pyautogui.scroll(-amount)

        elif action_type == "move":
            pyautogui.moveTo(action["x"], action["y"])

        else:
            # Do not claim an action was executed when nothing happened.
            print(f"Skipped unknown action type: {action_type}")
            return False

        print(f"Executed: {action_type} - {action.get('reasoning', '')}")
        return True

Application-Specific Automation

class ApplicationAutomator:
    """Automate specific desktop applications.

    Wraps a ``DesktopAutomationAgent`` with helpers for launching
    applications and running application-scoped tasks.
    """

    def __init__(self, agent: "DesktopAutomationAgent"):
        self.agent = agent
        self.app_contexts = {}

    def open_application(self, app_name: str) -> bool:
        """Open a desktop application and verify that it appeared.

        Args:
            app_name: Name typed into the OS launcher (Windows, macOS) or
                lower-cased and run as a command (Linux).

        Returns:
            True if the follow-up screenshot check reports the application
            as open; False if that check fails or the Linux command could
            not be started.
        """

        system = platform.system()

        if system == "Windows":
            # Use Windows search
            pyautogui.hotkey("win")
            time.sleep(0.5)
            pyautogui.write(app_name, interval=0.05)
            time.sleep(0.5)
            pyautogui.press("enter")

        elif system == "Darwin":  # macOS
            # Use Spotlight
            pyautogui.hotkey("command", "space")
            time.sleep(0.5)
            pyautogui.write(app_name, interval=0.05)
            time.sleep(0.5)
            pyautogui.press("return")

        elif system == "Linux":
            # Run the lower-cased name as a command; report a missing or
            # non-executable binary as a failed launch instead of raising.
            try:
                subprocess.Popen([app_name.lower()])
            except OSError as exc:
                print(f"Could not launch {app_name}: {exc}")
                return False

        # Wait for app to open
        time.sleep(2)

        # Verify app is open
        return self._verify_app_opened(app_name)

    def _verify_app_opened(self, app_name: str) -> bool:
        """Ask the model whether the application is visible on screen.

        Args:
            app_name: Application name to look for in the screenshot.

        Returns:
            True only when the reply says the application is open with a
            confidence above 0.7; missing fields count as "not open".
        """

        screenshot = self.agent.capture_screen()

        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=256,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": screenshot
                            }
                        },
                        {
                            "type": "text",
                            "text": f"""
                            Is {app_name} currently open and visible on screen?
                            Return JSON: {{"is_open": boolean, "confidence": 0-1}}
                            """
                        }
                    ]
                }
            ]
        )

        import json
        result = json.loads(response.content[0].text)
        # The model may omit a field; treat that as a negative answer.
        confidence = float(result.get("confidence") or 0)
        return bool(result.get("is_open")) and confidence > 0.7

    def use_excel(self, file_path: str, operations: list) -> bool:
        """Open a workbook in Excel and run a list of AI-driven operations.

        Args:
            file_path: Path typed into Excel's open dialog.
            operations: Natural-language operations, each run as its own
                agent task prefixed with ``"In Excel: "``.

        Returns:
            True if Excel opened and every operation succeeded, else False.
        """

        # Do not type shortcuts and a file path into whatever window happens
        # to be focused when Excel could not be confirmed open.
        if not self.open_application("Excel"):
            return False
        time.sleep(2)

        # Open specific file; macOS uses Command where others use Ctrl.
        modifier = "command" if platform.system() == "Darwin" else "ctrl"
        pyautogui.hotkey(modifier, "o")
        time.sleep(1)
        pyautogui.write(file_path)
        pyautogui.press("enter")
        time.sleep(2)

        # Run every operation, then report whether all of them succeeded.
        outcomes = [self.agent.execute_task(f"In Excel: {op}") for op in operations]
        return all(outcomes)

    def use_email(self, email_client: str, action: str, **kwargs) -> bool:
        """Run an email task through the agent.

        Args:
            email_client: Name of the mail application to use.
            action: What to do in that application.
            **kwargs: Optional ``recipient``, ``subject`` and ``body`` values
                appended to the task description.

        Returns:
            Whatever ``agent.execute_task`` returns for the composed task.
        """

        task = f"Using {email_client}: {action}"

        if "recipient" in kwargs:
            task += f" to {kwargs['recipient']}"
        if "subject" in kwargs:
            task += f" with subject '{kwargs['subject']}'"
        if "body" in kwargs:
            # Only the first 100 characters of the body reach the task text.
            task += f" containing: {kwargs['body'][:100]}"

        return self.agent.execute_task(task)

Multi-Window Management

class WindowManager:
    """Manage multiple windows during automation.

    Windows are located by asking the model to inspect a screenshot and
    are manipulated with ``pyautogui`` clicks and keyboard shortcuts.
    """

    def __init__(self, agent: "DesktopAutomationAgent"):
        self.agent = agent
        self.windows = {}

    def find_window(self, window_title: str) -> Optional[dict]:
        """Find a window by title using a screenshot and the model.

        Args:
            window_title: Text the window title should contain.

        Returns:
            The decoded JSON reply; the prompt asks for the keys ``found``,
            ``title_bar_x``, ``title_bar_y``, ``is_maximized`` and
            ``is_active``.
        """

        screenshot = self.agent.capture_screen()

        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=512,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": screenshot
                            }
                        },
                        {
                            "type": "text",
                            "text": f"""
                            Find a window with title containing: "{window_title}"
                            Return JSON:
                            {{
                                "found": boolean,
                                "title_bar_x": int,
                                "title_bar_y": int,
                                "is_maximized": boolean,
                                "is_active": boolean
                            }}
                            """
                        }
                    ]
                }
            ]
        )

        import json
        return json.loads(response.content[0].text)

    def focus_window(self, window_title: str) -> bool:
        """Bring a window to focus by clicking its title bar.

        Args:
            window_title: Text the window title should contain.

        Returns:
            True if the window was found and clicked; False if it was not
            found or the reply lacked title-bar coordinates.
        """

        window = self.find_window(window_title)

        # The model may omit keys; treat an incomplete reply as "not found"
        # rather than raising KeyError.
        if not window or not window.get("found"):
            return False

        x = window.get("title_bar_x")
        y = window.get("title_bar_y")
        if x is None or y is None:
            return False

        # Click on the title bar to focus
        pyautogui.click(x, y)
        time.sleep(0.3)

        return True

    def arrange_windows(self, arrangement: str) -> bool:
        """Arrange windows in a specific layout.

        Args:
            arrangement: ``"side_by_side"`` or ``"cascade"``.

        Returns:
            True if a known arrangement was applied, False for an
            unrecognised arrangement name (nothing is done in that case).
        """

        # NOTE(review): both layouts rely on Win-key shortcuts; behaviour on
        # non-Windows desktops is not established here - confirm.
        if arrangement == "side_by_side":
            # Windows: Win + Left/Right arrows
            pyautogui.hotkey("win", "left")
            time.sleep(0.5)

            # Need to select second window for right side
            pyautogui.hotkey("alt", "tab")
            time.sleep(0.3)
            pyautogui.hotkey("win", "right")
            return True

        if arrangement == "cascade":
            pyautogui.hotkey("win", "down")  # Restore all
            time.sleep(0.5)
            return True

        # Unknown layout: report failure instead of silently claiming success.
        return False

    def close_window(self, window_title: str) -> bool:
        """Close a specific window.

        Args:
            window_title: Text the window title should contain.

        Returns:
            True if the window was focused and the close shortcut was sent,
            False if the window could not be focused.
        """

        if not self.focus_window(window_title):
            return False

        # macOS closes windows with Command+W; Alt+F4 is used elsewhere.
        if platform.system() == "Darwin":
            pyautogui.hotkey("command", "w")
        else:
            pyautogui.hotkey("alt", "F4")
        return True

Workflow Automation

class WorkflowAutomator:
    """Automate complex multi-application workflows.

    Owns a ``DesktopAutomationAgent`` together with the application and
    window helpers built on it, and runs declarative lists of steps.
    """

    def __init__(self):
        self.agent = DesktopAutomationAgent()
        self.app_automator = ApplicationAutomator(self.agent)
        self.window_manager = WindowManager(self.agent)

    def execute_workflow(self, workflow: dict) -> dict:
        """Execute a multi-step workflow.

        Args:
            workflow: Mapping with a ``steps`` list. Each step is a dict with
                a ``type``, the fields that type needs, an optional ``name``
                used for logging, and an optional ``continue_on_failure``.

        Returns:
            A dict with ``success``, ``steps_completed``, ``steps_total``
            and ``errors``. ``steps_completed`` is the number of the last
            step that did not stop the run, so it also counts failed steps
            that had ``continue_on_failure`` set.

        Raises:
            KeyError: If ``workflow`` has no ``steps`` key.
        """

        steps = workflow["steps"]
        results = {
            "success": True,
            "steps_completed": 0,
            "steps_total": len(steps),
            "errors": []
        }

        for number, step in enumerate(steps, start=1):
            try:
                # The name is only a log label; reading it inside the try
                # means a malformed step is recorded rather than crashing
                # the whole run.
                label = step.get("name", "<unnamed>")
                print(f"Executing step {number}: {label}")

                success = self._execute_step(step)

            except Exception as e:
                # Boundary handler: any step error ends the workflow and is
                # reported to the caller through the result dict.
                results["errors"].append(f"Step {number} error: {str(e)}")
                results["success"] = False
                break

            if not success:
                results["errors"].append(f"Step {number} failed")
                if not step.get("continue_on_failure", False):
                    results["success"] = False
                    break

            results["steps_completed"] = number

        return results

    def _execute_step(self, step: dict) -> bool:
        """Execute a single workflow step.

        Args:
            step: Step dict; ``type`` selects the handler and the remaining
                keys (``app_name``, ``task``, ``window``, ``keys``,
                ``seconds``, ``text``) supply its arguments.

        Returns:
            The handler's success flag, or False for an unknown step type.

        Raises:
            KeyError: If ``type`` or a field required by that type is missing.
        """

        step_type = step["type"]

        if step_type == "open_app":
            return self.app_automator.open_application(step["app_name"])

        if step_type == "ai_task":
            return self.agent.execute_task(step["task"])

        if step_type == "focus_window":
            return self.window_manager.focus_window(step["window"])

        if step_type == "hotkey":
            pyautogui.hotkey(*step["keys"])
            return True

        if step_type == "wait":
            time.sleep(step.get("seconds", 1))
            return True

        if step_type == "type":
            pyautogui.write(step["text"], interval=0.05)
            return True

        print(f"Unknown step type: {step_type}")
        return False

# Example workflow: the step list consumed by WorkflowAutomator.execute_workflow.
# Each step carries a "type" that selects the handler, the arguments that
# handler needs, and a "name" used only for progress output.
example_workflow = dict(
    name="Daily Report Generation",
    steps=[
        dict(type="open_app", app_name="Excel", name="Open Excel"),
        dict(type="ai_task", task="Open the file 'sales_data.xlsx'", name="Open sales file"),
        dict(type="ai_task", task="Create a pivot table summarizing sales by region", name="Create pivot"),
        dict(type="hotkey", keys=["ctrl", "c"], name="Copy chart"),
        dict(type="open_app", app_name="Outlook", name="Open Outlook"),
        dict(type="ai_task", task="Create a new email to team@company.com", name="New email"),
        dict(type="type", text="Daily Sales Report", name="Enter subject"),
        dict(type="hotkey", keys=["tab"], name="Move to body"),
        dict(type="hotkey", keys=["ctrl", "v"], name="Paste chart"),
    ],
)

# automator = WorkflowAutomator()
# results = automator.execute_workflow(example_workflow)

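Desktop automation with AI brings human-like understanding to any application, enabling automation of complex workflows that were previously impossible to script reliably.

## Takeaways

- The core pattern is a loop: capture a screenshot, ask the model for the next action as JSON, execute it with `pyautogui`, and repeat until the model reports `complete` or `failed`.
- Keep `pyautogui.FAILSAFE` enabled and cap the number of iterations so a misbehaving run can always be stopped.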

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.