May 26, 2024 2 min read

Code Execution in AI Agents: Safe and Effective Patterns

Tags: AI Agents, Code Execution, Security, Sandboxing, Azure

Code execution is one of the most powerful capabilities for AI agents - and one of the most dangerous. Today I’m exploring how to implement it safely.

Why Code Execution?

Agents with code execution can:

Perform complex calculations
Transform and analyze data
Create visualizations
Automate repetitive tasks
Test hypotheses dynamically

The Security Challenge

Risks of unrestricted code execution:
├── File system access
├── Network access
├── Resource exhaustion
├── Process manipulation
├── Credential exposure
└── System compromise

Sandboxing Approaches

1. Docker Container Isolation

import docker
import tempfile
import os

class DockerSandbox:
    """Execute code in isolated Docker containers.

    Each call to :meth:`execute` writes the script (plus optional data files)
    to a temporary host directory, mounts it read-only at ``/workspace`` and
    runs it in a fresh container with networking disabled and CPU/memory
    capped.
    """

    def __init__(
        self,
        image: str = "python:3.11-slim",
        timeout: int = 30,
        memory_limit: str = "256m",
        cpu_limit: float = 0.5
    ):
        """Connect to the Docker daemon and remember the resource limits.

        Args:
            image: Image used for every run.
            timeout: Seconds to wait for the container to exit.
            memory_limit: Value passed to Docker as ``mem_limit``.
            cpu_limit: Fraction of one CPU, converted to a CFS quota.
        """
        self.client = docker.from_env()
        self.image = image
        self.timeout = timeout
        self.memory_limit = memory_limit
        self.cpu_limit = cpu_limit

    async def execute(self, code: str, files: dict = None) -> dict:
        """Run ``code`` in a throwaway container.

        Args:
            code: Python source saved as ``/workspace/script.py``.
            files: Optional mapping of relative file name to ``str`` or
                ``bytes`` content, written next to the script.

        Returns:
            A dict with ``success`` (bool), ``output`` (combined stdout/stderr
            on success, stderr on failure) and ``error`` (``None`` on
            success, otherwise a message).

        NOTE: the Docker SDK calls below are blocking; run this in a worker
        thread if the event loop must stay responsive.
        """
        # Create temporary directory for code and files
        with tempfile.TemporaryDirectory() as tmpdir:
            root = os.path.realpath(tmpdir)

            # Write code (Python source is UTF-8 by default)
            with open(os.path.join(root, "script.py"), "w", encoding="utf-8") as f:
                f.write(code)

            # Write additional files, refusing names that would land outside
            # the temporary directory ("../x", absolute paths, ...).
            for name, content in (files or {}).items():
                file_path = os.path.realpath(os.path.join(root, name))
                if not file_path.startswith(root + os.sep):
                    return {
                        "success": False,
                        "output": "",
                        "error": f"Invalid file name: {name!r}"
                    }
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                if isinstance(content, bytes):
                    with open(file_path, "wb") as f:
                        f.write(content)
                else:
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(content)

            container = None
            try:
                # containers.run() has no timeout parameter, so start the
                # container detached and bound the wait instead.
                container = self.client.containers.run(
                    self.image,
                    command=["python", "/workspace/script.py"],
                    volumes={tmpdir: {"bind": "/workspace", "mode": "ro"}},
                    mem_limit=self.memory_limit,
                    cpu_period=100000,
                    cpu_quota=int(100000 * self.cpu_limit),
                    network_disabled=True,  # No network access
                    detach=True
                )

                status = container.wait(timeout=self.timeout)
                exit_code = status.get("StatusCode", 1)

                if exit_code == 0:
                    combined = container.logs(stdout=True, stderr=True)
                    return {
                        "success": True,
                        "output": combined.decode("utf-8", errors="replace"),
                        "error": None
                    }

                stderr = container.logs(stdout=False, stderr=True)
                return {
                    "success": False,
                    "output": stderr.decode("utf-8", errors="replace"),
                    "error": f"Container exited with status {exit_code}"
                }

            except Exception as e:
                # Includes the wait() timeout and any Docker API failure.
                return {
                    "success": False,
                    "output": "",
                    "error": str(e)
                }
            finally:
                if container is not None:
                    try:
                        # force=True also kills a container that is still
                        # running after a timeout.
                        container.remove(force=True)
                    except Exception:
                        # Best-effort cleanup; never mask the real result.
                        pass

# Usage
# The default python:3.11-slim image ships only the standard library and the
# sandbox runs with networking disabled, so the demo sticks to stdlib modules.
# To use pandas/numpy, bake them into a custom image and pass image="...".
import asyncio

sandbox = DockerSandbox()
# asyncio.run() drives the coroutine; a bare top-level `await` is only valid
# inside an async function or an async-aware REPL/notebook.
result = asyncio.run(sandbox.execute("""
import statistics
data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
for name, values in data.items():
    print(name, statistics.mean(values), statistics.stdev(values))
"""))

2. WebAssembly Sandbox (Pyodide)

from pyodide import create_runtime
import asyncio

class PyodideSandbox:
    """Execute Python in WebAssembly sandbox.

    NOTE(review): ``create_runtime`` is assumed to return an object exposing
    the Pyodide JavaScript API (``loadPackage``, ``runPythonAsync``) — confirm
    against the binding actually in use.
    """

    def __init__(self):
        # Created lazily by initialize() on first use.
        self.runtime = None

    async def initialize(self):
        """Create the runtime and preload the commonly used packages."""
        self.runtime = await create_runtime()
        # Pre-install common packages. loadPackage() takes package names;
        # loadPackagesFromImports() expects Python *source code* to scan for
        # imports, so passing it a list of names was incorrect.
        await self.runtime.loadPackage([
            "numpy", "pandas", "matplotlib"
        ])

    async def execute(self, code: str) -> dict:
        """Run ``code`` in the runtime and report the outcome.

        Returns:
            A dict with ``success``, ``output`` and ``error``. ``output`` is
            ``str()`` of whatever ``runPythonAsync`` returns — presumably the
            value of the last expression rather than captured stdout; TODO
            confirm.
        """
        if self.runtime is None:
            await self.initialize()

        try:
            # Execute code in sandbox
            result = await self.runtime.runPythonAsync(code)
        except Exception as e:
            return {
                "success": False,
                "output": "",
                "error": str(e)
            }

        return {
            "success": True,
            "output": str(result),
            "error": None
        }

3. Restricted Python (RestrictedPython)

from RestrictedPython import compile_restricted, safe_globals
from RestrictedPython.Eval import default_guarded_getattr
from RestrictedPython.Guards import guarded_iter_unpack_sequence, safer_getattr
import io
import sys

class RestrictedSandbox:
    """Execute Python with restricted builtins.

    Code is compiled with RestrictedPython and run in-process against a small
    allow-list of builtins plus a configurable set of pre-imported modules.
    There is no time or memory limit here, so this is not a substitute for
    process- or container-level isolation.
    """

    def __init__(self, allowed_modules: list = None):
        """Store the module allow-list.

        Args:
            allowed_modules: Module names exposed to the code as ready-made
                globals. ``None`` selects the default set; an explicit empty
                list exposes no modules at all.
        """
        if allowed_modules is None:
            allowed_modules = ["math", "json", "datetime"]
        # Copy so later mutation of the caller's list cannot widen the sandbox.
        self.allowed_modules = list(allowed_modules)

    def execute(self, code: str) -> dict:
        """Compile and run ``code``, returning ``success``/``output``/``error``.

        ``output`` holds everything the code printed, including anything
        printed before an exception was raised.
        """
        # Compile with restrictions
        try:
            byte_code = compile_restricted(code, '<inline>', 'exec')
        except SyntaxError as e:
            return {"success": False, "output": "", "error": f"Syntax error: {e}"}

        # Capture output
        output_buffer = io.StringIO()

        class _BufferedPrint:
            """Print collector writing every scope's output to one buffer.

            RestrictedPython rewrites ``print(...)`` into
            ``_print._call_print(...)`` where ``_print = _print_(_getattr_)``,
            so a plain ``print`` global is never consulted.
            """

            def __init__(self, _getattr_=None):
                self._getattr_ = _getattr_

            def __call__(self):
                return output_buffer.getvalue()

            def _call_print(self, *objects, **kwargs):
                # Always redirect to the shared buffer, ignoring any ``file=``.
                kwargs["file"] = output_buffer
                print(*objects, **kwargs)

        # Create restricted globals
        restricted_globals = safe_globals.copy()
        restricted_globals['_getattr_'] = safer_getattr
        restricted_globals['_getiter_'] = iter
        restricted_globals['_getitem_'] = lambda obj, key: obj[key]
        # Needed for ``for a, b in pairs`` style loops in restricted code.
        restricted_globals['_iter_unpack_sequence_'] = guarded_iter_unpack_sequence
        restricted_globals['_print_'] = _BufferedPrint
        restricted_globals['__builtins__'] = self._get_safe_builtins()

        # Execute
        try:
            exec(byte_code, restricted_globals)
        except Exception as e:
            return {
                "success": False,
                "output": output_buffer.getvalue(),
                "error": str(e)
            }

        return {
            "success": True,
            "output": output_buffer.getvalue(),
            "error": None
        }

    def _get_safe_builtins(self) -> dict:
        """Return the builtins mapping exposed to restricted code.

        Contains a fixed allow-list of harmless builtins plus each module
        named in ``allowed_modules``, imported up front because restricted
        code gets no ``__import__`` of its own.
        """
        safe = {
            'abs': abs,
            'all': all,
            'any': any,
            'bool': bool,
            'dict': dict,
            'enumerate': enumerate,
            'filter': filter,
            'float': float,
            'int': int,
            'len': len,
            'list': list,
            'map': map,
            'max': max,
            'min': min,
            'range': range,
            'round': round,
            'set': set,
            'sorted': sorted,
            'str': str,
            'sum': sum,
            'tuple': tuple,
            'zip': zip,
        }

        # Add allowed modules
        for module in self.allowed_modules:
            safe[module] = __import__(module)

        return safe

Azure Container Apps for Serverless Execution

import httpx
import os

class AzureContainerSandbox:
    """Execute code using Azure Container Apps.

    Thin HTTP client for a remote ``/execute`` endpoint; all failures are
    reported through the same ``success``/``output``/``error`` dict the other
    sandboxes return, never as raised transport exceptions.
    """

    def __init__(self, endpoint: str, api_key: str):
        """Remember the service base URL and the key sent as ``X-API-Key``."""
        self.endpoint = endpoint
        self.api_key = api_key

    async def execute(
        self,
        code: str,
        language: str = "python",
        timeout: int = 30
    ) -> dict:
        """POST ``code`` to the remote executor and return its result dict.

        Args:
            code: Source text to run.
            language: Forwarded to the service unchanged.
            timeout: Execution budget in seconds, forwarded to the service.
                The HTTP client waits 5 seconds longer so the service can
                report its own timeout first.
        """
        payload = {
            "code": code,
            "language": language,
            "timeout": timeout
        }

        try:
            async with httpx.AsyncClient(timeout=timeout + 5) as client:
                response = await client.post(
                    f"{self.endpoint}/execute",
                    headers={"X-API-Key": self.api_key},
                    json=payload
                )
        except httpx.HTTPError as e:
            # Connection failures and client-side timeouts; str(e) is often
            # empty for timeouts, so include the exception type.
            return {
                "success": False,
                "output": "",
                "error": f"Request failed: {type(e).__name__}: {e}"
            }

        if response.status_code != 200:
            return {
                "success": False,
                "output": "",
                "error": f"HTTP {response.status_code}: {response.text}"
            }

        try:
            return response.json()
        except ValueError as e:
            return {
                "success": False,
                "output": "",
                "error": f"Invalid JSON response: {e}"
            }

# Container App code (runs in Azure)
# app.py
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel
import subprocess
import tempfile
import os

# FastAPI application instance; the @app.post decorator below registers the
# /execute handler on it.
app = FastAPI()

class ExecuteRequest(BaseModel):
    """JSON body accepted by the ``/execute`` endpoint."""

    # Source text that is written to a temporary file and run.
    code: str
    # Language of ``code``; defaults to Python.
    language: str = "python"
    # Seconds the run may take before it is aborted.
    timeout: int = 30

@app.post("/execute")
async def execute_code(
    request: ExecuteRequest,
    x_api_key: str = Header(...)
):
    """Run the submitted Python code in a subprocess and return its result.

    Responds with ``success``/``output``/``error``. Raises 401 for a wrong
    API key and 400 for any language other than Python, since ``python`` is
    the only interpreter invoked.

    NOTE: ``subprocess.run`` blocks the event loop for the duration of the
    run; offload it to a worker thread if concurrency matters.
    """
    import hmac

    # Constant-time comparison so the key cannot be recovered via timing.
    expected_key = os.environ["API_KEY"]
    if not hmac.compare_digest(x_api_key.encode(), expected_key.encode()):
        raise HTTPException(401, "Invalid API key")

    if request.language != "python":
        raise HTTPException(400, f"Unsupported language: {request.language}")

    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".py",
        delete=False,
        encoding="utf-8"
    ) as f:
        f.write(request.code)
        script_path = f.name

    # Give the untrusted script a minimal environment so API_KEY and any
    # other secrets in this process's environment are not inherited.
    passthrough = ("PATH", "LANG", "LC_ALL", "SYSTEMROOT", "TEMP", "TMP")
    child_env = {k: os.environ[k] for k in passthrough if k in os.environ}

    try:
        result = subprocess.run(
            ["python", script_path],
            capture_output=True,
            text=True,
            timeout=request.timeout,
            env=child_env
        )

        return {
            "success": result.returncode == 0,
            "output": result.stdout,
            "error": result.stderr if result.returncode != 0 else None
        }
    except subprocess.TimeoutExpired:
        return {
            "success": False,
            "output": "",
            "error": "Execution timed out"
        }
    finally:
        os.unlink(script_path)

Code Validation

import ast
from typing import Set

class CodeValidator:
    """Validate code before execution.

    A static, block-list based screen over the parsed AST. It catches obvious
    mistakes but is easy to bypass (``getattr``, aliasing, ...), so use it in
    addition to a sandbox, never instead of one.
    """

    # Call names to reject. An entry matches the call's full dotted name or
    # any leading part of it, so "subprocess" also rejects "subprocess.run".
    DANGEROUS_CALLS: Set[str] = {
        "eval", "exec", "compile", "__import__",
        "open", "file", "input",
        "os.system", "subprocess", "popen"
    }

    # Top-level packages whose import is rejected.
    DANGEROUS_MODULES: Set[str] = {
        "os", "sys", "subprocess", "shutil",
        "socket", "requests", "urllib"
    }

    def validate(self, code: str) -> dict:
        """Check ``code`` for block-listed calls and imports.

        Returns:
            ``{"valid": bool, "issues": [str, ...]}``. When the code cannot be
            parsed, the message is also available under ``"error"``.
        """
        try:
            tree = ast.parse(code)
        except (SyntaxError, ValueError) as e:
            # ValueError: null bytes in the source on older Python versions.
            message = f"Syntax error: {e}"
            return {"valid": False, "error": message, "issues": [message]}

        issues = []

        for node in ast.walk(tree):
            if isinstance(node, ast.Call):
                # Check for dangerous function calls, bare or dotted
                name = self._dotted_name(node.func)
                if name is not None and self._is_dangerous_call(name):
                    issues.append(f"Dangerous call: {name}")

            elif isinstance(node, ast.Import):
                # Check for dangerous imports
                for alias in node.names:
                    if alias.name.split('.')[0] in self.DANGEROUS_MODULES:
                        issues.append(f"Dangerous import: {alias.name}")

            elif isinstance(node, ast.ImportFrom):
                if node.module and node.module.split('.')[0] in self.DANGEROUS_MODULES:
                    issues.append(f"Dangerous import: {node.module}")

        return {
            "valid": not issues,
            "issues": issues
        }

    @staticmethod
    def _dotted_name(node):
        """Return ``"a.b.c"`` for a Name/Attribute chain, else ``None``."""
        parts = []
        while isinstance(node, ast.Attribute):
            parts.append(node.attr)
            node = node.value
        if not isinstance(node, ast.Name):
            # e.g. a call on the result of another call; nothing to name.
            return None
        parts.append(node.id)
        return ".".join(reversed(parts))

    def _is_dangerous_call(self, dotted: str) -> bool:
        """Tell whether ``dotted`` or any leading part of it is block-listed."""
        parts = dotted.split(".")
        return any(
            ".".join(parts[:end]) in self.DANGEROUS_CALLS
            for end in range(1, len(parts) + 1)
        )

# Usage
validator = CodeValidator()
result = validator.validate("import os; os.system('rm -rf /')")
# {'valid': False, 'issues': ['Dangerous import: os', 'Dangerous call: os.system']}

Integration with AI Agent

class CodeExecutionAgent:
    """Agent with safe code execution capabilities.

    Asks the model for a solution, extracts every ```python fenced block from
    the reply, validates each one and runs the valid ones in the sandbox.
    """

    def __init__(self, client, sandbox):
        """Store the chat client and sandbox; create the code validator.

        Args:
            client: Object exposing ``chat.completions.create``; sync and
                async clients are both accepted.
            sandbox: Object exposing ``execute(code)`` that returns (or
                resolves to) a ``success``/``output``/``error`` dict.
        """
        self.client = client
        self.sandbox = sandbox
        self.validator = CodeValidator()

    async def solve_with_code(self, problem: str) -> dict:
        """Ask the model to solve ``problem`` and run the code it proposes.

        Returns:
            ``{"response": <model text>, "code_executions": [...]}`` where
            each entry has ``code``, ``success``, ``output`` and ``error``;
            blocks rejected by the validator also carry ``validation_error``.
        """
        import inspect

        messages = [
            {
                "role": "system",
                "content": """You can write and execute Python code to solve problems.
When you need to run code, output it in a code block marked with ```python
Only use standard library and: numpy, pandas, matplotlib
Do not use: os, sys, subprocess, file operations"""
            },
            {"role": "user", "content": problem}
        ]

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=messages
        )
        # Async clients return an awaitable here; sync clients the response.
        if inspect.isawaitable(response):
            response = await response

        # The API may return no text at all.
        content = response.choices[0].message.content or ""

        # Extract code blocks
        code_blocks = self._extract_code_blocks(content)

        results = []
        for code in code_blocks:
            # Validate
            validation = self.validator.validate(code)
            if not validation["valid"]:
                # A syntax-error result may carry only "error", not "issues".
                issues = validation.get("issues") or [
                    validation.get("error", "Validation failed")
                ]
                results.append({
                    "code": code,
                    "validation_error": issues,
                    "success": False,
                    "output": "",
                    "error": "Validation failed"
                })
                continue

            # Execute; not every sandbox implements execute() as a coroutine.
            execution = self.sandbox.execute(code)
            if inspect.isawaitable(execution):
                execution = await execution

            results.append({
                "code": code,
                "output": execution["output"],
                "success": execution["success"],
                "error": execution.get("error")
            })

        return {
            "response": content,
            "code_executions": results
        }

    def _extract_code_blocks(self, text: str) -> list:
        """Return the bodies of all ```python fenced blocks in ``text``."""
        import re
        if not text:
            return []
        # Tolerate trailing spaces after the language tag and CRLF endings.
        pattern = r'```python[ \t]*\r?\n(.*?)```'
        return re.findall(pattern, text, re.DOTALL)

Monitoring and Auditing

from datetime import datetime
import hashlib

class CodeExecutionAudit:
    """Audit all code executions.

    Every execution is written to the ``code_executions`` collection of the
    supplied storage; failed or suspicious-looking runs also raise an alert.
    """

    def __init__(self, storage):
        """Keep the storage backend; it must provide ``async insert(table, record)``."""
        self.storage = storage

    async def log_execution(
        self,
        code: str,
        result: dict,
        user_id: str,
        agent_id: str
    ):
        """Persist an audit record for one execution and alert if warranted.

        Args:
            code: Source text that was executed.
            result: Sandbox result dict; ``success`` is required, ``output``
                and ``error`` are optional and may be ``None``.
            user_id: Identifier of the requesting user.
            agent_id: Identifier of the agent that ran the code.
        """
        # Take the timestamp once so the id and the record agree.
        timestamp = datetime.utcnow().isoformat()
        output = result.get("output") or ""

        audit_record = {
            "id": hashlib.sha256(f"{code}{timestamp}".encode()).hexdigest()[:16],
            "timestamp": timestamp,
            "user_id": user_id,
            "agent_id": agent_id,
            "code_hash": hashlib.sha256(code.encode()).hexdigest(),
            "code_preview": code[:500],
            "success": result["success"],
            "output_preview": output[:500],
            "error": result.get("error")
        }

        await self.storage.insert("code_executions", audit_record)

        # Alert on failures or suspicious patterns
        if not result["success"] or self._is_suspicious(code):
            await self._alert(audit_record)

    async def _alert(self, audit_record: dict) -> None:
        """Report a flagged execution.

        The default implementation logs a warning; override it to notify an
        on-call channel or similar.
        """
        import logging

        logging.getLogger(__name__).warning(
            "Code execution flagged: id=%s user=%s agent=%s success=%s",
            audit_record["id"],
            audit_record["user_id"],
            audit_record["agent_id"],
            audit_record["success"],
        )

    def _is_suspicious(self, code: str) -> bool:
        """Return True if ``code`` contains any watched substring (case-insensitive)."""
        suspicious_patterns = [
            "base64", "decode", "encode",
            "exec", "eval", "compile",
            "socket", "connect", "bind"
        ]
        lowered = code.lower()
        return any(p in lowered for p in suspicious_patterns)

Best Practices

Always sandbox - Never execute untrusted code directly
Validate first - Check code before execution, but treat static checks as defense in depth, not as a replacement for the sandbox
Limit resources - CPU, memory, time
No network - Disable network access in sandbox
Audit everything - Log all executions

What’s Next

Tomorrow I’ll cover file handling in AI agents.