6 min read
Browser Automation with AI: Intelligent Web Navigation
Combining AI with browser automation creates intelligent agents that can navigate websites, fill forms, and extract data with human-like understanding. Let’s explore how to build these systems.
AI-Enhanced Browser Agent
from playwright.sync_api import sync_playwright, Page
import anthropic
import base64
import json
# Shared Anthropic API client; every class below sends its requests through it.
# NOTE(review): constructed without arguments, so the API key presumably comes
# from the environment (ANTHROPIC_API_KEY) — confirm before deploying.
client = anthropic.Anthropic()
class AIBrowserAgent:
    """Drive a Playwright Chromium page toward a goal with help from an AI model.

    On every step the agent sends a screenshot and a truncated copy of the
    page HTML to the model, asks for the next action as JSON, and executes
    that action on the page.

    The agent owns a Playwright driver, a browser, a context and a page.
    Release them with ``close()`` or by using the agent as a context manager::

        with AIBrowserAgent(headless=True) as agent:
            agent.navigate_to_goal(url, "open the pricing page")
    """

    # Model used for every request this agent makes.
    MODEL = "claude-3-5-sonnet-20241022"

    # Action types that cannot run without a CSS selector.
    _SELECTOR_ACTIONS = ("click", "type", "select", "hover")

    def __init__(self, headless: bool = False):
        """Start Playwright and open a 1920x1080 Chromium page.

        Args:
            headless: Run the browser without a visible window when True.

        Raises:
            Exception: Whatever Playwright raises while launching; the
                driver is stopped before the error propagates.
        """
        self.playwright = sync_playwright().start()
        try:
            self.browser = self.playwright.chromium.launch(headless=headless)
            self.context = self.browser.new_context(
                viewport={"width": 1920, "height": 1080}
            )
            self.page = self.context.new_page()
        except Exception:
            # Do not leave the driver running when setup fails half-way.
            self.playwright.stop()
            raise

    def __enter__(self):
        """Return the agent itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release browser resources when the ``with`` block ends."""
        self.close()

    def navigate_to_goal(self, url: str, goal: str, max_steps: int = 10) -> bool:
        """Open ``url`` and let the model act until the goal is reached.

        Args:
            url: Page to open first.
            goal: Natural-language description of what to achieve.
            max_steps: Maximum number of model-chosen actions to attempt.

        Returns:
            True when the model reports the goal as complete; False when the
            step budget runs out or a failed action cannot be recovered.

        Raises:
            json.JSONDecodeError: If the model's reply contains no JSON.
        """
        self.page.goto(url)
        for _ in range(max_steps):
            # Capture the current state and ask the model what to do next.
            screenshot = self._capture_screenshot()
            html = self.page.content()
            action = self._determine_action(goal, screenshot, html)
            if action.get("type") == "complete":
                return True
            if self._execute_action(action):
                continue
            # The action failed: try one model-suggested alternative.
            if not self._handle_failure(action, goal):
                return False
        return False

    def _capture_screenshot(self) -> str:
        """Return a screenshot of the page as a base64 string."""
        screenshot_bytes = self.page.screenshot()
        return base64.standard_b64encode(screenshot_bytes).decode()

    @staticmethod
    def _parse_json(text: str):
        """Parse JSON from a model reply, tolerating fences and extra prose.

        The reply is first parsed as-is. If that fails, the outermost
        ``{...}`` and ``[...]`` spans are tried in order of appearance, which
        covers replies wrapped in a Markdown code fence or a short sentence.

        Raises:
            json.JSONDecodeError: The error from the first attempt, when no
                span parses either.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError as original_error:
            candidates = []
            for opener, closer in (("{", "}"), ("[", "]")):
                start = text.find(opener)
                end = text.rfind(closer)
                if start != -1 and end > start:
                    candidates.append((start, text[start:end + 1]))
            for _, snippet in sorted(candidates):
                try:
                    return json.loads(snippet)
                except json.JSONDecodeError:
                    continue
            raise original_error

    def _ask_model(self, screenshot: str, prompt: str, max_tokens: int = 1024):
        """Send a screenshot plus a text prompt and return the parsed JSON reply."""
        response = client.messages.create(
            model=self.MODEL,
            max_tokens=max_tokens,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": screenshot,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
        )
        return self._parse_json(response.content[0].text)

    def _determine_action(self, goal: str, screenshot: str, html: str) -> dict:
        """Ask the model for the next action toward ``goal``.

        Only the first 5000 characters of ``html`` are sent. The list of
        action types in the prompt matches what ``_execute_action`` supports.
        """
        html_preview = html[:5000]
        prompt = (
            "\n"
            f"Goal: {goal}\n"
            "HTML (truncated):\n"
            "```html\n"
            f"{html_preview}\n"
            "```\n"
            "Determine the next action to achieve the goal.\n"
            "Return JSON with:\n"
            "- type: click/type/scroll/wait/hover/select/complete\n"
            "- selector: CSS selector for element (if applicable)\n"
            "- value: text to type or option to select (if applicable)\n"
            "- reasoning: why this action helps achieve the goal\n"
        )
        return self._ask_model(screenshot, prompt)

    def _execute_action(self, action: dict) -> bool:
        """Run one model-chosen action on the page.

        Returns:
            True when the action ran (``complete`` is a no-op); False when
            the action type is unsupported, a required selector is missing,
            or Playwright raised while performing it.
        """
        try:
            action_type = action.get("type")
            selector = action.get("selector")
            value = action.get("value")

            if action_type in self._SELECTOR_ACTIONS and not selector:
                print(f"Action failed: {action_type!r} requires a selector")
                return False

            if action_type == "click":
                self.page.click(selector)
            elif action_type == "type":
                self.page.fill(selector, value)
            elif action_type == "scroll":
                self.page.evaluate("window.scrollBy(0, 500)")
            elif action_type == "wait":
                self.page.wait_for_timeout(int(value or 1000))
            elif action_type == "select":
                self.page.select_option(selector, value)
            elif action_type == "hover":
                self.page.hover(selector)
            elif action_type != "complete":
                # An unknown type must not count as progress.
                print(f"Action failed: unsupported action type {action_type!r}")
                return False

            self._wait_for_settle()
            return True
        except Exception as e:
            print(f"Action failed: {e}")
            return False

    def _wait_for_settle(self, timeout_ms: int = 5000) -> None:
        """Give the page up to ``timeout_ms`` to go network-idle.

        A timeout here is not an action failure: the action itself already
        ran, and reporting it as failed would trigger a recovery action on
        top of one that succeeded.
        """
        from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

        try:
            self.page.wait_for_load_state("networkidle", timeout=timeout_ms)
        except PlaywrightTimeoutError:
            print("Page did not reach network idle; continuing")

    def _handle_failure(self, failed_action: dict, goal: str) -> bool:
        """Ask the model for an alternative to a failed action and run it.

        Returns:
            True when the alternative action ran; False when the model
            aborts, its reply is unusable, or the alternative also fails.
        """
        screenshot = self._capture_screenshot()
        prompt = (
            "\n"
            f"Action failed: {json.dumps(failed_action)}\n"
            f"Goal: {goal}\n"
            "Suggest an alternative action to achieve the same effect.\n"
            'Return JSON with alternative action or {"type": "abort"} if unrecoverable.\n'
        )
        try:
            alternative = self._ask_model(screenshot, prompt)
        except json.JSONDecodeError as e:
            # Recovery is best-effort; an unreadable suggestion means give up.
            print(f"Recovery failed: {e}")
            return False
        if not isinstance(alternative, dict) or alternative.get("type") == "abort":
            return False
        return self._execute_action(alternative)

    def close(self):
        """Close the browser and always stop the Playwright driver."""
        try:
            self.browser.close()
        finally:
            self.playwright.stop()
Intelligent Data Extraction
class AIDataExtractor:
    """Extract structured data from web pages by asking an AI model.

    The extractor drives the page owned by the browser agent it is given and
    sends screenshots and/or truncated HTML to the model, which replies
    with JSON.
    """

    # Model used for every request this extractor makes.
    MODEL = "claude-3-5-sonnet-20241022"

    def __init__(self, browser_agent: "AIBrowserAgent"):
        """Keep a reference to the agent whose page is used for extraction."""
        self.agent = browser_agent

    @staticmethod
    def _parse_json(text: str):
        """Parse JSON from a model reply, tolerating fences and extra prose.

        The reply is first parsed as-is. If that fails, the outermost
        ``{...}`` and ``[...]`` spans are tried in order of appearance.

        Raises:
            json.JSONDecodeError: The error from the first attempt, when no
                span parses either.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError as original_error:
            candidates = []
            for opener, closer in (("{", "}"), ("[", "]")):
                start = text.find(opener)
                end = text.rfind(closer)
                if start != -1 and end > start:
                    candidates.append((start, text[start:end + 1]))
            for _, snippet in sorted(candidates):
                try:
                    return json.loads(snippet)
                except json.JSONDecodeError:
                    continue
            raise original_error

    @staticmethod
    def _as_list(data) -> list:
        """Coerce a model reply to a list of items.

        A list is returned unchanged, ``None`` becomes an empty list, and
        any other value (typically a single object) is wrapped in a list so
        that extending a result list never splices in dict keys.
        """
        if data is None:
            return []
        if isinstance(data, list):
            return data
        return [data]

    def _ask_model(self, prompt: str, max_tokens: int, screenshot=None):
        """Send ``prompt`` (plus an optional screenshot) and return parsed JSON.

        Without a screenshot the prompt is sent as plain text content; with
        one, it is sent as an image block followed by a text block.
        """
        if screenshot is None:
            content = prompt
        else:
            content = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": screenshot,
                    },
                },
                {"type": "text", "text": prompt},
            ]
        response = client.messages.create(
            model=self.MODEL,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": content}],
        )
        return self._parse_json(response.content[0].text)

    def extract_structured_data(self, url: str, schema: dict) -> dict:
        """Open ``url`` and extract data shaped like ``schema``.

        The model receives a screenshot and the first 10000 characters of
        the page HTML.

        Raises:
            json.JSONDecodeError: If the model's reply contains no JSON.
        """
        self.agent.page.goto(url)
        self.agent.page.wait_for_load_state("networkidle")
        screenshot = self.agent._capture_screenshot()
        html = self.agent.page.content()
        prompt = (
            "\n"
            "Extract data from this page matching this schema:\n"
            f"{json.dumps(schema, indent=2)}\n"
            "HTML:\n"
            "```html\n"
            f"{html[:10000]}\n"
            "```\n"
            "Return the extracted data as JSON matching the schema.\n"
            "Include null for fields that cannot be found.\n"
        )
        return self._ask_model(prompt, 4096, screenshot)

    def extract_table_data(self, url: str, table_description: str) -> list:
        """Open ``url`` and extract the table matching ``table_description``.

        Only the first 15000 characters of the page HTML are sent; no
        screenshot is used.

        Raises:
            json.JSONDecodeError: If the model's reply contains no JSON.
        """
        self.agent.page.goto(url)
        html = self.agent.page.content()
        prompt = (
            "\n"
            f"Find and extract the table matching this description: {table_description}\n"
            "HTML:\n"
            "```html\n"
            f"{html[:15000]}\n"
            "```\n"
            "Return JSON array of objects, one per table row.\n"
            "Use column headers as keys.\n"
        )
        return self._ask_model(prompt, 4096)

    def extract_with_pagination(self, url: str, schema: dict,
                                max_pages: int = 10) -> list:
        """Extract items matching ``schema`` from up to ``max_pages`` pages.

        Starts at ``url`` and follows the "next page" control the model
        identifies, stopping early when none is found or it cannot be clicked.
        """
        all_data = []
        self.agent.page.goto(url)
        for _ in range(max_pages):
            all_data.extend(self._extract_current_page(schema))
            if not self._navigate_to_next_page():
                break
        return all_data

    def _extract_current_page(self, schema: dict) -> list:
        """Extract items matching ``schema`` from a screenshot of the current page."""
        screenshot = self.agent._capture_screenshot()
        prompt = (
            "\n"
            "Extract all items matching this schema:\n"
            f"{json.dumps(schema, indent=2)}\n"
            "Return JSON array of matching items.\n"
        )
        return self._as_list(self._ask_model(prompt, 4096, screenshot))

    def _navigate_to_next_page(self) -> bool:
        """Click the "next page" control if the model finds one.

        Returns:
            True when a control was clicked and the page settled; False when
            the model reports no next page, gives no selector, or the click
            or the wait raised.
        """
        screenshot = self.agent._capture_screenshot()
        prompt = (
            "\n"
            'Is there a "Next" or pagination button to go to the next page?\n'
            'Return JSON: {"has_next": boolean, "selector": "css selector or null"}\n'
        )
        result = self._ask_model(prompt, 512, screenshot)
        if not isinstance(result, dict):
            return False
        selector = result.get("selector")
        if not (result.get("has_next") and selector):
            return False
        try:
            self.agent.page.click(selector)
            self.agent.page.wait_for_load_state("networkidle")
        except Exception as e:
            # Best-effort: a stale or wrong selector simply ends pagination,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"Pagination stopped: {e}")
            return False
        return True
Authentication Handling
class AIAuthHandler:
    """Log into websites by letting an AI model locate the login form fields.

    Only the *names* of the available credentials are sent to the model; the
    credential values are typed straight into the page.
    """

    # Model used for every request this handler makes.
    MODEL = "claude-3-5-sonnet-20241022"

    def __init__(self, browser_agent: "AIBrowserAgent"):
        """Keep a reference to the agent whose page is used for the login."""
        self.agent = browser_agent

    @staticmethod
    def _parse_json(text: str):
        """Parse JSON from a model reply, tolerating fences and extra prose.

        The reply is first parsed as-is. If that fails, the outermost
        ``{...}`` and ``[...]`` spans are tried in order of appearance.

        Raises:
            json.JSONDecodeError: The error from the first attempt, when no
                span parses either.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError as original_error:
            candidates = []
            for opener, closer in (("{", "}"), ("[", "]")):
                start = text.find(opener)
                end = text.rfind(closer)
                if start != -1 and end > start:
                    candidates.append((start, text[start:end + 1]))
            for _, snippet in sorted(candidates):
                try:
                    return json.loads(snippet)
                except json.JSONDecodeError:
                    continue
            raise original_error

    @staticmethod
    def _credential_for(login_info: dict, credentials: dict, field: str):
        """Return the credential value to type into ``field``.

        ``login_info["credential_mapping"]`` may rename the field to a key
        of ``credentials``; when the mapping is absent, null, or has no
        entry for ``field``, the field name itself is used as the key.

        Raises:
            KeyError: If the resolved key is not present in ``credentials``.
        """
        mapping = login_info.get("credential_mapping") or {}
        return credentials[mapping.get(field, field)]

    def _ask_model(self, screenshot: str, prompt: str, max_tokens: int):
        """Send a screenshot plus a text prompt and return the parsed JSON reply."""
        response = client.messages.create(
            model=self.MODEL,
            max_tokens=max_tokens,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": screenshot,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }
            ],
        )
        return self._parse_json(response.content[0].text)

    def login(self, url: str, credentials: dict) -> bool:
        """Open ``url``, fill in the login form and submit it.

        Args:
            url: Address of the login page.
            credentials: Credential values keyed by name, e.g.
                ``{"username": ..., "password": ...}``.

        Returns:
            True when the model judges the login successful; False when the
            form analysis is unreadable, a step raises, or the check fails.
        """
        self.agent.page.goto(url)
        screenshot = self.agent._capture_screenshot()
        prompt = (
            "\n"
            "Analyze this login page.\n"
            f"Available credentials: {list(credentials.keys())}\n"
            "Return JSON with login steps:\n"
            "{\n"
            '"username_selector": "css selector",\n'
            '"password_selector": "css selector",\n'
            '"submit_selector": "css selector",\n'
            '"credential_mapping": {"field": "credential_key"}\n'
            "}\n"
        )
        try:
            login_info = self._ask_model(screenshot, prompt, 1024)
        except json.JSONDecodeError as e:
            # Keep the bool contract: an unreadable analysis is a failed login.
            print(f"Login failed: {e}")
            return False

        try:
            page = self.agent.page
            page.fill(
                login_info["username_selector"],
                self._credential_for(login_info, credentials, "username"),
            )
            page.fill(
                login_info["password_selector"],
                self._credential_for(login_info, credentials, "password"),
            )
            page.click(login_info["submit_selector"])
            page.wait_for_load_state("networkidle")
            return self._verify_login_success()
        except Exception as e:
            print(f"Login failed: {e}")
            return False

    def _verify_login_success(self) -> bool:
        """Ask the model whether the current page shows a successful login.

        Returns:
            True only when the reply is an object whose ``success`` field is
            the JSON value ``true``.

        Raises:
            json.JSONDecodeError: If the model's reply contains no JSON.
        """
        screenshot = self.agent._capture_screenshot()
        prompt = (
            "\n"
            "Was the login successful?\n"
            'Return JSON: {"success": boolean, "reason": "explanation"}\n'
        )
        result = self._ask_model(screenshot, prompt, 256)
        return isinstance(result, dict) and result.get("success") is True
AI-powered browser automation understands context, adapts to changes, and can handle complex scenarios that break traditional automation scripts.