Thinking Tokens: Understanding o1's Hidden Reasoning Process
One of the most fascinating aspects of o1 is its use of “thinking tokens” - internal reasoning that happens before generating the visible response. Let’s explore how this works and what it means for your applications.
What Are Thinking Tokens?
When o1 processes a request, it generates two types of tokens:
- Thinking tokens: Internal reasoning (not shown to users)
- Output tokens: The visible response
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="o1-preview",
    messages=[{
        "role": "user",
        "content": "What's the probability of rolling exactly two 6s in five dice rolls?"
    }],
    max_completion_tokens=4096
)

# Analyze token usage
usage = response.usage
print(f"Input tokens: {usage.prompt_tokens}")
print(f"Output tokens: {usage.completion_tokens}")
print(f"Reasoning tokens: {usage.completion_tokens_details.reasoning_tokens}")

# Reasoning tokens are included in completion_tokens and billed at the
# output-token rate, even though they are never shown to the user
visible_output_tokens = usage.completion_tokens - usage.completion_tokens_details.reasoning_tokens
print(f"Visible output tokens: {visible_output_tokens}")
The Thinking Process Visualized
def visualize_thinking_ratio(response) -> dict:
    """
    Visualize how much thinking went into a response
    """
    usage = response.usage
    reasoning = usage.completion_tokens_details.reasoning_tokens
    total_completion = usage.completion_tokens
    visible = total_completion - reasoning
    thinking_ratio = reasoning / total_completion if total_completion > 0 else 0

    # Create a visual bar
    bar_length = 50
    thinking_bar = int(bar_length * thinking_ratio)
    output_bar = bar_length - thinking_bar
    visual = f"[{'#' * thinking_bar}{'.' * output_bar}]"

    return {
        "reasoning_tokens": reasoning,
        "visible_tokens": visible,
        "thinking_ratio": f"{thinking_ratio:.1%}",
        "visualization": visual,
        "interpretation": interpret_ratio(thinking_ratio)
    }

def interpret_ratio(ratio: float) -> str:
    if ratio > 0.9:
        return "Heavy reasoning - complex problem"
    elif ratio > 0.7:
        return "Significant reasoning - moderate complexity"
    elif ratio > 0.5:
        return "Balanced reasoning and output"
    elif ratio > 0.3:
        return "Light reasoning - straightforward problem"
    else:
        return "Minimal reasoning - simple task"
Thinking Tokens and Cost
def calculate_o1_cost(input_tokens: int,
                      reasoning_tokens: int,
                      output_tokens: int) -> dict:
    """
    Calculate the full cost breakdown for o1 requests
    """
    # o1-preview pricing per million tokens
    INPUT_RATE = 15.00
    OUTPUT_RATE = 60.00  # Both reasoning and output tokens bill at this rate

    input_cost = (input_tokens / 1_000_000) * INPUT_RATE
    reasoning_cost = (reasoning_tokens / 1_000_000) * OUTPUT_RATE
    output_cost = (output_tokens / 1_000_000) * OUTPUT_RATE
    total = input_cost + reasoning_cost + output_cost

    return {
        "input_cost": f"${input_cost:.4f}",
        "reasoning_cost": f"${reasoning_cost:.4f}",
        "output_cost": f"${output_cost:.4f}",
        "total_cost": f"${total:.4f}",
        "reasoning_percentage": f"{(reasoning_cost / total) * 100:.1f}%" if total > 0 else "0%"
    }
# Example: A complex coding problem
# Input: 500 tokens, Reasoning: 3000 tokens, Output: 1000 tokens
cost = calculate_o1_cost(500, 3000, 1000)
print(cost)
# Reasoning often dominates the cost!
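Working that example through by hand shows why: at the rates hard-coded above, the breakdown comes out to roughly the following.

# input:      500 / 1M x $15 = $0.0075
# reasoning: 3000 / 1M x $60 = $0.1800
# output:    1000 / 1M x $60 = $0.0600
# total:                       $0.2475  (about 73% of the bill is hidden reasoning)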
Controlling Thinking with max_completion_tokens
def optimize_thinking_budget(problem: str, budget: str = "standard") -> dict:
    """
    Adjust thinking budget based on problem complexity
    """
    budgets = {
        "minimal": 2048,    # Quick answers
        "standard": 4096,   # Normal problems
        "extended": 8192,   # Complex reasoning
        "maximum": 16384    # Very complex problems
    }
    max_tokens = budgets.get(budget, 4096)

    response = client.chat.completions.create(
        model="o1-preview",
        messages=[{"role": "user", "content": problem}],
        max_completion_tokens=max_tokens
    )

    return {
        "budget": budget,
        "max_tokens": max_tokens,
        "used_tokens": response.usage.completion_tokens,
        "reasoning_tokens": response.usage.completion_tokens_details.reasoning_tokens,
        "response": response.choices[0].message.content
    }
# For simple problems, limit the thinking budget
simple_result = optimize_thinking_budget(
    "What's 15% of 200?",
    budget="minimal"
)

# For complex problems, allow extended thinking
complex_result = optimize_thinking_budget(
    "Design a distributed consensus algorithm for a 1000-node cluster",
    budget="maximum"
)
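One caveat: if the budget is too tight, the model can spend all of it on reasoning and return an empty visible answer (the choice's finish_reason typically comes back as "length"). A minimal guard along these lines can catch that; the retry-with-a-bigger-budget policy here is just an illustration, not a recommendation.

# Illustrative guard: an empty visible answer means the whole budget
# went to reasoning and the request was effectively wasted
result = optimize_thinking_budget("What's 15% of 200?", budget="minimal")
if not result["response"]:
    # Hypothetical recovery policy: retry once with a larger budget
    result = optimize_thinking_budget("What's 15% of 200?", budget="standard")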
Monitoring Thinking Patterns
import statistics
from typing import List, Optional

class ThinkingAnalyzer:
    """Track and analyze thinking patterns across requests"""

    def __init__(self):
        self.history: List[dict] = []

    def record(self, response, task_type: str):
        usage = response.usage
        reasoning = usage.completion_tokens_details.reasoning_tokens
        total = usage.completion_tokens
        self.history.append({
            "task_type": task_type,
            "reasoning_tokens": reasoning,
            "total_tokens": total,
            "ratio": reasoning / total if total > 0 else 0
        })

    def get_stats(self, task_type: Optional[str] = None) -> dict:
        data = self.history
        if task_type:
            data = [h for h in self.history if h["task_type"] == task_type]
        if not data:
            return {"error": "No data available"}

        ratios = [d["ratio"] for d in data]
        reasoning_tokens = [d["reasoning_tokens"] for d in data]
        return {
            "count": len(data),
            "avg_reasoning_ratio": statistics.mean(ratios),
            "avg_reasoning_tokens": statistics.mean(reasoning_tokens),
            "max_reasoning_tokens": max(reasoning_tokens),
            "min_reasoning_tokens": min(reasoning_tokens)
        }
# Usage
analyzer = ThinkingAnalyzer()
# Record responses over time...
# analyzer.record(response, "code_review")
# stats = analyzer.get_stats("code_review")
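Once you have enough recorded calls per task type, the stats can feed a simple routing decision. A sketch of that idea follows; the 0.3 threshold and the gpt-4o fallback are illustrative assumptions, not recommendations.

def pick_model(analyzer: ThinkingAnalyzer, task_type: str) -> str:
    # Hypothetical routing rule: task types that historically need little
    # reasoning can often be served by a cheaper non-reasoning model
    stats = analyzer.get_stats(task_type)
    if "error" in stats or stats["avg_reasoning_ratio"] > 0.3:
        return "o1-preview"
    return "gpt-4o"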
Key Takeaways
- Thinking tokens are the real cost driver - More complex problems = more thinking = higher cost
- You can’t see the thinking - Only the final output is visible
- Budget wisely - Use max_completion_tokens to control costs
- Monitor patterns - Track thinking ratios to optimize model selection
Understanding thinking tokens helps you make informed decisions about when o1’s extended reasoning is worth the investment.