Back to Blog
3 min read

Token Estimation: Predicting API Costs

Accurate token estimation helps manage costs and rate limits. Today we explore techniques for predicting token usage.

Understanding Tokens

import tiktoken

# Tokens are sub-word pieces of text; in English, one token is roughly 4 characters
# Different models use different tokenizers, so the same text can yield different counts

def count_tokens(text, model="gpt-4"):
    """Return the number of tokens `text` encodes to for `model`.

    Args:
        text: String to tokenize.
        model: Model name used to look up the tiktoken encoding.

    Returns:
        Length of the token sequence produced by the encoding.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an
        # encoding; fall back to cl100k_base so unknown names still
        # get an estimate instead of a crash.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

# Examples: sample strings mapped to the token count expected for each
examples = {
    "Hello": 1,
    "Hello, world!": 4,
    "The quick brown fox": 4,
    "Supercalifragilisticexpialidocious": 9,
}

# Show the measured count next to the expected one so a mismatch is visible
# (the expected value was previously unpacked but never used).
for text, expected in examples.items():
    actual = count_tokens(text)
    status = "ok" if actual == expected else "MISMATCH"
    print(f"'{text}': {actual} tokens (expected {expected}, {status})")

Token Estimation Functions

import tiktoken

class TokenEstimator:
    """Count tokens with a model's tiktoken encoding and price them."""

    # Price per 1,000 tokens, keyed by model name (presumably USD; these
    # figures are hard-coded, so verify them against current pricing).
    _PRICING = {
        "gpt-4": {"input": 0.03, "output": 0.06},
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-3.5-turbo": {"input": 0.0015, "output": 0.002},
    }

    def __init__(self, model="gpt-4"):
        """Load the encoding for `model`, falling back to cl100k_base."""
        try:
            self.encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            # tiktoken raises KeyError for model names it cannot map.
            self.encoding = tiktoken.get_encoding("cl100k_base")

    def count_text(self, text):
        """Return the number of tokens in `text`."""
        return len(self.encoding.encode(text))

    def count_messages(self, messages):
        """Estimate tokens for chat messages.

        Args:
            messages: Iterable of dicts with optional "role", "content"
                and "name" keys.

        Returns:
            Estimated token total: a fixed overhead per message, the
            encoded role/content/name, plus 3 tokens of reply priming.
        """
        # Fixed framing overhead added for every message.
        tokens_per_message = 4

        total = 0
        for msg in messages:
            total += tokens_per_message
            total += self.count_text(msg.get("role", ""))
            # "content" may be present but None; count it as empty text
            # rather than passing None to the encoder.
            total += self.count_text(msg.get("content") or "")
            if "name" in msg:
                total += self.count_text(msg["name"]) + 1

        total += 3  # Reply priming
        return total

    def estimate_cost(self, input_tokens, output_tokens, model="gpt-4"):
        """Price a request from its token counts.

        Args:
            input_tokens: Number of prompt tokens.
            output_tokens: Number of completion tokens.
            model: Pricing key. Defaults to "gpt-4" regardless of the model
                this estimator was constructed with; unknown names are also
                priced at the "gpt-4" rates.

        Returns:
            Dict with "input_cost", "output_cost" and "total".
        """
        rates = self._PRICING.get(model, self._PRICING["gpt-4"])
        input_cost = (input_tokens / 1000) * rates["input"]
        output_cost = (output_tokens / 1000) * rates["output"]

        return {"input_cost": input_cost, "output_cost": output_cost, "total": input_cost + output_cost}

Estimating Output Tokens

def estimate_output_tokens(task_type, input_tokens):
    """Return a min/max/expected output-token estimate for a task type.

    Unrecognized task types use the (50, 200) range. The "expected" value
    is the midpoint of the range; all three values are truncated to int.
    """
    # Summarization is the only range that scales with the input size.
    summary_low = input_tokens * 0.2
    summary_high = input_tokens * 0.4

    fallback_range = (50, 200)
    ranges_by_task = {
        "classification": (5, 20),
        "qa": (50, 200),
        "summarization": (summary_low, summary_high),
        "generation": (100, 500),
        "code": (50, 300),
        "chat": (50, 200),
    }

    low, high = ranges_by_task.get(task_type, fallback_range)
    midpoint = (low + high) / 2
    return {"min": int(low), "max": int(high), "expected": int(midpoint)}

# Example: estimate the output size for summarizing a synthetic long input
task = "summarization"
input_text = 100 * "Long article..."
estimator = TokenEstimator()
input_tokens = estimator.count_text(input_text)
output_estimate = estimate_output_tokens(task, input_tokens)
print(f"Input: {input_tokens}, Expected output: {output_estimate['expected']}")

Cost Calculator

class CostCalculator:
    """Estimate per-request cost for one model and keep a usage history."""

    # Output-token budget assumed when the caller gives no max_tokens.
    DEFAULT_MAX_TOKENS = 500

    def __init__(self, model="gpt-4"):
        """Create a calculator bound to `model` with an empty history."""
        self.model = model
        self.estimator = TokenEstimator(model)
        self.history = []

    def estimate_request(self, messages, max_tokens=None):
        """Estimate tokens and cost for a single chat request.

        Args:
            messages: Chat messages passed to the estimator.
            max_tokens: Output-token cap used as the output estimate;
                None means use DEFAULT_MAX_TOKENS.

        Returns:
            Dict with "input_tokens", "output_tokens", "total_tokens" and
            "estimated_cost" (the estimator's cost dict).
        """
        input_tokens = self.estimator.count_messages(messages)
        # Only a missing cap falls back to the default; an explicit 0 is
        # honored instead of being silently replaced by 500.
        output_tokens = self.DEFAULT_MAX_TOKENS if max_tokens is None else max_tokens

        return {
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": input_tokens + output_tokens,
            "estimated_cost": self.estimator.estimate_cost(input_tokens, output_tokens, self.model)
        }

    def record_actual(self, input_tokens, output_tokens, cost):
        """Append one request's actual token counts and cost to the history."""
        self.history.append({
            "input": input_tokens,
            "output": output_tokens,
            "cost": cost
        })

    def get_stats(self):
        """Summarize the recorded history.

        Returns:
            None when nothing has been recorded; otherwise a dict of
            totals and per-request averages.
        """
        if not self.history:
            return None

        # Compute each total once and reuse it for the averages.
        request_count = len(self.history)
        total_input = sum(h["input"] for h in self.history)
        total_output = sum(h["output"] for h in self.history)

        return {
            "total_requests": request_count,
            "total_input_tokens": total_input,
            "total_output_tokens": total_output,
            "total_cost": sum(h["cost"] for h in self.history),
            "avg_input": total_input / request_count,
            "avg_output": total_output / request_count
        }

Batch Cost Estimation

def estimate_batch_cost(requests, model="gpt-4"):
    """Estimate cost for a batch of requests.

    Args:
        requests: Iterable of dicts, each with a "messages" list and an
            optional "max_tokens" output cap.
        model: Model name used for both tokenization and pricing.

    Returns:
        Dict with "request_count", "total_input_tokens",
        "total_output_tokens" and "estimated_cost" (the total cost).
    """
    estimator = TokenEstimator(model)
    total_input = 0
    total_output = 0

    for req in requests:
        total_input += estimator.count_messages(req["messages"])

        # A missing or explicitly-None cap uses the 500-token default; a
        # None value would otherwise raise TypeError when added below.
        max_tokens = req.get("max_tokens")
        total_output += 500 if max_tokens is None else max_tokens

    cost = estimator.estimate_cost(total_input, total_output, model)

    return {
        "request_count": len(requests),
        "total_input_tokens": total_input,
        "total_output_tokens": total_output,
        "estimated_cost": cost["total"]
    }

# Example: two single-message user requests with different output caps
requests = [
    {"messages": [{"role": "user", "content": prompt}], "max_tokens": cap}
    for prompt, cap in (("Hello", 100), ("Explain quantum computing", 500))
]

estimate = estimate_batch_cost(requests)
print(f"Batch cost estimate: ${estimate['estimated_cost']:.4f}")

Token Budget Management

class TokenBudget:
    """Track token usage against a daily limit that resets on a new date."""

    def __init__(self, daily_limit, alert_threshold=0.8):
        """Start a fresh budget.

        Args:
            daily_limit: Maximum tokens allowed per day.
            alert_threshold: Fraction of the limit above which
                check_budget reports that the limit is being approached.
        """
        self.daily_limit = daily_limit
        self.alert_threshold = alert_threshold
        self.used_today = 0
        self.reset_date = self._today()

    @staticmethod
    def _today():
        """Return the current local date."""
        # Imported here so the class works even when the surrounding
        # module has no `from datetime import datetime`.
        from datetime import datetime

        return datetime.now().date()

    def check_budget(self, estimated_tokens):
        """Check whether `estimated_tokens` fits in today's budget.

        Returns:
            (allowed, message): (False, "Budget exceeded") when the request
            would pass the limit, (True, "Approaching budget limit") when
            it would pass the alert threshold, otherwise (True, "OK").
        """
        self._maybe_reset()

        projected = self.used_today + estimated_tokens
        if projected > self.daily_limit:
            return False, "Budget exceeded"

        # Skip the ratio when the limit is 0 to avoid ZeroDivisionError.
        if self.daily_limit > 0 and projected / self.daily_limit > self.alert_threshold:
            return True, "Approaching budget limit"

        return True, "OK"

    def record_usage(self, actual_tokens):
        """Add `actual_tokens` to today's usage."""
        # Roll over first so usage on a new day is not added to the old
        # day's total and then wiped by the next reset.
        self._maybe_reset()
        self.used_today += actual_tokens

    def remaining(self):
        """Return the tokens left in today's budget."""
        self._maybe_reset()
        return self.daily_limit - self.used_today

    def _maybe_reset(self):
        """Zero the usage counter when the date has advanced."""
        # Read the clock once so the comparison and the stored date agree.
        today = self._today()
        if today > self.reset_date:
            self.used_today = 0
            self.reset_date = today

Tomorrow we’ll explore prompt compression techniques.

Resources

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.