8 min read
Token Optimization Strategies for GPT-4
At GPT-4 prices, every token counts. Effective token optimization can reduce costs by 50% or more without sacrificing quality. Here’s how to do it systematically.
Understanding Token Costs
GPT-4 pricing per 1K tokens:
- GPT-4 8K: $0.03 input, $0.06 output
- GPT-4 32K: $0.06 input, $0.12 output
Output tokens cost twice as much as input tokens, which shifts the optimization calculus: trimming a verbose response saves more than trimming an equally long prompt.
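For example, a GPT-4 8K call with 2,000 input tokens and 500 output tokens costs 2,000 × $0.03/1K + 500 × $0.06/1K = $0.06 + $0.03 = $0.09: the output is a fifth of the tokens but a third of the cost.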
import tiktoken
class TokenCounter:
"""Count and estimate tokens."""
def __init__(self, model: str = "gpt-4"):
self.encoding = tiktoken.encoding_for_model(model)
self.model = model
def count(self, text: str) -> int:
"""Count tokens in text."""
return len(self.encoding.encode(text))
def count_messages(self, messages: list[dict]) -> int:
"""Count tokens in message list."""
total = 0
for message in messages:
            total += 3  # Per-message overhead (approximate; the exact constant varies by model version)
for key, value in message.items():
total += self.count(str(value))
        total += 3  # Assistant reply priming
return total
def estimate_cost(
self,
input_tokens: int,
estimated_output_tokens: int
) -> float:
"""Estimate request cost."""
if "32k" in self.model:
input_cost = input_tokens * 0.06 / 1000
output_cost = estimated_output_tokens * 0.12 / 1000
else:
input_cost = input_tokens * 0.03 / 1000
output_cost = estimated_output_tokens * 0.06 / 1000
return input_cost + output_cost
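A minimal usage sketch (the message contents and the 150-token output estimate are just illustrative):

counter = TokenCounter()
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Summarize the attached report."},
]
input_tokens = counter.count_messages(messages)
cost = counter.estimate_cost(input_tokens, estimated_output_tokens=150)
print(f"~{input_tokens} input tokens, est. cost ${cost:.4f}")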
Strategy 1: Prompt Compression
Remove unnecessary words while preserving meaning:
class PromptCompressor:
"""Compress prompts to reduce tokens."""
def __init__(self):
# Phrases to remove/shorten
self.removable = [
"Please ", "Could you please ", "I would like you to ",
"Can you help me ", "I need you to ", "Would you mind ",
"I was wondering if you could ", "It would be great if you could ",
]
self.shortenings = {
"for example": "e.g.",
"in order to": "to",
"a lot of": "many",
"due to the fact that": "because",
"in the event that": "if",
"at this point in time": "now",
"in the near future": "soon",
}
def compress(self, prompt: str) -> str:
"""Compress prompt."""
result = prompt
# Remove filler phrases
for phrase in self.removable:
result = result.replace(phrase, "")
# Apply shortenings
for long, short in self.shortenings.items():
result = result.replace(long, short)
# Remove extra whitespace
result = ' '.join(result.split())
return result
def compress_with_metrics(self, prompt: str) -> dict:
"""Compress and report savings."""
counter = TokenCounter()
original_tokens = counter.count(prompt)
compressed = self.compress(prompt)
compressed_tokens = counter.count(compressed)
savings = original_tokens - compressed_tokens
savings_pct = (savings / original_tokens * 100) if original_tokens > 0 else 0
return {
"compressed": compressed,
"original_tokens": original_tokens,
"compressed_tokens": compressed_tokens,
"tokens_saved": savings,
"savings_percent": round(savings_pct, 1),
"cost_saved": savings * 0.03 / 1000 # GPT-4 input cost
}
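Here's what the compressor does to a typical over-polite prompt (the sample prompt is made up; the output follows from the rules above):

compressor = PromptCompressor()
result = compressor.compress_with_metrics(
    "Could you please summarize this report, for example in order to highlight the key risks?"
)
print(result["compressed"])
# -> "summarize this report, e.g. to highlight the key risks?"
print(f"{result['savings_percent']}% of tokens removed")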
Strategy 2: Context Pruning
Include only relevant context:
import json

class ContextPruner:
"""Prune context to essential information."""
def __init__(self, client, token_budget: int = 4000):
self.client = client
self.token_budget = token_budget
self.counter = TokenCounter()
async def prune_context(
self,
query: str,
documents: list[dict],
) -> list[dict]:
"""Select most relevant documents within budget."""
# Score relevance using cheaper model
scored_docs = await self._score_relevance(query, documents)
# Sort by relevance
scored_docs.sort(key=lambda x: x["score"], reverse=True)
# Select within budget
selected = []
total_tokens = 0
for doc in scored_docs:
doc_tokens = self.counter.count(doc["content"])
if total_tokens + doc_tokens <= self.token_budget:
selected.append(doc)
total_tokens += doc_tokens
else:
break
return selected
async def _score_relevance(
self,
query: str,
documents: list[dict]
) -> list[dict]:
"""Score document relevance using GPT-3.5."""
# Use cheaper model for relevance scoring
docs_summary = "\n".join([
f"{i}: {d['content'][:200]}..."
for i, d in enumerate(documents)
])
prompt = f"""Rate relevance of each document to the query (0-10).
Query: {query}
Documents:
{docs_summary}
Return JSON: {{"scores": [score1, score2, ...]}}"""
response = await self.client.chat_completion(
model="gpt-35-turbo", # Use cheaper model
messages=[{"role": "user", "content": prompt}],
temperature=0
)
        try:
            scores = json.loads(response.content)["scores"]
            for i, doc in enumerate(documents):
                doc["score"] = scores[i] if i < len(scores) else 0
        except (json.JSONDecodeError, KeyError, TypeError):
            # Malformed response: fall back to a neutral score for every document
            for doc in documents:
                doc["score"] = 5
return documents
def truncate_to_budget(
self,
text: str,
budget: int,
strategy: str = "end"
) -> str:
"""Truncate text to fit token budget."""
tokens = self.counter.encoding.encode(text)
if len(tokens) <= budget:
return text
if strategy == "end":
# Keep beginning
truncated_tokens = tokens[:budget]
elif strategy == "start":
# Keep end
truncated_tokens = tokens[-budget:]
elif strategy == "middle":
# Keep beginning and end
half = budget // 2
truncated_tokens = tokens[:half] + tokens[-half:]
else:
truncated_tokens = tokens[:budget]
return self.counter.encoding.decode(truncated_tokens)
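The truncation helper works standalone, since it never touches the client; a quick sketch (the repeated text is just filler):

pruner = ContextPruner(client=None, token_budget=4000)
long_text = "word " * 5000
# "middle" keeps the first and last 128 tokens and drops the rest
snippet = pruner.truncate_to_budget(long_text, budget=256, strategy="middle")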
Strategy 3: Output Control
Since output tokens cost twice as much as input tokens, constraining output length is one of the highest-leverage optimizations:
class OutputController:
"""Control output length."""
def __init__(self, client):
self.client = client
async def generate_concise(
self,
prompt: str,
max_sentences: int = 3,
max_tokens: int = 200
) -> str:
"""Generate concise output."""
constrained_prompt = f"""{prompt}
Requirements:
- Maximum {max_sentences} sentences
- Be concise and direct
- No unnecessary preamble"""
response = await self.client.chat_completion(
model="gpt-4",
messages=[{"role": "user", "content": constrained_prompt}],
max_tokens=max_tokens
)
return response.content
async def generate_structured(
self,
prompt: str,
fields: list[str]
) -> dict:
"""Generate structured output for efficiency."""
fields_str = "\n".join([f"- {f}: <brief value>" for f in fields])
constrained_prompt = f"""{prompt}
Return ONLY these fields:
{fields_str}"""
response = await self.client.chat_completion(
model="gpt-4",
messages=[{"role": "user", "content": constrained_prompt}],
max_tokens=len(fields) * 50 # Estimate per field
)
return {"output": response.content}
def estimate_output_tokens(
self,
task_type: str,
input_length: int
) -> int:
"""Estimate output tokens by task type."""
estimates = {
"classification": 10,
"extraction": min(input_length // 4, 500),
"summarization": min(input_length // 5, 300),
"analysis": min(input_length // 2, 1000),
"generation": 500,
"code": 800,
}
return estimates.get(task_type, 200)
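One way to tie the estimator back to max_tokens is to cap each call at the task-type estimate; a sketch, assuming the same hypothetical async client as above and an async calling context:

controller = OutputController(client)
est = controller.estimate_output_tokens("summarization", input_length=2000)
# est == min(2000 // 5, 300) == 300
summary = await controller.generate_concise(
    "Summarize the incident report above.",
    max_sentences=3,
    max_tokens=est,
)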
Strategy 4: Caching and Deduplication
Avoid redundant API calls:
import hashlib
import json
from datetime import timedelta
class TokenSavingCache:
"""Cache to save tokens on repeated queries."""
def __init__(self, redis_client, ttl_hours: int = 24):
self.redis = redis_client
self.ttl = timedelta(hours=ttl_hours)
self.counter = TokenCounter()
self.stats = {"hits": 0, "misses": 0, "tokens_saved": 0}
def _cache_key(self, prompt: str, params: dict) -> str:
"""Generate cache key."""
content = json.dumps({"prompt": prompt, "params": params}, sort_keys=True)
return f"gpt4:{hashlib.sha256(content.encode()).hexdigest()}"
async def get_or_generate(
self,
prompt: str,
generate_fn,
params: dict = None
) -> dict:
"""Get from cache or generate."""
params = params or {}
key = self._cache_key(prompt, params)
# Check cache
cached = self.redis.get(key)
if cached:
self.stats["hits"] += 1
            input_tokens = self.counter.count(prompt)
            # Conservative: a hit also avoids paying for output tokens, not just these
            self.stats["tokens_saved"] += input_tokens
return {
"result": json.loads(cached),
"cached": True,
"tokens_saved": input_tokens
}
# Generate
self.stats["misses"] += 1
result = await generate_fn(prompt, **params)
# Cache result
self.redis.setex(key, self.ttl, json.dumps(result))
return {"result": result, "cached": False}
def get_savings_report(self) -> dict:
"""Report token savings from caching."""
total_requests = self.stats["hits"] + self.stats["misses"]
hit_rate = self.stats["hits"] / total_requests if total_requests > 0 else 0
return {
"cache_hits": self.stats["hits"],
"cache_misses": self.stats["misses"],
"hit_rate": round(hit_rate * 100, 1),
"tokens_saved": self.stats["tokens_saved"],
"cost_saved": self.stats["tokens_saved"] * 0.03 / 1000
}
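Wiring the cache in front of a model call might look like this; client and redis_client are assumed to exist as in the earlier examples:

async def generate_answer(prompt: str, **params):
    response = await client.chat_completion(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        **params,
    )
    return response.content

cache = TokenSavingCache(redis_client, ttl_hours=24)
outcome = await cache.get_or_generate("Explain vector databases.", generate_answer)
print(cache.get_savings_report())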
Strategy 5: Model Tiering
Use cheaper models when possible:
class ModelTiering:
"""Route to appropriate model tier."""
def __init__(self, clients: dict):
self.clients = clients # {"gpt35": ..., "gpt4": ..., "gpt4-32k": ...}
self.counter = TokenCounter()
    def _choose_model(
        self,
        prompt: str,
        task_type: str,
        quality_threshold: str = "normal"
    ) -> str:
        """Pick a model tier from task type, quality needs, and prompt length."""
        # Task-based routing
        gpt35_tasks = ["classification", "extraction", "simple_qa", "summarization"]
        gpt4_tasks = ["code_review", "analysis", "reasoning", "complex_qa"]
        # A "high" quality threshold forces GPT-4 even for simple tasks
        if task_type in gpt4_tasks or quality_threshold == "high":
            model = "gpt4"
        else:
            model = "gpt35"
        # Context-based upgrade
        input_tokens = self.counter.count(prompt)
        if input_tokens > 3500:
            model = "gpt4"
        if input_tokens > 7000:
            model = "gpt4-32k"
        return model

    async def smart_route(
        self,
        prompt: str,
        task_type: str,
        quality_threshold: str = "normal"
    ) -> dict:
        """Route to the appropriate model and run the request."""
        model = self._choose_model(prompt, task_type, quality_threshold)
        input_tokens = self.counter.count(prompt)
# Execute
response = await self.clients[model].chat_completion(
messages=[{"role": "user", "content": prompt}]
)
return {
"result": response.content,
"model_used": model,
"input_tokens": input_tokens
}
def estimate_savings(
self,
requests: list[dict]
) -> dict:
"""Estimate savings from tiering."""
gpt4_only_cost = 0
tiered_cost = 0
for req in requests:
input_tokens = self.counter.count(req["prompt"])
output_tokens = req.get("output_tokens", 200)
# All GPT-4 cost
gpt4_only_cost += (input_tokens * 0.03 + output_tokens * 0.06) / 1000
# Tiered cost
            model = self._choose_model(req["prompt"], req["task_type"])
if model == "gpt35":
tiered_cost += (input_tokens + output_tokens) * 0.002 / 1000
else:
tiered_cost += (input_tokens * 0.03 + output_tokens * 0.06) / 1000
return {
"gpt4_only_cost": round(gpt4_only_cost, 4),
"tiered_cost": round(tiered_cost, 4),
"savings": round(gpt4_only_cost - tiered_cost, 4),
"savings_percent": round((1 - tiered_cost / gpt4_only_cost) * 100, 1) if gpt4_only_cost > 0 else 0
}
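A small what-if run (the prompts and task types are made up; clients is the dict passed to __init__):

tiering = ModelTiering(clients)
sample_requests = [
    {"prompt": "Classify the sentiment: 'Great product!'", "task_type": "classification"},
    {"prompt": "Review this function for race conditions: ...", "task_type": "code_review"},
]
print(tiering.estimate_savings(sample_requests))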
Strategy 6: Batch Optimization
Combine related requests:
class BatchOptimizer:
    """Batch related requests for efficiency."""

    def __init__(self, client):
        self.client = client
async def batch_similar_tasks(
self,
tasks: list[dict],
max_batch_size: int = 10
) -> list[dict]:
"""Batch similar tasks into single requests."""
results = []
# Group by task type
grouped = {}
for task in tasks:
task_type = task.get("type", "default")
if task_type not in grouped:
grouped[task_type] = []
grouped[task_type].append(task)
# Process each group
for task_type, group_tasks in grouped.items():
# Batch within groups
for i in range(0, len(group_tasks), max_batch_size):
batch = group_tasks[i:i + max_batch_size]
batch_result = await self._process_batch(batch, task_type)
results.extend(batch_result)
return results
async def _process_batch(
self,
batch: list[dict],
task_type: str
) -> list[dict]:
"""Process a batch in single request."""
items_str = "\n".join([
f"{i+1}. {task['input']}"
for i, task in enumerate(batch)
])
prompt = f"""Process these {len(batch)} items.
Items:
{items_str}
For each item, provide the result in format:
1. <result>
2. <result>
..."""
response = await self.client.chat_completion(
model="gpt-4",
messages=[{"role": "user", "content": prompt}]
)
        # Parse numbered results back into per-task outputs
        return self._parse_batch_results(response.content, batch)

    def _parse_batch_results(
        self,
        content: str,
        batch: list[dict]
    ) -> list[dict]:
        """Map numbered output lines back to their tasks (minimal parser)."""
        results = [{"task": task, "result": None} for task in batch]
        for line in content.splitlines():
            number, sep, rest = line.strip().partition(".")
            # Expect lines like "3. <result>"
            if sep and number.isdigit():
                idx = int(number) - 1
                if 0 <= idx < len(results):
                    results[idx]["result"] = rest.strip()
        return results
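Usage, again assuming the async client from earlier and an async calling context (the tasks are illustrative):

optimizer = BatchOptimizer(client)
tasks = [
    {"type": "sentiment", "input": "The checkout flow is confusing."},
    {"type": "sentiment", "input": "Support resolved my issue quickly."},
]
results = await optimizer.batch_similar_tasks(tasks)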
Optimization Dashboard
class TokenOptimizationDashboard:
"""Track optimization metrics."""
def __init__(self):
self.metrics = {
"total_tokens": 0,
"saved_by_compression": 0,
"saved_by_caching": 0,
"saved_by_tiering": 0,
"total_cost": 0,
"optimized_cost": 0
}
    def record(self, original: int, optimized: int, method: str):
        """Record one optimization event."""
        savings = original - optimized
        self.metrics["total_tokens"] += original
        self.metrics[f"saved_by_{method}"] += savings
        # Track cost at the GPT-4 input rate, before and after optimization
        self.metrics["total_cost"] += original * 0.03 / 1000
        self.metrics["optimized_cost"] += optimized * 0.03 / 1000
def get_report(self) -> dict:
"""Get optimization report."""
total_saved = sum(v for k, v in self.metrics.items() if k.startswith("saved_by"))
return {
"total_tokens_processed": self.metrics["total_tokens"],
"total_tokens_saved": total_saved,
"savings_percent": round(total_saved / self.metrics["total_tokens"] * 100, 1) if self.metrics["total_tokens"] > 0 else 0,
"cost_saved": round(total_saved * 0.03 / 1000, 2),
"breakdown": {k: v for k, v in self.metrics.items() if k.startswith("saved_by")}
}
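Recording a couple of events shows how the report rolls up (the numbers are illustrative):

dashboard = TokenOptimizationDashboard()
dashboard.record(original=1200, optimized=900, method="compression")
dashboard.record(original=800, optimized=0, method="caching")  # full cache hit
print(dashboard.get_report())
# 1,100 of 2,000 tokens saved (55%)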
With systematic token optimization, you can reduce GPT-4 costs by 50-70% while maintaining quality. Track your savings and continuously refine your strategies.