
Context Window Management: Maximizing LLM Effectiveness

Effective context window management is crucial for LLM applications: every token spent on stale history or low-relevance documents is a token the model cannot spend on the task at hand. Let's explore strategies for budgeting, trimming, compressing, and windowing context.

Context Management Strategies

from openai import AzureOpenAI
import tiktoken

class ContextManager:
    def __init__(self, openai_client: AzureOpenAI, max_tokens: int = 128000):
        self.openai = openai_client
        self.max_tokens = max_tokens
        # gpt-4o uses the o200k_base encoding in tiktoken.
        self.encoder = tiktoken.encoding_for_model("gpt-4o")

    def count_tokens(self, text: str) -> int:
        """Count tokens in text."""
        return len(self.encoder.encode(text))

    def budget_context(self, system: str, history: list, retrieved: list, reserve: int = 4000) -> dict:
        """Budget tokens across context components."""
        system_tokens = self.count_tokens(system)
        # Clamp at zero so an oversized system prompt can't yield a negative budget.
        available = max(0, self.max_tokens - system_tokens - reserve)

        # Cap history at ~30% of the budget; the rest goes to retrieved documents.
        history_budget = min(int(available * 0.3), 16000)
        retrieved_budget = available - history_budget

        return {
            "system": system,
            "history": self.trim_history(history, history_budget),
            "retrieved": self.trim_retrieved(retrieved, retrieved_budget)
        }

    def trim_history(self, history: list, max_tokens: int) -> list:
        """Keep recent history within budget."""
        result = []
        total = 0

        # Walk backwards so the most recent messages are kept first.
        for msg in reversed(history):
            # str(msg) gives an approximate count for dict-shaped messages.
            msg_tokens = self.count_tokens(str(msg))
            if total + msg_tokens > max_tokens:
                break
            result.insert(0, msg)
            total += msg_tokens

        return result

    def trim_retrieved(self, documents: list, max_tokens: int) -> list:
        """Keep top documents within budget."""
        result = []
        total = 0

        for doc in documents:
            doc_tokens = self.count_tokens(doc)
            if total + doc_tokens > max_tokens:
                # Try to fit a truncated version of the next document.
                remaining = max_tokens - total
                if remaining > 500:
                    result.append(self.truncate_doc(doc, remaining))
                break
            result.append(doc)
            total += doc_tokens

        return result

    def truncate_doc(self, doc: str, max_tokens: int) -> str:
        """Truncate a document to a token budget on a token boundary."""
        tokens = self.encoder.encode(doc)
        return self.encoder.decode(tokens[:max_tokens])

    def compress_context(self, context: str, target_tokens: int) -> str:
        """Use the LLM to compress context while preserving key information."""
        # Note: with Azure OpenAI, `model` refers to your deployment name.
        response = self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                "content": f"Compress this text to roughly {target_tokens} tokens. Preserve key facts, names, and numbers."
            }, {
                "role": "user",
                "content": context
            }]
        )
        return response.choices[0].message.content

    def sliding_window(self, text: str, window_size: int, overlap: int) -> list:
        """Process long text in overlapping token windows."""
        if overlap >= window_size:
            raise ValueError("overlap must be smaller than window_size")

        tokens = self.encoder.encode(text)
        windows = []

        # Step by (window_size - overlap) so consecutive windows share context.
        for i in range(0, len(tokens), window_size - overlap):
            windows.append(self.encoder.decode(tokens[i:i + window_size]))

        return windows

Together, budgeting, trimming, compression, and sliding windows let an application stay within the model's context limit while keeping the most relevant information in front of it.
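To put the class to work, here is a minimal usage sketch. The endpoint, API key, and deployment name are placeholders, assuming a standard Azure OpenAI resource with a gpt-4o deployment.

from openai import AzureOpenAI

# Placeholder credentials - substitute your own Azure OpenAI resource.
client = AzureOpenAI(
    azure_endpoint="https://my-resource.openai.azure.com",
    api_key="<your-api-key>",
    api_version="2024-06-01",
)

manager = ContextManager(client, max_tokens=128000)

# Budget a prompt across system, history, and retrieved documents.
budget = manager.budget_context(
    system="You are a concise assistant.",
    history=[
        {"role": "user", "content": "What did we decide about the schema?"},
        {"role": "assistant", "content": "We agreed to normalise the events table."},
    ],
    retrieved=["Design doc excerpt...", "Meeting notes excerpt..."],
)

# Chunk a long report into overlapping 1,000-token windows.
with open("report.txt") as f:
    windows = manager.sliding_window(f.read(), window_size=1000, overlap=100)

print(len(budget["retrieved"]), "docs kept,", len(windows), "windows")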

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.