April 5, 2023 1 min read

Copilot for Docs: Chat With Your Documentation

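Copilot for Docs enables conversational interaction with documentation. Instead of searching through pages, you ask questions and get contextual answers with relevant code examples. Under the hood this is Retrieval-Augmented Generation (RAG): the assistant retrieves the most relevant documentation sections and passes them to the model as context for its answer.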

How It Works

User Question
     ↓
[Embed Question]
     ↓
[Search Documentation Index]
     ↓
[Retrieve Relevant Sections]
     ↓
[Construct Prompt with Context]
     ↓
[Generate Answer with Citations]
     ↓
Contextual Response

Building a Docs Copilot

from dataclasses import dataclass
from typing import Optional

@dataclass
class DocSection:
    """One retrievable section of documentation."""

    content: str  # Section body text.
    title: str  # Section heading.
    url: str  # Link to the section.
    source: str  # Name of the documentation set the section belongs to.
    embedding: Optional[list[float]] = None  # Vector for the section, if present.

    @classmethod
    def from_record(cls, record: dict) -> "DocSection":
        """Build a section from a search-result mapping, ignoring unknown keys.

        Search results may carry keys that are not dataclass fields (for
        example an ``id`` or a score); passing those straight to the
        constructor would raise ``TypeError``. Missing required fields still
        raise ``TypeError``.
        """
        known = cls.__dataclass_fields__
        return cls(**{key: value for key, value in record.items() if key in known})


class DocsCopilot:
    """Conversational documentation assistant.

    For each question the assistant searches ``vector_store`` for relevant
    sections, builds a context string from them, and asks ``client`` to answer
    from that context. Previous turns are kept in ``conversation_history``.
    """

    # Number of trailing history messages replayed to the model (3 exchanges).
    HISTORY_WINDOW = 6

    def __init__(self, client, vector_store):
        self.client = client
        self.vector_store = vector_store
        self.conversation_history = []

    async def ask(
        self,
        question: str,
        doc_filter: Optional[str] = None,
        include_examples: bool = True
    ) -> dict:
        """Ask a question about the documentation.

        Args:
            question: The user's question.
            doc_filter: If given, restrict the search to this ``source`` value.
            include_examples: When False, fenced code blocks are removed from
                the context sent to the model.

        Returns:
            A dict with ``answer``, ``sources`` (title/url of every retrieved
            section) and ``confidence`` (``"medium"`` unless the generator
            supplies one).
        """
        relevant_docs = await self._search_docs(question, doc_filter)
        context = self._build_context(relevant_docs, include_examples)

        # Generate before touching the history: _generate_answer already
        # appends the current question (with its context) after the replayed
        # history, so recording it first would send the question twice. It
        # also keeps a failed call from leaving an unanswered user turn behind.
        response = await self._generate_answer(question, context)

        self.conversation_history.append({
            "role": "user",
            "content": question
        })
        self.conversation_history.append({
            "role": "assistant",
            "content": response["answer"]
        })

        return {
            "answer": response["answer"],
            "sources": [{"title": d.title, "url": d.url} for d in relevant_docs],
            "confidence": response.get("confidence", "medium")
        }

    async def _search_docs(
        self,
        query: str,
        filter_source: Optional[str] = None
    ) -> list[DocSection]:
        """Return up to five sections matching ``query``.

        ``filter_source`` is passed to the store as a ``source`` filter when
        it is truthy; otherwise an empty filter dict is sent.
        """
        filters = {}
        if filter_source:
            filters["source"] = filter_source

        results = await self.vector_store.search(
            query=query,
            k=5,
            filters=filters
        )

        return [DocSection.from_record(r) for r in results]

    def _build_context(
        self,
        docs: list[DocSection],
        include_examples: bool
    ) -> str:
        """Render the retrieved sections as one prompt-ready string.

        Each section becomes a ``###`` heading, its source URL and its body;
        sections are separated by a horizontal rule. When ``include_examples``
        is False, fenced code blocks are stripped from each body.
        """
        import re  # Local import: the block does not rely on a file-level one.

        context_parts = []
        for doc in docs:
            body = doc.content
            if not include_examples:
                body = re.sub(r"```.*?```", "", body, flags=re.DOTALL).strip()
            context_parts.append(f"### {doc.title}\nSource: {doc.url}\n\n{body}")

        return "\n\n---\n\n".join(context_parts)

    async def _generate_answer(
        self,
        question: str,
        context: str
    ) -> dict:
        """Ask the chat model to answer ``question`` from ``context``.

        The request is: system prompt, the last ``HISTORY_WINDOW`` history
        messages, then the current question together with its context.

        Returns:
            ``{"answer": <model reply text>}``.
        """

        system_prompt = """You are a documentation assistant. Answer questions based ONLY on the provided documentation context.

Guidelines:
- If the answer is in the docs, provide it with relevant code examples
- If the answer is NOT in the docs, say "I couldn't find this in the documentation"
- Always cite which documentation section your answer comes from
- Include code examples when relevant
- Be concise but complete"""

        messages = [
            {"role": "system", "content": system_prompt},
            *self.conversation_history[-self.HISTORY_WINDOW:],
            {"role": "user", "content": f"Documentation Context:\n{context}\n\nQuestion: {question}"}
        ]

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=messages
        )

        return {"answer": response.content}

    def clear_history(self):
        """Clear conversation history."""
        self.conversation_history = []

Indexing Documentation

import os
import re
from pathlib import Path

class DocsIndexer:
    """Index documentation for semantic search.

    Markdown files are split into heading-delimited sections; each section is
    embedded through ``client`` and written to ``vector_store``.
    """

    def __init__(self, client, vector_store):
        self.client = client
        self.vector_store = vector_store

    async def index_markdown_docs(
        self,
        docs_path: str,
        source_name: str
    ) -> dict:
        """Index every ``*.md`` file found recursively under ``docs_path``.

        Args:
            docs_path: Root directory of the documentation.
            source_name: Label stored as each record's ``source`` and used as
                the prefix of its id and URL.

        Returns:
            ``{"indexed_sections": int, "errors": [{"file", "error"}, ...]}``.
            A failure in one file is recorded and the remaining files are
            still processed.
        """
        root = Path(docs_path)
        indexed = 0
        errors = []

        # Sorted so runs visit files in a stable order.
        for md_file in sorted(root.rglob("*.md")):
            try:
                # Key on the path relative to the docs root rather than the
                # bare stem: with a recursive walk, files such as a/index.md
                # and b/index.md would otherwise share ids and overwrite each
                # other. For files directly under the root this equals the stem.
                doc_key = md_file.relative_to(root).with_suffix("").as_posix()

                for section in self._parse_markdown(md_file):
                    embedding = await self._get_embedding(section["content"])

                    await self.vector_store.upsert({
                        "id": f"{source_name}:{doc_key}:{section['title']}",
                        "content": section["content"],
                        "title": section["title"],
                        "url": self._generate_url(source_name, md_file, section["anchor"]),
                        "source": source_name,
                        "embedding": embedding
                    })

                    indexed += 1

            except Exception as e:
                # Deliberate best-effort: report the file and keep going.
                errors.append({"file": str(md_file), "error": str(e)})

        return {
            "indexed_sections": indexed,
            "errors": errors
        }

    def _parse_markdown(self, file_path: Path) -> list[dict]:
        """Split a markdown file into sections at level 1-3 headings.

        Text before the first heading goes into a section titled
        ``"Introduction"`` with an empty anchor. Sections whose body is blank
        are dropped. Lines inside fenced code blocks (``` or ~~~) are never
        treated as headings, so ``# comment`` lines in code samples stay in
        the section body.

        Returns:
            A list of ``{"title", "content", "anchor"}`` dicts in file order.
        """
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_path, encoding="utf-8") as f:
            content = f.read()

        header_re = re.compile(r'^(#{1,3})\s+(.+)$')
        sections = []

        def close(section):
            # Each source line contributes "line\n", matching split("\n").
            body = "".join(f"{line}\n" for line in section["lines"])
            if body.strip():
                sections.append({
                    "title": section["title"],
                    "content": body,
                    "anchor": section["anchor"]
                })

        current = {"title": "Introduction", "anchor": "", "lines": []}
        fence = None  # Marker ("```" or "~~~") of the open code fence, if any.

        for line in content.split("\n"):
            stripped = line.lstrip()
            header_match = None
            if fence is None:
                if stripped.startswith(("```", "~~~")):
                    fence = stripped[:3]
                else:
                    header_match = header_re.match(line)
            elif stripped.startswith(fence):
                fence = None

            if header_match:
                close(current)
                title = header_match.group(2)
                anchor = title.lower().replace(" ", "-").replace(".", "")
                current = {"title": title, "anchor": anchor, "lines": []}
            else:
                current["lines"].append(line)

        # Don't forget the last section.
        close(current)

        return sections

    async def _get_embedding(self, text: str) -> list[float]:
        """Return the embedding of ``text`` (only its first 8000 characters are sent)."""
        response = await self.client.embedding(
            model="text-embedding-ada-002",
            input=text[:8000]  # Truncate if needed
        )
        return response.embedding

    def _generate_url(
        self,
        source: str,
        file_path: Path,
        anchor: str
    ) -> str:
        """Generate the documentation URL for a section.

        Only the file stem is used, so sub-directories of the docs tree are
        not reflected in the URL; adjust this to match your docs hosting.
        """
        base_url = f"https://docs.example.com/{source}"
        return f"{base_url}/{file_path.stem}#{anchor}"

Multi-Source Documentation

class MultiSourceDocsCopilot:
    """Query across multiple documentation sources.

    Sources are registered with a description and a priority; a question is
    searched in each selected source and the combined hits are turned into a
    single answer.
    """

    def __init__(self, client, vector_store):
        self.client = client
        self.vector_store = vector_store
        self.sources = {}

    def register_source(
        self,
        name: str,
        description: str,
        priority: int = 1
    ):
        """Register a documentation source (re-registering a name replaces it)."""
        self.sources[name] = {
            "description": description,
            "priority": priority
        }

    async def ask_across_sources(
        self,
        question: str,
        sources: list[str] = None
    ) -> dict:
        """Ask a question across multiple doc sources.

        Args:
            question: The user's question.
            sources: Source names to search. When empty or None, the chat
                model is asked which registered sources are relevant.

        Returns:
            A dict with ``answer``, ``sources_used`` and ``references`` (the
            source/title of each of the up-to-five hits given to the model).
        """
        if not sources:
            sources = await self._identify_relevant_sources(question)

        all_results = []
        for source in sources:
            results = await self.vector_store.search(
                query=question,
                k=3,
                filters={"source": source}
            )
            for r in results:
                # Tag each hit so ranking and references know its origin.
                r["source"] = source
            all_results.extend(results)

        top = self._rank_results(all_results)[:5]

        context = self._build_multi_source_context(top)
        answer = await self._generate_unified_answer(question, context)

        return {
            "answer": answer,
            "sources_used": sources,
            "references": [{"source": r["source"], "title": r["title"]} for r in top]
        }

    async def _identify_relevant_sources(
        self,
        question: str
    ) -> list[str]:
        """Ask the chat model which registered sources fit ``question``.

        Returns the registered names the model picked, in the model's order.
        Falls back to every registered source when the reply is not a JSON
        array or names no registered source. Returns ``[]`` without calling
        the model when nothing is registered.
        """
        if not self.sources:
            return []

        sources_desc = "\n".join([
            f"- {name}: {info['description']}"
            for name, info in self.sources.items()
        ])

        prompt = f"""Which documentation sources would be relevant for this question?

Available sources:
{sources_desc}

Question: {question}

Return JSON array of source names: ["source1", "source2"]"""

        response = await self.client.chat_completion(
            model="gpt-35-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        import json
        everything = list(self.sources)
        try:
            parsed = json.loads(response.content)
        except (json.JSONDecodeError, TypeError):
            # Unparseable or missing reply: search everywhere.
            return everything

        if not isinstance(parsed, list):
            return everything

        # Keep only names that were actually registered; the model's output
        # is untrusted and may contain invented or repeated names.
        chosen = list(dict.fromkeys(
            name for name in parsed
            if isinstance(name, str) and name in self.sources
        ))
        return chosen or everything

    def _rank_results(self, results: list[dict]) -> list[dict]:
        """Deduplicate hits and order them by source priority.

        Hits with the same (source, title, url) are collapsed to the first
        occurrence. The sort is stable, so hits from one source keep the order
        the store returned them in.

        NOTE(review): assumes a larger ``priority`` means a more authoritative
        source and that unregistered sources rank as priority 1 (the
        ``register_source`` default) - confirm against intended semantics.
        """
        seen = set()
        unique = []
        for hit in results:
            key = (hit.get("source"), hit.get("title"), hit.get("url"))
            if key in seen:
                continue
            seen.add(key)
            unique.append(hit)

        def priority_of(hit):
            return self.sources.get(hit.get("source"), {}).get("priority", 1)

        return sorted(unique, key=priority_of, reverse=True)

    def _build_multi_source_context(self, results: list[dict]) -> str:
        """Render hits as prompt context, labelling each with its source."""
        parts = []
        for hit in results:
            header = f"### [{hit['source']}] {hit['title']}"
            url = hit.get("url")
            if url:
                header += f"\nSource: {url}"
            parts.append(f"{header}\n\n{hit.get('content', '')}")
        return "\n\n---\n\n".join(parts)

    async def _generate_unified_answer(self, question: str, context: str) -> str:
        """Ask the chat model for one answer drawn from all sources' context."""
        system_prompt = (
            "You are a documentation assistant. Answer questions based ONLY on "
            "the provided documentation context, which may come from several "
            "documentation sources. Cite the source and section you used. If "
            "the answer is not in the context, say \"I couldn't find this in "
            "the documentation\"."
        )
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Documentation Context:\n{context}\n\nQuestion: {question}"}
            ]
        )
        return response.content

Code Example Extraction

class CodeExampleExtractor:
    """Extract and enhance code examples from docs."""

    def __init__(self, client=None, vector_store=None):
        """Store the collaborators the methods read from ``self``.

        Both default to None so that constructing the class without arguments
        keeps working; ``get_examples`` needs ``vector_store`` and
        ``explain_example`` needs ``client``.
        """
        self.client = client
        self.vector_store = vector_store

    async def get_examples(
        self,
        topic: str,
        language: str = None
    ) -> list[dict]:
        """Get code examples for a topic.

        Searches the vector store for ``"<topic> example code"`` (top 10 hits)
        and returns every fenced code block found in the hits' content.

        Args:
            topic: Subject to look up.
            language: If given, only blocks tagged with this language are kept.

        Returns:
            A list of ``{"code", "language", "context", "source"}`` dicts where
            ``context`` is the hit's title and ``source`` its URL.
        """
        results = await self.vector_store.search(
            query=f"{topic} example code",
            k=10
        )

        return [
            {
                "code": block["code"],
                "language": block["language"],
                "context": result["title"],
                "source": result["url"]
            }
            for result in results
            for block in self._extract_code_blocks(result["content"], language)
        ]

    def _extract_code_blocks(
        self,
        content: str,
        language_filter: str = None
    ) -> list[dict]:
        """Extract fenced code blocks from markdown content.

        The language is the first word of the fence's info string, so tags
        such as ``c++`` or ``python title="x"`` are recognised. Untagged
        blocks are reported as ``"text"``. ``language_filter`` is compared
        case-insensitively against the tag; untagged blocks never match a
        filter.
        """
        import re

        # Capture the whole info line (anything but a newline or backtick).
        # Matching only \w+ made fences like ```c++ fail to open, after which
        # their closing fence was paired with the next opening fence and the
        # prose in between was returned as code.
        pattern = r'```([^\n`]*)\n(.*?)```'
        wanted = language_filter.lower() if language_filter else None

        blocks = []
        for info, code in re.findall(pattern, content, re.DOTALL):
            words = info.split()
            lang = words[0] if words else ""
            if wanted is not None and lang.lower() != wanted:
                continue
            blocks.append({
                "language": lang or "text",
                "code": code.strip()
            })

        return blocks

    async def explain_example(
        self,
        code: str,
        context: str
    ) -> str:
        """Ask the chat model to explain a code example.

        Args:
            code: The example's source text.
            context: Where the example comes from (e.g. the section title).

        Returns:
            The model's reply text.
        """

        prompt = f"""Explain this code example from the documentation.

Context: {context}

Code:

{code}


Provide:
1. What it does
2. Key concepts demonstrated
3. How to adapt for different use cases"""

        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )

        return response.content

Copilot for Docs transforms static documentation into an interactive knowledge base. Combined with proper indexing and retrieval, it makes documentation truly accessible.