7 min read
Documentation with AI: Automating Technical Writing for Data Projects
Documentation is essential but often neglected. AI can dramatically reduce the effort to create and maintain documentation while improving quality. Let’s explore AI-powered documentation for data projects.
Documentation Types for Data Projects
Code Documentation
├── Function/Class docstrings
├── Inline comments
└── README files
Technical Documentation
├── Architecture diagrams
├── Data dictionaries
├── API documentation
└── Runbooks
Process Documentation
├── Data lineage
├── Pipeline documentation
├── Change logs
└── Incident reports
User Documentation
├── User guides
├── FAQs
├── Tutorials
└── Glossaries
Automated Docstring Generation
import datetime
import json

from azure.ai.foundry import AIFoundryClient
class DocstringGenerator:
    """Generate docstrings for Python source definitions using an LLM."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    async def generate_docstring(self, code: str, style: str = "google") -> str:
        """Generate a docstring for a single Python definition.

        Args:
            code: Source code of the function or class to document.
            style: Docstring convention ("google" or "numpy"); unknown
                values fall back to the Google style.

        Returns:
            The generated docstring text only (no surrounding code).
        """
        style_examples = {
            "google": '''
"""Short description.
Long description if needed.
Args:
param1: Description of param1.
param2: Description of param2.
Returns:
Description of return value.
Raises:
ExceptionType: When this exception is raised.
Example:
>>> function_name(arg1, arg2)
expected_output
"""
''',
            "numpy": '''
"""
Short description.
Parameters
----------
param1 : type
Description of param1.
param2 : type
Description of param2.
Returns
-------
type
Description of return value.
"""
'''
        }
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Generate a {style}-style docstring for this code:
```python
{code}
```
Follow this format:
{style_examples.get(style, style_examples['google'])}
Include:
- Clear one-line description
- All parameters with types
- Return value
- Possible exceptions
- Usage example if complex
Return only the docstring, no code."""
            }]
        )
        return response.choices[0].message.content

    @staticmethod
    def _extract_definitions(code: str) -> dict:
        """Map each function, async function, and class in `code` to its source.

        Nested definitions (methods, inner functions) are included under their
        simple name, so duplicate names overwrite earlier entries.
        """
        import ast
        tree = ast.parse(code)
        definitions = {}
        for node in ast.walk(tree):
            # AsyncFunctionDef is a distinct node type and must be matched
            # explicitly, otherwise `async def`s are silently skipped.
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                segment = ast.get_source_segment(code, node)
                if segment is not None:  # None if location info is unavailable
                    definitions[node.name] = segment
        return definitions

    async def document_module(self, module_path: str) -> dict:
        """Generate documentation for an entire module.

        Args:
            module_path: Path to a Python source file (read as UTF-8, per
                the default source encoding of PEP 3120).

        Returns:
            Mapping of definition name to generated docstring.
        """
        with open(module_path, 'r', encoding='utf-8') as f:
            code = f.read()
        docs = {}
        for name, segment in self._extract_definitions(code).items():
            docs[name] = await self.generate_docstring(segment)
        return docs
Data Dictionary Generator
class DataDictionaryGenerator:
    """Generate Markdown data dictionaries from schemas or pandas DataFrames."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    async def generate_from_schema(self, schema: dict, context: str) -> str:
        """Generate data dictionary from schema.

        Args:
            schema: Table schema (column definitions) to document.
            context: Business context used to steer the descriptions.

        Returns:
            Markdown data dictionary produced by the model.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                # default=str keeps non-JSON-serializable schema values (e.g.
                # dates, Decimals) from raising, consistent with
                # generate_from_dataframe below.
                "content": f"""Generate a comprehensive data dictionary:
Schema:
{json.dumps(schema, indent=2, default=str)}
Context: {context}
Format as Markdown table with columns:
| Column Name | Data Type | Nullable | Description | Example Values | Business Rules |
For each column:
- Write clear, business-friendly descriptions
- Provide realistic example values
- Note any validation rules or constraints
- Include relationships to other tables if apparent"""
            }]
        )
        return response.choices[0].message.content

    @staticmethod
    def _profile_columns(df) -> list:
        """Collect per-column metadata: dtype, null count, cardinality, samples.

        Args:
            df: pandas DataFrame to profile.

        Returns:
            One metadata dict per column, in column order.
        """
        return [
            {
                "name": col,
                "dtype": str(df[col].dtype),
                "null_count": int(df[col].isnull().sum()),
                "unique_count": int(df[col].nunique()),
                # First five non-null values as concrete examples.
                "sample_values": df[col].dropna().head(5).tolist(),
            }
            for col in df.columns
        ]

    async def generate_from_dataframe(self, df, table_name: str, context: str) -> str:
        """Generate data dictionary from a DataFrame with sample data.

        Args:
            df: pandas DataFrame holding representative data.
            table_name: Name of the table being documented.
            context: Business context used to steer the descriptions.

        Returns:
            Markdown documentation produced by the model.
        """
        schema_info = self._profile_columns(df)
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Generate a data dictionary for table: {table_name}
Context: {context}
Column information:
{json.dumps(schema_info, indent=2, default=str)}
Generate comprehensive documentation including:
1. Table overview
2. Column details (as markdown table)
3. Data quality notes
4. Common query patterns
5. Relationships to other tables (if apparent from names)"""
            }]
        )
        return response.choices[0].message.content
Pipeline Documentation
class PipelineDocumenter:
    """Produce Markdown documentation for data pipelines using an LLM."""

    def __init__(self, llm_client: AIFoundryClient):
        self.llm = llm_client

    async def document_pipeline(self, pipeline_code: str, pipeline_name: str) -> str:
        """Generate comprehensive pipeline documentation as Markdown."""
        # Build the full prompt first, then issue a single chat completion.
        prompt = f"""Document this data pipeline:
Pipeline Name: {pipeline_name}
Code:
```python
{pipeline_code}
```
Generate documentation including:
## Overview
Brief description of what the pipeline does
## Data Flow
Describe the data flow from source to destination
## Components
List each major component/step
## Configuration
What configuration options are available
## Dependencies
External dependencies and requirements
## Error Handling
How errors are handled
## Monitoring
What should be monitored
## Runbook
Step-by-step instructions for:
- Starting the pipeline
- Stopping the pipeline
- Troubleshooting common issues
Format as Markdown."""
        completion = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content

    async def generate_lineage_doc(self, lineage_info: dict) -> str:
        """Generate data lineage documentation as Markdown."""
        prompt = f"""Generate data lineage documentation:
Lineage information:
{json.dumps(lineage_info, indent=2)}
Include:
1. Visual representation (Mermaid diagram)
2. Source descriptions
3. Transformation descriptions
4. Target descriptions
5. Data freshness information
6. Impact analysis (what breaks if X fails)"""
        completion = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content
API Documentation
class APIDocGenerator:
    """Generate OpenAPI specifications and user-facing API guides via an LLM."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    @staticmethod
    def _parse_json_reply(text: str) -> dict:
        """Parse a JSON object from an LLM reply.

        Models frequently wrap JSON in Markdown code fences even when asked
        for raw JSON; strip an opening ``` / ```json fence and a trailing
        ``` before parsing.

        Raises:
            json.JSONDecodeError: If the cleaned text is still not valid JSON.
        """
        cleaned = text.strip()
        if cleaned.startswith("```"):
            first_newline = cleaned.find("\n")
            # Drop the opening fence line (which may carry a language tag).
            cleaned = cleaned[first_newline + 1:] if first_newline != -1 else ""
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return json.loads(cleaned)

    async def generate_openapi_spec(self, endpoint_code: str, base_url: str) -> dict:
        """Generate an OpenAPI 3.0 specification from endpoint code.

        Args:
            endpoint_code: Source code implementing the API endpoints.
            base_url: Base URL to record in the specification.

        Returns:
            The specification parsed into a dict.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Generate OpenAPI 3.0 specification for this API:
Base URL: {base_url}
Code:
```python
{endpoint_code}
```
Include:
- All endpoints with paths and methods
- Request/response schemas
- Parameter descriptions
- Example requests/responses
- Error responses
Return valid OpenAPI 3.0 JSON."""
            }]
        )
        return self._parse_json_reply(response.choices[0].message.content)

    async def generate_api_guide(self, openapi_spec: dict) -> str:
        """Generate user-friendly API guide from OpenAPI spec.

        Args:
            openapi_spec: Parsed OpenAPI specification to explain.

        Returns:
            Markdown guide produced by the model.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Create a user-friendly API guide from this OpenAPI spec:
{json.dumps(openapi_spec, indent=2)}
Include:
1. Quick Start
2. Authentication
3. Each endpoint with:
- Description
- Example request (curl and Python)
- Example response
- Common errors
4. Rate limits and best practices
5. FAQ
Format as Markdown."""
            }]
        )
        return response.choices[0].message.content
Change Log Generation
class ChangeLogGenerator:
    """Generate changelogs from commit history or code diffs via an LLM."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    @staticmethod
    def _resolve_date(date: str) -> str:
        """Return `date` if non-empty, else today's date in ISO format."""
        return date or datetime.date.today().isoformat()

    async def generate_from_commits(self, commits: list[dict], version: str, date: str = "") -> str:
        """Generate changelog from git commits.

        Args:
            commits: Commit metadata dicts (serialized into the prompt).
            version: Release version used in the changelog heading.
            date: Release date (ISO string) for the heading; defaults to
                today's date when empty.

        Returns:
            Keep-a-Changelog-formatted Markdown.
        """
        # Fix: the prompt template interpolates {date}, which previously
        # referenced an undefined name and raised NameError on every call.
        date = self._resolve_date(date)
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Generate a changelog for version {version}:
Commits:
{json.dumps(commits, indent=2)}
Format using Keep a Changelog format:
## [{version}] - {date}
### Added
- New features
### Changed
- Changes to existing functionality
### Fixed
- Bug fixes
### Deprecated
- Features to be removed
### Removed
- Removed features
### Security
- Security fixes
Group commits appropriately. Write user-friendly descriptions."""
            }]
        )
        return response.choices[0].message.content

    async def generate_from_diff(self, old_code: str, new_code: str, context: str) -> str:
        """Generate change description from code diff.

        Args:
            old_code: Previous version of the code.
            new_code: Updated version of the code.
            context: Background information to frame the comparison.

        Returns:
            Markdown change description produced by the model.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Describe the changes between these two versions:
Context: {context}
Old version:
```
{old_code}
```
New version:
```
{new_code}
```
Provide:
1. Summary of changes
2. Detailed list of modifications
3. Impact assessment
4. Migration notes (if applicable)"""
            }]
        )
        return response.choices[0].message.content
Documentation Maintenance
class DocMaintainer:
    """Check documentation accuracy against code and apply fixes via an LLM."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    @staticmethod
    def _load_json_reply(text: str) -> dict:
        """Parse a JSON object from an LLM reply, tolerating Markdown fences.

        Raises:
            json.JSONDecodeError: If the cleaned text is still not valid JSON.
        """
        cleaned = text.strip()
        if cleaned.startswith("```"):
            first_newline = cleaned.find("\n")
            # Drop the opening fence line (which may carry a language tag).
            cleaned = cleaned[first_newline + 1:] if first_newline != -1 else ""
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return json.loads(cleaned)

    async def check_accuracy(self, doc: str, code: str) -> dict:
        """Check if documentation matches current code.

        Args:
            doc: Documentation text to verify.
            code: Current source code the documentation describes.

        Returns:
            Dict with keys "accurate" (bool) and "issues" (list of issue
            dicts), as requested from the model.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Check if this documentation is accurate for the code:
Documentation:
{doc}
Code:
```
{code}
```
Identify:
1. Outdated information
2. Missing information
3. Incorrect descriptions
4. Suggested updates
Return JSON:
{{
"accurate": true|false,
"issues": [
{{"type": "outdated|missing|incorrect", "location": "where", "description": "what's wrong", "suggestion": "how to fix"}}
]
}}"""
            }]
        )
        return self._load_json_reply(response.choices[0].message.content)

    async def update_doc(self, doc: str, code: str, issues: list[dict]) -> str:
        """Update documentation based on identified issues.

        Args:
            doc: Current documentation text.
            code: Current source code.
            issues: Issue dicts (e.g. from check_accuracy) to address.

        Returns:
            The updated documentation text produced by the model.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Update this documentation to fix the identified issues:
Current documentation:
{doc}
Current code:
```
{code}
```
Issues to fix:
{json.dumps(issues, indent=2)}
Return the updated documentation."""
            }]
        )
        return response.choices[0].message.content
Best Practices
- Generate early: Create docs during development, not after
- Review AI output: AI can miss context-specific details
- Keep in sync: Regularly check docs against code
- Multiple audiences: Generate different docs for different users
- Version docs: Track documentation alongside code
AI makes documentation maintenance feasible. Use it to generate first drafts, check accuracy, and keep docs up to date with minimal manual effort.