7 min read
Documentation with AI: Automating Technical Writing for Data Projects
Documentation is essential but often neglected. AI can dramatically reduce the effort to create and maintain documentation while improving quality. Let’s explore AI-powered documentation for data projects.
Documentation Types for Data Projects
Code Documentation
├── Function/Class docstrings
├── Inline comments
└── README files
Technical Documentation
├── Architecture diagrams
├── Data dictionaries
├── API documentation
└── Runbooks
Process Documentation
├── Data lineage
├── Pipeline documentation
├── Change logs
└── Incident reports
User Documentation
├── User guides
├── FAQs
├── Tutorials
└── Glossaries
Automated Docstring Generation
import datetime
import json

from azure.ai.foundry import AIFoundryClient
class DocstringGenerator:
    """Generate docstrings for Python source definitions using an LLM."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    async def generate_docstring(self, code: str, style: str = "google") -> str:
        """Generate a docstring for a single Python definition.

        Args:
            code: Source code of the function or class to document.
            style: Docstring convention ("google" or "numpy"); unknown
                values fall back to the Google style.

        Returns:
            The generated docstring text only (no surrounding code).
        """
        style_examples = {
            "google": '''
"""Short description.
Long description if needed.
Args:
param1: Description of param1.
param2: Description of param2.
Returns:
Description of return value.
Raises:
ExceptionType: When this exception is raised.
Example:
>>> function_name(arg1, arg2)
expected_output
"""
''',
            "numpy": '''
"""
Short description.
Parameters
----------
param1 : type
Description of param1.
param2 : type
Description of param2.
Returns
-------
type
Description of return value.
"""
'''
        }
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Generate a {style}-style docstring for this code:
```python
{code}
```
Follow this format:
{style_examples.get(style, style_examples['google'])}
Include:
- Clear one-line description
- All parameters with types
- Return value
- Possible exceptions
- Usage example if complex
Return only the docstring, no code."""
            }]
        )
        return response.choices[0].message.content

    @staticmethod
    def _extract_definitions(code: str) -> dict:
        """Map each function, async function, and class in `code` to its source.

        Nested definitions (methods, inner functions) are included under their
        simple name, so duplicate names overwrite earlier entries.
        """
        import ast
        tree = ast.parse(code)
        definitions = {}
        for node in ast.walk(tree):
            # AsyncFunctionDef is a distinct node type and must be matched
            # explicitly, otherwise `async def`s are silently skipped.
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                segment = ast.get_source_segment(code, node)
                if segment is not None:  # None if location info is unavailable
                    definitions[node.name] = segment
        return definitions

    async def document_module(self, module_path: str) -> dict:
        """Generate documentation for an entire module.

        Args:
            module_path: Path to a Python source file (read as UTF-8, per
                the default source encoding of PEP 3120).

        Returns:
            Mapping of definition name to generated docstring.
        """
        with open(module_path, 'r', encoding='utf-8') as f:
            code = f.read()
        docs = {}
        for name, segment in self._extract_definitions(code).items():
            docs[name] = await self.generate_docstring(segment)
        return docs
Data Dictionary Generator
class DataDictionaryGenerator:
    """Generate Markdown data dictionaries from schemas or pandas DataFrames."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    async def generate_from_schema(self, schema: dict, context: str) -> str:
        """Generate data dictionary from schema.

        Args:
            schema: Table schema (column definitions) to document.
            context: Business context used to steer the descriptions.

        Returns:
            Markdown data dictionary produced by the model.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                # default=str keeps non-JSON-serializable schema values (e.g.
                # dates, Decimals) from raising, consistent with
                # generate_from_dataframe below.
                "content": f"""Generate a comprehensive data dictionary:
Schema:
{json.dumps(schema, indent=2, default=str)}
Context: {context}
Format as Markdown table with columns:
| Column Name | Data Type | Nullable | Description | Example Values | Business Rules |
For each column:
- Write clear, business-friendly descriptions
- Provide realistic example values
- Note any validation rules or constraints
- Include relationships to other tables if apparent"""
            }]
        )
        return response.choices[0].message.content

    @staticmethod
    def _profile_columns(df) -> list:
        """Collect per-column metadata: dtype, null count, cardinality, samples.

        Args:
            df: pandas DataFrame to profile.

        Returns:
            One metadata dict per column, in column order.
        """
        return [
            {
                "name": col,
                "dtype": str(df[col].dtype),
                "null_count": int(df[col].isnull().sum()),
                "unique_count": int(df[col].nunique()),
                # First five non-null values as concrete examples.
                "sample_values": df[col].dropna().head(5).tolist(),
            }
            for col in df.columns
        ]

    async def generate_from_dataframe(self, df, table_name: str, context: str) -> str:
        """Generate data dictionary from a DataFrame with sample data.

        Args:
            df: pandas DataFrame holding representative data.
            table_name: Name of the table being documented.
            context: Business context used to steer the descriptions.

        Returns:
            Markdown documentation produced by the model.
        """
        schema_info = self._profile_columns(df)
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Generate a data dictionary for table: {table_name}
Context: {context}
Column information:
{json.dumps(schema_info, indent=2, default=str)}
Generate comprehensive documentation including:
1. Table overview
2. Column details (as markdown table)
3. Data quality notes
4. Common query patterns
5. Relationships to other tables (if apparent from names)"""
            }]
        )
        return response.choices[0].message.content
Pipeline Documentation
class PipelineDocumenter:
    """Produce Markdown documentation for data pipelines using an LLM."""

    def __init__(self, llm_client: AIFoundryClient):
        self.llm = llm_client

    async def document_pipeline(self, pipeline_code: str, pipeline_name: str) -> str:
        """Generate comprehensive pipeline documentation as Markdown."""
        # Build the full prompt first, then issue a single chat completion.
        prompt = f"""Document this data pipeline:
Pipeline Name: {pipeline_name}
Code:
```python
{pipeline_code}
```
Generate documentation including:
## Overview
Brief description of what the pipeline does
## Data Flow
Describe the data flow from source to destination
## Components
List each major component/step
## Configuration
What configuration options are available
## Dependencies
External dependencies and requirements
## Error Handling
How errors are handled
## Monitoring
What should be monitored
## Runbook
Step-by-step instructions for:
- Starting the pipeline
- Stopping the pipeline
- Troubleshooting common issues
Format as Markdown."""
        completion = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content

    async def generate_lineage_doc(self, lineage_info: dict) -> str:
        """Generate data lineage documentation as Markdown."""
        prompt = f"""Generate data lineage documentation:
Lineage information:
{json.dumps(lineage_info, indent=2)}
Include:
1. Visual representation (Mermaid diagram)
2. Source descriptions
3. Transformation descriptions
4. Target descriptions
5. Data freshness information
6. Impact analysis (what breaks if X fails)"""
        completion = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content
API Documentation
class APIDocGenerator:
    """Generate OpenAPI specifications and user-facing API guides via an LLM."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    @staticmethod
    def _parse_json_reply(text: str) -> dict:
        """Parse a JSON object from an LLM reply.

        Models frequently wrap JSON in Markdown code fences even when asked
        for raw JSON; strip an opening ``` / ```json fence and a trailing
        ``` before parsing.

        Raises:
            json.JSONDecodeError: If the cleaned text is still not valid JSON.
        """
        cleaned = text.strip()
        if cleaned.startswith("```"):
            first_newline = cleaned.find("\n")
            # Drop the opening fence line (which may carry a language tag).
            cleaned = cleaned[first_newline + 1:] if first_newline != -1 else ""
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return json.loads(cleaned)

    async def generate_openapi_spec(self, endpoint_code: str, base_url: str) -> dict:
        """Generate an OpenAPI 3.0 specification from endpoint code.

        Args:
            endpoint_code: Source code implementing the API endpoints.
            base_url: Base URL to record in the specification.

        Returns:
            The specification parsed into a dict.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Generate OpenAPI 3.0 specification for this API:
Base URL: {base_url}
Code:
```python
{endpoint_code}
```
Include:
- All endpoints with paths and methods
- Request/response schemas
- Parameter descriptions
- Example requests/responses
- Error responses
Return valid OpenAPI 3.0 JSON."""
            }]
        )
        return self._parse_json_reply(response.choices[0].message.content)

    async def generate_api_guide(self, openapi_spec: dict) -> str:
        """Generate user-friendly API guide from OpenAPI spec.

        Args:
            openapi_spec: Parsed OpenAPI specification to explain.

        Returns:
            Markdown guide produced by the model.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Create a user-friendly API guide from this OpenAPI spec:
{json.dumps(openapi_spec, indent=2)}
Include:
1. Quick Start
2. Authentication
3. Each endpoint with:
- Description
- Example request (curl and Python)
- Example response
- Common errors
4. Rate limits and best practices
5. FAQ
Format as Markdown."""
            }]
        )
        return response.choices[0].message.content
Change Log Generation
class ChangeLogGenerator:
    """Generate changelogs from commit history or code diffs via an LLM."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    @staticmethod
    def _resolve_date(date: str) -> str:
        """Return `date` if non-empty, else today's date in ISO format."""
        return date or datetime.date.today().isoformat()

    async def generate_from_commits(self, commits: list[dict], version: str, date: str = "") -> str:
        """Generate changelog from git commits.

        Args:
            commits: Commit metadata dicts (serialized into the prompt).
            version: Release version used in the changelog heading.
            date: Release date (ISO string) for the heading; defaults to
                today's date when empty.

        Returns:
            Keep-a-Changelog-formatted Markdown.
        """
        # Fix: the prompt template interpolates {date}, which previously
        # referenced an undefined name and raised NameError on every call.
        date = self._resolve_date(date)
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Generate a changelog for version {version}:
Commits:
{json.dumps(commits, indent=2)}
Format using Keep a Changelog format:
## [{version}] - {date}
### Added
- New features
### Changed
- Changes to existing functionality
### Fixed
- Bug fixes
### Deprecated
- Features to be removed
### Removed
- Removed features
### Security
- Security fixes
Group commits appropriately. Write user-friendly descriptions."""
            }]
        )
        return response.choices[0].message.content

    async def generate_from_diff(self, old_code: str, new_code: str, context: str) -> str:
        """Generate change description from code diff.

        Args:
            old_code: Previous version of the code.
            new_code: Updated version of the code.
            context: Background information to frame the comparison.

        Returns:
            Markdown change description produced by the model.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Describe the changes between these two versions:
Context: {context}
Old version:
```
{old_code}
```
New version:
```
{new_code}
```
Provide:
1. Summary of changes
2. Detailed list of modifications
3. Impact assessment
4. Migration notes (if applicable)"""
            }]
        )
        return response.choices[0].message.content
Documentation Maintenance
class DocMaintainer:
    """Check documentation accuracy against code and apply fixes via an LLM."""

    def __init__(self, llm_client: "AIFoundryClient"):
        # Quoted annotation: forward reference, so the annotation is not
        # evaluated at class-definition time.
        self.llm = llm_client

    @staticmethod
    def _load_json_reply(text: str) -> dict:
        """Parse a JSON object from an LLM reply, tolerating Markdown fences.

        Raises:
            json.JSONDecodeError: If the cleaned text is still not valid JSON.
        """
        cleaned = text.strip()
        if cleaned.startswith("```"):
            first_newline = cleaned.find("\n")
            # Drop the opening fence line (which may carry a language tag).
            cleaned = cleaned[first_newline + 1:] if first_newline != -1 else ""
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        return json.loads(cleaned)

    async def check_accuracy(self, doc: str, code: str) -> dict:
        """Check if documentation matches current code.

        Args:
            doc: Documentation text to verify.
            code: Current source code the documentation describes.

        Returns:
            Dict with keys "accurate" (bool) and "issues" (list of issue
            dicts), as requested from the model.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Check if this documentation is accurate for the code:
Documentation:
{doc}
Code:
```
{code}
```
Identify:
1. Outdated information
2. Missing information
3. Incorrect descriptions
4. Suggested updates
Return JSON:
{{
"accurate": true|false,
"issues": [
{{"type": "outdated|missing|incorrect", "location": "where", "description": "what's wrong", "suggestion": "how to fix"}}
]
}}"""
            }]
        )
        return self._load_json_reply(response.choices[0].message.content)

    async def update_doc(self, doc: str, code: str, issues: list[dict]) -> str:
        """Update documentation based on identified issues.

        Args:
            doc: Current documentation text.
            code: Current source code.
            issues: Issue dicts (e.g. from check_accuracy) to address.

        Returns:
            The updated documentation text produced by the model.
        """
        response = await self.llm.chat.complete_async(
            deployment="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"""Update this documentation to fix the identified issues:
Current documentation:
{doc}
Current code:
```
{code}
```
Issues to fix:
{json.dumps(issues, indent=2)}
Return the updated documentation."""
            }]
        )
        return response.choices[0].message.content
Best Practices
- Generate early: Create docs during development, not after
- Review AI output: AI can miss context-specific details
- Keep in sync: Regularly check docs against code
- Multiple audiences: Generate different docs for different users
- Version docs: Track documentation alongside code
AI makes documentation maintenance feasible. Use it to generate first drafts, check accuracy, and keep docs up to date with minimal manual effort.