
Model Fine-Tuning on Azure: When and How to Customize LLMs

Fine-tuning adapts pre-trained models to specific domains or tasks. Understanding when fine-tuning adds value versus when prompt engineering suffices is crucial for cost-effective AI deployments.

When to Fine-Tune

Fine-tuning makes sense when you need consistent style, specialized terminology, or better task performance than prompt engineering can deliver. It also adds real cost: training runs, data preparation, and ongoing model management. Before committing, baseline the task with prompt engineering, as sketched below.
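
A quick way to ground that decision is to measure a few-shot prompting baseline first and only fine-tune if quality plateaus. A minimal sketch, assuming an AzureOpenAI client (configured as in the section below) and a hypothetical "gpt-4o-mini" deployment:

# Baseline with few-shot prompting before investing in fine-tuning.
# `client` is an AzureOpenAI client as configured later in this post;
# "gpt-4o-mini" is a hypothetical existing deployment name.
FEW_SHOT_EXAMPLES = [
    {"role": "user", "content": "How do I reset my password?"},
    {"role": "assistant", "content": "Click 'Forgot Password' on the login screen, then follow the emailed link."},
]

def baseline_answer(client, question: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=(
            [{"role": "system", "content": "You are a technical support assistant."}]
            + FEW_SHOT_EXAMPLES
            + [{"role": "user", "content": question}]
        ),
    )
    return response.choices[0].message.content

# If few-shot answers already meet your quality bar, skip fine-tuning.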

Preparing Training Data

Quality training data is the most important factor in fine-tuning success:

import json
from typing import Generator

def prepare_fine_tuning_data(
    examples: list[dict],
    system_prompt: str
) -> Generator[dict, None, None]:
    """Convert examples to OpenAI fine-tuning format."""
    for example in examples:
        yield {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": example["input"]},
                {"role": "assistant", "content": example["output"]}
            ]
        }

def validate_training_file(file_path: str) -> dict:
    """Validate training data format and quality."""
    issues = []
    examples = []

    with open(file_path, 'r') as f:
        for line_num, line in enumerate(f, 1):
            try:
                data = json.loads(line)
                examples.append(data)

                # Validate structure
                if "messages" not in data:
                    issues.append(f"Line {line_num}: Missing 'messages' key")
                    continue

                messages = data["messages"]
                roles = [m.get("role") for m in messages]

                # Must have system, user, assistant sequence
                if roles != ["system", "user", "assistant"]:
                    issues.append(f"Line {line_num}: Invalid role sequence {roles}")

                # Check content length
                for msg in messages:
                    if len(msg.get("content", "")) < 10:
                        issues.append(f"Line {line_num}: Very short content in {msg['role']}")

            except json.JSONDecodeError as e:
                issues.append(f"Line {line_num}: Invalid JSON - {e}")

    return {
        "total_examples": len(examples),
        "issues": issues,
        "valid": len(issues) == 0
    }

# Create training data
system_prompt = """You are a technical support assistant for Contoso software products.
Provide accurate, helpful responses using proper technical terminology.
Always include relevant documentation links when applicable."""

training_examples = [
    {
        "input": "How do I reset my password?",
        "output": "To reset your password in Contoso Suite:\n\n1. Click 'Forgot Password' on the login screen\n2. Enter your registered email address\n3. Check your email for the reset link (valid for 24 hours)\n4. Create a new password meeting security requirements\n\nFor more details, see: docs.contoso.com/password-reset"
    },
    # Add more examples...
]

# Write to JSONL file
with open("training_data.jsonl", "w") as f:
    for record in prepare_fine_tuning_data(training_examples, system_prompt):
        f.write(json.dumps(record) + "\n")
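
Run the validator before uploading; a single malformed line can fail the job or quietly skew training:

# Check the file before uploading it to Azure
report = validate_training_file("training_data.jsonl")
if not report["valid"]:
    for issue in report["issues"]:
        print(issue)
    raise ValueError(f"Fix {len(report['issues'])} issue(s) before uploading")

print(f"{report['total_examples']} examples ready for upload")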

Starting Fine-Tuning on Azure

Upload data and initiate the fine-tuning job:

import os
import time

from openai import AzureOpenAI

# Reads the key from an environment variable; Microsoft Entra ID auth via
# azure_ad_token_provider is also supported.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-08-01-preview",
    azure_endpoint="https://your-resource.openai.azure.com/"
)

# Upload training file
with open("training_data.jsonl", "rb") as f:
    training_file = client.files.create(file=f, purpose="fine-tune")

# Create fine-tuning job
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-4o-mini-2024-07-18",  # Base model
    hyperparameters={
        "n_epochs": 3,
        "batch_size": 4,
        "learning_rate_multiplier": 1.0
    },
    suffix="contoso-support"  # Custom model name suffix
)

# Monitor progress
while True:
    status = client.fine_tuning.jobs.retrieve(job.id)
    print(f"Status: {status.status}")

    if status.status in ["succeeded", "failed", "cancelled"]:
        break

    time.sleep(60)

# The job returns a fine-tuned model ID; deploying it is a separate step
print(f"Fine-tuned model: {status.fine_tuned_model}")

Evaluating Fine-Tuned Models

Compare fine-tuned model performance against the base model:

def evaluate_model(model_name: str, test_cases: list[dict]) -> dict:
    """Evaluate model on test cases."""
    scores = []

    for test in test_cases:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": test["input"]}
            ]
        )

        # Score response quality (calculate_similarity is sketched below)
        score = calculate_similarity(
            response.choices[0].message.content,
            test["expected_output"]
        )
        scores.append(score)

    return {
        "model": model_name,
        "average_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "max_score": max(scores)
    }
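
calculate_similarity is intentionally left open; anything from embedding cosine similarity to an LLM judge works. A minimal lexical sketch using only the standard library, plus a side-by-side comparison (both deployment names are placeholders):

from difflib import SequenceMatcher

def calculate_similarity(generated: str, expected: str) -> float:
    """Rough lexical overlap in [0, 1]; swap in embedding cosine
    similarity or an LLM-as-judge score for production evaluation."""
    return SequenceMatcher(None, generated, expected).ratio()

# Evaluate on a held-out test set, never on training examples
test_cases = [
    {
        "input": "How do I reset my password?",
        "expected_output": "To reset your password in Contoso Suite: ..."
    },
    # Add more held-out cases...
]

base = evaluate_model("gpt-4o-mini", test_cases)
tuned = evaluate_model("contoso-support-ft", test_cases)
print(f"Base: {base['average_score']:.2f} vs fine-tuned: {tuned['average_score']:.2f}")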

Fine-tuning is a powerful tool, but it requires careful data preparation and evaluation. Start with prompt engineering, and only fine-tune once you have clear evidence that prompting falls short and enough high-quality training data to close the gap.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.