
LLM Fine-Tuning on Azure: When and How to Customize Models

Fine-tuning can improve model performance on specific tasks, but it isn't always the right choice. Let's look at when it pays off and how to do it on Azure OpenAI.

Fine-Tuning on Azure OpenAI

from openai import AsyncAzureOpenAI  # Azure OpenAI is accessed through the official openai SDK
import json

class FineTuningPipeline:
    def __init__(self, client: AsyncAzureOpenAI):
        self.client = client

    def prepare_training_data(self, examples: list[dict]) -> str:
        """Prepare data in JSONL format for fine-tuning."""
        jsonl_lines = []

        for example in examples:
            training_example = {
                "messages": [
                    {"role": "system", "content": example["system"]},
                    {"role": "user", "content": example["user"]},
                    {"role": "assistant", "content": example["assistant"]}
                ]
            }
            jsonl_lines.append(json.dumps(training_example))

        return "\n".join(jsonl_lines)

    async def upload_training_file(self, jsonl_content: str) -> str:
        """Upload the JSONL training file to Azure OpenAI."""
        # The Files API accepts a (filename, bytes) tuple; fine-tuning expects a .jsonl file.
        response = await self.client.files.create(
            file=("training_data.jsonl", jsonl_content.encode()),
            purpose="fine-tune"
        )
        return response.id

    async def create_fine_tune_job(
        self,
        training_file_id: str,
        model: str = "gpt-4o-mini",
        hyperparameters: dict | None = None
    ) -> str:
        """Create fine-tuning job."""
        params = hyperparameters or {
            "n_epochs": 3,
            "batch_size": "auto",
            "learning_rate_multiplier": "auto"
        }

        response = await self.client.fine_tuning.jobs.create(
            training_file=training_file_id,
            model=model,
            hyperparameters=params
        )
        return response.id

    async def monitor_job(self, job_id: str) -> dict:
        """Monitor fine-tuning job status."""
        job = await self.client.fine_tuning.jobs.retrieve(job_id)
        return {
            "status": job.status,
            "trained_tokens": job.trained_tokens,
            "fine_tuned_model": job.fine_tuned_model
        }

    async def evaluate_fine_tuned(self, model_id: str, test_cases: list[dict]) -> dict:
        """Evaluate the fine-tuned model against test cases."""
        results = []

        for case in test_cases:
            response = await self.client.chat.completions.create(
                model=model_id,
                messages=case["messages"]
            )
            actual = response.choices[0].message.content

            results.append({
                "expected": case["expected"],
                "actual": actual,
                "match": self.evaluate_match(case["expected"], actual)
            })

        accuracy = sum(r["match"] for r in results) / len(results) if results else 0.0
        return {"accuracy": accuracy, "results": results}

    def evaluate_match(self, expected: str, actual: str) -> bool:
        """Naive exact-match check; swap in a task-specific metric for real evaluations."""
        return expected.strip().lower() == (actual or "").strip().lower()

# When to fine-tune vs prompt engineering
# Fine-tune when:
# - Consistent style/format needed
# - Domain-specific terminology
# - Cost optimization for high volume
# - Latency requirements (smaller fine-tuned model)
#
# Use prompting when:
# - Limited training data (<100 examples)
# - Rapidly changing requirements
# - Need model flexibility
# - Quick iteration needed
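
Here is a minimal sketch of driving the pipeline end to end. The endpoint, API key, API version, sample training examples, and the 60-second polling interval are all placeholders, not values from a real deployment; adjust them for your own Azure OpenAI resource.

import asyncio
from openai import AsyncAzureOpenAI

async def main():
    # Placeholder credentials and API version -- substitute your resource details.
    client = AsyncAzureOpenAI(
        azure_endpoint="https://your-resource.openai.azure.com",
        api_key="your-api-key",
        api_version="2024-08-01-preview",
    )
    pipeline = FineTuningPipeline(client)

    # In practice you want at least a few dozen high-quality examples.
    examples = [
        {
            "system": "You are a support assistant for Contoso.",
            "user": "How do I reset my password?",
            "assistant": "Go to Settings > Security > Reset Password and follow the prompts.",
        },
    ]

    jsonl = pipeline.prepare_training_data(examples)
    file_id = await pipeline.upload_training_file(jsonl)
    job_id = await pipeline.create_fine_tune_job(file_id, model="gpt-4o-mini")

    # Poll until the job reaches a terminal state; jobs typically run for minutes to hours.
    while True:
        status = await pipeline.monitor_job(job_id)
        if status["status"] in ("succeeded", "failed", "cancelled"):
            break
        await asyncio.sleep(60)

    print(status)

asyncio.run(main())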

Fine-tuning is a powerful tool when used appropriately, but prompt engineering often achieves similar results faster.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.