LLM Fine-Tuning on Azure: When and How to Customize Models
Fine-tuning can improve model performance for specific tasks, but it’s not always the right choice. Let’s explore when and how.
Fine-Tuning on Azure OpenAI
from openai import AsyncAzureOpenAI
import json


class FineTuningPipeline:
    def __init__(self, client: AsyncAzureOpenAI):
        self.client = client

    def prepare_training_data(self, examples: list[dict]) -> str:
        """Prepare data in JSONL format for fine-tuning."""
        jsonl_lines = []
        for example in examples:
            training_example = {
                "messages": [
                    {"role": "system", "content": example["system"]},
                    {"role": "user", "content": example["user"]},
                    {"role": "assistant", "content": example["assistant"]}
                ]
            }
            jsonl_lines.append(json.dumps(training_example))
        return "\n".join(jsonl_lines)

    async def upload_training_file(self, jsonl_content: str) -> str:
        """Upload the training file to Azure OpenAI."""
        response = await self.client.files.create(
            file=("training.jsonl", jsonl_content.encode()),
            purpose="fine-tune"
        )
        return response.id

    async def create_fine_tune_job(
        self,
        training_file_id: str,
        model: str = "gpt-4o-mini",
        hyperparameters: dict | None = None
    ) -> str:
        """Create a fine-tuning job."""
        params = hyperparameters or {
            "n_epochs": 3,
            "batch_size": "auto",
            "learning_rate_multiplier": "auto"
        }
        response = await self.client.fine_tuning.jobs.create(
            training_file=training_file_id,
            model=model,
            hyperparameters=params
        )
        return response.id

    async def monitor_job(self, job_id: str) -> dict:
        """Check fine-tuning job status."""
        job = await self.client.fine_tuning.jobs.retrieve(job_id)
        return {
            "status": job.status,
            "trained_tokens": job.trained_tokens,
            "fine_tuned_model": job.fine_tuned_model
        }

    async def evaluate_fine_tuned(self, model_id: str, test_cases: list[dict]) -> dict:
        """Evaluate the fine-tuned model against test cases."""
        results = []
        for case in test_cases:
            response = await self.client.chat.completions.create(
                model=model_id,
                messages=case["messages"]
            )
            actual = response.choices[0].message.content
            results.append({
                "expected": case["expected"],
                "actual": actual,
                "match": self.evaluate_match(case["expected"], actual)
            })
        accuracy = sum(r["match"] for r in results) / len(results)
        return {"accuracy": accuracy, "results": results}

    def evaluate_match(self, expected: str, actual: str) -> bool:
        """Simple exact-match check; swap in a task-specific metric as needed."""
        return expected.strip().lower() == (actual or "").strip().lower()
When to fine-tune vs. prompt engineering
Fine-tune when:
- You need a consistent style or output format
- The task depends on domain-specific terminology
- You want to optimize cost at high request volume
- You have latency requirements that a smaller fine-tuned model can meet

Use prompting when (see the sketch below):
- Training data is limited (fewer than ~100 examples)
- Requirements are changing rapidly
- You need the model to stay flexible
- You need to iterate quickly
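If you land on the prompting side of that list, a few-shot prompt against your base deployment is usually the fastest baseline to try. A minimal sketch, assuming the same AsyncAzureOpenAI client as above; the deployment name, labels, and examples are illustrative placeholders:

async def few_shot_classify(client: AsyncAzureOpenAI, ticket: str) -> str:
    # Few-shot prompting against the base deployment: no training run needed,
    # and the examples can be edited as requirements change.
    response = await client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder; use your base deployment name
        messages=[
            {"role": "system", "content": "Classify support tickets as 'billing', 'technical', or 'other'."},
            {"role": "user", "content": "I was charged twice this month."},
            {"role": "assistant", "content": "billing"},
            {"role": "user", "content": "The app crashes when I open settings."},
            {"role": "assistant", "content": "technical"},
            {"role": "user", "content": ticket},
        ],
    )
    return response.choices[0].message.content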
Fine-tuning is a powerful tool when used appropriately, but prompt engineering often achieves similar results faster.