Cost-Quality Tradeoff Analysis Across LLM Providers
Compare two OpenAI models on quality and cost using MLflow evaluation
and tracing. Run the same questions through gpt-4o-mini and
gpt-4o, score the outputs, then use traced token usage to
estimate costs.
pip install mlflow openai
import mlflow

# Point the MLflow client at a locally running tracking server
# (started e.g. with `mlflow server`) and group all evaluation
# runs and traces under a single experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("cost-quality-tradeoff")
def _case(question, facts):
    """Build one evaluation record: the model input plus the facts a good answer states."""
    return {
        "inputs": {"question": question},
        "expectations": {"expected_facts": facts},
    }


# Shared evaluation dataset: technical questions with the key facts
# each answer is expected to cover.
eval_data = [
    _case("What is a database index?", [
        "data structure that improves query speed",
        "trades write performance for read performance",
    ]),
    _case("Explain the CAP theorem in distributed systems.", [
        "consistency",
        "availability",
        "partition tolerance",
        "cannot guarantee all three simultaneously",
    ]),
    _case("What is the difference between TCP and UDP?", [
        "TCP is connection-oriented",
        "UDP is connectionless",
        "TCP guarantees delivery",
    ]),
    _case("How does garbage collection work in Java?", [
        "automatic memory management",
        "identifies unreachable objects",
        "reclaims memory",
    ]),
    _case("What is a race condition?", [
        "concurrent access to shared resource",
        "outcome depends on timing",
    ]),
    _case("Explain how TLS/SSL handshake works.", [
        "client hello and server hello",
        "certificate exchange",
        "symmetric key establishment",
    ]),
]
Each predict function calls a different model. The parameter name
`question` must match the key in each row's `inputs` dict so MLflow
can map dataset rows onto the function's arguments.
import openai

# Enable automatic tracing for every OpenAI call; traces also record
# per-call token usage, which the cost analysis below relies on.
mlflow.openai.autolog()

client = openai.OpenAI()

# Both models answer under the same instructions so quality differences
# come from the model, not the prompt.
SYSTEM_PROMPT = (
    "You are a senior software engineer. Answer technical questions "
    "accurately and concisely in 2-4 sentences."
)
def predict_gpt4o_mini(question: str) -> str:
    """Answer *question* with gpt-4o-mini under the shared system prompt."""
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
    )
    return completion.choices[0].message.content
def predict_gpt4o(question: str) -> str:
    """Answer *question* with gpt-4o under the shared system prompt."""
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
    )
    return completion.choices[0].message.content
Score both models with Correctness (are expected facts
present?) and Completeness (did the model fully address the
question?).
from mlflow.genai.scorers import Completeness, Correctness

# Judge each answer on factual correctness and on completeness of coverage.
scorers = [Correctness(), Completeness()]


def _run_eval(predict_fn):
    """Evaluate one model's predict function over the shared dataset."""
    return mlflow.genai.evaluate(
        data=eval_data,
        predict_fn=predict_fn,
        scorers=scorers,
    )


results_mini = _run_eval(predict_gpt4o_mini)
results_4o = _run_eval(predict_gpt4o)
EvaluationResult.metrics contains aggregated scores
across all rows.
print("=== gpt-4o-mini ===")
print(results_mini.metrics)
# Example:
# {'correctness/mean': 0.83,
# 'completeness/mean': 0.67}
print("\n=== gpt-4o ===")
print(results_4o.metrics)
# Example:
# {'correctness/mean': 1.0,
# 'completeness/mean': 1.0}
For per-question detail, inspect each result's `result_df` DataFrame:
# Per-question detail columns: the input plus each scorer's value.
cols = ["inputs/question"] + [
    f"{metric}/value" for metric in ("correctness", "completeness")
]
for result in (results_mini, results_4o):
    print(result.result_df[cols])
MLflow traces automatically capture token usage when
mlflow.openai.autolog() is enabled. Each trace stores
aggregated token counts in its metadata.
def _traces_for(results):
    """Fetch all traces logged under one evaluation run as a list."""
    return mlflow.search_traces(
        run_id=results.run_id,
        return_type="list",
    )


traces_mini = _traces_for(results_mini)
traces_4o = _traces_for(results_4o)
def sum_token_usage(traces):
    """Aggregate token counts across a list of MLflow traces.

    Traces with no recorded usage (``trace.info.token_usage`` is None or
    empty) contribute nothing. Returns a dict with ``input_tokens``,
    ``output_tokens``, and their sum as ``total_tokens``.
    """
    totals = {"input_tokens": 0, "output_tokens": 0}
    for trace in traces:
        usage = trace.info.token_usage
        if not usage:
            continue
        for key in totals:
            totals[key] += usage.get(key, 0)
    totals["total_tokens"] = totals["input_tokens"] + totals["output_tokens"]
    return totals
usage_mini = sum_token_usage(traces_mini)
usage_4o = sum_token_usage(traces_4o)

for label, usage in (("gpt-4o-mini", usage_mini), ("gpt-4o", usage_4o)):
    print(f"{label} tokens:", usage)
# Example (mini): {'input_tokens': 420, 'output_tokens': 690, 'total_tokens': 1110}
# Example (4o):   {'input_tokens': 420, 'output_tokens': 780, 'total_tokens': 1200}
Apply each model's per-token pricing to the recorded usage.
# Pricing per 1M tokens (USD) — check OpenAI's pricing page for current
# rates; these are snapshot values and go stale.
PRICING = {
    # model: {$ per 1M input tokens, $ per 1M output tokens}
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    "gpt-4o": {"input": 2.50, "output": 10.00},
}
def estimate_cost(usage, model_name, pricing_table=None):
    """Convert aggregated token usage into an estimated USD cost.

    Args:
        usage: dict with ``input_tokens`` and ``output_tokens`` counts,
            as produced by ``sum_token_usage``.
        model_name: key into the pricing table, e.g. ``"gpt-4o"``.
        pricing_table: optional mapping of model name to per-1M-token
            ``input``/``output`` prices; defaults to the module-level
            ``PRICING`` so existing callers are unchanged.

    Returns:
        dict with ``input_cost``, ``output_cost``, and ``total_cost`` in USD.

    Raises:
        KeyError: if ``model_name`` is not present in the pricing table,
            or ``usage`` lacks the token-count keys.
    """
    pricing = (PRICING if pricing_table is None else pricing_table)[model_name]
    # Prices are quoted per 1M tokens, so scale the raw counts down.
    input_cost = usage["input_tokens"] * pricing["input"] / 1_000_000
    output_cost = usage["output_tokens"] * pricing["output"] / 1_000_000
    return {
        "input_cost": input_cost,
        "output_cost": output_cost,
        "total_cost": input_cost + output_cost,
    }
cost_mini = estimate_cost(usage_mini, "gpt-4o-mini")
cost_4o = estimate_cost(usage_4o, "gpt-4o")

for label, cost in (("gpt-4o-mini", cost_mini), ("gpt-4o", cost_4o)):
    print(f"{label} cost: ${cost['total_cost']:.6f}")
Combine metrics, token usage, and cost into a single table.
import pandas as pd


def _summary_row(model, results, usage, cost):
    """One comparison-table row combining quality, token usage, and cost."""
    metrics = results.metrics
    return {
        "model": model,
        "correctness": metrics.get("correctness/mean", 0),
        "completeness": metrics.get("completeness/mean", 0),
        "total_tokens": usage["total_tokens"],
        "cost_usd": cost["total_cost"],
    }


comparison = pd.DataFrame([
    _summary_row("gpt-4o-mini", results_mini, usage_mini, cost_mini),
    _summary_row("gpt-4o", results_4o, usage_4o, cost_4o),
])
# Cost per quality point; zero correctness becomes NaN to avoid
# division by zero.
safe_correctness = comparison["correctness"].replace(0, float("nan"))
comparison["cost_per_correct_pct"] = comparison["cost_usd"] / safe_correctness
print(comparison.to_string(index=False))
# Example output:
#        model  correctness  completeness  total_tokens  cost_usd  cost_per_correct_pct
#  gpt-4o-mini         0.83          0.67          1110  0.000477              0.000575
#       gpt-4o         1.00          1.00          1200  0.010800              0.010800
Open the MLflow UI at http://127.0.0.1:5000 and navigate to
the cost-quality-tradeoff experiment. Each evaluation run
shows per-question scores and linked traces with full token
usage details.
Analysis: When to Use Which Model
Use gpt-4o-mini when:
- The task is straightforward (simple factual questions, classification, extraction)
- Cost is a primary constraint and you need to process high volumes
- A small drop in correctness is acceptable
Use gpt-4o when:
- Accuracy matters more than cost (medical, legal, financial domains)
- Questions require nuanced reasoning or multi-step analysis
- Completeness is critical and partial answers are not acceptable
The cost difference between these models is typically 15-20x.
For many production workloads, routing simple queries to
gpt-4o-mini and complex queries to gpt-4o provides the
best tradeoff.
Next Steps
- End-to-End RAG Evaluation -- Evaluate retrieval and generation quality together
- Custom LLM Judges -- Build domain-specific scorers for your use case
- Built-in Scorers Reference -- Full list of available scorers