from typing import Any, Literal, get_args, get_origin

from mlflow.genai.judges.base import Judge
from mlflow.genai.judges.instructions_judge import InstructionsJudge
from mlflow.telemetry.events import MakeJudgeEvent
from mlflow.telemetry.track import record_usage_event
from mlflow.utils.annotations import experimental


def _validate_feedback_value_type(feedback_value_type: Any) -> None:
"""
Validate that feedback_value_type is one of the supported types for serialization.
Supported types match FeedbackValueType:
- PbValueType: int, float, str, bool
- Literal types with PbValueType values
- dict[str, PbValueType]
- list[PbValueType]
"""
    from mlflow.entities.assessment import PbValueType
    from mlflow.exceptions import MlflowException

    # Check for basic PbValueType (float, int, str, bool)
    pb_value_types = get_args(PbValueType)
    if feedback_value_type in pb_value_types:
        return

    # Check for Literal type
    origin = get_origin(feedback_value_type)
    if origin is Literal:
        # Validate that all literal values are of PbValueType
        literal_values = get_args(feedback_value_type)
        for value in literal_values:
            if not isinstance(value, pb_value_types):
                raise MlflowException.invalid_parameter_value(
                    "The `feedback_value_type` argument does not support a Literal type "
                    f"with non-primitive types, but got {type(value).__name__}. "
                    "Literal values must be str, int, float, or bool."
                )
        return

    # Check for dict[str, PbValueType]
    if origin is dict:
        args = get_args(feedback_value_type)
        if len(args) == 2:
            key_type, value_type = args
            # Key must be str
            if key_type != str:
                raise MlflowException.invalid_parameter_value(
                    f"dict key type must be str, got {key_type}"
                )
            # Value must be a PbValueType
            if value_type not in pb_value_types:
                raise MlflowException.invalid_parameter_value(
                    "The `feedback_value_type` argument does not support a dict type "
                    f"with non-primitive values, but got {value_type.__name__}"
                )
            return

    # Check for list[PbValueType]
    if origin is list:
        args = get_args(feedback_value_type)
        if len(args) == 1:
            element_type = args[0]
            # Element must be a PbValueType
            if element_type not in pb_value_types:
                raise MlflowException.invalid_parameter_value(
                    "The `feedback_value_type` argument does not support a list type "
                    f"with non-primitive values, but got {element_type.__name__}"
                )
            return

    # If we get here, it's an unsupported type
    raise MlflowException.invalid_parameter_value(
        f"Unsupported feedback_value_type: {feedback_value_type}. "
        "Supported types (FeedbackValueType): str, int, float, bool, Literal[...], "
        "as well as a dict and list of these types. "
        "Pydantic BaseModel types are not supported."
    )


@experimental(version="3.4.0")
@record_usage_event(MakeJudgeEvent)
def make_judge(
    name: str,
    instructions: str,
    model: str | None = None,
    description: str | None = None,
    feedback_value_type: Any = None,
) -> Judge:
"""
.. note::
As of MLflow 3.4.0, this function is deprecated in favor of `mlflow.genai.make_judge`
and may be removed in a future version.
Create a custom MLflow judge instance.
    Args:
        name: The name of the judge.
        instructions: Natural language instructions for evaluation. Must contain at least
            one template variable: {{ inputs }}, {{ outputs }}, {{ expectations }}, or
            {{ trace }} to reference evaluation data. Custom variables are not supported.
        model: The model identifier to use for evaluation (e.g., "openai:/gpt-4").
        description: A description of what the judge evaluates.
        feedback_value_type: Type specification for the 'value' field in the Feedback
            object. The judge will use structured outputs to enforce this type.
            If unspecified, the feedback value type is determined by the judge, but
            explicitly specifying the type is recommended.

            Supported types (matching FeedbackValueType):

            - int: Integer ratings (e.g., a 1-5 scale)
            - float: Floating-point scores (e.g., 0.0-1.0)
            - str: Text responses
            - bool: Yes/no evaluations
            - Literal[values]: Enum-like choices (e.g., Literal["good", "bad"])
            - dict[str, int | float | str | bool]: Dictionary with string keys and
              int, float, str, or bool values (see the dictionary-valued example below)
            - list[int | float | str | bool]: List of int, float, str, or bool values

            Note: Pydantic BaseModel types are not supported.

    Returns:
        An InstructionsJudge instance configured with the provided parameters.

    Example:
        .. code-block:: python

            import mlflow
            from mlflow.genai.judges import make_judge
            from typing import Literal

            # Create a judge that evaluates response quality using template variables
            quality_judge = make_judge(
                name="response_quality",
                instructions=(
                    "Evaluate if the response in {{ outputs }} correctly answers "
                    "the question in {{ inputs }}. The response should be accurate, "
                    "complete, and professional."
                ),
                model="openai:/gpt-4",
                feedback_value_type=Literal["yes", "no"],
            )

            # Evaluate a response
            result = quality_judge(
                inputs={"question": "What is machine learning?"},
                outputs="ML is basically when computers learn stuff on their own",
            )

            # Create a judge that compares against expectations
            correctness_judge = make_judge(
                name="correctness",
                instructions=(
                    "Compare the {{ outputs }} against the {{ expectations }}. "
                    "Rate how well they match on a scale of 1-5."
                ),
                model="openai:/gpt-4",
                feedback_value_type=int,
            )

            # Evaluate with expectations (must be dictionaries)
            result = correctness_judge(
                inputs={"question": "What is the capital of France?"},
                outputs={"answer": "The capital of France is Paris."},
                expectations={"expected_answer": "Paris"},
            )

            # Create a judge that evaluates based on trace context
            trace_judge = make_judge(
                name="trace_quality",
                instructions="Evaluate the overall quality of the {{ trace }} execution.",
                model="openai:/gpt-4",
                feedback_value_type=Literal["good", "needs_improvement"],
            )

            # Use with search_traces() - evaluate each trace
            traces = mlflow.search_traces(experiment_ids=["1"], return_type="list")
            for trace in traces:
                feedback = trace_judge(trace=trace)
                print(f"Trace {trace.info.trace_id}: {feedback.value} - {feedback.rationale}")

            # Align a judge with human feedback
            aligned_judge = quality_judge.align(traces)

            # To see detailed optimization output during alignment, enable DEBUG logging:
            # import logging
            # logging.getLogger("mlflow.genai.judges.optimizers.simba").setLevel(logging.DEBUG)
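
            # A minimal sketch of dictionary-valued feedback (per the supported types
            # above): the judge returns string-keyed integer scores
            rubric_judge = make_judge(
                name="rubric_scores",
                instructions=(
                    "Score the response in {{ outputs }} to the question in {{ inputs }} "
                    "for accuracy and clarity, each on a 1-5 scale."
                ),
                model="openai:/gpt-4",
                feedback_value_type=dict[str, int],
            )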
"""
# Default feedback_value_type to str if not specified (consistent with MLflow <= 3.5.x)
# TODO: Implement logic to allow the LLM to choose the appropriate value type if not specified
if feedback_value_type is None:
feedback_value_type = str
_validate_feedback_value_type(feedback_value_type)
return InstructionsJudge(
name=name,
instructions=instructions,
model=model,
description=description,
feedback_value_type=feedback_value_type,
)