Source code for mlflow.genai.judges.make_judge

from typing import Any, Literal, get_args, get_origin

from mlflow.genai.judges.base import Judge
from mlflow.genai.judges.instructions_judge import InstructionsJudge
from mlflow.telemetry.events import MakeJudgeEvent
from mlflow.telemetry.track import record_usage_event
from mlflow.utils.annotations import experimental


def _validate_feedback_value_type(feedback_value_type: Any) -> None:
    """
    Validate that feedback_value_type is one of the supported types for serialization.

    Supported types match FeedbackValueType:
    - PbValueType: int, float, str, bool
    - Literal types with PbValueType values
    - dict[str, PbValueType]
    - list[PbValueType]
    """

    from mlflow.entities.assessment import PbValueType
    from mlflow.exceptions import MlflowException

    # Check for basic PbValueType (float, int, str, bool)
    pb_value_types = get_args(PbValueType)
    if feedback_value_type in pb_value_types:
        return

    # Check for Literal type
    origin = get_origin(feedback_value_type)
    if origin is Literal:
        # Validate that all literal values are of PbValueType
        literal_values = get_args(feedback_value_type)
        for value in literal_values:
            if not isinstance(value, pb_value_types):
                raise MlflowException.invalid_parameter_value(
                    "The `feedback_value_type` argument does not support a Literal type "
                    f"with non-primitive types, but got {type(value).__name__}. "
                    "Literal values must be str, int, float, or bool."
                )
        return

    # Check for dict[str, PbValueType]
    if origin is dict:
        args = get_args(feedback_value_type)
        if len(args) == 2:
            key_type, value_type = args
            # Key must be str
            if key_type != str:
                raise MlflowException.invalid_parameter_value(
                    f"dict key type must be str, got {key_type}"
                )
            # Value must be a PbValueType
            if value_type not in pb_value_types:
                raise MlflowException.invalid_parameter_value(
                    "The `feedback_value_type` argument does not support a dict type "
                    f"with non-primitive values, but got {value_type.__name__}"
                )
            return

    # Check for list[PbValueType]
    if origin is list:
        args = get_args(feedback_value_type)
        if len(args) == 1:
            element_type = args[0]
            # Element must be a PbValueType
            if element_type not in pb_value_types:
                raise MlflowException.invalid_parameter_value(
                    "The `feedback_value_type` argument does not support a list type "
                    f"with non-primitive values, but got {element_type.__name__}"
                )
            return

    # If we get here, it's an unsupported type
    raise MlflowException.invalid_parameter_value(
        f"Unsupported feedback_value_type: {feedback_value_type}. "
        "Supported types (FeedbackValueType): str, int, float, bool, Literal[...], "
        "as well as dicts and lists of these types. "
        "Pydantic BaseModel types are not supported."
    )
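
# Illustrative sketch (not part of the original module): based on the checks above,
# _validate_feedback_value_type accepts primitive types, Literals of primitives, and
# dict/list types parameterized with primitives, and raises
# MlflowException.invalid_parameter_value otherwise. For example:
#
#     from typing import Literal
#
#     _validate_feedback_value_type(int)                   # passes: basic PbValueType
#     _validate_feedback_value_type(Literal["yes", "no"])  # passes: Literal of primitives
#     _validate_feedback_value_type(dict[str, float])      # passes: dict[str, PbValueType]
#     _validate_feedback_value_type(list[bool])            # passes: list[PbValueType]
#     _validate_feedback_value_type(dict[int, str])        # raises: dict keys must be str
#     _validate_feedback_value_type(list[dict])            # raises: non-primitive element type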


@experimental(version="3.4.0")
@record_usage_event(MakeJudgeEvent)
def make_judge(
    name: str,
    instructions: str,
    model: str | None = None,
    description: str | None = None,
    feedback_value_type: Any = None,
) -> Judge:
    """
    .. note::
        As of MLflow 3.4.0, this function is deprecated in favor of `mlflow.genai.make_judge`
        and may be removed in a future version.

    Create a custom MLflow judge instance.

    Args:
        name: The name of the judge.
        instructions: Natural language instructions for evaluation. Must contain at least one
            template variable: {{ inputs }}, {{ outputs }}, {{ expectations }}, or {{ trace }}
            to reference evaluation data. Custom variables are not supported.
        model: The model identifier to use for evaluation (e.g., "openai:/gpt-4").
        description: A description of what the judge evaluates.
        feedback_value_type: Type specification for the 'value' field in the Feedback object.
            The judge will use structured outputs to enforce this type. If unspecified, the
            feedback value type defaults to ``str``. It is recommended to explicitly specify
            the type.

            Supported types (matching FeedbackValueType):

            - int: Integer ratings (e.g., 1-5 scale)
            - float: Floating point scores (e.g., 0.0-1.0)
            - str: Text responses
            - bool: Yes/no evaluations
            - Literal[values]: Enum-like choices (e.g., Literal["good", "bad"])
            - dict[str, int | float | str | bool]: Dictionary with string keys and int, float,
              str, or bool values
            - list[int | float | str | bool]: List of int, float, str, or bool values

            Note: Pydantic BaseModel types are not supported.

    Returns:
        An InstructionsJudge instance configured with the provided parameters.

    Example:
        .. code-block:: python

            import mlflow
            from mlflow.genai.judges import make_judge
            from typing import Literal

            # Create a judge that evaluates response quality using template variables
            quality_judge = make_judge(
                name="response_quality",
                instructions=(
                    "Evaluate if the response in {{ outputs }} correctly answers "
                    "the question in {{ inputs }}. The response should be accurate, "
                    "complete, and professional."
                ),
                model="openai:/gpt-4",
                feedback_value_type=Literal["yes", "no"],
            )

            # Evaluate a response
            result = quality_judge(
                inputs={"question": "What is machine learning?"},
                outputs="ML is basically when computers learn stuff on their own",
            )

            # Create a judge that compares against expectations
            correctness_judge = make_judge(
                name="correctness",
                instructions=(
                    "Compare the {{ outputs }} against the {{ expectations }}. "
                    "Rate how well they match on a scale of 1-5."
                ),
                model="openai:/gpt-4",
                feedback_value_type=int,
            )

            # Evaluate with expectations (must be dictionaries)
            result = correctness_judge(
                inputs={"question": "What is the capital of France?"},
                outputs={"answer": "The capital of France is Paris."},
                expectations={"expected_answer": "Paris"},
            )

            # Create a judge that evaluates based on trace context
            trace_judge = make_judge(
                name="trace_quality",
                instructions="Evaluate the overall quality of the {{ trace }} execution.",
                model="openai:/gpt-4",
                feedback_value_type=Literal["good", "needs_improvement"],
            )

            # Use with search_traces() - evaluate each trace
            traces = mlflow.search_traces(experiment_ids=["1"], return_type="list")
            for trace in traces:
                feedback = trace_judge(trace=trace)
                print(f"Trace {trace.info.trace_id}: {feedback.value} - {feedback.rationale}")

            # Align a judge with human feedback
            aligned_judge = quality_judge.align(traces)

            # To see detailed optimization output during alignment, enable DEBUG logging:
            # import logging
            # logging.getLogger("mlflow.genai.judges.optimizers.simba").setLevel(logging.DEBUG)
    """
    # Default feedback_value_type to str if not specified (consistent with MLflow <= 3.5.x).
    # TODO: Implement logic to allow the LLM to choose the appropriate value type if not specified.
    if feedback_value_type is None:
        feedback_value_type = str

    _validate_feedback_value_type(feedback_value_type)

    return InstructionsJudge(
        name=name,
        instructions=instructions,
        model=model,
        description=description,
        feedback_value_type=feedback_value_type,
    )
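
# Note (illustrative, not part of the original module): per the deprecation note in the
# docstring above, new code is expected to call the top-level entry point rather than
# importing from this module directly, e.g.
#
#     from mlflow.genai import make_judge  # preferred as of MLflow 3.4.0
#
#     concise_judge = make_judge(
#         name="conciseness",
#         instructions="Rate how concise the {{ outputs }} is on a scale of 1-5.",
#         model="openai:/gpt-4",
#         feedback_value_type=int,
#     )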