Source code for mlflow.genai.judges.make_judge

from mlflow.genai.judges.base import Judge
from mlflow.genai.judges.instructions_judge import InstructionsJudge
from mlflow.utils.annotations import experimental


@experimental(version="3.4.0")
def make_judge(
    name: str,
    instructions: str,
    model: str | None = None,
) -> Judge:
    """
    Create a custom MLflow judge instance.

    Args:
        name: The name of the judge.
        instructions: Natural language instructions for evaluation. Must contain at
            least one template variable: {{ inputs }}, {{ outputs }}, {{ expectations }},
            or {{ trace }} to reference evaluation data. Custom variables are not
            supported.
        model: The model identifier to use for evaluation (e.g., "openai:/gpt-4").

    Returns:
        An InstructionsJudge instance configured with the provided parameters.

    Example:
        .. code-block:: python

            import mlflow
            from mlflow.genai.judges import make_judge

            # Create a judge that evaluates response quality using template variables
            quality_judge = make_judge(
                name="response_quality",
                instructions=(
                    "Evaluate if the response in {{ outputs }} correctly answers "
                    "the question in {{ inputs }}. The response should be accurate, "
                    "complete, and professional."
                ),
                model="openai:/gpt-4",
            )

            # Evaluate a response
            result = quality_judge(
                inputs={"question": "What is machine learning?"},
                outputs="ML is basically when computers learn stuff on their own",
            )

            # Create a judge that compares outputs against expectations
            correctness_judge = make_judge(
                name="correctness",
                instructions=(
                    "Compare the {{ outputs }} against the {{ expectations }}. "
                    "Rate how well they match on a scale of 1-5."
                ),
                model="openai:/gpt-4",
            )

            # Evaluate with expectations (must be dictionaries)
            result = correctness_judge(
                inputs={"question": "What is the capital of France?"},
                outputs={"answer": "The capital of France is Paris."},
                expectations={"expected_answer": "Paris"},
            )

            # Create a judge that evaluates based on trace context
            trace_judge = make_judge(
                name="trace_quality",
                instructions="Evaluate the overall quality of the {{ trace }} execution.",
                model="openai:/gpt-4",
            )

            # Use with search_traces() - evaluate each trace
            traces = mlflow.search_traces(experiment_ids=["1"], return_type="list")
            for trace in traces:
                feedback = trace_judge(trace=trace)
                print(f"Trace {trace.info.trace_id}: {feedback.value} - {feedback.rationale}")
    """
    return InstructionsJudge(name=name, instructions=instructions, model=model)
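

# Usage sketch: wiring a judge produced by make_judge into batch evaluation.
# This assumes the returned Judge can be passed as a scorer to mlflow.genai.evaluate
# and that each record exposes "inputs"/"outputs" fields matching the template
# variables referenced in the instructions; the dataset below is illustrative.
if __name__ == "__main__":
    import mlflow
    from mlflow.genai.judges import make_judge

    quality_judge = make_judge(
        name="response_quality",
        instructions=(
            "Evaluate if the response in {{ outputs }} correctly answers "
            "the question in {{ inputs }}."
        ),
        model="openai:/gpt-4",
    )

    # Minimal illustrative dataset: one record with inputs and a candidate output.
    eval_data = [
        {
            "inputs": {"question": "What is MLflow?"},
            "outputs": "MLflow is an open source platform for the ML lifecycle.",
        },
    ]

    # Run the judge over the dataset as a scorer; results are logged to MLflow.
    results = mlflow.genai.evaluate(data=eval_data, scorers=[quality_judge])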