import re
from difflib import unified_diff
from typing import Callable
from mlflow.entities.assessment import Feedback
from mlflow.entities.assessment_source import AssessmentSource, AssessmentSourceType
from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.judges.utils import (
    format_prompt,
    get_default_model,
    invoke_judge_model,
)
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring
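
# Matches choices written as [[choice name]] and captures the name
# (alphanumerics, underscores, and spaces), e.g. "[[formal]]" -> "formal".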
_CHOICE_PATTERN = re.compile(r"\[\[([\w ]+)\]\]")


@format_docstring(_MODEL_API_DOC)
@experimental(version="3.0.0")
def custom_prompt_judge(
    *,
    name: str,
    prompt_template: str,
    numeric_values: dict[str, float] | None = None,
    model: str | None = None,
) -> Callable[..., Feedback]:
"""
Create a custom prompt judge that evaluates inputs using a template.
Args:
name: Name of the judge, used as the name of returned
:py:class:`mlflow.entities.Feedback` object.
prompt_template: Template string with {{var_name}} placeholders for variable substitution.
Should be prompted with choices as outputs.
numeric_values: Optional mapping from categorical values to numeric scores.
Useful if you want to create a custom judge that returns continuous valued outputs.
Defaults to None.
model: {{ model }}
Returns:
A callable that takes keyword arguments mapping to the template variables
and returns an mlflow :py:class:`mlflow.entities.Feedback`.
Example prompt template:
.. code-block::
You will look at the response and determine the formality of the response.
<request>{{request}}</request>
<response>{{response}}</response>
You must choose one of the following categories.
[[formal]]: The response is very formal.
[[semi_formal]]: The response is somewhat formal. The response is somewhat formal if the
response mentions friendship, etc.
[[not_formal]]: The response is not formal.
Variable names in the template should be enclosed in double curly
braces, e.g., `{{request}}`, `{{response}}`. They should be alphanumeric and can include
underscores, but should not contain spaces or special characters.
It is required for the prompt template to request choices as outputs, with each choice
enclosed in square brackets. Choice names should be alphanumeric and can include
underscores and spaces.
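
    Example usage (illustrative sketch; the judge name, template text, request/response
    values, and score mapping below are assumptions, not part of the API):

    .. code-block:: python

        from mlflow.genai.judges import custom_prompt_judge

        # A template like the one shown above, shortened for this sketch.
        formality_prompt = (
            "You will look at the response and determine the formality of the response.\n\n"
            "<request>{{request}}</request>\n"
            "<response>{{response}}</response>\n\n"
            "You must choose one of the following categories.\n\n"
            "[[formal]]: The response is very formal.\n"
            "[[semi_formal]]: The response is somewhat formal.\n"
            "[[not_formal]]: The response is not formal.\n"
        )

        formality_judge = custom_prompt_judge(
            name="formality",
            prompt_template=formality_prompt,
            numeric_values={"formal": 1.0, "semi_formal": 0.5, "not_formal": 0.0},
        )

        # Keyword arguments map to the {{request}} / {{response}} template variables.
        feedback = formality_judge(
            request="Can you summarize the report?",
            response="Certainly. Please find the summary attached.",
        )
        # With numeric_values provided, feedback.value holds the numeric score and the
        # chosen category is kept in feedback.metadata["string_value"].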
"""
model = model or get_default_model()
if model == "databricks":
try:
from databricks.agents.evals.judges import custom_prompt_judge as db_custom_prompt_judge
return db_custom_prompt_judge(
name=name,
prompt_template=prompt_template,
numeric_values=numeric_values,
)
except ImportError:
raise ImportError(
"The `databricks-agents` package is required to use "
"`mlflow.genai.judges.custom_prompt_judge` with model='databricks'. "
"Please install it with `pip install databricks-agents`."
)

    # Extract choices from the prompt template
    choices = _CHOICE_PATTERN.findall(prompt_template)
    if not choices:
        raise ValueError(
            "Prompt template must include choices denoted with [[CHOICE_NAME]]. "
            "No choices found in the provided prompt template."
        )

    # Validate that choices match numeric_values keys if provided
    if numeric_values is not None:
        sorted_numeric_values = sorted(numeric_values.keys())
        sorted_choices = sorted(choices)
        if sorted_numeric_values != sorted_choices:
            diff = "\n".join(
                unified_diff(
                    sorted_numeric_values,
                    sorted_choices,
                    fromfile="numeric_values_keys",
                    tofile="choices",
                )
            )
            raise ValueError(
                f"numeric_values keys must match the choices included in the prompt template.\n"
                f"numeric_values keys: {sorted_numeric_values}\n"
                f"choices in prompt: {sorted_choices}\n"
                f"Diff:\n{diff}"
            )

        # Validate that numeric_values values are numeric if provided
        if not all(isinstance(value, (int, float)) for value in numeric_values.values()):
            raise ValueError("All values in numeric_values must be numeric (int or float).")

    source = AssessmentSource(
        source_type=AssessmentSourceType.LLM_JUDGE,
        source_id=f"custom_prompt_judge_{name}",
    )

    def judge(**kwargs) -> Feedback:
        try:
            # Render prompt template with the given kwargs
            prompt = format_prompt(prompt_template, **kwargs)
            prompt = _remove_choice_brackets(prompt)
            prompt = _add_structured_output_instructions(prompt)

            # Call the judge
            feedback = invoke_judge_model(model, prompt, name)
            feedback.source = source

            # Feedback value must be one of the choices
            if feedback.value not in choices:
                raise ValueError(f"'{feedback.value}' is not one of the choices: {choices}")

            # Map to numeric value if mapping is provided
            if numeric_values:
                feedback.metadata = {"string_value": feedback.value}
                feedback.value = numeric_values[feedback.value]

            return feedback
        except Exception as e:
            return Feedback(name=name, source=source, error=e)

    return judge


def _add_structured_output_instructions(prompt: str) -> str:
    """Add JSON format instructions to the user prompt."""
    suffix = """
Answer ONLY in JSON and NOT in markdown, following the format:
{
  "rationale": "Reason for the decision. Start each rationale with `Let's think step by step`.",
  "result": "The category chosen."
}
"""
    return f"{prompt.strip()}\n\n{suffix}"


def _remove_choice_brackets(text: str) -> str:
    """Remove double square brackets around choices."""
    return _CHOICE_PATTERN.sub(r"\1", text)