from functools import wraps
from typing import Any
from mlflow.entities.assessment import Feedback
from mlflow.genai.judges.prompts.relevance_to_query import RELEVANCE_TO_QUERY_ASSESSMENT_NAME
from mlflow.genai.judges.utils import CategoricalRating, get_default_model, invoke_judge_model
from mlflow.utils.docstring_utils import format_docstring
_MODEL_API_DOC = {
"model": """Judge model to use. Must be either `"databricks"` or a form of
`<provider>:/<model-name>`, such as `"openai:/gpt-4.1-mini"`,
`"anthropic:/claude-3.5-sonnet-20240620"`. MLflow natively supports
`["openai", "anthropic", "bedrock", "mistral"]`, and more providers are supported
through `LiteLLM <https://docs.litellm.ai/docs/providers>`_.
Default model depends on the tracking URI setup:
* Databricks: `databricks`
* Otherwise: `openai:/gpt-4.1-mini`.
""",
}
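# NB: Illustrative sketch (not part of the original module): `format_docstring` is
# assumed to substitute `{{ model }}` placeholders in a decorated function's
# docstring with the text above, e.g.:
#
#   @format_docstring(_MODEL_API_DOC)
#   def my_judge(*, model: str | None = None) -> Feedback:
#       """Hypothetical judge.
#
#       Args:
#           model: {{ model }}
#       """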
def _sanitize_feedback(feedback: Feedback) -> Feedback:
"""Sanitize the feedback object from the databricks judges.
The judge returns a CategoricalRating class defined in the databricks-agents package.
This function converts it to our CategoricalRating definition above.
Args:
feedback: The Feedback object to convert.
Returns:
A new Feedback object with MLflow's CategoricalRating.
"""
feedback.value = CategoricalRating(feedback.value) if feedback.value else feedback.value
return feedback
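# Illustrative sketch (not part of the original module), assuming Feedback accepts
# name/value keyword arguments and CategoricalRating exposes YES/NO members:
#
#   raw = Feedback(name="relevance_to_query", value="yes")
#   assert _sanitize_feedback(raw).value == CategoricalRating.YES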
def requires_databricks_agents(func):
"""Decorator to check if the `databricks-agents` package is installed."""
@wraps(func)
def wrapper(*args, **kwargs):
try:
import databricks.agents.evals.judges # noqa: F401
except ImportError:
raise ImportError(
f"The `databricks-agents` package is required to use "
f"`mlflow.genai.judges.{func.__name__}`. "
"Please install it with `pip install databricks-agents`."
)
return func(*args, **kwargs)
return wrapper
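# Illustrative sketch (hypothetical judge, not part of the original module): the
# decorator guards any judge that depends on the `databricks-agents` package.
#
#   @requires_databricks_agents
#   def my_databricks_only_judge(*, content: str) -> Feedback:
#       from databricks.agents.evals.judges import safety
#       return _sanitize_feedback(safety(response=content))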
@format_docstring(_MODEL_API_DOC)
def is_context_relevant(
*, request: str, context: Any, name: str | None = None, model: str | None = None
) -> Feedback:
"""
LLM judge determines whether the given context is relevant to the input request.
Args:
request: Input to the application to evaluate, user's question or query.
context: Context to evaluate the relevance to the request.
Supports any JSON-serializable object.
name: Optional name for overriding the default name of the returned feedback.
model: {{ model }}
Returns:
A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no" value
indicating whether the context is relevant to the request.
Example:
The following example shows how to evaluate whether a document retrieved by a
retriever is relevant to the user's question.
.. code-block:: python
from mlflow.genai.judges import is_context_relevant
feedback = is_context_relevant(
request="What is the capital of France?",
context="Paris is the capital of France.",
)
print(feedback.value) # "yes"
feedback = is_context_relevant(
request="What is the capital of France?",
context="Paris is known for its Eiffel Tower.",
)
print(feedback.value) # "no"
"""
from mlflow.genai.judges.prompts.relevance_to_query import get_prompt
model = model or get_default_model()
# NB: User-facing name for the is_context_relevant assessment. This is required
# since the existing databricks judge is called `relevance_to_query`
assessment_name = name or RELEVANCE_TO_QUERY_ASSESSMENT_NAME
if model == "databricks":
from databricks.agents.evals.judges import relevance_to_query
feedback = relevance_to_query(
request=request,
response=str(context),
assessment_name=assessment_name,
)
else:
prompt = get_prompt(request, str(context))
feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)
return _sanitize_feedback(feedback)
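# Illustrative sketch (not from the original source): passing an explicit judge model
# instead of relying on get_default_model(); the URI below is only an example value.
#
#   feedback = is_context_relevant(
#       request="What is the capital of France?",
#       context="Paris is the capital of France.",
#       model="openai:/gpt-4.1-mini",
#   )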
@format_docstring(_MODEL_API_DOC)
def is_context_sufficient(
*,
request: str,
context: Any,
expected_facts: list[str],
expected_response: str | None = None,
name: str | None = None,
model: str | None = None,
) -> Feedback:
"""
LLM judge determines whether the given context is sufficient to answer the input request.
Args:
request: Input to the application to evaluate, user's question or query.
context: Context to evaluate the sufficiency of. Supports any JSON-serializable object.
expected_facts: A list of expected facts that should be present in the context.
expected_response: The expected response from the application. Optional.
name: Optional name for overriding the default name of the returned feedback.
model: {{ model }}
Returns:
A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
value indicating whether the context is sufficient to answer the request.
Example:
The following example shows how to evaluate whether the documents returned by a
retriever give sufficient context to answer the user's question.
.. code-block:: python
from mlflow.genai.judges import is_context_sufficient
feedback = is_context_sufficient(
request="What is the capital of France?",
context=[
{"content": "Paris is the capital of France."},
{"content": "Paris is known for its Eiffel Tower."},
],
expected_facts=["Paris is the capital of France."],
)
print(feedback.value) # "yes"
feedback = is_context_sufficient(
request="What is the capital of France?",
context={"content": "France is a country in Europe."},
expected_response="Paris is the capital of France.",
)
print(feedback.value) # "no"
"""
from mlflow.genai.judges.prompts.context_sufficiency import (
CONTEXT_SUFFICIENCY_FEEDBACK_NAME,
get_prompt,
)
model = model or get_default_model()
assessment_name = name or CONTEXT_SUFFICIENCY_FEEDBACK_NAME
if model == "databricks":
from databricks.agents.evals.judges import context_sufficiency
feedback = context_sufficiency(
request=request,
retrieved_context=context,
expected_facts=expected_facts,
expected_response=expected_response,
assessment_name=assessment_name,
)
else:
prompt = get_prompt(
request=request,
context=context,
expected_response=expected_response,
expected_facts=expected_facts,
)
feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)
return _sanitize_feedback(feedback)
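# Illustrative sketch (not from the original source): inspecting the returned Feedback
# beyond its value, assuming the entity also exposes `name` and `rationale`.
#
#   feedback = is_context_sufficient(
#       request="What is the capital of France?",
#       context={"content": "Paris is the capital of France."},
#       expected_facts=["Paris is the capital of France."],
#   )
#   print(feedback.name)       # default assessment name unless `name` is overridden
#   print(feedback.rationale)  # judge-provided explanation, if any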
@format_docstring(_MODEL_API_DOC)
def is_correct(
*,
request: str,
response: str,
expected_facts: list[str] | None = None,
expected_response: str | None = None,
name: str | None = None,
model: str | None = None,
) -> Feedback:
"""
LLM judge determines whether the given response is correct for the input request.
Args:
request: Input to the application to evaluate, user's question or query.
response: The response from the application to evaluate.
expected_facts: A list of expected facts that should be present in the response. Optional.
expected_response: The expected response from the application. Optional.
name: Optional name for overriding the default name of the returned feedback.
model: {{ model }}
Returns:
A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
value indicating whether the response is correct for the request.
Example:
The following example shows how to evaluate whether the response is correct.
.. code-block:: python
from mlflow.genai.judges import is_correct
feedback = is_correct(
request="What is the capital of France?",
response="Paris is the capital of France.",
expected_response="Paris",
)
print(feedback.value) # "yes"
feedback = is_correct(
request="What is the capital of France?",
response="London is the capital of France.",
expected_facts=["Paris is the capital of France"],
)
print(feedback.value) # "no"
"""
from mlflow.genai.judges.prompts.correctness import CORRECTNESS_FEEDBACK_NAME, get_prompt
model = model or get_default_model()
assessment_name = name or CORRECTNESS_FEEDBACK_NAME
if model == "databricks":
from databricks.agents.evals.judges import correctness
feedback = correctness(
request=request,
response=response,
expected_facts=expected_facts,
expected_response=expected_response,
assessment_name=assessment_name,
)
else:
prompt = get_prompt(
request=request,
response=response,
expected_response=expected_response,
expected_facts=expected_facts,
)
feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)
return _sanitize_feedback(feedback)
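# Illustrative sketch (not from the original source): routing explicitly to the
# Databricks-hosted judge (requires the `databricks-agents` package to be installed).
#
#   feedback = is_correct(
#       request="What is the capital of France?",
#       response="Paris is the capital of France.",
#       expected_response="Paris",
#       model="databricks",
#   )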
@format_docstring(_MODEL_API_DOC)
def is_grounded(
*,
request: str,
response: str,
context: Any,
name: str | None = None,
model: str | None = None,
) -> Feedback:
"""
LLM judge determines whether the given response is grounded in the given context.
Args:
request: Input to the application to evaluate, user's question or query.
response: The response from the application to evaluate.
context: Context to evaluate the response against. Supports any JSON-serializable object.
name: Optional name for overriding the default name of the returned feedback.
model: {{ model }}
Returns:
A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
value indicating whether the response is grounded in the context.
Example:
The following example shows how to evaluate whether the response is grounded in
the context.
.. code-block:: python
from mlflow.genai.judges import is_grounded
feedback = is_grounded(
request="What is the capital of France?",
response="Paris",
context=[
{"content": "Paris is the capital of France."},
{"content": "Paris is known for its Eiffel Tower."},
],
)
print(feedback.value) # "yes"
feedback = is_grounded(
request="What is the capital of France?",
response="London is the capital of France.",
context=[
{"content": "Paris is the capital of France."},
{"content": "Paris is known for its Eiffel Tower."},
],
)
print(feedback.value) # "no"
"""
from mlflow.genai.judges.prompts.groundedness import GROUNDEDNESS_FEEDBACK_NAME, get_prompt
model = model or get_default_model()
assessment_name = name or GROUNDEDNESS_FEEDBACK_NAME
if model == "databricks":
from databricks.agents.evals.judges import groundedness
feedback = groundedness(
request=request,
response=response,
retrieved_context=context,
assessment_name=assessment_name,
)
else:
prompt = get_prompt(
request=request,
response=response,
context=context,
)
feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)
return _sanitize_feedback(feedback)
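# Illustrative sketch (not from the original source): `context` accepts any
# JSON-serializable object, so a plain string works as well, and `name` overrides
# the default feedback name.
#
#   feedback = is_grounded(
#       request="What is the capital of France?",
#       response="Paris",
#       context="Paris is the capital of France.",
#       name="groundedness_check",
#   )
#   print(feedback.name)  # "groundedness_check"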
@requires_databricks_agents
def is_safe(*, content: str, name: str | None = None) -> Feedback:
"""
LLM judge determines whether the given content is safe.
Args:
content: Text content to evaluate for safety.
name: Optional name for overriding the default name of the returned feedback.
Returns:
A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
value indicating whether the content is safe.
Example:
.. code-block:: python
from mlflow.genai.judges import is_safe
feedback = is_safe(content="I am a happy person.")
print(feedback.value) # "yes"
"""
from databricks.agents.evals.judges import safety
return _sanitize_feedback(safety(response=content, assessment_name=name))
@format_docstring(_MODEL_API_DOC)
def meets_guidelines(
*,
guidelines: str | list[str],
context: dict[str, Any],
name: str | None = None,
model: str | None = None,
) -> Feedback:
"""
LLM judge determines whether the given response meets the given guideline(s).
Args:
guidelines: A single guideline or a list of guidelines.
context: Mapping of context to be evaluated against the guidelines. For example,
pass {"response": "<response text>"} to evaluate whether the response meets
the given guidelines.
name: Optional name for overriding the default name of the returned feedback.
model: {{ model }}
Returns:
A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
value indicating whether the response meets the guideline(s).
Example:
The following example shows how to evaluate whether the response meets the given
guideline(s).
.. code-block:: python
from mlflow.genai.judges import meets_guidelines
feedback = meets_guidelines(
guidelines="Be polite and respectful.",
context={"response": "Hello, how are you?"},
)
print(feedback.value) # "yes"
feedback = meets_guidelines(
guidelines=["Be polite and respectful.", "Must be in English."],
context={"response": "Hola, ¿cómo estás?"},
)
print(feedback.value) # "no"
"""
from mlflow.genai.judges.prompts.guidelines import GUIDELINES_FEEDBACK_NAME, get_prompt
model = model or get_default_model()
if model == "databricks":
from databricks.agents.evals.judges import guidelines as guidelines_judge
feedback = guidelines_judge(
guidelines=guidelines,
context=context,
assessment_name=name,
)
else:
prompt = get_prompt(guidelines, context)
feedback = invoke_judge_model(
model, prompt, assessment_name=name or GUIDELINES_FEEDBACK_NAME
)
return _sanitize_feedback(feedback)
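# Illustrative sketch (not from the original source): guidelines that reference both
# the request and the response, assuming the judge accepts additional context keys
# beyond "response".
#
#   feedback = meets_guidelines(
#       guidelines="The response must directly answer the user's request.",
#       context={
#           "request": "What is the capital of France?",
#           "response": "Paris is the capital of France.",
#       },
#   )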