Source code for mlflow.genai.judges.builtin

from functools import wraps
from typing import Any

from mlflow.entities.assessment import Feedback
from mlflow.genai.judges.prompts.relevance_to_query import RELEVANCE_TO_QUERY_ASSESSMENT_NAME
from mlflow.genai.judges.utils import CategoricalRating, get_default_model, invoke_judge_model
from mlflow.utils.docstring_utils import format_docstring

_MODEL_API_DOC = {
    "model": """Judge model to use. Must be either `"databricks"` or a form of
`<provider>:/<model-name>`, such as `"openai:/gpt-4.1-mini"`,
`"anthropic:/claude-3.5-sonnet-20240620"`. MLflow natively supports
`["openai", "anthropic", "bedrock", "mistral"]`, and more providers are supported
through `LiteLLM <https://docs.litellm.ai/docs/providers>`_.
Default model depends on the tracking URI setup:

* Databricks: `databricks`
* Otherwise: `openai:/gpt-4.1-mini`.
""",
}
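

# Illustrative sketch, not part of the original module: the `model` strings
# documented above are passed straight through to the judges defined below, and
# `get_default_model()` (imported from mlflow.genai.judges.utils) supplies the
# documented default when `model` is None. Assumes OpenAI credentials are
# configured in the environment.
def _example_model_selection() -> str:
    # Default resolution: "databricks" on a Databricks tracking URI,
    # otherwise "openai:/gpt-4.1-mini".
    default = get_default_model()

    # An explicit "<provider>:/<model-name>" value overrides the default.
    feedback = is_context_relevant(
        request="What is the capital of France?",
        context="Paris is the capital of France.",
        model="openai:/gpt-4.1-mini",
    )
    return f"default={default}, value={feedback.value}"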


def _sanitize_feedback(feedback: Feedback) -> Feedback:
    """Sanitize the feedback object from the databricks judges.

    The judge returns a CategoricalRating class defined in the databricks-agents package.
    This function converts it to MLflow's own CategoricalRating, imported from
    mlflow.genai.judges.utils.

    Args:
        feedback: The Feedback object to convert.

    Returns:
        The same Feedback object with its value converted to MLflow's CategoricalRating.
    """
    feedback.value = CategoricalRating(feedback.value) if feedback.value else feedback.value
    return feedback
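

# Illustrative sketch, not part of the original module: `_sanitize_feedback`
# rewrites the feedback value in place, re-wrapping the string-like rating
# returned by the databricks judges as MLflow's CategoricalRating.
def _example_sanitize_feedback() -> None:
    raw = Feedback(name="relevance_to_query", value="yes")
    sanitized = _sanitize_feedback(raw)
    # CategoricalRating is a string-valued enum, so "yes" maps to
    # CategoricalRating.YES and still compares equal to the plain string.
    assert sanitized.value == CategoricalRating.YES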


def requires_databricks_agents(func):
    """Decorator to check if the `databricks-agents` package is installed."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            import databricks.agents.evals.judges  # noqa: F401

        except ImportError:
            raise ImportError(
                f"The `databricks-agents` package is required to use "
                f"`mlflow.genai.judges.{func.__name__}`. "
                "Please install it with `pip install databricks-agents`."
            )

        return func(*args, **kwargs)

    return wrapper
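

# Illustrative sketch, not part of the original module: how the decorator guards
# a judge that requires the optional `databricks-agents` package. The function
# name below is hypothetical.
@requires_databricks_agents
def _example_databricks_only_judge(*, content: str) -> Feedback:
    # This import only runs after the decorator has verified that
    # `databricks-agents` is installed.
    from databricks.agents.evals.judges import safety

    return _sanitize_feedback(safety(response=content))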


@format_docstring(_MODEL_API_DOC)
def is_context_relevant(
    *, request: str, context: Any, name: str | None = None, model: str | None = None
) -> Feedback:
    """
    LLM judge determines whether the given context is relevant to the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        context: Context to evaluate the relevance to the request.
            Supports any JSON-serializable object.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the context is relevant to the request.

    Example:

        The following example shows how to evaluate whether a document retrieved by a
        retriever is relevant to the user's question.

        .. code-block:: python

            from mlflow.genai.judges import is_context_relevant

            feedback = is_context_relevant(
                request="What is the capital of France?",
                context="Paris is the capital of France.",
            )
            print(feedback.value)  # "yes"

            feedback = is_context_relevant(
                request="What is the capital of France?",
                context="Paris is known for its Eiffel Tower.",
            )
            print(feedback.value)  # "no"

    """
    from mlflow.genai.judges.prompts.relevance_to_query import get_prompt

    model = model or get_default_model()
    # NB: User-facing name for the is_context_relevant assessment. This is required
    # since the existing databricks judge is called `relevance_to_query`.
    assessment_name = name or RELEVANCE_TO_QUERY_ASSESSMENT_NAME
    if model == "databricks":
        from databricks.agents.evals.judges import relevance_to_query

        feedback = relevance_to_query(
            request=request,
            response=str(context),
            assessment_name=assessment_name,
        )
    else:
        prompt = get_prompt(request, str(context))
        feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)

    return _sanitize_feedback(feedback)


@format_docstring(_MODEL_API_DOC)
def is_context_sufficient(
    *,
    request: str,
    context: Any,
    expected_facts: list[str],
    expected_response: str | None = None,
    name: str | None = None,
    model: str | None = None,
) -> Feedback:
    """
    LLM judge determines whether the given context is sufficient to answer the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        context: Context to evaluate the sufficiency of. Supports any JSON-serializable object.
        expected_facts: A list of expected facts that should be present in the context.
            Optional.
        expected_response: The expected response from the application. Optional.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the context is sufficient to answer the request.

    Example:

        The following example shows how to evaluate whether the documents returned by a
        retriever give sufficient context to answer the user's question.

        .. code-block:: python

            from mlflow.genai.judges import is_context_sufficient

            feedback = is_context_sufficient(
                request="What is the capital of France?",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
                expected_facts=["Paris is the capital of France."],
            )
            print(feedback.value)  # "yes"

            feedback = is_context_sufficient(
                request="What is the capital of France?",
                context={"content": "France is a country in Europe."},
                expected_response="Paris is the capital of France.",
            )
            print(feedback.value)  # "no"

    """
    from mlflow.genai.judges.prompts.context_sufficiency import (
        CONTEXT_SUFFICIENCY_FEEDBACK_NAME,
        get_prompt,
    )

    model = model or get_default_model()
    assessment_name = name or CONTEXT_SUFFICIENCY_FEEDBACK_NAME
    if model == "databricks":
        from databricks.agents.evals.judges import context_sufficiency

        feedback = context_sufficiency(
            request=request,
            retrieved_context=context,
            expected_facts=expected_facts,
            expected_response=expected_response,
            assessment_name=assessment_name,
        )
    else:
        prompt = get_prompt(
            request=request,
            context=context,
            expected_response=expected_response,
            expected_facts=expected_facts,
        )
        feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)

    return _sanitize_feedback(feedback)


@format_docstring(_MODEL_API_DOC)
def is_correct(
    *,
    request: str,
    response: str,
    expected_facts: list[str] | None = None,
    expected_response: str | None = None,
    name: str | None = None,
    model: str | None = None,
) -> Feedback:
    """
    LLM judge determines whether the given response is correct for the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        response: The response from the application to evaluate.
        expected_facts: A list of expected facts that should be present in the response.
            Optional.
        expected_response: The expected response from the application. Optional.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response is correct for the request.

    Example:

        The following example shows how to evaluate whether the response is correct.

        .. code-block:: python

            from mlflow.genai.judges import is_correct

            feedback = is_correct(
                request="What is the capital of France?",
                response="Paris is the capital of France.",
                expected_response="Paris",
            )
            print(feedback.value)  # "yes"

            feedback = is_correct(
                request="What is the capital of France?",
                response="London is the capital of France.",
                expected_facts=["Paris is the capital of France"],
            )
            print(feedback.value)  # "no"

    """
    from mlflow.genai.judges.prompts.correctness import CORRECTNESS_FEEDBACK_NAME, get_prompt

    model = model or get_default_model()
    assessment_name = name or CORRECTNESS_FEEDBACK_NAME
    if model == "databricks":
        from databricks.agents.evals.judges import correctness

        feedback = correctness(
            request=request,
            response=response,
            expected_facts=expected_facts,
            expected_response=expected_response,
            assessment_name=assessment_name,
        )
    else:
        prompt = get_prompt(
            request=request,
            response=response,
            expected_response=expected_response,
            expected_facts=expected_facts,
        )
        feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)

    return _sanitize_feedback(feedback)


@format_docstring(_MODEL_API_DOC)
def is_grounded(
    *,
    request: str,
    response: str,
    context: Any,
    name: str | None = None,
    model: str | None = None,
) -> Feedback:
    """
    LLM judge determines whether the given response is grounded in the given context.

    Args:
        request: Input to the application to evaluate, user's question or query.
        response: The response from the application to evaluate.
        context: Context to evaluate the response against. Supports any JSON-serializable object.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response is grounded in the context.

    Example:

        The following example shows how to evaluate whether the response is grounded in
        the context.

        .. code-block:: python

            from mlflow.genai.judges import is_grounded

            feedback = is_grounded(
                request="What is the capital of France?",
                response="Paris",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
            )
            print(feedback.value)  # "yes"

            feedback = is_grounded(
                request="What is the capital of France?",
                response="London is the capital of France.",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
            )
            print(feedback.value)  # "no"

    """
    from mlflow.genai.judges.prompts.groundedness import GROUNDEDNESS_FEEDBACK_NAME, get_prompt

    model = model or get_default_model()
    assessment_name = name or GROUNDEDNESS_FEEDBACK_NAME
    if model == "databricks":
        from databricks.agents.evals.judges import groundedness

        feedback = groundedness(
            request=request,
            response=response,
            retrieved_context=context,
            assessment_name=assessment_name,
        )
    else:
        prompt = get_prompt(
            request=request,
            response=response,
            context=context,
        )
        feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)

    return _sanitize_feedback(feedback)


@requires_databricks_agents
def is_safe(*, content: str, name: str | None = None) -> Feedback:
    """
    LLM judge determines whether the given content is safe.

    Args:
        content: Text content to evaluate for safety.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the content is safe.

    Example:

        .. code-block:: python

            from mlflow.genai.judges import is_safe

            feedback = is_safe(content="I am a happy person.")
            print(feedback.value)  # "yes"

    """
    from databricks.agents.evals.judges import safety

    return _sanitize_feedback(safety(response=content, assessment_name=name))


@format_docstring(_MODEL_API_DOC)
def meets_guidelines(
    *,
    guidelines: str | list[str],
    context: dict[str, Any],
    name: str | None = None,
    model: str | None = None,
) -> Feedback:
    """
    LLM judge determines whether the given response meets the given guideline(s).

    Args:
        guidelines: A single guideline or a list of guidelines.
        context: Mapping of context to be evaluated against the guidelines. For example,
            pass {"response": "<response text>"} to evaluate whether the response meets
            the given guidelines.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response meets the guideline(s).

    Example:

        The following example shows how to evaluate whether the response meets the given
        guideline(s).

        .. code-block:: python

            from mlflow.genai.judges import meets_guidelines

            feedback = meets_guidelines(
                guidelines="Be polite and respectful.",
                context={"response": "Hello, how are you?"},
            )
            print(feedback.value)  # "yes"

            feedback = meets_guidelines(
                guidelines=["Be polite and respectful.", "Must be in English."],
                context={"response": "Hola, ¿cómo estás?"},
            )
            print(feedback.value)  # "no"

    """
    from mlflow.genai.judges.prompts.guidelines import GUIDELINES_FEEDBACK_NAME, get_prompt

    model = model or get_default_model()
    if model == "databricks":
        from databricks.agents.evals.judges import guidelines as guidelines_judge

        feedback = guidelines_judge(
            guidelines=guidelines,
            context=context,
            assessment_name=name,
        )
    else:
        prompt = get_prompt(guidelines, context)
        feedback = invoke_judge_model(
            model, prompt, assessment_name=name or GUIDELINES_FEEDBACK_NAME
        )

    return _sanitize_feedback(feedback)
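

# Illustrative sketch, not part of the original module: the docstring examples
# above omit the `name` and `model` keyword arguments, which can be combined to
# record feedback under a custom assessment name while routing to an explicit
# provider-prefixed model. Assumes OpenAI credentials are configured.
def _example_custom_name_and_model() -> Feedback:
    return meets_guidelines(
        guidelines="Responses must be in English.",
        context={"response": "Hello, how are you?"},
        name="english_only",  # overrides the default feedback name
        model="openai:/gpt-4.1-mini",  # explicit <provider>:/<model-name> form
    )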