from functools import wraps
from typing import Any
from mlflow.entities.assessment import Feedback
from mlflow.genai.judges.prompts.relevance_to_query import RELEVANCE_TO_QUERY_ASSESSMENT_NAME
from mlflow.genai.judges.utils import CategoricalRating, get_default_model, invoke_judge_model
from mlflow.utils.docstring_utils import format_docstring


_MODEL_API_DOC = {
    "model": """Judge model to use. Must be either `"databricks"` or in the form of
`<provider>:/<model-name>`, such as `"openai:/gpt-4.1-mini"` or
`"anthropic:/claude-3.5-sonnet-20240620"`. MLflow natively supports
`["openai", "anthropic", "bedrock", "mistral"]`, and more providers are supported
through `LiteLLM <https://docs.litellm.ai/docs/providers>`_.

The default model depends on the tracking URI setup:

* Databricks: `databricks`
* Otherwise: `openai:/gpt-4.1-mini`.
""",
}


def _sanitize_feedback(feedback: Feedback) -> Feedback:
    """Sanitize the feedback object returned by the databricks judges.

    The judge returns a CategoricalRating class defined in the databricks-agents package.
    This function converts it to the CategoricalRating imported from
    mlflow.genai.judges.utils.

    Args:
        feedback: The Feedback object to convert.

    Returns:
        A new Feedback object with MLflow's CategoricalRating.
    """
    # Preserve falsy values (e.g. None) as-is; only convert non-empty ratings.
    feedback.value = CategoricalRating(feedback.value) if feedback.value else feedback.value
    return feedback


def requires_databricks_agents(func):
    """Decorator to check that the `databricks-agents` package is installed."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            import databricks.agents.evals.judges  # noqa: F401
        except ImportError as e:
            raise ImportError(
                "The `databricks-agents` package is required to use "
                f"`mlflow.genai.judges.{func.__name__}`. "
                "Please install it with `pip install databricks-agents`."
            ) from e
        return func(*args, **kwargs)

    return wrapper


@format_docstring(_MODEL_API_DOC)
def is_context_relevant(
    *, request: str, context: Any, name: str | None = None, model: str | None = None
) -> Feedback:
    """
    LLM judge determines whether the given context is relevant to the input request.
    Args:
        request: Input to the application to evaluate, user's question or query.
        context: Context to evaluate the relevance to the request.
            Supports any JSON-serializable object.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}
    Returns:
        A :py:class:`mlflow.entities.assessment.Feedback~` object with a "yes" or "no" value
        indicating whether the context is relevant to the request.
    Example:
        The following example shows how to evaluate whether a document retrieved by a
        retriever is relevant to the user's question.
        .. code-block:: python
            from mlflow.genai.judges import is_context_relevant
            feedback = is_context_relevant(
                request="What is the capital of France?",
                context="Paris is the capital of France.",
            )
            print(feedback.value)  # "yes"
            feedback = is_context_relevant(
                request="What is the capital of France?",
                context="Paris is known for its Eiffel Tower.",
            )
            print(feedback.value)  # "no"
    """
    from mlflow.genai.judges.prompts.relevance_to_query import get_prompt
    model = model or get_default_model()
    # NB: User-facing name for the is_context_relevant assessment. This is required
    #     since the existing databricks judge is called `relevance_to_query`
    assessment_name = name or RELEVANCE_TO_QUERY_ASSESSMENT_NAME
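    # Route the evaluation: the literal "databricks" model name delegates to the managed
    # Databricks judge, while any other provider URI renders the prompt locally and is
    # scored via invoke_judge_model.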
    if model == "databricks":
        from databricks.agents.evals.judges import relevance_to_query
        feedback = relevance_to_query(
            request=request,
            response=str(context),
            assessment_name=assessment_name,
        )
    else:
        prompt = get_prompt(request, str(context))
        feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)
    return _sanitize_feedback(feedback)


@format_docstring(_MODEL_API_DOC)
def is_context_sufficient(
    *,
    request: str,
    context: Any,
    expected_facts: list[str] | None = None,
    expected_response: str | None = None,
    name: str | None = None,
    model: str | None = None,
) -> Feedback:
    """
    LLM judge determines whether the given context is sufficient to answer the input request.
    Args:
        request: Input to the application to evaluate, user's question or query.
        context: Context to evaluate the sufficiency of. Supports any JSON-serializable object.
        expected_facts: A list of expected facts that should be present in the context. Optional.
        expected_response: The expected response from the application. Optional.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}
    Returns:
        A :py:class:`mlflow.entities.assessment.Feedback~` object with a "yes" or "no"
        value indicating whether the context is sufficient to answer the request.
    Example:
        The following example shows how to evaluate whether the documents returned by a
        retriever gives sufficient context to answer the user's question.
        .. code-block:: python
            from mlflow.genai.judges import is_context_sufficient
            feedback = is_context_sufficient(
                request="What is the capital of France?",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
                expected_facts=["Paris is the capital of France."],
            )
            print(feedback.value)  # "yes"
            feedback = is_context_sufficient(
                request="What is the capital of France?",
                context={"content": "France is a country in Europe."},
                expected_response="Paris is the capital of France.",
            )
            print(feedback.value)  # "no"
    """
    from mlflow.genai.judges.prompts.context_sufficiency import (
        CONTEXT_SUFFICIENCY_FEEDBACK_NAME,
        get_prompt,
    )
    model = model or get_default_model()
    assessment_name = name or CONTEXT_SUFFICIENCY_FEEDBACK_NAME
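    # Same dispatch pattern: "databricks" uses the managed context_sufficiency judge,
    # any other model is prompted directly through invoke_judge_model.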
    if model == "databricks":
        from databricks.agents.evals.judges import context_sufficiency
        feedback = context_sufficiency(
            request=request,
            retrieved_context=context,
            expected_facts=expected_facts,
            expected_response=expected_response,
            assessment_name=assessment_name,
        )
    else:
        prompt = get_prompt(
            request=request,
            context=context,
            expected_response=expected_response,
            expected_facts=expected_facts,
        )
        feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)
    return _sanitize_feedback(feedback)


@format_docstring(_MODEL_API_DOC)
def is_correct(
    *,
    request: str,
    response: str,
    expected_facts: list[str] | None = None,
    expected_response: str | None = None,
    name: str | None = None,
    model: str | None = None,
) -> Feedback:
    """
    LLM judge determines whether the given response is correct for the input request.
    Args:
        request: Input to the application to evaluate, user's question or query.
        response: The response from the application to evaluate.
        expected_facts: A list of expected facts that should be present in the response. Optional.
        expected_response: The expected response from the application. Optional.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}
    Returns:
        A :py:class:`mlflow.entities.assessment.Feedback~` object with a "yes" or "no"
        value indicating whether the response is correct for the request.
    Example:
        The following example shows how to evaluate whether the response is correct.
        .. code-block:: python
            from mlflow.genai.judges import is_correct
            feedback = is_correct(
                request="What is the capital of France?",
                response="Paris is the capital of France.",
                expected_response="Paris",
            )
            print(feedback.value)  # "yes"
            feedback = is_correct(
                request="What is the capital of France?",
                response="London is the capital of France.",
                expected_facts=["Paris is the capital of France"],
            )
            print(feedback.value)  # "no"
    """
    from mlflow.genai.judges.prompts.correctness import CORRECTNESS_FEEDBACK_NAME, get_prompt
    model = model or get_default_model()
    assessment_name = name or CORRECTNESS_FEEDBACK_NAME
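    # "databricks" delegates to the managed correctness judge; other models are prompted
    # locally with the correctness template and scored via invoke_judge_model.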
    if model == "databricks":
        from databricks.agents.evals.judges import correctness
        feedback = correctness(
            request=request,
            response=response,
            expected_facts=expected_facts,
            expected_response=expected_response,
            assessment_name=assessment_name,
        )
    else:
        prompt = get_prompt(
            request=request,
            response=response,
            expected_response=expected_response,
            expected_facts=expected_facts,
        )
        feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)
    return _sanitize_feedback(feedback)


@format_docstring(_MODEL_API_DOC)
def is_grounded(
    *,
    request: str,
    response: str,
    context: Any,
    name: str | None = None,
    model: str | None = None,
) -> Feedback:
    """
    LLM judge determines whether the given response is grounded in the given context.
    Args:
        request: Input to the application to evaluate, user's question or query.
        response: The response from the application to evaluate.
        context: Context to evaluate the response against. Supports any JSON-serializable object.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}
    Returns:
        A :py:class:`mlflow.entities.assessment.Feedback~` object with a "yes" or "no"
        value indicating whether the response is grounded in the context.
    Example:
        The following example shows how to evaluate whether the response is grounded in
        the context.
        .. code-block:: python
            from mlflow.genai.judges import is_grounded
            feedback = is_grounded(
                request="What is the capital of France?",
                response="Paris",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
            )
            print(feedback.value)  # "yes"
            feedback = is_grounded(
                request="What is the capital of France?",
                response="London is the capital of France.",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
            )
            print(feedback.value)  # "no"
    """
    from mlflow.genai.judges.prompts.groundedness import GROUNDEDNESS_FEEDBACK_NAME, get_prompt
    model = model or get_default_model()
    assessment_name = name or GROUNDEDNESS_FEEDBACK_NAME
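    # "databricks" delegates to the managed groundedness judge; other models receive the
    # locally rendered groundedness prompt via invoke_judge_model.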
    if model == "databricks":
        from databricks.agents.evals.judges import groundedness
        feedback = groundedness(
            request=request,
            response=response,
            retrieved_context=context,
            assessment_name=assessment_name,
        )
    else:
        prompt = get_prompt(
            request=request,
            response=response,
            context=context,
        )
        feedback = invoke_judge_model(model, prompt, assessment_name=assessment_name)
    return _sanitize_feedback(feedback)


@requires_databricks_agents
def is_safe(*, content: str, name: str | None = None) -> Feedback:
    """
    LLM judge determines whether the given response is safe.
    Args:
        content: Text content to evaluate for safety.
        name: Optional name for overriding the default name of the returned feedback.
    Returns:
        A :py:class:`mlflow.entities.assessment.Feedback~` object with a "yes" or "no"
        value indicating whether the response is safe.
    Example:
        .. code-block:: python
            from mlflow.genai.judges import is_safe
            feedback = is_safe(content="I am a happy person.")
            print(feedback.value)  # "yes"
    """
    from databricks.agents.evals.judges import safety
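    # The safety judge is only available through databricks-agents; its rating is converted
    # to MLflow's CategoricalRating before being returned.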
    return _sanitize_feedback(safety(response=content, assessment_name=name))


@format_docstring(_MODEL_API_DOC)
def meets_guidelines(
    *,
    guidelines: str | list[str],
    context: dict[str, Any],
    name: str | None = None,
    model: str | None = None,
) -> Feedback:
    """
    LLM judge determines whether the given response meets the given guideline(s).
    Args:
        guidelines: A single guideline or a list of guidelines.
        context: Mapping of context to be evaluated against the guidelines. For example,
            pass {"response": "<response text>"} to evaluate whether the response meets
            the given guidelines.
        name: Optional name for overriding the default name of the returned feedback.
        model: {{ model }}
    Returns:
        A :py:class:`mlflow.entities.assessment.Feedback~` object with a "yes" or "no"
        value indicating whether the response meets the guideline(s).
    Example:
        The following example shows how to evaluate whether the response meets the given
        guideline(s).
        .. code-block:: python
            from mlflow.genai.judges import meets_guidelines
            feedback = meets_guidelines(
                guidelines="Be polite and respectful.",
                context={"response": "Hello, how are you?"},
            )
            print(feedback.value)  # "yes"
            feedback = meets_guidelines(
                guidelines=["Be polite and respectful.", "Must be in English."],
                context={"response": "Hola, ¿cómo estás?"},
            )
            print(feedback.value)  # "no"
    """
    from mlflow.genai.judges.prompts.guidelines import GUIDELINES_FEEDBACK_NAME, get_prompt
    model = model or get_default_model()
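    # "databricks" delegates to the managed guidelines judge; other models are prompted via
    # invoke_judge_model, falling back to GUIDELINES_FEEDBACK_NAME when no name is given.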
    if model == "databricks":
        from databricks.agents.evals.judges import guidelines as guidelines_judge
        feedback = guidelines_judge(
            guidelines=guidelines,
            context=context,
            assessment_name=name,
        )
    else:
        prompt = get_prompt(guidelines, context)
        feedback = invoke_judge_model(
            model, prompt, assessment_name=name or GUIDELINES_FEEDBACK_NAME
        )
    return _sanitize_feedback(feedback)