Source code for mlflow.genai.judges.databricks

from functools import wraps
from typing import Any, Optional, Union

from mlflow.entities.assessment import Feedback


def requires_databricks_agents(func):
    """Decorator to check if the `databricks-agents` package is installed."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            import databricks.agents.evals.judges  # noqa: F401

        except ImportError:
            raise ImportError(
                f"The `databricks-agents` package is required to use "
                f"`mlflow.genai.judges.{func.__name__}`. "
                "Please install it with `pip install databricks-agents`."
            )

        return func(*args, **kwargs)

    return wrapper
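

# Usage sketch (hypothetical function, illustrative only): the dependency check
# runs at call time, not at decoration time, so a missing `databricks-agents`
# package surfaces as an ImportError with an install hint only when the
# decorated judge is invoked.
@requires_databricks_agents
def _example_judge(*, request: str) -> None:
    """Hypothetical judge used only to illustrate the decorator."""


def _example_missing_dependency() -> None:
    try:
        _example_judge(request="What is the capital of France?")
    except ImportError as e:
        print(e)  # suggests `pip install databricks-agents`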


@requires_databricks_agents
def is_context_relevant(*, request: str, context: Any, name: Optional[str] = None) -> Feedback:
    """
    LLM judge determines whether the given context is relevant to the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        context: Context to evaluate the relevance to the request.
            Supports any JSON-serializable object.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the context is relevant to the request.

    Example:

        The following example shows how to evaluate whether a document retrieved by a
        retriever is relevant to the user's question.

        .. code-block:: python

            from mlflow.genai.judges import is_context_relevant

            feedback = is_context_relevant(
                request="What is the capital of France?",
                context="Paris is the capital of France.",
            )
            print(feedback.value)  # "yes"

            feedback = is_context_relevant(
                request="What is the capital of France?",
                context="Paris is known for its Eiffel Tower.",
            )
            print(feedback.value)  # "no"
    """
    from databricks.agents.evals.judges import chunk_relevance

    # NB: `chunk_relevance` takes a list of context chunks and returns a list of feedbacks.
    return chunk_relevance(
        request=request,
        retrieved_context=[context],
        assessment_name=name,
    )[0]


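# Usage sketch (hypothetical helper, not part of the module API): the judge
# above scores a single chunk per call, so a list of retrieved chunks can be
# filtered by calling it once per chunk and keeping the chunks judged relevant.
def _example_filter_relevant_chunks(request: str, retrieved_chunks: list[str]) -> list[str]:
    return [
        chunk
        for chunk in retrieved_chunks
        if is_context_relevant(request=request, context=chunk).value == "yes"
    ]

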
@requires_databricks_agents
def is_context_sufficient(
    *,
    request: str,
    context: Any,
    expected_facts: list[str],
    expected_response: Optional[str] = None,
    name: Optional[str] = None,
) -> Feedback:
    """
    LLM judge determines whether the given context is sufficient to answer the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        context: Context to evaluate the sufficiency of. Supports any JSON-serializable object.
        expected_facts: A list of expected facts that should be present in the context.
        expected_response: The expected response from the application. Optional.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the context is sufficient to answer the request.

    Example:

        The following example shows how to evaluate whether the documents returned by a
        retriever give sufficient context to answer the user's question.

        .. code-block:: python

            from mlflow.genai.judges import is_context_sufficient

            feedback = is_context_sufficient(
                request="What is the capital of France?",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
                expected_facts=["Paris is the capital of France."],
            )
            print(feedback.value)  # "yes"
    """
    from databricks.agents.evals.judges import context_sufficiency

    return context_sufficiency(
        request=request,
        retrieved_context=context,
        expected_facts=expected_facts,
        expected_response=expected_response,
        assessment_name=name,
    )


@requires_databricks_agents
def is_correct(
    *,
    request: str,
    response: str,
    expected_facts: list[str],
    expected_response: Optional[str] = None,
    name: Optional[str] = None,
) -> Feedback:
    """
    LLM judge determines whether the given response is correct for the input request.

    Args:
        request: Input to the application to evaluate, user's question or query.
        response: The response from the application to evaluate.
        expected_facts: A list of expected facts that should be present in the response.
        expected_response: The expected response from the application. Optional.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response is correct for the request.
    """
    from databricks.agents.evals.judges import correctness

    return correctness(
        request=request,
        response=response,
        expected_facts=expected_facts,
        expected_response=expected_response,
        assessment_name=name,
    )


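# Usage sketch for `is_correct` (illustrative values; the actual feedback
# depends on the LLM judge backend):
def _example_is_correct() -> None:
    feedback = is_correct(
        request="What is the capital of France?",
        response="The capital of France is Paris.",
        expected_facts=["Paris is the capital of France."],
    )
    print(feedback.value)  # expected "yes" for this response

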
@requires_databricks_agents
def is_grounded(
    *, request: str, response: str, context: Any, name: Optional[str] = None
) -> Feedback:
    """
    LLM judge determines whether the given response is grounded in the given context.

    Args:
        request: Input to the application to evaluate, user's question or query.
        response: The response from the application to evaluate.
        context: Context to evaluate the response against. Supports any JSON-serializable object.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response is grounded in the context.

    Example:

        The following example shows how to evaluate whether the response is grounded in
        the context.

        .. code-block:: python

            from mlflow.genai.judges import is_grounded

            feedback = is_grounded(
                request="What is the capital of France?",
                response="Paris",
                context=[
                    {"content": "Paris is the capital of France."},
                    {"content": "Paris is known for its Eiffel Tower."},
                ],
            )
            print(feedback.value)  # "yes"
    """
    from databricks.agents.evals.judges import groundedness

    return groundedness(
        request=request,
        response=response,
        retrieved_context=context,
        assessment_name=name,
    )


@requires_databricks_agents
def is_safe(*, content: str, name: Optional[str] = None) -> Feedback:
    """
    LLM judge determines whether the given content is safe.

    Args:
        content: Text content to evaluate for safety.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the content is safe.

    Example:

        .. code-block:: python

            from mlflow.genai.judges import is_safe

            feedback = is_safe(content="I am a happy person.")
            print(feedback.value)  # "yes"
    """
    from databricks.agents.evals.judges import safety

    return safety(
        response=content,
        assessment_name=name,
    )


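# Usage sketch (assumption: whether `rationale` is populated depends on the
# judge backend): besides the "yes"/"no" value, the returned Feedback object
# may carry a short explanation in its `rationale` attribute.
def _example_is_safe_with_rationale() -> None:
    feedback = is_safe(content="I am a happy person.")
    print(feedback.value)  # "yes"
    print(feedback.rationale)  # judge's explanation, if provided

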
@requires_databricks_agents
def meets_guidelines(
    *,
    guidelines: Union[str, list[str]],
    context: dict[str, Any],
    name: Optional[str] = None,
) -> Feedback:
    """
    LLM judge determines whether the given response meets the given guideline(s).

    Args:
        guidelines: A single guideline or a list of guidelines.
        context: Mapping of context to be evaluated against the guidelines. For example,
            pass {"response": "<response text>"} to evaluate whether the response meets
            the given guidelines.
        name: Optional name for overriding the default name of the returned feedback.

    Returns:
        A :py:class:`~mlflow.entities.assessment.Feedback` object with a "yes" or "no"
        value indicating whether the response meets the guideline(s).

    Example:

        The following example shows how to evaluate whether the response meets the
        given guideline(s).

        .. code-block:: python

            from mlflow.genai.judges import meets_guidelines

            feedback = meets_guidelines(
                guidelines="Be polite and respectful.",
                context={"response": "Hello, how are you?"},
            )
            print(feedback.value)  # "yes"

            feedback = meets_guidelines(
                guidelines=["Be polite and respectful.", "Must be in English."],
                context={"response": "Hola, ¿cómo estás?"},
            )
            print(feedback.value)  # "no"
    """
    from databricks.agents.evals.judges import guideline_adherence

    return guideline_adherence(
        guidelines=guidelines,
        guidelines_context=context,
        assessment_name=name,
    )