# Source code for mlflow.genai.scorers.deepeval.scorers.rag_metrics

"""RAG (Retrieval-Augmented Generation) metrics for DeepEval integration."""

from __future__ import annotations

from typing import ClassVar

from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.deepeval import DeepEvalScorer
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class AnswerRelevancy(DeepEvalScorer):
    """
    Evaluates whether the output is relevant to the input.

    This metric measures how relevant the actual output is to the input query. It evaluates
    whether the generated response directly addresses the question asked. Higher scores indicate
    better relevance to the input.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import AnswerRelevancy

            scorer = AnswerRelevancy(threshold=0.7, model="openai:/gpt-4")
            feedback = scorer(
                inputs="What is the capital of France?",
                outputs="Paris is the capital of France.",
            )
            print(feedback.value)  # CategoricalRating.YES or CategoricalRating.NO
    """

    # Presumably consumed by DeepEvalScorer to select the DeepEval metric of
    # this name -- TODO confirm against the base class.
    metric_name: ClassVar[str] = "AnswerRelevancy"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class Faithfulness(DeepEvalScorer):
    """
    Evaluates whether the output is factually consistent with the retrieval context.

    This metric determines if claims in the output can be inferred from the provided context.
    It helps detect hallucinations by checking if the generated content is grounded in the
    retrieved documents.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import Faithfulness

            scorer = Faithfulness(threshold=0.8, model="databricks")
            feedback = scorer(trace=trace)  # trace contains outputs and retrieval_context
    """

    # Presumably consumed by DeepEvalScorer to select the DeepEval metric of
    # this name -- TODO confirm against the base class.
    metric_name: ClassVar[str] = "Faithfulness"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class ContextualRecall(DeepEvalScorer):
    """
    Evaluates whether the retrieval context contains all necessary information.

    This metric measures how much of the expected output can be attributed to the nodes in
    the retrieval context. It assesses the quality of the retriever by checking if all
    required information is present in the retrieved documents.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import ContextualRecall

            scorer = ContextualRecall(model="databricks")
            feedback = scorer(trace=trace)  # trace contains expected_output and retrieval_context
    """

    # Presumably consumed by DeepEvalScorer to select the DeepEval metric of
    # this name -- TODO confirm against the base class.
    metric_name: ClassVar[str] = "ContextualRecall"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class ContextualPrecision(DeepEvalScorer):
    """
    Evaluates whether relevant nodes in the retrieval context are ranked higher than
    irrelevant ones.

    This metric assesses the quality of your retriever by checking if the most relevant
    retrieved nodes are ranked higher than less relevant ones. It helps evaluate the
    ranking effectiveness of your retrieval system.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import ContextualPrecision

            scorer = ContextualPrecision(threshold=0.7)
            # trace contains input, expected_output, and retrieval_context
            feedback = scorer(trace=trace)
    """

    # Presumably consumed by DeepEvalScorer to select the DeepEval metric of
    # this name -- TODO confirm against the base class.
    metric_name: ClassVar[str] = "ContextualPrecision"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class ContextualRelevancy(DeepEvalScorer):
    """
    Evaluates the overall relevance of information in the retrieval context.

    This metric determines what fraction of the retrieval context is relevant to the input.
    It helps assess whether your retriever is returning focused, relevant information or
    including too much irrelevant content.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import ContextualRelevancy

            scorer = ContextualRelevancy(threshold=0.6)
            feedback = scorer(trace=trace)  # trace contains input and retrieval_context
    """

    # Presumably consumed by DeepEvalScorer to select the DeepEval metric of
    # this name -- TODO confirm against the base class.
    metric_name: ClassVar[str] = "ContextualRelevancy"