# Source code for mlflow.genai.scorers.deepeval.scorers.conversational_metrics

"""Conversational metrics for evaluating multi-turn dialogue performance."""

from __future__ import annotations

from typing import ClassVar

from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.deepeval import DeepEvalScorer
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class TurnRelevancy(DeepEvalScorer):
    """
    Evaluates the relevance of each conversation turn.

    This multi-turn metric assesses whether each response in a conversation is relevant
    to the corresponding user query. It evaluates coherence across the entire dialogue.

    Note: This is a multi-turn metric that requires a list of traces representing
    conversation turns.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import TurnRelevancy

            scorer = TurnRelevancy(threshold=0.7)
            feedback = scorer(traces=[trace1, trace2, trace3])  # List of conversation turns

    """

    # Metric identifier for this scorer.
    # NOTE(review): presumably consumed by DeepEvalScorer to select the matching
    # DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "TurnRelevancy"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class RoleAdherence(DeepEvalScorer):
    """
    Evaluates whether the agent stays in character throughout the conversation.

    This multi-turn metric assesses if the agent consistently maintains its assigned
    role, personality, and behavioral constraints across all conversation turns.

    Note: This is a multi-turn metric that requires a list of traces representing
    conversation turns.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import RoleAdherence

            scorer = RoleAdherence(threshold=0.8)
            feedback = scorer(traces=[trace1, trace2, trace3])  # List of conversation turns

    """

    # Metric identifier for this scorer.
    # NOTE(review): presumably consumed by DeepEvalScorer to select the matching
    # DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "RoleAdherence"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class KnowledgeRetention(DeepEvalScorer):
    """
    Evaluates the agent's ability to retain and use information from earlier in the conversation.

    This multi-turn metric assesses whether the agent remembers and appropriately
    references information from previous turns in the conversation, demonstrating
    context awareness.

    Note: This is a multi-turn metric that requires a list of traces representing
    conversation turns.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import KnowledgeRetention

            scorer = KnowledgeRetention(threshold=0.7)
            feedback = scorer(traces=[trace1, trace2, trace3])  # List of conversation turns

    """

    # Metric identifier for this scorer.
    # NOTE(review): presumably consumed by DeepEvalScorer to select the matching
    # DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "KnowledgeRetention"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class ConversationCompleteness(DeepEvalScorer):
    """
    Evaluates whether the conversation satisfies the user's needs and goals.

    This multi-turn metric assesses if the conversation reaches a satisfactory conclusion,
    addressing all aspects of the user's original request or question.

    Note: This is a multi-turn metric that requires a list of traces representing
    conversation turns.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import (
                ConversationCompleteness,
            )

            scorer = ConversationCompleteness(threshold=0.7)
            feedback = scorer(traces=[trace1, trace2, trace3])  # List of conversation turns

    """

    # Metric identifier for this scorer.
    # NOTE(review): presumably consumed by DeepEvalScorer to select the matching
    # DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "ConversationCompleteness"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class GoalAccuracy(DeepEvalScorer):
    """
    Evaluates the accuracy of achieving conversation goals in a multi-turn context.

    This multi-turn metric assesses whether the agent successfully achieves the
    specified goals or objectives throughout the conversation, measuring goal-oriented
    effectiveness.

    Note: This is a multi-turn metric that requires a list of traces representing
    conversation turns.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import GoalAccuracy

            scorer = GoalAccuracy(threshold=0.7)
            feedback = scorer(traces=[trace1, trace2, trace3])  # List of conversation turns

    """

    # Metric identifier for this scorer.
    # NOTE(review): presumably consumed by DeepEvalScorer to select the matching
    # DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "GoalAccuracy"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class ToolUse(DeepEvalScorer):
    """
    Evaluates the effectiveness of tool usage throughout a conversation.

    This multi-turn metric assesses whether the agent appropriately uses available
    tools across multiple conversation turns, measuring tool selection and usage
    effectiveness in a dialogue context.

    Note: This is a multi-turn metric that requires a list of traces representing
    conversation turns.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import ToolUse

            scorer = ToolUse(threshold=0.7)
            feedback = scorer(traces=[trace1, trace2, trace3])  # List of conversation turns

    """

    # Metric identifier for this scorer.
    # NOTE(review): presumably consumed by DeepEvalScorer to select the matching
    # DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "ToolUse"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class TopicAdherence(DeepEvalScorer):
    """
    Evaluates adherence to specified topics throughout a conversation.

    This multi-turn metric assesses whether the agent stays on topic across the
    entire conversation, avoiding unnecessary digressions or topic drift.

    Note: This is a multi-turn metric that requires a list of traces representing
    conversation turns.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import TopicAdherence

            scorer = TopicAdherence(threshold=0.7)
            feedback = scorer(traces=[trace1, trace2, trace3])  # List of conversation turns

    """

    # Metric identifier for this scorer.
    # NOTE(review): presumably consumed by DeepEvalScorer to select the matching
    # DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "TopicAdherence"