# Source code for mlflow.genai.scorers.ragas.scorers.agentic_metrics
from __future__ import annotations
from typing import ClassVar
from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.ragas import RagasScorer
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class TopicAdherence(RagasScorer):
    """
    Evaluates whether the AI system adheres to specified topics during interaction.

    This metric assesses if the agent stays on topic and avoids answering queries
    outside its designated domain of interest.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import TopicAdherence

            scorer = TopicAdherence()
            feedback = scorer(
                trace=trace,
                expectations={
                    "reference_topics": ["machine learning", "data science"],
                },
            )

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(
                session=session,
                expectations={
                    "reference_topics": ["machine learning", "data science"],
                },
            )
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "TopicAdherence"
@experimental(version="3.9.0")
class ToolCallAccuracy(RagasScorer):
    """
    Evaluates the accuracy of tool calls made by an agent.

    This deterministic metric compares the actual tool calls made by the agent
    against expected tool calls, considering both the tool names and their
    arguments. It can evaluate in strict order or flexible order mode.

    Args:
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import ToolCallAccuracy

            scorer = ToolCallAccuracy()
            feedback = scorer(
                trace=trace,
                expectations={
                    "expected_tool_calls": [
                        {"name": "weather_check", "arguments": {"location": "Paris"}},
                        {"name": "uv_index_lookup", "arguments": {"location": "Paris"}},
                    ]
                },
            )

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(
                session=session,
                expectations={
                    "expected_tool_calls": [
                        {"name": "weather_check", "arguments": {"location": "Paris"}},
                        {"name": "uv_index_lookup", "arguments": {"location": "Paris"}},
                    ]
                },
            )
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "ToolCallAccuracy"
@experimental(version="3.9.0")
class ToolCallF1(RagasScorer):
    """
    Calculates F1 score between expected and actual tool calls.

    Args:
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import ToolCallF1

            scorer = ToolCallF1()
            feedback = scorer(
                trace=trace,
                expectations={
                    "expected_tool_calls": [
                        {"name": "weather_check", "arguments": {"location": "Paris"}},
                    ]
                },
            )

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(
                session=session,
                expectations={
                    "expected_tool_calls": [
                        {"name": "weather_check", "arguments": {"location": "Paris"}},
                    ]
                },
            )
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "ToolCallF1"
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class AgentGoalAccuracyWithReference(RagasScorer):
    """
    Evaluates whether the agent achieved the user's goal compared to the expectations.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import AgentGoalAccuracyWithReference

            scorer = AgentGoalAccuracyWithReference(model="openai:/gpt-4")
            feedback = scorer(
                trace=trace,
                expectations={"expected_output": "Table booked at a Chinese restaurant for 8pm"},
            )

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(
                session=session,
                expectations={"expected_output": "Table booked at a Chinese restaurant for 8pm"},
            )
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "AgentGoalAccuracyWithReference"
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class AgentGoalAccuracyWithoutReference(RagasScorer):
    """
    Evaluates whether the agent achieved the user's goal without expectations.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import AgentGoalAccuracyWithoutReference

            scorer = AgentGoalAccuracyWithoutReference(model="openai:/gpt-4")
            feedback = scorer(trace=trace)

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(session=session)
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "AgentGoalAccuracyWithoutReference"