# Source code for mlflow.genai.scorers.ragas.scorers.agentic_metrics
from __future__ import annotations
from typing import ClassVar
from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.ragas import RagasScorer
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class TopicAdherence(RagasScorer):
    """
    Evaluates whether the AI system adheres to specified topics during interaction.

    This metric assesses if the agent stays on topic and avoids answering queries
    outside its designated domain of interest.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import TopicAdherence

            scorer = TopicAdherence()
            feedback = scorer(
                trace=trace,
                expectations={
                    "reference_topics": ["machine learning", "data science"],
                },
            )

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(
                session=session,
                expectations={
                    "reference_topics": ["machine learning", "data science"],
                },
            )
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "TopicAdherence"
@experimental(version="3.9.0")
class ToolCallAccuracy(RagasScorer):
    """
    Evaluates the accuracy of tool calls made by an agent.

    This deterministic metric compares the actual tool calls made by the agent
    against expected tool calls, considering both the tool names and their
    arguments. It can evaluate in strict order or flexible order mode.

    Args:
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import ToolCallAccuracy

            scorer = ToolCallAccuracy()
            feedback = scorer(
                trace=trace,
                expectations={
                    "expected_tool_calls": [
                        {"name": "weather_check", "arguments": {"location": "Paris"}},
                        {"name": "uv_index_lookup", "arguments": {"location": "Paris"}},
                    ]
                },
            )

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(
                session=session,
                expectations={
                    "expected_tool_calls": [
                        {"name": "weather_check", "arguments": {"location": "Paris"}},
                        {"name": "uv_index_lookup", "arguments": {"location": "Paris"}},
                    ]
                },
            )
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "ToolCallAccuracy"
@experimental(version="3.9.0")
class ToolCallF1(RagasScorer):
    """
    Calculates F1 score between expected and actual tool calls.

    Args:
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import ToolCallF1

            scorer = ToolCallF1()
            feedback = scorer(
                trace=trace,
                expectations={
                    "expected_tool_calls": [
                        {"name": "weather_check", "arguments": {"location": "Paris"}},
                    ]
                },
            )

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(
                session=session,
                expectations={
                    "expected_tool_calls": [
                        {"name": "weather_check", "arguments": {"location": "Paris"}},
                    ]
                },
            )
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "ToolCallF1"
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class AgentGoalAccuracyWithReference(RagasScorer):
    """
    Evaluates whether the agent achieved the user's goal compared to the expectations.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import AgentGoalAccuracyWithReference

            scorer = AgentGoalAccuracyWithReference(model="openai:/gpt-4")
            feedback = scorer(
                trace=trace,
                expectations={"expected_output": "Table booked at a Chinese restaurant for 8pm"},
            )

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(
                session=session,
                expectations={"expected_output": "Table booked at a Chinese restaurant for 8pm"},
            )
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "AgentGoalAccuracyWithReference"
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class AgentGoalAccuracyWithoutReference(RagasScorer):
    """
    Evaluates whether the agent achieved the user's goal without expectations.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import AgentGoalAccuracyWithoutReference

            scorer = AgentGoalAccuracyWithoutReference(model="openai:/gpt-4")
            feedback = scorer(trace=trace)

            # or for sessions:
            session = mlflow.search_traces(
                filter_string="request_metadata.mlflow.trace.session='{session_id}'",
                return_type="list",
            )
            feedback = scorer(session=session)
    """

    # Identifier of the underlying Ragas metric; presumably resolved by
    # RagasScorer when the scorer is invoked — confirm in the base class.
    metric_name: ClassVar[str] = "AgentGoalAccuracyWithoutReference"