# Source code for mlflow.genai.scorers.deepeval.scorers.agentic_metrics

"""Agentic metrics for evaluating AI agent performance."""

from __future__ import annotations

from typing import ClassVar

from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.deepeval import DeepEvalScorer
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class TaskCompletion(DeepEvalScorer):
    """
    Evaluates whether an agent successfully completes its assigned task.

    This metric assesses the agent's ability to fully accomplish the task it was given,
    measuring how well the final output aligns with the expected task completion criteria.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import TaskCompletion

            scorer = TaskCompletion(threshold=0.7)
            feedback = scorer(trace=trace)  # trace contains inputs, outputs, and tool calls
    """

    # Class-level metric identifier. NOTE(review): presumably read by DeepEvalScorer to
    # select the matching DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "TaskCompletion"
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class ToolCorrectness(DeepEvalScorer):
    """
    Evaluates whether an agent uses the correct tools for the task.

    This metric assesses if the agent selected and used the appropriate tools from its
    available toolset to accomplish the given task. It compares actual tool usage against
    expected tool selections.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import ToolCorrectness

            scorer = ToolCorrectness(threshold=0.8)
            feedback = scorer(
                trace=trace
            )  # trace contains inputs, tool calls, and expected tool calls
    """

    # Class-level metric identifier. NOTE(review): presumably read by DeepEvalScorer to
    # select the matching DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "ToolCorrectness"
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class ArgumentCorrectness(DeepEvalScorer):
    """
    Evaluates whether an agent provides correct arguments when calling tools.

    This metric assesses the accuracy of the arguments/parameters the agent passes to
    tools, ensuring the agent uses tools with appropriate and valid inputs.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import ArgumentCorrectness

            scorer = ArgumentCorrectness(threshold=0.7)
            feedback = scorer(trace=trace)  # trace contains inputs and tool calls with arguments
    """

    # Class-level metric identifier. NOTE(review): presumably read by DeepEvalScorer to
    # select the matching DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "ArgumentCorrectness"
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class StepEfficiency(DeepEvalScorer):
    """
    Evaluates the efficiency of an agent's steps in completing a task.

    This metric measures whether the agent takes an optimal path to task completion,
    avoiding unnecessary steps or redundant tool calls. Higher scores indicate more
    efficient agent behavior.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import StepEfficiency

            scorer = StepEfficiency(threshold=0.6)
            feedback = scorer(trace=trace)  # trace contains inputs and sequence of tool calls
    """

    # Class-level metric identifier. NOTE(review): presumably read by DeepEvalScorer to
    # select the matching DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "StepEfficiency"
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class PlanAdherence(DeepEvalScorer):
    """
    Evaluates whether an agent adheres to its planned approach.

    This metric assesses how well the agent follows the plan it generated for completing
    a task. It measures the consistency between the agent's stated plan and its actual
    execution steps.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import PlanAdherence

            scorer = PlanAdherence(threshold=0.7)
            feedback = scorer(trace=trace)  # trace contains inputs, outputs, and tool calls
    """

    # Class-level metric identifier. NOTE(review): presumably read by DeepEvalScorer to
    # select the matching DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "PlanAdherence"
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class PlanQuality(DeepEvalScorer):
    """
    Evaluates the quality of an agent's generated plan.

    This metric assesses whether the agent's plan is comprehensive, logical, and likely
    to achieve the desired task outcome. It evaluates plan structure before execution.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import PlanQuality

            scorer = PlanQuality(threshold=0.7)
            feedback = scorer(
                inputs="Plan a trip to Paris",
                outputs="Plan: 1) Book flights 2) Reserve hotel 3) Create itinerary",
            )
    """

    # Class-level metric identifier. NOTE(review): presumably read by DeepEvalScorer to
    # select the matching DeepEval metric -- confirm against the base class.
    metric_name: ClassVar[str] = "PlanQuality"