"""DeepEval metric scorers organized by category."""

from __future__ import annotations

from typing import ClassVar

from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.deepeval import DeepEvalScorer
from mlflow.genai.scorers.deepeval.scorers.agentic_metrics import (
    ArgumentCorrectness,
    PlanAdherence,
    PlanQuality,
    StepEfficiency,
    TaskCompletion,
    ToolCorrectness,
)
from mlflow.genai.scorers.deepeval.scorers.conversational_metrics import (
    ConversationCompleteness,
    GoalAccuracy,
    KnowledgeRetention,
    RoleAdherence,
    ToolUse,
    TopicAdherence,
    TurnRelevancy,
)
from mlflow.genai.scorers.deepeval.scorers.rag_metrics import (
    AnswerRelevancy,
    ContextualPrecision,
    ContextualRecall,
    ContextualRelevancy,
    Faithfulness,
)
from mlflow.genai.scorers.deepeval.scorers.safety_metrics import (
    Bias,
    Misuse,
    NonAdvice,
    PIILeakage,
    RoleViolation,
    Toxicity,
)
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring


# General-purpose metrics
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class Hallucination(DeepEvalScorer):
    """
    Detects hallucinations where the LLM fabricates information not present in the context.

    Args:
        threshold: Maximum score threshold for passing (range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            scorer = Hallucination(threshold=0.3)
            feedback = scorer(trace=trace)
    """

    metric_name: ClassVar[str] = "Hallucination"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class Summarization(DeepEvalScorer):
    """
    Evaluates the quality and accuracy of text summarization.

    Args:
        threshold: Minimum score threshold for passing (range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            scorer = Summarization(threshold=0.7)
            feedback = scorer(inputs="Long text...", outputs="Summary...")
    """

    metric_name: ClassVar[str] = "Summarization"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class JsonCorrectness(DeepEvalScorer):
    """
    Validates JSON output against an expected schema.

    Note: Requires `expected_schema` parameter in the expectations dict.

    Args:
        threshold: Minimum score threshold for passing (range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            scorer = JsonCorrectness(threshold=0.8)
            feedback = scorer(
                outputs='{"name": "John"}',
                expectations={"expected_schema": {...}},
            )
    """

    metric_name: ClassVar[str] = "JsonCorrectness"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class PromptAlignment(DeepEvalScorer):
    """
    Measures how well the output aligns with instructions given in the prompt.

    Args:
        threshold: Minimum score threshold for passing (range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            scorer = PromptAlignment(threshold=0.7)
            feedback = scorer(inputs="Instructions...", outputs="Response...")
    """

    metric_name: ClassVar[str] = "PromptAlignment"


@experimental(version="3.8.0")
class ExactMatch(DeepEvalScorer):
    """
    Performs exact string matching between output and expected output.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)

    Examples:
        .. code-block:: python

            scorer = ExactMatch()
            feedback = scorer(
                outputs="Paris",
                expectations={"expected_output": "Paris"},
            )
    """

    metric_name: ClassVar[str] = "ExactMatch"

    def __init__(
        self,
        threshold: float = 0.5,
        **kwargs,
    ):
        self._validate_kwargs(**kwargs)
        super().__init__(
            metric_name=self.metric_name,
            model=None,
            threshold=threshold,
            **kwargs,
        )


@experimental(version="3.8.0")
class PatternMatch(DeepEvalScorer):
    """
    Performs regex pattern matching on the output.

    Args:
        pattern: Regex pattern to match against the output
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)

    Examples:
        .. code-block:: python

            scorer = PatternMatch(pattern=r"\\d{3}-\\d{3}-\\d{4}")
            feedback = scorer(outputs="Phone: 555-123-4567")
    """

    metric_name: ClassVar[str] = "PatternMatch"

    def __init__(
        self,
        pattern: str,
        threshold: float = 0.5,
        **kwargs,
    ):
        self._validate_kwargs(**kwargs)
        super().__init__(
            metric_name=self.metric_name,
            model=None,
            threshold=threshold,
            pattern=pattern,
            **kwargs,
        )


__all__ = [
    # RAG metrics
    "AnswerRelevancy",
    "Faithfulness",
    "ContextualRecall",
    "ContextualPrecision",
    "ContextualRelevancy",
    # Agentic metrics
    "TaskCompletion",
    "ToolCorrectness",
    "ArgumentCorrectness",
    "StepEfficiency",
    "PlanAdherence",
    "PlanQuality",
    # Conversational metrics
    "TurnRelevancy",
    "RoleAdherence",
    "KnowledgeRetention",
    "ConversationCompleteness",
    "GoalAccuracy",
    "ToolUse",
    "TopicAdherence",
    # Safety metrics
    "Bias",
    "Toxicity",
    "NonAdvice",
    "Misuse",
    "PIILeakage",
    "RoleViolation",
    # General metrics
    "Hallucination",
    "Summarization",
    "JsonCorrectness",
    "PromptAlignment",
    # Deterministic metrics
    "ExactMatch",
    "PatternMatch",
]
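

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the module): how scorers exported
# here can be combined in an mlflow.genai.evaluate() run. The sample record
# and the "openai:/gpt-4o-mini" judge model below are assumptions made for
# the example, not values defined in this file.
#
#   import mlflow
#   from mlflow.genai.scorers.deepeval.scorers import (
#       AnswerRelevancy,
#       ExactMatch,
#       PatternMatch,
#   )
#
#   results = mlflow.genai.evaluate(
#       data=[
#           {
#               "inputs": {"question": "What is the capital of France?"},
#               "outputs": "Paris",
#               "expectations": {"expected_output": "Paris"},
#           }
#       ],
#       scorers=[
#           # LLM-judged metric: needs a judge model and a passing threshold.
#           AnswerRelevancy(threshold=0.7, model="openai:/gpt-4o-mini"),
#           # Deterministic metrics: no judge model required.
#           ExactMatch(),
#           PatternMatch(pattern=r"^[A-Z]"),
#       ],
#   )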