"""DeepEval metric scorers organized by category."""

from __future__ import annotations

from typing import ClassVar

from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.deepeval import DeepEvalScorer
from mlflow.genai.scorers.deepeval.scorers.agentic_metrics import (
    ArgumentCorrectness,
    PlanAdherence,
    PlanQuality,
    StepEfficiency,
    TaskCompletion,
    ToolCorrectness,
)
from mlflow.genai.scorers.deepeval.scorers.conversational_metrics import (
    ConversationCompleteness,
    GoalAccuracy,
    KnowledgeRetention,
    RoleAdherence,
    ToolUse,
    TopicAdherence,
    TurnRelevancy,
)
from mlflow.genai.scorers.deepeval.scorers.rag_metrics import (
    AnswerRelevancy,
    ContextualPrecision,
    ContextualRecall,
    ContextualRelevancy,
    Faithfulness,
)
from mlflow.genai.scorers.deepeval.scorers.safety_metrics import (
    Bias,
    Misuse,
    NonAdvice,
    PIILeakage,
    RoleViolation,
    Toxicity,
)
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring


# General-purpose metrics
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class Hallucination(DeepEvalScorer):
    """
    Detects hallucinations where the LLM fabricates information not present in the context.

    Args:
        threshold: Maximum score threshold for passing (range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            scorer = Hallucination(threshold=0.3)
            feedback = scorer(trace=trace)
    """

    metric_name: ClassVar[str] = "Hallucination"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class Summarization(DeepEvalScorer):
    """
    Evaluates the quality and accuracy of text summarization.

    Args:
        threshold: Minimum score threshold for passing (range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            scorer = Summarization(threshold=0.7)
            feedback = scorer(inputs="Long text...", outputs="Summary...")
    """

    metric_name: ClassVar[str] = "Summarization"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class JsonCorrectness(DeepEvalScorer):
    """
    Validates JSON output against an expected schema.

    Note: Requires `expected_schema` parameter in the expectations dict.

    Args:
        threshold: Minimum score threshold for passing (range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            scorer = JsonCorrectness(threshold=0.8)
            feedback = scorer(
                outputs='{"name": "John"}',
                expectations={"expected_schema": {...}},
            )
    """

    metric_name: ClassVar[str] = "JsonCorrectness"


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class PromptAlignment(DeepEvalScorer):
    """
    Measures how well the output aligns with instructions given in the prompt.

    Args:
        threshold: Minimum score threshold for passing (range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            scorer = PromptAlignment(threshold=0.7)
            feedback = scorer(inputs="Instructions...", outputs="Response...")
    """

    metric_name: ClassVar[str] = "PromptAlignment"


@experimental(version="3.8.0")
class ExactMatch(DeepEvalScorer):
    """
    Performs exact string matching between output and expected output.

    Args:
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)

    Examples:
        .. code-block:: python

            scorer = ExactMatch()
            feedback = scorer(
                outputs="Paris",
                expectations={"expected_output": "Paris"},
            )
    """

    metric_name: ClassVar[str] = "ExactMatch"

    def __init__(
        self,
        threshold: float = 0.5,
        **kwargs,
    ):
        self._validate_kwargs(**kwargs)
        super().__init__(
            metric_name=self.metric_name,
            model=None,
            threshold=threshold,
            **kwargs,
        )


@experimental(version="3.8.0")
class PatternMatch(DeepEvalScorer):
    """
    Performs regex pattern matching on the output.

    Args:
        pattern: Regex pattern to match against the output
        threshold: Minimum score threshold for passing (default: 0.5, range: 0.0-1.0)

    Examples:
        .. code-block:: python

            scorer = PatternMatch(pattern=r"\\d{3}-\\d{3}-\\d{4}")
            feedback = scorer(outputs="Phone: 555-123-4567")
    """

    metric_name: ClassVar[str] = "PatternMatch"

    def __init__(
        self,
        pattern: str,
        threshold: float = 0.5,
        **kwargs,
    ):
        self._validate_kwargs(**kwargs)
        super().__init__(
            metric_name=self.metric_name,
            model=None,
            threshold=threshold,
            pattern=pattern,
            **kwargs,
        )


__all__ = [
    # RAG metrics
    "AnswerRelevancy",
    "Faithfulness",
    "ContextualRecall",
    "ContextualPrecision",
    "ContextualRelevancy",
    # Agentic metrics
    "TaskCompletion",
    "ToolCorrectness",
    "ArgumentCorrectness",
    "StepEfficiency",
    "PlanAdherence",
    "PlanQuality",
    # Conversational metrics
    "TurnRelevancy",
    "RoleAdherence",
    "KnowledgeRetention",
    "ConversationCompleteness",
    "GoalAccuracy",
    "ToolUse",
    "TopicAdherence",
    # Safety metrics
    "Bias",
    "Toxicity",
    "NonAdvice",
    "Misuse",
    "PIILeakage",
    "RoleViolation",
    # General metrics
    "Hallucination",
    "Summarization",
    "JsonCorrectness",
    "PromptAlignment",
    # Deterministic metrics
    "ExactMatch",
    "PatternMatch",
]
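

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the module): how scorers exported
# here can be combined in an mlflow.genai.evaluate() run. The sample record
# and the "openai:/gpt-4o-mini" judge model below are assumptions made for
# the example, not values defined in this file.
#
#   import mlflow
#   from mlflow.genai.scorers.deepeval.scorers import (
#       AnswerRelevancy,
#       ExactMatch,
#       PatternMatch,
#   )
#
#   results = mlflow.genai.evaluate(
#       data=[
#           {
#               "inputs": {"question": "What is the capital of France?"},
#               "outputs": "Paris",
#               "expectations": {"expected_output": "Paris"},
#           }
#       ],
#       scorers=[
#           # LLM-judged metric: needs a judge model and a passing threshold.
#           AnswerRelevancy(threshold=0.7, model="openai:/gpt-4o-mini"),
#           # Deterministic metrics: no judge model required.
#           ExactMatch(),
#           PatternMatch(pattern=r"^[A-Z]"),
#       ],
#   )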