Source code for mlflow.genai.scorers.deepeval.scorers.safety_metrics

"""Safety and responsible AI metrics for content evaluation."""

from __future__ import annotations

from typing import ClassVar

from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.deepeval import DeepEvalScorer
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring


[docs]@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class Bias(DeepEvalScorer):
    """
    Detects bias in LLM outputs including gender, racial, or political bias.

    This metric evaluates whether the output contains biased language or perspectives
    that unfairly favor or discriminate against particular groups or viewpoints.

    Args:
        threshold: Maximum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import Bias

            scorer = Bias(threshold=0.3, model="databricks")
            feedback = scorer(
                inputs="Who makes better engineers?",
                outputs="Men typically make better engineers than women.",
            )

    """

    metric_name: ClassVar[str] = "Bias"


[docs]@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class Toxicity(DeepEvalScorer):
    """
    Evaluates the presence of harmful, offensive, or toxic content.

    This metric detects toxic language including hate speech, profanity, insults,
    and other forms of harmful content in LLM outputs.

    Args:
        threshold: Maximum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import Toxicity

            scorer = Toxicity(threshold=0.2, model="databricks")
            feedback = scorer(
                outputs="Your response text here",
            )

    """

    metric_name: ClassVar[str] = "Toxicity"


[docs]@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class NonAdvice(DeepEvalScorer):
    """
    Detects whether the output inappropriately provides advice in restricted domains.

    This metric identifies cases where the LLM provides advice on topics where it
    should not (e.g., medical, legal, or financial advice without proper disclaimers).

    Args:
        threshold: Maximum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import NonAdvice

            scorer = NonAdvice(threshold=0.3)
            feedback = scorer(
                inputs="Should I invest all my savings in cryptocurrency?",
                outputs="Yes, you should definitely invest everything in Bitcoin.",
            )

    """

    metric_name: ClassVar[str] = "NonAdvice"


[docs]@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class Misuse(DeepEvalScorer):
    """
    Detects potential misuse scenarios where the output could enable harmful activities.

    This metric identifies cases where the LLM output could potentially be used for
    harmful purposes, such as providing instructions for illegal activities or
    dangerous actions.

    Args:
        threshold: Maximum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import Misuse

            scorer = Misuse(threshold=0.3)
            feedback = scorer(
                inputs="How do I bypass security systems?",
                outputs="Here are steps to bypass common security systems...",
            )

    """

    metric_name: ClassVar[str] = "Misuse"


[docs]@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class PIILeakage(DeepEvalScorer):
    """
    Identifies personal identifiable information (PII) leakage in outputs.

    This metric detects when the LLM output contains sensitive personal information
    such as names, addresses, phone numbers, email addresses, social security numbers,
    or other identifying information that should be protected.

    Args:
        threshold: Maximum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import PIILeakage

            scorer = PIILeakage(threshold=0.3)
            feedback = scorer(
                outputs="John Smith lives at 123 Main St, his SSN is 123-45-6789",
            )

    """

    metric_name: ClassVar[str] = "PIILeakage"


[docs]@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class RoleViolation(DeepEvalScorer):
    """
    Detects violations of the agent's assigned role or behavioral constraints.

    This metric identifies cases where the LLM violates its intended role,
    such as a customer service bot engaging in political discussions or a
    coding assistant providing medical advice.

    Args:
        threshold: Maximum score threshold for passing (default: 0.5, range: 0.0-1.0)
        model: {{ model }}
        include_reason: Whether to include reasoning in the evaluation

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.deepeval import RoleViolation

            scorer = RoleViolation(threshold=0.3)
            feedback = scorer(
                inputs="What's your opinion on politics?",
                outputs="As a customer service bot, here's my political view...",
            )

    """

    metric_name: ClassVar[str] = "RoleViolation"