Source code for mlflow.genai.scorers.ragas.scorers

from __future__ import annotations

from typing import ClassVar

from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.scorers.ragas import RagasScorer
from mlflow.genai.scorers.ragas.scorers.agentic_metrics import (
    AgentGoalAccuracyWithoutReference,
    AgentGoalAccuracyWithReference,
    ToolCallAccuracy,
    ToolCallF1,
    TopicAdherence,
)
from mlflow.genai.scorers.ragas.scorers.comparison_metrics import (
    BleuScore,
    ChrfScore,
    ExactMatch,
    FactualCorrectness,
    NonLLMStringSimilarity,
    RougeScore,
    StringPresence,
)
from mlflow.genai.scorers.ragas.scorers.rag_metrics import (
    AnswerRelevancy,
    ContextEntityRecall,
    ContextPrecision,
    ContextRecall,
    Faithfulness,
    NoiseSensitivity,
    NonLLMContextPrecisionWithReference,
    NonLLMContextRecall,
    SemanticSimilarity,
)
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class AspectCritic(RagasScorer):
    """
    Judges an output against a specific aspect or criterion.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters (e.g., name, definition)

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import AspectCritic

            scorer = AspectCritic(
                model="openai:/gpt-4",
                name="helpfulness",
                definition="Does the response help answer the question?",
            )
            feedback = scorer(inputs="What is MLflow?", outputs="MLflow is a platform...")
    """

    # Metric identifier shared by every instance of this scorer class.
    metric_name: ClassVar[str] = "AspectCritic"
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class DiscreteMetric(RagasScorer):
    """
    Evaluates the output based on a custom prompt with discrete scoring.

    This metric allows you to define a custom evaluation prompt that returns
    discrete values (e.g., "pass"/"fail", scores 0-10, etc.).

    Args:
        name: Name for this metric instance.
        prompt: Custom prompt template for evaluation (required). Should contain
            placeholders for evaluation inputs that will be formatted at runtime.
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters
            (e.g., allowed_values).

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import DiscreteMetric

            scorer = DiscreteMetric(
                name="clarity",
                prompt='''Rate the clarity of the response on a scale of 0-10.
            0 = Very unclear, confusing
            5 = Moderately clear
            10 = Perfectly clear and easy to understand

            Response: {response}

            Respond with only the number (0-10).''',
                # 0 through 10 inclusive, matching the scale in the prompt.
                allowed_values=list(range(11)),
            )

            feedback = scorer(trace=trace)
    """

    # Metric identifier shared by every instance of this scorer class.
    metric_name: ClassVar[str] = "DiscreteMetric"

    # override to have name and prompt as required parameters
    def __init__(
        self,
        name: str,
        prompt: str,
        **metric_kwargs,
    ) -> None:
        # ``name`` and ``prompt`` are forwarded explicitly; every other option
        # (including ``model``) reaches the base class through ``metric_kwargs``.
        super().__init__(
            metric_name=self.metric_name,
            name=name,
            prompt=prompt,
            **metric_kwargs,
        )
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class RubricsScore(RagasScorer):
    """
    Scores the output against a predefined rubric.

    A rubric is a set of criteria, each with a description and a score, that
    lets the output be evaluated in a structured way.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters (e.g., rubrics)

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import RubricsScore

            rubrics = {
                "1": "The response is entirely incorrect.",
                "2": "The response contains partial accuracy.",
                "3": "The response is mostly accurate but lacks clarity.",
                "4": "The response is accurate and clear with minor omissions.",
                "5": "The response is completely accurate and clear.",
            }
            scorer = RubricsScore(rubrics=rubrics)
            feedback = scorer(inputs="What is AI?", outputs="AI is artificial intelligence")
    """

    # Metric identifier shared by every instance of this scorer class.
    metric_name: ClassVar[str] = "RubricsScore"
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class InstanceSpecificRubrics(RagasScorer):
    """
    Scores the output against rubrics supplied per evaluation instance.

    Where RubricsScore applies one rubric to every evaluation,
    InstanceSpecificRubrics lets each evaluation instance carry its own rubric.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import InstanceSpecificRubrics

            scorer = InstanceSpecificRubrics(model="openai:/gpt-4")

            # Evaluate relevance with custom rubric
            feedback1 = scorer(
                inputs="How do I handle exceptions in Python?",
                outputs="To handle exceptions in Python, use try and except blocks.",
                expectations={
                    "expected_output": "Use try, except, and optionally else blocks.",
                    "rubrics": {
                        "0": "The response is off-topic or irrelevant.",
                        "1": "The response is fully relevant and focused.",
                    },
                },
            )

            # Evaluate code efficiency with different rubric
            feedback2 = scorer(
                inputs="Create a list of squares for numbers 1 through 5",
                outputs="squares = []\\nfor i in range(1, 6):\\n    squares.append(i**2)",
                expectations={
                    "expected_output": "squares = [i**2 for i in range(1, 6)]",
                    "rubrics": {
                        "0": "Inefficient code with performance issues.",
                        "1": "Efficient and optimized code.",
                    },
                },
            )
    """

    # Metric identifier shared by every instance of this scorer class.
    metric_name: ClassVar[str] = "InstanceSpecificRubrics"
@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class SummarizationScore(RagasScorer):
    """
    Scores the quality and accuracy of a text summary.

    The metric checks that the summary captures the key points of the source
    text while staying concise and coherent.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import SummarizationScore

            scorer = SummarizationScore(model="openai:/gpt-4")
            feedback = scorer(trace=trace)
    """

    # Metric identifier shared by every instance of this scorer class.
    metric_name: ClassVar[str] = "SummarizationScore"
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class AnswerAccuracy(RagasScorer):
    """
    Scores how accurate the answer is relative to the expectations.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import AnswerAccuracy

            scorer = AnswerAccuracy(model="openai:/gpt-4")
            feedback = scorer(trace=trace)
    """

    # Metric identifier shared by every instance of this scorer class.
    metric_name: ClassVar[str] = "AnswerAccuracy"
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class ContextRelevance(RagasScorer):
    """
    Scores how relevant the retrieved contexts are to the user's question.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import ContextRelevance

            scorer = ContextRelevance(model="openai:/gpt-4")
            feedback = scorer(trace=trace)
    """

    # Metric identifier shared by every instance of this scorer class.
    metric_name: ClassVar[str] = "ContextRelevance"
@experimental(version="3.9.0")
@format_docstring(_MODEL_API_DOC)
class ResponseGroundedness(RagasScorer):
    """
    Scores whether the response is grounded in the retrieved contexts.

    Args:
        model: {{ model }}
        **metric_kwargs: Additional metric-specific parameters

    Examples:
        .. code-block:: python

            from mlflow.genai.scorers.ragas import ResponseGroundedness

            scorer = ResponseGroundedness(model="openai:/gpt-4")
            feedback = scorer(trace=trace)
    """

    # Metric identifier shared by every instance of this scorer class.
    metric_name: ClassVar[str] = "ResponseGroundedness"
# Public API of this module, grouped by metric family.
__all__ = [
    # RAG metrics
    "ContextPrecision",
    "NonLLMContextPrecisionWithReference",
    "ContextRecall",
    "NonLLMContextRecall",
    "ContextEntityRecall",
    "NoiseSensitivity",
    "Faithfulness",
    "AnswerRelevancy",
    "SemanticSimilarity",
    # NVIDIA metrics
    "AnswerAccuracy",
    "ContextRelevance",
    "ResponseGroundedness",
    # Comparison metrics
    "FactualCorrectness",
    "NonLLMStringSimilarity",
    "BleuScore",
    "ChrfScore",
    "RougeScore",
    "StringPresence",
    "ExactMatch",
    # General purpose metrics
    "AspectCritic",
    "DiscreteMetric",
    "RubricsScore",
    "InstanceSpecificRubrics",
    # Agentic metrics
    "TopicAdherence",
    "ToolCallAccuracy",
    "ToolCallF1",
    "AgentGoalAccuracyWithReference",
    "AgentGoalAccuracyWithoutReference",
    # Other tasks
    "SummarizationScore",
]