Source code for mlflow.genai.scorers.ragas

"""
RAGAS integration for MLflow.

This module provides integration with RAGAS metrics, allowing them to be used
with MLflow's judge interface.

Example usage:

.. code-block:: python

    from mlflow.genai.scorers.ragas import get_scorer

    judge = get_scorer("Faithfulness", model="openai:/gpt-4")
    feedback = judge(
        inputs="What is MLflow?", outputs="MLflow is a platform...", trace=trace
    )
"""

from __future__ import annotations

import logging
from typing import Any

from pydantic import PrivateAttr

from mlflow.entities.assessment import Feedback
from mlflow.entities.assessment_source import AssessmentSource, AssessmentSourceType
from mlflow.entities.trace import Trace
from mlflow.exceptions import MlflowException
from mlflow.genai.judges.builtin import _MODEL_API_DOC
from mlflow.genai.judges.utils import CategoricalRating, get_default_model
from mlflow.genai.scorers import FRAMEWORK_METADATA_KEY
from mlflow.genai.scorers.base import Scorer, ScorerKind
from mlflow.genai.scorers.ragas.models import create_ragas_model
from mlflow.genai.scorers.ragas.registry import get_metric_class, is_deterministic_metric
from mlflow.genai.scorers.ragas.utils import (
    create_mlflow_error_message_from_ragas_param,
    map_scorer_inputs_to_ragas_sample,
)
from mlflow.utils.annotations import experimental
from mlflow.utils.docstring_utils import format_docstring

_logger = logging.getLogger(__name__)


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
class RagasScorer(Scorer):
    """
    A scorer that wraps a RAGAS metric and exposes it through MLflow's judge interface.

    Args:
        metric_name: Name of the RAGAS metric (e.g., "Faithfulness")
        model: {{ model }}
        metric_kwargs: Additional metric-specific parameters
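
    Example (illustrative, mirroring the module-level usage above):

        .. code-block:: python

            # Illustrative; `trace` is an MLflow Trace captured from your application.
            scorer = RagasScorer(metric_name="Faithfulness", model="openai:/gpt-4")
            feedback = scorer(
                inputs="What is MLflow?", outputs="MLflow is a platform...", trace=trace
            )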
    """

    _metric: Any = PrivateAttr()
    _is_deterministic: bool = PrivateAttr(default=False)
    _model: str = PrivateAttr()

    def __init__(
        self,
        metric_name: str | None = None,
        model: str | None = None,
        **metric_kwargs,
    ):
        if metric_name is None:
            metric_name = self.metric_name

        super().__init__(name=metric_name)
        model = model or get_default_model()
        self._model = model
        metric_class = get_metric_class(metric_name)

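        # Deterministic metrics (e.g., ExactMatch, BleuScore) are scored without an
        # LLM; all other metrics are constructed with the judge model wrapped for RAGAS.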
        if is_deterministic_metric(metric_name):
            self._metric = metric_class(**metric_kwargs)
            self._is_deterministic = True
        else:
            ragas_llm = create_ragas_model(model)
            self._metric = metric_class(llm=ragas_llm, **metric_kwargs)

    @property
    def kind(self) -> ScorerKind:
        return ScorerKind.THIRD_PARTY

    def __call__(
        self,
        *,
        inputs: dict[str, Any] | None = None,
        outputs: Any = None,
        expectations: dict[str, Any] | None = None,
        trace: Trace | None = None,
    ) -> Feedback:
        """
        Evaluate using the wrapped RAGAS metric.

        Args:
            inputs: The inputs to the model or agent under evaluation
            outputs: The outputs to evaluate
            expectations: Expected values and context for evaluation
            trace: MLflow trace for evaluation

        Returns:
            Feedback object with score, rationale, and metadata
        """
        # Attribute deterministic scores to this scorer's code; LLM-judged scores are
        # attributed to the underlying judge model.
        if self._is_deterministic:
            assessment_source = AssessmentSource(
                source_type=AssessmentSourceType.CODE,
                source_id=self.name,
            )
        else:
            assessment_source = AssessmentSource(
                source_type=AssessmentSourceType.LLM_JUDGE,
                source_id=self._model,
            )

        try:
            sample = map_scorer_inputs_to_ragas_sample(
                inputs=inputs,
                outputs=outputs,
                expectations=expectations,
                trace=trace,
            )

            if hasattr(self._metric, "single_turn_score"):
                result = self._metric.single_turn_score(sample)
            elif hasattr(self._metric, "score"):
                result = self._metric.score(sample)
            else:
                raise MlflowException(f"RAGAS metric {self.name} is currently not supported")

            score = float(result)

            reason = getattr(result, "reason", None)

            # Some RAGAS metrics define a threshold; when set, map the numeric score
            # to a binary yes/no rating and keep the raw score in metadata.
            threshold = getattr(self._metric, "threshold", None)
            metadata = {FRAMEWORK_METADATA_KEY: "ragas"}

            if threshold is not None:
                metadata["threshold"] = threshold
                metadata["score"] = score
                value = CategoricalRating.YES if score >= threshold else CategoricalRating.NO
            else:
                value = score

            return Feedback(
                name=self.name,
                value=value,
                rationale=reason,
                source=assessment_source,
                trace_id=None,
                metadata=metadata,
            )
        except (KeyError, IndexError) as e:
            # RAGAS raises KeyError/IndexError when required parameters are missing
            error_msg = str(e).strip("'\"")
            mlflow_error_message = create_mlflow_error_message_from_ragas_param(
                error_msg, self.name
            )
            _logger.error(
                f"Missing required parameter for RAGAS metric {self.name}: {mlflow_error_message}"
            )
            mlflow_error = MlflowException.invalid_parameter_value(mlflow_error_message)

            return Feedback(
                name=self.name,
                error=mlflow_error,
                source=assessment_source,
            )
        except Exception as e:
            _logger.error(f"Error evaluating RAGAS metric {self.name}: {e}")
            return Feedback(
                name=self.name,
                error=e,
                source=assessment_source,
            )

    def _validate_kwargs(self, **metric_kwargs):
        if is_deterministic_metric(self.metric_name):
            if "model" in metric_kwargs:
                raise MlflowException.invalid_parameter_value(
                    f"{self.metric_name} got an unexpected keyword argument 'model'"
                )


@experimental(version="3.8.0")
@format_docstring(_MODEL_API_DOC)
def get_scorer(
    metric_name: str,
    model: str | None = None,
    **metric_kwargs,
) -> RagasScorer:
    """
    Get a RAGAS metric as an MLflow judge.

    Args:
        metric_name: Name of the RAGAS metric (e.g., "Faithfulness")
        model: {{ model }}
        metric_kwargs: Additional metric-specific parameters (e.g., threshold)

    Returns:
        RagasScorer instance that can be called with MLflow's judge interface

    Examples:

        .. code-block:: python

            # LLM-based metric
            judge = get_scorer("Faithfulness", model="openai:/gpt-4")
            feedback = judge(inputs="What is MLflow?", outputs="MLflow is a platform...")

            # Using trace with retrieval context
            judge = get_scorer("ContextPrecision", model="openai:/gpt-4")
            feedback = judge(trace=trace)

            # Deterministic metric (no LLM needed)
            judge = get_scorer("ExactMatch")
            feedback = judge(outputs="Paris", expectations={"expected_output": "Paris"})
    """
    model = model or get_default_model()
    return RagasScorer(
        metric_name=metric_name,
        model=model,
        **metric_kwargs,
    )


from mlflow.genai.scorers.ragas.scorers import (
    AspectCritic,
    BleuScore,
    ChrfScore,
    ContextEntityRecall,
    ContextPrecision,
    ContextRecall,
    ExactMatch,
    FactualCorrectness,
    Faithfulness,
    InstanceRubrics,
    NoiseSensitivity,
    NonLLMContextPrecisionWithReference,
    NonLLMContextRecall,
    NonLLMStringSimilarity,
    RougeScore,
    RubricsScore,
    StringPresence,
    SummarizationScore,
)

__all__ = [
    # Core classes
    "RagasScorer",
    "get_scorer",
    # RAG metrics
    "ContextPrecision",
    "NonLLMContextPrecisionWithReference",
    "ContextRecall",
    "NonLLMContextRecall",
    "ContextEntityRecall",
    "NoiseSensitivity",
    "Faithfulness",
    # Comparison metrics
    "FactualCorrectness",
    "NonLLMStringSimilarity",
    "BleuScore",
    "ChrfScore",
    "RougeScore",
    "StringPresence",
    "ExactMatch",
    # General purpose metrics
    "AspectCritic",
    "RubricsScore",
    "InstanceRubrics",
    # Other tasks
    "SummarizationScore",
]
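
# Note: the metric classes listed in ``__all__`` are re-exported from this package,
# so they can be imported directly, e.g.:
#
#     from mlflow.genai.scorers.ragas import Faithfulness, ExactMatch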