from dataclasses import dataclass
from typing import Optional, Union
from mlflow.metrics.genai.prompt_template import PromptTemplate
@dataclass
class EvaluationExample:
    """
    Stores the sample example during few shot learning during LLM evaluation
    Args:
        input: The input provided to the model
        output: The output generated by the model
        score: The score given by the evaluator
        justification: The justification given by the evaluator
        grading_context: The grading_context provided to the evaluator for evaluation. Either
            a dictionary of grading context column names and grading context strings
            or a single grading context string.

    .. code-block:: python
        :caption: Example for creating an EvaluationExample

        from mlflow.metrics.genai import EvaluationExample

        example = EvaluationExample(
            input="What is MLflow?",
            output="MLflow is an open-source platform for managing machine "
            "learning workflows, including experiment tracking, model packaging, "
            "versioning, and deployment, simplifying the ML lifecycle.",
            score=4,
            justification="The definition effectively explains what MLflow is "
            "its purpose, and its developer. It could be more concise for a 5-score.",
            grading_context={
                "ground_truth": "MLflow is an open-source platform for managing "
                "the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, "
                "a company that specializes in big data and machine learning solutions. MLflow is "
                "designed to address the challenges that data scientists and machine learning "
                "engineers face when developing, training, and deploying machine learning models."
            },
        )
        print(str(example))

    .. code-block:: text
        :caption: Output

        Input: What is MLflow?
        Provided output: "MLflow is an open-source platform for managing machine "
            "learning workflows, including experiment tracking, model packaging, "
            "versioning, and deployment, simplifying the ML lifecycle."
        Provided ground_truth: "MLflow is an open-source platform for managing "
            "the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, "
            "a company that specializes in big data and machine learning solutions. MLflow is "
            "designed to address the challenges that data scientists and machine learning "
            "engineers face when developing, training, and deploying machine learning models."
        Score: 4
        Justification: "The definition effectively explains what MLflow is "
            "its purpose, and its developer. It could be more concise for a 5-score."
    """
    output: str
    score: float
    justification: str
    input: Optional[str] = None
    grading_context: Optional[Union[dict[str, str], str]] = None

    def _format_grading_context(self):
        # A dict of grading context is rendered as "key:/value:" blocks, one per
        # entry; a plain string (or None) is returned unchanged.
        if isinstance(self.grading_context, dict):
            return "\n".join(
                [f"key: {key}\nvalue:\n{value}" for key, value in self.grading_context.items()]
            )
        else:
            return self.grading_context

    def __str__(self) -> str:
        return PromptTemplate(
            [
                """
Example Input:
{input}
""",
                """
Example Output:
{output}
""",
                """
Additional information used by the model:
{grading_context}
""",
                """
Example score: {score}
Example justification: {justification}
        """,
            ]
        ).format(
            input=self.input,
            output=self.output,
            grading_context=self._format_grading_context(),
            score=self.score,
            justification=self.justification,
        )
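

# Minimal usage sketch (illustrative, not part of the class above). It shows
# how the two supported grading_context shapes render: a dict is formatted as
# "key:/value:" blocks by _format_grading_context, while a plain string is
# passed through verbatim. The example values are placeholders; run the module
# directly to see both renderings.
if __name__ == "__main__":
    dict_example = EvaluationExample(
        input="What is MLflow?",
        output="MLflow is an open-source MLOps platform.",
        score=3,
        justification="Accurate but terse; it omits what the platform covers.",
        grading_context={"ground_truth": "MLflow manages the end-to-end ML lifecycle."},
    )
    string_example = EvaluationExample(
        input="What is MLflow?",
        output="MLflow is an open-source MLOps platform.",
        score=3,
        justification="Accurate but terse; it omits what the platform covers.",
        grading_context="MLflow manages the end-to-end ML lifecycle.",
    )
    print(dict_example)  # grading context rendered as key/value blocks
    print(string_example)  # grading context rendered as the raw string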