Source code for mlflow.genai.optimize.optimizers.metaprompt_optimizer
import json
import logging
import re
from contextlib import nullcontext
from typing import Any
import mlflow
from mlflow.entities.span import SpanType
from mlflow.exceptions import MlflowException
from mlflow.genai.optimize.optimizers.base import BasePromptOptimizer, _EvalFunc
from mlflow.genai.optimize.types import EvaluationResultRecord, PromptOptimizerOutput
from mlflow.utils.annotations import experimental
_logger = logging.getLogger(__name__)
# Compiled regex pattern for extracting template variables
_TEMPLATE_VAR_PATTERN = re.compile(r"\{\{(\w+)\}\}")
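# e.g. _TEMPLATE_VAR_PATTERN.findall("Answer {{question}} using {{context}}")
# returns ["question", "context"]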
# Unified meta-prompt template that supports both zero-shot and few-shot modes
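# The template is populated via str.format(); literal braces are escaped, so "{{"
# renders as "{" and "{{{{variable_name}}}}" renders as "{{variable_name}}" in the
# final meta-prompt sent to the reflection model.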
META_PROMPT_TEMPLATE = """\
You are an expert prompt engineer. Your task is to improve
the following prompts to achieve better performance.
CURRENT PROMPTS:
{current_prompts_formatted}
{evaluation_examples}
PROMPT ENGINEERING BEST PRACTICES:
Apply these proven techniques to create effective prompts:
1. **Clarity & Specificity**: Be explicit about the task, expected output format,
and any constraints
2. **Structured Formatting**: Use numbered lists, sections, or delimiters to
organize complex instructions clearly
3. **Few-Shot Examples**: Include concrete examples showing desired input/output
pairs when appropriate
4. **Role/Persona**: Specify expertise level if relevant (e.g., "You are an expert
mathematician...")
5. **Step-by-Step Decomposition**: Break complex reasoning tasks into explicit
steps or phases
6. **Output Format Specification**: Explicitly define the format, structure, and
constraints for outputs
7. **Constraint Specification**: Clearly state what to avoid, exclude, or not do
8. **Verification Instructions**: Add self-checking steps for calculation-heavy
or error-prone tasks
9. **Chain-of-Thought Prompting**: For reasoning tasks, explicitly instruct to
show intermediate steps
CRITICAL REQUIREMENT - TEMPLATE VARIABLES:
The following variables MUST be preserved EXACTLY as shown in the original prompts.
DO NOT modify, remove, or change the formatting of these variables in any way:
{template_variables}
IMPORTANT: Template variables use double curly braces like {{{{variable_name}}}}.
You MUST copy them exactly as they appear in the original prompt into your improved
prompt. If a variable appears as {{{{question}}}} in the original, it must appear as
{{{{question}}}} in your improvement.
{custom_guidelines}
INSTRUCTIONS:
Generate improved versions of the prompts by applying relevant prompt engineering
best practices. Make your prompts specific and actionable.
{extra_instructions}
CRITICAL: Preserve all template variables in their exact original format with
double curly braces.
CRITICAL: You must respond with a valid JSON object using the EXACT prompt names
shown above. The JSON keys must match the "Prompt name" fields exactly. Use this
structure:
{{
{response_format_example}
}}
REMINDER:
1. Use the exact prompt names as JSON keys (e.g., if the prompt is named
"aime_solver", use "aime_solver" as the key)
2. Every template variable from the original prompt must appear unchanged in your
improved version
3. Apply best practices that are most relevant to the task at hand
Do not include any text before or after the JSON object. Do not include
explanations or reasoning.
"""
@experimental(version="3.9.0")
class MetaPromptOptimizer(BasePromptOptimizer):
"""
A prompt optimizer that uses metaprompting with LLMs to improve prompts in a single pass.
Automatically detects optimization mode based on training data:
- Zero-shot: No evaluation data - applies general prompt engineering best practices
- Few-shot: Has evaluation data - learns from evaluation results
This optimizer performs a single optimization pass, making it faster than iterative
approaches like GEPA while requiring less data. The optimized prompt is always
registered regardless of performance improvement.
Args:
reflection_model: Name of the model to use for prompt optimization.
Format: "<provider>:/<model>" (e.g., "openai:/gpt-5.2",
"anthropic:/claude-sonnet-4-5-20250929")
lm_kwargs: Optional dictionary of additional parameters to pass to the reflection
model (e.g., {"temperature": 1.0, "max_tokens": 4096}). These are passed
directly to the underlying litellm.completion() call. Default: None
guidelines: Optional custom guidelines to provide domain-specific or task-specific
context for prompt optimization (e.g., "This is for a finance advisor to
project tax situations."). Default: None
Example with evaluation data (few-shot mode):
.. code-block:: python
import mlflow
import openai
from mlflow.genai.optimize.optimizers import MetaPromptOptimizer
from mlflow.genai.scorers import Correctness
prompt = mlflow.genai.register_prompt(
name="qa",
template="Answer the following question: {{question}}",
)
def predict_fn(question: str) -> str:
prompt = mlflow.genai.load_prompt("prompts:/qa@latest")
completion = openai.OpenAI().chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt.format(question=question)}],
)
return completion.choices[0].message.content
dataset = [
{"inputs": {"question": "What is the capital of France?"}, "outputs": "Paris"},
{"inputs": {"question": "What is 2+2?"}, "outputs": "4"},
]
result = mlflow.genai.optimize_prompts(
predict_fn=predict_fn,
train_data=dataset,
prompt_uris=[prompt.uri],
optimizer=MetaPromptOptimizer(
reflection_model="openai:/gpt-4o",
lm_kwargs={"temperature": 1.0, "max_tokens": 4096},
),
scorers=[Correctness(model="openai:/gpt-4o")],
)
print(f"Improved prompt: {result.optimized_prompts[0].template}")
Example without evaluation data (zero-shot mode):
.. code-block:: python
import mlflow
from mlflow.genai.optimize.optimizers import MetaPromptOptimizer
prompt = mlflow.genai.register_prompt(
name="qa",
template="Answer: {{question}}",
)
# Zero-shot mode: no evaluation data
result = mlflow.genai.optimize_prompts(
predict_fn=lambda question: "", # Not used in zero-shot
train_data=[], # Empty dataset triggers zero-shot mode
prompt_uris=[prompt.uri],
optimizer=MetaPromptOptimizer(
reflection_model="openai:/gpt-4o",
guidelines="This is for a finance advisor to project tax situations.",
),
scorers=[], # No scorers needed for zero-shot
)
print(f"Improved prompt: {result.optimized_prompts[0].template}")
"""
def __init__(
self,
reflection_model: str,
lm_kwargs: dict[str, Any] | None = None,
guidelines: str | None = None,
):
from mlflow.metrics.genai.model_utils import _parse_model_uri
self.reflection_model = reflection_model
self.lm_kwargs = lm_kwargs or {}
self.guidelines = guidelines
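        # e.g. "openai:/gpt-4o" is parsed into provider="openai", model="gpt-4o"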
self.provider, self.model = _parse_model_uri(self.reflection_model)
self._validate_parameters()
def _validate_parameters(self):
if not isinstance(self.lm_kwargs, dict):
raise MlflowException("`lm_kwargs` must be a dictionary")
    def optimize(
self,
eval_fn: _EvalFunc,
train_data: list[dict[str, Any]],
target_prompts: dict[str, str],
enable_tracking: bool = True,
) -> PromptOptimizerOutput:
"""
Optimize the target prompts using metaprompting in a single pass.
Automatically detects mode:
- If train_data is empty: zero-shot mode (no evaluation)
- If train_data has examples: few-shot mode (with baseline evaluation for feedback)
The optimized prompt is always returned regardless of performance improvement.
Args:
eval_fn: The evaluation function that takes candidate prompts and dataset,
returns evaluation results. Not used in zero-shot mode.
train_data: The dataset to use for optimization. Empty list triggers zero-shot
mode. In few-shot mode, train_data is always used for baseline evaluation
(to capture feedback) and for showing examples in the meta-prompt.
target_prompts: The target prompt templates as dict (name -> template).
enable_tracking: If True (default), automatically log optimization progress.
Returns:
The optimized prompts with initial score (final_eval_score is None for
single-pass).
"""
# Extract template variables
template_variables = self._extract_template_variables(target_prompts)
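        # e.g. {"qa": {"question", "context"}} -- prompt name -> set of variable names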
# Auto-detect mode based on training data
if not train_data:
_logger.info("No training data provided, using zero-shot metaprompting")
return self._optimize_zero_shot(target_prompts, template_variables, enable_tracking)
else:
_logger.info(
f"{len(train_data)} training examples provided, using few-shot metaprompting"
)
return self._optimize_few_shot(
eval_fn,
train_data,
target_prompts,
template_variables,
enable_tracking,
)
def _optimize_zero_shot(
self,
target_prompts: dict[str, str],
template_variables: dict[str, set[str]],
enable_tracking: bool,
) -> PromptOptimizerOutput:
"""
Optimize prompts using zero-shot metaprompting (no evaluation data).
Applies general prompt engineering best practices in a single pass.
"""
_logger.info("Applying zero-shot prompt optimization with best practices")
meta_prompt = self._build_zero_shot_meta_prompt(target_prompts, template_variables)
try:
improved_prompts = self._call_reflection_model(meta_prompt, enable_tracking)
self._validate_prompt_names(target_prompts, improved_prompts)
self._validate_template_variables(target_prompts, improved_prompts)
_logger.info("Successfully generated improved prompts")
return PromptOptimizerOutput(
optimized_prompts=improved_prompts,
initial_eval_score=None, # No evaluation in zero-shot mode
final_eval_score=None,
)
except Exception as e:
_logger.warning(f"Zero-shot optimization failed: {e}. Returning original prompts.")
return PromptOptimizerOutput(
optimized_prompts=target_prompts,
initial_eval_score=None,
final_eval_score=None,
)
def _optimize_few_shot(
self,
eval_fn: _EvalFunc,
train_data: list[dict[str, Any]],
target_prompts: dict[str, str],
template_variables: dict[str, set[str]],
enable_tracking: bool,
) -> PromptOptimizerOutput:
"""
Optimize prompts using few-shot metaprompting (with evaluation feedback).
Performs a single optimization pass based on evaluation results from training examples.
The optimized prompt is always returned regardless of performance improvement.
Args:
eval_fn: Evaluation function to score prompts
train_data: Training data used for baseline evaluation to capture feedback
target_prompts: Initial prompts to optimize
template_variables: Template variables extracted from prompts
enable_tracking: Whether to log metrics to MLflow
"""
# Always evaluate baseline on train_data to capture feedback for metaprompting
_logger.info("Evaluating baseline prompts on training data...")
baseline_results = eval_fn(target_prompts, train_data)
initial_score = self._compute_aggregate_score(baseline_results)
if initial_score is not None:
_logger.info(f"Baseline score: {initial_score:.4f}")
# Build meta-prompt with evaluation feedback
_logger.info("Generating optimized prompts...")
meta_prompt = self._build_few_shot_meta_prompt(
target_prompts,
template_variables,
baseline_results,
)
# Call LLM to generate improved prompts
try:
improved_prompts = self._call_reflection_model(meta_prompt, enable_tracking)
self._validate_prompt_names(target_prompts, improved_prompts)
self._validate_template_variables(target_prompts, improved_prompts)
_logger.info("Successfully generated optimized prompts")
except Exception as e:
_logger.warning(f"Few-shot optimization failed: {e}. Returning original prompts.")
return PromptOptimizerOutput(
optimized_prompts=target_prompts,
initial_eval_score=initial_score,
final_eval_score=None,
)
final_score = None
if initial_score is not None:
_logger.info(
"Evaluating optimized prompts on training data, please note that this is more of "
"a sanity check than a final evaluation because the data has already been used "
"for meta-prompting. To accurately evaluate the optimized prompts, please use a "
"separate validation dataset and run mlflow.genai.evaluate() on it."
)
final_results = eval_fn(improved_prompts, train_data)
final_score = self._compute_aggregate_score(final_results)
_logger.info(f"Final score: {final_score:.4f}")
return PromptOptimizerOutput(
optimized_prompts=improved_prompts,
initial_eval_score=initial_score,
final_eval_score=final_score,
)
def _extract_template_variables(self, prompts: dict[str, str]) -> dict[str, set[str]]:
"""
Extract template variables ({{var}}) from each prompt.
Args:
prompts: Dict mapping prompt_name -> template
Returns:
Dict mapping prompt_name -> set of variable names
"""
variables = {}
for name, template in prompts.items():
# Match {{variable}} pattern (MLflow uses double braces)
matches = _TEMPLATE_VAR_PATTERN.findall(template)
variables[name] = set(matches)
return variables
def _validate_prompt_names(
self, original_prompts: dict[str, str], new_prompts: dict[str, str]
) -> bool:
"""
Validate that prompt names match between original and new prompts.
Args:
original_prompts: Original prompt templates
new_prompts: New prompt templates to validate
Returns:
True if valid
Raises:
MlflowException: If prompt names don't match
"""
# Check for unexpected prompts in the improved prompts
if unexpected_prompts := set(new_prompts) - set(original_prompts):
raise MlflowException(
f"Unexpected prompts found in improved prompts: {sorted(unexpected_prompts)}"
)
# Check for missing prompts in the improved prompts
if missing_prompts := set(original_prompts) - set(new_prompts):
raise MlflowException(
f"Prompts missing from improved prompts: {sorted(missing_prompts)}"
)
return True
def _validate_template_variables(
self, original_prompts: dict[str, str], new_prompts: dict[str, str]
) -> bool:
"""Validate that all template variables are preserved in new prompts."""
original_vars = self._extract_template_variables(original_prompts)
new_vars = self._extract_template_variables(new_prompts)
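        # e.g. if "{{question}}" was renamed to "{{query}}" in the improved prompt,
        # the mismatch is reported as Missing: {'question'} / Extra: {'query'}.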
for name in original_prompts:
if original_vars[name] != new_vars[name]:
missing = original_vars[name] - new_vars[name]
extra = new_vars[name] - original_vars[name]
msg = f"Template variables mismatch in prompt '{name}'."
if missing:
msg += f" Missing: {missing}."
if extra:
msg += f" Extra: {extra}."
raise MlflowException(msg)
return True
def _build_zero_shot_meta_prompt(
self,
current_prompts: dict[str, str],
template_variables: dict[str, set[str]],
) -> str:
# Format the current prompts for each module
prompts_formatted = "\n\n".join(
[
f"Prompt name: {name}\nTemplate: {template}"
for name, template in current_prompts.items()
]
)
# Format template variables
vars_formatted = "\n".join(
[
f"- Prompt '{name}': {', '.join(sorted(vars)) if vars else 'none'}"
for name, vars in template_variables.items()
]
)
# Add custom guidelines to the meta-prompt if provided
custom_guidelines = f"CUSTOM GUIDELINES:\n{self.guidelines}" if self.guidelines else ""
# Format example JSON response with actual prompt names
response_format_example = "\n".join(
[
f' "{name}": "improved prompt text with variables preserved exactly"'
for name in current_prompts.keys()
]
)
return META_PROMPT_TEMPLATE.format(
current_prompts_formatted=prompts_formatted,
evaluation_examples="",
extra_instructions="",
template_variables=vars_formatted,
custom_guidelines=custom_guidelines,
response_format_example=response_format_example,
)
def _build_few_shot_meta_prompt(
self,
current_prompts: dict[str, str],
template_variables: dict[str, set[str]],
eval_results: list[EvaluationResultRecord],
) -> str:
"""Build few-shot meta-prompt with evaluation feedback."""
# Format current prompts
prompts_formatted = "\n\n".join(
[
f"Prompt name: {name}\nTemplate: {template}"
for name, template in current_prompts.items()
]
)
if not eval_results:
raise MlflowException(
"Few-shot metaprompting requires evaluation results. "
"No evaluation results were provided to _build_few_shot_meta_prompt."
)
# Calculate current score from evaluation results (if scores are available)
current_score = self._compute_aggregate_score(eval_results)
# Format examples and their evaluation results in the meta-prompt
examples_formatted = self._format_examples(eval_results)
# Format template variables
vars_formatted = "\n".join(
[
f"- Prompt '{name}': {', '.join(sorted(vars)) if vars else 'none'}"
for name, vars in template_variables.items()
]
)
# Add custom guidelines to the meta-prompt if provided
custom_guidelines = f"CUSTOM GUIDELINES:\n{self.guidelines}" if self.guidelines else ""
# Format example JSON response with actual prompt names
response_format_example = "\n".join(
[
f' "{name}": "improved prompt text with variables preserved exactly"'
for name in current_prompts.keys()
]
)
# Build evaluation examples section (with or without score)
if current_score is not None:
score_info = f" (Current Score: {current_score:.3f})"
analysis_instructions = """
Before applying best practices, analyze the examples to identify:
1. **Common Failure Patterns**: What mistakes appear repeatedly? (wrong format,
missing steps, calculation errors, etc.)
2. **Success Patterns**: What made successful examples work? (format, detail level,
reasoning approach)
3. **Key Insights**: What do the rationales tell you about quality criteria and
needed improvements?
4. **Task Requirements**: What output format, explanation level, and edge cases
are expected?"""
else:
score_info = ""
analysis_instructions = """
Before applying best practices, analyze the examples to identify:
1. **Output Patterns**: What are the expected outputs for different inputs?
2. **Task Requirements**: What output format, explanation level, and edge cases
are expected?
3. **Common Themes**: What patterns do you see in the input-output relationships?"""
evaluation_examples = f"""EVALUATION EXAMPLES{score_info}:
Below are examples showing how the current prompts performed. Study these to identify
patterns in what worked and what failed.
{examples_formatted}
{analysis_instructions}"""
extra_instructions = """
Focus on applying best practices that directly address the observed patterns.
Add specific instructions, format specifications, or verification steps that would
improve the prompt's effectiveness."""
return META_PROMPT_TEMPLATE.format(
current_prompts_formatted=prompts_formatted,
evaluation_examples=evaluation_examples,
extra_instructions=extra_instructions,
template_variables=vars_formatted,
custom_guidelines=custom_guidelines,
response_format_example=response_format_example,
)
def _format_examples(self, eval_results: list[EvaluationResultRecord]) -> str:
"""Format evaluation results for meta-prompting."""
formatted = []
for i, result in enumerate(eval_results, 1):
rationale_str = (
"\n".join([f" - {k}: {v}" for k, v in result.rationales.items()])
if result.rationales
else " None"
)
# Build example with optional score
example_lines = [
f"Example {i}:",
f" Input: {json.dumps(result.inputs)}",
f" Output: {result.outputs}",
f" Expected: {result.expectations}",
]
if result.score is not None:
example_lines.append(f" Score: {result.score:.3f}")
example_lines.append(f" Rationales:\n{rationale_str}")
formatted.append("\n".join(example_lines) + "\n")
return "\n".join(formatted)
def _call_reflection_model(
self, meta_prompt: str, enable_tracking: bool = True
) -> dict[str, str]:
"""Call the reflection model to generate improved prompts."""
try:
import litellm
except ImportError as e:
raise ImportError(
"litellm is required for metaprompt optimization. "
"Please install it with: `pip install litellm`"
) from e
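        # litellm expects "<provider>/<model>", e.g. "openai:/gpt-4o" becomes "openai/gpt-4o"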
litellm_model = f"{self.provider}/{self.model}"
litellm_params = {
"model": litellm_model,
"messages": [{"role": "user", "content": meta_prompt}],
"response_format": {"type": "json_object"}, # Request JSON output
"max_retries": 3,
**self.lm_kwargs, # Merge user-provided parameters
}
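        # Because lm_kwargs is merged last, user-provided values (e.g. "max_retries")
        # override the defaults above.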
content = None # Initialize to avoid NameError in exception handler
span_context = (
mlflow.start_span(name="metaprompt_reflection", span_type=SpanType.LLM)
if enable_tracking
else nullcontext()
)
with span_context as span:
if enable_tracking:
span.set_inputs({"meta_prompt": meta_prompt, "model": litellm_model})
try:
response = litellm.completion(**litellm_params)
# Extract and parse response
content = response.choices[0].message.content.strip()
                # Strip markdown code fences if present, as some models tend to add them
if content.startswith("```json"):
content = content[7:]
elif content.startswith("```"):
content = content[3:]
content = content.removesuffix("```").strip()
# The content should be a valid JSON object with keys being the prompt
# names and values being the improved prompts.
improved_prompts = json.loads(content)
if not isinstance(improved_prompts, dict):
raise MlflowException(
f"Reflection model returned invalid format. Expected JSON object, "
f"got {type(improved_prompts).__name__}"
)
for key, value in improved_prompts.items():
if not isinstance(value, str):
raise MlflowException(
f"Prompt '{key}' must be a string, got {type(value).__name__}"
)
if enable_tracking:
span.set_outputs(improved_prompts)
return improved_prompts
except json.JSONDecodeError as e:
response_preview = content[:2000] if content else "No content received"
raise MlflowException(
f"Failed to parse reflection model response as JSON: {e}\n"
f"Response: {response_preview}"
) from e
except Exception as e:
raise MlflowException(
f"Failed to call reflection model {litellm_model}: {e}"
) from e
def _compute_aggregate_score(self, results: list[EvaluationResultRecord]) -> float | None:
"""
Compute aggregate score from evaluation results.
Args:
results: List of evaluation results
Returns:
Average score across all examples, or None if no results or scores are None
"""
if not results:
return None
# If any score is None, return None (no scorers were provided)
scores = [r.score for r in results]
if any(s is None for s in scores):
return None
return sum(scores) / len(scores)