Skip to content

Optimization by PROmpting (OPRO)

About

Optimization by PROmpting (OPRO) starts with a seed prompt. At each iteration, a collection of past prompt candidates and scores and a random sample of input and output pairs from the validation set are formatted into a metaprompt. The metaprompt is sent to the language model multiple times, each time asking it to provide a new candidate prompt that improves the instructions for the task. These new candidates are scored and the iterations continue until the maximum depth is reached or the score threshold is exceeded.

Citation

@misc{yang2024largelanguagemodelsoptimizers,
    title={Large Language Models as Optimizers},
    author={Chengrun Yang and Xuezhi Wang and Yifeng Lu and Hanxiao Liu and Quoc V. Le and Denny Zhou and Xinyun Chen},
    year={2024},
    eprint={2309.03409},
    archivePrefix={arXiv},
    primaryClass={cs.LG},
    url={https://arxiv.org/abs/2309.03409},
}

Source

OPROOptimizer

Bases: BaseOptimizer

OPRO Optimizer.

Based on Optimization by PROmpting from Yang et al., 2024

@misc{yang2024largelanguagemodelsoptimizers,
    title={Large Language Models as Optimizers},
    author={Chengrun Yang and Xuezhi Wang and Yifeng Lu and Hanxiao Liu and Quoc V. Le and Denny Zhou and Xinyun Chen},
    year={2024},
    eprint={2309.03409},
    archivePrefix={arXiv},
    primaryClass={cs.LG},
    url={https://arxiv.org/abs/2309.03409},
}
Source code in src/prompt_optimizer/optimizers/opro.py
class OPROOptimizer(BaseOptimizer):
    """
    OPRO Optimizer.

    Based on Optimization by PROmpting from Yang, et. al. 2024

    ```
    @misc{yang2024largelanguagemodelsoptimizers,
        title={Large Language Models as Optimizers},
        author={Chengrun Yang and Xuezhi Wang and Yifeng Lu and Hanxiao Liu and Quoc V. Le and Denny Zhou and Xinyun Chen},
        year={2024},
        eprint={2309.03409},
        archivePrefix={arXiv},
        primaryClass={cs.LG},
        url={https://arxiv.org/abs/2309.03409},
    }
    ```

    """

    def __init__(
        self,
        *,
        client: ClientType,
        seed_prompts: list[Prompt],
        validation_set: ValidationSetType,
        max_depth: int,
        evaluator: Callable[[Prompt, ValidationSetType], ScoreType],
        output_path: Optional[Union[str, Path]] = None,
        input_field: str,
        output_field: str,
        num_candidates_per_step: int = 20,
        num_exemplars: int = 3,
        max_demonstration_prompts: int = 20,
        score_threshold: Optional[Union[float, int]] = None,
        **kwargs,
    ):
        """
        Initialize the OPRO optimizer.

        Args:
            client (ClientType):
                Language model client to use for prompt generation and feedback.
            seed_prompts (list[Prompt]):
                List of prompts to seed generation.
            validation_set (ValidationSetType):
                Set of examples to evaluate the prompt on.
            max_depth (int):
                Maximum iteration depth for prompt generation.
            evaluator (Callable[[Prompt, ValidationSetType], ScoreType]):
                Function that takes a prompt and the validation data and returns a score.
            output_path (Union[str, Path], optional):
                Path to store run results. Should be a .jsonl file path.
                If None, no outputs will be written to disk. Defaults to None.
            input_field (str):
                Field in the validation set that represents the input. Used in candidate generation in
                the "input:" field.
            output_field (str):
                Field in the validation set that represents the output. Used in candidate generation in
                the "output:" field.
            num_candidates_per_step (int, optional):
                Number of candidates to create at each step. Defaults to 20.
            num_exemplars (int, optional):
                Number of exemplars from the validation set to provide in the metaprompt.
                A random sample of input and output pairs of this size will be provided to the LLM
                during candidate generation. Defaults to 3.
            max_demonstration_prompts (int, optional):
                Maximum number of demonstration prompts to provide in the metaprompt.
                Defaults to 20.
            score_threshold (float, optional):
                Threshold for early convergence. If a prompt exceeds this score after any iteration, the optimization loop
                immediately ends. If set to None, the optimization loop will not terminate early. Defaults to None.
            kwargs:
                Additional keyword arguments (currently unused).
        """
        super().__init__(
            client=client,
            seed_prompts=seed_prompts,
            validation_set=validation_set,
            max_depth=max_depth,
            evaluator=evaluator,
            output_path=output_path,
        )
        self.num_candidates_per_step = num_candidates_per_step
        self.num_exemplars = num_exemplars
        self.max_demonstration_prompts = max_demonstration_prompts
        self.input_field = input_field
        self.output_field = output_field
        self.score_threshold = score_threshold

    def _extract_response(self, content: str) -> str:
        """
        Extract the response between the outermost square brackets.

        Args:
            content (str): Output string from an LLM generation request.

        Returns:
            str: Text captured between the first "[" and the last "]", or the
                whole response if no well-formed bracketed span exists.

        """
        start = content.find("[")
        end = content.rfind("]")

        # Fall back to the full response when there is no well-formed "[...]"
        # span. The "end < start" guard covers inputs like "x]y[z", where the
        # previous split-based implementation raised an IndexError.
        if start == -1 or end == -1 or end < start:
            return content

        return content[start + 1 : end]

    def _generate(self, metaprompt_template: str, template_kwargs: dict) -> str:
        """
        Generate a completion for a given template and kwargs.

        Args:
            metaprompt_template (str): Template for the metaprompt.
            template_kwargs (dict): Keyword arguments to fill the template values.

        Returns:
            str: The stripped generation result.

        """
        metaprompt = metaprompt_template.format(**template_kwargs)
        # Named "messages" to avoid shadowing the builtin ``input``; the
        # client API keyword is still ``input``.
        messages = [{"role": "user", "content": metaprompt}]
        raw_response = self.client.invoke(input=messages)
        return raw_response.content.strip()

    def generate_prompt_candidates(self, *, prompts: list[Prompt], validation_set: ValidationSetType) -> list[Prompt]:
        """Generate prompt candidates and return them together with the originals."""
        # Sort prompts by score, highest to lowest, keeping only the top demonstrations
        sorted_prompts = sorted(prompts, key=lambda x: x.score, reverse=True)[: self.max_demonstration_prompts]

        # Format prompts into instructions_and_scores
        instructions_and_scores = "\n\n".join([f"text:\n{prompt.content}\nscore:\n{prompt.score}" for prompt in sorted_prompts])

        # Generate prompt candidates
        prompt_candidates = []
        for _ in track(range(self.num_candidates_per_step), description="Generating prompt candidates", transient=True):
            # Format a sample of questions into input_output_pairs.
            # NOTE: random.choices samples WITH replacement, so exemplars may repeat.
            sample = random.choices(validation_set, k=self.num_exemplars)
            input_output_pairs = "\n\n".join(
                [f"input:\n<INS>\n{row[self.input_field]}\noutput:\n{row[self.output_field]}" for row in sample]
            )

            # Generate prompt candidate
            template_kwargs = {"instructions_and_scores": instructions_and_scores, "input_output_pairs": input_output_pairs}
            response = self._generate(metaprompt_template=METAPROMPT_TEMPLATE, template_kwargs=template_kwargs)
            prompt_candidate = self._extract_response(response)

            prompt_candidates.append(Prompt(content=prompt_candidate))

        # Return the original prompts plus the new candidates
        return prompts + prompt_candidates

    def select_prompt_candidates(self, *, prompts: list[Prompt], validation_set: ValidationSetType) -> list[Prompt]:
        """Score the candidates; OPRO retains all of them (no filtering)."""
        # Score the prompts
        self._score_prompts(prompts=prompts, validation_set=validation_set)
        # Return all prompts, no filtering is done
        return prompts

    def check_early_convergence(self, *, all_prompts: list[list[Prompt]]) -> bool:
        """Return True when the best score so far meets the early-stopping threshold."""
        if self.score_threshold is None:
            return False

        # Flatten all iterations
        prompts = sum(all_prompts, start=[])

        # Check if early convergence criteria is met
        highest_score = max(prompts, key=lambda x: x.score).score
        return highest_score >= self.score_threshold

    def _get_best_prompt(self, prompts: list[Prompt]) -> Prompt:
        """Get the highest scoring prompt.

        Raises:
            ValueError: If any prompt has not been scored yet.
        """
        if any(prompt.score is None for prompt in prompts):
            raise ValueError("All prompts must be scored before calling this function.")
        return max(prompts, key=lambda x: x.score)

    def select_best_prompt(self, all_prompts: list[list[Prompt]]) -> Prompt:
        """Select the top scoring prompt across all iterations."""
        # Flatten all iterations
        prompts = sum(all_prompts, start=[])

        # Select the single prompt with the highest score
        best_prompt = self._get_best_prompt(prompts=prompts)
        logger.info(f"Best score: {best_prompt.score:.3f}")
        return best_prompt

__init__(*, client, seed_prompts, validation_set, max_depth, evaluator, output_path=None, input_field, output_field, num_candidates_per_step=20, num_exemplars=3, max_demonstration_prompts=20, score_threshold=None, **kwargs)

Initialize the OPRO optimizer.

Parameters:

Name Type Description Default
client ClientType

Language model client to use for prompt generation and feedback.

required
seed_prompts list[Prompt]

List of prompts to seed generation.

required
validation_set ValidationSetType

Set of examples to evaluate the prompt on.

required
max_depth int

Maximum iteration depth for prompt generation.

required
evaluator Callable[[Prompt, ValidationSetType], ScoreType]

Function that takes a prompt and the validation data and returns a score.

required
output_path Union[str, Path]

Path to store run results. Should be a .jsonl file path. If None, no outputs will be written to disk. Defaults to None.

None
input_field str

Field in the validation set that represents the input. Used in candidate generation in the "input:" field.

required
output_field str

Field in the validation set that represents the output. Used in candidate generation in the "output:" field.

required
num_candidates_per_step int

Number of candidates to create at each step. Defaults to 20.

20
num_exemplars int

Number of exemplars from the validation set to provide in the metaprompt. A random sample of input and output pairs of this size will be provided to the LLM during candidate generation. Defaults to 3.

3
max_demonstration_prompts int

Maximum number of demonstration prompts to provide in the metaprompt. Defaults to 20.

20
score_threshold float

Threshold for early convergence. If a prompt exceeds this score after any iteration, the optimization loop immediately ends. If set to None, the optimization loop will not terminate early. Defaults to None.

None
kwargs

Additional keyword arguments.

{}
Source code in src/prompt_optimizer/optimizers/opro.py
def __init__(
    self,
    *,
    client: ClientType,
    seed_prompts: list[Prompt],
    validation_set: ValidationSetType,
    max_depth: int,
    evaluator: Callable[[Prompt, ValidationSetType], ScoreType],
    output_path: Optional[Union[str, Path]] = None,
    input_field: str,
    output_field: str,
    num_candidates_per_step: int = 20,
    num_exemplars: int = 3,
    max_demonstration_prompts: int = 20,
    score_threshold: Optional[Union[float, int]] = None,
    **kwargs,
):
    """
    Initialize the OPRO optimizer.

    Args:
        client (ClientType):
            Language model client to use for prompt generation and feedback.
        seed_prompts (list[Prompt]):
            List of prompts to seed generation.
        validation_set (ValidationSetType):
            Set of examples to evaluate the prompt on.
        max_depth (int):
            Maximum iteration depth for prompt generation.
        evaluator (Callable[[Prompt, ValidationSetType], ScoreType]):
            Function that takes a prompt and the validation data and returns a score.
        output_path (Union[str, Path], optional):
            Path to store run results. Should be a .jsonl file path.
            If None, no outputs will be written to disk. Defaults to None.
        input_field (str):
            Field in the validation set that represents the input. Used in candidate generation in
            the "input:" field.
        output_field (str):
            Field in the validation set that represents the output. Used in candidate generation in
            the "output:" field.
        num_candidates_per_step (int, optional):
            Number of candidates to create at each step. Defaults to 20.
        num_exemplars (int, optional):
            Number of exemplars from the validation set to provide in the metaprompt.
            A random sample of input and output pairs of this size will be provided to the LLM
            during candidate generation. Defaults to 3.
        max_demonstration_prompts (int, optional):
            Maximum number of demonstration prompts to provide in the metaprompt.
            Defaults to 20.
        score_threshold (float, optional):
            Threshold for early convergence. If a prompt exceeds this score after any iteration, the optimization loop
            immediately ends. If set to None, the optimization loop will not terminate early. Defaults to None.
        kwargs:
            Additional keyword arguments.
    """
    super().__init__(
        client=client,
        seed_prompts=seed_prompts,
        validation_set=validation_set,
        max_depth=max_depth,
        evaluator=evaluator,
        output_path=output_path,
    )
    self.num_candidates_per_step = num_candidates_per_step
    self.num_exemplars = num_exemplars
    self.max_demonstration_prompts = max_demonstration_prompts
    self.input_field = input_field
    self.output_field = output_field
    self.score_threshold = score_threshold

check_early_convergence(*, all_prompts)

Detect early convergence.

Source code in src/prompt_optimizer/optimizers/opro.py
def check_early_convergence(self, *, all_prompts: list[list[Prompt]]) -> bool:
    """Return True when the best score so far meets the early-stopping threshold."""
    # Early stopping is disabled when no threshold was configured
    if self.score_threshold is None:
        return False

    # Collapse the per-iteration lists into one flat list
    flattened = [prompt for iteration in all_prompts for prompt in iteration]

    # Converged once any prompt reaches the threshold
    best = max(prompt.score for prompt in flattened)
    return best >= self.score_threshold

generate_prompt_candidates(*, prompts, validation_set)

Generate prompt candidates.

Source code in src/prompt_optimizer/optimizers/opro.py
def generate_prompt_candidates(self, *, prompts: list[Prompt], validation_set: ValidationSetType) -> list[Prompt]:
    """Generate new candidate prompts and return them appended to the existing pool.

    The highest-scoring prompts become the demonstration section of the
    metaprompt; each candidate request also embeds a fresh random sample of
    input/output exemplars from the validation set.
    """
    # Keep only the top-scoring prompts as demonstrations, best first
    top_prompts = sorted(prompts, key=lambda p: p.score, reverse=True)[: self.max_demonstration_prompts]
    instructions_and_scores = "\n\n".join([f"text:\n{p.content}\nscore:\n{p.score}" for p in top_prompts])

    new_candidates = []
    for _ in track(range(self.num_candidates_per_step), description="Generating prompt candidates", transient=True):
        # Draw exemplars (with replacement) and render them as input/output pairs
        exemplars = random.choices(validation_set, k=self.num_exemplars)
        input_output_pairs = "\n\n".join(
            [f"input:\n<INS>\n{row[self.input_field]}\noutput:\n{row[self.output_field]}" for row in exemplars]
        )

        # Ask the LLM for one new instruction and parse it out of the reply
        generated = self._generate(
            metaprompt_template=METAPROMPT_TEMPLATE,
            template_kwargs={
                "instructions_and_scores": instructions_and_scores,
                "input_output_pairs": input_output_pairs,
            },
        )
        new_candidates.append(Prompt(content=self._extract_response(generated)))

    # The caller receives the original pool plus the freshly generated candidates
    return prompts + new_candidates

get_all_prompts(include_candidates=False)

Get all the prompts from the latest training run.

The default behavior returns a list of lists, where each internal list contains the retained candidates after one iteration step. Setting include_candidates to True will also include all generated candidate prompts.

Parameters:

Name Type Description Default
include_candidates bool

Whether to include all the candidate prompts in the output. If True, candidate prompts from each iteration will be included. Defaults to False.

False

Returns:

Type Description
list[list[Prompt]]

list[list[Prompt]]: List of lists where each list contains the prompts from each iteration. E.g. list[0] contains prompts from the first iteration, list[1] the second, etc. If include_candidates is False, each inner list contains only the retained prompts at each iteration. If include_candidates is True, each inner list contains all candidate prompts at each iteration, including those that were discarded.

Source code in src/prompt_optimizer/optimizers/base.py
def get_all_prompts(self, include_candidates: bool = False) -> list[list[Prompt]]:
    """
    Return the prompts recorded during the latest training run.

    By default the result contains, per iteration, only the candidates that
    were retained after selection. Pass include_candidates=True to get every
    generated candidate instead, including those that were discarded.

    Args:
        include_candidates (bool, optional):
            Whether to include all the candidate prompts in the output.
            Defaults to False.

    Returns:
        list[list[Prompt]]:
            One inner list per iteration: list[0] is the first iteration,
            list[1] the second, and so on.

    """
    # _g tracks every generated candidate per iteration; _p tracks only the
    # candidates retained after the selection step.
    return self._g if include_candidates else self._p

run()

Run the optimization pipeline.

Source code in src/prompt_optimizer/optimizers/base.py
def run(self) -> Prompt:
    """
    Run the optimization pipeline.

    Scores the seed prompts, then iterates up to max_depth times:
    generate candidates from the previously retained prompts, score and
    select them, and stop early if check_early_convergence triggers.
    Results are optionally persisted via save_prompts.

    Returns:
        Prompt: The best-scoring prompt found across all iterations.
    """
    # Score seed_prompts
    self.seed_prompts = self._score_prompts(self.seed_prompts, self.validation_set)

    # Initialize objects: _p holds retained prompts per step, _g all generated candidates
    self._p = [self.seed_prompts]
    self._g = [self.seed_prompts]

    # Iterate until max depth
    for t in track(range(1, self.max_depth + 1), description="Step", total=self.max_depth):
        # Generate prompt candidates from the previous step's retained prompts
        g_t = self.generate_prompt_candidates(prompts=self._p[t - 1], validation_set=self.validation_set)
        self._g.append(g_t)
        # Select prompt candidates
        p_t = self.select_prompt_candidates(prompts=self._g[t], validation_set=self.validation_set)
        self._p.append(p_t)
        # Check for early convergence
        if self.check_early_convergence(all_prompts=self._p):
            break

    # Save prompts if requested
    self.save_prompts(output_path=self.output_path)

    # Return best prompt
    return self.select_best_prompt(all_prompts=self._p)

save_prompts(output_path)

Save prompts in jsonl format.

Source code in src/prompt_optimizer/optimizers/base.py
def save_prompts(self, output_path: Optional[Union[str, Path]]):
    """
    Save all retained prompts in jsonl format.

    Args:
        output_path (Union[str, Path], optional):
            Destination .jsonl file. If None, nothing is written.
    """
    # Exit if no output path is set.
    # NOTE: the original checked self.output_path while writing to the
    # output_path argument, which silently ignored a caller-supplied path
    # (or crashed on open(None)); use the argument consistently.
    if output_path is None:
        return

    # Flatten all iterations and deduplicate prompts
    prompts = sum(self._p, start=[])
    prompts = list(set(prompts))

    # Save the prompts to the file, one JSON document per line
    with open(output_path, "w") as f:
        for prompt in prompts:
            f.write(prompt.model_dump_json())
            f.write("\n")

select_best_prompt(all_prompts)

Select the top scoring prompt.

Source code in src/prompt_optimizer/optimizers/opro.py
def select_best_prompt(self, all_prompts: list[list[Prompt]]) -> Prompt:
    """Return the single highest-scoring prompt across every iteration."""
    # Collapse the per-iteration lists into one flat list
    flattened = sum(all_prompts, start=[])

    # Delegate the argmax (and the scored-ness check) to the helper
    winner = self._get_best_prompt(prompts=flattened)
    logger.info(f"Best score: {winner.score:.3f}")
    return winner

select_prompt_candidates(*, prompts, validation_set)

Select prompt candidates.

Source code in src/prompt_optimizer/optimizers/opro.py
def select_prompt_candidates(self, *, prompts: list[Prompt], validation_set: ValidationSetType) -> list[Prompt]:
    """Score every candidate; OPRO performs no filtering, so all prompts survive."""
    # Scoring mutates the prompts in place; the full list is returned unchanged
    self._score_prompts(prompts=prompts, validation_set=validation_set)
    return prompts