Skip to content

Automatic Prompt Engineer (APE)

About

Automatic Prompt Engineer (APE) starts by generating a set of prompt candidates through "forward generation". During forward generation, input and output pairs from the validation set are presented to the language model, and the model is tasked with generating an instruction that could have produced those outputs from the given inputs. After these initial prompts are scored, the top k% of scoring prompts are retained. On the following iterations, these prompts are resampled by asking a language model to create a variation of an existing prompt, then scoring and selecting the top k% of scoring prompts. This process continues until the maximum iteration depth is reached or the score threshold is exceeded.

Citation

@misc{zhou2023largelanguagemodelshumanlevel,
    title={Large Language Models Are Human-Level Prompt Engineers}, 
    author={Yongchao Zhou and Andrei Ioan Muresanu and Ziwen Han and Keiran Paster and Silviu Pitis and Harris Chan and Jimmy Ba},
    year={2023},
    eprint={2211.01910},
    archivePrefix={arXiv},
    primaryClass={cs.LG},
    url={https://arxiv.org/abs/2211.01910}, 
}

Source

APEOptimizer

Bases: BaseOptimizer

APE Optimizer.

Based on Automatic Prompt Engineer from Zhou et al.

@misc{zhou2023largelanguagemodelshumanlevel,
    title={Large Language Models Are Human-Level Prompt Engineers},
    author={Yongchao Zhou and Andrei Ioan Muresanu and Ziwen Han and Keiran Paster and Silviu Pitis and Harris Chan and Jimmy Ba},
    year={2023},
    eprint={2211.01910},
    archivePrefix={arXiv},
    primaryClass={cs.LG},
    url={https://arxiv.org/abs/2211.01910},
}
Source code in src/prompt_optimizer/optimizers/ape.py
class APEOptimizer(BaseOptimizer):
    """
    APE Optimizer.

    Based on Automatic Prompt Engineer from Zhou et al.

    ```
    @misc{zhou2023largelanguagemodelshumanlevel,
        title={Large Language Models Are Human-Level Prompt Engineers},
        author={Yongchao Zhou and Andrei Ioan Muresanu and Ziwen Han and Keiran Paster and Silviu Pitis and Harris Chan and Jimmy Ba},
        year={2023},
        eprint={2211.01910},
        archivePrefix={arXiv},
        primaryClass={cs.LG},
        url={https://arxiv.org/abs/2211.01910},
    }
    ```

    """

    def __init__(
        self,
        *,
        client: ClientType,
        validation_set: ValidationSetType,
        max_depth: int,
        evaluator: Callable[[Prompt, ValidationSetType], ScoreType],
        output_path: Optional[Union[str, Path]] = None,
        input_field: str,
        output_field: str,
        num_initial_prompts: int = 10,
        num_exemplars: int = 5,
        k_percent: float = 0.5,
        score_threshold: Optional[Union[float, int]] = None,
        **kwargs,
    ):
        """
        Initialize the APE optimizer.

        Args:
            client (ClientType):
                Language model client to use for prompt generation and feedback.
            validation_set (ValidationSetType):
                Set of examples to evaluate the prompt on.
            max_depth (int):
                Maximum iteration depth for prompt generation.
            evaluator (Callable[[Prompt, ValidationSetType], ScoreType]):
                Function that takes a prompt and the validation data and returns a score.
            output_path (Union[str, Path], optional):
                Path to store run results. Should be a .jsonl file path.
                If None, no outputs will be written to disk. Defaults to None.
            input_field (str):
                Field in the validation set that represents the input. Used in forward generation in
                the "Input:" field.
            output_field (str):
                Field in the validation set that represents the output. Used in forward generation in
                the "Output:" field.
            num_initial_prompts (int):
                Number of prompts to create in the initial forward generation. Defaults to 10.
            num_exemplars (int):
                Number of exemplars from the validation set to provide for forward generation.
                A random sample of input and output pairs of this size will be provided to the LLM
                during forward generation. Defaults to 5.
            k_percent (float, optional):
                Top k% of candidate prompts to retain between iterations. Defaults to 0.5.
            score_threshold (float, optional):
                Threshold for early convergence. If a prompt exceeds this score after any iteration, the optimization loop
                immediately ends. If set to None, the optimization loop will not terminate early. Defaults to None.
            kwargs:
                Additional keyword arguments.

        """
        super().__init__(
            client=client,
            seed_prompts=[],  # There is no seeding for APE
            validation_set=validation_set,
            max_depth=max_depth,
            evaluator=evaluator,
            output_path=output_path,
        )
        self.num_initial_prompts = num_initial_prompts
        self.num_exemplars = num_exemplars
        self.input_field = input_field
        self.output_field = output_field
        self.k_percent = k_percent
        self.score_threshold = score_threshold

    def _generate(self, metaprompt_template: str, template_kwargs: dict) -> str:
        """
        Generate a completion for a given template and kwargs and parse the result.

        Args:
            metaprompt_template (str): Template for the metaprompt.
            template_kwargs (dict): Keyword arguments to fill the template values.

        Returns:
            str: The stripped completion text.

        """
        metaprompt = metaprompt_template.format(**template_kwargs)
        # Named `messages` instead of shadowing the builtin `input`
        messages = [{"role": "user", "content": metaprompt}]
        raw_response = self.client.invoke(input=messages)
        return raw_response.content.strip()

    def generate_prompt_candidates(self, *, prompts: list[Prompt], validation_set: ValidationSetType) -> list[Prompt]:
        """
        Generate prompt candidates using forward generation or resampling.

        If `prompts` is empty, performs forward generation: exemplar input/output
        pairs are sampled from the validation set and the LLM is asked to infer
        the instruction. Otherwise, each prompt is resampled into a variation.

        Args:
            prompts (list[Prompt]): Prompts retained from the previous iteration.
            validation_set (ValidationSetType): Examples to draw exemplars from.

        Returns:
            list[Prompt]: Newly generated prompt candidates.

        """
        prompt_candidates = []

        # Do forward generation if there are no prompts
        if len(prompts) == 0:
            for _ in track(range(self.num_initial_prompts), description="Generating new prompts", transient=True):
                # NOTE(review): random.choices samples WITH replacement, so an exemplar
                # may appear more than once per sample — confirm this is intended
                # (random.sample would draw without replacement).
                sample = random.choices(validation_set, k=self.num_exemplars)
                input_output_pairs = "\n".join(
                    [f"**Input:** {exemplar[self.input_field]}   **Output:** {exemplar[self.output_field]}" for exemplar in sample]
                )
                template_kwargs = {"input_output_pairs": input_output_pairs}
                new_prompt = self._generate(metaprompt_template=FORWARD_GENERATION_TEMPLATE, template_kwargs=template_kwargs)
                prompt_candidates.append(Prompt(content=new_prompt))

        # Otherwise, resample the prompts
        else:
            for prompt in track(prompts, description="Generating new prompts", transient=True):
                template_kwargs = {"instruction": prompt.content}
                new_prompt = self._generate(metaprompt_template=RESAMPLING_PROMPT_TEMPLATE, template_kwargs=template_kwargs)
                prompt_candidates.append(Prompt(content=new_prompt))

        return prompt_candidates

    def _get_best_prompt(self, prompts: list[Prompt]) -> Prompt:
        """Return the highest scoring prompt; raise if any prompt is unscored."""
        if any(prompt.score is None for prompt in prompts):
            raise ValueError("All prompts must be scored before calling this function.")
        return max(prompts, key=lambda x: x.score)

    def select_prompt_candidates(self, *, prompts: list[Prompt], validation_set: ValidationSetType) -> list[Prompt]:
        """Score all candidates and select the top scoring k% of prompts."""
        self._score_prompts(prompts=prompts, validation_set=validation_set)
        # Select the top scoring k% of prompts. int() floors the product, so
        # guard against a small candidate pool silently discarding every prompt.
        k = max(1, int(len(prompts) * self.k_percent))
        best_prompts = sorted(prompts, key=lambda x: x.score, reverse=True)[:k]
        return best_prompts

    def check_early_convergence(self, *, all_prompts: list[list[Prompt]]):
        """Check if any prompt meets or exceeds the score threshold."""
        if self.score_threshold is None:
            return False

        # Flatten all iterations
        prompts = sum(all_prompts, start=[])
        if not prompts:
            # Nothing has been scored yet, so convergence cannot be decided
            return False

        # Check if early convergence criteria is met
        highest_score = max(prompts, key=lambda x: x.score).score
        return highest_score >= self.score_threshold

    def select_best_prompt(self, *, all_prompts: list[list[Prompt]]) -> Prompt:
        """Select the highest scoring prompt across all iterations."""
        # Flatten all iterations
        prompts = sum(all_prompts, start=[])

        # Select the single prompt with the highest score
        best_prompt = self._get_best_prompt(prompts=prompts)
        logger.info(f"Best score: {best_prompt.score:.3f}")
        return best_prompt

__init__(*, client, validation_set, max_depth, evaluator, output_path=None, input_field, output_field, num_initial_prompts=10, num_exemplars=5, k_percent=0.5, score_threshold=None, **kwargs)

Initialize the APE optimizer.

Parameters:

Name Type Description Default
client ClientType

Language model client to use for prompt generation and feedback.

required
validation_set ValidationSetType

Set of examples to evaluate the prompt on.

required
max_depth int

Maximum iteration depth for prompt generation.

required
evaluator Callable[[Prompt, ValidationSetType], ScoreType]

Function that takes a prompt and the validation data and returns a score.

required
output_path Union[str, Path]

Path to store run results. Should be a .jsonl file path. If None, no outputs will be written to disk. Defaults to None.

None
input_field str

Field in the validation set that represents the input. Used in forward generation in the "Input:" field.

required
output_field str

Field in the validation set that represents the output. Used in forward generation in the "Output:" field.

required
num_initial_prompts int

Number of prompts to create in the initial forward generation. Defaults to 10.

10
num_exemplars int

Number of exemplars from the validation set to provide for forward generation. A random sample of input and output pairs of this size will be provided to the LLM during forward generation. Defaults to 5.

5
k_percent float

Top k% of candidate prompts to retain between iterations. Defaults to 0.5.

0.5
score_threshold float

Threshold for early convergence. If a prompt exceeds this score after any iteration, the optimization loop immediately ends. If set to None, the optimization loop will not terminate early. Defaults to None.

None
kwargs

Additional keyword arguments.

{}
Source code in src/prompt_optimizer/optimizers/ape.py
def __init__(
    self,
    *,
    client: ClientType,
    validation_set: ValidationSetType,
    max_depth: int,
    evaluator: Callable[[Prompt, ValidationSetType], ScoreType],
    output_path: Optional[Union[str, Path]] = None,
    input_field: str,
    output_field: str,
    num_initial_prompts: int = 10,
    num_exemplars: int = 5,
    k_percent: float = 0.5,
    score_threshold: Optional[Union[float, int]] = None,
    **kwargs,
):
    """
    Initialize the APE optimizer.

    Args:
        client (ClientType):
            Language model client to use for prompt generation and feedback.
        validation_set (ValidationSetType):
            Set of examples to evaluate the prompt on.
        max_depth (int):
            Maximum iteration depth for prompt generation.
        evaluator (Callable[[Prompt, ValidationSetType], ScoreType]):
            Function that takes a prompt and the validation data and returns a score.
        output_path (Union[str, Path], optional):
            Path to store run results. Should be a .jsonl file path.
            If None, no outputs will be written to disk. Defaults to None.
        input_field (str):
            Field in the validation set that represents the input. Used in forward generation in
            the "Input:" field.
        output_field (str):
            Field in the validation set that represents the output. Used in forward generation in
            the "Output:" field.
        num_initial_prompts (int):
            Number of prompts to create in the initial forward generation. Defaults to 10.
        num_exemplars (int):
            Number of exemplars from the validation set to provide for forward generation.
            A random sample of input and output pairs of this size will be provided to the LLM
            during forward generation. Defaults to 5.
        k_percent (float, optional):
            Top k% of candidate prompts to retain between iterations. Defaults to 0.5.
        score_threshold (float, optional):
            Threshold for early convergence. If a prompt exceeds this score after any iteration, the optimization loop
            immediately ends. If set to None, the optimization loop will not terminate early. Defaults to None.
        kwargs:
            Additional keyword arguments.

    """
    super().__init__(
        client=client,
        seed_prompts=[],  # There is no seeding for APE
        validation_set=validation_set,
        max_depth=max_depth,
        evaluator=evaluator,
        output_path=output_path,
    )
    self.num_initial_prompts = num_initial_prompts
    self.num_exemplars = num_exemplars
    self.input_field = input_field
    self.output_field = output_field
    self.k_percent = k_percent
    self.score_threshold = score_threshold

check_early_convergence(*, all_prompts)

Check if any prompt exceeds the score threshold.

Source code in src/prompt_optimizer/optimizers/ape.py
def check_early_convergence(self, *, all_prompts: list[list[Prompt]]):
    """Return True when the best score so far meets or beats the threshold."""
    if self.score_threshold is None:
        # No threshold configured: never terminate early
        return False

    # Collapse the per-iteration lists into one flat list of prompts
    flattened = [candidate for iteration in all_prompts for candidate in iteration]

    # Compare the single best score against the configured threshold
    best = max(flattened, key=lambda candidate: candidate.score)
    return best.score >= self.score_threshold

generate_prompt_candidates(*, prompts, validation_set)

Generate prompt candidates using forward generation or resampling.

Source code in src/prompt_optimizer/optimizers/ape.py
def generate_prompt_candidates(self, *, prompts: list[Prompt], validation_set: ValidationSetType) -> list[Prompt]:
    """Generate prompt candidates using forward generation or resampling."""
    candidates: list[Prompt] = []

    if prompts:
        # Resampling path: ask the LLM for a variation of each retained prompt
        for existing in track(prompts, description="Generating new prompts", transient=True):
            variation = self._generate(
                metaprompt_template=RESAMPLING_PROMPT_TEMPLATE,
                template_kwargs={"instruction": existing.content},
            )
            candidates.append(Prompt(content=variation))
        return candidates

    # Forward-generation path: infer instructions from sampled exemplar pairs
    for _ in track(range(self.num_initial_prompts), description="Generating new prompts", transient=True):
        exemplars = random.choices(validation_set, k=self.num_exemplars)
        pairs_text = "\n".join(
            f"**Input:** {ex[self.input_field]}   **Output:** {ex[self.output_field]}" for ex in exemplars
        )
        instruction = self._generate(
            metaprompt_template=FORWARD_GENERATION_TEMPLATE,
            template_kwargs={"input_output_pairs": pairs_text},
        )
        candidates.append(Prompt(content=instruction))

    return candidates

get_all_prompts(include_candidates=False)

Get all the prompts from the latest training run.

The default behavior returns a list of lists, where each internal list contains the retained candidates after one iteration step. Setting include_candidates to True will also include all generated candidate prompts.

Parameters:

Name Type Description Default
include_candidates bool

Whether to include all the candidate prompts in the output. If True, candidate prompts from each iteration will be included. Defaults to False.

False

Returns:

Type Description
list[list[Prompt]]

list[list[Prompt]]: List of lists where each list contains the prompts from each iteration. E.g. list[0] contains prompts from the first iteration, list[1] the second, etc. If include_candidates is False, each inner list contains only the retained prompts at each iteration. If include_candidates is True, each inner list contains all candidate prompts at each iteration, including those that were discarded.

Source code in src/prompt_optimizer/optimizers/base.py
def get_all_prompts(self, include_candidates: bool = False) -> list[list[Prompt]]:
    """
    Get all the prompts from the latest training run.

    Each inner list corresponds to one iteration step. By default only the
    prompts retained after selection at each step are returned; pass
    include_candidates=True to instead return every generated candidate,
    including those that were discarded.

    Args:
        include_candidates (bool, optional):
            Whether to include all the candidate prompts in the output.
            If True, candidate prompts from each iteration will be included.
            Defaults to False.

    Returns:
        list[list[Prompt]]:
            List of lists where each list contains the prompts from each iteration.
            E.g. list[0] contains prompts from the first iteration, list[1] the second, etc.

    """
    # _g tracks every generated candidate per iteration; _p only the retained ones
    return self._g if include_candidates else self._p

run()

Run the optimization pipeline.

Source code in src/prompt_optimizer/optimizers/base.py
def run(self) -> Prompt:
    """
    Run the optimization pipeline.

    Scores the seed prompts, then iterates up to ``max_depth`` times: each
    iteration generates candidate prompts from the previously retained prompts,
    scores and selects the best candidates, and stops early if the convergence
    criterion is met. Retained prompts are optionally written to disk.

    Returns:
        Prompt: The highest scoring prompt found across all iterations.

    """
    # Score seed_prompts
    self.seed_prompts = self._score_prompts(self.seed_prompts, self.validation_set)

    # Initialize objects
    # _p[t] holds prompts retained after iteration t; _g[t] holds every
    # candidate generated at iteration t (index 0 is the seed prompts).
    self._p = [self.seed_prompts]
    self._g = [self.seed_prompts]

    # Iterate until max depth
    for t in track(range(1, self.max_depth + 1), description="Step", total=self.max_depth):
        # Generate prompt candidates
        g_t = self.generate_prompt_candidates(prompts=self._p[t - 1], validation_set=self.validation_set)
        self._g.append(g_t)
        # Select prompt candidates
        p_t = self.select_prompt_candidates(prompts=self._g[t], validation_set=self.validation_set)
        self._p.append(p_t)
        # Check for early convergence
        if self.check_early_convergence(all_prompts=self._p):
            break

    # Save prompts if requested
    self.save_prompts(output_path=self.output_path)

    # Return best prompt
    return self.select_best_prompt(all_prompts=self._p)

save_prompts(output_path)

Save prompts in jsonl format.

Source code in src/prompt_optimizer/optimizers/base.py
def save_prompts(self, output_path: Optional[Union[str, Path]]):
    """
    Save prompts in jsonl format.

    Flattens the retained prompts from every iteration, deduplicates them,
    and writes one JSON object per line.

    Args:
        output_path (Union[str, Path], optional):
            Destination .jsonl file path. If None, nothing is written.

    """
    # Exit if no output path is given. Guard on the argument itself rather
    # than self.output_path so the parameter is honored: previously a None
    # argument with self.output_path set would crash open(None), and a real
    # argument with self.output_path unset was silently ignored.
    if output_path is None:
        return

    # Get and deduplicate prompts
    prompts = sum(self._p, start=[])
    prompts = list(set(prompts))

    # Save the prompts to the file
    lines = [prompt.model_dump_json() for prompt in prompts]
    with open(output_path, "w") as f:
        for line in lines:
            f.write(line)
            f.write("\n")

select_best_prompt(*, all_prompts)

Select the highest scoring prompt.

Source code in src/prompt_optimizer/optimizers/ape.py
def select_best_prompt(self, *, all_prompts: list[list[Prompt]]) -> Prompt:
    """Return the single highest scoring prompt across every iteration."""
    # Merge the per-iteration prompt lists into one flat list
    flattened = [candidate for iteration in all_prompts for candidate in iteration]

    # Delegate to the scored-prompt helper to pick the winner
    winner = self._get_best_prompt(prompts=flattened)
    logger.info(f"Best score: {winner.score:.3f}")
    return winner

select_prompt_candidates(*, prompts, validation_set)

Select the top scoring k% of prompts.

Source code in src/prompt_optimizer/optimizers/ape.py
def select_prompt_candidates(self, *, prompts: list[Prompt], validation_set: ValidationSetType) -> list[Prompt]:
    """
    Score all candidates and select the top scoring k% of prompts.

    Args:
        prompts (list[Prompt]): Candidate prompts to score and filter.
        validation_set (ValidationSetType): Examples used to score each prompt.

    Returns:
        list[Prompt]: The highest scoring prompts, ordered best-first.

    """
    self._score_prompts(prompts=prompts, validation_set=validation_set)
    # Select the top scoring k% of prompts. int() floors the product, so
    # guard against a small candidate pool silently discarding every prompt.
    k = max(1, int(len(prompts) * self.k_percent))
    best_prompts = sorted(prompts, key=lambda x: x.score, reverse=True)[:k]
    return best_prompts