Completion

autogen.Completion #

Bases: Completion

(openai<1) A class for the OpenAI completion API.

It also supports the ChatCompletion API and the Azure OpenAI API.

chat_models class-attribute instance-attribute #

chat_models = {'gpt-3.5-turbo', 'gpt-3.5-turbo-0301', 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-16k', 'gpt-3.5-turbo-16k-0613', 'gpt-35-turbo', 'gpt-35-turbo-16k', 'gpt-4', 'gpt-4-32k', 'gpt-4-32k-0314', 'gpt-4-0314', 'gpt-4-0613', 'gpt-4-32k-0613'}

price1K class-attribute instance-attribute #

price1K = {'text-ada-001': 0.0004, 'text-babbage-001': 0.0005, 'text-curie-001': 0.002, 'code-cushman-001': 0.024, 'code-davinci-002': 0.1, 'text-davinci-002': 0.02, 'text-davinci-003': 0.02, 'gpt-3.5-turbo': (0.0015, 0.002), 'gpt-3.5-turbo-instruct': (0.0015, 0.002), 'gpt-3.5-turbo-0301': (0.0015, 0.002), 'gpt-3.5-turbo-0613': (0.0015, 0.002), 'gpt-3.5-turbo-16k': (0.003, 0.004), 'gpt-3.5-turbo-16k-0613': (0.003, 0.004), 'gpt-35-turbo': (0.0015, 0.002), 'gpt-35-turbo-16k': (0.003, 0.004), 'gpt-35-turbo-instruct': (0.0015, 0.002), 'gpt-4': (0.03, 0.06), 'gpt-4-32k': (0.06, 0.12), 'gpt-4-0314': (0.03, 0.06), 'gpt-4-32k-0314': (0.06, 0.12), 'gpt-4-0613': (0.03, 0.06), 'gpt-4-32k-0613': (0.06, 0.12)}

default_search_space class-attribute instance-attribute #

default_search_space = {'model': choice(['text-ada-001', 'text-babbage-001', 'text-davinci-003', 'gpt-3.5-turbo', 'gpt-4']), 'temperature_or_top_p': choice([{'temperature': uniform(0, 2)}, {'top_p': uniform(0, 1)}]), 'max_tokens': lograndint(50, 1000), 'n': randint(1, 100), 'prompt': '{prompt}'} if FLAML_INSTALLED else {}

cache_seed class-attribute instance-attribute #

cache_seed = 41

cache_path class-attribute instance-attribute #

cache_path = f'.cache/{cache_seed}'

retry_wait_time class-attribute instance-attribute #

retry_wait_time = 10

max_retry_period class-attribute instance-attribute #

max_retry_period = 120

request_timeout class-attribute instance-attribute #

request_timeout = 60

openai_completion_class class-attribute instance-attribute #

openai_completion_class = not ERROR and Completion

optimization_budget class-attribute instance-attribute #

optimization_budget = None

logged_history classmethod property #

logged_history

Return the bookkeeping dictionary.

set_cache classmethod #

set_cache(seed=41, cache_path_root='.cache')

Set cache path.

PARAMETER DESCRIPTION
seed

The integer identifier for the pseudo seed. Results corresponding to different seeds will be cached in different places.

TYPE: (int, Optional) DEFAULT: 41

cache_path_root

The root path for the cache. The complete cache path will be {cache_path_root}/{seed}.

TYPE: (str, Optional) DEFAULT: '.cache'

Source code in autogen/oai/completion.py
@classmethod
def set_cache(cls, seed: Optional[int] = 41, cache_path_root: Optional[str] = ".cache"):
    """Set cache path.

    Args:
        seed (int, Optional): The integer identifier for the pseudo seed.
            Results corresponding to different seeds will be cached in different places.
        cache_path (str, Optional): The root path for the cache.
            The complete cache path will be {cache_path_root}/{seed}.
    """
    cls.cache_seed = seed
    cls.cache_path = f"{cache_path_root}/{seed}"
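
For illustration, a minimal usage sketch (assuming the package is imported as `autogen`; the seed and cache root below are arbitrary):

```python
import autogen

# Cache results under .my_cache/123 instead of the default .cache/41.
autogen.Completion.set_cache(seed=123, cache_path_root=".my_cache")
```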

clear_cache classmethod #

clear_cache(seed=None, cache_path_root='.cache')

Clear cache.

PARAMETER DESCRIPTION
seed

The integer identifier for the pseudo seed. If omitted, all caches under cache_path_root will be cleared.

TYPE: (int, Optional) DEFAULT: None

cache_path_root

The root path for the cache. The complete cache path will be {cache_path_root}/{seed}.

TYPE: (str, Optional) DEFAULT: '.cache'

Source code in autogen/oai/completion.py
@classmethod
def clear_cache(cls, seed: Optional[int] = None, cache_path_root: Optional[str] = ".cache"):
    """Clear cache.

    Args:
        seed (int, Optional): The integer identifier for the pseudo seed.
            If omitted, all caches under cache_path_root will be cleared.
        cache_path (str, Optional): The root path for the cache.
            The complete cache path will be {cache_path_root}/{seed}.
    """
    if seed is None:
        shutil.rmtree(cache_path_root, ignore_errors=True)
        return
    with diskcache.Cache(f"{cache_path_root}/{seed}") as cache:
        cache.clear()
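
A minimal sketch of the two clearing modes (the seed value and cache root below are illustrative):

```python
import autogen

# Clear only the cache for seed 123, i.e. the directory .cache/123.
autogen.Completion.clear_cache(seed=123)

# Clear every cached seed under a custom root directory.
autogen.Completion.clear_cache(cache_path_root=".my_cache")
```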

tune classmethod #

tune(data, metric, mode, eval_func, log_file_name=None, inference_budget=None, optimization_budget=None, num_samples=1, logging_level=WARNING, **config)

Tune the parameters for the OpenAI API call.

TODO: support parallel tuning with ray or spark. TODO: support agg_method as in test

PARAMETER DESCRIPTION
data

The list of data points.

TYPE: list

metric

The metric to optimize.

TYPE: str

mode

The optimization mode, "min" or "max".

TYPE: str

eval_func

The evaluation function for responses. The function should take a list of responses and a data point as input, and return a dict of metrics. For example,

TYPE: Callable

    def eval_func(responses, **data):
        solution = data["solution"]
        success_list = []
        n = len(responses)
        for i in range(n):
            response = responses[i]
            succeed = is_equiv_chain_of_thought(response, solution)
            success_list.append(succeed)
        return {
            "expected_success": 1 - pow(1 - sum(success_list) / n, n),
            "success": any(s for s in success_list),
        }

log_file_name

The log file name.

TYPE: (str, Optional) DEFAULT: None

inference_budget

The inference budget, in dollars per instance.

TYPE: (float, Optional) DEFAULT: None

optimization_budget

The optimization budget, in dollars in total.

TYPE: (float, Optional) DEFAULT: None

num_samples

The number of samples to evaluate. -1 means no hard restriction on the number of trials; the actual number is decided by optimization_budget. Defaults to 1.

TYPE: (int, Optional) DEFAULT: 1

logging_level

The logging level. Defaults to logging.WARNING.

DEFAULT: WARNING

**config

The search space to update over the default search.

For prompt, provide a string/Callable or a list of strings/Callables.
- If prompt is provided for chat models, it will be converted to messages under role "user".
- Do not provide both prompt and messages for chat models; provide either of them.
- A string template will be used to generate a prompt for each data instance using prompt.format(**data).
- A callable template will be used to generate a prompt for each data instance using prompt(data).

For stop, provide a string, a list of strings, or a list of lists of strings.

For messages (chat models only), provide a list of messages (for a single chat prefix) or a list of lists of messages (for multiple choices of chat prefix to choose from). Each message should be a dict with keys "role" and "content". The value of "content" can be a string/Callable template.

DEFAULT: {}
RETURNS DESCRIPTION
dict

The optimized hyperparameter setting.

tune.ExperimentAnalysis: The tuning results.

Source code in autogen/oai/completion.py
@classmethod
@require_optional_import("flaml", "flaml")
def tune(
    cls,
    data: list[dict[str, Any]],
    metric: str,
    mode: str,
    eval_func: Callable,
    log_file_name: Optional[str] = None,
    inference_budget: Optional[float] = None,
    optimization_budget: Optional[float] = None,
    num_samples: Optional[int] = 1,
    logging_level: Optional[int] = logging.WARNING,
    **config,
):
    """Tune the parameters for the OpenAI API call.

    TODO: support parallel tuning with ray or spark.
    TODO: support agg_method as in test

    Args:
        data (list): The list of data points.
        metric (str): The metric to optimize.
        mode (str): The optimization mode, "min" or "max.
        eval_func (Callable): The evaluation function for responses.
            The function should take a list of responses and a data point as input,
            and return a dict of metrics. For example,

    ```python
    def eval_func(responses, **data):
        solution = data["solution"]
        success_list = []
        n = len(responses)
        for i in range(n):
            response = responses[i]
            succeed = is_equiv_chain_of_thought(response, solution)
            success_list.append(succeed)
        return {
            "expected_success": 1 - pow(1 - sum(success_list) / n, n),
            "success": any(s for s in success_list),
        }
    ```

        log_file_name (str, optional): The log file.
        inference_budget (float, optional): The inference budget, dollar per instance.
        optimization_budget (float, optional): The optimization budget, dollar in total.
        num_samples (int, optional): The number of samples to evaluate.
            -1 means no hard restriction in the number of trials
            and the actual number is decided by optimization_budget. Defaults to 1.
        logging_level (optional): logging level. Defaults to logging.WARNING.
        **config (dict): The search space to update over the default search.
            For prompt, please provide a string/Callable or a list of strings/Callables.
                - If prompt is provided for chat models, it will be converted to messages under role "user".
                - Do not provide both prompt and messages for chat models, but provide either of them.
                - A string template will be used to generate a prompt for each data instance
                  using `prompt.format(**data)`.
                - A callable template will be used to generate a prompt for each data instance
                  using `prompt(data)`.
            For stop, please provide a string, a list of strings, or a list of lists of strings.
            For messages (chat models only), please provide a list of messages (for a single chat prefix)
            or a list of lists of messages (for multiple choices of chat prefix to choose from).
            Each message should be a dict with keys "role" and "content". The value of "content" can be a string/Callable template.

    Returns:
        dict: The optimized hyperparameter setting.
        tune.ExperimentAnalysis: The tuning results.
    """
    logger.warning(
        "tuning via Completion.tune is deprecated in autogen, pyautogen v0.2 and openai>=1. "
        "flaml.tune supports tuning more generically."
    )
    if ERROR:
        raise ERROR
    space = cls.default_search_space.copy()
    if config is not None:
        space.update(config)
        if "messages" in space:
            space.pop("prompt", None)
        temperature = space.pop("temperature", None)
        top_p = space.pop("top_p", None)
        if temperature is not None and top_p is None:
            space["temperature_or_top_p"] = {"temperature": temperature}
        elif temperature is None and top_p is not None:
            space["temperature_or_top_p"] = {"top_p": top_p}
        elif temperature is not None and top_p is not None:
            space.pop("temperature_or_top_p")
            space["temperature"] = temperature
            space["top_p"] = top_p
            logger.warning("temperature and top_p are not recommended to vary together.")
    cls._max_valid_n_per_max_tokens, cls._min_invalid_n_per_max_tokens = {}, {}
    cls.optimization_budget = optimization_budget
    cls.inference_budget = inference_budget
    cls._prune_hp = "best_of" if space.get("best_of", 1) != 1 else "n"
    cls._prompts = space.get("prompt")
    if cls._prompts is None:
        cls._messages = space.get("messages")
        if not all((isinstance(cls._messages, list), isinstance(cls._messages[0], (dict, list)))):
            error_msg = "messages must be a list of dicts or a list of lists."
            logger.error(error_msg)
            raise AssertionError(error_msg)
        if isinstance(cls._messages[0], dict):
            cls._messages = [cls._messages]
        space["messages"] = tune.choice(list(range(len(cls._messages))))
    else:
        if space.get("messages") is not None:
            error_msg = "messages and prompt cannot be provided at the same time."
            logger.error(error_msg)
            raise AssertionError(error_msg)
        if not isinstance(cls._prompts, (str, list)):
            error_msg = "prompt must be a string or a list of strings."
            logger.error(error_msg)
            raise AssertionError(error_msg)
        if isinstance(cls._prompts, str):
            cls._prompts = [cls._prompts]
        space["prompt"] = tune.choice(list(range(len(cls._prompts))))
    cls._stops = space.get("stop")
    if cls._stops:
        if not isinstance(cls._stops, (str, list)):
            error_msg = "stop must be a string, a list of strings, or a list of lists of strings."
            logger.error(error_msg)
            raise AssertionError(error_msg)
        if not (isinstance(cls._stops, list) and isinstance(cls._stops[0], list)):
            cls._stops = [cls._stops]
        space["stop"] = tune.choice(list(range(len(cls._stops))))
    cls._config_list = space.get("config_list")
    if cls._config_list is not None:
        is_const = is_constant(cls._config_list)
        if is_const:
            space.pop("config_list")
    cls._metric, cls._mode = metric, mode
    cls._total_cost = 0  # total optimization cost
    cls._eval_func = eval_func
    cls.data = data
    cls.avg_input_tokens = None

    space_model = space["model"]
    if not isinstance(space_model, str) and len(space_model) > 1:
        # make a hierarchical search space
        subspace = {}
        if "max_tokens" in space:
            subspace["max_tokens"] = space.pop("max_tokens")
        if "temperature_or_top_p" in space:
            subspace["temperature_or_top_p"] = space.pop("temperature_or_top_p")
        if "best_of" in space:
            subspace["best_of"] = space.pop("best_of")
        if "n" in space:
            subspace["n"] = space.pop("n")
        choices = []
        for model in space["model"]:
            choices.append({"model": model, **subspace})
        space["subspace"] = tune.choice(choices)
        space.pop("model")
        # start all the models with the same hp config
        search_alg = BlendSearch(
            cost_attr="cost",
            cost_budget=optimization_budget,
            metric=metric,
            mode=mode,
            space=space,
        )
        config0 = search_alg.suggest("t0")
        points_to_evaluate = [config0]
        for model in space_model:
            if model != config0["subspace"]["model"]:
                point = config0.copy()
                point["subspace"] = point["subspace"].copy()
                point["subspace"]["model"] = model
                points_to_evaluate.append(point)
        search_alg = BlendSearch(
            cost_attr="cost",
            cost_budget=optimization_budget,
            metric=metric,
            mode=mode,
            space=space,
            points_to_evaluate=points_to_evaluate,
        )
    else:
        search_alg = BlendSearch(
            cost_attr="cost",
            cost_budget=optimization_budget,
            metric=metric,
            mode=mode,
            space=space,
        )
    old_level = logger.getEffectiveLevel()
    logger.setLevel(logging_level)
    with diskcache.Cache(cls.cache_path) as cls._cache:
        analysis = tune.run(
            cls._eval,
            search_alg=search_alg,
            num_samples=num_samples,
            log_file_name=log_file_name,
            verbose=3,
        )
    config = analysis.best_config
    params = cls._get_params_for_create(config)
    if cls._config_list is not None and is_const:
        params.pop("config_list")
    logger.setLevel(old_level)
    return params, analysis
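
For orientation, a minimal tuning sketch, assuming flaml is installed and an OpenAI key is configured; the toy dataset, the success_metrics evaluation function, the prompt template, and the budgets are illustrative, not part of the API:

```python
import autogen

# Toy data: each point provides the fields used by the prompt template and eval_func.
tune_data = [
    {"problem": "What is 2 + 2?", "solution": "4"},
    {"problem": "What is 3 * 5?", "solution": "15"},
]

def success_metrics(responses, problem, solution, **kwargs):
    # Count a data point as solved if any response contains the reference solution.
    return {"success": any(solution in response for response in responses)}

config, analysis = autogen.Completion.tune(
    data=tune_data,
    metric="success",
    mode="max",
    eval_func=success_metrics,
    inference_budget=0.02,    # dollars per instance
    optimization_budget=1,    # dollars in total
    num_samples=-1,           # let the budget decide the number of trials
    prompt="{problem} Answer with the number only.",
)
# `config` can then be passed to Completion.create or Completion.test via **config.
```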

create classmethod #

create(context=None, use_cache=True, config_list=None, filter_func=None, raise_on_ratelimit_or_timeout=True, allow_format_str_template=False, **config)

Make a completion for a given context.

PARAMETER DESCRIPTION
context

The context to instantiate the prompt. It needs to contain keys that are used by the prompt template or the filter function. E.g., prompt="Complete the following sentence: {prefix}", context={"prefix": "Today I feel"}. The actual prompt will be: "Complete the following sentence: Today I feel".

TYPE: (Dict, Optional) DEFAULT: None

use_cache

Whether to use cached responses.

TYPE: (bool, Optional) DEFAULT: True

config_list

List of configurations for the completion to try. The first one that does not raise an error will be used. Only the differences from the default config need to be provided. E.g.,

    response = oai.Completion.create(
        config_list = [
            {
                "model": "gpt-4",
                "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
                "api_type": "azure",
                "base_url": os.environ.get("AZURE_OPENAI_API_BASE"),
                "api_version": "2024-02-01",
            },
            {
                "model": "gpt-3.5-turbo",
                "api_key": os.environ.get("OPENAI_API_KEY"),
                "base_url": "https://api.openai.com/v1",
            },
            {
                "model": "llama-7B",
                "base_url": "http://127.0.0.1:8080",
            },
        ],
        prompt="Hi",
    )

TYPE: (List, Optional) DEFAULT: None

filter_func

A function that takes in the context and the response and returns a boolean to indicate whether the response is valid. E.g.,

    def yes_or_no_filter(context, config, response):
        return context.get("yes_or_no_choice", False) is False or any(
            text in ["Yes.", "No."] for text in oai.Completion.extract_text(response)
        )

TYPE: (Callable, Optional) DEFAULT: None

raise_on_ratelimit_or_timeout

Whether to raise RateLimitError or Timeout when all configs fail. When set to False, -1 will be returned when all configs fail.

TYPE: (bool, Optional) DEFAULT: True

allow_format_str_template

Whether to allow format string template in the config.

TYPE: (bool, Optional) DEFAULT: False

**config

Configuration for the openai API call. This is used as parameters for calling the openai API. The "prompt" or "messages" parameter can contain a template (str or Callable) which will be instantiated with the context. Besides the parameters for the openai API call, it can also contain:
- max_retry_period (int): the total time (in seconds) allowed for retrying failed requests.
- retry_wait_time (int): the time interval to wait (in seconds) before retrying a failed request.
- cache_seed (int) for the cache. This is useful when implementing "controlled randomness" for the completion.

DEFAULT: {}

RETURNS DESCRIPTION

Responses from OpenAI API, with additional fields:
- cost: the total cost.

When config_list is provided, the response will contain a few more fields:
- config_id: the index of the config in the config_list that is used to generate the response.
- pass_filter: whether the response passes the filter function. None if no filter is provided.

Source code in autogen/oai/completion.py
@classmethod
def create(
    cls,
    context: Optional[dict[str, Any]] = None,
    use_cache: Optional[bool] = True,
    config_list: Optional[list[dict[str, Any]]] = None,
    filter_func: Optional[Callable[[dict[str, Any], dict[str, Any]], bool]] = None,
    raise_on_ratelimit_or_timeout: Optional[bool] = True,
    allow_format_str_template: Optional[bool] = False,
    **config,
):
    """Make a completion for a given context.

    Args:
        context (Dict, Optional): The context to instantiate the prompt.
            It needs to contain keys that are used by the prompt template or the filter function.
            E.g., `prompt="Complete the following sentence: {prefix}, context={"prefix": "Today I feel"}`.
            The actual prompt will be:
            "Complete the following sentence: Today I feel".
        use_cache (bool, Optional): Whether to use cached responses.
        config_list (List, Optional): List of configurations for the completion to try.
            The first one that does not raise an error will be used.
            Only the differences from the default config need to be provided.
            E.g.,

            ```python
                response = oai.Completion.create(
                    config_list = [
                        {
                            "model": "gpt-4",
                            "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
                            "api_type": "azure",
                            "base_url": os.environ.get("AZURE_OPENAI_API_BASE"),
                            "api_version": "2024-02-01",
                        },
                        {
                            "model": "gpt-3.5-turbo",
                            "api_key": os.environ.get("OPENAI_API_KEY"),
                            "base_url": "https://api.openai.com/v1",
                        },
                        {
                            "model": "llama-7B",
                            "base_url": "http://127.0.0.1:8080",
                        },
                    ],
                    prompt="Hi",
                )
            ```

        filter_func (Callable, Optional): A function that takes in the context and the response and returns a boolean to indicate whether the response is valid. E.g.,

            ```python
                def yes_or_no_filter(context, config, response):
                    return context.get("yes_or_no_choice", False) is False or any(
                        text in ["Yes.", "No."] for text in oai.Completion.extract_text(response)
                    )
            ```

        raise_on_ratelimit_or_timeout (bool, Optional): Whether to raise RateLimitError or Timeout when all configs fail.
            When set to False, -1 will be returned when all configs fail.
        allow_format_str_template (bool, Optional): Whether to allow format string template in the config.
        **config: Configuration for the openai API call. This is used as parameters for calling openai API.
            The "prompt" or "messages" parameter can contain a template (str or Callable) which will be instantiated with the context.
            Besides the parameters for the openai API call, it can also contain:
            - `max_retry_period` (int): the total time (in seconds) allowed for retrying failed requests.
            - `retry_wait_time` (int): the time interval to wait (in seconds) before retrying a failed request.
            - `cache_seed` (int) for the cache. This is useful when implementing "controlled randomness" for the completion.

    Returns:
        Responses from OpenAI API, with additional fields.
            - `cost`: the total cost.
        When `config_list` is provided, the response will contain a few more fields:
            - `config_id`: the index of the config in the config_list that is used to generate the response.
            - `pass_filter`: whether the response passes the filter function. None if no filter is provided.
    """
    logger.warning(
        "Completion.create is deprecated in autogen, pyautogen v0.2 and openai>=1. "
        "The new openai requires initiating a client for inference. "
    )
    if ERROR:
        raise ERROR

    # Warn if a config list was provided but was empty
    if isinstance(config_list, list) and len(config_list) == 0:
        logger.warning(
            "Completion was provided with a config_list, but the list was empty. Adopting default OpenAI behavior, which reads from the 'model' parameter instead."
        )

    if config_list:
        last = len(config_list) - 1
        cost = 0
        for i, each_config in enumerate(config_list):
            base_config = config.copy()
            base_config["allow_format_str_template"] = allow_format_str_template
            base_config.update(each_config)
            if i < last and filter_func is None and "max_retry_period" not in base_config:
                # max_retry_period = 0 to avoid retrying when no filter is given
                base_config["max_retry_period"] = 0
            try:
                response = cls.create(
                    context,
                    use_cache,
                    raise_on_ratelimit_or_timeout=i < last or raise_on_ratelimit_or_timeout,
                    **base_config,
                )
                if response == -1:
                    return response
                pass_filter = filter_func is None or filter_func(context=context, response=response)
                if pass_filter or i == last:
                    response["cost"] = cost + response["cost"]
                    response["config_id"] = i
                    response["pass_filter"] = pass_filter
                    return response
                cost += response["cost"]
            except (AuthenticationError, RateLimitError, Timeout, BadRequestError):
                logger.debug(f"failed with config {i}", exc_info=1)
                if i == last:
                    raise
    params = cls._construct_params(context, config, allow_format_str_template=allow_format_str_template)
    if not use_cache:
        return cls._get_response(
            params, raise_on_ratelimit_or_timeout=raise_on_ratelimit_or_timeout, use_cache=False
        )
    cache_seed = cls.cache_seed
    if "cache_seed" in params:
        cls.set_cache(params.pop("cache_seed"))
    with diskcache.Cache(cls.cache_path) as cls._cache:
        cls.set_cache(cache_seed)
        return cls._get_response(params, raise_on_ratelimit_or_timeout=raise_on_ratelimit_or_timeout)
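
A minimal sketch of a templated call, assuming an OpenAI key is available in the environment; the prompt, context, and model choice are illustrative:

```python
import os

import autogen

response = autogen.Completion.create(
    context={"prefix": "Today I feel"},
    prompt="Complete the following sentence: {prefix}",
    allow_format_str_template=True,  # enable str.format-style templates
    config_list=[
        {"model": "gpt-3.5-turbo", "api_key": os.environ.get("OPENAI_API_KEY")},
    ],
)
print(autogen.Completion.extract_text(response)[0])
print(response["cost"], response["config_id"], response["pass_filter"])
```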

instantiate classmethod #

instantiate(template, context=None, allow_format_str_template=False)
Source code in autogen/oai/completion.py
@classmethod
def instantiate(
    cls,
    template: Union[str, None],
    context: Optional[dict[str, Any]] = None,
    allow_format_str_template: Optional[bool] = False,
):
    if not context or template is None:
        return template
    if isinstance(template, str):
        return template.format(**context) if allow_format_str_template else template
    return template(context)
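
Two small illustrative calls showing the behavior of the code above: string templates are formatted only when allow_format_str_template is True, while callable templates are always invoked with the context:

```python
import autogen

context = {"prefix": "Today I feel"}

# String template: formatted only because allow_format_str_template=True.
print(autogen.Completion.instantiate(
    "Complete the following sentence: {prefix}", context, allow_format_str_template=True
))
# -> Complete the following sentence: Today I feel

# Callable template: always called with the context dict.
print(autogen.Completion.instantiate(lambda ctx: ctx["prefix"] + " great.", context))
# -> Today I feel great.
```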

test classmethod #

test(data, eval_func=None, use_cache=True, agg_method='avg', return_responses_and_per_instance_result=False, logging_level=WARNING, **config)

Evaluate the responses created with the config for the OpenAI API call.

PARAMETER DESCRIPTION
data

The list of test data points.

TYPE: list

eval_func

The evaluation function for responses per data instance. The function should take a list of responses and a data point as input and return a dict of metrics. Either provide a valid callable eval_func, or leave it as None and call test after calling tune with an eval_func; in the latter case, the eval_func provided to tune will be used. Defaults to None.

TYPE: Callable DEFAULT: None

    def eval_func(responses, **data):
        solution = data["solution"]
        success_list = []
        n = len(responses)
        for i in range(n):
            response = responses[i]
            succeed = is_equiv_chain_of_thought(response, solution)
            success_list.append(succeed)
        return {
            "expected_success": 1 - pow(1 - sum(success_list) / n, n),
            "success": any(s for s in success_list),
        }

use_cache

Whether to use cached responses. Defaults to True.

TYPE: (bool, Optional) DEFAULT: True

agg_method

Result aggregation method (across multiple instances) for each of the metrics. Defaults to 'avg'. An example agg_method in str:

    agg_method = "median"

An example agg_method in a Callable:

    agg_method = np.median

An example agg_method in a dict of Callable:

    agg_method = {"median_success": np.median, "avg_success": np.mean}

TYPE: (str, Callable or a dict of Callable) DEFAULT: 'avg'

return_responses_and_per_instance_result

Whether to also return responses and per instance results in addition to the aggregated results.

TYPE: bool DEFAULT: False

logging_level

The logging level. Defaults to logging.WARNING.

DEFAULT: WARNING

**config

Parameters passed to the openai api call create().

DEFAULT: {}
RETURNS DESCRIPTION

None when no valid eval_func is provided in either test or tune;

Otherwise, a dict of aggregated results, responses and per instance results if return_responses_and_per_instance_result is True;

Otherwise, a dict of aggregated results (responses and per instance results are not returned).

Source code in autogen/oai/completion.py
@classmethod
@require_optional_import("numpy", "flaml")
def test(
    cls,
    data,
    eval_func=None,
    use_cache=True,
    agg_method="avg",
    return_responses_and_per_instance_result=False,
    logging_level=logging.WARNING,
    **config,
):
    """Evaluate the responses created with the config for the OpenAI API call.

    Args:
        data (list): The list of test data points.
        eval_func (Callable): The evaluation function for responses per data instance.
            The function should take a list of responses and a data point as input,
            and return a dict of metrics. You need to either provide a valid callable
            eval_func; or do not provide one (set None) but call the test function after
            calling the tune function in which a eval_func is provided.
            In the latter case we will use the eval_func provided via tune function.
            Defaults to None.

    ```python
    def eval_func(responses, **data):
        solution = data["solution"]
        success_list = []
        n = len(responses)
        for i in range(n):
            response = responses[i]
            succeed = is_equiv_chain_of_thought(response, solution)
            success_list.append(succeed)
        return {
            "expected_success": 1 - pow(1 - sum(success_list) / n, n),
            "success": any(s for s in success_list),
        }
    ```
        use_cache (bool, Optional): Whether to use cached responses. Defaults to True.
        agg_method (str, Callable or a dict of Callable): Result aggregation method (across
            multiple instances) for each of the metrics. Defaults to 'avg'.
            An example agg_method in str:

    ```python
    agg_method = "median"
    ```
            An example agg_method in a Callable:

    ```python
    agg_method = np.median
    ```

            An example agg_method in a dict of Callable:

    ```python
    agg_method = {"median_success": np.median, "avg_success": np.mean}
    ```

        return_responses_and_per_instance_result (bool): Whether to also return responses
            and per instance results in addition to the aggregated results.
        logging_level (optional): logging level. Defaults to logging.WARNING.
        **config (dict): parameters passed to the openai api call `create()`.

    Returns:
        None when no valid eval_func is provided in either test or tune;
        Otherwise, a dict of aggregated results, responses and per instance results if `return_responses_and_per_instance_result` is True;
        Otherwise, a dict of aggregated results (responses and per instance results are not returned).
    """
    result_agg, responses_list, result_list = {}, [], []
    metric_keys = None
    cost = 0
    old_level = logger.getEffectiveLevel()
    logger.setLevel(logging_level)
    for i, data_i in enumerate(data):
        logger.info(f"evaluating data instance {i}")
        response = cls.create(data_i, use_cache, **config)
        cost += response["cost"]
        # evaluate the quality of the responses
        responses = cls.extract_text_or_function_call(response)
        if eval_func is not None:
            metrics = eval_func(responses, **data_i)
        elif hasattr(cls, "_eval_func"):
            metrics = cls._eval_func(responses, **data_i)
        else:
            logger.warning(
                "Please either provide a valid eval_func or do the test after the tune function is called."
            )
            return
        if not metric_keys:
            metric_keys = []
            for k in metrics:
                try:
                    _ = float(metrics[k])
                    metric_keys.append(k)
                except ValueError:
                    pass
        result_list.append(metrics)
        if return_responses_and_per_instance_result:
            responses_list.append(responses)
    if isinstance(agg_method, str):
        if agg_method in ["avg", "average"]:
            for key in metric_keys:
                result_agg[key] = np.mean([r[key] for r in result_list])
        elif agg_method == "median":
            for key in metric_keys:
                result_agg[key] = np.median([r[key] for r in result_list])
        else:
            logger.warning(
                f"Aggregation method {agg_method} not supported. Please write your own aggregation method as a callable(s)."
            )
    elif callable(agg_method):
        for key in metric_keys:
            result_agg[key] = agg_method([r[key] for r in result_list])
    elif isinstance(agg_method, dict):
        for key in metric_keys:
            metric_agg_method = agg_method[key]
            if not callable(metric_agg_method):
                error_msg = "please provide a callable for each metric"
                logger.error(error_msg)
                raise AssertionError(error_msg)
            result_agg[key] = metric_agg_method([r[key] for r in result_list])
    else:
        raise ValueError(
            "agg_method needs to be a string ('avg' or 'median'),\
            or a callable, or a dictionary of callable."
        )
    logger.setLevel(old_level)
    # should we also return the result_list and responses_list or not?
    if "cost" not in result_agg:
        result_agg["cost"] = cost
    if "inference_cost" not in result_agg:
        result_agg["inference_cost"] = cost / len(data)
    if return_responses_and_per_instance_result:
        return result_agg, result_list, responses_list
    else:
        return result_agg
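
A minimal evaluation sketch; it assumes Completion.tune was called earlier (so eval_func=None falls back to the tuned eval_func) and that `config` holds the hyperparameter setting returned by tune. The held-out data below is illustrative:

```python
import autogen

test_data = [
    {"problem": "What is 7 + 8?", "solution": "15"},
]

result = autogen.Completion.test(
    test_data,
    eval_func=None,      # reuse the eval_func provided to tune()
    agg_method="avg",
    **config,            # the hyperparameter setting returned by tune()
)
print(result)            # aggregated metrics plus "cost" and "inference_cost"
```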

cost classmethod #

cost(response)

Compute the cost of an API call.

PARAMETER DESCRIPTION
response

The response from OpenAI API.

TYPE: dict

RETURNS DESCRIPTION

The cost in USD. 0 if the model is not supported.

Source code in autogen/oai/completion.py
@classmethod
def cost(cls, response: dict):
    """Compute the cost of an API call.

    Args:
        response (dict): The response from OpenAI API.

    Returns:
        The cost in USD. 0 if the model is not supported.
    """
    model = response.get("model")
    if model not in cls.price1K:
        return 0
        # raise ValueError(f"Unknown model: {model}")
    usage = response["usage"]
    n_input_tokens = usage["prompt_tokens"]
    n_output_tokens = usage.get("completion_tokens", 0)
    price1K = cls.price1K[model]  # noqa: N806
    if isinstance(price1K, tuple):
        return (price1K[0] * n_input_tokens + price1K[1] * n_output_tokens) / 1000
    return price1K * (n_input_tokens + n_output_tokens) / 1000
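
To make the arithmetic concrete, a mock response dict (not a real API payload) priced with the gpt-4 entry of price1K, which is a (prompt, completion) tuple of (0.03, 0.06) USD per 1K tokens:

```python
import autogen

mock_response = {
    "model": "gpt-4",
    "usage": {"prompt_tokens": 1000, "completion_tokens": 500},
}
# (0.03 * 1000 + 0.06 * 500) / 1000 = 0.06
print(autogen.Completion.cost(mock_response))  # 0.06
```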

extract_text classmethod #

extract_text(response)

Extract the text from a completion or chat response.

PARAMETER DESCRIPTION
response

The response from OpenAI API.

TYPE: dict

RETURNS DESCRIPTION
list[str]

A list of text in the responses.

Source code in autogen/oai/completion.py
@classmethod
def extract_text(cls, response: dict) -> list[str]:
    """Extract the text from a completion or chat response.

    Args:
        response (dict): The response from OpenAI API.

    Returns:
        A list of text in the responses.
    """
    choices = response["choices"]
    if "text" in choices[0]:
        return [choice["text"] for choice in choices]
    return [choice["message"].get("content", "") for choice in choices]

extract_text_or_function_call classmethod #

extract_text_or_function_call(response)

Extract the text or function calls from a completion or chat response.

PARAMETER DESCRIPTION
response

The response from OpenAI API.

TYPE: dict

RETURNS DESCRIPTION
list[str]

A list of text or function calls in the responses.

Source code in autogen/oai/completion.py
@classmethod
def extract_text_or_function_call(cls, response: dict) -> list[str]:
    """Extract the text or function calls from a completion or chat response.

    Args:
        response (dict): The response from OpenAI API.

    Returns:
        A list of text or function calls in the responses.
    """
    choices = response["choices"]
    if "text" in choices[0]:
        return [choice["text"] for choice in choices]
    return [
        choice["message"] if "function_call" in choice["message"] else choice["message"].get("content", "")
        for choice in choices
    ]
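
A mock chat response with a function call (trimmed to the fields this method reads); when a message carries function_call, the whole message dict is returned instead of its text content:

```python
import autogen

fc_response = {
    "choices": [
        {
            "message": {
                "role": "assistant",
                "content": None,
                "function_call": {"name": "add", "arguments": '{"a": 1, "b": 1}'},
            }
        }
    ]
}
print(autogen.Completion.extract_text_or_function_call(fc_response))
# [{'role': 'assistant', 'content': None, 'function_call': {...}}]
```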

print_usage_summary classmethod #

print_usage_summary()

Return the usage summary.

Source code in autogen/oai/completion.py
@classmethod
def print_usage_summary(cls) -> dict:
    """Return the usage summary."""
    if cls._history_dict is None:
        print("No usage summary available.", flush=True)
        return  # nothing has been logged; avoid iterating over a missing history dict

    token_count_summary = defaultdict(lambda: {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0})

    if not cls._history_compact:
        source = cls._history_dict.values()
        total_cost = sum(msg_pair["response"]["cost"] for msg_pair in source)
    else:
        # source = cls._history_dict["token_count"]
        # total_cost = sum(cls._history_dict['cost'])
        total_cost = sum(sum(value_list["cost"]) for value_list in cls._history_dict.values())
        source = (
            token_data for value_list in cls._history_dict.values() for token_data in value_list["token_count"]
        )

    for entry in source:
        if not cls._history_compact:
            model = entry["response"]["model"]
            token_data = entry["response"]["usage"]
        else:
            model = entry["model"]
            token_data = entry

        token_count_summary[model]["prompt_tokens"] += token_data["prompt_tokens"]
        token_count_summary[model]["completion_tokens"] += token_data["completion_tokens"]
        token_count_summary[model]["total_tokens"] += token_data["total_tokens"]

    print(f"Total cost: {total_cost}", flush=True)
    for model, counts in token_count_summary.items():
        print(
            f"Token count summary for model {model}: prompt_tokens: {counts['prompt_tokens']}, completion_tokens: {counts['completion_tokens']}, total_tokens: {counts['total_tokens']}",
            flush=True,
        )

start_logging classmethod #

start_logging(history_dict=None, compact=True, reset_counter=True)

Start bookkeeping.

PARAMETER DESCRIPTION
history_dict

A dictionary for bookkeeping. If not provided, a new one will be created.

TYPE: Dict DEFAULT: None

compact

Whether to keep the history dictionary compact. Compact history contains one key per conversation, and the value is a dictionary like:

TYPE: bool DEFAULT: True

    {
        "create_at": [0, 1],
        "cost": [0.1, 0.2],
    }

where "created_at" is the index of API calls indicating the order of all the calls, and "cost" is the cost of each call. This example shows that the conversation is based on two API calls. The compact format is useful for condensing the history of a conversation. If compact is False, the history dictionary will contain all the API calls: the key is the index of the API call, and the value is a dictionary like:

    {
        "request": request_dict,
        "response": response_dict,
    }

where request_dict is the request sent to the OpenAI API, and response_dict is the response. For a conversation containing two API calls, the non-compact history dictionary will be like:

    {
        0: {
            "request": request_dict_0,
            "response": response_dict_0,
        },
        1: {
            "request": request_dict_1,
            "response": response_dict_1,
        },
    }

The first request's messages plus the response equal the second request's messages. For a conversation with many turns, the non-compact history dictionary has a quadratic size while the compact history dict has a linear size.

reset_counter

Whether to reset the counter of the number of API calls.

TYPE: bool DEFAULT: True

Source code in autogen/oai/completion.py
@classmethod
def start_logging(
    cls,
    history_dict: Optional[dict[str, Any]] = None,
    compact: Optional[bool] = True,
    reset_counter: Optional[bool] = True,
):
    """Start book keeping.

    Args:
        history_dict (Dict): A dictionary for book keeping.
            If no provided, a new one will be created.
        compact (bool): Whether to keep the history dictionary compact.
            Compact history contains one key per conversation, and the value is a dictionary
            like:
    ```python
    {
        "create_at": [0, 1],
        "cost": [0.1, 0.2],
    }
    ```
            where "created_at" is the index of API calls indicating the order of all the calls,
            and "cost" is the cost of each call. This example shows that the conversation is based
            on two API calls. The compact format is useful for condensing the history of a conversation.
            If compact is False, the history dictionary will contain all the API calls: the key
            is the index of the API call, and the value is a dictionary like:
    ```python
    {
        "request": request_dict,
        "response": response_dict,
    }
    ```
            where request_dict is the request sent to OpenAI API, and response_dict is the response.
            For a conversation containing two API calls, the non-compact history dictionary will be like:
    ```python
    {
        0: {
            "request": request_dict_0,
            "response": response_dict_0,
        },
        1: {
            "request": request_dict_1,
            "response": response_dict_1,
        },
    ```
            The first request's messages plus the response is equal to the second request's messages.
            For a conversation with many turns, the non-compact history dictionary has a quadratic size
            while the compact history dict has a linear size.
        reset_counter (bool): whether to reset the counter of the number of API calls.
    """
    logger.warning(
        "logging via Completion.start_logging is deprecated in autogen and pyautogen v0.2. "
        "logging via OpenAIWrapper will be added back in a future release."
    )
    if ERROR:
        raise ERROR
    cls._history_dict = {} if history_dict is None else history_dict
    cls._history_compact = compact
    cls._count_create = 0 if reset_counter or cls._count_create is None else cls._count_create

stop_logging classmethod #

stop_logging()

End bookkeeping.

Source code in autogen/oai/completion.py
@classmethod
def stop_logging(cls):
    """End book keeping."""
    cls._history_dict = cls._count_create = None
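
A sketch of the overall bookkeeping workflow with this legacy API (the create calls are placeholders):

```python
import autogen

autogen.Completion.start_logging(compact=True)

# ... one or more autogen.Completion.create(...) calls go here ...

history = autogen.Completion.logged_history  # the bookkeeping dictionary
autogen.Completion.print_usage_summary()     # per-model token counts and total cost
autogen.Completion.stop_logging()
```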