Merged
20 changes: 18 additions & 2 deletions README.md
@@ -23,9 +23,9 @@
## What is FLAML
FLAML is a lightweight Python library that finds accurate machine
learning models automatically, efficiently and economically. It frees users from selecting
- models and hyperparameters for each model. It can also be used to tune generic hyperparameters for large language models (LLM), MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on.
+ models and hyperparameters for each model. It can also be used to tune generic hyperparameters for foundation models, MLOps/LMOps workflows, pipelines, mathematical/statistical models, algorithms, computing experiments, software configurations and so on.

- 1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including large language models such as the OpenAI GPT-3 models.
+ 1. For common machine learning or AI tasks like classification, regression, and generation, it quickly finds quality models for user-provided data with low computational resources. It supports both classical machine learning models and deep neural networks, including foundation models such as the GPT series.
1. It is easy to customize or extend. Users can find their desired customizability from a smooth range: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), or full customization (arbitrary training and evaluation code).
1. It supports fast automatic tuning, capable of handling complex constraints/guidance/early stopping. FLAML is powered by a new, [cost-effective
hyperparameter optimization](https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function/#hyperparameter-optimization-algorithm)
@@ -95,6 +95,22 @@ estimator = LGBMRegressor()
estimator.fit(X_train, y_train)
```

* (New) You can optimize [generations](https://microsoft.github.io/FLAML/docs/Use-Cases/Auto-Generation) by ChatGPT or GPT-4 etc. with your own tuning data, success metrics and budgets.
Contributor

chatGPT and GPT-4 are not exclusive.


```python
from flaml import oai

config, analysis = oai.Completion.tune(
    data=tune_data,
    metric="success",
    mode="max",
    eval_func=success_metrics,
    inference_budget=0.05,
    optimization_budget=3,
    num_samples=-1,
)
```
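
The tuned `config` can then be applied to new instances. A minimal sketch, mirroring the `oai.Completion.create` call pattern used in the new `code_utils.py` below; the `test_instance` dict is a hypothetical example and should supply the fields referenced by the prompt template used during tuning:

```python
from flaml import oai

# Apply the best configuration found by tuning to a new instance.
response = oai.Completion.create(test_instance, **config)
print(oai.Completion.extract_text(response)[0])
```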

## Documentation

You can find detailed documentation about FLAML [here](https://microsoft.github.io/FLAML/), including the API reference, use cases, and examples.
2 changes: 1 addition & 1 deletion flaml/__init__.py
@@ -2,7 +2,7 @@
from flaml.automl import AutoML, logger_formatter
from flaml.tune.searcher import CFO, BlendSearch, FLOW2, BlendSearchTuner, RandomSearch
from flaml.onlineml.autovw import AutoVW
- from flaml.integrations import oai
+ from flaml.autogen import oai
from flaml.version import __version__


File renamed without changes.
166 changes: 166 additions & 0 deletions flaml/autogen/code_utils.py
@@ -0,0 +1,166 @@
import signal
import subprocess
import sys
from typing import List, Dict, Tuple, Optional, Union, Callable
from flaml import oai


def timeout_handler(signum, frame):
    raise TimeoutError("Timed out!")


def execute_code(code: str, max_exec_time: Optional[int] = 3):
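    """Write the code to codetest.py and execute it with the current interpreter.

    Uses a SIGALRM-based alarm (Unix-only): runs exceeding max_exec_time seconds
    return 0; otherwise returns 1 if the script exits with return code 0, else 0.
    """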
    signal.signal(signal.SIGALRM, timeout_handler)
    code = code.strip()
    with open("codetest.py", "w") as fout:
        fout.write(code)
    try:
        signal.alarm(max_exec_time)
        result = subprocess.run(
            [sys.executable, "codetest.py"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
        signal.alarm(0)
    except TimeoutError:
        return 0
    return int(result.returncode == 0)


def generate_assertions(
    definition: str, model: Optional[str] = "gpt-3.5-turbo"
) -> Tuple[str, float]:
    """Generate assertions for a function.

    Args:
        definition (str): The function definition, including the signature and docstr.
        model (str): The model used for generation.

    Returns:
        str: The generated assertions.
        float: The cost of the generation.
    """
    prompt = """Given the signature and docstring, write the exactly same number of assertion(s) for the provided example(s) in the docstring, without assertion messages.

func signature:
{definition}
assertions:"""
    response = oai.Completion.create(
        {"definition": definition},
        model=model,
        prompt=prompt,
        max_tokens=256,
        stop="\n\n",
    )
    cost = oai.Completion.cost(model, response)
    assertions = oai.Completion.extract_text(response)[0]
    return assertions, cost


def _remove_check(response):
"""Remove the check function from the response."""
# find the position of the check function
pos = response.find("def check(")
if pos == -1:
return response
return response[:pos]


def success_metrics(
    responses: List[str],
    definition: str,
    test: Optional[str] = None,
    entry_point: Optional[str] = None,
    assertions: Optional[Union[str, Callable[[str], Tuple[str, float]]]] = None,
) -> Dict:
    """Check if the task is successful.

    Args:
        responses (list): The list of responses.
        definition (str): The input definition.
        test (Optional, str): The test code.
        entry_point (Optional, str): The name of the function.
        assertions (Optional, str or Callable): The assertion code which serves as a filter of the responses, or an assertion generator.
            When provided, only the responses that pass the assertions will be considered for the actual test (if provided).

    Returns:
        dict: The success metrics.
    """
    n = len(responses)
    if assertions is None:
        # no assertion filter
        success_list = []
        for i in range(n):
            response = _remove_check(responses[i])
            code = (
                f"{response}\n{test}\ncheck({entry_point})"
                if response.startswith("def")
                else f"{definition}{response}\n{test}\ncheck({entry_point})"
            )
            success = execute_code(code)
            success_list.append(success)
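        # expected_success: chance that at least one of n sampled responses passes
        # the test, i.e., 1 - (1 - empirical success rate) ** n.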
        return {
            "expected_success": 1 - pow(1 - sum(success_list) / n, n),
            "success": any(s for s in success_list),
        }
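    # Assertions are provided (or generated): use them to select a response before the optional test.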
    if callable(assertions) and n > 1:
        # assertion generator
        assertions, gen_cost = assertions(definition)
    else:
        gen_cost = 0
    if n > 1 or test is None:
        for i in range(n):
            response = responses[i] = _remove_check(responses[i])
            code = (
                f"{response}\n{assertions}"
                if response.startswith("def")
                else f"{definition}{response}\n{assertions}"
            )
            succeed_assertions = execute_code(code)
            if succeed_assertions:
                break
    else:
        # just test, no need to check assertions
        succeed_assertions = False
        i, response = 0, responses[0]
    if test is None:
        # no test code
        return {
            "index_selected": i,
            "succeed_assertions": succeed_assertions,
            "gen_cost": gen_cost,
        }
    code_test = (
        f"{response}\n{test}\ncheck({entry_point})"
        if response.startswith("def")
        else f"{definition}{response}\n{test}\ncheck({entry_point})"
    )
    success = execute_code(code_test)
    return {
        "index_selected": i,
        "succeed_assertions": succeed_assertions,
        "success": success,
        "gen_cost": gen_cost,
    }


def implement(definition: str, configs: List[Dict]) -> Tuple[str, float, int]:
"""Implement a function.

Args:
definition (str): The function definition, including the signature and docstr.
configs (list): The list of configurations for completion.

Returns:
str: The implementation.
float: The cost of the implementation.
int: The index of the configuration which generates the implementation.
"""
    assertions, cost = generate_assertions(definition)
    for i, config in enumerate(configs):
        response = oai.Completion.create({"definition": definition}, **config)
        cost += oai.Completion.cost(config["model"], response)
        responses = oai.Completion.extract_text(response)
        metrics = success_metrics(responses, definition, assertions=assertions)
        if metrics["succeed_assertions"] or i == len(configs) - 1:
            return responses[metrics["index_selected"]], cost, i