import argparse
import json
import pandas as pd
import os
import re
import ast
import time
import concurrent.futures
import tqdm
import random
import threading

LOCK = threading.RLock()

## Configs
SYSTEM_PROMPT = "Your task is to evaluate how well the following input prompts can assess the capabilities of advanced AI assistants.\n\nFor the input prompt, please analyze it based on the following 7 criteria.\n1. Specificity: Does the prompt ask for a specific output, such as code, a mathematical solution, a logical simplification, a problem-solving strategy, or a hardware setup recommendation? This specificity allows the AI to demonstrate its ability to understand and generate precise responses.\n2. Domain Knowledge: Does the prompt cover a specific domain, such as programming, mathematics, logic, problem-solving, or hardware setup? Prompts spanning a range of topics test the AI's breadth of knowledge and its ability to apply that knowledge to different domains.\n3. Complexity: Does the prompt vary in complexity, from straightforward tasks to more complex, multi-step problems? This allows evaluators to assess the AI's capability to handle problems of varying difficulty.\n4. Problem-Solving Skills: Does the prompt directly require the AI to demonstrate active problem-solving skills, such as systematically coming up with a solution for a specific setup instead of regurgitating an existing fact? This tests the AI's ability to apply logical reasoning and provide practical solutions.\n5. Creativity: Does the prompt involve a level of creativity in approaching the problem? This criterion tests the AI's ability to provide tailored solutions that take into account the user's specific needs and limitations.\n6. Technical Accuracy: Does the prompt require technical accuracy in the response? This allows evaluators to assess the AI's precision and correctness in technical fields.\n7. Real-world Application: Does the prompt relate to real-world applications, such as setting up a functional system or writing code for a practical use case? This tests the AI's ability to provide practical and actionable information that could be implemented in real-life scenarios.\n\nYou must list the criteria numbers that the prompt satisfies in the format of a Python array. For example, \"[...]\". Do not explain your choice."

ENDPOINT_INFO = {
    "model_name": "META-LLAMA/LLAMA-3-70B-CHAT-HF",
    "name": "llama-3-70b-instruct",
    "endpoints": [{"api_base": "-", "api_key": "-"}],
    "parallel": 8,
    "temperature": 0.0,
    "max_token": 512,
}  # Modify this

TAGS = {
    1: "specificity",
    2: "domain_knowledge",
    3: "complexity",
    4: "problem_solving",
    5: "creativity",
    6: "technical_accuracy",
    7: "real_world",
}

# API setting constants
API_MAX_RETRY = 3
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"


def get_endpoint(endpoint_list):
    if endpoint_list is None:
        return None
    # randomly pick one endpoint
    return random.choice(endpoint_list)


pattern = re.compile(r"(\[\d(?:\,\s\d)*\])")
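# The regex above captures a bracketed list of single digits separated by
# ", " in the judge's reply, e.g. "[1, 3, 7]". It only handles the one-digit
# criteria IDs 1-7 used in SYSTEM_PROMPT.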


def get_score(judgment):
    # Keep only unanimous matches: if the judgment contains zero or several
    # distinct arrays, return no criteria.
    matches = pattern.findall(judgment)
    matches = [m for m in matches if m != ""]
    if len(set(matches)) == 0:
        return []
    elif len(set(matches)) == 1:
        try:
            return ast.literal_eval(matches[0])
        except (SyntaxError, ValueError):
            print(matches[0])
            return []
    else:
        return []
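# Usage sketch (hypothetical judge outputs):
#   get_score("The prompt satisfies criteria [1, 3, 7].")  # -> [1, 3, 7]
#   get_score("It satisfies [1, 2] or maybe [1, 3].")      # -> [] (ambiguous)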


def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
    import openai

    if api_dict:
        client = openai.OpenAI(
            base_url=api_dict["api_base"],
            api_key=api_dict["api_key"],
        )
    else:
        client = openai.OpenAI()

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = completion.choices[0].message.content
            break
        except openai.RateLimitError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.BadRequestError as e:
            print(messages)
            print(type(e), e)
            break
        except openai.APIConnectionError as e:
            print(messages)
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.InternalServerError as e:
            print(messages)
            print(type(e), e)
            time.sleep(1)
        except KeyError as e:
            print(type(e), e)
            break

    return output
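# Usage sketch (assumes a reachable OpenAI-compatible endpoint; the api_dict
# keys "api_base"/"api_key" follow ENDPOINT_INFO above):
#   reply = chat_completion_openai(
#       ENDPOINT_INFO["model_name"],
#       [{"role": "user", "content": "Hello"}],
#       temperature=0.0,
#       max_tokens=64,
#       api_dict=get_endpoint(ENDPOINT_INFO["endpoints"]),
#   )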


def get_answer(
    question: pd.Series,
    max_tokens: int,
    temperature: float,
    answer_file: str,
    api_dict: dict,
):
    conv = []
    conv.append({"role": "system", "content": SYSTEM_PROMPT})
    conv.append({"role": "user", "content": question["prompt"]})

    output = chat_completion_openai(
        model=ENDPOINT_INFO["model_name"],
        messages=conv,
        temperature=temperature,
        max_tokens=max_tokens,
        api_dict=api_dict,
    )

    criteria = get_score(output)

    # Dump answers
    question["criteria_tag"] = {name: i in criteria for i, name in TAGS.items()}
    # Series.drop returns a new object; reassign so "prompt" is actually removed
    question = question.drop("prompt")

    with LOCK:
        with open(answer_file, "a") as fout:
            fout.write(json.dumps(question.to_dict()) + "\n")
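# Each appended JSONL line has roughly this shape (field names besides
# "criteria_tag" depend on the input file):
#   {"question_id": ..., "criteria_tag": {"specificity": true, ..., "real_world": false}, ...}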


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--cache-file", type=str, default=None)
    parser.add_argument("--output-file", type=str, required=True)
    parser.add_argument("--convert-to-json", action="store_true")
    args = parser.parse_args()

    print("loading input data (this might take a minute)")
    input_data = pd.read_json(args.input_file)
    print(f"{len(input_data)} rows of input data loaded")
    if args.cache_file:
        print("loading cache data")
        cache_data = pd.read_json(args.cache_file)
        print(f"{len(cache_data)} rows of cache data loaded")

        assert "criteria_tag" in cache_data.columns and len(
            cache_data["criteria_tag"].dropna()
        ) == len(cache_data)

        not_labeled = input_data[
            ~input_data["question_id"].isin(cache_data["question_id"])
        ].copy()
    else:
        not_labeled = input_data.copy()

    if os.path.isfile(args.output_file):
        print("loading existing output")
        output_data = pd.read_json(args.output_file, lines=True)
        print(f"{len(output_data)} rows of existing output loaded")

        assert "criteria_tag" in output_data.columns and len(
            output_data["criteria_tag"].dropna()
        ) == len(output_data)

        not_labeled = not_labeled[
            ~not_labeled["question_id"].isin(output_data["question_id"])
        ]

    print(f"{len(not_labeled)} rows need to be labeled")

    # conversation_a alternates user/assistant turns, so the even indices
    # (0, 2, 4, ...) are the user messages; join them into a single prompt.
    not_labeled["prompt"] = not_labeled.conversation_a.map(
        lambda convo: "\n".join([convo[i]["content"] for i in range(0, len(convo), 2)])
    )
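    # Example (hypothetical row): conversation_a =
    #   [{"role": "user", "content": "Q1"}, {"role": "assistant", "content": "A1"},
    #    {"role": "user", "content": "Q2"}, {"role": "assistant", "content": "A2"}]
    # yields prompt = "Q1\nQ2".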

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=ENDPOINT_INFO["parallel"]
    ) as executor:
        futures = []
        for index, row in tqdm.tqdm(not_labeled.iterrows(), total=len(not_labeled)):
            future = executor.submit(
                get_answer,
                row,
                ENDPOINT_INFO["max_token"],
                ENDPOINT_INFO["temperature"],
                args.output_file,
                get_endpoint(ENDPOINT_INFO["endpoints"]),
            )
            futures.append(future)
        for future in tqdm.tqdm(
            concurrent.futures.as_completed(futures), total=len(futures)
        ):
            future.result()

    if args.convert_to_json:
        # Strip the trailing "l" so e.g. "out.jsonl" is rewritten as "out.json"
        temp = pd.read_json(args.output_file, lines=True)
        temp.to_json(
            args.output_file[:-1], orient="records", indent=4, force_ascii=False
        )
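
# Invocation sketch (hypothetical file names; save this script as e.g. label_criteria.py):
#   python label_criteria.py --input-file battles.json --output-file labels.jsonl --convert-to-json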