Merged

30 commits
9054793
move prettytable into inc.common
xin3he Jun 24, 2024
fd510db
add benchmark
xin3he Jun 25, 2024
29f974c
support windows
xin3he Jun 26, 2024
dabe436
fix bug
xin3he Jun 26, 2024
4f7cb7c
enable subprocess running
xin3he Jun 26, 2024
4a3b6cd
fix bug in windows
xin3he Jun 26, 2024
3cc3885
enhance log
xin3he Jun 26, 2024
b3c1091
add document
xin3he Jun 26, 2024
ca1f3b6
update platform status
xin3he Jun 26, 2024
29ebf1a
add incbench dlrm example
xin3he Jun 27, 2024
5960bb7
add more docstring
xin3he Jun 27, 2024
4c15bda
add performance test for sq opt-125m
xin3he Jun 28, 2024
60340f2
enhance pre-commit for max-line-length check
xin3he Jun 28, 2024
5f02407
add Multiple Instance Benchmark Summary
xin3he Jun 28, 2024
cc014af
Dump Throughput and Latency Summary
xin3he Jun 28, 2024
c3de633
change log folder and add UTs
xin3he Jul 1, 2024
9757779
add requirement
xin3he Jul 1, 2024
6ca810f
Merge branch 'master' into xinhe/benchmark
xin3he Jul 2, 2024
8549e92
improve UT coverage
xin3he Jul 3, 2024
0f6e057
fix pylint
xin3he Jul 3, 2024
7f3aff5
remove previous useless code
xin3he Jul 8, 2024
eeb56f6
fix bug
xin3he Jul 8, 2024
24ec333
fix pylint
xin3he Jul 8, 2024
18ca594
fix bug
xin3he Jul 8, 2024
b55b22b
Merge branch 'master' into xinhe/benchmark
chensuyue Jul 9, 2024
a524d9c
update summary format per suyue's request
xin3he Jul 9, 2024
245c75a
fdsa
xin3he Jul 9, 2024
81687bd
revert pre-commit change
xin3he Jul 9, 2024
d681fc7
Merge branch 'master' into xinhe/benchmark
xin3he Jul 10, 2024
7e73d1a
update UT
xin3he Jul 10, 2024
add Multiple Instance Benchmark Summary
Signed-off-by: xin3he <[email protected]>
xin3he committed Jun 28, 2024
commit 5f02407e7004f73920edb7404e4b81ef060a99ee
@@ -75,13 +75,13 @@ function run_benchmark {

if [ "${topology}" = "opt_125m_ipex_sq" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "gpt_j_ipex_sq" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0"
extra_cmd=$extra_cmd" --ipex"
fi

if [[ ${mode} == "accuracy" ]]; then
@@ -96,9 +96,8 @@ function run_benchmark {
incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
--model ${model_name_or_path} \
--approach ${approach} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
--output_dir ${tuned_checkpoint} \
${extra_cmd} ${mode_cmd}
else
echo "Error: No such mode: ${mode}"
@@ -239,21 +239,21 @@ def run_fn(model):

if args.performance:
user_model.eval()
batch_size, input_leng = 1, 512
batch_size, input_leng = args.batch_size, 512
example_inputs = torch.ones((batch_size, input_leng), dtype=torch.long)
print("Batch size = {:d}".format(batch_size))
print("The length of input tokens = {:d}".format(input_leng))
import time

total_iters = 100
total_iters = args.iters
warmup_iters = 5
with torch.no_grad():
for i in range(total_iters):
if i == warmup_iters:
start = time.time()
user_model(example_inputs)
end = time.time()
latency = (end - start) / ((total_iters - warmup_iters))
throughput = ((total_iters - warmup_iters)) / (end - start)
latency = (end - start) / ((total_iters - warmup_iters) * args.batch_size)
throughput = ((total_iters - warmup_iters) * args.batch_size) / (end - start)
print("Latency: {:.3f} ms".format(latency * 10**3))
print("Throughput: {:.3f} samples/sec".format(throughput))
81 changes: 75 additions & 6 deletions neural_compressor/common/benchmark.py
@@ -268,29 +268,47 @@ def set_cores_for_instance(args, numa_info):
target_cores = args.num_instances * args.num_cores_per_instance
assert target_cores <= len(
available_cores_list
), "num_instances * num_cores_per_instance = {} exceeds the range of physical CPUs:{}".format(
), "num_instances * num_cores_per_instance = {} exceeds the range of physical CPUs:{}.".format(
target_cores, len(available_cores_list)
)
cores_list = list(range(target_cores))
# log for cores in use
logger.info("num_instances * num_cores_per_instance = {} cores are used.".format(target_cores))
else:
# default behavior, only use numa:0
cores_list = numa_info[0]
# log for cores in use
logger.info("By default, Intel Neural Compressor uses all cores on numa:0.")
else:
cores_list = parse_str2list(args.cores)
# log for cores available
logger.info("{} cores are available.".format(len(cores_list)))
if args.num_cores_per_instance and args.num_instances:
target_cores = args.num_instances * args.num_cores_per_instance
assert target_cores <= len(
cores_list
), "num_instances * num_cores_per_instance = {} exceeds the range of available CPUs:{}".format(
), "num_instances * num_cores_per_instance = {} exceeds the range of available CPUs:{}.".format(
target_cores, len(cores_list)
)
cores_list = cores_list[:target_cores]

# preprocess args.num_instances to set default values
if args.num_instances is None:
if args.num_cores_per_instance:
args.num_instances = len(cores_list) // args.num_cores_per_instance
else:
args.num_instances = 1
logger.info("By default, Intel Neural Compressor triggers only one instance.")

### log for instances number and cores in use
if args.num_instances == 1:
logger.info("1 instance is triggered.", highlight=True)
else:
logger.info("{} instances are triggered.".format(args.num_instances), highlight=True)
if len(cores_list) == 1:
logger.info("Only 1 core is in use.", highlight=True)
else:
logger.info("{} cores are in use.".format(len(cores_list)), highlight=True)

# only need to process num_cores_per_instance now
core_list_per_instance = {}
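To make the defaulting above concrete, here is a rough sketch of how the instance count and per-instance core groups could fall out. `split_cores` is a hypothetical helper, not the function in benchmark.py; the defaulting mirrors the hunk above, while the final slicing into groups is an assumption for illustration.

```python
def split_cores(cores_list, num_instances=None, num_cores_per_instance=None):
    """Sketch of the defaulting logic above; the slicing step is assumed."""
    if num_instances is None:
        if num_cores_per_instance:
            num_instances = len(cores_list) // num_cores_per_instance
        else:
            num_instances = 1  # default: a single instance
    if num_cores_per_instance is None:
        num_cores_per_instance = len(cores_list) // num_instances
    return {
        i: cores_list[i * num_cores_per_instance:(i + 1) * num_cores_per_instance]
        for i in range(num_instances)
    }

# e.g. 8 physical cores with 4 cores per instance -> 2 instances
print(split_cores(list(range(8)), num_cores_per_instance=4))
# {0: [0, 1, 2, 3], 1: [4, 5, 6, 7]}
```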
@@ -356,10 +374,12 @@ def run_multi_instance_command(args, core_list_per_instance, raw_cmd):
"""
instance_cmd = ""
if not os.getenv("PYTHON_PATH"): # pragma: no cover
logger.info("The interpreter path is not set, using `python` command.")
logger.info("The interpreter path is not set, using string `python` as command.")
logger.info("To replace it, use `export PYTHON_PATH=xxx`.")
interpreter = os.getenv("PYTHON_PATH", "python")
current_work_dir = os.getcwd()
logfile_process_map = {}
logfile_dict = {}
for i, core_list in core_list_per_instance.items():
# build cmd and log file path
prefix = generate_prefix(args, core_list)
@@ -373,6 +393,7 @@ def run_multi_instance_command(args, core_list_per_instance, raw_cmd):
) # nosec
# log_file_path: [process_object, instance_command, instance_index]
logfile_process_map[instance_log_file] = [p, instance_cmd, i + 1]
logfile_dict[i + 1] = instance_log_file

# Dump each instance's standard output to the corresponding log file
for instance_log_file, p_cmd_i in logfile_process_map.items():
@@ -384,12 +405,60 @@ def run_multi_instance_command(args, core_list_per_instance, raw_cmd):
logger.info(f"The log of instance {p_cmd_i[2]} is saved to {instance_log_file}")

p.communicate()
return logfile_dict


def summary_latency_throughput(logfile_dict):
"""Get the summary of the benchmark."""
throughput_pattern = r"[T,t]hroughput:\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z/]*)"
latency_pattern = r"[L,l]atency:\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z/]*)"

latency_list = []
throughput_list = []
latency_unit_name = ""
throughput_unit_name = ""
for idx, logfile in logfile_dict.items():
with open(logfile, "r") as f:
for line in f:
re_latency = re.search(latency_pattern, line)
re_throughput = re.search(throughput_pattern, line)
if re_latency:
latency_list.append(float(re_latency.group(1)))
if not latency_unit_name:
latency_unit_name = re_latency.group(2)
if re_throughput:
throughput_list.append(float(re_throughput.group(1)))
if not throughput_unit_name:
throughput_unit_name = re_throughput.group(2)
if throughput_list and latency_list:
assert (
len(latency_list) == len(throughput_list) == len(logfile_dict)
), "Multiple instance benchmark failed with some instances!"

# dump collected latency and throughput info
header = "Multiple Instance Benchmark Summary"
field_names = [
"Instance",
"Latency ({})".format(latency_unit_name),
"Throughput ({})".format(throughput_unit_name),
]
output_data = []
for idx, (latency, throughput) in enumerate(zip(latency_list, throughput_list)):
output_data.append([idx + 1, round(latency, 3), round(throughput, 3)])
output_data.append(
[
format_list2str(logfile_dict.keys()),
round(sum(latency_list) / len(latency_list), 3),
round(sum(throughput_list), 3),
]
)
Statistics(output_data, header=header, field_names=field_names).print_stat()


def benchmark():
"""Benchmark API interface."""
logger.info("Start benchmark with Intel Neural Compressor.")
logger.info("By default, Intel Neural Compressor triggers only one instance on numa:0.")
logger.info("Intel Neural Compressor only uses physical CPUs for the best performance.")

parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--num_instances", type=int, default=None, help="Determine the number of instances.")
@@ -409,7 +478,7 @@ def benchmark():
assert sys.platform in ["linux", "win32"], "only support platform windows and linux..."

numa_info = dump_numa_info() # show numa info and current usage of cores
logger.info("Intel Neural Compressor only uses physical CPUs for the best performance.")
core_list_per_instance = set_cores_for_instance(args, numa_info=numa_info)
script_and_parameters = args.script + " " + " ".join(args.parameters)
run_multi_instance_command(args, core_list_per_instance, raw_cmd=script_and_parameters)
logfile_dict = run_multi_instance_command(args, core_list_per_instance, raw_cmd=script_and_parameters)
summary_latency_throughput(logfile_dict)
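To see how the new summary step extracts numbers, here is a small, self-contained rerun of the regexes introduced in summary_latency_throughput on a made-up log excerpt; the log text below is illustrative only.

```python
import re

# Same patterns as in summary_latency_throughput above.
throughput_pattern = r"[T,t]hroughput:\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z/]*)"
latency_pattern = r"[L,l]atency:\s*([0-9]*\.?[0-9]+)\s*([a-zA-Z/]*)"

sample_log = """\
Latency: 12.345 ms
Throughput: 81.012 samples/sec
"""

for line in sample_log.splitlines():
    re_latency = re.search(latency_pattern, line)
    re_throughput = re.search(throughput_pattern, line)
    if re_latency:
        print("latency:", float(re_latency.group(1)), re_latency.group(2))
    if re_throughput:
        print("throughput:", float(re_throughput.group(1)), re_throughput.group(2))
```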
6 changes: 6 additions & 0 deletions neural_compressor/common/utils/logger.py
@@ -120,6 +120,12 @@ def fatal(msg, *args, **kwargs):
def info(msg, *args, **kwargs):
"""Output log with the info level."""
kwargs.setdefault("stacklevel", 2)
highlight = kwargs.pop("highlight", False)
if highlight:
RESET = "\033[0m"
BOLD = "\033[1m"
RED = "\033[91m"
msg = f"{BOLD}{RED}{msg}{RESET}"
if isinstance(msg, dict):
for _, line in enumerate(_pretty_dict(msg).split("\n")):
Logger().get_logger().info(line, *args, **kwargs)
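As a minimal sketch of the highlight path added to info() above: the message is wrapped in ANSI bold/red escape codes before being handed to the logger. The standalone `highlight` helper below is for illustration only.

```python
# ANSI escape codes, as used in the logger change above.
RESET = "\033[0m"
BOLD = "\033[1m"
RED = "\033[91m"

def highlight(msg: str) -> str:
    """Wrap a message in bold red ANSI codes."""
    return f"{BOLD}{RED}{msg}{RESET}"

# Renders bold red on ANSI-capable terminals; plain text elsewhere.
print(highlight("2 instances are triggered."))
```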