minor fix
Signed-off-by: Cheng, Zixuan <[email protected]>
violetch24 committed Jun 14, 2024
commit 6b83c9eb36300a5e6de5fb7b0800e0839b2ea0f8
docs/3x/PT_StaticQuant.md (2 changes: 1 addition & 1 deletion)
@@ -68,7 +68,7 @@ q_model = convert(prepared_model)

#### Model Examples

Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant) on how to quantize a new model.
Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex) on how to quantize a new model.
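For orientation, the sketch below shows the prepare/calibrate/convert flow those examples follow, using the 3.x `StaticQuantConfig`/`prepare`/`convert` API referenced in the hunk above; the toy model, shapes, and calibration loop are illustrative only, and the IPEX backend additionally requires Intel® Extension for PyTorch to be installed.

```python
import torch
from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert

# Illustrative toy model and example input; replace with a real model and tokenized batch.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
example_inputs = torch.randn(4, 8)

# Insert observers according to the static quantization config.
quant_config = StaticQuantConfig()
prepared_model = prepare(model, quant_config=quant_config, example_inputs=example_inputs)

# Calibration: feed a few representative batches so the observers collect statistics.
for _ in range(10):
    prepared_model(torch.randn(4, 8))

# Produce the quantized model.
q_model = convert(prepared_model)
```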


### Static Quantization with PT2E Backend
@@ -59,8 +59,10 @@ function run_benchmark {

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" --accuracy "
extra_cmd=$extra_cmd" --load"
elif [[ ${mode} == "performance" ]]; then
mode_cmd=" --performance --iters "${iters}
extra_cmd=$extra_cmd" --load"
else
echo "Error: No such mode: ${mode}"
exit 1
@@ -37,6 +37,7 @@
help="Select from ['dynamic', 'static', 'weight-only']")
parser.add_argument("--int8", action="store_true")
parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.")
parser.add_argument("--load", action="store_true", help="Load quantized model.")
parser.add_argument("--accuracy", action="store_true")
parser.add_argument("--performance", action="store_true")
parser.add_argument("--iters", default=100, type=int,
@@ -176,14 +177,19 @@ def get_user_model():
from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device
from tqdm import tqdm
def run_fn(model):
    for batch in tqdm(calib_dataloader):
    calib_iter = 0
    for batch in tqdm(calib_dataloader, total=args.calib_iters):
        batch = move_input_to_device(batch, device=None)
        if isinstance(batch, tuple) or isinstance(batch, list):
            model(batch[0])
        elif isinstance(batch, dict):
            model(**batch)
        else:
            model(batch)

        calib_iter += 1
        if calib_iter >= args.calib_iters:
            break
    return

from utils import get_example_inputs
@@ -196,16 +202,17 @@ run_fn(model):
user_model.save(args.output_dir)


# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
if args.int8 or args.int8_bf16_mixed:
    print("load int8 model")
    from neural_compressor.torch.quantization import load
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    config = AutoConfig.from_pretrained(args.model)
    user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
    setattr(user_model, "config", config)
else:
    user_model, tokenizer = get_user_model()
if args.load:
    # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
    if args.int8 or args.int8_bf16_mixed:
        print("load int8 model")
        from neural_compressor.torch.quantization import load
        tokenizer = AutoTokenizer.from_pretrained(args.model)
        config = AutoConfig.from_pretrained(args.model)
        user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
        setattr(user_model, "config", config)
    else:
        user_model, tokenizer = get_user_model()


if args.accuracy:
@@ -0,0 +1,57 @@
Step-by-Step
============
This document provides step-by-step instructions for running large language models (LLMs) with static quantization on the 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids), using PyTorch and Intel® Extension for PyTorch.

The script `run_clm_no_trainer.py` currently supports quantization of `GPTJ`, `OPT`, `LLaMA2`, `BLOOM`, and `Falcon`, and validates last-word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); more models are being added.

# Prerequisite
## 1. Create Environment
```
# Installation
pip install -r requirements.txt
```

# Run

Here is how to run the scripts:

**Causal Language Modeling (CLM)**

`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration, and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag`, and other datasets provided by lm_eval. Example commands are shown below.
### GPT-J-6b

#### Quantization
```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--alpha 1.0 \
--ipex \
--output_dir "saved_results"
```

### OPT-125m

#### Quantization

```bash
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--quantize \
--alpha 0.5 \
--ipex \
--output_dir "saved_results"
```

### LLAMA2-7b/13b/70b
>Note: LLaMA requires IPEX >= 2.1 for better accuracy.
#### Quantization

```bash
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--quantize \
--alpha 0.8 \
--ipex \
--output_dir "saved_results"
```
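
#### Benchmarking a saved model

After quantization, the saved INT8 model can be reloaded for accuracy evaluation with the `--load` flag; a minimal sketch using the GPT-J example above (the exact flag combination is illustrative):

```bash
python run_clm_no_trainer.py \
    --model EleutherAI/gpt-j-6B \
    --load \
    --int8 \
    --ipex \
    --accuracy \
    --output_dir "saved_results"
```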
@@ -0,0 +1,13 @@
accelerate
protobuf
sentencepiece != 0.1.92
datasets >= 1.1.3
torch >= 1.10
transformers
pytest
wandb
einops
neural-compressor
intel-extension-for-transformers
lm_eval==0.4.2
peft
@@ -0,0 +1,96 @@
#!/bin/bash
set -x

function main {

init_params "$@"
run_benchmark

}

# init params
function init_params {
iters=100
batch_size=16
approach=static
tuned_checkpoint=saved_results
task=lambada_openai
echo ${max_eval_samples}
for var in "$@"
do
case $var in
--topology=*)
topology=$(echo $var |cut -f2 -d=)
;;
--dataset_location=*)
dataset_location=$(echo $var |cut -f2 -d=)
;;
--input_model=*)
input_model=$(echo $var |cut -f2 -d=)
;;
--mode=*)
mode=$(echo $var |cut -f2 -d=)
;;
--batch_size=*)
batch_size=$(echo $var |cut -f2 -d=)
;;
--iters=*)
iters=$(echo ${var} |cut -f2 -d=)
;;
--int8=*)
int8=$(echo ${var} |cut -f2 -d=)
;;
--config=*)
tuned_checkpoint=$(echo $var |cut -f2 -d=)
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
;;
esac
done

}


# run_benchmark
function run_benchmark {
extra_cmd=''

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" --accuracy "
extra_cmd=$extra_cmd" --load"
elif [[ ${mode} == "performance" ]]; then
mode_cmd=" --performance --iters "${iters}
extra_cmd=$extra_cmd" --load"
else
echo "Error: No such mode: ${mode}"
exit 1
fi

if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
fi
echo $extra_cmd

if [ "${topology}" = "opt_125m_ipex" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "llama2_7b_ipex" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "gpt_j_ipex" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --ipex"
fi

python -u run_clm_no_trainer.py \
--model ${model_name_or_path} \
--approach ${approach} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
}

main "$@"
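
A typical invocation of this benchmark script might look like the following; the topology and mode values come from the case branches above, and the checkpoint directory is illustrative:

```bash
# Accuracy evaluation of the quantized GPT-J model saved under saved_results/.
bash run_benchmark.sh --topology=gpt_j_ipex --mode=accuracy --int8=true --config=saved_results

# Performance measurement with 100 iterations and batch size 16.
bash run_benchmark.sh --topology=gpt_j_ipex --mode=performance --iters=100 --batch_size=16 --int8=true --config=saved_results
```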