modify 3.x ipex example structure #1858
Merged
Commits (17):
- 4fb8843 modify 3.x ipex example structure (violetch24)
- c1597cc Merge branch 'master' into zixuan/3x_ipex_example (violetch24)
- 28be72b add json path (violetch24)
- a1a0916 Merge branch 'master' into zixuan/3x_ipex_example (violetch24)
- dd7e71b Merge branch 'master' into zixuan/3x_ipex_example (violetch24)
- d3f9bee Merge branch 'master' into zixuan/3x_ipex_example (violetch24)
- 7236eb2 fix for sq (violetch24)
- 2cbf238 Merge branch 'master' into zixuan/3x_ipex_example (violetch24)
- 5b5ba7d minor fix (violetch24)
- c982739 Update run_clm_no_trainer.py (violetch24)
- 34282d0 Update run_clm_no_trainer.py (violetch24)
- 383b6a2 Update run_clm_no_trainer.py (violetch24)
- 5f4fecf Merge branch 'master' into zixuan/3x_ipex_example (xin3he)
- 6b83c9e minor fix (violetch24)
- 00fe9d9 remove old files (violetch24)
- 959170d fix act_algo (violetch24)
- 910d9e9 Merge branch 'master' into zixuan/3x_ipex_example (violetch24)
New file (46 additions): the run configuration JSON that registers the restructured example topologies.

```json
{
    "pytorch": {
        "gpt_j_ipex": {
            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
            "dataset_location": "",
            "input_model": "",
            "main_script": "run_clm_no_trainer.py",
            "batch_size": 1
        },
        "gpt_j_ipex_sq": {
            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
            "dataset_location": "",
            "input_model": "",
            "main_script": "run_clm_no_trainer.py",
            "batch_size": 1
        },
        "llama2_7b_ipex": {
            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
            "dataset_location": "",
            "input_model": "",
            "main_script": "run_clm_no_trainer.py",
            "batch_size": 1
        },
        "llama2_7b_ipex_sq": {
            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
            "dataset_location": "",
            "input_model": "",
            "main_script": "run_clm_no_trainer.py",
            "batch_size": 1
        },
        "opt_125m_ipex": {
            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
            "dataset_location": "",
            "input_model": "",
            "main_script": "run_clm_no_trainer.py",
            "batch_size": 8
        },
        "opt_125m_ipex_sq": {
            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
            "dataset_location": "",
            "input_model": "",
            "main_script": "run_clm_no_trainer.py",
            "batch_size": 8
        }
    }
}
```
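A config like this is typically consumed by a small driver that looks up a topology entry and launches its `main_script` with the recorded batch size. The sketch below is an illustrative assumption only; the `run_topology` helper and the `run_config.json` filename are hypothetical, not the repository's actual test harness:

```python
import json
import subprocess

def run_topology(config_path: str, topology: str, extra_args: list) -> None:
    """Look up a topology entry in the run config and launch its main script."""
    with open(config_path) as f:
        entries = json.load(f)["pytorch"]
    entry = entries[topology]
    script = f"{entry['model_src_dir']}/{entry['main_script']}"
    cmd = ["python", script, "--batch_size", str(entry["batch_size"])] + extra_args
    subprocess.run(cmd, check=True)

# Example: launch the OPT-125m smooth-quant entry (hypothetical config path)
# run_topology("run_config.json", "opt_125m_ipex_sq", ["--quantize", "--sq", "--ipex"])
```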
`...ch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md` (new file, 64 additions):
Step-by-Step
============
This document provides step-by-step instructions for running large language models (LLMs) with smooth quantization on the 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) using PyTorch and Intel® Extension for PyTorch.

The script `run_clm_no_trainer.py` supports quantization of `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon`, and validates last-word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); more models are being added.

# Prerequisite
## 1. Create Environment
```
# Installation
pip install -r requirements.txt
```

# Run

Here is how to run the scripts:

**Causal Language Modeling (CLM)**

`run_clm_no_trainer.py` quantizes large language models using [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) as the calibration dataset, and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets provided by lm_eval. Example commands follow.

### GPT-J-6b

#### Quantization
```bash
# "--sq" enables smooth quant
python run_clm_no_trainer.py \
  --model EleutherAI/gpt-j-6B \
  --quantize \
  --sq \
  --alpha 1.0 \
  --ipex \
  --output_dir "saved_results"
```
**Note**: Smooth quantization here is based on torch.jit. Without past key values in `example_inputs`, the quantized model cannot be used for text generation.
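To illustrate that note, here is a minimal sketch of what `example_inputs` carrying past key values can look like for GPT-J-6B (28 layers, 16 attention heads, 256-dim heads). The exact tuple layout must match what the quantization script actually traces, so treat this as an assumption rather than the script's own code:

```python
import torch

# GPT-J-6B dimensions: 28 layers, 16 attention heads, head_dim = 4096 / 16 = 256
num_layers, num_heads, head_dim = 28, 16, 256
batch_size, past_len = 1, 1

# One (key, value) pair per layer; zero tensors suffice for shape-only tracing
past_key_values = tuple(
    (
        torch.zeros(batch_size, num_heads, past_len, head_dim),
        torch.zeros(batch_size, num_heads, past_len, head_dim),
    )
    for _ in range(num_layers)
)
input_ids = torch.ones(batch_size, 1, dtype=torch.long)
attention_mask = torch.ones(batch_size, past_len + 1, dtype=torch.long)
example_inputs = (input_ids, attention_mask, past_key_values)
```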
### OPT-125m

#### Quantization

```bash
# "--sq" enables smooth quant
python run_clm_no_trainer.py \
  --model facebook/opt-125m \
  --quantize \
  --sq \
  --alpha 0.5 \
  --ipex \
  --output_dir "saved_results"
```

### LLAMA2-7b/13b/70b
>Note: LLaMA requires IPEX >= 2.1 for better accuracy.
#### Quantization

```bash
# "--sq" enables smooth quant
python run_clm_no_trainer.py \
  --model meta-llama/Llama-2-7b-hf \
  --quantize \
  --sq \
  --alpha 0.8 \
  --ipex \
  --output_dir "saved_results"
```
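After quantization, the saved model can be checked for accuracy. The README does not spell this step out; the command below is a plausible combination assembled from the flags `run_benchmark.sh` (further down) passes to the same script, so treat the exact flag set as an assumption:

```bash
# Plausible evaluation run (flags taken from run_benchmark.sh below)
python run_clm_no_trainer.py \
  --model meta-llama/Llama-2-7b-hf \
  --accuracy \
  --int8 \
  --ipex \
  --sq \
  --alpha 0.8 \
  --task lambada_openai \
  --output_dir "saved_results"
```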
`...torch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt` (new file, 13 additions):

```
accelerate
protobuf
sentencepiece != 0.1.92
datasets >= 1.1.3
torch >= 1.10
transformers
pytest
wandb
einops
neural-compressor
intel-extension-for-transformers
lm_eval==0.4.2
peft
```
`...torch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh` (new file, 94 additions):

```bash
#!/bin/bash
set -x

function main {
  init_params "$@"
  run_benchmark
}

# init params
function init_params {
  iters=100
  batch_size=16
  approach=static
  tuned_checkpoint=saved_results
  task=lambada_openai
  for var in "$@"
  do
    case $var in
      --topology=*)
          topology=$(echo $var |cut -f2 -d=)
      ;;
      --dataset_location=*)
          dataset_location=$(echo $var |cut -f2 -d=)
      ;;
      --input_model=*)
          input_model=$(echo $var |cut -f2 -d=)
      ;;
      --mode=*)
          mode=$(echo $var |cut -f2 -d=)
      ;;
      --batch_size=*)
          batch_size=$(echo $var |cut -f2 -d=)
      ;;
      --iters=*)
          iters=$(echo ${var} |cut -f2 -d=)
      ;;
      --int8=*)
          int8=$(echo ${var} |cut -f2 -d=)
      ;;
      --config=*)
          tuned_checkpoint=$(echo $var |cut -f2 -d=)
      ;;
      *)
          echo "Error: No such parameter: ${var}"
          exit 1
      ;;
    esac
  done
}

# run_benchmark
function run_benchmark {
  extra_cmd=''

  if [[ ${mode} == "accuracy" ]]; then
    mode_cmd=" --accuracy "
  elif [[ ${mode} == "performance" ]]; then
    mode_cmd=" --performance --iters ${iters}"
  else
    echo "Error: No such mode: ${mode}"
    exit 1
  fi

  if [[ ${int8} == "true" ]]; then
    extra_cmd=$extra_cmd" --int8"
  fi
  echo $extra_cmd

  # map topology names to model ids and their smooth-quant alpha settings
  if [ "${topology}" = "opt_125m_ipex_sq" ]; then
    model_name_or_path="facebook/opt-125m"
    extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
  elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then
    model_name_or_path="meta-llama/Llama-2-7b-hf"
    extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8"
  elif [ "${topology}" = "gpt_j_ipex_sq" ]; then
    model_name_or_path="EleutherAI/gpt-j-6b"
    extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0"
  fi

  python -u run_clm_no_trainer.py \
    --model ${model_name_or_path} \
    --approach ${approach} \
    --output_dir ${tuned_checkpoint} \
    --task ${task} \
    --batch_size ${batch_size} \
    ${extra_cmd} ${mode_cmd}
}

main "$@"
```
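For reference, a typical invocation of this script, using only the parameters its `init_params` function accepts (the argument values here are illustrative):

```bash
# Evaluate int8 accuracy of the OPT-125m smooth-quant topology
bash run_benchmark.sh \
    --topology=opt_125m_ipex_sq \
    --mode=accuracy \
    --batch_size=8 \
    --int8=true \
    --config=saved_results
```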