minor fix
Signed-off-by: Cheng, Zixuan <[email protected]>
violetch24 committed Jun 14, 2024
commit 6b83c9eb36300a5e6de5fb7b0800e0839b2ea0f8
docs/3x/PT_StaticQuant.md (2 changes: 1 addition & 1 deletion)
@@ -68,7 +68,7 @@ q_model = convert(prepared_model)

#### Model Examples

Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant) on how to quantize a new model.
Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex) on how to quantize a new model.
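For orientation, the sketch below shows the prepare/calibrate/convert flow those examples follow, using the 3.x `StaticQuantConfig`/`prepare`/`convert` API referenced in the hunk above; the toy model, shapes, and calibration loop are illustrative only, and the IPEX backend additionally requires Intel® Extension for PyTorch to be installed.

```python
import torch
from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert

# Illustrative toy model and example input; replace with a real model and tokenized batch.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
example_inputs = torch.randn(4, 8)

# Insert observers according to the static quantization config.
quant_config = StaticQuantConfig()
prepared_model = prepare(model, quant_config=quant_config, example_inputs=example_inputs)

# Calibration: feed a few representative batches so the observers collect statistics.
for _ in range(10):
    prepared_model(torch.randn(4, 8))

# Produce the quantized model.
q_model = convert(prepared_model)
```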


### Static Quantization with PT2E Backend
@@ -59,8 +59,10 @@ function run_benchmark {

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" --accuracy "
extra_cmd=$extra_cmd" --load"
elif [[ ${mode} == "performance" ]]; then
mode_cmd=" --performance --iters "${iters}
extra_cmd=$extra_cmd" --load"
else
echo "Error: No such mode: ${mode}"
exit 1
@@ -37,6 +37,7 @@
help="Select from ['dynamic', 'static', 'weight-only']")
parser.add_argument("--int8", action="store_true")
parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.")
parser.add_argument("--load", action="store_true", help="Load quantized model.")
parser.add_argument("--accuracy", action="store_true")
parser.add_argument("--performance", action="store_true")
parser.add_argument("--iters", default=100, type=int,
@@ -176,14 +177,19 @@ def get_user_model():
from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device
from tqdm import tqdm
def run_fn(model):
    for batch in tqdm(calib_dataloader):
    calib_iter = 0
    for batch in tqdm(calib_dataloader, total=args.calib_iters):
        batch = move_input_to_device(batch, device=None)
        if isinstance(batch, tuple) or isinstance(batch, list):
            model(batch[0])
        elif isinstance(batch, dict):
            model(**batch)
        else:
            model(batch)

        calib_iter += 1
        if calib_iter >= args.calib_iters:
            break
    return

from utils import get_example_inputs
@@ -196,16 +202,17 @@ run_fn(model):
user_model.save(args.output_dir)


# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
if args.int8 or args.int8_bf16_mixed:
    print("load int8 model")
    from neural_compressor.torch.quantization import load
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    config = AutoConfig.from_pretrained(args.model)
    user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
    setattr(user_model, "config", config)
else:
    user_model, tokenizer = get_user_model()
if args.load:
    # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
    if args.int8 or args.int8_bf16_mixed:
        print("load int8 model")
        from neural_compressor.torch.quantization import load
        tokenizer = AutoTokenizer.from_pretrained(args.model)
        config = AutoConfig.from_pretrained(args.model)
        user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
        setattr(user_model, "config", config)
    else:
        user_model, tokenizer = get_user_model()


if args.accuracy:
@@ -0,0 +1,57 @@
Step-by-Step
============
This document provides step-by-step instructions for running large language models (LLMs) with static quantization on the 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids), using PyTorch and Intel® Extension for PyTorch.

The script `run_clm_no_trainer.py` currently supports quantization of `GPTJ`, `OPT`, `LLaMA2`, `BLOOM`, and `Falcon`, and validates last-word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); more models are being added.

# Prerequisite
## 1. Create Environment
```
# Installation
pip install -r requirements.txt
```

# Run

Here is how to run the scripts:

**Causal Language Modeling (CLM)**

`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration, and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag`, and other datasets provided by lm_eval. Example commands are shown below.
### GPT-J-6b

#### Quantization
```bash
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--quantize \
--alpha 1.0 \
--ipex \
--output_dir "saved_results"
```

### OPT-125m

#### Quantization

```bash
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--quantize \
--alpha 0.5 \
--ipex \
--output_dir "saved_results"
```

### LLAMA2-7b/13b/70b
>Note: LLaMA requires IPEX >= 2.1 for better accuracy.
#### Quantization

```bash
python run_clm_no_trainer.py \
--model meta-llama/Llama-2-7b-hf \
--quantize \
--alpha 0.8 \
--ipex \
--output_dir "saved_results"
```
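
#### Benchmarking a saved model

After quantization, the saved INT8 model can be reloaded for accuracy evaluation with the `--load` flag; a minimal sketch using the GPT-J example above (the exact flag combination is illustrative):

```bash
python run_clm_no_trainer.py \
    --model EleutherAI/gpt-j-6B \
    --load \
    --int8 \
    --ipex \
    --accuracy \
    --output_dir "saved_results"
```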
@@ -0,0 +1,13 @@
accelerate
protobuf
sentencepiece != 0.1.92
datasets >= 1.1.3
torch >= 1.10
transformers
pytest
wandb
einops
neural-compressor
intel-extension-for-transformers
lm_eval==0.4.2
peft
@@ -0,0 +1,96 @@
#!/bin/bash
set -x

function main {

init_params "$@"
run_benchmark

}

# init params
function init_params {
iters=100
batch_size=16
approach=static
tuned_checkpoint=saved_results
task=lambada_openai
echo ${max_eval_samples}
for var in "$@"
do
case $var in
--topology=*)
topology=$(echo $var |cut -f2 -d=)
;;
--dataset_location=*)
dataset_location=$(echo $var |cut -f2 -d=)
;;
--input_model=*)
input_model=$(echo $var |cut -f2 -d=)
;;
--mode=*)
mode=$(echo $var |cut -f2 -d=)
;;
--batch_size=*)
batch_size=$(echo $var |cut -f2 -d=)
;;
--iters=*)
iters=$(echo ${var} |cut -f2 -d=)
;;
--int8=*)
int8=$(echo ${var} |cut -f2 -d=)
;;
--config=*)
tuned_checkpoint=$(echo $var |cut -f2 -d=)
;;
*)
echo "Error: No such parameter: ${var}"
exit 1
;;
esac
done

}


# run_benchmark
function run_benchmark {
extra_cmd=''

if [[ ${mode} == "accuracy" ]]; then
mode_cmd=" --accuracy "
extra_cmd=$extra_cmd" --load"
elif [[ ${mode} == "performance" ]]; then
mode_cmd=" --performance --iters "${iters}
extra_cmd=$extra_cmd" --load"
else
echo "Error: No such mode: ${mode}"
exit 1
fi

if [[ ${int8} == "true" ]]; then
extra_cmd=$extra_cmd" --int8"
fi
echo $extra_cmd

if [ "${topology}" = "opt_125m_ipex" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "llama2_7b_ipex" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --ipex"
elif [ "${topology}" = "gpt_j_ipex" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --ipex"
fi

python -u run_clm_no_trainer.py \
--model ${model_name_or_path} \
--approach ${approach} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
}

main "$@"
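
A typical invocation of this benchmark script might look like the following; the topology and mode values come from the case branches above, and the checkpoint directory is illustrative:

```bash
# Accuracy evaluation of the quantized GPT-J model saved under saved_results/.
bash run_benchmark.sh --topology=gpt_j_ipex --mode=accuracy --int8=true --config=saved_results

# Performance measurement with 100 iterations and batch size 16.
bash run_benchmark.sh --topology=gpt_j_ipex --mode=performance --iters=100 --batch_size=16 --int8=true --config=saved_results
```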