Merged

Changes from all commits (23 commits)
0658a83  add inc woq and remove itrex dependency (changwangss, Aug 27, 2024)
4955b8a  Update optimum/intel/neural_compressor/modeling_base.py (changwangss, Aug 29, 2024)
7fe5ac5  Update optimum/intel/neural_compressor/modeling_base.py (changwangss, Aug 29, 2024)
1d6797c  Update optimum/intel/neural_compressor/modeling_base.py (changwangss, Aug 29, 2024)
ab178e9  Update optimum/intel/neural_compressor/modeling_base.py (changwangss, Aug 29, 2024)
c078ca2  fix code according comment (changwangss, Aug 29, 2024)
c257101  add logger setting (changwangss, Aug 29, 2024)
d55004b  improve ut (changwangss, Aug 29, 2024)
fcadbac  move woq quantization to quantization.py (changwangss, Sep 5, 2024)
8cf22de  Update examples/neural_compressor/language-modeling/run_clm.py (changwangss, Sep 5, 2024)
a31fc6a  Update examples/neural_compressor/language-modeling/run_clm.py (changwangss, Sep 5, 2024)
3b5f228  remove dependency (changwangss, Sep 5, 2024)
7f8c2a2  Update examples/neural_compressor/language-modeling/run_clm.py (IlyasMoutawwakil, Sep 5, 2024)
6eba7c4  add woq saving and loading ut and logger info (changwangss, Sep 5, 2024)
2683608  Merge branch 'main' into wangchang/inc_woq (changwangss, Sep 5, 2024)
1401c89  set transformers version limit (changwangss, Sep 5, 2024)
bc3b95a  fix installation neural_compressor[pt] (changwangss, Sep 6, 2024)
99f797d  improve ut (changwangss, Sep 6, 2024)
8321a24  refactoring (echarlaix, Sep 6, 2024)
08091bc  Refactor (echarlaix, Sep 6, 2024)
09acbd9  revert (echarlaix, Sep 6, 2024)
28a10d9  fix datasets loading issue (changwangss, Sep 9, 2024)
1ad67f1  fix (echarlaix, Sep 9, 2024)
5 changes: 2 additions & 3 deletions .github/workflows/test_inc.yml
@@ -36,16 +36,15 @@ jobs:
pip install cmake
pip install py-cpuinfo
pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu
pip install intel-extension-for-pytorch==2.3.0
pip install datasets==2.19.0
pip install .[neural-compressor,diffusers,tests]
pip install intel-extension-for-transformers
pip install peft

- name: Test with Pytest
run: |
pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
- name: Test IPEX
run: |
pip uninstall -y intel-extension-for-transformers
pip install intel-extension-for-pytorch==2.3.0
pytest tests/neural_compressor/test_ipex.py

2 changes: 1 addition & 1 deletion examples/neural_compressor/language-modeling/README.md
@@ -97,4 +97,4 @@ respectively `dynamic`, `static`, `weight_only` or `aware_training`.

The flag `--verify_loading` can be passed along to verify that the resulting quantized model can be loaded correctly.

> **_Note:_** `weight_only` quantization_approach requires `neural-compressor` >= 2.3 and `intel-extension-for-transformers` >= 1.3.
> **_Note:_** `weight_only` quantization_approach requires `neural-compressor` > 3.0.
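
For reference, this is roughly what the `weight_only` path does programmatically after this change: a minimal sketch, assuming `neural-compressor` >= 3.0 is installed; the model id is illustrative and the config kwargs mirror the `algorithm_args` built in `run_clm.py`.

```python
# Minimal RTN weight-only quantization sketch using the INC-backed configs.
# RtnConfig now comes from neural_compressor.transformers instead of intel-extension-for-transformers.
from neural_compressor.transformers import RtnConfig
from optimum.intel.neural_compressor import INCModelForCausalLM

# bits / sym / group_size correspond to the --bits, --weight_only_scheme and --group_size script arguments
quantization_config = RtnConfig(bits=4, sym=True, group_size=128)

model = INCModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-125m",  # illustrative model id
    quantization_config=quantization_config,
)
model.save_pretrained("gpt-neo-125m-woq")  # weights are stored in a GPTQ-like packed format
```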
@@ -3,5 +3,4 @@ torch >= 1.9
datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
intel-extension-for-transformers >= 1.3
peft
40 changes: 24 additions & 16 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -39,6 +39,7 @@
QuantizationAwareTrainingConfig,
WeightPruningConfig,
)
from neural_compressor.transformers import GPTQConfig, RtnConfig
from transformers import (
CONFIG_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING,
@@ -57,12 +58,8 @@
from transformers.utils.versions import require_version

from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer, INCTrainer
from optimum.intel.utils.import_utils import ITREX_IMPORT_ERROR, is_itrex_available


if is_itrex_available():
from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig

os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -203,12 +200,8 @@ class OptimizationArguments:
metadata={"help": "Whether or not to verify the loading of the quantized model."},
)
bits: str = field(
default="4",
metadata={"help": "Bits number of weight for weight only quantization. 1~8 bits."},
)
weight_dtype: str = field(
default="int4_clip",
metadata={"help": "weight dtype for weight only quantization."},
default=4,
metadata={"help": "Bits number of weight for weight only quantization. only support 4 bits now."},
)
group_size: int = field(
default=-1,
@@ -223,7 +216,6 @@ class OptimizationArguments:
metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
)
quantization_methodology: str = field(
choices=["rtn", "gptq"],
default="rtn",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
)
@@ -249,6 +241,11 @@ class OptimizationArguments:
metadata={"help": "Calibration dataset sequence max length, this should align with your model config"},
)

def __post_init__(self):
woq_algorithms = ["rtn", "gptq"]
if self.quantization_methodology not in woq_algorithms:
raise ValueError(f"Value must be one of {woq_algorithms}, got {self.quantization_methodology}")


@dataclass
class DataTrainingArguments:
@@ -655,13 +652,11 @@ def compute_metrics(eval_preds):
else:
recipes = {}
if optim_args.quantization_approach == "weight_only":
if not is_itrex_available():
raise ImportError(ITREX_IMPORT_ERROR.format("WeightOnly quantization"))
if optim_args.apply_pruning or optim_args.apply_distillation:
raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")

algorithm_args = {
"weight_dtype": optim_args.weight_dtype,
"bits": optim_args.bits,
"sym": optim_args.weight_only_scheme == "sym",
"group_size": optim_args.group_size,
}
@@ -756,10 +751,10 @@ def compute_metrics(eval_preds):
trainer.save_metrics("train", metrics)
trainer.save_state()

if optim_args.apply_quantization and optim_args.quantization_approach in {"static", "dynamic", "weight_only"}:
if optim_args.apply_quantization and optim_args.quantization_approach in {"static", "dynamic"}:
model = trainer.model if isinstance(trainer.model, PreTrainedModel) else trainer.model._model
quantizer = INCQuantizer.from_pretrained(model)
if optim_args.quantization_approach in ["static", "weight_only"]:
if optim_args.quantization_approach == "static":
num_calibration_samples = min(len(train_dataset), optim_args.num_calibration_samples)
train_dataset = train_dataset.select(range(num_calibration_samples))
quantization_config.calibration_sampling_size = num_calibration_samples
@@ -776,6 +771,19 @@
)
trainer.model = quantizer._quantized_model

if optim_args.apply_quantization and optim_args.quantization_approach == "weight_only":
model = trainer.model if isinstance(trainer.model, PreTrainedModel) else trainer.model._model
num_calibration_samples = min(len(train_dataset), optim_args.num_calibration_samples)
train_dataset = train_dataset.select(range(num_calibration_samples))
quantization_config.calibration_sampling_size = num_calibration_samples
quantized_model = INCModelForCausalLM.from_pretrained(
model_args.model_name_or_path, quantization_config=quantization_config
)
if hasattr(quantization_config, "tokenizer"):
quantization_config.tokenizer.save_pretrained(training_args.output_dir)
quantized_model.save_pretrained(training_args.output_dir)
trainer.model = quantized_model

if optim_args.apply_quantization and optim_args.verify_loading:
loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
tokens = tokenizer("This is a sample input", return_tensors="pt")
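To make the new `weight_only` branch above easier to follow outside of the Trainer flow, here is a minimal standalone sketch of the GPTQ variant; the calibration dataset name and the `tokenizer`/`dataset` keyword arguments are assumptions, since only `bits`, `sym` and `group_size` appear in the diff.

```python
# Hypothetical standalone sketch of the GPTQ weight-only path added in run_clm.py.
from neural_compressor.transformers import GPTQConfig
from optimum.intel.neural_compressor import INCModelForCausalLM
from transformers import AutoTokenizer

model_id = "EleutherAI/gpt-neo-125m"  # illustrative model id
tokenizer = AutoTokenizer.from_pretrained(model_id)

# GPTQ needs calibration data; the keyword names below are assumptions.
quantization_config = GPTQConfig(
    bits=4,
    sym=True,
    group_size=128,
    tokenizer=tokenizer,
    dataset="NeelNanda/pile-10k",
)

# Quantization happens inside from_pretrained, as in the new weight_only branch of run_clm.py.
quantized_model = INCModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

# Mirrors the diff: the tokenizer attached to the config is saved next to the quantized weights.
if hasattr(quantization_config, "tokenizer"):
    quantization_config.tokenizer.save_pretrained("clm-woq-gptq")
quantized_model.save_pretrained("clm-woq-gptq")
```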
130 changes: 82 additions & 48 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -23,6 +23,8 @@
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from huggingface_hub.utils import EntryNotFoundError
from neural_compressor.transformers import GPTQConfig, RtnConfig
from neural_compressor.transformers.models.modeling_auto import _BaseINCAutoModelClass
from neural_compressor.utils.pytorch import load
from transformers import (
AutoConfig,
@@ -47,8 +49,9 @@
from optimum.intel.generation import BaseModelForCausalLM

from ...modeling_base import OptimizedModel
from ..utils.import_utils import _torch_version, is_itrex_available, is_torch_version
from ..utils.import_utils import _torch_version, is_torch_version
from .configuration import INCConfig
from .quantization import _weight_only_quantization
from .utils import QUANTIZATION_CONFIG_NAME


@@ -122,8 +125,85 @@ def _from_pretrained(
raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
token = use_auth_token

quantization_config = kwargs.pop("quantization_config", None)
model_path = Path(model_id)
is_local = model_path.is_dir()

# ITREX compatibility
quantization_config_path = None
if is_local:
quantization_config_path = model_path / subfolder / QUANTIZATION_CONFIG_NAME
else:
try:
quantization_config_path = hf_hub_download(
repo_id=model_id,
filename=QUANTIZATION_CONFIG_NAME,
subfolder=subfolder,
token=token,
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
)
except EntryNotFoundError:
pass
if quantization_config_path and Path(quantization_config_path).is_file():
algorithm = getattr(quantization_config, "quant_method", None)
if algorithm in {"rtn", "gptq", "awq", "autoround"}:
raise ValueError(
"This model was obtained through ITREX quantization, support for ITREX models is deprecated since neural-compressor v3.0. "
"To load this model please downgrade both optimum-intel and neural-compressor."
)
# quantization_config = PretrainedConfig.from_pretrained(quantization_config_path)
# config.quantization_config = quantization_config.to_dict()

if hasattr(config, "quantization_config"):
if config.quantization_config is None:
raise ValueError(
"The loading of `quantization_config` failed, to load this model please make sure the config is compatible"
)
else:
try:
logger.info(
"The weight only quantized model loading only supports the same format as GPTQ, such as https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/tree/main."
)
_BaseINCAutoModelClass.ORIG_MODEL = cls.auto_model_class
model = _BaseINCAutoModelClass.load_low_bit(
model_id,
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
token=token,
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
config=config,
**kwargs,
)
logger.info("Saved low bit model loading successfully. Other input args " "will be ignored.")
return model
except Exception as e:
raise RuntimeError(f"The quantized model cannot be loaded. Detailed error: {e}")
if isinstance(quantization_config, (RtnConfig, GPTQConfig)):
logger.info(
"The quantized model parameters will be saved in the same format as GPTQ, here is the sample model https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/tree/main for details."
)
model = _weight_only_quantization(
cls.auto_model_class,
model_id,
quantization_config=quantization_config,
token=token,
revision=revision,
force_download=force_download,
cache_dir=cache_dir,
local_files_only=local_files_only,
subfolder=subfolder,
trust_remote_code=trust_remote_code,
**kwargs,
)

return cls(model, config=config, model_save_dir=None, **kwargs).model

model_cache_path = None
inc_config = None
msg = None
@@ -165,52 +245,6 @@ def _from_pretrained(

model_save_dir = Path(model_cache_path).parent

if is_itrex_available():
quantization_config_path = None
if is_local:
quantization_config_path = model_path / subfolder / QUANTIZATION_CONFIG_NAME
else:
try:
quantization_config_path = hf_hub_download(
repo_id=model_id,
filename=QUANTIZATION_CONFIG_NAME,
subfolder=subfolder,
token=token,
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
)
except EntryNotFoundError:
pass

if quantization_config_path and Path(quantization_config_path).is_file():
quantization_config = PretrainedConfig.from_pretrained(quantization_config_path)
algorithm = getattr(quantization_config, "quant_method", None)
if algorithm in {"rtn", "gptq", "awq", "autoround"}:
from intel_extension_for_transformers.transformers.modeling.modeling_auto import (
_BaseQBitsAutoModelClass,
)

_BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class

model = _BaseQBitsAutoModelClass.from_pretrained(
pretrained_model_name_or_path=model_id,
token=token,
revision=revision,
force_download=force_download,
cache_dir=cache_dir,
local_files_only=local_files_only,
subfolder=subfolder,
trust_remote_code=trust_remote_code,
use_neural_speed=False,
**kwargs,
)

return cls(
model, config=config, model_save_dir=model_save_dir, q_config=quantization_config, **kwargs
)

try:
inc_config = INCConfig.from_pretrained(model_id, subfolder=subfolder, revision=revision)
if not is_torch_version("==", inc_config.torch_version):
@@ -254,7 +288,7 @@ def _from_pretrained(

def _save_pretrained(self, save_directory: Union[str, Path]):
if isinstance(self.model, torch.nn.Module):
# For ITREX model
# For INC weight only model
if isinstance(self._q_config, PretrainedConfig):
self._q_config.to_json_file(os.path.join(save_directory, QUANTIZATION_CONFIG_NAME))
self.model.save_pretrained(save_directory)
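On the loading side, the reworked `_from_pretrained` dispatches saved weight-only checkpoints to `load_low_bit`, so reloading goes through the usual entry point; this is also what `--verify_loading` exercises in the example script. A minimal sketch, where the directory name refers to the save sketch above:

```python
# Minimal sketch: reload a saved INC weight-only model and run generation.
from optimum.intel.neural_compressor import INCModelForCausalLM
from transformers import AutoTokenizer

save_dir = "clm-woq-gptq"  # directory produced by save_pretrained in the sketch above

loaded_model = INCModelForCausalLM.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

tokens = tokenizer("This is a sample input", return_tensors="pt")
outputs = loaded_model.generate(**tokens, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```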