Merged

Changes from 1 commit
rename
Signed-off-by: changwangss <[email protected]>
changwangss committed Aug 29, 2024
commit 3cd59d212eaf352e1717666a9845bd693cd7e375
2 changes: 1 addition & 1 deletion neural_compressor/transformers/models/__init__.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-from .modeling_auto import _BaseQBitsAutoModelClass
+from .modeling_auto import _BaseINCAutoModelClass
8 changes: 4 additions & 4 deletions neural_compressor/transformers/models/modeling_auto.py
@@ -76,7 +76,7 @@ def build_woq_model(model, quantization_config):
    return model


-class _BaseQBitsAutoModelClass:
+class _BaseINCAutoModelClass:
    ORIG_MODEL = None

    @classmethod
@@ -632,13 +632,13 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        return model


-class AutoModelForCausalLM(_BaseQBitsAutoModelClass):
+class AutoModelForCausalLM(_BaseINCAutoModelClass):
    ORIG_MODEL = transformers.AutoModelForCausalLM


-class AutoModel(_BaseQBitsAutoModelClass):
+class AutoModel(_BaseINCAutoModelClass):
    ORIG_MODEL = transformers.AutoModel


-class AutoModelForSeq2SeqLM(_BaseQBitsAutoModelClass):
+class AutoModelForSeq2SeqLM(_BaseINCAutoModelClass):
    ORIG_MODEL = transformers.AutoModelForSeq2SeqLM
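Note: the renamed _BaseINCAutoModelClass keeps the same drop-in interface as the transformers auto classes it wraps, so callers still go through AutoModelForCausalLM / AutoModel / AutoModelForSeq2SeqLM. A minimal usage sketch, assuming the public neural_compressor.transformers entry points and an RtnConfig weight-only config (the config name and exact from_pretrained signature are assumptions, not part of this diff):

    # Sketch only: RtnConfig and the from_pretrained call pattern are assumptions
    # based on the neural_compressor.transformers package, not taken from this diff.
    from neural_compressor.transformers import AutoModelForCausalLM, RtnConfig

    quant_config = RtnConfig(bits=4)  # 4-bit round-to-nearest weight-only quantization
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",              # any causal LM checkpoint
        quantization_config=quant_config,
    )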
29 changes: 4 additions & 25 deletions neural_compressor/transformers/quantization/utils.py
@@ -330,6 +330,10 @@ def convert_to_quantized_model(model, config, device="cpu"):
        import intel_extension_for_pytorch

        assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!"
+        os.environ["FORCE_DEVICE"] = "cpu"
+        logger.info(
+            "Set the environment variable FORCE_DEVICE='cpu' to ensure the quantization process occurs on the CPU."
+        )

    orig_dtype = torch.float32
    for param in model.parameters():
@@ -403,31 +407,6 @@ def convert_to_quantized_model(model, config, device="cpu"):
    return q_model.to(device)


-def pack_tensor_with_torch(raw_tensor, bits, compression_dtype=torch.int32):
-    """Pack the tensor with torch.
-
-    Args:
-        raw_tensor (tensor): raw tensor.
-
-    Returns:
-        tensor: packed tensor.
-    """
-    n_pack = 32 // bits
-    target_len = math.ceil(raw_tensor.shape[1] / n_pack)
-    packed_tensor = torch.zeros(raw_tensor.shape[0], target_len, dtype=compression_dtype).to(raw_tensor.device)
-    mask = torch.tensor(2**bits - 1, dtype=compression_dtype).to(raw_tensor.device)
-    for j in range(packed_tensor.shape[1]):
-        start = n_pack * j
-        end = n_pack * (j + 1)
-        tmp = raw_tensor[:, start:end].type(compression_dtype)
-        tmp &= mask
-        for e in range(tmp.shape[1]):
-            tmp[:, e] = tmp[:, e] << (bits * e)
-            packed_tensor[:, j] |= tmp[:, e]
-
-    return packed_tensor


def convert_to_GPTQ_checkpoints(model, quantization_config):
    from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear as ipex_cpu_linear

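Note: the FORCE_DEVICE addition in convert_to_quantized_model above forces the quantization pass itself onto the CPU even when the caller requests an xpu model; the quantized model is only moved to the target device at the end (return q_model.to(device)). A simplified sketch of that pattern, where quantize_fn is a hypothetical stand-in for the library's weight-only quantization pass (names here are illustrative, not the library's internals):

    import os

    # Illustrative pattern only: quantize_fn is a hypothetical callable; the
    # FORCE_DEVICE variable mirrors the one set in convert_to_quantized_model.
    def quantize_on_cpu_then_move(model, quantize_fn, device="cpu"):
        if device == "xpu":
            # Ensure the heavy quantization math runs on the CPU, not the XPU.
            os.environ["FORCE_DEVICE"] = "cpu"
        q_model = quantize_fn(model)   # runs on CPU regardless of the target device
        return q_model.to(device)      # final placement on the requested device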