@@ -425,25 +425,27 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
     int64_t block_size = node_shape[1];
     if (requant_type == ExtraQuantType::Q4_0_128) {
         block_size = 128;
+    } else if (requant_type == ExtraQuantType::Q8_0_32) {
+        block_size = 32;
     }
     auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
 
     ov::Tensor weights;
     ov::Tensor scales(ov::element::f16, scales_shape);
     ov::Tensor bias(ov::element::f16, scales_shape);
 
-    if (requant_type == ExtraQuantType::Q4_0_C) {
+    if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) {
         weights = ov::Tensor(ov::element::u4, node_shape);
         quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
     } else if (requant_type == ExtraQuantType::Q8_1_C) {
         weights = ov::Tensor(ov::element::u8, node_shape);
         quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
-    } else if (requant_type == ExtraQuantType::Q4_0_128) {
-        weights = ov::Tensor(ov::element::u4, node_shape);
-        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
-        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
     }
 
     weight_node->set_friendly_name(tensor->name);
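For reference, the branch above only changes the grouping of the f16 scale/bias tensors: Q4_0_128 keeps one scale per 128 weights, the new Q8_0_32 one per 32, and the *_C variants one per row (`block_size = node_shape[1]`). A minimal sketch of the resulting `scales_shape`, using a hypothetical weight shape that is not taken from the patch:

```cpp
// Sketch only: how block_size determines the granularity of the f16 scale/bias
// tensors chosen above. The weight shape here is hypothetical, not from the patch.
#include <cstdint>
#include <iostream>

int main() {
    const int64_t node_shape[2] = {4096, 14336};  // assumed {rows, cols} of a weight matrix
    const int64_t block_size = 32;                // ExtraQuantType::Q8_0_32
    // One f16 scale and one f16 bias per block of 32 consecutive weights in a row:
    std::cout << "scales_shape = {" << node_shape[0] << ", "
              << node_shape[1] / block_size << "}\n";  // prints {4096, 448}
    return 0;
}
```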
@@ -485,6 +487,37 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
     }
 }
 
+void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+            }
+        }
+
+        const float d = amax / 127.0f;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-128.0f * d);
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = x[i * qk + j] * id;
+            const int8_t xi0 = roundf(x0);
+            weights[i * qk + j] = (uint8_t)(xi0 + 128);
+        }
+    }
+}
+
 void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                    int64_t qk) {
     assert(k % qk == 0);
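The new `quantize_q8_0` stores symmetric int8 values in u8 by adding 128 and folds that shift into the per-block bias (`-128.0f * d`), so a dequantization of the affine form `stored * scale + bias` recovers the original value (assuming `make_int8_weights` applies scale and bias in that form). A minimal, self-contained sketch of that round trip; the block size and input values below are illustrative, not from the patch:

```cpp
// Sketch only: Q8_0-style asymmetric u8 encoding and its round trip.
// Mirrors the scale/bias/offset formulas in quantize_q8_0 above on one tiny block.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const int qk = 8;  // illustrative block size (the patch uses 32 for Q8_0_32)
    const float x[qk] = {0.5f, -1.25f, 3.0f, -0.75f, 2.5f, -3.0f, 1.0f, 0.0f};

    // Per-block scale and bias, as in quantize_q8_0.
    float amax = 0.0f;
    for (int j = 0; j < qk; j++) amax = fmaxf(amax, fabsf(x[j]));
    const float d = amax / 127.0f;          // scale
    const float id = d ? 1.0f / d : 0.0f;   // inverse scale
    const float bias = -128.0f * d;         // compensates the +128 shift below

    uint8_t q[qk];
    for (int j = 0; j < qk; j++) {
        const int8_t xi0 = (int8_t) roundf(x[j] * id);
        q[j] = (uint8_t) (xi0 + 128);       // shift signed value into u8 range
    }

    // Dequantize: q * d + bias == (xi0 + 128) * d - 128 * d == xi0 * d ~= x
    for (int j = 0; j < qk; j++) {
        const float y = q[j] * d + bias;
        printf("%+.4f -> %+.4f\n", x[j], y);
        assert(fabsf(y - x[j]) <= 0.5f * d + 1e-6f);  // error bounded by half a step
    }
    return 0;
}
```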