Skip to content

Commit 82c3c54

Browse files
committed
Requantize Q6_K (gs16) to gs32 on GPU
1 parent b5bfc0a commit 82c3c54

File tree

3 files changed

+43
-8
lines changed

3 files changed

+43
-8
lines changed

ggml/src/ggml-openvino/ggml-quants.cpp

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -425,25 +425,27 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
425425
int64_t block_size = node_shape[1];
426426
if (requant_type == ExtraQuantType::Q4_0_128) {
427427
block_size = 128;
428+
} else if (requant_type == ExtraQuantType::Q8_0_32) {
429+
block_size = 32;
428430
}
429431
auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
430432

431433
ov::Tensor weights;
432434
ov::Tensor scales(ov::element::f16, scales_shape);
433435
ov::Tensor bias(ov::element::f16, scales_shape);
434436

435-
if (requant_type == ExtraQuantType::Q4_0_C) {
437+
if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) {
436438
weights = ov::Tensor(ov::element::u4, node_shape);
437439
quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
438440
weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
439441
} else if (requant_type == ExtraQuantType::Q8_1_C) {
440442
weights = ov::Tensor(ov::element::u8, node_shape);
441443
quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
442444
weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
443-
} else if (requant_type == ExtraQuantType::Q4_0_128) {
444-
weights = ov::Tensor(ov::element::u4, node_shape);
445-
quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
446-
weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
445+
} else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) {
446+
weights = ov::Tensor(ov::element::u8, node_shape);
447+
quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
448+
weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
447449
}
448450

449451
weight_node->set_friendly_name(tensor->name);
@@ -485,6 +487,37 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
485487
}
486488
}
487489

490+
void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
491+
int64_t qk) {
492+
assert(k % qk == 0);
493+
const int nb = k / qk;
494+
495+
auto* weights = static_cast<uint8_t*>(weights_arr.data());
496+
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
497+
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
498+
for (int i = 0; i < nb; i++) {
499+
float amax = 0.0f; // absolute max
500+
501+
for (int j = 0; j < qk; j++) {
502+
const float v = x[i * qk + j];
503+
if (amax < fabsf(v)) {
504+
amax = fabsf(v);
505+
}
506+
}
507+
508+
const float d = amax / 127.0f;
509+
const float id = d ? 1.0f / d : 0.0f;
510+
scales[i] = ov::float16(d);
511+
biases[i] = ov::float16(-128.0f * d);
512+
513+
for (int j = 0; j < qk; ++j) {
514+
const float x0 = x[i * qk + j] * id;
515+
const int8_t xi0 = roundf(x0);
516+
weights[i * qk + j] = (uint8_t) (xi0 + 128);
517+
}
518+
}
519+
}
520+
488521
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
489522
int64_t qk) {
490523
assert(k % qk == 0);

ggml/src/ggml-openvino/ggml-quants.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,16 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
5151
ov::Tensor& biases,
5252
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
5353

54-
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 };
54+
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
5555

5656
std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
5757

5858
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
5959
int64_t qk);
6060
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
6161
int64_t qk);
62+
void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
63+
int64_t qk);
6264

6365
namespace ov {
6466
namespace op {

ggml/src/ggml-openvino/utils.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,8 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& devi
288288
}
289289
if (device == "GPU") {
290290
return {
291-
// CVS-166739
292-
{GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
291+
// gs16 is WIP
292+
{GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32},
293293
};
294294
}
295295
return {};

0 commit comments

Comments (0)