@@ -425,25 +425,27 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
     int64_t block_size = node_shape[1];
     if (requant_type == ExtraQuantType::Q4_0_128) {
         block_size = 128;
+    } else if (requant_type == ExtraQuantType::Q8_0_32) {
+        block_size = 32;
     }
     auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
 
     ov::Tensor weights;
     ov::Tensor scales(ov::element::f16, scales_shape);
     ov::Tensor bias(ov::element::f16, scales_shape);
 
-    if (requant_type == ExtraQuantType::Q4_0_C) {
+    if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) {
         weights = ov::Tensor(ov::element::u4, node_shape);
         quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
     } else if (requant_type == ExtraQuantType::Q8_1_C) {
         weights = ov::Tensor(ov::element::u8, node_shape);
         quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
-    } else if (requant_type == ExtraQuantType::Q4_0_128) {
-        weights = ov::Tensor(ov::element::u4, node_shape);
-        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
-        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
     }
 
     weight_node->set_friendly_name(tensor->name);
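For reference, the branch above only changes the grouping of the f16 scale/bias tensors: Q4_0_128 keeps one scale per 128 weights, the new Q8_0_32 one per 32, and the *_C variants one per row (`block_size = node_shape[1]`). A minimal sketch of the resulting `scales_shape`, using a hypothetical weight shape that is not taken from the patch:

```cpp
// Sketch only: how block_size determines the granularity of the f16 scale/bias
// tensors chosen above. The weight shape here is hypothetical, not from the patch.
#include <cstdint>
#include <iostream>

int main() {
    const int64_t node_shape[2] = {4096, 14336};  // assumed {rows, cols} of a weight matrix
    const int64_t block_size = 32;                // ExtraQuantType::Q8_0_32
    // One f16 scale and one f16 bias per block of 32 consecutive weights in a row:
    std::cout << "scales_shape = {" << node_shape[0] << ", "
              << node_shape[1] / block_size << "}\n";  // prints {4096, 448}
    return 0;
}
```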
@@ -485,6 +487,37 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
     }
 }
 
+void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+            }
+        }
+
+        const float d = amax / 127.0f;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-128.0f * d);
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = x[i * qk + j] * id;
+            const int8_t xi0 = roundf(x0);
+            weights[i * qk + j] = (uint8_t)(xi0 + 128);
+        }
+    }
+}
+
 void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                    int64_t qk) {
     assert(k % qk == 0);
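The new `quantize_q8_0` stores symmetric int8 values in u8 by adding 128 and folds that shift into the per-block bias (`-128.0f * d`), so a dequantization of the affine form `stored * scale + bias` recovers the original value (assuming `make_int8_weights` applies scale and bias in that form). A minimal, self-contained sketch of that round trip; the block size and input values below are illustrative, not from the patch:

```cpp
// Sketch only: Q8_0-style asymmetric u8 encoding and its round trip.
// Mirrors the scale/bias/offset formulas in quantize_q8_0 above on one tiny block.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const int qk = 8;  // illustrative block size (the patch uses 32 for Q8_0_32)
    const float x[qk] = {0.5f, -1.25f, 3.0f, -0.75f, 2.5f, -3.0f, 1.0f, 0.0f};

    // Per-block scale and bias, as in quantize_q8_0.
    float amax = 0.0f;
    for (int j = 0; j < qk; j++) amax = fmaxf(amax, fabsf(x[j]));
    const float d = amax / 127.0f;          // scale
    const float id = d ? 1.0f / d : 0.0f;   // inverse scale
    const float bias = -128.0f * d;         // compensates the +128 shift below

    uint8_t q[qk];
    for (int j = 0; j < qk; j++) {
        const int8_t xi0 = (int8_t) roundf(x[j] * id);
        q[j] = (uint8_t) (xi0 + 128);       // shift signed value into u8 range
    }

    // Dequantize: q * d + bias == (xi0 + 128) * d - 128 * d == xi0 * d ~= x
    for (int j = 0; j < qk; j++) {
        const float y = q[j] * d + bias;
        printf("%+.4f -> %+.4f\n", x[j], y);
        assert(fabsf(y - x[j]) <= 0.5f * d + 1e-6f);  // error bounded by half a step
    }
    return 0;
}
```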