q4_0c: quantize support
unbounded committed May 4, 2023
commit 4bd781cd2572de9ec022178e9973f79cd1c7b278
1 change: 1 addition & 0 deletions examples/quantize/quantize.cpp
@@ -8,6 +8,7 @@

static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
{"q4_0c", LLAMA_FTYPE_MOSTLY_Q4_0C},
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
{"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
41 changes: 37 additions & 4 deletions ggml.c
@@ -774,11 +774,17 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s

#define QK4_0C (4*32)
#define QK4_0C_MUL (QK4_0C / QK4_0)
// TODO: nicer description - pseudostruct?
// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
#define Q4_0C_QSIZE (QK4_0C/2 + 4*sizeof(float))
// typedef struct {
// uint8_t qs[QK4_0C/2][nb];
// float d[nb];
// } block_q4_0c

#define QK8_0C 32
// q8_0c : uint8_t qs[n] || float d[n]
// typedef struct {
// uint8_t qs[QK8_0C][nb];
// float d[nb];
// } block_q8_0c

// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
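The pseudostructs above describe a flat layout rather than an array of structs: within a row group, all quantized nibbles come first, followed by all per-sub-block scales. A minimal addressing sketch (not part of the patch; helper names are hypothetical, and it assumes quantize_row_q4_0c_reference writes the scales directly after the nibbles, as Q4_0C_QSIZE implies):

    // Sketch: locate the two regions of one quantized q4_0c row.
    // 'row' points at the row's first byte; nb = row_length / QK4_0.
    static inline uint8_t * q4_0c_row_quants(uint8_t * row) {
        return row;                          // nb*QK4_0/2 packed nibble bytes
    }
    static inline float * q4_0c_row_scales(uint8_t * row, int nb) {
        return (float *)(row + nb*QK4_0/2);  // one float scale per QK4_0 sub-block
    }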
@@ -13102,6 +13108,27 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
return (n/QK4_0*sizeof(block_q4_0));
}

size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK4_0C == 0);
const int nb = k / QK4_0;

for (int j = 0; j < n; j += k) {
uint8_t * restrict y = (uint8_t *)dst + sizeof(block_q4_0)*j/QK4_0;

quantize_row_q4_0c_reference(src + j, y, k);

for (int i = 0; i < nb*QK4_0/2; i++) {
const uint8_t vi0 = y[i] & 0xF;
const uint8_t vi1 = y[i] >> 4;

hist[vi0]++;
hist[vi1]++;
}
}

return (n/QK4_0*sizeof(block_q4_0));
}

size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK4_1 == 0);
const int nb = k / QK4_1;
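ggml_quantize_q4_0c computes its destination offset and return value with sizeof(block_q4_0) even though the q4_0c layout differs; that works because both formats spend exactly 20 bytes per 32 values (16 nibble bytes plus one 4-byte scale). A static_assert along these lines (not in the patch) would pin that invariant down:

    // Q4_0C_QSIZE = QK4_0C/2 + 4*sizeof(float) = 64 + 16 = 80 = 4*sizeof(block_q4_0)
    static_assert(Q4_0C_QSIZE == QK4_0C_MUL*sizeof(block_q4_0),
                  "q4_0c must match q4_0's storage cost per value");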
@@ -13229,7 +13256,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
return (n/QK8_0*sizeof(block_q8_0));
}

size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist) {
size_t result = 0;
switch (type) {
case GGML_TYPE_Q4_0:
@@ -13238,6 +13265,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
result = ggml_quantize_q4_0(src + start, block, n, n, hist);
} break;
case GGML_TYPE_Q4_0C:
{
GGML_ASSERT(start % QK4_0C == 0);
uint8_t * dst_off = (uint8_t *) dst + Q4_0C_QSIZE * start / QK4_0C;
result = ggml_quantize_q4_0c(src + start, dst_off, n, k, hist);
} break;
case GGML_TYPE_Q4_1:
{
GGML_ASSERT(start % QK4_1 == 0);
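The new k parameter threads the tensor's row length through to the quantizer. Block formats like q4_0 and q4_1 can ignore it, but q4_0c must see whole rows at once so the trailing scales land at the right offsets, hence the Q4_0C_QSIZE-based destination arithmetic above. An illustrative call (variable names hypothetical):

    // Quantize nelements floats whose rows are row_size elements long.
    // For q4_0c, start and n must cover whole rows.
    int64_t hist[1 << 4] = {0};
    size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_0C, f32_data, quant_buf,
                                         /*start=*/0, /*n=*/nelements,
                                         /*k=*/row_size, hist);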
3 changes: 2 additions & 1 deletion ggml.h
@@ -871,13 +871,14 @@ extern "C" {
//

GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_0c(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);

GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int k, int64_t * hist);

//
// system info
13 changes: 9 additions & 4 deletions llama.cpp
@@ -481,6 +481,7 @@ struct llama_file_loader {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0C:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q5_0:
@@ -557,6 +558,7 @@ struct llama_file_saver {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_0C:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q5_0:
@@ -846,6 +848,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
return "mostly Q4_1, some F16";
@@ -1880,6 +1883,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
ggml_type quantized_type;
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_0C: quantized_type = GGML_TYPE_Q4_0C; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
@@ -1961,15 +1965,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_data = work.addr;
std::vector<int64_t> hist_cur(1 << 4, 0);

int chunk_size = 32 * 512;
int row_size = tensor.ne.at(0);
int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
if (nthread_use < 2) {
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, row_size, hist_cur.data());
} else {
size_t counter = 0;
new_size = 0;
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size, row_size] () {
std::vector<int64_t> local_hist;
size_t local_size = 0;
while (true) {
@@ -1985,7 +1990,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
lock.unlock();
size_t last = std::min(nelements, first + chunk_size);
if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, row_size, local_hist.data());
}
};
if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
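Rounding chunk_size up to a whole number of rows keeps every chunk boundary on a row boundary, which the q4_0c path requires since each call quantizes complete rows. Concretely (worked example, not from the patch):

    // row_size = 4096:  ceil(16384/4096)  * 4096  = 4 * 4096  = 16384 (unchanged)
    // row_size = 11008: ceil(16384/11008) * 11008 = 2 * 11008 = 22016 (two rows per chunk)
    // An equivalent pure-integer form would avoid the double round-trip:
    int chunk_size = ((32*512 + row_size - 1) / row_size) * row_size;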
1 change: 1 addition & 0 deletions llama.h
@@ -83,6 +83,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0C = 20, // except 1d tensors
};

LLAMA_API struct llama_context_params llama_context_default_params();