Merged

35 commits
d2f12ac  k_quants: WIP super-blocks with 64 weights (Jun 21, 2023)
9fe2a2b  k_quants: WIP super-blocks with 64 weights (Jun 21, 2023)
1f6195c  k_quants: WIP super-blocks with 64 weights (Jun 21, 2023)
aebd547  k_quants: WIP super-blocks with 64 weights (Jun 21, 2023)
2b2ab31  k_quants: WIP super-blocks with 64 weights (Jun 21, 2023)
bcf8c5c  k_quants: WIP super-blocks with 64 weights (Jun 21, 2023)
c6c3536  k_quants: WIP super-blocks with 64 weights (Jun 21, 2023)
5aae4b8  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
41e46ec  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
460dd84  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
3bd9ae7  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
03f30c8  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
cda47a6  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
80c75fe  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
2b2a13c  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
9d27d8d  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
2ff543c  k_quants: WIP super-blocks with 64 weights (Jun 22, 2023)
d92c5a9  k_quants: WIP super-blocks with 64 weights (Jun 23, 2023)
fae24af  k_quants: WIP super-blocks with 64 weights (Jun 23, 2023)
e1bbcfc  k_quants: WIP super-blocks with 64 weights (Jun 23, 2023)
167a0bb  k_quants: WIP super-blocks with 64 weights (Jun 23, 2023)
6081a65  k_quants: WIP super-blocks with 64 weights (Jun 23, 2023)
ff83e32  k_quants: WIP super-blocks with 64 weights (Jun 23, 2023)
285eeb1  k_quants: WIP super-blocks with 64 weights (Jun 23, 2023)
8b98d01  k_quants: call them _K, not _k, also on Metal (Jun 23, 2023)
558a194  k_quants: correctly define QK_K in llama.cpp (Jun 23, 2023)
333ffcc  Fixed bug in q4_K quantization added with the 64-block addition (Jun 23, 2023)
88412a1  Simplify via lambda (Jun 23, 2023)
aeefd4e  k_quants: switch Q3_K to 4-bit scales when QK_K = 64 (Jun 24, 2023)
ce19b96  k_quants: switch Q4_K to 4-bit scales when QK_K = 64 (Jun 24, 2023)
4f61506  k_quants: forgot to add the Metal changes in last commit (Jun 24, 2023)
ccf4901  k_quants: change Q5_K to be type 0 when QK_K = 64 (Jun 24, 2023)
2da3a59  k_quants: AVX2 implementation for new 64-weight Q5_K (Jun 24, 2023)
53e81ca  k_quants: 10% faster ARM_NEON Q5_K dot product (Jun 24, 2023)
5fd8337  k_quants: fixed issue caused by merging with master (Jun 26, 2023)
commit 41e46ec1c208e404499d34cf4e4df94637e897e0
k_quants: WIP super-blocks with 64 weights
Iwan Kawrakow committed Jun 26, 2023

Q2_K working on CUDA. ~3% slower on a GTX-1660, 10% slower on a 4080.
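For context, Q2_K stores each super-block of QK_K weights as 2-bit quants plus a 4-bit scale and 4-bit min per group of 16 weights, all scaled by two fp16 super-block factors. A sketch of the layout, following the definitions in k_quants.h (ggml_fp16_t is ggml's half-precision type):

typedef struct {
    uint8_t scales[QK_K/16]; // 4-bit scale (low nibble) and min (high nibble) per 16 weights
    uint8_t qs[QK_K/4];      // 2-bit quants, four per byte
    ggml_fp16_t d;           // super-block scale for the quantized scales
    ggml_fp16_t dmin;        // super-block scale for the quantized mins
} block_q2_K;

With QK_K = 64 this is 4 + 16 + 2 + 2 = 24 bytes per 64 weights (3.0 bits per weight), versus 84 bytes per 256 weights (2.625 bits per weight) in the QK_K = 256 build. Keeping d and dmin adjacent is what lets the new dot-product kernel below load them as a single half2.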
ggml-cuda.cu: 57 changes (53 additions & 4 deletions)
@@ -364,13 +364,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
 static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {

     const int i = blockIdx.x;
+    const block_q2_K * x = (const block_q2_K *) vx;

     const int tid = threadIdx.x;
+#if QK_K == 256
     const int n = tid/32;
     const int l = tid - 32*n;
     const int is = 8*n + l/16;

-    const block_q2_K * x = (const block_q2_K *) vx;
-
     const uint8_t q = x[i].qs[32*n + l];
     float * y = yy + i*QK_K + 128*n;
@@ -380,6 +381,16 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+#else
+    const int is = tid/16;  // 0 or 1
+    const int il = tid%16;  // 0...15
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    float * y = yy + i*QK_K + 16*is + il;
+    float dall = x[i].d;
+    float dmin = x[i].dmin;
+    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+#endif

 }
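In the new QK_K == 64 branch, a block of 32 threads dequantizes one 64-weight super-block: thread (is, il), with is = tid/16 and il = tid%16, extracts both of its 2-bit quants from the single byte qs[il] and writes two outputs 32 positions apart. A scalar sketch of the same arithmetic for one super-block (assuming QK_K == 64, the block_q2_K layout sketched above, and a hypothetical ggml_half_to_float() helper for the fp16 fields):

// Scalar reference mirroring the QK_K == 64 path of dequantize_block_q2_K above.
// ggml_half_to_float() is a stand-in for ggml's fp16 -> fp32 conversion.
static void dequantize_q2_K_64_ref(const block_q2_K * x, float * y) {
    const float dall = ggml_half_to_float(x->d);
    const float dmin = ggml_half_to_float(x->dmin);
    for (int is = 0; is < 2; ++is) {        // lower/upper pair of 16-weight groups
        for (int il = 0; il < 16; ++il) {   // position within the group
            const uint8_t q = x->qs[il] >> (2*is);
            y[16*is + il +  0] = dall * (x->scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x->scales[is+0] >> 4);
            y[16*is + il + 32] = dall * (x->scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x->scales[is+2] >> 4);
        }
    }
}

Each byte of qs thus carries the quants for elements il, il+16, il+32, and il+48 of the super-block.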

@@ -550,6 +561,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float

     const block_q2_K * x = (const block_q2_K *)vx + ib0;

+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
     const int ix = threadIdx.x%K_QUANTS_PER_ITERATION;   // 0 or 0,1
@@ -563,8 +577,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
     const int s_offset = 8*im;
     const int y_offset = 128*im + l0;

-    float tmp = 0; // partial sum for thread in warp
-
     uint32_t aux[4];
     const uint8_t * d = (const uint8_t *)aux;
     const uint8_t * m = (const uint8_t *)(aux + 2);
@@ -600,6 +612,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
         tmp += dall * sum1 - dmin * sum2;

     }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);   // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;
+
+    uint32_t uaux[2];
+    const uint8_t * d = (const uint8_t *)uaux;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint32_t * s = (const uint32_t *)x[i].scales;
+
+        uaux[0] = s[0] & 0x0f0f0f0f;
+        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+
+        const half2 * dh = (const half2 *)&x[i].d;
+
+        const float2 dall = __half22float2(dh[0]);
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t ql = q[l];
+            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
+                  + y[l+16] * d[1] * ((ql >> 2) & 3)
+                  + y[l+32] * d[2] * ((ql >> 4) & 3)
+                  + y[l+48] * d[3] * ((ql >> 6) & 3);
+            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+        }
+        tmp += dall.x * sum1 - dall.y * sum2;
+    }
+#endif

     // sum up partial sums and write back result
     __syncthreads();
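Two details of the QK_K == 64 dot product are worth spelling out. The masked copies into uaux unpack all eight 4-bit values from the four scales bytes at once: on a little-endian target, d[0..3] end up holding the four scales (low nibbles) and d[4..7] the four mins (high nibbles), so the inner loop indexes them directly instead of shifting per element. Likewise, reading x[i].d through a half2 pointer fetches d and dmin in one load, which is why the final update uses dall.x (the scale) and dall.y (the min). A minimal, self-contained illustration of the nibble-unpacking trick, with made-up scale bytes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    const uint8_t scales[4] = {0x31, 0x42, 0x53, 0x64}; // low nibble = scale, high nibble = min
    uint32_t s;
    memcpy(&s, scales, sizeof s);
    const uint32_t uaux[2] = { s & 0x0f0f0f0fu, (s >> 4) & 0x0f0f0f0fu };
    const uint8_t * d = (const uint8_t *)uaux;
    // Little-endian: prints scales 1 2 3 4 and mins 3 4 5 6.
    printf("scales: %u %u %u %u  mins: %u %u %u %u\n",
           d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
    return 0;
}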
@@ -1351,7 +1396,11 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu

 static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
+#if QK_K == 256
     dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
+#else
+    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
 }

 static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
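The block sizes chosen in this launch track the per-thread work in dequantize_block_q2_K: with QK_K == 256, 64 threads each write four outputs, and with QK_K == 64, 32 threads each write two, so one CUDA block always covers exactly one super-block. A compile-time sanity check of that arithmetic (illustrative only, not part of the patch; the enum names are made up):

// One CUDA block of dequantize_block_q2_K handles one super-block of QK_K weights.
#if QK_K == 256
enum { Q2K_DEQ_THREADS = 64, Q2K_DEQ_PER_THREAD = 4 };
#else
enum { Q2K_DEQ_THREADS = 32, Q2K_DEQ_PER_THREAD = 2 };
#endif
static_assert(Q2K_DEQ_THREADS * Q2K_DEQ_PER_THREAD == QK_K,
              "one block must cover exactly one super-block");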