diff options
Diffstat (limited to 'ggml-metal.metal')
-rw-r--r-- | ggml-metal.metal | 66 |
1 files changed, 37 insertions, 29 deletions
diff --git a/ggml-metal.metal b/ggml-metal.metal index 50185ae4..912822ee 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2595,8 +2595,8 @@ typedef struct { typedef struct { half d; - uint8_t qs[QK_K/8]; - uint8_t scales[QK_K/16]; + uint8_t qs[QK_K/8]; + uint16_t qh[QK_K/32]; } block_iq1_s; // Non-linear quants @@ -4338,48 +4338,53 @@ void kernel_mul_mv_iq1_s_f32_impl( device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - float yl[16]; + float yl[32]; float sumf[N_DST]={0.f}, all_sum; const int nb32 = nb * (QK_K / 32); - const int ix = tiisg/2; - const int il = tiisg%2; + const int ix = tiisg; - device const float * y4 = y + 32 * ix + 16 * il; + device const float * y4 = y + 32 * ix; - for (int ib32 = ix; ib32 < nb32; ib32 += 16) { + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - for (int i = 0; i < 16; ++i) { + float sumy = 0; + for (int i = 0; i < 32; ++i) { yl[i] = y4[i]; + sumy += yl[i]; } const int ibl = ib32 / (QK_K / 32); const int ib = ib32 % (QK_K / 32); device const block_iq1_s * xr = x + ibl; - device const uint8_t * qs = xr->qs + 4 * ib + 2 * il; - device const uint8_t * sc = xr->scales + 2 * ib + il; - device const half * dh = &xr->d; + device const uint8_t * qs = xr->qs + 4 * ib; + device const uint16_t * qh = xr->qh + ib; + device const half * dh = &xr->d; for (int row = 0; row < N_DST; row++) { - constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); - constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 5) & 0x700))); + constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[0] << 2) & 0x700))); + constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[0] >> 1) & 0x700))); - float2 sum = {0}; - for (int j = 0; j < 8; ++j) { - sum[0] += yl[j+ 0] * grid1[j]; - sum[1] += yl[j+ 8] * grid2[j]; + float sum = 0; + for (int j = 0; j < 4; ++j) { + sum += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4) + + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4) + + yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4) + + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4); } - sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1)); + sumf[row] += (float)dh[0] * (sum - sumy) * (2*(qh[0] >> 12) + 1); dh += nb*sizeof(block_iq1_s)/2; qs += nb*sizeof(block_iq1_s); - sc += nb*sizeof(block_iq1_s); + qh += nb*sizeof(block_iq1_s)/2; } - y4 += 16 * 32; + y4 += 32 * 32; } for (int row = 0; row < N_DST; ++row) { @@ -5066,16 +5071,19 @@ void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & template <typename type4x4> void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; const float d = xb->d; - device const uint8_t * qs = xb->qs + 2*il; - device const uint8_t * sc = xb->scales + il; - const float dl1 = d * (2*(sc[0] & 7) + 1); - const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1); - constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); - constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); - for (int i = 0; i < 8; ++i) { - reg[i/4+0][i%4] = dl1 * grid1[i]; - reg[i/4+2][i%4] = dl2 * grid2[i]; + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint16_t * qh = xb->qh; + const float dl = d * (2*(qh[ib32] >> 12) + 1); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | (((qh[ib32] >> (6*il+0)) & 7) << 8))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | (((qh[ib32] >> (6*il+3)) & 7) << 8))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) - dl; + reg[1][i] = dl * (grid1[i] >> 4) - dl; + reg[2][i] = dl * (grid2[i] & 0xf) - dl; + reg[3][i] = dl * (grid2[i] >> 4) - dl; } } |