author     Iwan Kawrakow <iwan.kawrakow@gmail.com>    2024-06-18 20:08:28 +0300
committer  Iwan Kawrakow <iwan.kawrakow@gmail.com>    2024-06-22 12:02:52 +0300
commit     927e251a12fa287e13c6bd9667ee97d783486c09 (patch)
tree       90ed8827fc28630f52e92d8b8ea664198a6f5829 /ggml-cuda
parent     181fd9c56eaa64d0a92f9e8be7387f409cfa8745 (diff)
Bitnet(1.75 bpw): higher precision fp8 scale
Use 3 bits for the exponent and 5 bits for the mantissa.
This makes PPL the same as with fp16 (though the previous
version, with 4 bits each for the exponent and the mantissa,
was good enough for all practical purposes).
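Not part of the commit, but for context: a minimal host-side sketch of the new decode. The function name `fp8_e3m5_to_float` and the `main` driver are invented for illustration; the constants 116 and 18 are those of the `iq1bn_fp8_to_float` helper added in the diff below. With `e = fp8 >> 5` and `m = fp8 & 0x1f` the decoded value is `(1 + m/32) * 2^(e - 11)`, since the stored exponent field `e + 116` sits 11 below the IEEE-754 bias of 127.

```cpp
// Host-side sketch of the E3M5 fp8 scale decode (illustrative helper name;
// the constants 116 and 18 match the iq1bn_fp8_to_float device function below).
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <cmath>

static float fp8_e3m5_to_float(uint8_t fp8) {
    // Exponent field: top 3 bits plus 116 (i.e. a power of two of e - 11);
    // mantissa: low 5 bits placed at the top of the 23-bit mantissa field.
    uint32_t bits = ((uint32_t(fp8 >> 5) + 116u) << 23) | (uint32_t(fp8 & 0x1fu) << 18);
    float f;
    std::memcpy(&f, &bits, sizeof f);
    return f;
}

int main() {
    for (int v = 0; v < 256; v += 51) {
        int e = v >> 5, m = v & 0x1f;
        // Arithmetic reference: (1 + m/32) * 2^(e - 11).
        float ref = (1.0f + m/32.0f) * std::ldexp(1.0f, e - 11);
        std::printf("fp8=0x%02x  decoded=%.8g  reference=%.8g\n",
                    v, fp8_e3m5_to_float(uint8_t(v)), ref);
    }
    return 0;
}
```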
Diffstat (limited to 'ggml-cuda')
-rw-r--r--  ggml-cuda/common.cuh   | 6
-rw-r--r--  ggml-cuda/convert.cu   | 7
-rw-r--r--  ggml-cuda/vecdotq.cuh  | 7
3 files changed, 10 insertions, 10 deletions
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
index 892fd5a6..1c2d7215 100644
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -501,6 +501,12 @@ static __device__ __forceinline__ float get_alibi_slope(
     return powf(base, exph);
 }
 
+static __device__ __forceinline__ float iq1bn_fp8_to_float(uint8_t fp8) {
+    typedef union { float f; uint32_t i; } scale_t;
+    scale_t s; s.i = (((fp8 >> 5) + 116) << 23) | ((fp8 & 0x1f) << 18);
+    return s.f;
+}
+
 template <ggml_type type>
 struct ggml_cuda_type_traits;
 
diff --git a/ggml-cuda/convert.cu b/ggml-cuda/convert.cu
index 13f9f246..2a897738 100644
--- a/ggml-cuda/convert.cu
+++ b/ggml-cuda/convert.cu
@@ -432,11 +432,8 @@ static __global__ void dequantize_block_iq1_bn(const void * __restrict__ vx, dst
     int64_t i = QK_K/QK_IQ1BN * ii + ib/(QK_IQ1BN/32);
     if (i >= nb64) return;
     ib = ib%(QK_IQ1BN/32);
-    typedef union { float f; uint32_t i; } scale_t;
-    scale_t s;
-    uint8_t u = x[i].extra & 0xff;
-    s.i = ((((u >> 4) | 0xf0) - 132) << 23) | ((u & 0x0f) << 19);
-    const float dl = x[i].extra & (1 << (4*ib + il + 8)) ? -s.f : s.f;
+    float d = iq1bn_fp8_to_float(x[i].extra & 0xff);
+    const float dl = x[i].extra & (1 << (4*ib + il + 8)) ? -d : d;
     const float ml = -dl;
     uint16_t idx = x[i].ql[4*ib + il] | ((x[i].qh[2*ib + il/2] << (8 - 4*(il%2))) & 0x0f00);
     const uint16_t gp = iq1bn_grid_u16[idx];
diff --git a/ggml-cuda/vecdotq.cuh b/ggml-cuda/vecdotq.cuh
index 6ec2035a..764a19d7 100644
--- a/ggml-cuda/vecdotq.cuh
+++ b/ggml-cuda/vecdotq.cuh
@@ -1078,10 +1078,7 @@ static __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
     const block_iq1_bn * bq1 = (const block_iq1_bn *) vbq + kbx;
-    typedef union { float f; uint32_t i; } scale_t;
-    scale_t s;
-    uint8_t u = bq1->extra & 0xff;
-    s.i = ((((u >> 4) | 0xf0) - 132) << 23) | ((u & 0x0f) << 19);
+    float d = iq1bn_fp8_to_float(bq1->extra & 0xff);
     uint8_t extra = bq1->extra >> (8 + 4*iqs);
     int sumi = 0;
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -1110,7 +1107,7 @@ static __device__ __forceinline__ float vec_dot_iq1_bn_q8_1(
         q8 += 8;
     }
 #endif
-    return s.f * __low2float(bq8_1[iqs].ds) * sumi;
+    return d * __low2float(bq8_1[iqs].ds) * sumi;
 }
 
 // TODO
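For a rough sense of what the higher precision buys, below is a host-side sketch decoding the removed 4+4-bit format next to the new 3+5-bit one (helper names are invented; the bit manipulation is copied from the removed and added lines above). The relative mantissa step shrinks from 1/16 to 1/32 of the value, while the smallest representable scale rises from 2^-19 to 2^-11 and the largest stays just under 2^-3.

```cpp
// Sketch comparing the removed E4M4 scale decode with the new E3M5 decode.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float decode_e4m4(uint8_t u) {  // removed format: 4-bit exponent, 4-bit mantissa
    uint32_t bits = (((uint32_t(u >> 4) | 0xf0u) - 132u) << 23) | (uint32_t(u & 0x0fu) << 19);
    float f; std::memcpy(&f, &bits, sizeof f); return f;
}

static float decode_e3m5(uint8_t u) {  // new format: 3-bit exponent, 5-bit mantissa
    uint32_t bits = ((uint32_t(u >> 5) + 116u) << 23) | (uint32_t(u & 0x1fu) << 18);
    float f; std::memcpy(&f, &bits, sizeof f); return f;
}

int main() {
    // Smallest and largest representable scales in each format.
    std::printf("E4M4: min=%g max=%g (mantissa step 1/16 of value)\n",
                decode_e4m4(0x00), decode_e4m4(0xff));
    std::printf("E3M5: min=%g max=%g (mantissa step 1/32 of value)\n",
                decode_e3m5(0x00), decode_e3m5(0xff));
    return 0;
}
```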