path: root/ggml/src/ggml-cuda/dequantize.cuh
author     Kawrakow <48489457+ikawrakow@users.noreply.github.com>   2024-07-27 07:55:01 +0200
committer  GitHub <noreply@github.com>                              2024-07-27 07:55:01 +0200
commit     154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch)
tree       81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /ggml/src/ggml-cuda/dequantize.cuh
parent     0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff)
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP. AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%) lower, as is so often the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/ggml-cuda/dequantize.cuh')
-rw-r--r--   ggml/src/ggml-cuda/dequantize.cuh   103
1 file changed, 103 insertions(+), 0 deletions(-)
diff --git a/ggml/src/ggml-cuda/dequantize.cuh b/ggml/src/ggml-cuda/dequantize.cuh
new file mode 100644
index 00000000..bd3c2d9d
--- /dev/null
+++ b/ggml/src/ggml-cuda/dequantize.cuh
@@ -0,0 +1,103 @@
+#include "common.cuh"
+
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+ const block_q4_0 * x = (const block_q4_0 *) vx;
+
+ const dfloat d = x[ib].d;
+
+ const int vui = x[ib].qs[iqs];
+
+ v.x = vui & 0xF;
+ v.y = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+ v = __hsub2(v, {8.0f, 8.0f});
+ v = __hmul2(v, {d, d});
+#else
+ v.x = (v.x - 8.0f) * d;
+ v.y = (v.y - 8.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+ const block_q4_1 * x = (const block_q4_1 *) vx;
+
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);
+
+ const int vui = x[ib].qs[iqs];
+
+ v.x = vui & 0xF;
+ v.y = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+ v = __hmul2(v, {d, d});
+ v = __hadd2(v, {m, m});
+#else
+ v.x = (v.x * d) + m;
+ v.y = (v.y * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+ const block_q5_0 * x = (const block_q5_0 *) vx;
+
+ const dfloat d = x[ib].d;
+
+ uint32_t qh;
+ memcpy(&qh, x[ib].qh, sizeof(qh));
+
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+ v = __hsub2(v, {16.0f, 16.0f});
+ v = __hmul2(v, {d, d});
+#else
+ v.x = (v.x - 16.0f) * d;
+ v.y = (v.y - 16.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+ const block_q5_1 * x = (const block_q5_1 *) vx;
+
+ const dfloat d = __low2half(x[ib].dm);
+ const dfloat m = __high2half(x[ib].dm);
+
+ uint32_t qh;
+ memcpy(&qh, x[ib].qh, sizeof(qh));
+
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
+
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+ v = __hmul2(v, {d, d});
+ v = __hadd2(v, {m, m});
+#else
+ v.x = (v.x * d) + m;
+ v.y = (v.y * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+ const block_q8_0 * x = (const block_q8_0 *) vx;
+
+ const dfloat d = x[ib].d;
+
+ v.x = x[ib].qs[iqs + 0];
+ v.y = x[ib].qs[iqs + 1];
+
+#ifdef GGML_CUDA_F16
+ v = __hmul2(v, {d, d});
+#else
+ v.x *= d;
+ v.y *= d;
+#endif // GGML_CUDA_F16
+}