diff options
author | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-25 11:32:48 +0300 |
---|---|---|
committer | Iwan Kawrakow <iwan.kawrakow@gmail.com> | 2024-06-25 11:32:48 +0300 |
commit | aa14a06b44ff12be7e4461a6e169a657275a5b20 (patch) | |
tree | c0ab2e1cd51a778594f0dd226d3e54c102c81b39 /ggml-quants.h | |
parent | cc44d4a5c3368801f1de0d68096619a6746d47a4 (diff) |
Bitnet: trying an alternative iq1_bn grid
Faster on CUDA. The scalar version is faster too.
The issue with CUDA is that now I see wild performance
fluctuations. Running llama-bench I can get 220 t/s
for TG-128 one time, and 190 t/s another time, with
uncertaintiers of 1-2 t/s. Same for PP, results are
jumping back-and-fort between ~9500 t/s and ~8900 t/s.
So, basically no reliable measurement at this point,
but for sure faster than the previous version, which was
at around 170-180 t/s.
Diffstat (limited to 'ggml-quants.h')
-rw-r--r-- | ggml-quants.h | 1 |
1 files changed, 0 insertions, 1 deletions
diff --git a/ggml-quants.h b/ggml-quants.h index cc677207..3535f178 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -107,7 +107,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -void ggml_vec_dot_iq1_bn_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq1_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_bn_q8_K64(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); |