diff options
-rw-r--r-- | ggml/src/iqk/iqk_quantize.cpp | 13 | ||||
-rw-r--r-- | src/llama.cpp | 3 |
2 files changed, 9 insertions, 7 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 095235cd..cdb564f5 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -553,7 +553,7 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) { float * dptr = (float *)vy; int8_t * qy = (int8_t *)(dptr + 5); int n64 = nk / 64; -#ifdef __AVX2__ +#ifdef z__AVX2__ __m256 sign_bit = _mm256_set1_ps(-0.f); __m256 vmax[4] = {}; __m256 vsum[4] = {}; @@ -594,7 +594,7 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) { qy += 32; } } -#elif defined __ARM_NEON +#elif defined z__ARM_NEON static const uint8_t k_shuffle[16] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}; auto shuffle = vld1q_u8(k_shuffle); float32x4_t vmax[4] = {}; @@ -640,16 +640,17 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) { dptr[k] = amax[k]/127; amax[k] = dptr[k] > 0 ? 1/dptr[k] : 0.f; } - double sumf = 0; + int sumi[4] = {}; for (int i64 = 0; i64 < n64; ++i64) { for (int k = 0; k < 4; ++k) { for (int j = 0; j < 16; ++j) { - sumf += x[64*i64 + 16*k + j]; - qy[64*i64 + 16*k + j] = nearest_int(amax[k]*x[64*i64 + 16*k + j]); + int ix = nearest_int(amax[k]*x[64*i64 + 16*k + j]); + sumi[k] += ix; + qy[64*i64 + 16*k + j] = ix; } } } - dptr[4] = sumf; + dptr[4] = dptr[0]*sumi[0] + dptr[1]*sumi[1] + dptr[2]*sumi[2] + dptr[3]*sumi[3]; #endif } diff --git a/src/llama.cpp b/src/llama.cpp index 37653478..54b9b118 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5552,7 +5552,8 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "llama3" || tokenizer_pre == "llama-v3" || - tokenizer_pre == "llama-bpe") { + tokenizer_pre == "llama-bpe"|| + tokenizer_pre == "falcon3") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3; vocab.tokenizer_ignore_merges = true; vocab.tokenizer_add_bos = true; |