2 files changed, 9 insertions, 7 deletions
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 095235cd..cdb564f5 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -553,7 +553,7 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) {
     float * dptr = (float *)vy;
     int8_t * qy = (int8_t *)(dptr + 5);
     int n64 = nk / 64;
-#ifdef __AVX2__
+#ifdef z__AVX2__
     __m256 sign_bit = _mm256_set1_ps(-0.f);
     __m256 vmax[4] = {};
     __m256 vsum[4] = {};
@@ -594,7 +594,7 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) {
             qy += 32;
         }
     }
-#elif defined __ARM_NEON
+#elif defined z__ARM_NEON
     static const uint8_t k_shuffle[16] = {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
     auto shuffle = vld1q_u8(k_shuffle);
     float32x4_t vmax[4] = {};
@@ -640,16 +640,17 @@ void quantize_row_q8_K16(const float * x, void * vy, int64_t nk) {
         dptr[k] = amax[k]/127;
         amax[k] = dptr[k] > 0 ? 1/dptr[k] : 0.f;
     }
-    double sumf = 0;
+    int sumi[4] = {};
     for (int i64 = 0; i64 < n64; ++i64) {
         for (int k = 0; k < 4; ++k) {
             for (int j = 0; j < 16; ++j) {
-                sumf += x[64*i64 + 16*k + j];
-                qy[64*i64 + 16*k + j] = nearest_int(amax[k]*x[64*i64 + 16*k + j]);
+                int ix = nearest_int(amax[k]*x[64*i64 + 16*k + j]);
+                sumi[k] += ix;
+                qy[64*i64 + 16*k + j] = ix;
             }
         }
     }
-    dptr[4] = sumf;
+    dptr[4] = dptr[0]*sumi[0] + dptr[1]*sumi[1] + dptr[2]*sumi[2] + dptr[3]*sumi[3];
 #endif
 }
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 37653478..54b9b118 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5552,7 +5552,8 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "llama3"   ||
                     tokenizer_pre == "llama-v3" ||
-                    tokenizer_pre == "llama-bpe") {
+                    tokenizer_pre == "llama-bpe"||
+                    tokenizer_pre == "falcon3") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 vocab.tokenizer_ignore_merges = true;
                 vocab.tokenizer_add_bos = true;