diff options
author | Kawrakow <iwankawrakow@gmail.com> | 2025-05-23 16:46:27 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-05-23 16:46:27 +0300 |
commit | 6b12c2e7e8c9f8e6925ff8d9e7ebd5231bb9e6ef (patch) | |
tree | eb738fbaf619909dcb86de4372960131b4415160 | |
parent | 7f2edd1a85044b407a6d4f2ad2ea4911598ddb66 (diff) |
Fix MSVC compilation (#448)
* Fix MSVC compilation
* MSVC cannot capture constexpr in lambdas
* Arghhh
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
-rw-r--r-- | examples/quantize-stats/quantize-stats.cpp | 12 | ||||
-rw-r--r-- | ggml/src/iqk/iqk_gemm_ktquants.cpp | 32 |
2 files changed, 11 insertions, 33 deletions
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 3a37f72c..d1598aec 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -550,6 +550,10 @@ static void analyze_x_v2(const char * name, int nrows, int n_per_row, const floa
     int counter = 0;
     float mse = 0, mse_q = 0;
     auto compute = [&mutex, &counter, &mse, &mse_q, values, nrows, n_per_row, chunk] () {
+        constexpr int kNumVal = 1 << 15;
+        constexpr int kBlockSize = 32;
+        constexpr int kGroupSize = 8;
+        constexpr int kNg = kBlockSize/kGroupSize;
         double lmse = 0, lmse_q = 0;
         std::vector<float> scales(n_per_row/kBlockSize);
         std::vector<int> best_idx(n_per_row/kGroupSize);
@@ -689,9 +693,8 @@ static void analyze_x_v2(const char * name, int nrows, int n_per_row, const floa
             }
         }
     };
-    std::vector<std::thread> workers(nthread-1);
+    std::vector<std::thread> workers(nthread);
     for (auto& w : workers) w = std::thread(compute);
-    compute();
     for (auto& w : workers) w.join();
     tot_mse += mse;
     tot_mse_q += mse_q;
@@ -718,6 +721,8 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
     int counter = 0;
     float mse = 0, mse_q = 0;
     auto compute = [&mutex, &counter, &mse, &mse_q, &codes, &sumq2i, values, nrows, n_per_row, chunk] () {
+        constexpr int kBlockSize = 8;
+        constexpr int kNumVal = 1 << 12;
         float lmse = 0, lmse_q = 0;
         std::vector<float> scales(n_per_row/kBlockSize);
         std::vector<int> best_idx(n_per_row/kBlockSize);
@@ -816,9 +821,8 @@ static void analyze_x(const char * name, int nrows, int n_per_row, const float *
             }
         }
     };
-    std::vector<std::thread> workers(nthread-1);
+    std::vector<std::thread> workers(nthread);
     for (auto& w : workers) w = std::thread(compute);
-    compute();
     for (auto& w : workers) w.join();
     tot_mse += mse;
     tot_mse_q += mse_q;
diff --git a/ggml/src/iqk/iqk_gemm_ktquants.cpp b/ggml/src/iqk/iqk_gemm_ktquants.cpp
index c38dcdc6..0529128c 100644
--- a/ggml/src/iqk/iqk_gemm_ktquants.cpp
+++ b/ggml/src/iqk/iqk_gemm_ktquants.cpp
@@ -21,32 +21,6 @@ static inline uint32_t trellis_next(uint32_t& val) {
     return (val & kmask) ^ km32;
 }
 
-static inline __m256i trellis_next8(uint32_t val) {
-    constexpr uint32_t kmask = 0x8fff8fff;
-    constexpr uint32_t km32 = 0x3b603b60;
-    constexpr uint32_t ka = 89226354;
-    constexpr uint32_t kb = 64248484;
-    constexpr uint32_t ka1 = ka*ka;
-    constexpr uint32_t kb1 = kb*ka+kb;
-    constexpr uint32_t ka2 = ka1*ka;
-    constexpr uint32_t kb2 = kb1*ka+kb;
-    constexpr uint32_t ka3 = ka2*ka;
-    constexpr uint32_t kb3 = kb2*ka+kb;
-    constexpr uint32_t ka4 = ka3*ka;
-    constexpr uint32_t kb4 = kb3*ka+kb;
-    constexpr uint32_t ka5 = ka4*ka;
-    constexpr uint32_t kb5 = kb4*ka+kb;
-    constexpr uint32_t ka6 = ka5*ka;
-    constexpr uint32_t kb6 = kb5*ka+kb;
-    constexpr uint32_t ka7 = ka6*ka;
-    constexpr uint32_t kb7 = kb6*ka+kb;
-    __m256i mka = _mm256_setr_epi32(ka, ka1, ka2, ka3, ka4, ka5, ka6, ka7);
-    __m256i mkb = _mm256_setr_epi32(kb, kb1, kb2, kb3, kb4, kb5, kb6, kb7);
-    __m256i mval = _mm256_set1_epi32(val);
-    __m256i mres = _mm256_add_epi32(_mm256_mullo_epi32(mval, mka), mkb);
-    return _mm256_and_si256(mres, _mm256_set1_epi32(kmask)) ^ _mm256_set1_epi32(km32);
-}
-
 static inline float trellis_gen(uint32_t& val, uint32_t* s) {
     const ggml_fp16_t * h = (const ggml_fp16_t *)s;
     s[0] = trellis_next(val);
@@ -80,7 +54,7 @@ struct Trellis1 {
     inline __m256i next8(uint32_t val) const {
         auto mval = _mm256_set1_epi32(val);
         auto mres = _mm256_add_epi32(_mm256_mullo_epi32(mval, mka), mkb);
-        return _mm256_and_si256(mres, mask1) ^ mask2;
+        return _mm256_xor_si256(_mm256_and_si256(mres, mask1), mask2);
     }
 };
 
@@ -117,7 +91,7 @@ struct Trellis2 {
     inline __m256i next8(uint32_t val1, uint32_t val2) {
         __m256i mval = _mm256_setr_epi32(val1, val1, val1, val1, val2, val2, val2, val2);
         __m256i mres = _mm256_add_epi32(_mm256_mullo_epi32(mval, mka), mkb);
-        return _mm256_and_si256(mres, _mm256_set1_epi32(kmask)) ^ _mm256_set1_epi32(km32);
+        return _mm256_xor_si256(_mm256_and_si256(mres, _mm256_set1_epi32(kmask)), _mm256_set1_epi32(km32));
     }
 };
 
@@ -400,4 +374,4 @@ bool iqk_set_kernels_ktquants(int ne00, int typeA, int typeB, std::array<mul_mat
 
 #endif
 
-#endif
\ No newline at end of file
+#endif