From a07d0fee1f05c5c1dc49948ae1a3293db017275f Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Sun, 11 Feb 2024 07:22:33 -0600 Subject: ggml : add mmla kernels for quantized GEMM (#4966) * ggml: aarch64: implement smmla kernel for q8_0_q8_0 quantized gemm armv8.2-a and above supports MMLA instructions that have higher throughput than DOT. this commit adds mmla kernel for q8_0_q8_0 gemm. The feature is enabled if the platform supports "__ARM_FEATURE_MATMUL_INT8" On AWS Graviton3 processors this kernel resulted up to 1.5x improvement for prompt evaluation throughput compared to the default sdot kernel. * ggml: aarch64: implement smmla kernel for q4_0_q8_0 quantized gemm armv8.2-a and above supports MMLA instructions that have higher throughput than DOT. this commit adds mmla kernel for q4_0_q8_0 gemm. The feature is enabled if the platform supports "__ARM_FEATURE_MATMUL_INT8" On AWS Graviton3 processors this kernel resulted up to 1.5x improvement for prompt evaluation throughput compared to the default sdot kernel. * ggml: aarch64: implement smmla kernel for q4_1_q8_1 quantized gemm armv8.2-a and above supports MMLA instructions that have higher throughput than DOT. this commit adds mmla kernel for q4_1_q8_1 gemm. The feature is enabled if the platform supports "__ARM_FEATURE_MATMUL_INT8" On AWS Graviton3 processors this kernel resulted up to 1.5x improvement for prompt evaluation throughput compared to the default sdot kernel. * ggml: update unit tests for the new vec_dot interface * llama.cpp: add MATMUL_INT8 capability to system_info --- pocs/vdot/vdot.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pocs/vdot/vdot.cpp') diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 73ffcd1c..17e9e448 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -284,8 +284,8 @@ int main(int argc, char** argv) { else { auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); vdot.from_float(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data()); - else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data()); + if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); + else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); -- cgit v1.2.3