diff options
author | snadampal <87143774+snadampal@users.noreply.github.com> | 2024-02-11 07:22:33 -0600 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-11 15:22:33 +0200 |
commit | a07d0fee1f05c5c1dc49948ae1a3293db017275f (patch) | |
tree | 06614ff1364269493e4853333ced56802abd7284 /ggml.h | |
parent | e4640d8fdf56f14a6db3d092bcd3d2d315cb5d04 (diff) |
ggml : add mmla kernels for quantized GEMM (#4966)
* ggml: aarch64: implement smmla kernel for q8_0_q8_0 quantized gemm
armv8.2-a and above supports MMLA instructions that have higher
throughput than DOT. this commit adds mmla kernel for
q8_0_q8_0 gemm. The feature is enabled if the platform supports
"__ARM_FEATURE_MATMUL_INT8"
On AWS Graviton3 processors this kernel resulted up to 1.5x
improvement for prompt evaluation throughput compared to the
default sdot kernel.
* ggml: aarch64: implement smmla kernel for q4_0_q8_0 quantized gemm
armv8.2-a and above supports MMLA instructions that have higher
throughput than DOT. this commit adds mmla kernel for
q4_0_q8_0 gemm. The feature is enabled if the platform supports
"__ARM_FEATURE_MATMUL_INT8"
On AWS Graviton3 processors this kernel resulted up to 1.5x
improvement for prompt evaluation throughput compared to the
default sdot kernel.
* ggml: aarch64: implement smmla kernel for q4_1_q8_1 quantized gemm
armv8.2-a and above supports MMLA instructions that have higher
throughput than DOT. this commit adds mmla kernel for
q4_1_q8_1 gemm. The feature is enabled if the platform supports
"__ARM_FEATURE_MATMUL_INT8"
On AWS Graviton3 processors this kernel resulted up to 1.5x
improvement for prompt evaluation throughput compared to the
default sdot kernel.
* ggml: update unit tests for the new vec_dot interface
* llama.cpp: add MATMUL_INT8 capability to system_info
Diffstat (limited to 'ggml.h')
-rw-r--r-- | ggml.h | 5 |
1 files changed, 4 insertions, 1 deletions
@@ -2278,6 +2278,7 @@ extern "C" { GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_vsx (void); + GGML_API int ggml_cpu_has_matmul_int8(void); // // Internal types and functions exposed for tests and benchmarks @@ -2291,7 +2292,8 @@ extern "C" { #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); - typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); typedef struct { const char * type_name; @@ -2303,6 +2305,7 @@ extern "C" { ggml_from_float_t from_float_reference; ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously; } ggml_type_traits_t; GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); |