summaryrefslogtreecommitdiff
path: root/ggml/src/vulkan-shaders/mul_mat_vec_base.comp
diff options
context:
space:
mode:
authorKawrakow <48489457+ikawrakow@users.noreply.github.com>2024-07-27 07:55:01 +0200
committerGitHub <noreply@github.com>2024-07-27 07:55:01 +0200
commit154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch)
tree81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /ggml/src/vulkan-shaders/mul_mat_vec_base.comp
parent0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff)
Merge mainline llama.cpp (#3)
* Merging mainline - WIP * Merging mainline - WIP AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%) lower as it is so often the case with llama.cpp/ggml after some "improvements" have been made. * Merging mainline - fix Metal * Remove check --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml/src/vulkan-shaders/mul_mat_vec_base.comp')
-rw-r--r--ggml/src/vulkan-shaders/mul_mat_vec_base.comp81
1 files changed, 81 insertions, 0 deletions
diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/vulkan-shaders/mul_mat_vec_base.comp
new file mode 100644
index 00000000..5920bc93
--- /dev/null
+++ b/ggml/src/vulkan-shaders/mul_mat_vec_base.comp
@@ -0,0 +1,81 @@
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+// NOTE(review): neither macro below is referenced again in this file; presumably they are read by the included files or by shaders that include this one -- confirm.
+#define K_QUANTS_PER_ITERATION 2
+
+#ifdef MUL_MAT_ID
+#define EXPERT_COUNT 8
+#endif
+
+#include "types.comp"
+// Buffers: A and B are read-only, D is write-only, IDS exists only when MUL_MAT_ID is defined. A_TYPE, B_TYPE and D_TYPE are not defined in this file.
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+#ifdef MUL_MAT_ID
+layout (binding = 3) readonly buffer IDS {int data_ids[];};
+#endif
+// NOTE(review): this include follows the buffer declarations, presumably because it reads them -- confirm before reordering.
+#include "dequant_funcs.comp"
+
+layout (push_constant) uniform parameter // accessed through the instance name p
+{
+ uint ncols; // not read in this file
+ uint stride_a; // not read in this file
+ uint stride_b; // multiplier for the B offset in get_offsets() when MUL_MAT_ID is defined
+ uint stride_d; // multiplier for the D offset in get_offsets() when MUL_MAT_ID is defined
+ // Multipliers get_offsets() applies to batch indices (batch_stride_b and batch_stride_d only without MUL_MAT_ID).
+ uint batch_stride_a;
+ uint batch_stride_b;
+ uint batch_stride_d;
+ // The tail of the block differs depending on MUL_MAT_ID. NOTE(review): presumably the host-side struct must mirror this order -- confirm.
+#ifdef MUL_MAT_ID
+ uint nei0; // not read in this file
+ uint ne11; // modulus applied to gl_GlobalInvocationID.y when computing the B offset
+#else
+ uint ne02; // multiplies i03 when the batch index for A is flattened
+ uint ne12; // divisor and modulus that split gl_GlobalInvocationID.y into i13 and i12
+ uint broadcast2; // divisor applied to i12; must be non-zero
+ uint broadcast3; // divisor applied to i13; must be non-zero
+#endif
+} p;
+
+// Derives the base offsets used for buffers A, B and D from gl_GlobalInvocationID.y.
+// MUL_MAT_ID builds take the A batch from data_ids; other builds decompose the
+// batch index and divide by the broadcast factors from the push constants.
+// All three outputs are always written.
+// NOTE(review): p.ne12, p.broadcast2 and p.broadcast3 are divisors and must be non-zero.
+void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
+#ifdef MUL_MAT_ID
+    // The y invocation index selects an entry of data_ids.
+    const uint expert_idx = gl_GlobalInvocationID.y;
+    const uint expert_id = data_ids[expert_idx];
+
+    // A is addressed by the looked-up id.
+    a_offset = expert_id * p.batch_stride_a;
+    // B wraps around after ne11 entries.
+    b_offset = (expert_idx % p.ne11) * p.stride_b;
+    // D advances by one stride per y invocation.
+    d_offset = expert_idx * p.stride_d;
+#else
+    // The y invocation index is a flattened batch index.
+    const uint batch_idx = gl_GlobalInvocationID.y;
+
+    // Split it into a quotient/remainder pair with respect to ne12.
+    const uint i13 = batch_idx / p.ne12;
+    const uint i12 = batch_idx % p.ne12;
+
+    // Divide each coordinate by its broadcast factor to get A's coordinates.
+    const uint i03 = i13 / p.broadcast3;
+    const uint i02 = i12 / p.broadcast2;
+
+    // Re-flatten A's coordinates using ne02.
+    const uint batch_idx_a = i03 * p.ne02 + i02;
+
+    // A uses its own batch index; B and D share the invocation's batch index.
+    a_offset = batch_idx_a * p.batch_stride_a;
+    b_offset = batch_idx * p.batch_stride_b;
+    d_offset = batch_idx * p.batch_stride_d;
+#endif
+}