1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
#version 450
#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#if USE_SUBGROUP_ADD
#extension GL_KHR_shader_subgroup_arithmetic : enable
#endif
#define FLOAT_TYPE float
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE dst[];};
layout (binding = 0) readonly buffer AV4 {A_TYPE_VEC4 data_a_v4[];};
layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
layout(constant_id = 0) const int BLOCK_SIZE = 32;
// gqa_ratio is in the range [1,8]
layout(constant_id = 1) const uint gqa_ratio = 1;
layout (push_constant) uniform parameter
{
uint ncols_x;
uint nrows_x;
uint nchannels_x;
uint nchannels_y;
uint b_offset;
uint d_offset;
} p;
#if !USE_SUBGROUP_ADD
shared FLOAT_TYPE tmp[8][BLOCK_SIZE];
#endif
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint row_x = gl_GlobalInvocationID.y;
uint channel, channel_x;
// When gqa_ratio > 1, each invocation does multiple rows.
// The row in the A matrix is starting from channel / gqa_ratio and the
// rows in the B matrix are [channel, channel+gqa_ratio).
// When gpa_ratio is 1, each invocation does one row.
if (gqa_ratio > 1) {
channel_x = gl_GlobalInvocationID.z;
channel = channel_x * gqa_ratio;
} else {
channel = gl_GlobalInvocationID.z;
channel_x = channel / (p.nchannels_y / p.nchannels_x);;
}
const uint nrows_y = p.ncols_x;
const uint nrows_dst = p.nrows_x;
const uint row_dst = row_x;
FLOAT_TYPE temp[8];
[[unroll]] for (uint i = 0; i < 8; ++i) {
temp[i] = FLOAT_TYPE(0.0f);
}
// Detect alignment for vector loads
bool is_aligned = (p.ncols_x % 4) == 0 && (p.nchannels_x % 4) == 0 && (nrows_y % 4) == 0;
for (uint col_x0 = 0; col_x0 < p.ncols_x; col_x0 += BLOCK_SIZE) {
// Use vec4 loads if aligned
if (col_x0 + 4*BLOCK_SIZE <= p.ncols_x && is_aligned) {
uint col_x = col_x0 + 4*tid;
const uint row_y = col_x;
// x is transposed and permuted
const uint ix = row_x*p.nchannels_x*p.ncols_x + channel_x*p.ncols_x + col_x;
const vec4 av4 = vec4(data_a_v4[ix / 4]);
[[unroll]] for (uint c = 0; c < gqa_ratio; ++c) {
// y is not transposed but permuted
const uint iy = (channel + c)*nrows_y + row_y;
vec4 bv4 = data_b_v4[iy / 4];
temp[c] += dot(av4, bv4);
}
col_x0 += 3*BLOCK_SIZE;
} else {
const uint col_x = col_x0 + tid;
if (col_x >= p.ncols_x) {
break;
}
// x is transposed and permuted
const uint ix = row_x*p.nchannels_x*p.ncols_x + channel_x*p.ncols_x + col_x;
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);
const uint row_y = col_x;
[[unroll]] for (uint c = 0; c < gqa_ratio; ++c) {
// y is not transposed but permuted
const uint iy = (channel + c)*nrows_y + row_y;
temp[c] = fma(xi, FLOAT_TYPE(data_b[iy]), temp[c]);
}
}
}
#if USE_SUBGROUP_ADD
// reduce vec4 at a time
vec4 t = vec4(temp[0], temp[1], temp[2], temp[3]);
t = subgroupAdd(t);
temp[0] = t[0];
temp[1] = t[1];
temp[2] = t[2];
temp[3] = t[3];
if (gqa_ratio > 4) {
t = vec4(temp[4], temp[5], temp[6], temp[7]);
t = subgroupAdd(t);
temp[4] = t[0];
temp[5] = t[1];
temp[6] = t[2];
temp[7] = t[3];
}
#else
[[unroll]] for (uint c = 0; c < gqa_ratio; ++c) {
tmp[c][tid] = temp[c];
}
// sum up partial sums and write back result
barrier();
[[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
[[unroll]] for (uint c = 0; c < gqa_ratio; ++c) {
temp[c] += tmp[c][tid + s];
tmp[c][tid] = temp[c];
}
}
barrier();
}
[[unroll]] for (uint c = 0; c < gqa_ratio; ++c) {
temp[c] = tmp[c][tid];
}
#endif
if (tid == 0) {
[[unroll]] for (uint c = 0; c < gqa_ratio; ++c) {
// dst is not transposed and not permuted
const uint idst = (channel + c)*nrows_dst + row_dst;
dst[idst] = temp[c];
}
}
}
|