Diffstat (limited to 'ggml.c')
-rw-r--r--   ggml.c   1031
1 file changed, 1016 insertions, 15 deletions
diff --git a/ggml.c b/ggml.c
index 82179a12..093d38d0 100644
--- a/ggml.c
+++ b/ggml.c
@@ -322,7 +322,7 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16];
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16];
-const char * ggml_status_to_string(enum ggml_status status) {
+GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
switch (status) {
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
@@ -333,16 +333,26 @@ const char * ggml_status_to_string(enum ggml_status status) {
return "GGML status: unknown";
}
-// note: do not use these inside ggml.c
-// these are meant to be used via the ggml.h API
float ggml_fp16_to_fp32(ggml_fp16_t x) {
+#define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml
return GGML_FP16_TO_FP32(x);
}
ggml_fp16_t ggml_fp32_to_fp16(float x) {
+#define ggml_fp32_to_fp16 do_not_use__ggml_fp32_to_fp16__in_ggml
return GGML_FP32_TO_FP16(x);
}
+float ggml_bf16_to_fp32(ggml_bf16_t x) {
+#define ggml_bf16_to_fp32 do_not_use__ggml_bf16_to_fp32__in_ggml
+ return GGML_BF16_TO_FP32(x); // it just left shifts
+}
+
+ggml_bf16_t ggml_fp32_to_bf16(float x) {
+#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml
+ return GGML_FP32_TO_BF16(x);
+}
+
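As a point of reference, the GGML_BF16_TO_FP32 / GGML_FP32_TO_BF16 macros used above live in ggml-impl.h and are not shown in this diff. A minimal sketch of the usual bfloat16 scheme, with illustrative (non-ggml) names: a bf16 value keeps the sign, all 8 exponent bits and the top 7 mantissa bits of an IEEE-754 binary32, so widening is a 16-bit left shift and narrowing rounds to nearest-even with a NaN guard.

#include <stdint.h>
#include <string.h>

// Sketch only; ggml's real conversions are the macros in ggml-impl.h.
static inline float bf16_to_f32_sketch(uint16_t h) {
    uint32_t u = (uint32_t) h << 16;   // widening really is "just a left shift"
    float f;
    memcpy(&f, &u, sizeof f);
    return f;
}

static inline uint16_t f32_to_bf16_sketch(float f) {
    uint32_t u;
    memcpy(&u, &f, sizeof u);
    if ((u & 0x7fffffff) > 0x7f800000) {              // NaN: keep it a NaN after truncation
        return (uint16_t) ((u >> 16) | 64);
    }
    return (uint16_t) ((u + (0x7fff + ((u >> 16) & 1))) >> 16);   // round to nearest even
}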
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
for (int64_t i = 0; i < n; i++) {
y[i] = GGML_FP16_TO_FP32(x[i]);
@@ -368,6 +378,49 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
}
}
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+ int64_t i = 0;
+#if defined(__AVX512F__)
+ for (; i + 16 <= n; i += 16) {
+ _mm512_storeu_ps(y + i,
+ _mm512_castsi512_ps(
+ _mm512_slli_epi32(
+ _mm512_cvtepu16_epi32(
+ _mm256_loadu_si256(
+ (const __m256i *)(x + i))),
+ 16)));
+ }
+#elif defined(__AVX2__)
+ for (; i + 8 <= n; i += 8) {
+ _mm256_storeu_ps(y + i,
+ _mm256_castsi256_ps(
+ _mm256_slli_epi32(
+ _mm256_cvtepu16_epi32(
+ _mm_loadu_si128(
+ (const __m128i *)(x + i))),
+ 16)));
+ }
+#endif
+ for (; i < n; i++) {
+ y[i] = GGML_BF16_TO_FP32(x[i]);
+ }
+}
+
+void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
+ int64_t i = 0;
+#if defined(__AVX512BF16__)
+ for (; i + 32 <= n; i += 32) {
+ _mm512_storeu_ps(
+ (__m512 *)(y + i),
+ (__m512)_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
+ _mm512_loadu_ps(x + i)));
+ }
+#endif
+ for (; i < n; i++) {
+ y[i] = GGML_FP32_TO_BF16(x[i]);
+ }
+}
+
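A hedged usage sketch for the two row converters defined above; the SIMD bodies only consume whole 16- or 32-element chunks, and the scalar tail loop finishes whatever remains, so any n works. The example function name is illustrative.

// Assumes ggml.h is included; round-trips a small buffer through the new converters.
static void example_bf16_roundtrip(void) {
    float       src[5]  = {1.0f, -2.5f, 3.14159f, 65504.0f, 1e-3f};
    ggml_bf16_t packed[5];
    float       back[5];
    ggml_fp32_to_bf16_row(src, packed, 5);   // n < 32, so the scalar tail handles all 5 values
    ggml_bf16_to_fp32_row(packed, back, 5);  // back[i] matches src[i] to roughly 2-3 significant digits
}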
bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
}
@@ -503,6 +556,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
@@ -845,6 +899,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.type_size = sizeof(block_q8_K),
.is_quantized = true,
.from_float = quantize_row_q8_K,
+ },
+ [GGML_TYPE_BF16] = {
+ .type_name = "bf16",
+ .blck_size = 1,
+ .type_size = sizeof(ggml_bf16_t),
+ .is_quantized = false,
+ .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
+ .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+ .from_float_reference = (ggml_from_float_t) ggml_fp32_to_bf16_row,
+ .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
+ .vec_dot_type = GGML_TYPE_BF16,
+ .nrows = 1,
}
};
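The new entry is reached through the same trait lookup as every other tensor type. A hedged sketch, assuming the ggml_internal_get_type_traits() accessor exposed by ggml.h; the example name is illustrative.

// Convert through the BF16 type traits instead of calling the row helpers directly.
static void example_bf16_traits(void) {
    const ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_BF16);
    float       x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    ggml_bf16_t h[4];
    float       y[4];
    traits.from_float(x, h, 4);   // dispatches to ggml_fp32_to_bf16_row
    traits.to_float (h, y, 4);    // dispatches to ggml_bf16_to_fp32_row
}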
@@ -1480,6 +1546,8 @@ inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
@@ -1498,7 +1566,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
UNUSED(by);
UNUSED(bs);
-#ifdef GGML_SIMD
+#if defined(GGML_SIMD)
float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1));
@@ -1534,6 +1602,70 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
*s = sumf;
}
+static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ int i = 0;
+ ggml_float sumf = 0;
+
+#if defined(__AVX512BF16__)
+ __m512 c1 = _mm512_setzero_ps();
+ __m512 c2 = _mm512_setzero_ps();
+ for (; i + 64 <= n; i += 64) {
+ c1 = _mm512_dpbf16_ps(c1, (__m512bh)_mm512_loadu_ps((const float *)(x + i)),
+ (__m512bh)_mm512_loadu_ps((const float *)(y + i)));
+ c2 = _mm512_dpbf16_ps(c2, (__m512bh)_mm512_loadu_ps((const float *)(x + i + 32)),
+ (__m512bh)_mm512_loadu_ps((const float *)(y + i + 32)));
+ }
+ sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+ sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#elif defined(__AVX512F__)
+#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
+ __m512 c1 = _mm512_setzero_ps();
+ __m512 c2 = _mm512_setzero_ps();
+ for (; i + 32 <= n; i += 32) {
+ c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+ c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
+ }
+ sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+ sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#undef LOAD
+#elif defined(__AVX2__)
+#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+ __m256 c1 = _mm256_setzero_ps();
+ __m256 c2 = _mm256_setzero_ps();
+ __m256 c3 = _mm256_setzero_ps();
+ __m256 c4 = _mm256_setzero_ps();
+ for (; i + 32 <= n; i += 32) {
+ c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+ c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
+ c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
+ c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
+ }
+ __m128 g;
+ c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
+ _mm256_add_ps(c2, c4));
+ g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
+ _mm256_castps256_ps128(c1));
+ g = _mm_add_ps(g, _mm_movehl_ps(g, g));
+ g = _mm_add_ss(g, _mm_movehdup_ps(g));
+ sumf += (ggml_float)_mm_cvtss_f32(g);
+
+#undef LOAD
+#endif
+
+ for (; i < n; ++i) {
+ sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
+ GGML_BF16_TO_FP32(y[i]));
+ }
+ *s = sumf;
+}
+
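Each SIMD path above keeps two or four independent accumulators so consecutive multiply-adds do not serialize on one register, reduces them horizontally, and then falls through to the scalar tail for the leftover elements. As a sketch, the whole function is equivalent to the scalar reference below, up to floating-point ordering (the SIMD paths accumulate in a different order, so the low bits can differ).

// Scalar reference for ggml_vec_dot_bf16 (illustrative name, not part of the diff).
static float vec_dot_bf16_reference(int n, const ggml_bf16_t * x, const ggml_bf16_t * y) {
    ggml_float sum = 0;   // ggml_float is the wider accumulator type used by ggml (double by default)
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * GGML_BF16_TO_FP32(y[i]));
    }
    return (float) sum;
}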
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
@@ -1967,6 +2099,14 @@ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_
*s = sum;
}
+inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
+ float sum = 0.0f;
+ for (int i = 0; i < n; ++i) {
+ sum += GGML_BF16_TO_FP32(x[i]);
+ }
+ *s = sum;
+}
+
inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
float max = -INFINITY;
@@ -2377,7 +2517,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
// figure out which node we're on
uint current_cpu;
int getcpu_ret = 0;
-#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
@@ -2588,6 +2728,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
switch (ftype) {
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
+ case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break;
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
@@ -2729,15 +2870,16 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
- ggml_fp16_t ii;
for (int i = 0; i < (1 << 16); ++i) {
- uint16_t ui = i;
- memcpy(&ii, &ui, sizeof(ii));
- const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
+ union {
+ uint16_t u16;
+ ggml_fp16_t fp16;
+ } u = {i};
+ float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
- ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
+ ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
@@ -3201,6 +3343,13 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
}
} break;
+ case GGML_TYPE_BF16:
+ {
+ assert(tensor->nb[0] == sizeof(ggml_bf16_t));
+ for (int i = 0; i < n; i++) {
+ ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
+ }
+ } break;
case GGML_TYPE_F32:
{
assert(tensor->nb[0] == sizeof(float));
@@ -3253,6 +3402,13 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
}
} break;
+ case GGML_TYPE_BF16:
+ {
+ assert(tensor->nb[0] == sizeof(ggml_bf16_t));
+ for (int i = 0; i < n; i++) {
+ ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value));
+ }
+ } break;
case GGML_TYPE_F32:
{
assert(tensor->nb[0] == sizeof(float));
@@ -3320,6 +3476,11 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
}
+ case GGML_TYPE_BF16:
+ {
+ GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+ return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
+ }
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
@@ -3362,6 +3523,11 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
+ case GGML_TYPE_BF16:
+ {
+ GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+ ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
+ } break;
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
@@ -3385,6 +3551,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
return ((int32_t *) data)[0];
case GGML_TYPE_F16:
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+ case GGML_TYPE_BF16:
+ return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
case GGML_TYPE_F32:
return ((float *) data)[0];
default:
@@ -3413,6 +3581,10 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
{
((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
} break;
+ case GGML_TYPE_BF16:
+ {
+ ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
+ } break;
case GGML_TYPE_F32:
{
((float *)(data))[0] = value;
@@ -3451,6 +3623,11 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
}
+ case GGML_TYPE_BF16:
+ {
+ GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+ return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
+ }
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
@@ -3493,6 +3670,11 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
+ case GGML_TYPE_BF16:
+ {
+ GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
+ ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
+ } break;
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
@@ -3516,6 +3698,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
return ((int32_t *) data)[0];
case GGML_TYPE_F16:
return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+ case GGML_TYPE_BF16:
+ return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
case GGML_TYPE_F32:
return ((float *) data)[0];
default:
@@ -3544,6 +3728,10 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
{
((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
} break;
+ case GGML_TYPE_BF16:
+ {
+ ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value);
+ } break;
case GGML_TYPE_F32:
{
((float *)(data))[0] = value;
@@ -3738,7 +3926,11 @@ static struct ggml_tensor * ggml_add_cast_impl(
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
- GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
+
+ // currently only supported for quantized input, f16 and bf16
+ GGML_ASSERT(ggml_is_quantized(a->type) ||
+ a->type == GGML_TYPE_F16 ||
+ a->type == GGML_TYPE_BF16);
bool is_node = false;
@@ -7215,8 +7407,8 @@ static void ggml_compute_forward_dup_same_cont(
((char *) src0->data + ie0*nb00),
(ie1 - ie0) * ggml_type_size(src0->type));
}
-
}
+
static void ggml_compute_forward_dup_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -7490,6 +7682,366 @@ static void ggml_compute_forward_dup_f16(
}
}
+static void ggml_compute_forward_dup_bf16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ const int ith = params->ith; // thread index
+ const int nth = params->nth; // number of threads
+
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
+ ggml_compute_forward_dup_same_cont(params, dst);
+ return;
+ }
+
+ // parallelize by rows
+ const int nr = ne01;
+ // number of rows per thread
+ const int dr = (nr + nth - 1) / nth;
+ // row range for this thread
+ const int ir0 = dr * ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ if (src0->type == dst->type &&
+ ne00 == ne0 &&
+ nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) {
+ // copy by rows
+ const size_t rs = ne00*nb00;
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ memcpy(
+ ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
+ ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+ rs);
+ }
+ }
+ }
+ return;
+ }
+
+ // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
+
+ if (ggml_is_contiguous(dst)) {
+ if (nb00 == sizeof(ggml_bf16_t)) {
+ if (dst->type == GGML_TYPE_BF16) {
+ size_t id = 0;
+ const size_t rs = ne00 * nb00;
+ char * dst_ptr = (char *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ id += rs * ir0;
+ for (int i01 = ir0; i01 < ir1; i01++) {
+ const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+ memcpy(dst_ptr + id, src0_ptr, rs);
+ id += rs;
+ }
+ id += rs * (ne01 - ir1);
+ }
+ }
+ } else if (dst->type == GGML_TYPE_F16) {
+ size_t id = 0;
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ id += ne00 * ir0;
+ for (int i01 = ir0; i01 < ir1; i01++) {
+ const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+ for (int i00 = 0; i00 < ne00; i00++) {
+ dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00]));
+ id++;
+ }
+ }
+ id += ne00 * (ne01 - ir1);
+ }
+ }
+ } else if (dst->type == GGML_TYPE_F32) {
+ size_t id = 0;
+ float * dst_ptr = (float *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ id += ne00 * ir0;
+ for (int i01 = ir0; i01 < ir1; i01++) {
+ const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+ for (int i00 = 0; i00 < ne00; i00++) {
+ dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]);
+ id++;
+ }
+ }
+ id += ne00 * (ne01 - ir1);
+ }
+ }
+ } else if (type_traits[dst->type].from_float) {
+ ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
+ float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
+
+ size_t id = 0;
+ size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
+ char * dst_ptr = (char *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ id += rs * ir0;
+ for (int i01 = ir0; i01 < ir1; i01++) {
+ const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+ for (int i00 = 0; i00 < ne00; i00++) {
+ src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]);
+ }
+
+ quantize_row_q(src0_f32, dst_ptr + id, ne00);
+ id += rs;
+ }
+ id += rs * (ne01 - ir1);
+ }
+ }
+ } else {
+ GGML_ASSERT(false); // TODO: implement
+ }
+ } else {
+ //printf("%s: this is not optimal - fix me\n", __func__);
+
+ if (dst->type == GGML_TYPE_F32) {
+ size_t id = 0;
+ float * dst_ptr = (float *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ id += ne00 * ir0;
+ for (int i01 = ir0; i01 < ir1; i01++) {
+ for (int i00 = 0; i00 < ne00; i00++) {
+ const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+ dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr);
+ id++;
+ }
+ }
+ id += ne00 * (ne01 - ir1);
+ }
+ }
+ } else if (dst->type == GGML_TYPE_BF16) {
+ size_t id = 0;
+ ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ id += ne00 * ir0;
+ for (int i01 = ir0; i01 < ir1; i01++) {
+ for (int i00 = 0; i00 < ne00; i00++) {
+ const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+ dst_ptr[id] = *src0_ptr;
+ id++;
+ }
+ }
+ id += ne00 * (ne01 - ir1);
+ }
+ }
+ } else if (dst->type == GGML_TYPE_F16) {
+ size_t id = 0;
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ id += ne00 * ir0;
+ for (int i01 = ir0; i01 < ir1; i01++) {
+ for (int i00 = 0; i00 < ne00; i00++) {
+ const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+ dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr));
+ id++;
+ }
+ }
+ id += ne00 * (ne01 - ir1);
+ }
+ }
+ } else {
+ GGML_ASSERT(false); // TODO: implement
+ }
+ }
+ return;
+ }
+
+ // dst counters
+ int64_t i10 = 0;
+ int64_t i11 = 0;
+ int64_t i12 = 0;
+ int64_t i13 = 0;
+
+ if (dst->type == GGML_TYPE_BF16) {
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ i10 += ne00 * ir0;
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+ memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t));
+
+ if (++i10 == ne00) {
+ i10 = 0;
+ if (++i11 == ne01) {
+ i11 = 0;
+ if (++i12 == ne02) {
+ i12 = 0;
+ if (++i13 == ne03) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ i10 += ne00 * (ne01 - ir1);
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ } else if (dst->type == GGML_TYPE_F16) {
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ i10 += ne00 * ir0;
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+ *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr));
+
+ if (++i10 == ne0) {
+ i10 = 0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ i10 += ne00 * (ne01 - ir1);
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ } else if (dst->type == GGML_TYPE_F32) {
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ i10 += ne00 * ir0;
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+ *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr);
+
+ if (++i10 == ne0) {
+ i10 = 0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ i10 += ne00 * (ne01 - ir1);
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ } else {
+ GGML_ASSERT(false); // TODO: implement
+ }
+}
+
static void ggml_compute_forward_dup_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -7617,6 +8169,24 @@ static void ggml_compute_forward_dup_f32(
id += ne00 * (ne01 - ir1);
}
}
+ } else if (dst->type == GGML_TYPE_BF16) {
+ size_t id = 0;
+ ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data;
+
+ for (int i03 = 0; i03 < ne03; i03++) {
+ for (int i02 = 0; i02 < ne02; i02++) {
+ id += ne00 * ir0;
+ for (int i01 = ir0; i01 < ir1; i01++) {
+ for (int i00 = 0; i00 < ne00; i00++) {
+ const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+ dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr);
+ id++;
+ }
+ }
+ id += ne00 * (ne01 - ir1);
+ }
+ }
} else {
GGML_ASSERT(false); // TODO: implement
}
@@ -7736,6 +8306,58 @@ static void ggml_compute_forward_dup_f32(
}
}
}
+ } else if (dst->type == GGML_TYPE_BF16) {
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ i10 += ne00 * ir0;
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ for (int64_t i01 = ir0; i01 < ir1; i01++) {
+ for (int64_t i00 = 0; i00 < ne00; i00++) {
+ const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+ char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+ *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr);
+
+ if (++i10 == ne0) {
+ i10 = 0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ i10 += ne00 * (ne01 - ir1);
+ while (i10 >= ne0) {
+ i10 -= ne0;
+ if (++i11 == ne1) {
+ i11 = 0;
+ if (++i12 == ne2) {
+ i12 = 0;
+ if (++i13 == ne3) {
+ i13 = 0;
+ }
+ }
+ }
+ }
+ }
+ }
} else {
GGML_ASSERT(false); // TODO: implement
}
@@ -7909,6 +8531,10 @@ static void ggml_compute_forward_dup(
{
ggml_compute_forward_dup_f16(params, dst);
} break;
+ case GGML_TYPE_BF16:
+ {
+ ggml_compute_forward_dup_bf16(params, dst);
+ } break;
case GGML_TYPE_F32:
{
ggml_compute_forward_dup_f32(params, dst);
@@ -8091,6 +8717,85 @@ static void ggml_compute_forward_add_f16_f32(
}
}
+static void ggml_compute_forward_add_bf16_f32(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+ if (dst->type == GGML_TYPE_F32) {
+ GGML_ASSERT( nb0 == sizeof(float));
+ }
+ else {
+ GGML_ASSERT(dst->type == GGML_TYPE_BF16);
+ GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+ }
+
+ GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ if (nb10 == sizeof(float)) {
+ if (dst->type == GGML_TYPE_BF16) {
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+ ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
+ }
+ }
+ } else {
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+ ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
+ }
+ }
+ }
+ }
+ else {
+ // src1 is not contiguous
+ GGML_ASSERT(false);
+ }
+}
+
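The (i3, i2, i1) recovery from the flat row index ir above is mixed-radix arithmetic over the tensor's outer dimensions. For example, with ne1 = 4 and ne2 = 3, ir = 10 gives i3 = 10/(3*4) = 0, i2 = (10 - 0*12)/4 = 2 and i1 = 10 - 0*12 - 2*4 = 2, i.e. row 2 of matrix 2 in batch 0. The same decomposition is reused by the bf16_bf16 and add1 variants below.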
static void ggml_compute_forward_add_f16_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -8147,6 +8852,62 @@ static void ggml_compute_forward_add_f16_f16(
}
}
+static void ggml_compute_forward_add_bf16_bf16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+ GGML_ASSERT(src1->type == GGML_TYPE_BF16);
+ GGML_ASSERT(dst->type == GGML_TYPE_BF16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ if (nb10 == sizeof(ggml_bf16_t)) {
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0, src1 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+ ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i]));
+ }
+ }
+ }
+ else {
+ // src1 is not contiguous
+ GGML_ASSERT(false);
+ }
+}
+
static void ggml_compute_forward_add_q_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -8256,6 +9017,18 @@ static void ggml_compute_forward_add(
GGML_ASSERT(false);
}
} break;
+ case GGML_TYPE_BF16:
+ {
+ if (src1->type == GGML_TYPE_BF16) {
+ ggml_compute_forward_add_bf16_bf16(params, dst);
+ }
+ else if (src1->type == GGML_TYPE_F32) {
+ ggml_compute_forward_add_bf16_f32(params, dst);
+ }
+ else {
+ GGML_ASSERT(false);
+ }
+ } break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -8514,6 +9287,110 @@ static void ggml_compute_forward_add1_q_f32(
}
}
+static void ggml_compute_forward_add1_bf16_f32(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_is_scalar(src1));
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ // scalar to add
+ const float v = *(float *) src1->data;
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_BF16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+ ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
+ }
+ }
+}
+
+static void ggml_compute_forward_add1_bf16_bf16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_is_scalar(src1));
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ // scalar to add
+ const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data);
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ GGML_ASSERT(src0->type == GGML_TYPE_BF16);
+ GGML_ASSERT(src1->type == GGML_TYPE_BF16);
+ GGML_ASSERT(dst->type == GGML_TYPE_BF16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_bf16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_bf16_t));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src0 and dst are same shape => same indices
+ const int i3 = ir/(ne2*ne1);
+ const int i2 = (ir - i3*ne2*ne1)/ne1;
+ const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+ ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 );
+ ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ for (int i = 0; i < ne0; i++) {
+ dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v);
+ }
+ }
+}
+
static void ggml_compute_forward_add1(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -8538,6 +9415,18 @@ static void ggml_compute_forward_add1(
GGML_ASSERT(false);
}
} break;
+ case GGML_TYPE_BF16:
+ {
+ if (src1->type == GGML_TYPE_BF16) {
+ ggml_compute_forward_add1_bf16_bf16(params, dst);
+ }
+ else if (src1->type == GGML_TYPE_F32) {
+ ggml_compute_forward_add1_bf16_f32(params, dst);
+ }
+ else {
+ GGML_ASSERT(false);
+ }
+ } break;
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -8666,6 +9555,7 @@ static void ggml_compute_forward_acc(
ggml_compute_forward_acc_f32(params, dst);
} break;
case GGML_TYPE_F16:
+ case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -9187,6 +10077,40 @@ static void ggml_compute_forward_sum_f16(
((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum);
}
+static void ggml_compute_forward_sum_bf16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ assert(params->ith == 0);
+ assert(ggml_is_scalar(dst));
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ assert(src0->nb[0] == sizeof(ggml_bf16_t));
+
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne)
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb)
+
+ float sum = 0;
+ float row_sum = 0;
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ ggml_vec_sum_bf16_ggf(ne00,
+ &row_sum,
+ (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03));
+ sum += row_sum;
+ }
+ }
+ }
+ ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum);
+}
+
static void ggml_compute_forward_sum(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -9202,6 +10126,10 @@ static void ggml_compute_forward_sum(
{
ggml_compute_forward_sum_f16(params, dst);
} break;
+ case GGML_TYPE_BF16:
+ {
+ ggml_compute_forward_sum_bf16(params, dst);
+ } break;
default:
{
GGML_ASSERT(false);
@@ -9476,6 +10404,7 @@ static void ggml_compute_forward_repeat(
switch (src0->type) {
case GGML_TYPE_F16:
+ case GGML_TYPE_BF16:
case GGML_TYPE_I16:
{
ggml_compute_forward_repeat_f16(params, dst);
@@ -11793,6 +12722,7 @@ static void ggml_compute_forward_set(
ggml_compute_forward_set_f32(params, dst);
} break;
case GGML_TYPE_F16:
+ case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -11967,6 +12897,49 @@ static void ggml_compute_forward_get_rows_f16(
}
}
+static void ggml_compute_forward_get_rows_bf16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+ return;
+ }
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ const int64_t nc = ne00;
+ const int64_t nr = ggml_nelements(src1);
+
+ assert(ne0 == nc);
+ assert(ne02 == ne11);
+ assert(nb00 == sizeof(ggml_bf16_t));
+ assert(ggml_nrows(dst) == nr);
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ for (int64_t i = ir0; i < ir1; ++i) {
+ const int64_t i12 = i/(ne11*ne10);
+ const int64_t i11 = (i - i12*ne11*ne10)/ne10;
+ const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
+ const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+
+ ggml_bf16_to_fp32_row(
+ (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+ (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+ }
+}
+
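In other words, each output row is a dequantizing gather: for destination row (i10, i11, i12), the int32 index i01 is read from src1 and ggml_bf16_to_fp32_row() expands row i01 of src0 into an F32 row of dst. For a 1-D src1 of {3, 0, 3}, for example, dst receives F32 copies of rows 3, 0 and 3 of src0, in that order.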
static void ggml_compute_forward_get_rows_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -12044,6 +13017,10 @@ static void ggml_compute_forward_get_rows(
{
ggml_compute_forward_get_rows_f16(params, dst);
} break;
+ case GGML_TYPE_BF16:
+ {
+ ggml_compute_forward_get_rows_bf16(params, dst);
+ } break;
case GGML_TYPE_F32:
case GGML_TYPE_I32:
{
@@ -12739,6 +13716,7 @@ static void ggml_compute_forward_alibi(
{
ggml_compute_forward_alibi_f32(params, dst);
} break;
+ case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -12828,6 +13806,7 @@ static void ggml_compute_forward_clamp(
ggml_compute_forward_clamp_f32(params, dst);
} break;
case GGML_TYPE_F16:
+ case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -15921,6 +16900,7 @@ static void ggml_compute_forward_get_rel_pos(
switch (src0->type) {
case GGML_TYPE_F16:
+ case GGML_TYPE_BF16:
{
ggml_compute_forward_get_rel_pos_f16(params, dst);
} break;
@@ -18785,7 +19765,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
case GGML_OP_CPY:
case GGML_OP_DUP:
{
- if (ggml_is_quantized(node->type)) {
+ if (ggml_is_quantized(node->type) ||
+ // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
+ (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
+ (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
}
} break;
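As a worked example of the sizing above: a CPY node with ne[0] = 4096 split over n_tasks = 8 threads reserves ggml_type_size(GGML_TYPE_F32) * 4096 * 8 = 131072 bytes of scratch, i.e. one intermediate F32 row per thread for the F16 <-> BF16 conversion.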
@@ -18864,7 +19847,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
const int64_t ne10 = node->src[1]->ne[0]; // L
const int64_t ne11 = node->src[1]->ne[1]; // Cin
- if (node->src[0]->type == GGML_TYPE_F16 &&
+ if ((node->src[0]->type == GGML_TYPE_F16 ||
+ node->src[0]->type == GGML_TYPE_BF16) &&
node->src[1]->type == GGML_TYPE_F32) {
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02;
cur += sizeof(ggml_fp16_t)*ne10*ne11;
@@ -18900,6 +19884,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
} else if (node->src[1]->type == GGML_TYPE_F16) {
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
+ } else if (node->src[1]->type == GGML_TYPE_BF16) {
+ cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
+ cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
}
} break;
case GGML_OP_FLASH_ATTN_EXT:
@@ -18916,6 +19903,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
} else if (node->src[1]->type == GGML_TYPE_F16) {
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
+ } else if (node->src[1]->type == GGML_TYPE_BF16) {
+ cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
+ cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
}
} break;
case GGML_OP_FLASH_ATTN_BACK:
@@ -18929,6 +19919,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
} else if (node->src[1]->type == GGML_TYPE_F16) {
cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+ } else if (node->src[1]->type == GGML_TYPE_BF16) {
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
}
} break;
@@ -19705,7 +20698,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
fprintf(fp, "%d", ggml_get_i32_1d(node, j));
}
- else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
+ else if (node->type == GGML_TYPE_F32 ||
+ node->type == GGML_TYPE_F16 ||
+ node->type == GGML_TYPE_BF16) {
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
}
else {
@@ -20763,6 +21758,12 @@ size_t ggml_quantize_chunk(
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
result = n * elemsize;
} break;
+ case GGML_TYPE_BF16:
+ {
+ size_t elemsize = sizeof(ggml_bf16_t);
+ ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n);
+ result = n * elemsize;
+ } break;
case GGML_TYPE_F32:
{
size_t elemsize = sizeof(float);