summaryrefslogtreecommitdiff
path: root/ggml.c
diff options
context:
space:
mode:
authorGeorgi Gerganov <ggerganov@gmail.com>2023-06-04 23:34:30 +0300
committerGitHub <noreply@github.com>2023-06-04 23:34:30 +0300
commitecb217db4fcfa3880300ad08531a5fb6bb142d45 (patch)
treee7a1a1fee49036f2ee46b419fb032966b8e62222 /ggml.c
parentdcb2ed48268e421baf25adc00d602dad0f415564 (diff)
llama : Metal inference (#1642)
* mtl : export the LLaMA computation graph * ci : disable temporary * mtl : adapt the MNIST example as starter * mtl : no need for mtl-export tool, add cli arg for main instead * mtl : export just a small part of the graph for now to make it easier * mtl : move MSL code into separate file for easy editing * mtl : initial get_rows_q4_0 kernel * mtl : confirmed get_rows_q4_0 is working correctly * mtl : add rms_norm kernel + confirm working * mtl : add mul kernel + confirm working * mtl : initial mul_mat Q4 kernel (wrong results) * mtl : mul_mat fixes (still wrong) * mtl : another mul_mat Q4 (still does not work) * mtl : working mul_mat q4 * ggml : fix handling of "view" ops in ggml_graph_import() * mtl : add rope kernel * mtl : add reshape and transpose handling * ggml : store offset as opt arg for ggml_view_xd() operators * mtl : add cpy kernel + handle view ops * mtl : confirm f16 x f32 attention mul mat * mtl : add scale kernel * mtl : add diag_mask_inf kernel * mtl : fix soft_max kernel * ggml : update ggml_nbytes() to handle non-contiguous tensors * mtl : verify V tensor contents * mtl : add f32 -> f32 cpy kernel * mtl : add silu kernel * mtl : add non-broadcast mul kernel * mtl : full GPU inference of the computation graph * mtl : optimize rms_norm and soft_max kernels * mtl : add f16 mat x f32 vec multiplication kernel * mtl : fix bug in f16 x f32 mul mat + speed-up computation * mtl : faster mul_mat_q4_0_f32 kernel * mtl : fix kernel signature + roll inner loop * mtl : more threads for rms_norm + better timing * mtl : remove printfs from inner loop * mtl : simplify implementation * mtl : add save/load vocab to ggml file * mtl : plug Metal inference into llama.cpp (very quick-n-dirty) * mtl : make it work with main example Lots of hacks but at least now it generates text * mtl : preparing for merge * mtl : clean-up ggml mtl interface + support scratch / inplace * mtl : remove temp / debug code * metal : final refactoring and simplification * Revert "ci : 
disable temporary" This reverts commit 98c267fc77fe811082f672538fc91bcfc9072d63. * metal : add comments * metal : clean-up stuff, fix typos * readme : add Metal instructions * readme : add example for main
Diffstat (limited to 'ggml.c')
-rw-r--r--ggml.c151
1 file changed, 113 insertions, 38 deletions
diff --git a/ggml.c b/ggml.c
index 91552c94..00bbee50 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3723,7 +3723,7 @@ int64_t ggml_nelements(const struct ggml_tensor * tensor) {
return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}
-int ggml_nrows(const struct ggml_tensor * tensor) {
+int64_t ggml_nrows(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
@@ -3732,7 +3732,14 @@ int ggml_nrows(const struct ggml_tensor * tensor) {
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
- return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+ // this should handle cases where the tensor is not contiguous in memory
+ // probably just:
+ //
+ // return tensor->ne[3]*tensor->nb[3]
+ //
+ // is enough, but just in case, adding the second part
+
+ return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
}
int ggml_blck_size(enum ggml_type type) {
@@ -3814,11 +3821,11 @@ size_t ggml_tensor_overhead(void) {
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
}
-static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
-static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
@@ -5802,10 +5809,18 @@ struct ggml_tensor * ggml_view_1d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
result->op = GGML_OP_VIEW;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+ result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5834,6 +5849,13 @@ struct ggml_tensor * ggml_view_2d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
result->nb[1] = nb1;
result->nb[2] = result->nb[1]*ne1;
result->nb[3] = result->nb[2];
@@ -5842,6 +5864,7 @@ struct ggml_tensor * ggml_view_2d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+ result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5872,6 +5895,13 @@ struct ggml_tensor * ggml_view_3d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
result->nb[1] = nb1;
result->nb[2] = nb2;
result->nb[3] = result->nb[2]*ne2;
@@ -5880,6 +5910,7 @@ struct ggml_tensor * ggml_view_3d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+ result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -5912,6 +5943,13 @@ struct ggml_tensor * ggml_view_4d(
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
+ ggml_scratch_save(ctx);
+
+ struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+ memcpy(offs->data, &offset, 2*sizeof(int32_t));
+
+ ggml_scratch_load(ctx);
+
result->nb[1] = nb1;
result->nb[2] = nb2;
result->nb[3] = nb3;
@@ -5920,6 +5958,7 @@ struct ggml_tensor * ggml_view_4d(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src0 = a;
result->src1 = NULL;
+ result->opt[0] = offs;
if (is_node) {
memcpy(result->padding, &offset, sizeof(offset));
@@ -9252,7 +9291,7 @@ static void ggml_compute_forward_rms_norm_f32(
sum += (ggml_float)(x[i00] * x[i00]);
}
- float mean = sum/ne00;
+ const float mean = sum/ne00;
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
@@ -11163,7 +11202,7 @@ static void ggml_compute_forward_rope_f32(
theta *= theta_scale;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[1];
@@ -11184,7 +11223,7 @@ static void ggml_compute_forward_rope_f32(
const int64_t i0 = ib*n_dims + ic/2;
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
@@ -14588,7 +14627,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
- fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+ fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %32s\n",
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
tensor->n_dims,
@@ -14602,7 +14641,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;
- fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+ fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %32s\n",
arg,
ggml_type_name(tensor->type),
ggml_op_name (tensor->op),
@@ -14615,8 +14654,8 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
}
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
- assert(cgraph->work == NULL);
- assert(cgraph->work_size == 0);
+ //assert(cgraph->work == NULL);
+ //assert(cgraph->work_size == 0);
uint64_t size_eval = 0;
@@ -14837,7 +14876,6 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
// read file into data
{
FILE * fin = fopen(fname, "rb");
-
if (!fin) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
return result;
@@ -14977,6 +15015,8 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
op = *(const uint32_t *) ptr; ptr += sizeof(op);
n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+ enum ggml_op eop = (enum ggml_op) op;
+
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
@@ -14991,42 +15031,77 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
nb[j] = nb_cur;
}
- struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
-
- tensor->op = (enum ggml_op) op;
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); // TODO: not yet used
- uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+ const char * ptr_name = ptr; ptr += GGML_MAX_NAME;
- memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+ const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + GGML_MAX_OPT)*sizeof(int32_t);
- for (int j = 0; j < GGML_MAX_DIMS; ++j) {
- tensor->nb[j] = nb[j];
- }
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
// parse args
- {
- struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
- &tensor->src0,
- &tensor->src1,
- };
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ const int32_t arg_idx = ptr_arg_idx[j];
- for (int j = 0; j < GGML_MAX_OPT; ++j) {
- args[2 + j] = &tensor->opt[j];
+ if (arg_idx == -1) {
+ continue;
}
- for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
- const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+ if (arg_idx < GGML_MAX_NODES) {
+ args[j] = result.leafs[arg_idx];
+ } else {
+ args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+ }
+ }
- if (arg_idx == -1) {
- continue;
- }
+ // create the tensor
+ // "view" operations are handled differently
+ // TODO: handle inplace ops - currently a copy is always made
- if (arg_idx < GGML_MAX_NODES) {
- *args[j] = result.leafs[arg_idx];
- } else {
- *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
- }
- }
+ struct ggml_tensor * tensor = NULL;
+
+ switch (eop) {
+ // TODO: implement other view ops
+ case GGML_OP_RESHAPE:
+ {
+ tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]);
+ } break;
+ case GGML_OP_VIEW:
+ {
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+
+ uint64_t offs;
+ memcpy(&offs, args[2]->data, sizeof(offs));
+
+ tensor->data = ((char *) tensor->data) + offs;
+ } break;
+ case GGML_OP_TRANSPOSE:
+ {
+ tensor = ggml_transpose(*ctx_eval, args[0]);
+ } break;
+ case GGML_OP_PERMUTE:
+ {
+ tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0);
+ } break;
+ default:
+ {
+ tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = eop;
+ } break;
+ }
+
+ memcpy(tensor->name, ptr_name, GGML_MAX_NAME);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ tensor->src0 = args[0];
+ tensor->src1 = args[1];
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ tensor->opt[j] = args[2 + j];
}
result.nodes[i] = tensor;